├── .github
└── workflows
│ └── tests.yml
├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── RELEASING.md
├── deploy.sh
├── develop.sh
├── lint.sh
├── pylintrc
├── requirements.txt
├── setup.py
├── test.sh
├── test
├── __init__.py
├── data.py
├── data
│ ├── B16-StringTie-chr1-subset.gtf
│ ├── genes.fpkm_tracking
│ ├── isoforms.fpkm_tracking
│ └── tiny_test_ligandome_dir
│ │ ├── A0201
│ │ └── HLA-B0704
├── test_args_outputs.py
├── test_cli_protein_changes.py
├── test_contains_mutant_residues.py
├── test_dataframe.py
├── test_effect_expression_filters.py
├── test_epitopes_from_commandline_args.py
├── test_load_cufflinks_fpkm.py
├── test_load_stringtie_gtf_fpkm.py
├── test_mutant_epitope_predictions_class1.py
├── test_mutant_epitope_predictions_class2.py
├── test_padding.py
├── test_peptide_mutation_interval.py
├── test_rna_helpers.py
└── test_variant_expression_filters.py
└── topiary
├── __init__.py
├── cli
├── __init__.py
├── args.py
├── errors.py
├── filtering.py
├── outputs.py
├── protein_changes.py
├── rna.py
├── script.py
└── sequence.py
├── filters.py
├── predictor.py
├── rna
├── __init__.py
├── common.py
├── cufflinks.py
└── gtf.py
└── sequence_helpers.py
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
3 |
4 | name: Tests
5 | on: [push, pull_request]
6 |
7 | jobs:
8 | build:
9 | runs-on: ubuntu-latest
10 | strategy:
11 | fail-fast: true
12 | matrix:
13 | python-version: ["3.10", "3.11", "3.12"]
14 | steps:
15 |     - uses: actions/checkout@v4
16 |     - name: Set up Python ${{ matrix.python-version }}
17 |       uses: actions/setup-python@v4
18 | with:
19 | python-version: ${{ matrix.python-version }}
20 | - name: Checkout private netmhc-bundle repo
21 | uses: actions/checkout@v4
22 | with:
23 | repository: openvax/netmhc-bundle
24 | token: ${{ secrets.NETMHC_BUNDLE_ACCESS_TOKEN }}
25 | path: netmhc-bundle
26 |
27 | - name: Install netmhc-bundle dependencies
28 | uses: awalsh128/cache-apt-pkgs-action@latest
29 | with:
30 | packages: tcsh gawk python2-minimal
31 | version: 1.0
32 | - name: Install dependencies
33 | run: |
34 | python -m pip install --upgrade pip
35 | python -m pip install pytest pytest-cov pylint
36 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
37 |       - name: Install wkhtmltopdf
38 | run: |
39 | sudo apt-get install -y xfonts-base xfonts-75dpi
40 | wget https://github.com/wkhtmltopdf/packaging/releases/download/0.12.6-1/wkhtmltox_0.12.6-1.bionic_amd64.deb
41 | sudo dpkg -i wkhtmltox_0.12.6-1.bionic_amd64.deb
42 | - name: Lint with PyLint
43 | run: |
44 | ./lint.sh
45 | - name: Download Ensembl data
46 | run: |
47 | echo "Before installing Ensembl releases" && df -h
48 | pyensembl install --release 75 --species human --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCh37.75/
49 | pyensembl install --release 102 --species mouse --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCm38.102/
50 | pyensembl install --release 93 --species human --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCh38.93/
51 | pyensembl install --release 93 --species mouse --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCm38.93/
52 | - name: Test with pytest
53 | run: |
54 | # configure netmhc-bundle paths
55 | export NETMHC_BUNDLE_HOME=$PWD/netmhc-bundle
56 | echo "NetMHC-bundle dir:" && ls -l $NETMHC_BUNDLE_HOME
57 | mkdir $PWD/netmhc-bundle-tmp
58 | export NETMHC_BUNDLE_TMPDIR=$PWD/netmhc-bundle-tmp
59 | export PATH=$PATH:$NETMHC_BUNDLE_HOME/bin
60 | ./test.sh
61 | - name: Publish coverage to Coveralls
62 | uses: coverallsapp/github-action@v2.2.3
63 | with:
64 | parallel: true
65 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 |
5 | # C extensions
6 | *.so
7 |
8 | # Distribution / packaging
9 | .Python
10 | env/
11 | bin/
12 | build/
13 | develop-eggs/
14 | dist/
15 | eggs/
16 | lib/
17 | lib64/
18 | parts/
19 | sdist/
20 | var/
21 | *.egg-info/
22 | .installed.cfg
23 | *.egg
24 |
25 | # Installer logs
26 | pip-log.txt
27 | pip-delete-this-directory.txt
28 |
29 | # Unit test / coverage reports
30 | htmlcov/
31 | .tox/
32 | .coverage
33 | .cache
34 | nosetests.xml
35 | coverage.xml
36 |
37 | # Translations
38 | *.mo
39 |
40 | # Mr Developer
41 | .mr.developer.cfg
42 | .project
43 | .pydevproject
44 |
45 | # Rope
46 | .ropeproject
47 |
48 | # Django stuff:
49 | *.log
50 | *.pot
51 |
52 | # Sphinx documentation
53 | docs/_build/
54 |
55 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | sudo: false # Use container-based infrastructure
2 | language: python
3 | dist: trusty
4 | python:
5 | - "2.7"
6 | - "3.6"
7 | git:
8 | # don't need the default depth of 50
9 | # but don't want to use a depth of 1 since that affects
10 | # whether jobs run when you have multiple commits queued
11 | # https://github.com/travis-ci/travis-ci/issues/4575
12 | depth: 10
13 | cache:
14 | pip: true
15 | # cache directory used for Ensembl downloads of GTF and FASTA files
16 | # along with the indexed db of intervals and ID mappings and pickles
17 | # of sequence dictionaries. Also, pip
18 | directories:
19 | - $HOME/.cache/pyensembl/
20 | addons:
21 | apt:
22 | packages:
23 | # Needed for NetMHC
24 | - tcsh
25 | env:
26 | global:
27 | # MHC_BUNDLE_PASS
28 | - secure: "TLTzSIABO/iYke8C66c0PRaWDZ5lx90s8XimSfDONOTXaX74V25O65qxzIWPAihxcdfLYA+bE2YRsjYOtuK+6DB2vjXbmoCQAXIFT/QXz4+iZTxN3g/s5N4hIR8tf9MSQ3KdNHOw7lKzdgAWKsFDQ8vwrqzYUNJGVtvoQSWCmPw="
29 | before_install:
30 | - |
31 | if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then
32 | wget https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh;
33 | else
34 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh;
35 | fi
36 | - bash miniconda.sh -b -p $HOME/miniconda
37 | - export PATH="$HOME/miniconda/bin:$PATH"
38 | - hash -r
39 | - conda config --set always_yes yes --set changeps1 no
40 | - conda update -q conda
41 | # Useful for debugging any issues with conda
42 | - conda info -a
43 | - python --version
44 | # install MHC predictors
45 | - git clone https://mhcbundle:$MHC_BUNDLE_PASS@github.com/openvax/netmhc-bundle.git
46 | - export NETMHC_BUNDLE_HOME=$PWD/netmhc-bundle
47 | - mkdir tmp
48 | - export NETMHC_BUNDLE_TMPDIR=$PWD/tmp
49 | - export PATH=$PATH:$NETMHC_BUNDLE_HOME/bin
50 | install:
51 | - >
52 | conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION
53 | numpy nose pandas pandoc
54 | - source activate test-environment
55 | - pip install pypandoc
56 | - pip install -r requirements.txt
57 | - pip install .
58 | - pip install coveralls
59 | - pyensembl install --release 75 --species human
60 | - pyensembl install --release 87 --species human
61 | script:
62 | - ./lint.sh
63 | - nosetests test --with-coverage --cover-package=topiary
64 | after_success:
65 | coveralls
66 | deploy:
67 | provider: pypi
68 | distributions: sdist
69 | user: openvax
70 | password: # See http://docs.travis-ci.com/user/encryption-keys/
71 | secure: "S4KWAhJpKYx5F/cBc6cf9GCZ8Hd+WtMA6V6PP25PglLnVaXrxB5QxuAIWGAvr/jGuTHjfCSCNDwTptW3natLjJR9IfJdJPp3gNvM0RDjWY4FsziFz/nG/bZo9qnh4ZCDhK/Po1izxXM0u9z6gUc0U2iKK1ZSdfawyW4nZbAXQUU="
72 | on:
73 | branch: master
74 | condition: $TRAVIS_PYTHON_VERSION = "2.7"
75 |
76 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [](https://github.com/openvax/topiary/actions/workflows/tests.yml)
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 | # Topiary
10 |
11 | Predict mutation-derived cancer T-cell epitopes from (1) somatic variants (2) tumor RNA expression data, and (3) patient HLA type.
12 |
13 | ## Example
14 |
15 | ```sh
16 | ./topiary \
17 | --vcf somatic.vcf \
18 | --mhc-predictor netmhcpan \
19 | --mhc-alleles HLA-A*02:01,HLA-B*07:02 \
20 | --ic50-cutoff 500 \
21 | --percentile-cutoff 2.0 \
22 | --mhc-epitope-lengths 8-11 \
23 | --rna-gene-fpkm-tracking-file genes.fpkm_tracking \
24 | --rna-min-gene-expression 4.0 \
25 | --rna-transcript-fpkm-tracking-file isoforms.fpkm_tracking \
26 | --rna-min-transcript-expression 1.5 \
27 | --output-csv epitopes.csv \
28 | --output-html epitopes.html
29 | ```
30 |
31 | ## Installation
32 |
33 | You can install Topiary and all of the libraries it depends on by running:
34 | ```
35 | pip install topiary
36 | ```
37 |
38 | You'll need to download the reference genome sequences and annotations for a
39 | recent Ensembl release (e.g. 81) by running:
40 |
41 | ```
42 | pyensembl install --release 81 --species human
43 | ```
44 |
45 | If you want to work with variants which were aligned against the older reference
46 | GRCh37, you will need to also download its annotation data, which is contained
47 | in Ensembl release 75:
48 |
49 | ```
50 | pyensembl install --release 75 --species human
51 | ```
52 |
53 |
54 | ## Commandline Arguments
55 |
56 | ### Genomic Variants
57 |
58 | Specify some variants by giving at least one of the following options. They can
59 | be used in combination and repeated.
60 |
61 | * `--vcf VCF_FILENAME`: Load a [VCF](http://www.1000genomes.org/wiki/analysis/variant%20call%20format/vcf-variant-call-format-version-41) file
62 | * `--maf MAF_FILENAME`: Load a TCGA [MAF](https://wiki.nci.nih.gov/display/TCGA/Mutation+Annotation+Format+%28MAF%29+Specification) file
63 | * `--variant CHR POS REF ALT`: Specify an individual variant (requires `--ensembl-version`)
64 |
65 | ### Output Format
66 |
67 | * `--output-csv OUTPUT_CSV_FILENAME`: Path to an output CSV file
68 | * `--output-html OUTPUT_HTML_FILENAME`: Path to an output HTML file
69 |
70 | ### RNA Expression Filtering
71 |
72 | Optional flags to use Cufflinks expression estimates for dropping epitopes
73 | arising from genes or transcripts that are not highly expressed.
74 |
75 | * `--rna-gene-fpkm-tracking-file RNA_GENE_FPKM_TRACKING_FILE`: Cufflinks FPKM tracking file
76 | containing gene expression estimates.
77 | * `--rna-min-gene-expression RNA_MIN_GENE_EXPRESSION`: Minimum FPKM for genes
78 | * `--rna-transcript-fpkm-tracking-file RNA_TRANSCRIPT_FPKM_TRACKING_FILE`: Cufflinks FPKM tracking
79 | file containing transcript expression estimates.
80 | * `--rna-min-transcript-expression RNA_MIN_TRANSCRIPT_EXPRESSION`: Minimum FPKM
81 | for transcripts
82 | * `--rna-transcript-fpkm-gtf-file RNA_TRANSCRIPT_FPKM_GTF_FILE`: StringTie GTF file
83 |   containing transcript expression estimates.
84 |
85 | ### Choose an MHC Binding Predictor
86 |
87 | You *must* choose an MHC binding predictor using one of the following values
88 | for the `--mhc-predictor` flag:
89 |
90 | * `netmhc`: Local [NetMHC](http://www.cbs.dtu.dk/cgi-bin/nph-sw_request?netMHC) predictor (Topiary will attempt to automatically detect whether NetMHC 3.x or 4.0 is available)
91 | * `netmhcpan`: Local [NetMHCpan](http://www.cbs.dtu.dk/cgi-bin/nph-sw_request?netMHCpan) predictor
92 | * `netmhciipan`: Local [NetMHCIIpan](http://www.cbs.dtu.dk/cgi-bin/nph-sw_request?netMHCIIpan) predictor
93 | * `netmhccons`: Local [NetMHCcons](http://www.cbs.dtu.dk/cgi-bin/nph-sw_request?netMHCcons)
94 | * `random`: Random IC50 values
95 | * `smm`: Local [SMM](http://www.mhc-pathway.net/smm) predictor
96 | * `smm-pmbec`: Local [SMM-PMBEC](http://www.mhc-pathway.net/smmpmbec) predictor
97 | * `netmhcpan-iedb`: Use NetMHCpan via the IEDB web API
98 | * `netmhccons-iedb`: Use NetMHCcons via the IEDB web API
99 | * `smm-iedb`: Use SMM via the IEDB web API
100 | * `smm-pmbec-iedb`: Use SMM-PMBEC via the IEDB web API
101 |
102 | ### MHC Alleles
103 | You must specify the alleles to perform binding prediction for using one of
104 | the following flags:
105 |
106 | * `--mhc-alleles-file MHC_ALLELES_FILE`: Text file containing one allele name per
107 | line
108 | * `--mhc-alleles MHC_ALLELES`: Comma separated list of allele names,
109 | e.g. "HLA-A02:01,HLA-B07:02"
110 |
111 | ### Peptide Length
112 |
113 | * `--mhc-epitope-lengths MHC_EPITOPE_LENGTHS`: comma separated list of integers
114 | specifying which peptide lengths to use for MHC binding prediction
115 |
116 | ### Binding Prediction Filtering
117 |
118 | * `--only-novel-epitopes`: Topiary will normally keep all predicted epitopes,
119 | even those which occur in a given self-ligandome or don't overlap a mutated region
120 | of a protein. Use this flag to drop any epitopes which don't contain mutations
121 | or that occur elsewhere in the self-ligandome.
122 | * `--ic50-cutoff IC50_CUTOFF`: Drop peptides with predicted IC50 nM greater
123 | than this value (typical value is 500.0)
124 | * `--percentile-cutoff PERCENTILE_CUTOFF`: Drop peptides whose percentile rank
125 | of predicted IC50 (among predictions for a particular allele) is above
126 | this threshold (lower values are stricter filters, typical value is 2.0)
127 |
128 | ### Misc
129 |
130 | * `--padding-around-mutation PADDING_AROUND_MUTATION`: Include more unmutated residues
131 | around the mutation (useful when not using `--only-novel-epitopes`)
132 | * `--self-filter-directory SELF_FILTER_DIRECTORY`: Directory of files named by MHC allele
133 | containing a self peptide ligandome (peptides which should be excluded from
134 | results)
135 | * `--skip-variant-errors`: If a particular mutation causes an exception to be raised
136 | during annotation, you can skip it using this flag.
137 |
138 |
--------------------------------------------------------------------------------
/RELEASING.md:
--------------------------------------------------------------------------------
1 | # Releasing Topiary
2 |
3 | This document explains what to do once your [Pull Request](https://www.atlassian.com/git/tutorials/making-a-pull-request/) has been reviewed and all final changes applied. Now you're ready to merge your branch into master and release it to the world:
4 |
5 | 1. Bump the [version](http://semver.org/) in `topiary/__init__.py`, as part of the PR you want to release.
6 | 2. Merge your branch into master.
7 | 3. After the Topiary unit tests complete successfully on Travis then the latest version
8 | of the code (with the version specified above) will be pushed to [PyPI](https://pypi.python.org/pypi) automatically. If you're curious about how automatic deployment is achieved, see our [Travis configuration](https://github.com/hammerlab/topiary/blob/master/.travis.yml#L58).
9 |
--------------------------------------------------------------------------------
/deploy.sh:
--------------------------------------------------------------------------------
1 | ./lint.sh && \
2 | ./test.sh && \
3 | python3 -m pip install --upgrade build && \
4 | python3 -m pip install --upgrade twine && \
5 | rm -rf dist && \
6 | python3 -m build && \
7 | git --version && \
8 | python3 -m twine upload dist/* && \
9 | git tag "$(python3 -c 'import topiary; print(topiary.__version__)')" && \
10 | git push --tags
11 |
12 |
--------------------------------------------------------------------------------
/develop.sh:
--------------------------------------------------------------------------------
1 | set -e
2 |
3 | uv pip install -e .
4 |
--------------------------------------------------------------------------------
/lint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -o errexit
3 |
4 |
5 | # disabling several categories of errors due to false positives in pylint,
6 | # see these issues:
7 | # - https://bitbucket.org/logilab/pylint/issues/701/false-positives-with-not-an-iterable-and
8 | # - https://bitbucket.org/logilab/pylint/issues/58
9 |
10 | find topiary/ -name '*.py' \
11 | | xargs pylint \
12 | --errors-only \
13 | --disable=unsubscriptable-object,not-an-iterable,no-member,invalid-unary-operand-type
14 |
15 | echo 'Passes pylint check'
16 |
--------------------------------------------------------------------------------
/pylintrc:
--------------------------------------------------------------------------------
1 | [TYPECHECK]
2 | # Without ignoring this, we get errors like:
3 | # E:249,20: Module 'numpy' has no 'nan' member (no-member)
4 | ignored-modules = numpy
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy>=1.7
2 | pandas>=0.13.1
3 | mhctools>=1.3.0
4 | varcode>=0.3.17
5 | pylint>=1.4.4
6 | nose>=1.3.6
7 | gtfparse>=0.0.4
8 | mhcnames
9 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017-2018. Mount Sinai School of Medicine
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from __future__ import print_function
16 | import os
17 | import re
18 |
19 | from setuptools import setup, find_packages
20 |
21 | readme_dir = os.path.dirname(__file__)
22 | readme_path = os.path.join(readme_dir, 'README.md')
23 |
24 | try:
25 | with open(readme_path, 'r') as f:
26 | readme_markdown = f.read()
27 | except:
28 | readme_markdown = ""
29 |
30 | try:
31 | import pypandoc
32 | readme_restructured = pypandoc.convert(readme_markdown, to='rst', format='md')
33 | except:
34 | readme_restructured = readme_markdown
35 | print(
36 | "Conversion of long_description from MD to reStructuredText failed...")
37 |
38 | with open('topiary/__init__.py', 'r') as f:
39 | version = re.search(
40 | r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]',
41 | f.read(),
42 | re.MULTILINE).group(1)
43 |
44 | if not version:
45 | raise RuntimeError('Cannot find version information')
46 |
47 | if __name__ == '__main__':
48 | setup(
49 | name='topiary',
50 | version=version,
51 | description="Predict cancer epitopes from cancer sequence data",
52 | author="Alex Rubinsteyn, Tavi Nathanson",
53 | author_email="alex.rubinsteyn@gmail.com",
54 | url="https://github.com/hammerlab/topiary",
55 | license="http://www.apache.org/licenses/LICENSE-2.0.html",
56 | classifiers=[
57 | 'Development Status :: 3 - Alpha',
58 | 'Environment :: Console',
59 | 'Operating System :: OS Independent',
60 | 'Intended Audience :: Science/Research',
61 | 'License :: OSI Approved :: Apache Software License',
62 | 'Programming Language :: Python',
63 | 'Topic :: Scientific/Engineering :: Bio-Informatics',
64 | ],
65 | install_requires=[
66 | 'numpy >=1.7, <2.0',
67 | 'pandas >=0.13.1',
68 | 'mhctools >= 1.3.0',
69 | 'varcode >=0.3.17',
70 | 'nose >=1.3.6',
71 | 'gtfparse >=0.0.4',
72 | 'mhcnames',
73 | ],
74 | long_description=readme_restructured,
75 | packages=find_packages(exclude="test"),
76 | entry_points={
77 | 'console_scripts': [
78 | 'topiary = topiary.cli.script:main'
79 | ]
80 | }
81 | )
82 |
--------------------------------------------------------------------------------
/test.sh:
--------------------------------------------------------------------------------
1 | pytest --cov=topiary/ --cov-report=term-missing test
2 |
3 |
--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openvax/topiary/4ce5ed007a7a19d9666ba9f20cfcf5dfe745a4e3/test/__init__.py
--------------------------------------------------------------------------------
/test/data.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2015. Mount Sinai School of Medicine
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """
16 | Helper functions and shared datasets for tests
17 | """
18 |
19 |
20 | from __future__ import print_function, division, absolute_import
21 | import os
22 |
23 | from varcode import Variant, VariantCollection
24 | from pyensembl import ensembl_grch38
25 |
def data_path(name):
    """
    Build the path to a file bundled in this package's test data directory
    (``<directory containing this module>/data``).

    ``name`` should be given relative to that data directory.
    """
    test_dir = os.path.dirname(__file__)
    return os.path.join(test_dir, "data", name)
32 |
# BRAF variant coordinates from COSMIC entry:
# http://cancer.sanger.ac.uk/cosmic/mutation/overview?id=476
braf_V600E_variant = Variant(
    contig=7,
    start=140753336,
    ref="A",
    alt="T",
    ensembl=ensembl_grch38)

# TP53 variant coordinates from COSMIC entry:
# http://cancer.sanger.ac.uk/cosmic/mutation/overview?id=10656
tp53_R248W_variant = Variant(
    contig=17,
    start=7674221,
    ref="G",
    alt="A",
    ensembl=ensembl_grch38)

cancer_test_variants = VariantCollection(
    [braf_V600E_variant, tp53_R248W_variant])

# Flatten the gene and transcript IDs touched by any test variant into sets.
cancer_test_variant_gene_ids = set()
for _variant in cancer_test_variants:
    cancer_test_variant_gene_ids.update(_variant.gene_ids)

cancer_test_variant_transcript_ids = set()
for _variant in cancer_test_variants:
    cancer_test_variant_transcript_ids.update(_variant.transcript_ids)
57 |
--------------------------------------------------------------------------------
/test/data/B16-StringTie-chr1-subset.gtf:
--------------------------------------------------------------------------------
1 | # StringTie version 1.1.2
2 | 1 StringTie transcript 4492457 4493604 1000 - . gene_id "STRG.2"; transcript_id "STRG.2.2"; reference_id "ENSMUST00000192505"; ref_gene_id "ENSMUSG00000025902"; ref_gene_name "Sox17"; cov "1.168215"; FPKM "0.125126"; TPM "0.255858";
3 | 1 StringTie exon 4492457 4493604 1000 - . gene_id "STRG.2"; transcript_id "STRG.2.2"; exon_number "1"; reference_id "ENSMUST00000192505"; ref_gene_id "ENSMUSG00000025902"; ref_gene_name "Sox17"; cov "1.168215";
4 | 1 StringTie transcript 4492465 4493735 1000 - . gene_id "STRG.2"; transcript_id "STRG.2.1"; reference_id "ENSMUST00000191939"; ref_gene_id "ENSMUSG00000025902"; ref_gene_name "Sox17"; cov "6.349273"; FPKM "0.680062"; TPM "1.390592";
5 | 1 StringTie exon 4492465 4492668 1000 - . gene_id "STRG.2"; transcript_id "STRG.2.1"; exon_number "1"; reference_id "ENSMUST00000191939"; ref_gene_id "ENSMUSG00000025902"; ref_gene_name "Sox17"; cov "20.261032";
6 | 1 StringTie exon 4493100 4493735 1000 - . gene_id "STRG.2"; transcript_id "STRG.2.1"; exon_number "2"; reference_id "ENSMUST00000191939"; ref_gene_id "ENSMUSG00000025902"; ref_gene_name "Sox17"; cov "1.887011";
7 | 1 StringTie transcript 4687934 4689403 1000 - . gene_id "STRG.1"; transcript_id "STRG.1.1"; reference_id "ENSMUST00000182774"; ref_gene_id "ENSMUSG00000098104"; ref_gene_name "Gm6085"; cov "0.504422"; FPKM "0.054028"; TPM "0.110476";
8 | 1 StringTie exon 4687934 4689403 1000 - . gene_id "STRG.1"; transcript_id "STRG.1.1"; exon_number "1"; reference_id "ENSMUST00000182774"; ref_gene_id "ENSMUSG00000098104"; ref_gene_name "Gm6085"; cov "0.504422";
--------------------------------------------------------------------------------
/test/data/genes.fpkm_tracking:
--------------------------------------------------------------------------------
1 | tracking_id class_code nearest_ref_id gene_id gene_short_name tss_id locus length coverage FPKM FPKM_conf_lo FPKM_conf_hi FPKM_status
2 | ENSG00000240361 - - ENSG00000240361 OR4G11P - chr1:62947-63887 - - 0 0 0 OK
3 | ENSG00000268020 - - ENSG00000268020 AL627309.1 - chr1:53048-54936 - - 0 0 0 OK
4 | ENSG00000186092 - - ENSG00000186092 OR4F5 - chr1:69090-70008 - - 0 0 0 OK
5 | CUFF.1 - - CUFF.1 FAM138A - chr1:34553-36081 - - 0.0222016 0 0.0614304 OK
6 | CUFF.2 - - CUFF.2 DDX11L1 - chr1:11868-14412 - - 0 0 0.0497629 OK
7 | CUFF.3 - - CUFF.3 MIR1302-10 - chr1:29553-31109 - - 0 0 0.154007 OK
8 | CUFF.4 - - CUFF.4 WASH7P - chr1:14362-29806 - - 10.3844 9.63011 11.2268 OK
9 | ENSG00000269308 - - ENSG00000269308 AL645608.2 - chr1:818042-819983 - - 0 0 0 OK
10 | CUFF.5 - - CUFF.5 - - chr1:841474-842801 - - 0.172295 0.113069 0.231522 OK
11 | CUFF.6 - - CUFF.6 - - chr1:1-20 - - 0.172295 0.113069 0.231522 FAIL
12 | CUFF.7 - - CUFF.7 - - chr2:1-20 - - 0.172295 0.113069 0.231522 LOWDATA
13 | CUFF.8 - - CUFF.8 - - chr3:1-20 - - 0.172295 0.113069 0.231522 HIDATA
14 |
--------------------------------------------------------------------------------
/test/data/isoforms.fpkm_tracking:
--------------------------------------------------------------------------------
1 | tracking_id class_code nearest_ref_id gene_id gene_short_name tss_id locus length coverage FPKM FPKM_conf_lo FPKM_conf_hi FPKM_status
2 | ENST00000492842 - - ENSG00000240361 OR4G11P - chr1:62947-63887 940 0 0 0 0 OK
3 | ENST00000594647 - - ENSG00000268020 AL627309.1 - chr1:53048-54936 126 0 0 0 0 OK
4 | ENST00000335137 - - ENSG00000186092 OR4F5 - chr1:69090-70008 918 0 0 0 0 OK
5 | ENST00000417324 - - CUFF.1 FAM138A - chr1:34553-36081 1187 0 0 0 0.0120385 OK
6 | ENST00000461467 - - CUFF.1 FAM138A - chr1:35244-36073 590 0.621469 0.0222016 0 0.0484398 OK
7 | ENST00000456328 - - CUFF.2 DDX11L1 - chr1:11868-14409 1657 0 0 0 0.0129358 LOWDATA
8 | ENST00000515242 - - CUFF.2 DDX11L1 - chr1:11871-14412 1653 0 0 0 0.00864472 LOWDATA
9 | ENST00000518655 - - CUFF.2 DDX11L1 - chr1:11873-14409 1483 0 0 0 0.00963569 OK
10 | ENST00000450305 - - CUFF.2 DDX11L1 - chr1:12009-13670 632 0 0 0 0.0226103 LOWDATA
11 | CUFF.7604.1 - - CUFF.7604 - - chr2:45395607-45402815 1004 4.73445 0.194033 0.113862 0.27754 OK
12 | ENST00000496445 - - CUFF.38259 VTI1A - chr10:114207021-114298405 853 0 0 0 0 FAIL
--------------------------------------------------------------------------------
/test/data/tiny_test_ligandome_dir/A0201:
--------------------------------------------------------------------------------
1 | SIINFKEL
2 | QQQQQQQQ
3 |
--------------------------------------------------------------------------------
/test/data/tiny_test_ligandome_dir/HLA-B0704:
--------------------------------------------------------------------------------
1 | RRRRRRRRR
--------------------------------------------------------------------------------
/test/test_args_outputs.py:
--------------------------------------------------------------------------------
1 | from topiary.cli.args import arg_parser
2 | from topiary.cli.outputs import write_outputs
3 | import tempfile
4 | import pandas as pd
5 | from nose.tools import eq_
6 |
7 |
def test_write_outputs():
    """
    End-to-end check of write_outputs: the CSV written to --output-csv
    should contain only the subsetted and renamed columns.
    """
    import os

    # delete=False so write_outputs can reopen the file by name; the
    # previous version never removed the file, leaking one temp file per
    # test run. Clean it up explicitly in the finally block below.
    f = tempfile.NamedTemporaryFile(mode="r+", delete=False)
    try:
        df = pd.DataFrame({
            "x": [1, 2, 3],
            "y": [10, 20, 30]
        })
        args = arg_parser.parse_args([
            "--output-csv", f.name,
            "--subset-output-columns", "x",
            "--rename-output-column", "x", "X",
            "--mhc-predictor", "random",
            "--mhc-alleles", "A0201",
        ])

        write_outputs(
            df,
            args,
            print_df_before_filtering=True,
            print_df_after_filtering=True)
        print("File: %s" % f.name)
        # "#" is the index column name written by write_outputs
        df_from_file = pd.read_csv(f.name, index_col="#")

        df_expected = pd.DataFrame({
            "X": [1, 2, 3]})
        print(df_from_file)
        eq_(len(df_expected), len(df_from_file))
        assert (df_expected == df_from_file).all().all()
    finally:
        f.close()
        os.remove(f.name)
36 |
--------------------------------------------------------------------------------
/test/test_cli_protein_changes.py:
--------------------------------------------------------------------------------
1 | from nose.tools import eq_
2 | from topiary.cli.protein_changes import protein_change_effects_from_args
3 | from topiary.cli.args import create_arg_parser
4 |
# Shared parser for the tests below: only variant/genome-related arguments
# are needed, so the MHC, RNA and output argument groups are disabled.
arg_parser = create_arg_parser(
    mhc=False,
    rna=False,
    output=False)
9 |
def test_protein_change_effects_from_args_substitutions():
    """A well-formed substitution (EGFR T790M) yields exactly one effect."""
    args = arg_parser.parse_args([
        "--protein-change", "EGFR", "T790M",
        "--genome", "grch37",
    ])
    effects = protein_change_effects_from_args(args)
    assert len(effects) == 1
    effect = effects[0]
    assert effect.aa_ref == "T"
    # offsets are 0-based, so residue 790 has offset 789
    assert effect.aa_mutation_start_offset == 789
    assert effect.aa_alt == "M"
    assert effect.transcript.name == "EGFR-001"


def test_protein_change_effects_from_args_malformed_missing_ref():
    """A change without a reference residue ("790M") yields no effects."""
    args = arg_parser.parse_args([
        "--protein-change", "EGFR", "790M",
        "--genome", "grch37"])
    assert len(protein_change_effects_from_args(args)) == 0


def test_protein_change_effects_from_args_malformed_missing_alt():
    """A change without an alternate residue ("T790") yields no effects."""
    args = arg_parser.parse_args([
        "--protein-change", "EGFR", "T790",
        "--genome", "grch37"])
    assert len(protein_change_effects_from_args(args)) == 0


def test_protein_change_effects_from_args_multiple_effects():
    """Two --protein-change options should produce two effects."""
    args = arg_parser.parse_args([
        "--protein-change", "EGFR", "T790M",
        "--protein-change", "KRAS", "G10D",
        "--genome", "grch37"])
    effects = protein_change_effects_from_args(args)
    print(effects)
    assert len(effects) == 2
50 |
--------------------------------------------------------------------------------
/test/test_contains_mutant_residues.py:
--------------------------------------------------------------------------------
1 | from nose.tools import eq_
2 | from topiary import contains_mutant_residues
3 |
def _overlaps_mutation(mutation_start, mutation_end):
    """
    Call contains_mutant_residues for a 9-residue peptide starting at
    protein offset 10 (the fixture shared by every case below).
    """
    return contains_mutant_residues(
        peptide_start_in_protein=10,
        peptide_length=9,
        mutation_start_in_protein=mutation_start,
        mutation_end_in_protein=mutation_end)


def test_contains_mutant_residues_before():
    # mutation interval ends before the peptide starts
    assert _overlaps_mutation(5, 6) == False


def test_contains_mutant_residues_after():
    # mutation interval begins after the peptide ends
    assert _overlaps_mutation(25, 26) == False


def test_contains_mutant_residues_inside():
    # mutation interval falls within the peptide
    assert _overlaps_mutation(12, 13) == True


def test_contains_mutant_residues_deletion_before_beginning():
    # peptide only contains the residue *after* the mutation
    # so it still looks like it's wildtype
    assert _overlaps_mutation(10, 10) == False


def test_contains_mutant_residues_deletion_at_beginning():
    # peptide contains mutation before *and* after mutation so
    # it should count as having a mutant juxtaposition of residues
    assert _overlaps_mutation(11, 11) == True


def test_contains_mutant_residues_deletion_after_end():
    # peptide only contains the residue *before* the mutation
    # so it still looks like it's wildtype
    assert _overlaps_mutation(19, 19) == False


def test_contains_mutant_residues_deletion_at_end():
    # peptide contains mutation before *and* after mutation so
    # it should count as having a mutant juxtaposition of residues
    assert _overlaps_mutation(18, 18) == True
76 |
--------------------------------------------------------------------------------
/test/test_dataframe.py:
--------------------------------------------------------------------------------
1 |
2 | from mhctools import NetMHC
3 | from topiary import TopiaryPredictor
4 | from .data import cancer_test_variants
5 |
# Allele names in several spellings (bare, star-separated, fully
# HLA-qualified) — presumably to exercise allele-name normalization.
alleles = [
    'A02:01',
    'B*07:02',
    'HLA-C*07:02',
]

mhc_model = NetMHC(
    alleles=alleles,
    default_peptide_lengths=[8, 9, 10])

# Placeholder FPKM expression value assigned to every gene/transcript below.
DEFAULT_FPKM = 1.0
17 |
def test_epitopes_to_dataframe_transcript_expression():
    """Transcript FPKMs given to the predictor appear in the output frame."""
    predictor = TopiaryPredictor(
        mhc_model=mhc_model,
        only_novel_epitopes=False)
    transcript_ids = [
        transcript_id
        for variant in cancer_test_variants
        for transcript_id in variant.transcript_ids
    ]
    df = predictor.predict_from_variants(
        variants=cancer_test_variants,
        transcript_expression_dict=dict.fromkeys(transcript_ids, DEFAULT_FPKM))

    assert "transcript_expression" in df.columns, \
        "transcript_expression missing from %s" % (df.columns,)
    assert (df["transcript_expression"] == DEFAULT_FPKM).all(), \
        "Invalid FPKM values in DataFrame transcript_expression column"


def test_epitopes_to_dataframe_gene_expression():
    """Gene FPKMs given to the predictor appear in the output frame."""
    predictor = TopiaryPredictor(
        mhc_model=mhc_model,
        only_novel_epitopes=False)
    gene_ids = [
        gene_id
        for variant in cancer_test_variants
        for gene_id in variant.gene_ids
    ]
    df = predictor.predict_from_variants(
        variants=cancer_test_variants,
        gene_expression_dict=dict.fromkeys(gene_ids, DEFAULT_FPKM))

    assert "gene_expression" in df.columns, \
        "gene_expression missing from %s" % (df.columns,)
    assert (df["gene_expression"] == DEFAULT_FPKM).all(), \
        "Invalid FPKM values in DataFrame gene_expression column"
52 |
--------------------------------------------------------------------------------
/test/test_effect_expression_filters.py:
--------------------------------------------------------------------------------
1 |
2 | from .data import (
3 | cancer_test_variants,
4 | cancer_test_variant_gene_ids,
5 | cancer_test_variant_transcript_ids
6 | )
7 | from topiary.filters import apply_effect_expression_filters
8 |
# Effects predicted for every cancer test variant; shared by all tests below.
cancer_test_effects = cancer_test_variants.effects()

# Placeholder FPKM expression value used for every gene/transcript.
DEFAULT_FPKM = 1.0

# associate every gene ID with 1.0 FPKM
gene_expression_dict = {
    gene_id: DEFAULT_FPKM
    for gene_id in cancer_test_variant_gene_ids
}

# associate every transcript with 1.0 FPKM
transcript_expression_dict = {
    transcript_id: DEFAULT_FPKM
    for transcript_id in cancer_test_variant_transcript_ids
}
24 |
25 |
def _filter_effects(**kwargs):
    """Apply expression filters to the shared cancer test effects."""
    return apply_effect_expression_filters(cancer_test_effects, **kwargs)


def test_apply_effect_gene_expression_below_threshold():
    # threshold above every gene's FPKM: everything is dropped
    filtered = _filter_effects(
        gene_expression_dict=gene_expression_dict,
        gene_expression_threshold=2 * DEFAULT_FPKM,
        transcript_expression_dict=None,
        transcript_expression_threshold=None)
    assert len(filtered) == 0, \
        "All variants should have been filtered out but got: %s" % (filtered,)


def test_apply_effect_gene_expression_above_threshold():
    # threshold below every gene's FPKM: nothing is dropped
    filtered = _filter_effects(
        gene_expression_dict=gene_expression_dict,
        gene_expression_threshold=0.5 * DEFAULT_FPKM,
        transcript_expression_dict=None,
        transcript_expression_threshold=None)
    assert len(filtered) == len(cancer_test_effects), \
        "Expected %s effects but got %s" % (len(
            cancer_test_effects), len(filtered))


def test_apply_effect_gene_expression_equal_threshold():
    # expect genes with expression at threshold to NOT get filtered
    filtered = _filter_effects(
        gene_expression_dict=gene_expression_dict,
        gene_expression_threshold=DEFAULT_FPKM,
        transcript_expression_dict=None,
        transcript_expression_threshold=None)
    assert len(filtered) == len(cancer_test_effects), \
        "Expected %s effects but got %s" % (len(
            cancer_test_effects), len(filtered))


def test_apply_effect_transcript_expression_below_threshold():
    # threshold above every transcript's FPKM: everything is dropped
    filtered = _filter_effects(
        gene_expression_dict=None,
        gene_expression_threshold=None,
        transcript_expression_dict=transcript_expression_dict,
        transcript_expression_threshold=2 * DEFAULT_FPKM)
    assert len(filtered) == 0, \
        "All effects should have been filtered out but got: %s" % (filtered,)


def test_apply_effect_transcript_expression_above_threshold():
    # threshold below every transcript's FPKM: nothing is dropped
    filtered = _filter_effects(
        gene_expression_dict=None,
        gene_expression_threshold=None,
        transcript_expression_dict=transcript_expression_dict,
        transcript_expression_threshold=0.5 * DEFAULT_FPKM)
    assert len(filtered) == len(cancer_test_effects), \
        "Expected %s effects but got %s" % (
            len(cancer_test_effects), len(filtered))


def test_apply_effect_transcript_expression_equal_threshold():
    # expect transcripts with expression at threshold to NOT be filtered
    filtered = _filter_effects(
        gene_expression_dict=None,
        gene_expression_threshold=None,
        transcript_expression_dict=transcript_expression_dict,
        transcript_expression_threshold=DEFAULT_FPKM)
    assert len(filtered) == len(cancer_test_effects), \
        "Expected %s effects but got %s" % (
            len(cancer_test_effects), len(filtered))
91 |
--------------------------------------------------------------------------------
/test/test_epitopes_from_commandline_args.py:
--------------------------------------------------------------------------------
1 | from nose.tools import eq_
2 |
3 | from topiary.cli.args import arg_parser, predict_epitopes_from_args
4 |
5 | from .data import cancer_test_variants
6 |
7 |
def test_cancer_epitopes_from_args():
    """Predict epitopes via command-line args built from the test variants."""
    epitope_lengths = [9, 10]
    alleles = ["HLA-A*02:01", "C0701"]
    args_list = [
        "--mhc-predictor", "netmhc",
        "--mhc-epitope-lengths", ",".join(str(x) for x in epitope_lengths),
        "--mhc-alleles", ",".join(alleles),
        "--genome", "GRCh38",
        "--only-novel-epitopes",
    ]
    for variant in cancer_test_variants:
        args_list.extend([
            "--variant",
            str(variant.contig),
            str(variant.start),
            variant.ref,
            variant.alt,
        ])

    parsed_args = arg_parser.parse_args(args_list)
    epitope_predictions = predict_epitopes_from_args(parsed_args)
    # each epitope length contributes (length * #variants * #alleles) peptides
    expected_number_of_epitopes = sum(
        epitope_length * len(cancer_test_variants) * len(alleles)
        for epitope_length in epitope_lengths)
    eq_(len(epitope_predictions), expected_number_of_epitopes)
31 |
--------------------------------------------------------------------------------
/test/test_load_cufflinks_fpkm.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2015. Mount Sinai School of Medicine
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """
16 | test_cufflinks : Test that we can correctly load Cufflinks tracking files which
17 | contain the estimated expression levels of genes and isoforms (computed from
18 | RNA-Seq reads).
19 | """
20 |
21 |
22 | from __future__ import print_function, division, absolute_import
23 |
24 | from topiary.rna import load_cufflinks_dataframe
25 |
26 | from nose.tools import eq_
27 |
28 | from .data import data_path
29 |
def _load_tracking_ids(filename, drop_novel):
    """
    Load a Cufflinks FPKM tracking file from the test data directory,
    dropping LOWDATA/HIDATA/FAIL rows, and return the set of surviving IDs.
    """
    df = load_cufflinks_dataframe(
        data_path(filename),
        drop_lowdata=True,
        drop_hidata=True,
        drop_failed=True,
        drop_novel=drop_novel)
    return set(df.id)


def test_load_cufflinks_genes():
    expected_gene_ids = {
        "ENSG00000240361",
        "ENSG00000268020",
        "ENSG00000186092",
        "ENSG00000269308",
        "CUFF.1",
        "CUFF.2",
        "CUFF.3",
        "CUFF.4",
        "CUFF.5"
    }
    gene_ids = _load_tracking_ids("genes.fpkm_tracking", drop_novel=False)
    eq_(gene_ids, expected_gene_ids)


def test_load_cufflinks_genes_drop_novel():
    # novel (CUFF.*) gene IDs should be excluded
    expected_gene_ids = {
        "ENSG00000240361",
        "ENSG00000268020",
        "ENSG00000186092",
        "ENSG00000269308",
    }
    gene_ids = _load_tracking_ids("genes.fpkm_tracking", drop_novel=True)
    eq_(gene_ids, expected_gene_ids)


def test_load_cufflinks_isoforms():
    expected_transcript_ids = {
        "ENST00000492842",
        "ENST00000594647",
        "ENST00000335137",
        "ENST00000417324",
        "ENST00000461467",
        "ENST00000518655",
        "CUFF.7604.1",
    }
    transcript_ids = _load_tracking_ids(
        "isoforms.fpkm_tracking", drop_novel=False)
    eq_(transcript_ids, expected_transcript_ids)


def test_load_cufflinks_isoforms_drop_novel():
    # novel (CUFF.*) transcript IDs should be excluded
    expected_transcript_ids = {
        "ENST00000492842",
        "ENST00000594647",
        "ENST00000335137",
        "ENST00000417324",
        "ENST00000461467",
        "ENST00000518655",
    }
    transcript_ids = _load_tracking_ids(
        "isoforms.fpkm_tracking", drop_novel=True)
    eq_(transcript_ids, expected_transcript_ids)
104 |
--------------------------------------------------------------------------------
/test/test_load_stringtie_gtf_fpkm.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function, division, absolute_import
2 |
3 | from topiary.rna import load_transcript_fpkm_dict_from_gtf
4 |
5 | from nose.tools import eq_
6 |
7 | from .data import data_path
8 |
9 |
def test_load_stringtie_gtf_transcripts():
    """FPKM values from a StringTie GTF should round-trip into the dict."""
    expected_fpkms_dict = {
        "ENSMUST00000192505": 0.125126,
        "ENSMUST00000191939": 0.680062,
        "ENSMUST00000182774": 0.054028,
    }
    transcript_fpkms = load_transcript_fpkm_dict_from_gtf(
        data_path("B16-StringTie-chr1-subset.gtf"))
    # same transcripts...
    eq_(set(expected_fpkms_dict), set(transcript_fpkms.keys()))
    # ...and same FPKM values
    for transcript_id, fpkm in expected_fpkms_dict.items():
        eq_(fpkm, transcript_fpkms[transcript_id])
23 |
--------------------------------------------------------------------------------
/test/test_mutant_epitope_predictions_class1.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2015-2017. Mount Sinai School of Medicine
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | from __future__ import print_function, division, absolute_import
17 |
18 | from mhctools import NetMHCpan
19 | from nose.tools import eq_, raises
20 | from pyensembl import ensembl_grch37
21 | from topiary import TopiaryPredictor
22 | from varcode import Variant, VariantCollection
23 |
24 | # TODO: find out about these variants,
25 | # what do we expect from them? Are they SNVs?
variants = VariantCollection([
    Variant(
        contig=10,
        start=100018900,
        ref='C',
        alt='T',
        ensembl=ensembl_grch37),
    Variant(
        contig=11,
        start=32861682,
        ref='G',
        alt='A',
        ensembl=ensembl_grch37)])

# Allele names in a deliberate mix of spellings and capitalizations —
# presumably to exercise allele-name normalization in mhctools.
alleles = [
    'A02:01',
    'a0204',
    'B*07:02',
    'HLA-B14:02',
    'HLA-C*07:02',
    'hla-c07:01'
]

mhc_model = NetMHCpan(
    alleles=alleles,
    default_peptide_lengths=[9])
52 |
53 |
def test_epitope_prediction_without_padding():
    predictor = TopiaryPredictor(
        mhc_model=mhc_model,
        only_novel_epitopes=True)
    output_without_padding = predictor.predict_from_variants(variants=variants)
    # one prediction for each variant * number of alleles
    strong_binders = output_without_padding[
        output_without_padding.affinity <= 500]
    eq_(len(strong_binders), 5)
61 |
@raises(ValueError)
def test_epitope_prediction_with_invalid_padding():
    # padding of 7 is too small for the model's 9-mer peptides (appears to
    # require at least peptide_length - 1 = 8; see test_padding.py)
    TopiaryPredictor(
        mhc_model=mhc_model,
        padding_around_mutation=7).predict_from_variants(variants=variants)
67 |
68 |
@raises(ValueError)
def test_epitope_prediction_with_invalid_zero_padding():
    # Previously this duplicated the padding=7 case above; use 0 so the
    # zero-padding edge case named by this test is actually exercised
    # (0 is also below the minimum required for 9-mers).
    TopiaryPredictor(
        mhc_model=mhc_model,
        padding_around_mutation=0).predict_from_variants(variants=variants)
74 |
75 |
def test_epitope_prediction_with_valid_padding():
    output_with_padding = TopiaryPredictor(
        mhc_model=mhc_model,
        padding_around_mutation=8,
        only_novel_epitopes=True).predict_from_variants(variants=variants)
    # 6 alleles * 2 mutations * 9 distinct windows = 108
    eq_(len(output_with_padding), 108)
84 |
--------------------------------------------------------------------------------
/test/test_mutant_epitope_predictions_class2.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2015. Mount Sinai School of Medicine
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | from __future__ import print_function, division, absolute_import
17 |
18 | from mhctools import NetMHCIIpan
19 | from nose.tools import eq_
20 | from pyensembl import ensembl_grch37
21 | from topiary import TopiaryPredictor
22 | from varcode import Variant, VariantCollection
23 |
24 | # TODO: find out about these variants,
25 | # what do we expect from them? Are they SNVs?
variants = VariantCollection([
    Variant(
        contig=10,
        start=100018900,
        ref='C',
        alt='T',
        ensembl=ensembl_grch37),
    Variant(
        contig=11,
        start=32861682,
        ref='G',
        alt='A',
        ensembl=ensembl_grch37)])

# One class II allele pair in full alpha/beta notation and one in compact
# notation — presumably to exercise allele-name parsing.
alleles = [
    "HLA-DPA1*01:05/DPB1*100:01",
    "DRB10102"
]

mhc_model = NetMHCIIpan(
    alleles=alleles,
    default_peptide_lengths=[15, 16])
48 |
def test_netmhcii_pan_epitopes():
    predictor = TopiaryPredictor(
        mhc_model=mhc_model,
        only_novel_epitopes=True)
    epitope_predictions = predictor.predict_from_variants(variants=variants)

    # expect (15 + 16 mutant peptides) * (2 alleles) * 2 variants =
    # 124 total epitope predictions
    eq_(len(epitope_predictions), 124)

    unique_alleles = set(epitope_predictions.allele)
    assert len(unique_alleles) == 2, \
        "Expected 2 unique alleles, got %s" % (unique_alleles,)

    unique_lengths = set(epitope_predictions.peptide_length)
    assert unique_lengths == {15, 16}, \
        "Expected epitopes of length 15 and 16 but got lengths %s" % (unique_lengths,)
63 |
--------------------------------------------------------------------------------
/test/test_padding.py:
--------------------------------------------------------------------------------
1 | from nose.tools import eq_, assert_raises
2 | from topiary import check_padding_around_mutation
3 |
def test_default_padding():
    # with no explicit padding, expect one less than the largest epitope length
    padding = check_padding_around_mutation(None, [8, 9, 10])
    eq_(padding, 9)


def test_invalid_padding():
    # padding of 2 is insufficient for 9-mer epitopes
    with assert_raises(ValueError):
        check_padding_around_mutation(2, [9])
12 |
--------------------------------------------------------------------------------
/test/test_peptide_mutation_interval.py:
--------------------------------------------------------------------------------
1 | from nose.tools import eq_, assert_raises
2 | from topiary import peptide_mutation_interval
3 |
def _interval(mutation_start, mutation_end):
    """
    Call peptide_mutation_interval for a 9-residue peptide starting at
    protein offset 10 (the fixture shared by every case below).
    """
    return peptide_mutation_interval(
        peptide_start_in_protein=10,
        peptide_length=9,
        mutation_start_in_protein=mutation_start,
        mutation_end_in_protein=mutation_end)


def test_peptide_mutation_interval_middle():
    # mutation fully inside the peptide
    start, end = _interval(11, 12)
    eq_(start, 1)
    eq_(end, 2)


def test_peptide_mutation_interval_start():
    # mutation begins before the peptide: clipped to peptide offset 0
    start, end = _interval(7, 12)
    eq_(start, 0)
    eq_(end, 2)


def test_peptide_mutation_interval_end():
    # mutation runs past the peptide: clipped to the peptide length
    start, end = _interval(18, 20)
    eq_(start, 8)
    eq_(end, 9)


def test_peptide_mutation_interval_deletion():
    # empty mutation interval (deletion) maps to an empty peptide interval
    start, end = _interval(15, 15)
    eq_(start, 5)
    eq_(end, 5)


def test_peptide_mutation_interval_no_overlap_before():
    # mutation entirely before the peptide should raise
    with assert_raises(ValueError):
        _interval(5, 6)


def test_peptide_mutation_interval_no_overlap_after():
    # mutation entirely after the peptide should raise
    with assert_raises(ValueError):
        _interval(25, 26)
57 |
--------------------------------------------------------------------------------
/test/test_rna_helpers.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from topiary.rna.cufflinks import parse_locus_column
3 | from nose.tools import eq_
4 |
5 |
def _parse(locus_strings):
    """Run parse_locus_column over a pandas Series of locus strings."""
    return parse_locus_column(pd.Series(locus_strings))


def test_parse_locus_column_with_chr():
    """
    test_parse_locus_column_with_chr: Test that 'chr' prefix from
    chromosome names gets correctly dropped
    """
    chromosomes, starts, ends = _parse(["chr1:10-20", "chrX:30-40"])
    eq_(list(chromosomes), ["1", "X"])
    eq_(list(starts), [10, 30])
    eq_(list(ends), [20, 40])


def test_parse_locus_column_without_chr():
    """
    test_parse_locus_column_without_chr: Test that chromosome names can be
    parsed without 'chr' prefix
    """
    chromosomes, starts, ends = _parse(["1:10-20", "X:30-40"])
    eq_(list(chromosomes), ["1", "X"])
    eq_(list(starts), [10, 30])
    eq_(list(ends), [20, 40])
30 |
--------------------------------------------------------------------------------
/test/test_variant_expression_filters.py:
--------------------------------------------------------------------------------
1 |
2 | from topiary.filters import apply_variant_expression_filters
3 |
4 | from .data import (
5 | cancer_test_variants,
6 | cancer_test_variant_gene_ids,
7 | cancer_test_variant_transcript_ids,
8 | )
9 |
DEFAULT_FPKM = 1.0

# associate every gene ID with 1.0 FPKM
gene_expression_dict = dict.fromkeys(
    cancer_test_variant_gene_ids, DEFAULT_FPKM)

# associate every transcript with 1.0 FPKM
transcript_expression_dict = dict.fromkeys(
    cancer_test_variant_transcript_ids, DEFAULT_FPKM)
23 |
def test_apply_variant_gene_expression_below_threshold():
    # Every gene is expressed at exactly DEFAULT_FPKM, so doubling the
    # threshold should remove every variant.
    threshold = 2 * DEFAULT_FPKM
    filtered = apply_variant_expression_filters(
        cancer_test_variants,
        gene_expression_dict=gene_expression_dict,
        gene_expression_threshold=threshold,
        transcript_expression_dict=None,
        transcript_expression_threshold=None)
    assert len(filtered) == 0, \
        "All variants should have been filtered out but got: %s" % (filtered,)
33 |
def test_apply_variant_gene_expression_above_threshold():
    # A threshold of half DEFAULT_FPKM is below every gene's expression,
    # so all variants should survive the filter.
    threshold = 0.5 * DEFAULT_FPKM
    filtered = apply_variant_expression_filters(
        cancer_test_variants,
        gene_expression_dict=gene_expression_dict,
        gene_expression_threshold=threshold,
        transcript_expression_dict=None,
        transcript_expression_threshold=None)
    assert len(filtered) == len(cancer_test_variants), \
        "Expected %s variants but got %s" % (len(cancer_test_variants), len(filtered))
43 |
def test_apply_variant_transcript_expression_below_threshold():
    # Every transcript is expressed at DEFAULT_FPKM, so doubling the
    # threshold should remove every variant.
    threshold = 2 * DEFAULT_FPKM
    filtered = apply_variant_expression_filters(
        cancer_test_variants,
        gene_expression_dict=None,
        gene_expression_threshold=None,
        transcript_expression_dict=transcript_expression_dict,
        transcript_expression_threshold=threshold)
    assert len(filtered) == 0, \
        "All variants should have been filtered out but got: %s" % (filtered,)
53 |
def test_apply_variant_transcript_expression_above_threshold():
    # A threshold of half DEFAULT_FPKM is below every transcript's
    # expression, so all variants should survive the filter.
    threshold = 0.5 * DEFAULT_FPKM
    filtered = apply_variant_expression_filters(
        cancer_test_variants,
        gene_expression_dict=None,
        gene_expression_threshold=None,
        transcript_expression_dict=transcript_expression_dict,
        transcript_expression_threshold=threshold)
    assert len(filtered) == len(cancer_test_variants), \
        "Expected %s variants but got %s" % (len(cancer_test_variants), len(filtered))
63 |
--------------------------------------------------------------------------------
/topiary/__init__.py:
--------------------------------------------------------------------------------
1 | from .predictor import TopiaryPredictor
2 | from .sequence_helpers import (
3 | check_padding_around_mutation,
4 | peptide_mutation_interval,
5 | contains_mutant_residues,
6 | protein_subsequences_around_mutations,
7 | )
8 |
# Version string for the topiary package.
__version__ = '3.0.6'

# Names exported by `from topiary import *`; mirrors the imports above.
__all__ = [
    "TopiaryPredictor",
    "contains_mutant_residues",
    "check_padding_around_mutation",
    "peptide_mutation_interval",
    "protein_subsequences_around_mutations",
]
18 |
--------------------------------------------------------------------------------
/topiary/cli/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017. Mount Sinai School of Medicine
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/topiary/cli/args.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017. Mount Sinai School of Medicine
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """
16 | Common commandline arguments used by scripts
17 | """
18 |
19 | from __future__ import print_function, division, absolute_import
20 |
21 | from argparse import ArgumentParser
22 | from mhctools.cli import add_mhc_args, mhc_binding_predictor_from_args
23 | from varcode.cli import add_variant_args, variant_collection_from_args
24 |
25 | from .filtering import add_filter_args
26 | from .rna import (
27 | add_rna_args,
28 | rna_gene_expression_dict_from_args,
29 | rna_transcript_expression_dict_from_args,
30 | )
31 | from .sequence import add_sequence_args
32 | from .errors import add_error_args
33 | from .outputs import add_output_args
34 | from .protein_changes import add_protein_change_args
35 | from ..predictor import TopiaryPredictor
36 |
def create_arg_parser(
        rna=True,
        mhc=True,
        variants=True,
        protein_changes=True,
        filters=True,
        sequence_options=True,
        error_options=True,
        output=True):
    """
    Build an ArgumentParser containing the requested Topiary option groups.

    Each boolean flag controls whether the corresponding argument group
    is attached to the returned parser.
    """
    parser = ArgumentParser()
    # (enabled?, function that attaches its argument group), kept in the
    # same registration order the groups have always used
    group_adders = [
        (rna, add_rna_args),
        (mhc, add_mhc_args),
        (variants, add_variant_args),
        (protein_changes, add_protein_change_args),
        (filters, add_filter_args),
        (sequence_options, add_sequence_args),
        (error_options, add_error_args),
        (output, add_output_args),
    ]
    for enabled, add_group in group_adders:
        if enabled:
            add_group(parser)
    return parser

# keeping global instance for backwards compatibility with existing code
arg_parser = create_arg_parser()
67 |
def predict_epitopes_from_args(args):
    """
    Returns an epitope collection from the given commandline arguments.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed commandline arguments for Topiary
    """
    # assemble predictor inputs from the parsed arguments
    mhc_model = mhc_binding_predictor_from_args(args)
    variants = variant_collection_from_args(args)
    gene_fpkm_dict = rna_gene_expression_dict_from_args(args)
    transcript_fpkm_dict = rna_transcript_expression_dict_from_args(args)

    predictor = TopiaryPredictor(
        mhc_model=mhc_model,
        padding_around_mutation=args.padding_around_mutation,
        ic50_cutoff=args.ic50_cutoff,
        percentile_cutoff=args.percentile_cutoff,
        min_transcript_expression=args.rna_min_transcript_expression,
        min_gene_expression=args.rna_min_gene_expression,
        only_novel_epitopes=args.only_novel_epitopes,
        raise_on_error=not args.skip_variant_errors)
    return predictor.predict_from_variants(
        variants=variants,
        transcript_expression_dict=transcript_fpkm_dict,
        gene_expression_dict=gene_fpkm_dict)
95 |
--------------------------------------------------------------------------------
/topiary/cli/errors.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017. Mount Sinai School of Medicine
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """
16 | Commandline arguments related to error handling
17 | """
18 |
19 | from __future__ import print_function, division, absolute_import
20 |
def add_error_args(arg_parser):
    """
    Attach an argument group with error-handling options to the given
    parser and return the created group.
    """
    group = arg_parser.add_argument_group(
        title="Errors",
        description="Options for error handling")

    group.add_argument(
        "--skip-variant-errors",
        action="store_true",
        default=False,
        help="Skip variants which cause runtime errors of any kind")

    return group
33 |
--------------------------------------------------------------------------------
/topiary/cli/filtering.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017. Mount Sinai School of Medicine
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | """
17 | Commandline arguments related to epitope filtering
18 | """
19 |
20 | from __future__ import print_function, division, absolute_import
21 |
def add_filter_args(arg_parser):
    """
    Attach an argument group with epitope-filtering options to the given
    parser and return the created group.
    """
    group = arg_parser.add_argument_group(
        title="Filtering Options",
        description="Criteria for removing epitopes from results")

    group.add_argument(
        "--ic50-cutoff",
        type=float,
        default=None,
        help="Drop epitopes with predicted IC50 nM affinity above this value")

    group.add_argument(
        "--percentile-cutoff",
        type=float,
        default=None,
        help="Drop epitopes with predicted IC50 percentile rank above this value")

    group.add_argument(
        "--only-novel-epitopes",
        action="store_true",
        default=False,
        help=(
            "Drop epitopes which do not contain mutated residues or occur "
            "in the self-ligandome."))

    group.add_argument(
        "--wildtype-ligandome-directory",
        help=(
            "Directory of 'self' ligand peptide sets, in files named "
            "by allele (e.g. 'A0201'). Any predicted mutant epitope which "
            "is in the files associated with the given alleles is treated as "
            "wildtype (non-mutated)."))
    return group
55 |
--------------------------------------------------------------------------------
/topiary/cli/outputs.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017. Mount Sinai School of Medicine
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """
16 | Common commandline arguments for output files
17 | """
18 |
19 | from __future__ import print_function, division, absolute_import
20 |
21 | import logging
22 |
def add_output_args(arg_parser):
    """
    Attach an argument group with output-file options to the given parser
    and return the created group.
    """
    group = arg_parser.add_argument_group(
        title="Output",
        description="How and where to write results")

    group.add_argument(
        "--output-csv",
        default=None,
        help="Path to output CSV file")

    group.add_argument(
        "--output-html",
        default=None,
        help="Path to output HTML file")

    group.add_argument(
        "--output-csv-sep",
        default=",",
        help="Separator for CSV file")

    # zero or more column names used to subset the output DataFrame
    group.add_argument(
        "--subset-output-columns",
        nargs="*")

    # may be repeated; each occurrence renames one column
    group.add_argument(
        "--rename-output-column",
        nargs=2,
        action="append",
        help=(
            "Rename original column (first parameter) to new"
            " name (second parameter)"))

    group.add_argument(
        "--print-columns",
        action="store_true",
        default=False,
        help="Print columns before writing data to file(s)")

    return group
62 |
def write_outputs(
        df,
        args,
        print_df_before_filtering=False,
        print_df_after_filtering=False):
    """
    Optionally subset and rename the columns of a results DataFrame, then
    write it to CSV and/or HTML files as requested by commandline options.

    Parameters
    ----------
    df : pandas.DataFrame
        Table of results to write; never mutated by this function.

    args : argparse.Namespace
        Parsed arguments; reads subset_output_columns,
        rename_output_column, print_columns, output_csv,
        output_csv_sep, and output_html.

    print_df_before_filtering : bool
        Print the DataFrame before any column subsetting/renaming.

    print_df_after_filtering : bool
        Print the DataFrame after column subsetting/renaming.
    """
    if print_df_before_filtering:
        print(df)

    if args.subset_output_columns:
        subset_columns = []
        for column in args.subset_output_columns:
            if column not in df.columns:
                # logging.warn is deprecated (removed in Python 3.13);
                # use logging.warning instead
                logging.warning(
                    "Invalid column name '%s', available: %s" % (
                        column, list(df.columns)))
            else:
                subset_columns.append(column)
        df = df[subset_columns]

    if args.rename_output_column:
        for (old_name, new_name) in args.rename_output_column:
            if old_name not in df.columns:
                logging.warning(
                    "Can't rename column '%s' since it doesn't exist, available: %s" % (
                        old_name, list(df.columns)))
            else:
                # rename without inplace=True so the caller's DataFrame is
                # never mutated as a side effect of writing outputs
                df = df.rename(columns={old_name: new_name})

    if print_df_after_filtering:
        print(df)

    if args.print_columns:
        print("Columns:")
        for column in df.columns:
            print("-- %s" % column)

    if args.output_csv:
        print("Saving %s..." % args.output_csv)
        df.to_csv(
            args.output_csv,
            index=True,
            index_label="#",
            sep=args.output_csv_sep)

    if args.output_html:
        print("Saving %s..." % args.output_html)
        df.to_html(args.output_html, index=True)
110 |
--------------------------------------------------------------------------------
/topiary/cli/protein_changes.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2018. Mount Sinai School of Medicine
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import logging
16 | from pyensembl import ensembl_grch38
17 | from varcode import EffectCollection
18 | from varcode.effects import Substitution
19 | from varcode.reference import infer_genome
20 | import re
21 |
def add_protein_change_args(arg_parser):
    """
    Attach an argument group for specifying protein changes directly
    (without associated genomic variants) and return the created group.
    """
    protein_change_group = arg_parser.add_argument_group(
        title="Protein Changes",
        description="Input protein changes without associated genomic variants")

    # repeated flag collecting [gene_name, change] pairs, e.g. EGFR T790M
    protein_change_group.add_argument(
        "--protein-change",
        default=[],
        nargs=2,
        action="append",
        help="Protein modification without genomic variant (e.g. EGFR T790M)")

    # return the group rather than the parser, for consistency with the
    # other add_*_args helpers in this package
    return protein_change_group
35 |
def genome_from_args(args):
    """
    Return the reference genome named by args.genome, falling back to
    Ensembl GRCh38 when no genome was specified.
    """
    # NOTE(review): the fallback is always GRCh38 regardless of input
    # files; confirm this matches how the variant files were aligned
    return infer_genome(args.genome) if args.genome else ensembl_grch38
43 |
def transcript_sort_key(transcript):
    """
    Sort key for transcripts: longest protein sequence first, then longest
    transcript sequence, then alphabetical transcript name.

    The lengths are negated (instead of sorting with reverse=True) so that
    the name component still sorts ascending, placing TP53-001 before
    TP53-002 among ties.
    """
    protein_length = len(transcript.protein_sequence)
    transcript_length = len(transcript.sequence)
    return (-protein_length, -transcript_length, transcript.name)
57 |
def best_transcript(transcripts):
    """
    Given a set of coding transcripts, choose the one with the longest
    protein sequence and in cases of ties use the following tie-breaking
    criteria:
        - transcript sequence (including UTRs)
        - transcript name (so TP53-001 should come before TP53-202)

    Raises AssertionError if the collection is empty.
    """
    assert len(transcripts) > 0
    # min() with the same key returns the identical element as
    # sorted(...)[0] but in O(n) without building an intermediate list
    return min(transcripts, key=transcript_sort_key)
69 |
def protein_change_effects_from_args(args):
    """
    Build an EffectCollection of Substitution effects from the
    (gene name, protein change) pairs supplied via --protein-change.

    For each pair like ("EGFR", "T790M"), finds protein-coding transcripts
    of the gene whose reference amino acids match the stated change and
    keeps the best one (longest protein, see best_transcript). Unparseable
    change strings and unknown gene names are skipped with a warning.
    """
    genome = genome_from_args(args)
    valid_gene_names = set(genome.gene_names())
    # e.g. "T790M" -> ref="T", base1_pos="790", alt="M"
    substitution_regex = re.compile("([A-Z]+)([0-9]+)([A-Z]+)")
    effects = []
    for gene_name, protein_change_string in args.protein_change:
        match_obj = substitution_regex.match(protein_change_string)
        if match_obj is None:
            # logging.warn is deprecated (removed in Python 3.13);
            # use logging.warning instead
            logging.warning(
                "Unable to parse protein modification: '%s'" % protein_change_string)
            continue

        ref, base1_pos, alt = match_obj.groups()

        base1_pos = int(base1_pos)

        if gene_name not in valid_gene_names:
            logging.warning("Invalid gene name '%s' in protein modification: '%s'" % (
                gene_name, protein_change_string))
            continue

        candidate_transcripts = []
        for candidate_gene in genome.genes_by_name(gene_name):
            for candidate_transcript in candidate_gene.transcripts:
                if not candidate_transcript.is_protein_coding:
                    continue
                protein_sequence = candidate_transcript.protein_sequence
                if protein_sequence is None:
                    continue
                if len(protein_sequence) < (base1_pos + len(ref) - 1):
                    # protein sequence too short for this modification
                    # e.g. EGFR T790M can't happen in an EGFR transcript
                    # with only 789 amino acids
                    continue

                seq_at_pos = protein_sequence[base1_pos - 1: base1_pos + len(ref) - 1]
                if seq_at_pos != ref:
                    # if this transcript doesn't have the same reference amino
                    # acids as the change then skip it and use a different
                    # transcript
                    continue
                candidate_transcripts.append(candidate_transcript)
        if len(candidate_transcripts) > 0:
            transcript = best_transcript(candidate_transcripts)
            effects.append(Substitution(
                variant=None,
                transcript=transcript,
                aa_ref=ref,
                aa_alt=alt,
                aa_mutation_start_offset=base1_pos - 1))
    return EffectCollection(effects)
121 |
--------------------------------------------------------------------------------
/topiary/cli/rna.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017. Mount Sinai School of Medicine
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """
16 | Common commandline arguments for filtering by gene/transcript expression
17 | """
18 |
19 | from __future__ import print_function, division, absolute_import
20 |
21 | from ..rna import (
22 | load_cufflinks_fpkm_dict,
23 | load_transcript_fpkm_dict_from_gtf
24 | )
25 |
def add_rna_args(arg_parser):
    """
    Attach an argument group with RNA-Seq expression options to the given
    parser and return the created group.
    """
    rna_group = arg_parser.add_argument_group(
        title="RNA-Seq",
        description="Transcript and gene abundance quantification")

    rna_group.add_argument(
        "--rna-transcript-fpkm-tracking-file",
        help="".join([
            "Cufflinks tracking file (FPKM estimates for Ensembl transcripts). ",
            "Used both for expression filtering and selecting the most abundant ",
            "transcript to use for determining a mutant protein sequence."]))

    rna_group.add_argument(
        "--rna-transcript-fpkm-gtf-file",
        help="".join([
            # trailing space added: fragments previously joined as
            # "transcripts.Used" with no separator
            "GTF file containing FPKM estimates for Ensembl transcripts. ",
            "Used both for expression filtering and selecting the most abundant ",
            "transcript to use for determining a mutant protein sequence."]))

    rna_group.add_argument(
        "--rna-min-transcript-expression",
        help="Minimum FPKM for transcript expression",
        default=0.0,
        type=float)

    rna_group.add_argument(
        "--rna-gene-fpkm-tracking-file",
        help="Cufflinks tracking file (FPKM estimates for Ensembl genes)",
        required=False)

    rna_group.add_argument(
        "--rna-min-gene-expression",
        help="Minimum FPKM for gene expression",
        default=0.0,
        type=float)

    return rna_group
63 |
def rna_gene_expression_dict_from_args(args):
    """
    Returns a dictionary mapping Ensembl gene IDs to FPKM expression values
    or None if no Cufflinks gene tracking file was specified in the
    commandline arguments.

    (Unlike the transcript-level helper below, gene expression can only be
    loaded from a Cufflinks tracking file, not a StringTie GTF.)
    """
    if not args.rna_gene_fpkm_tracking_file:
        return None
    return load_cufflinks_fpkm_dict(args.rna_gene_fpkm_tracking_file)
74 |
def rna_transcript_expression_dict_from_args(args):
    """
    Returns a dictionary mapping Ensembl transcript IDs to FPKM expression
    values or None if neither Cufflinks tracking file nor StringTie GTF file
    were specified.
    """
    # a Cufflinks tracking file takes precedence over a StringTie GTF
    tracking_path = args.rna_transcript_fpkm_tracking_file
    gtf_path = args.rna_transcript_fpkm_gtf_file
    if tracking_path:
        return load_cufflinks_fpkm_dict(tracking_path)
    if gtf_path:
        return load_transcript_fpkm_dict_from_gtf(gtf_path)
    return None
88 |
--------------------------------------------------------------------------------
/topiary/cli/script.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017. Mount Sinai School of Medicine
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """
16 | Script to generate epitope predictions from somatic cancer variants
17 | and (optionally) tumor RNA-seq data.
18 |
19 | Example usage:
20 | topiary \
21 | --mhc-predictor netmhcpan
22 | --mhc-alleles-file HLA.txt
23 | --vcf somatic.vcf
24 | --rna-gene-fpkm-file genes.fpkm_tracking
25 | --rna-transcript-fpkm-file isoforms.fpkm_tracking
26 | --filter-ic50 500
27 | --filter-percentile 2
28 | --output results.csv
29 | """
30 |
31 | from __future__ import print_function, division, absolute_import
32 |
33 | import sys
34 |
35 | from .args import arg_parser, predict_epitopes_from_args
36 |
37 | from .outputs import write_outputs
38 |
39 |
def parse_args(args_list=None):
    """
    Parse Topiary commandline arguments, defaulting to sys.argv when no
    explicit argument list is supplied.
    """
    return arg_parser.parse_args(
        sys.argv[1:] if args_list is None else args_list)
44 |
def main(args_list=None):
    """
    Script entry-point to predict neo-epitopes from genomic variants using
    Topiary.
    """
    args = parse_args(args_list)
    print("Topiary commandline arguments:")
    print(args)
    # run predictions and then write CSV/HTML outputs per the arguments
    result_df = predict_epitopes_from_args(args)
    write_outputs(result_df, args)
    print("Total count: %d" % len(result_df))
56 |
--------------------------------------------------------------------------------
/topiary/cli/sequence.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017. Mount Sinai School of Medicine
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | """
17 | Commandline arguments related to translated variant protein sequences.
18 | """
19 |
20 | from __future__ import print_function, division, absolute_import
21 |
def add_sequence_args(arg_parser):
    """
    Attach an argument group with mutant-protein-sequence options to the
    given parser and return the created group.
    """
    sequence_group = arg_parser.add_argument_group(
        title="Protein Sequence Options",
        description="Parameters related to the mutant protein sequence")

    sequence_group.add_argument(
        "--padding-around-mutation",
        default=None,
        type=int,
        help=(
            # separating spaces added: fragments previously joined as
            # "mutation.Default" with no space between sentences
            "How many extra amino acids to include on either side of a mutation. "
            "Default is determined by epitope lengths but can be overridden to "
            "predict wildtype epitopes in a larger context around a mutant residue."))

    return sequence_group
38 |
--------------------------------------------------------------------------------
/topiary/filters.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017. Mount Sinai School of Medicine
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """
16 | Helper functions for filtering variants, effects, and epitope predictions
17 | """
18 |
19 | from __future__ import print_function, division, absolute_import
20 | import logging
21 |
22 | from varcode import NonsilentCodingMutation
23 |
def apply_filter(
        filter_fn,
        collection,
        result_fn=None,
        filter_name="",
        collection_name=""):
    """
    Apply filter to effect collection and print number of dropped elements

    Parameters
    ----------
    filter_fn : callable
        Predicate returning True for elements that should be kept.

    collection : iterable
        Elements to filter; must support len() and iteration.

    result_fn : callable, optional
        Constructor applied to the filtered list of elements; defaults to
        the collection's own class.

    filter_name : str, optional
        Label for this filter used in the log message.

    collection_name : str, optional
        Name of the collection used in the log message; defaults to the
        collection's class name.
    """
    n_before = len(collection)
    filtered = [x for x in collection if filter_fn(x)]
    n_after = len(filtered)
    if not collection_name:
        collection_name = collection.__class__.__name__
    logging.info(
        "%s filtering removed %d/%d entries of %s",
        filter_name,
        (n_before - n_after),
        n_before,
        collection_name)
    return result_fn(filtered) if result_fn else collection.__class__(filtered)
48 |
def filter_silent_and_noncoding_effects(effects):
    """
    Keep only variant effects which result in modified proteins.

    Parameters
    ----------
    effects : varcode.EffectCollection
    """
    def is_nonsilent_coding(effect):
        # effects that modify the protein are subclasses of
        # NonsilentCodingMutation
        return isinstance(effect, NonsilentCodingMutation)

    return apply_filter(
        filter_fn=is_nonsilent_coding,
        collection=effects,
        result_fn=effects.clone_with_new_elements,
        filter_name="Silent mutation")
62 |
63 |
def apply_variant_expression_filters(
        variants,
        gene_expression_dict,
        gene_expression_threshold,
        transcript_expression_dict,
        transcript_expression_threshold):
    """
    Filter a collection of variants by gene and transcript expression thresholds

    Parameters
    ----------
    variants : varcode.VariantCollection

    gene_expression_dict : dict
        Maps gene IDs to expression values; gene filtering is skipped
        when this is falsy.

    gene_expression_threshold : float

    transcript_expression_dict : dict
        Maps transcript IDs to expression values; transcript filtering
        is skipped when this is falsy.

    transcript_expression_threshold : float
    """
    if gene_expression_dict:
        def gene_expressed(variant):
            # keep a variant if any gene it touches meets the threshold;
            # missing genes count as 0.0 expression
            return any(
                gene_expression_dict.get(gene_id, 0.0) >= gene_expression_threshold
                for gene_id in variant.gene_ids)

        variants = apply_filter(
            gene_expressed,
            variants,
            result_fn=variants.clone_with_new_elements,
            filter_name="Variant gene expression (min=%0.4f)" % gene_expression_threshold)

    if transcript_expression_dict:
        def transcript_expressed(variant):
            # keep a variant if any transcript it touches meets the
            # threshold; missing transcripts count as 0.0 expression
            return any(
                transcript_expression_dict.get(transcript_id, 0.0) >=
                transcript_expression_threshold
                for transcript_id in variant.transcript_ids)

        variants = apply_filter(
            transcript_expressed,
            variants,
            result_fn=variants.clone_with_new_elements,
            filter_name=(
                "Variant transcript expression (min=%0.4f)" % (
                    transcript_expression_threshold,)))
    return variants
108 |
def apply_effect_expression_filters(
        effects,
        gene_expression_dict,
        gene_expression_threshold,
        transcript_expression_dict,
        transcript_expression_threshold):
    """
    Filter collection of varcode effects by given gene
    and transcript expression thresholds.

    Parameters
    ----------
    effects : varcode.EffectCollection

    gene_expression_dict : dict
        Maps gene IDs to expression values; gene filtering is skipped
        when this is falsy.

    gene_expression_threshold : float

    transcript_expression_dict : dict
        Maps transcript IDs to expression values; transcript filtering
        is skipped when this is falsy.

    transcript_expression_threshold : float
    """
    if gene_expression_dict:
        def gene_expressed(effect):
            # missing genes count as 0.0 expression
            return (
                gene_expression_dict.get(effect.gene_id, 0.0) >=
                gene_expression_threshold)

        effects = apply_filter(
            gene_expressed,
            effects,
            result_fn=effects.clone_with_new_elements,
            filter_name="Effect gene expression (min = %0.4f)" % gene_expression_threshold)

    if transcript_expression_dict:
        def transcript_expressed(effect):
            # missing transcripts count as 0.0 expression
            return (
                transcript_expression_dict.get(effect.transcript_id, 0.0) >=
                transcript_expression_threshold)

        effects = apply_filter(
            transcript_expressed,
            effects,
            result_fn=effects.clone_with_new_elements,
            filter_name=(
                "Effect transcript expression (min=%0.4f)" % (
                    transcript_expression_threshold,)))
    return effects
152 |
--------------------------------------------------------------------------------
/topiary/predictor.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017. Mount Sinai School of Medicine
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from __future__ import print_function, division, absolute_import
16 |
17 | import logging
18 |
19 | from collections import OrderedDict
20 |
21 |
22 | from .filters import (
23 | apply_effect_expression_filters,
24 | apply_variant_expression_filters,
25 | filter_silent_and_noncoding_effects,
26 | )
27 | from .sequence_helpers import (
28 | protein_subsequences_around_mutations,
29 | check_padding_around_mutation,
30 | contains_mutant_residues,
31 | peptide_mutation_interval,
32 | )
33 |
class TopiaryPredictor(object):
    """
    Predicts peptide-MHC binding for candidate epitopes derived from
    genomic variants, protein effect annotations, or raw amino acid
    sequences, with optional expression-based filtering.
    """
    def __init__(
            self,
            mhc_model,
            padding_around_mutation=None,
            ic50_cutoff=None,
            percentile_cutoff=None,
            min_gene_expression=0.0,
            min_transcript_expression=0.0,
            only_novel_epitopes=False,
            raise_on_error=True):
        """
        Parameters
        ----------
        mhc_model : mhctools.BasePredictor
            Any instance of a peptide-MHC binding affinity predictor

        padding_around_mutation : int, optional
            How many residues surrounding a mutation to consider including in a
            candidate epitope. Default is the minimum size necessary for epitope
            length of the mhc model.

        ic50_cutoff : float, optional
            Maximum predicted IC50 value for a peptide to be considered a binder.

        percentile_cutoff : float, optional
            Maximum percentile rank of IC50 values for a peptide to be considered
            a binder.

        min_gene_expression : float, optional
            If gene expression values are provided, only keep effects on
            genes with expression above this threshold.

        min_transcript_expression : float, optional
            If transcript expression values are provided, only keep effects on
            transcripts with expression above this threshold.

        only_novel_epitopes : bool, optional
            If True, then drop peptides which don't contain a mutation.
            TODO: make this also check that peptide doesn't occur elsewhere in
            the reference ligandome

        raise_on_error : bool
            Raise an exception if error is encountered or skip
            the variant or peptide which generated the error.
        """
        self.mhc_model = mhc_model
        # if no padding was given, fall back to the minimum needed for the
        # model's default peptide lengths; raises if padding is too small
        self.padding_around_mutation = check_padding_around_mutation(
            given_padding=padding_around_mutation,
            epitope_lengths=self.mhc_model.default_peptide_lengths)
        self.ic50_cutoff = ic50_cutoff
        self.percentile_cutoff = percentile_cutoff
        self.min_transcript_expression = min_transcript_expression
        self.min_gene_expression = min_gene_expression
        self.only_novel_epitopes = only_novel_epitopes
        self.raise_on_error = raise_on_error

    def predict_from_named_sequences(
            self, name_to_sequence_dict):
        """
        Run the MHC model over subsequences of each named sequence.

        Parameters
        ----------
        name_to_sequence_dict : (str->str) dict
            Dictionary mapping sequence names to amino acid sequences

        Returns pandas.DataFrame with the following columns:
            - source_sequence_name
            - peptide
            - peptide_offset
            - peptide_length
            - allele
            - affinity
            - percentile_rank
            - prediction_method_name
        """
        df = self.mhc_model.predict_subsequences_dataframe(name_to_sequence_dict)
        # rename mhctools columns to topiary's naming convention
        return df.rename(
            columns={
                "length": "peptide_length",
                "offset": "peptide_offset"})

    def predict_from_sequences(self, sequences):
        """
        Predict MHC ligands for sub-sequences of each input sequence.

        Parameters
        ----------
        sequences : list of str
            Multiple amino acid sequences (without any names or IDs)

        Returns DataFrame with the following fields:
            - source_sequence
            - peptide
            - peptide_offset
            - peptide_length
            - allele
            - affinity
            - percentile_rank
            - prediction_method_name
        """
        # make each sequence its own unique ID
        sequence_dict = {
            seq: seq
            for seq in sequences
        }
        df = self.predict_from_named_sequences(sequence_dict)
        return df.rename(columns={"source_sequence_name": "source_sequence"})

    def predict_from_mutation_effects(
            self,
            effects,
            transcript_expression_dict=None,
            gene_expression_dict=None):
        """Given a Varcode.EffectCollection of predicted protein effects,
        return predicted epitopes around each mutation.

        Parameters
        ----------
        effects : Varcode.EffectCollection

        transcript_expression_dict : dict
            Dictionary mapping transcript IDs to RNA expression estimates. Used
            both for transcript expression filtering and for selecting the
            most abundant transcript for a particular variant. If omitted then
            transcript selection is done using priority of variant effects and
            transcript length.

        gene_expression_dict : dict, optional
            Dictionary mapping gene IDs to RNA expression estimates

        Returns DataFrame with the following columns:
            - variant
            - gene
            - gene_id
            - transcript_id
            - transcript_name
            - effect
            - effect_type
            - peptide
            - peptide_offset
            - peptide_length
            - allele
            - affinity
            - percentile_rank
            - prediction_method_name
            - contains_mutant_residues
            - mutation_start_in_peptide
            - mutation_end_in_peptide

        Optionally will also include the following columns if corresponding
        expression dictionary inputs are provided:
            - gene_expression
            - transcript_expression
        """

        # we only care about effects which impact the coding sequence of a
        # protein
        effects = filter_silent_and_noncoding_effects(effects)

        effects = apply_effect_expression_filters(
            effects,
            transcript_expression_dict=transcript_expression_dict,
            transcript_expression_threshold=self.min_transcript_expression,
            gene_expression_dict=gene_expression_dict,
            gene_expression_threshold=self.min_gene_expression)

        # group by variants, so that we end up with only one mutant
        # sequence per mutation
        variant_effect_groups = effects.groupby_variant()

        if len(variant_effect_groups) == 0:
            # logging.warn is a deprecated alias of logging.warning
            logging.warning("No candidates for MHC binding prediction")
            # NOTE(review): this returns an empty list even though the
            # docstring promises a DataFrame; callers that only check
            # len(result) are unaffected -- confirm before changing
            return []

        if transcript_expression_dict:
            # if expression data is available, then for each variant
            # keep the effect annotation for the most abundant transcript
            top_effects = [
                variant_effects.top_expression_effect(
                    transcript_expression_dict)
                for variant_effects in variant_effect_groups.values()
            ]
        else:
            # if no transcript abundance data is available, then
            # for each variant keep the effect with the most significant
            # predicted effect on the protein sequence, along with using
            # transcript/CDS length as a tie-breaker for effects with the same
            # priority.
            top_effects = [
                variant_effects.top_priority_effect()
                for variant_effects in variant_effect_groups.values()
            ]

        # 1) dictionary mapping varcode effect objects to subsequences
        #    around each mutation
        # 2) dictionary mapping varcode effect to start offset of subsequence
        #    within the full mutant protein sequence
        effect_to_subsequence_dict, effect_to_offset_dict = \
            protein_subsequences_around_mutations(
                effects=top_effects,
                padding_around_mutation=self.padding_around_mutation)

        # since we know that each set of variant effects has been
        # reduced to a single 'top priority' effect, we can uniquely
        # identify each variant sequence by its original genomic variant
        variant_string_to_effect_dict = {
            effect.variant.short_description: effect
            for effect in effect_to_subsequence_dict.keys()
        }
        variant_string_to_subsequence_dict = {
            effect.variant.short_description: subseq
            for (effect, subseq) in effect_to_subsequence_dict.items()
        }
        variant_string_to_offset_dict = {
            effect.variant.short_description: subseq_offset
            for (effect, subseq_offset) in effect_to_offset_dict.items()
        }
        df = self.predict_from_named_sequences(variant_string_to_subsequence_dict)
        logging.info("MHC predictor returned %d peptide binding predictions" % (
            len(df)))

        # since we used variant descriptions as the name of each sequence
        # let's rename that column to be more informative
        df = df.rename(columns={"source_sequence_name": "variant"})

        # adjust offset to be relative to start of protein, rather
        # than whatever subsequence we used for prediction
        def compute_peptide_offset_relative_to_protein(row):
            subsequence_offset = variant_string_to_offset_dict[row.variant]
            return row.peptide_offset + subsequence_offset

        df["peptide_offset"] = df.apply(
            compute_peptide_offset_relative_to_protein,
            axis=1)

        if self.ic50_cutoff:
            df = df[df.affinity <= self.ic50_cutoff]
            logging.info("Kept %d predictions after filtering affinity <= %f" % (
                len(df), self.ic50_cutoff))

        if self.percentile_cutoff:
            df = df[df.percentile_rank <= self.percentile_cutoff]
            logging.info("Kept %d predictions after filtering percentile <= %f" % (
                len(df), self.percentile_cutoff))

        # per-row annotation columns derived from each row's effect object,
        # built up in parallel lists and attached to the DataFrame at the end
        extra_columns = OrderedDict([
            ('gene', []),
            ('gene_id', []),
            ('transcript_id', []),
            ('transcript_name', []),
            ('effect', []),
            ('effect_type', []),
            ('contains_mutant_residues', []),
            ('mutation_start_in_peptide', []),
            ('mutation_end_in_peptide', []),
        ])
        if gene_expression_dict is not None:
            extra_columns["gene_expression"] = []
        if transcript_expression_dict is not None:
            extra_columns["transcript_expression"] = []

        for _, row in df.iterrows():
            effect = variant_string_to_effect_dict[row.variant]
            mutation_start_in_protein = effect.aa_mutation_start_offset
            mutation_end_in_protein = effect.aa_mutation_end_offset
            peptide_length = len(row.peptide)
            is_mutant = contains_mutant_residues(
                peptide_start_in_protein=row.peptide_offset,
                peptide_length=peptide_length,
                mutation_start_in_protein=mutation_start_in_protein,
                mutation_end_in_protein=mutation_end_in_protein)
            if is_mutant:
                mutation_start_in_peptide, mutation_end_in_peptide = peptide_mutation_interval(
                    peptide_start_in_protein=row.peptide_offset,
                    peptide_length=peptide_length,
                    mutation_start_in_protein=mutation_start_in_protein,
                    mutation_end_in_protein=mutation_end_in_protein)
            else:
                mutation_start_in_peptide = mutation_end_in_peptide = None

            extra_columns["gene"].append(effect.gene_name)
            gene_id = effect.gene_id
            extra_columns["gene_id"].append(gene_id)
            if gene_expression_dict is not None:
                extra_columns["gene_expression"].append(
                    gene_expression_dict.get(gene_id, 0.0))

            transcript_id = effect.transcript_id
            extra_columns["transcript_id"].append(transcript_id)
            extra_columns["transcript_name"].append(effect.transcript_name)
            if transcript_expression_dict is not None:
                extra_columns["transcript_expression"].append(
                    transcript_expression_dict.get(transcript_id, 0.0))

            extra_columns["effect"].append(effect.short_description)
            extra_columns["effect_type"].append(effect.__class__.__name__)

            extra_columns["contains_mutant_residues"].append(is_mutant)
            extra_columns["mutation_start_in_peptide"].append(mutation_start_in_peptide)
            extra_columns["mutation_end_in_peptide"].append(mutation_end_in_peptide)

        for col, values in extra_columns.items():
            df[col] = values

        # TODO: add extra boolean field
        #   novel = is_mutant | not_in_reference
        # Requires keeping a quick lookup structure for all peptides in
        # the reference proteome
        if self.only_novel_epitopes:
            df = df[df.contains_mutant_residues]

        return df

    def predict_from_variants(
            self,
            variants,
            transcript_expression_dict=None,
            gene_expression_dict=None):
        """
        Predict epitopes from a Variant collection, filtering options, and
        optional gene and transcript expression data.

        Parameters
        ----------
        variants : varcode.VariantCollection

        transcript_expression_dict : dict
            Maps from Ensembl transcript IDs to FPKM expression values.

        gene_expression_dict : dict, optional
            Maps from Ensembl gene IDs to FPKM expression values.

        Returns DataFrame with the following columns:
            - variant
            - gene
            - gene_id
            - transcript_id
            - transcript_name
            - effect
            - effect_type
            - peptide
            - peptide_offset
            - peptide_length
            - allele
            - affinity
            - percentile_rank
            - prediction_method_name
            - contains_mutant_residues
            - mutation_start_in_peptide
            - mutation_end_in_peptide

        Optionally will also include the following columns if corresponding
        expression dictionary inputs are provided:
            - gene_expression
            - transcript_expression
        """
        # pre-filter variants by checking if any of the genes or
        # transcripts they overlap have sufficient expression.
        # I'm tolerating the redundancy of this code since it's much cheaper
        # to filter a variant *before* trying to predict its impact/effect
        # on the protein sequence.
        variants = apply_variant_expression_filters(
            variants,
            transcript_expression_dict=transcript_expression_dict,
            transcript_expression_threshold=self.min_transcript_expression,
            gene_expression_dict=gene_expression_dict,
            gene_expression_threshold=self.min_gene_expression)

        effects = variants.effects(raise_on_error=self.raise_on_error)

        return self.predict_from_mutation_effects(
            effects=effects,
            transcript_expression_dict=transcript_expression_dict,
            gene_expression_dict=gene_expression_dict)
408 |
--------------------------------------------------------------------------------
/topiary/rna/__init__.py:
--------------------------------------------------------------------------------
1 | from .cufflinks import (
2 | load_cufflinks_dataframe,
3 | load_cufflinks_dict,
4 | load_cufflinks_fpkm_dict,
5 | )
6 | from .gtf import load_transcript_fpkm_dict_from_gtf
7 |
8 | __all__ = [
9 | "load_cufflinks_dataframe",
10 | "load_cufflinks_dict",
11 | "load_cufflinks_fpkm_dict",
12 | "load_transcript_fpkm_dict_from_gtf",
13 | ]
14 |
--------------------------------------------------------------------------------
/topiary/rna/common.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017. Mount Sinai School of Medicine
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from __future__ import print_function, division, absolute_import
16 |
17 | import re
18 |
def infer_delimiter(filename, comment_char="#", n_lines=3):
    """
    Given a file which contains data separated by one of the following:
        - commas
        - tabs
        - spaces
    Return the most likely separator by sniffing the first few lines
    of the file's contents.

    Parameters
    ----------
    filename : str

    comment_char : str, optional
        Lines starting with this prefix are skipped (default "#").

    n_lines : int, optional
        Number of data lines sampled for sniffing (default 3).

    Returns "\\t", ",", or the regex r"\\s+" (all usable as a pandas `sep`).
    Raises ValueError if fewer than n_lines data lines exist or no
    candidate splits every sampled line into the same number (>1) of fields.
    """
    lines = []
    with open(filename, "r") as f:
        for line in f:
            if line.startswith(comment_char):
                continue
            lines.append(line)
            if len(lines) >= n_lines:
                break
    if len(lines) < n_lines:
        raise ValueError(
            "Not enough lines in %s to infer delimiter" % filename)
    # r"\s+" must be a raw string: "\s" in a plain literal is an invalid
    # escape sequence (SyntaxWarning on Python >= 3.12)
    candidate_delimiters = ["\t", ",", r"\s+"]
    for candidate_delimiter in candidate_delimiters:
        counts = [len(re.split(candidate_delimiter, line)) for line in lines]
        first_line_count = counts[0]
        if all(c == first_line_count for c in counts) and first_line_count > 1:
            return candidate_delimiter
    raise ValueError("Could not determine delimiter for %s" % filename)
47 |
48 |
def check_required_columns(df, filename, required_columns):
    """
    Verify that every column named in required_columns exists in the
    dataframe df (loaded from filename), raising ValueError that names
    the first missing column otherwise.
    """
    available = set(df.columns)
    for column_name in required_columns:
        if column_name in available:
            continue
        raise ValueError("FPKM tracking file %s missing column '%s'" % (
            filename,
            column_name))
60 |
--------------------------------------------------------------------------------
/topiary/rna/cufflinks.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017. Mount Sinai School of Medicine
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from __future__ import print_function, division, absolute_import
16 |
17 | import logging
18 |
19 | import pandas as pd
20 | import numpy as np
21 |
22 | from .common import infer_delimiter, check_required_columns
23 |
def parse_locus_column(loci):
    """
    Split a pandas Series of locus strings like "chr1:132-394" (the "chr"
    prefix is optional) into three parallel Series: chromosome name (str),
    start position (int), end position (int).
    """
    # capture all characters before ':' (drop 'chr' if present)
    chromosomes = loci.str.extract(r"(?:chr)?([^:]*):.*", expand=False)
    # capture all characters after e.g. 'chr1:', which look like '132-394'
    ranges = loci.str.extract(r"(?:chr)?[^:]*:(.*)", expand=False)
    # raw strings for the patterns below: "\d" in a plain literal is an
    # invalid escape sequence (SyntaxWarning on Python >= 3.12)
    # capture all numbers before the dash
    starts = ranges.str.extract(r"(\d*)-\d*", expand=False).astype(int)
    # capture all numbers after the dash
    ends = ranges.str.extract(r"\d*-(\d*)", expand=False).astype(int)
    return chromosomes, starts, ends
34 |
35 |
# default column names from cufflinks tracking files
# for gene and isoform expression levels
STATUS_COLUMN = "FPKM_status"  # FPKM estimate status: OK/FAIL/LOWDATA/HIDATA
ID_COLUMN = "tracking_id"  # transcript or gene identifier
FPKM_COLUMN = "FPKM"  # expression estimate
LOCUS_COLUMN = "locus"  # genomic location string, e.g. "chr1:132-394"
GENE_NAMES_COLUMN = "gene_short_name"  # "-" or comma-separated gene names
43 |
44 |
def load_cufflinks_dataframe(
        filename,
        id_column=ID_COLUMN,
        fpkm_column=FPKM_COLUMN,
        status_column=STATUS_COLUMN,
        locus_column=LOCUS_COLUMN,
        gene_names_column=GENE_NAMES_COLUMN,
        drop_failed=True,
        drop_lowdata=False,
        drop_hidata=True,
        replace_hidata_fpkm_value=None,
        drop_nonchromosomal_loci=False,
        drop_novel=False,
        sep=None):
    """
    Loads a Cufflinks tracking file, which contains expression levels
    (in FPKM: Fragments Per Kilobase of transcript per Million fragments)
    for transcript isoforms or whole genes. These transcripts/genes may be
    previously known (in which case they have an Ensembl ID) or a novel
    assembly from the RNA-Seq data (in which case their IDs look like "CUFF.1")

    Parameters
    ----------

    filename : str
        Filename of tracking file e.g. "genes.tracking_fpkm"

    id_column : str, optional

    fpkm_column : str, optional

    status_column : str, optional
        Name of column which indicates the FPKM estimate status. The column
        name is typically "FPKM_status". Possible values contained within
        this column are OK, FAIL, LOWDATA, HIDATA.

    locus_column : str, optional

    gene_names_column : str, optional

    drop_failed : bool, optional
        Drop rows whose FPKM status is "FAIL" (default=True)

    drop_lowdata : bool, optional
        Drop rows whose FPKM status is "LOWDATA", meaning that Cufflinks thought
        there were too few reads to accurately estimate the FPKM (default=False)

    drop_hidata : bool, optional
        Drop rows whose FPKM status is "HIDATA", meaning that too many
        fragments aligned to a feature for Cufflinks to process
        (default=True)

    replace_hidata_fpkm_value : float, optional
        If drop_hidata=False, the HIDATA entries will still have an FPKM=0.0,
        this argument lets you replace the FPKM with some known constant.

    drop_nonchromosomal_loci : bool, optional
        Drop rows whose location isn't on a canonical chromosome
        i.e. doesn't start with "chr" (default=False)

    drop_novel : bool, optional
        Drop genes or isoforms that aren't found in Ensembl (default = False)

    sep : str, optional
        Separator between data fields in the FPKM tracking file
        (default is to infer whether the file uses comma or whitespace)

    Returns DataFrame with columns:
        id : str
        novel : bool
        fpkm : float
        chr : str
        start : int
        end : int
        gene_names : str list
    """
    if sep is None:
        sep = infer_delimiter(filename)

    df = pd.read_csv(filename, sep=sep, engine="c")

    required_columns = {
        status_column,
        locus_column,
        id_column,
        gene_names_column,
        fpkm_column
    }
    check_required_columns(df, filename, required_columns)

    # drop (or keep) rows according to their FPKM estimate status
    for flag, status_value in [
            (drop_failed, "FAIL"),
            (drop_lowdata, "LOWDATA"),
            (drop_hidata, "HIDATA")]:
        mask = df[status_column] == status_value
        mask_count = mask.sum()
        total_count = len(df)
        if flag and mask_count > 0:
            verb_str = "Dropping"
            df = df[~mask]
        else:
            verb_str = "Keeping"
        logging.info(
            "%s %d/%d entries from %s with status=%s",
            verb_str,
            mask_count,
            total_count,
            filename,
            status_value)

    if drop_nonchromosomal_loci:
        loci = df[locus_column]
        chromosomal_loci = loci.str.startswith("chr")
        n_dropped = (~chromosomal_loci).sum()
        if n_dropped > 0:
            logging.info("Dropping %d/%d non-chromosomal loci from %s" % (
                n_dropped, len(df), filename))
            df = df[chromosomal_loci]

    if replace_hidata_fpkm_value:
        hidata_mask = df[status_column] == "HIDATA"
        n_hidata = hidata_mask.sum()
        logging.info(
            "Setting FPKM=%s for %d/%d entries with status=HIDATA",
            replace_hidata_fpkm_value,
            n_hidata,
            len(df))
        # use .loc instead of chained indexing, which raises
        # SettingWithCopyWarning and silently fails to modify df under
        # pandas copy-on-write semantics
        df.loc[hidata_mask, fpkm_column] = replace_hidata_fpkm_value

    if len(df) == 0:
        raise ValueError("Empty FPKM tracking file: %s" % filename)

    ids = df[id_column]
    # Ensembl identifiers start with "ENS"; anything else is treated as
    # a novel assembly (e.g. "CUFF.1")
    known = ids.str.startswith("ENS")

    if known.sum() == 0:
        raise ValueError("No Ensembl IDs found in %s" % filename)

    if drop_novel:
        n_dropped = (~known).sum()
        if n_dropped > 0:
            logging.info(
                "Dropping %d/%d novel entries from %s",
                n_dropped,
                len(df),
                filename)
            df = df[known]
            known = np.ones(len(df), dtype='bool')

    chromosomes, starts, ends = parse_locus_column(df[locus_column])

    # gene names are given either as "-" or a comma separated list
    # e.g. "BRAF1,PFAM2"
    gene_names_strings = df[gene_names_column].copy()
    gene_names_strings[gene_names_strings == "-"] = ""
    # split each entry into a list of zero or more strings
    gene_names_lists = gene_names_strings.str.split(",")

    return pd.DataFrame({
        "id": df[id_column],
        "novel": ~known,
        "fpkm": df[fpkm_column],
        "chr": chromosomes,
        "start": starts,
        "end": ends,
        "gene_names": gene_names_lists
    })
213 |
214 |
def load_cufflinks_dict(*args, **kwargs):
    """
    Returns dictionary mapping feature identifier (either transcript or gene ID)
    to a DataFrame row with fields:
        id : str
        novel : bool
        fpkm : float
        chr : str
        start : int
        end : int
        gene_names : str list
    """
    result = {}
    for _, row in load_cufflinks_dataframe(*args, **kwargs).iterrows():
        result[row.id] = row
    return result
232 |
233 |
def load_cufflinks_fpkm_dict(*args, **kwargs):
    """
    Returns dictionary mapping feature identifier (either transcript or gene ID)
    to FPKM expression value.
    """
    result = {}
    for _, row in load_cufflinks_dataframe(*args, **kwargs).iterrows():
        result[row.id] = row.fpkm
    return result
244 |
--------------------------------------------------------------------------------
/topiary/rna/gtf.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017-2018. Mount Sinai School of Medicine
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from __future__ import print_function, division, absolute_import
16 |
17 | import logging
18 |
19 | import gtfparse
20 |
21 |
def _get_gtf_column(column_name, gtf_path, df):
    """
    Return the named column of the dataframe as a list, or raise a
    ValueError describing the missing column and which columns are
    available in the GTF file.
    """
    if column_name not in df.columns:
        raise ValueError(
            "Missing '%s' in columns of %s, available: %s" % (
                column_name,
                gtf_path,
                list(df.columns)))
    return list(df[column_name])
36 |
def load_transcript_fpkm_dict_from_gtf(
        gtf_path,
        transcript_id_column_name="reference_id",
        fpkm_column_name="FPKM",
        feature_column_name="feature"):
    """
    Load a GTF file generated by StringTie which contains transcript-level
    quantification of abundance. Returns a dictionary mapping Ensembl
    IDs of transcripts to FPKM values. Rows whose feature isn't
    "transcript", or whose transcript ID is missing or empty, are skipped.
    """
    df = gtfparse.read_gtf(
        gtf_path,
        column_converters={fpkm_column_name: float})
    transcript_ids = _get_gtf_column(transcript_id_column_name, gtf_path, df)
    fpkm_values = _get_gtf_column(fpkm_column_name, gtf_path, df)
    features = _get_gtf_column(feature_column_name, gtf_path, df)
    logging.info("Loaded %d rows from %s" % (len(transcript_ids), gtf_path))
    logging.info("Found %s transcript entries" % sum(
        feature == "transcript" for feature in features))
    result = {}
    for (transcript_id, fpkm, feature) in zip(
            transcript_ids, fpkm_values, features):
        if feature != "transcript":
            continue
        # skip rows without a reference transcript ID (None or empty)
        if not transcript_id:
            continue
        result[transcript_id] = float(fpkm)
    logging.info("Keeping %d transcript rows with reference IDs" % (
        len(result),))
    return result
69 |
--------------------------------------------------------------------------------
/topiary/sequence_helpers.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017. Mount Sinai School of Medicine
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from __future__ import print_function, division, absolute_import
16 |
17 | from typechecks import require_integer
18 |
def protein_subsequences_around_mutations(effects, padding_around_mutation):
    """
    For each effect with a predicted mutant protein sequence, extract the
    subsequence spanning padding_around_mutation residues on either side
    of the mutated interval, truncated at the sequence boundaries and at
    the first stop codon '*'. Effects without a mutant protein sequence
    (silent or unpredictable) are skipped.

    Returns two dictionaries keyed by effect: one mapping to the extracted
    subsequence, the other to the subsequence's start offset within the
    full mutant protein.
    """
    subsequences = {}
    start_offsets = {}
    for effect in effects:
        full_sequence = effect.mutant_protein_sequence
        # silent or unpredictable effects lack a mutant protein sequence
        if not full_sequence:
            continue
        start = max(
            0,
            effect.aa_mutation_start_offset - padding_around_mutation)
        # some pseudogenes have stop codons in the reference sequence;
        # trim so the subsequence never includes the '*' character
        stop_index = full_sequence.find("*")
        if stop_index < 0:
            stop_index = len(full_sequence)
        end = min(
            stop_index,
            effect.aa_mutation_end_offset + padding_around_mutation)
        subsequences[effect] = full_sequence[start:end]
        start_offsets[effect] = start
    return subsequences, start_offsets
51 |
def check_padding_around_mutation(given_padding, epitope_lengths):
    """
    Validate (or default) the number of residues to include around a
    mutation. If the user doesn't provide any padding we need
    to at least include enough of the surrounding non-mutated
    residues to construct candidate epitopes of the specified lengths.

    Parameters
    ----------
    given_padding : int or None
        User-supplied padding, or None to use the minimum required.

    epitope_lengths : list of int

    Returns the validated padding. Raises ValueError if the given padding
    is smaller than the minimum required for the epitope lengths.
    """
    min_required_padding = max(epitope_lengths) - 1
    # explicit None check: the previous truthiness test silently replaced
    # an invalid padding of 0 with the default instead of rejecting it
    if given_padding is None:
        return min_required_padding
    require_integer(given_padding, "Padding around mutation")
    if given_padding < min_required_padding:
        raise ValueError(
            "Padding around mutation %d cannot be less than %d "
            "for epitope lengths %s" % (
                given_padding,
                min_required_padding,
                epitope_lengths))
    return given_padding
71 |
def contains_mutant_residues(
        peptide_start_in_protein,
        peptide_length,
        mutation_start_in_protein,
        mutation_end_in_protein):
    """
    True if the peptide (starting at peptide_start_in_protein, with the
    given length) overlaps the mutated residue interval
    [mutation_start_in_protein, mutation_end_in_protein) of the protein.
    """
    last_peptide_position = peptide_start_in_protein + peptide_length - 1
    if peptide_start_in_protein >= mutation_end_in_protein:
        # peptide begins at or after the end of the mutated interval
        return False
    return last_peptide_position >= mutation_start_in_protein
82 |
def peptide_mutation_interval(
        peptide_start_in_protein,
        peptide_length,
        mutation_start_in_protein,
        mutation_end_in_protein):
    """
    Half-open interval of mutated residues within the peptide, computed by
    translating the protein-level mutation interval into peptide
    coordinates and clamping it to [0, peptide_length].

    Parameters
    ----------
    peptide_start_in_protein : int
        Position of the first peptide residue within the protein
        (starting from 0)

    peptide_length : int

    mutation_start_in_protein : int
        Position of the first mutated residue starting from 0. In the case
        of a deletion, the position where the first residue had been.

    mutation_end_in_protein : int
        End of the mutated interval in the mutant protein (treated as
        half-open). In the case of a deletion, this is equal to
        mutation_start_in_protein.

    Raises ValueError if the peptide lies entirely before or after the
    mutation.
    """
    if peptide_start_in_protein > mutation_end_in_protein:
        raise ValueError("Peptide starts after mutation")
    if peptide_start_in_protein + peptide_length < mutation_start_in_protein:
        raise ValueError("Peptide ends before mutation")

    def clamp(protein_position):
        # translate into peptide coordinates, clamped to [0, peptide_length]
        offset = protein_position - peptide_start_in_protein
        return min(peptide_length, max(0, offset))

    return (clamp(mutation_start_in_protein), clamp(mutation_end_in_protein))
122 |
--------------------------------------------------------------------------------