├── .github └── workflows │ └── tests.yml ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── RELEASING.md ├── deploy.sh ├── develop.sh ├── lint.sh ├── pylintrc ├── requirements.txt ├── setup.py ├── test.sh ├── test ├── __init__.py ├── data.py ├── data │ ├── B16-StringTie-chr1-subset.gtf │ ├── genes.fpkm_tracking │ ├── isoforms.fpkm_tracking │ └── tiny_test_ligandome_dir │ │ ├── A0201 │ │ └── HLA-B0704 ├── test_args_outputs.py ├── test_cli_protein_changes.py ├── test_contains_mutant_residues.py ├── test_dataframe.py ├── test_effect_expression_filters.py ├── test_epitopes_from_commandline_args.py ├── test_load_cufflinks_fpkm.py ├── test_load_stringtie_gtf_fpkm.py ├── test_mutant_epitope_predictions_class1.py ├── test_mutant_epitope_predictions_class2.py ├── test_padding.py ├── test_peptide_mutation_interval.py ├── test_rna_helpers.py └── test_variant_expression_filters.py └── topiary ├── __init__.py ├── cli ├── __init__.py ├── args.py ├── errors.py ├── filtering.py ├── outputs.py ├── protein_changes.py ├── rna.py ├── script.py └── sequence.py ├── filters.py ├── predictor.py ├── rna ├── __init__.py ├── common.py ├── cufflinks.py └── gtf.py └── sequence_helpers.py /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Tests 5 | on: [push, pull_request] 6 | 7 | jobs: 8 | build: 9 | runs-on: ubuntu-latest 10 | strategy: 11 | fail-fast: true 12 | matrix: 13 | python-version: ["3.10", "3.11", "3.12"] 14 | steps: 15 | - uses: actions/checkout@v3 16 | - name: Set up Python ${{ matrix.python-version }} 17 | uses: actions/setup-python@v3 18 | with: 19 | python-version: ${{ matrix.python-version }} 20 | - name: Checkout private netmhc-bundle repo 21 | uses: 
actions/checkout@v4 22 | with: 23 | repository: openvax/netmhc-bundle 24 | token: ${{ secrets.NETMHC_BUNDLE_ACCESS_TOKEN }} 25 | path: netmhc-bundle 26 | 27 | - name: Install netmhc-bundle dependencies 28 | uses: awalsh128/cache-apt-pkgs-action@latest 29 | with: 30 | packages: tcsh gawk python2-minimal 31 | version: 1.0 32 | - name: Install dependencies 33 | run: | 34 | python -m pip install --upgrade pip 35 | python -m pip install pytest pytest-cov pylint 36 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 37 | - name: Install wkthtmltopdf 38 | run: | 39 | sudo apt-get install -y xfonts-base xfonts-75dpi 40 | wget https://github.com/wkhtmltopdf/packaging/releases/download/0.12.6-1/wkhtmltox_0.12.6-1.bionic_amd64.deb 41 | sudo dpkg -i wkhtmltox_0.12.6-1.bionic_amd64.deb 42 | - name: Lint with PyLint 43 | run: | 44 | ./lint.sh 45 | - name: Download Ensembl data 46 | run: | 47 | echo "Before installing Ensembl releases" && df -h 48 | pyensembl install --release 75 --species human --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCh37.75/ 49 | pyensembl install --release 102 --species mouse --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCm38.102/ 50 | pyensembl install --release 93 --species human --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCh38.93/ 51 | pyensembl install --release 93 --species mouse --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCm38.93/ 52 | - name: Test with pytest 53 | run: | 54 | # configure netmhc-bundle paths 55 | export NETMHC_BUNDLE_HOME=$PWD/netmhc-bundle 56 | echo "NetMHC-bundle dir:" && ls -l $NETMHC_BUNDLE_HOME 57 | mkdir $PWD/netmhc-bundle-tmp 58 | export NETMHC_BUNDLE_TMPDIR=$PWD/netmhc-bundle-tmp 59 | export PATH=$PATH:$NETMHC_BUNDLE_HOME/bin 60 | ./test.sh 61 | - name: Publish coverage to Coveralls 62 | uses: coverallsapp/github-action@v2.2.3 63 | with: 64 | parallel: true 65 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | bin/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # Installer logs 26 | pip-log.txt 27 | pip-delete-this-directory.txt 28 | 29 | # Unit test / coverage reports 30 | htmlcov/ 31 | .tox/ 32 | .coverage 33 | .cache 34 | nosetests.xml 35 | coverage.xml 36 | 37 | # Translations 38 | *.mo 39 | 40 | # Mr Developer 41 | .mr.developer.cfg 42 | .project 43 | .pydevproject 44 | 45 | # Rope 46 | .ropeproject 47 | 48 | # Django stuff: 49 | *.log 50 | *.pot 51 | 52 | # Sphinx documentation 53 | docs/_build/ 54 | 55 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false # Use container-based infrastructure 2 | language: python 3 | dist: trusty 4 | python: 5 | - "2.7" 6 | - "3.6" 7 | git: 8 | # don't need the default depth of 50 9 | # but don't want to use a depth of 1 since that affects 10 | # whether jobs run when you have multiple commits queued 11 | # https://github.com/travis-ci/travis-ci/issues/4575 12 | depth: 10 13 | cache: 14 | pip: true 15 | # cache directory used for Ensembl downloads of GTF and FASTA files 16 | # along with the indexed db of intervals and ID mappings and pickles 17 | # of sequence dictionaries. 
Also, pip 18 | directories: 19 | - $HOME/.cache/pyensembl/ 20 | addons: 21 | apt: 22 | packages: 23 | # Needed for NetMHC 24 | - tcsh 25 | env: 26 | global: 27 | # MHC_BUNDLE_PASS 28 | - secure: "TLTzSIABO/iYke8C66c0PRaWDZ5lx90s8XimSfDONOTXaX74V25O65qxzIWPAihxcdfLYA+bE2YRsjYOtuK+6DB2vjXbmoCQAXIFT/QXz4+iZTxN3g/s5N4hIR8tf9MSQ3KdNHOw7lKzdgAWKsFDQ8vwrqzYUNJGVtvoQSWCmPw=" 29 | before_install: 30 | - | 31 | if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then 32 | wget https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh; 33 | else 34 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; 35 | fi 36 | - bash miniconda.sh -b -p $HOME/miniconda 37 | - export PATH="$HOME/miniconda/bin:$PATH" 38 | - hash -r 39 | - conda config --set always_yes yes --set changeps1 no 40 | - conda update -q conda 41 | # Useful for debugging any issues with conda 42 | - conda info -a 43 | - python --version 44 | # install MHC predictors 45 | - git clone https://mhcbundle:$MHC_BUNDLE_PASS@github.com/openvax/netmhc-bundle.git 46 | - export NETMHC_BUNDLE_HOME=$PWD/netmhc-bundle 47 | - mkdir tmp 48 | - export NETMHC_BUNDLE_TMPDIR=$PWD/tmp 49 | - export PATH=$PATH:$NETMHC_BUNDLE_HOME/bin 50 | install: 51 | - > 52 | conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION 53 | numpy nose pandas pandoc 54 | - source activate test-environment 55 | - pip install pypandoc 56 | - pip install -r requirements.txt 57 | - pip install . 
58 | - pip install coveralls 59 | - pyensembl install --release 75 --species human 60 | - pyensembl install --release 87 --species human 61 | script: 62 | - ./lint.sh 63 | - nosetests test --with-coverage --cover-package=topiary 64 | after_success: 65 | coveralls 66 | deploy: 67 | provider: pypi 68 | distributions: sdist 69 | user: openvax 70 | password: # See http://docs.travis-ci.com/user/encryption-keys/ 71 | secure: "S4KWAhJpKYx5F/cBc6cf9GCZ8Hd+WtMA6V6PP25PglLnVaXrxB5QxuAIWGAvr/jGuTHjfCSCNDwTptW3natLjJR9IfJdJPp3gNvM0RDjWY4FsziFz/nG/bZo9qnh4ZCDhK/Po1izxXM0u9z6gUc0U2iKK1ZSdfawyW4nZbAXQUU=" 72 | on: 73 | branch: master 74 | condition: $TRAVIS_PYTHON_VERSION = "2.7" 75 | 76 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Tests](https://github.com/openvax/topiary/actions/workflows/tests.yml/badge.svg)](https://github.com/openvax/topiary/actions/workflows/tests.yml) 2 | 3 | Coverage Status 4 | 5 | 6 | PyPI 7 | 8 | 9 | # Topiary 10 | 11 | Predict mutation-derived cancer T-cell epitopes from (1) somatic variants (2) tumor RNA expression data, and (3) patient HLA type. 
12 | 13 | ## Example 14 | 15 | ```sh 16 | ./topiary \ 17 | --vcf somatic.vcf \ 18 | --mhc-predictor netmhcpan \ 19 | --mhc-alleles HLA-A*02:01,HLA-B*07:02 \ 20 | --ic50-cutoff 500 \ 21 | --percentile-cutoff 2.0 \ 22 | --mhc-epitope-lengths 8-11 \ 23 | --rna-gene-fpkm-tracking-file genes.fpkm_tracking \ 24 | --rna-min-gene-expression 4.0 \ 25 | --rna-transcript-fpkm-tracking-file isoforms.fpkm_tracking \ 26 | --rna-min-transcript-expression 1.5 \ 27 | --output-csv epitopes.csv \ 28 | --output-html epitopes.html 29 | ``` 30 | 31 | ## Installation 32 | 33 | You can install Topiary and all of the libraries it depends on by running: 34 | ``` 35 | pip install topiary 36 | ``` 37 | 38 | You'll need to download the reference genome sequences and annotations for a 39 | recent Ensembl release (e.g. 81) by running: 40 | 41 | ``` 42 | pyensembl install --release 81 --species human 43 | ``` 44 | 45 | If you want to work with variants which were aligned against the older reference 46 | GRCh37, you will need to also download its annotation data, which is contained 47 | in Ensembl release 75: 48 | 49 | ``` 50 | pyensembl install --release 75 --species human 51 | ``` 52 | 53 | 54 | ## Commandline Arguments 55 | 56 | ### Genomic Variants 57 | 58 | Specify some variants by giving at least one of the following options. They can 59 | be used in combination and repeated. 
60 | 61 | * `--vcf VCF_FILENAME`: Load a [VCF](http://www.1000genomes.org/wiki/analysis/variant%20call%20format/vcf-variant-call-format-version-41) file 62 | * `--maf MAF_FILENAME`: Load a TCGA [MAF](https://wiki.nci.nih.gov/display/TCGA/Mutation+Annotation+Format+%28MAF%29+Specification) file 63 | * `--variant CHR POS REF ALT : Specify an individual variant (requires --ensembl-version)` 64 | 65 | ### Output Format 66 | 67 | * `--output-csv OUTPUT_CSV_FILENAME`: Path to an output CSV file 68 | * `--output-html OUTPUT_HTML_FILENAME`: Path to an output HTML file 69 | 70 | ### RNA Expression Filtering 71 | 72 | Optional flags to use Cufflinks expression estimates for dropping epitopes 73 | arising from genes or transcripts that are not highly expressed. 74 | 75 | * `--rna-gene-fpkm-tracking-file RNA_GENE_FPKM_TRACKING_FILE`: Cufflinks FPKM tracking file 76 | containing gene expression estimates. 77 | * `--rna-min-gene-expression RNA_MIN_GENE_EXPRESSION`: Minimum FPKM for genes 78 | * `--rna-transcript-fpkm-tracking-file RNA_TRANSCRIPT_FPKM_TRACKING_FILE`: Cufflinks FPKM tracking 79 | file containing transcript expression estimates. 80 | * `--rna-min-transcript-expression RNA_MIN_TRANSCRIPT_EXPRESSION`: Minimum FPKM 81 | for transcripts 82 | * `--rna-transcript-fpkm-gtf-file RNA_TRANSCRIPT_FPKM_GTF_FILE`: StringTie GTF file 83 | file containing transcript expression estimates. 
84 | 85 | ### Choose an MHC Binding Predictor 86 | 87 | You *must* choose an MHC binding predictor using one of the following values 88 | for the `--mhc-predictor` flag: 89 | 90 | * `netmhc`: Local [NetMHC](http://www.cbs.dtu.dk/cgi-bin/nph-sw_request?netMHC) predictor (Topiary will attempt to automatically detect whether NetMHC 3.x or 4.0 is available) 91 | * `netmhcpan`: Local [NetMHCpan](http://www.cbs.dtu.dk/cgi-bin/nph-sw_request?netMHCpan) predictor 92 | * `netmhciipan`: Local [NetMHCIIpan](http://www.cbs.dtu.dk/cgi-bin/nph-sw_request?netMHCIIpan) predictor 93 | * `netmhccons`: Local [NetMHCcons](http://www.cbs.dtu.dk/cgi-bin/nph-sw_request?netMHCcons) 94 | * `random`: Random IC50 values 95 | * `smm`: Local [SMM](http://www.mhc-pathway.net/smm) predictor 96 | * `smm-pmbec`: Local [SMM-PMBEC](http://www.mhc-pathway.net/smmpmbec) predictor 97 | * `netmhcpan-iedb`: Use NetMHCpan via the IEDB web API 98 | * `netmhccons-iedb`: Use NetMHCcons via the IEDB web API 99 | * `smm-iedb`: Use SMM via the IEDB web API 100 | * `smm-pmbec-iedb`: Use SMM-PMBEC via the IEDB web API 101 | 102 | ### MHC Alleles 103 | You must specify the alleles to perform binding prediction for using one of 104 | the following flags: 105 | 106 | * `--mhc-alleles-file MHC_ALLELES_FILE`: Text file containing one allele name per 107 | line 108 | * `--mhc-alleles MHC_ALLELES`: Comma separated list of allele names, 109 | e.g. "HLA-A02:01,HLA-B07:02" 110 | 111 | ### Peptide Length 112 | 113 | * `--mhc-epitope-lengths MHC_EPITOPE_LENGTHS`: comma separated list of integers 114 | specifying which peptide lengths to use for MHC binding prediction 115 | 116 | ### Binding Prediction Filtering 117 | 118 | * `--only-novel-epitopes`: Topiary will normally keep all predicted epitopes, 119 | even those which occur in a given self-ligandome or don't overlap a mutated region 120 | of a protein. Use this flag to drop any epitopes which don't contain mutations 121 | or that occur elsewhere in the self-ligandome. 
122 | * `--ic50-cutoff IC50_CUTOFF`: Drop peptides with predicted IC50 nM greater 123 | than this value (typical value is 500.0) 124 | * `--percentile-cutoff PERCENTILE_CUTOFF`: Drop peptides with percentile rank 125 | of their predicted IC50 (among predictions for a particular allele) fall below 126 | this threshold (lower values are stricter filters, typical value is 2.0) 127 | 128 | ### Misc 129 | 130 | * `--padding-around-mutation PADDING_AROUND_MUTATION`: Include more unmutated residues 131 | around the mutation (useful when not using `--only-novel-epitopes`) 132 | * `--self-filter-directory SELF_FILTER_DIRECTORY`: Directory of files named by MHC allele 133 | containing a self peptide ligandome (peptides which should be excluded from 134 | results) 135 | * `--skip-variant-errors`: If a particular mutation causes an exception to be raised 136 | during annotation, you can skip it using this flag. 137 | 138 | -------------------------------------------------------------------------------- /RELEASING.md: -------------------------------------------------------------------------------- 1 | # Releasing Topiary 2 | 3 | This document explains what do once your [Pull Request](https://www.atlassian.com/git/tutorials/making-a-pull-request/) has been reviewed and all final changes applied. Now you're ready merge your branch into master and release it to the world: 4 | 5 | 1. Bump the [version](http://semver.org/) on __init__.py, as part of the PR you want to release. 6 | 2. Merge your branch into master. 7 | 3. After the Topiary unit tests complete successfully on Travis then the latest version 8 | of the code (with the version specified above) will be pushed to [PyPI](https://pypi.python.org/pypi) automatically. If you're curious about how automatic deployment is achieved, see our [Travis configuration](https://github.com/hammerlab/topiary/blob/master/.travis.yml#L58). 
9 | -------------------------------------------------------------------------------- /deploy.sh: -------------------------------------------------------------------------------- 1 | ./lint.sh && \ 2 | ./test.sh && \ 3 | python3 -m pip install --upgrade build && \ 4 | python3 -m pip install --upgrade twine && \ 5 | rm -rf dist && \ 6 | python3 -m build && \ 7 | git --version && \ 8 | python3 -m twine upload dist/* && \ 9 | git tag "$(python3 topiary/version.py)" && \ 10 | git push --tags 11 | 12 | -------------------------------------------------------------------------------- /develop.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | uv pip install -e . 4 | -------------------------------------------------------------------------------- /lint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -o errexit 3 | 4 | 5 | # disabling several categories of errors due to false positives in pylint, 6 | # see these issues: 7 | # - https://bitbucket.org/logilab/pylint/issues/701/false-positives-with-not-an-iterable-and 8 | # - https://bitbucket.org/logilab/pylint/issues/58 9 | 10 | find topiary/ -name '*.py' \ 11 | | xargs pylint \ 12 | --errors-only \ 13 | --disable=unsubscriptable-object,not-an-iterable,no-member,invalid-unary-operand-type 14 | 15 | echo 'Passes pylint check' 16 | -------------------------------------------------------------------------------- /pylintrc: -------------------------------------------------------------------------------- 1 | [TYPECHECK] 2 | # Without ignoring this, we get errors like: 3 | # E:249,20: Module 'numpy' has no 'nan' member (no-member) 4 | ignored-modules = numpy -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.7 2 | pandas>=0.13.1 3 | mhctools>=1.3.0 4 | varcode>=0.3.17 5 | 
pylint>=1.4.4 6 | nose>=1.3.6 7 | gtfparse>=0.0.4 8 | mhcnames 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-2018. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import print_function 16 | import os 17 | import re 18 | 19 | from setuptools import setup, find_packages 20 | 21 | readme_dir = os.path.dirname(__file__) 22 | readme_path = os.path.join(readme_dir, 'README.md') 23 | 24 | try: 25 | with open(readme_path, 'r') as f: 26 | readme_markdown = f.read() 27 | except: 28 | readme_markdown = "" 29 | 30 | try: 31 | import pypandoc 32 | readme_restructured = pypandoc.convert(readme_markdown, to='rst', format='md') 33 | except: 34 | readme_restructured = readme_markdown 35 | print( 36 | "Conversion of long_description from MD to reStructuredText failed...") 37 | 38 | with open('topiary/__init__.py', 'r') as f: 39 | version = re.search( 40 | r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', 41 | f.read(), 42 | re.MULTILINE).group(1) 43 | 44 | if not version: 45 | raise RuntimeError('Cannot find version information') 46 | 47 | if __name__ == '__main__': 48 | setup( 49 | name='topiary', 50 | version=version, 51 | description="Predict cancer epitopes from cancer sequence data", 52 | author="Alex Rubinsteyn, Tavi Nathanson", 53 | 
author_email="alex.rubinsteyn@gmail.com", 54 | url="https://github.com/hammerlab/topiary", 55 | license="http://www.apache.org/licenses/LICENSE-2.0.html", 56 | classifiers=[ 57 | 'Development Status :: 3 - Alpha', 58 | 'Environment :: Console', 59 | 'Operating System :: OS Independent', 60 | 'Intended Audience :: Science/Research', 61 | 'License :: OSI Approved :: Apache Software License', 62 | 'Programming Language :: Python', 63 | 'Topic :: Scientific/Engineering :: Bio-Informatics', 64 | ], 65 | install_requires=[ 66 | 'numpy >=1.7, <2.0', 67 | 'pandas >=0.13.1', 68 | 'mhctools >= 1.3.0', 69 | 'varcode >=0.3.17', 70 | 'nose >=1.3.6', 71 | 'gtfparse >=0.0.4', 72 | 'mhcnames', 73 | ], 74 | long_description=readme_restructured, 75 | packages=find_packages(exclude="test"), 76 | entry_points={ 77 | 'console_scripts': [ 78 | 'topiary = topiary.cli.script:main' 79 | ] 80 | } 81 | ) 82 | -------------------------------------------------------------------------------- /test.sh: -------------------------------------------------------------------------------- 1 | pytest --cov=topiary/ --cov-report=term-missing tests 2 | 3 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/topiary/4ce5ed007a7a19d9666ba9f20cfcf5dfe745a4e3/test/__init__.py -------------------------------------------------------------------------------- /test/data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """ 16 | Helper functions and shared datasets for tests 17 | """ 18 | 19 | 20 | from __future__ import print_function, division, absolute_import 21 | import os 22 | 23 | from varcode import Variant, VariantCollection 24 | from pyensembl import ensembl_grch38 25 | 26 | def data_path(name): 27 | """ 28 | Return the absolute path to a file in the varcode/test/data directory. 29 | The name specified should be relative to varcode/test/data. 30 | """ 31 | return os.path.join(os.path.dirname(__file__), "data", name) 32 | 33 | # BRAF variant coordinates from COSMIC entry: 34 | # http://cancer.sanger.ac.uk/cosmic/mutation/overview?id=476 35 | braf_V600E_variant = Variant(7, 140753336, "A", "T", ensembl_grch38) 36 | 37 | # TP53 variant coordinates from COSMIC entry: 38 | # http://cancer.sanger.ac.uk/cosmic/mutation/overview?id=10656 39 | tp53_R248W_variant = Variant(17, 7674221, "G", "A", ensembl_grch38) 40 | 41 | cancer_test_variants = VariantCollection([ 42 | braf_V600E_variant, 43 | tp53_R248W_variant 44 | ]) 45 | 46 | cancer_test_variant_gene_ids = { 47 | gene_id 48 | for v in cancer_test_variants 49 | for gene_id in v.gene_ids 50 | } 51 | 52 | cancer_test_variant_transcript_ids = { 53 | transcript_id 54 | for v in cancer_test_variants 55 | for transcript_id in v.transcript_ids 56 | } 57 | -------------------------------------------------------------------------------- /test/data/B16-StringTie-chr1-subset.gtf: -------------------------------------------------------------------------------- 1 | # StringTie version 1.1.2 2 | 
1 StringTie transcript 4492457 4493604 1000 - . gene_id "STRG.2"; transcript_id "STRG.2.2"; reference_id "ENSMUST00000192505"; ref_gene_id "ENSMUSG00000025902"; ref_gene_name "Sox17"; cov "1.168215"; FPKM "0.125126"; TPM "0.255858"; 3 | 1 StringTie exon 4492457 4493604 1000 - . gene_id "STRG.2"; transcript_id "STRG.2.2"; exon_number "1"; reference_id "ENSMUST00000192505"; ref_gene_id "ENSMUSG00000025902"; ref_gene_name "Sox17"; cov "1.168215"; 4 | 1 StringTie transcript 4492465 4493735 1000 - . gene_id "STRG.2"; transcript_id "STRG.2.1"; reference_id "ENSMUST00000191939"; ref_gene_id "ENSMUSG00000025902"; ref_gene_name "Sox17"; cov "6.349273"; FPKM "0.680062"; TPM "1.390592"; 5 | 1 StringTie exon 4492465 4492668 1000 - . gene_id "STRG.2"; transcript_id "STRG.2.1"; exon_number "1"; reference_id "ENSMUST00000191939"; ref_gene_id "ENSMUSG00000025902"; ref_gene_name "Sox17"; cov "20.261032"; 6 | 1 StringTie exon 4493100 4493735 1000 - . gene_id "STRG.2"; transcript_id "STRG.2.1"; exon_number "2"; reference_id "ENSMUST00000191939"; ref_gene_id "ENSMUSG00000025902"; ref_gene_name "Sox17"; cov "1.887011"; 7 | 1 StringTie transcript 4687934 4689403 1000 - . gene_id "STRG.1"; transcript_id "STRG.1.1"; reference_id "ENSMUST00000182774"; ref_gene_id "ENSMUSG00000098104"; ref_gene_name "Gm6085"; cov "0.504422"; FPKM "0.054028"; TPM "0.110476"; 8 | 1 StringTie exon 4687934 4689403 1000 - . 
gene_id "STRG.1"; transcript_id "STRG.1.1"; exon_number "1"; reference_id "ENSMUST00000182774"; ref_gene_id "ENSMUSG00000098104"; ref_gene_name "Gm6085"; cov "0.504422"; -------------------------------------------------------------------------------- /test/data/genes.fpkm_tracking: -------------------------------------------------------------------------------- 1 | tracking_id class_code nearest_ref_id gene_id gene_short_name tss_id locus length coverage FPKM FPKM_conf_lo FPKM_conf_hi FPKM_status 2 | ENSG00000240361 - - ENSG00000240361 OR4G11P - chr1:62947-63887 - - 0 0 0 OK 3 | ENSG00000268020 - - ENSG00000268020 AL627309.1 - chr1:53048-54936 - - 0 0 0 OK 4 | ENSG00000186092 - - ENSG00000186092 OR4F5 - chr1:69090-70008 - - 0 0 0 OK 5 | CUFF.1 - - CUFF.1 FAM138A - chr1:34553-36081 - - 0.0222016 0 0.0614304 OK 6 | CUFF.2 - - CUFF.2 DDX11L1 - chr1:11868-14412 - - 0 0 0.0497629 OK 7 | CUFF.3 - - CUFF.3 MIR1302-10 - chr1:29553-31109 - - 0 0 0.154007 OK 8 | CUFF.4 - - CUFF.4 WASH7P - chr1:14362-29806 - - 10.3844 9.63011 11.2268 OK 9 | ENSG00000269308 - - ENSG00000269308 AL645608.2 - chr1:818042-819983 - - 0 0 0 OK 10 | CUFF.5 - - CUFF.5 - - chr1:841474-842801 - - 0.172295 0.113069 0.231522 OK 11 | CUFF.6 - - CUFF.6 - - chr1:1-20 - - 0.172295 0.113069 0.231522 FAIL 12 | CUFF.7 - - CUFF.7 - - chr2:1-20 - - 0.172295 0.113069 0.231522 LOWDATA 13 | CUFF.8 - - CUFF.8 - - chr3:1-20 - - 0.172295 0.113069 0.231522 HIDATA 14 | -------------------------------------------------------------------------------- /test/data/isoforms.fpkm_tracking: -------------------------------------------------------------------------------- 1 | tracking_id class_code nearest_ref_id gene_id gene_short_name tss_id locus length coverage FPKM FPKM_conf_lo FPKM_conf_hi FPKM_status 2 | ENST00000492842 - - ENSG00000240361 OR4G11P - chr1:62947-63887 940 0 0 0 0 OK 3 | ENST00000594647 - - ENSG00000268020 AL627309.1 - chr1:53048-54936 126 0 0 0 0 OK 4 | ENST00000335137 - - ENSG00000186092 OR4F5 - 
chr1:69090-70008 918 0 0 0 0 OK 5 | ENST00000417324 - - CUFF.1 FAM138A - chr1:34553-36081 1187 0 0 0 0.0120385 OK 6 | ENST00000461467 - - CUFF.1 FAM138A - chr1:35244-36073 590 0.621469 0.0222016 0 0.0484398 OK 7 | ENST00000456328 - - CUFF.2 DDX11L1 - chr1:11868-14409 1657 0 0 0 0.0129358 LOWDATA 8 | ENST00000515242 - - CUFF.2 DDX11L1 - chr1:11871-14412 1653 0 0 0 0.00864472 LOWDATA 9 | ENST00000518655 - - CUFF.2 DDX11L1 - chr1:11873-14409 1483 0 0 0 0.00963569 OK 10 | ENST00000450305 - - CUFF.2 DDX11L1 - chr1:12009-13670 632 0 0 0 0.0226103 LOWDATA 11 | CUFF.7604.1 - - CUFF.7604 - - chr2:45395607-45402815 1004 4.73445 0.194033 0.113862 0.27754 OK 12 | ENST00000496445 - - CUFF.38259 VTI1A - chr10:114207021-114298405 853 0 0 0 0 FAIL -------------------------------------------------------------------------------- /test/data/tiny_test_ligandome_dir/A0201: -------------------------------------------------------------------------------- 1 | SIINFKEL 2 | QQQQQQQQ 3 | -------------------------------------------------------------------------------- /test/data/tiny_test_ligandome_dir/HLA-B0704: -------------------------------------------------------------------------------- 1 | RRRRRRRRR -------------------------------------------------------------------------------- /test/test_args_outputs.py: -------------------------------------------------------------------------------- 1 | from topiary.cli.args import arg_parser 2 | from topiary.cli.outputs import write_outputs 3 | import tempfile 4 | import pandas as pd 5 | from nose.tools import eq_ 6 | 7 | 8 | def test_write_outputs(): 9 | 10 | with tempfile.NamedTemporaryFile(mode="r+", delete=False) as f: 11 | df = pd.DataFrame({ 12 | "x": [1, 2, 3], 13 | "y": [10, 20, 30] 14 | }) 15 | args = arg_parser.parse_args([ 16 | "--output-csv", f.name, 17 | "--subset-output-columns", "x", 18 | "--rename-output-column", "x", "X", 19 | "--mhc-predictor", "random", 20 | "--mhc-alleles", "A0201", 21 | ]) 22 | 23 | write_outputs( 24 | df, 25 | 
args, 26 | print_df_before_filtering=True, 27 | print_df_after_filtering=True) 28 | print("File: %s" % f.name) 29 | df_from_file = pd.read_csv(f.name, index_col="#") 30 | 31 | df_expected = pd.DataFrame({ 32 | "X": [1, 2, 3]}) 33 | print(df_from_file) 34 | eq_(len(df_expected), len(df_from_file)) 35 | assert (df_expected == df_from_file).all().all() 36 | -------------------------------------------------------------------------------- /test/test_cli_protein_changes.py: -------------------------------------------------------------------------------- 1 | from nose.tools import eq_ 2 | from topiary.cli.protein_changes import protein_change_effects_from_args 3 | from topiary.cli.args import create_arg_parser 4 | 5 | arg_parser = create_arg_parser( 6 | mhc=False, 7 | rna=False, 8 | output=False) 9 | 10 | def test_protein_change_effects_from_args_substitutions(): 11 | args = arg_parser.parse_args([ 12 | "--protein-change", "EGFR", "T790M", 13 | "--genome", "grch37", 14 | ]) 15 | 16 | effects = protein_change_effects_from_args(args) 17 | eq_(len(effects), 1) 18 | effect = effects[0] 19 | eq_(effect.aa_ref, "T") 20 | eq_(effect.aa_mutation_start_offset, 789) 21 | eq_(effect.aa_alt, "M") 22 | 23 | transcript = effect.transcript 24 | eq_(transcript.name, "EGFR-001") 25 | 26 | def test_protein_change_effects_from_args_malformed_missing_ref(): 27 | 28 | args = arg_parser.parse_args([ 29 | "--protein-change", "EGFR", "790M", 30 | "--genome", "grch37"]) 31 | 32 | effects = protein_change_effects_from_args(args) 33 | eq_(len(effects), 0) 34 | 35 | def test_protein_change_effects_from_args_malformed_missing_alt(): 36 | args = arg_parser.parse_args([ 37 | "--protein-change", "EGFR", "T790", 38 | "--genome", "grch37"]) 39 | effects = protein_change_effects_from_args(args) 40 | eq_(len(effects), 0) 41 | 42 | def test_protein_change_effects_from_args_multiple_effects(): 43 | args = arg_parser.parse_args([ 44 | "--protein-change", "EGFR", "T790M", 45 | "--protein-change", "KRAS", 
"G10D", 46 | "--genome", "grch37"]) 47 | effects = protein_change_effects_from_args(args) 48 | print(effects) 49 | eq_(len(effects), 2) 50 | -------------------------------------------------------------------------------- /test/test_contains_mutant_residues.py: -------------------------------------------------------------------------------- 1 | from nose.tools import eq_ 2 | from topiary import contains_mutant_residues 3 | 4 | def test_contains_mutant_residues_before(): 5 | eq_( 6 | contains_mutant_residues( 7 | peptide_start_in_protein=10, 8 | peptide_length=9, 9 | mutation_start_in_protein=5, 10 | mutation_end_in_protein=6), 11 | False) 12 | 13 | 14 | def test_contains_mutant_residues_after(): 15 | eq_( 16 | contains_mutant_residues( 17 | peptide_start_in_protein=10, 18 | peptide_length=9, 19 | mutation_start_in_protein=25, 20 | mutation_end_in_protein=26), 21 | False) 22 | 23 | def test_contains_mutant_residues_inside(): 24 | eq_( 25 | contains_mutant_residues( 26 | peptide_start_in_protein=10, 27 | peptide_length=9, 28 | mutation_start_in_protein=12, 29 | mutation_end_in_protein=13), 30 | True) 31 | 32 | def test_contains_mutant_residues_deletion_before_beginning(): 33 | # peptide only contains the residue *after* the mutation 34 | # so it still looks like it's wildtype 35 | eq_( 36 | contains_mutant_residues( 37 | peptide_start_in_protein=10, 38 | peptide_length=9, 39 | mutation_start_in_protein=10, 40 | mutation_end_in_protein=10), 41 | False) 42 | 43 | 44 | def test_contains_mutant_residues_deletion_at_beginning(): 45 | # peptide contains mutation before *and* after mutation so 46 | # it should count as having a mutant juxtaposition of residues 47 | eq_( 48 | contains_mutant_residues( 49 | peptide_start_in_protein=10, 50 | peptide_length=9, 51 | mutation_start_in_protein=11, 52 | mutation_end_in_protein=11), 53 | True) 54 | 55 | def test_contains_mutant_residues_deletion_after_end(): 56 | # peptide only contains the residue *before* the mutation 57 | # so it 
still looks like it's wildtype 58 | eq_( 59 | contains_mutant_residues( 60 | peptide_start_in_protein=10, 61 | peptide_length=9, 62 | mutation_start_in_protein=19, 63 | mutation_end_in_protein=19), 64 | False) 65 | 66 | def test_contains_mutant_residues_deletion_at_end(): 67 | # peptide contains mutation before *and* after mutation so 68 | # it should count as having a mutant juxtaposition of residues 69 | eq_( 70 | contains_mutant_residues( 71 | peptide_start_in_protein=10, 72 | peptide_length=9, 73 | mutation_start_in_protein=18, 74 | mutation_end_in_protein=18), 75 | True) 76 | -------------------------------------------------------------------------------- /test/test_dataframe.py: -------------------------------------------------------------------------------- 1 | 2 | from mhctools import NetMHC 3 | from topiary import TopiaryPredictor 4 | from .data import cancer_test_variants 5 | 6 | alleles = [ 7 | 'A02:01', 8 | 'B*07:02', 9 | 'HLA-C*07:02', 10 | ] 11 | 12 | mhc_model = NetMHC( 13 | alleles=alleles, 14 | default_peptide_lengths=[8, 9, 10]) 15 | 16 | DEFAULT_FPKM = 1.0 17 | 18 | def test_epitopes_to_dataframe_transcript_expression(): 19 | predictor = TopiaryPredictor( 20 | mhc_model=mhc_model, 21 | only_novel_epitopes=False) 22 | df = predictor.predict_from_variants( 23 | variants=cancer_test_variants, 24 | transcript_expression_dict={ 25 | transcript_id: DEFAULT_FPKM 26 | for variant in cancer_test_variants 27 | for transcript_id in variant.transcript_ids 28 | }) 29 | 30 | assert "transcript_expression" in df.columns, \ 31 | "transcript_expression missing from %s" % (df.columns,) 32 | assert(df["transcript_expression"] == DEFAULT_FPKM).all(), \ 33 | "Invalid FPKM values in DataFrame transcript_expression column" 34 | 35 | def test_epitopes_to_dataframe_gene_expression(): 36 | predictor = TopiaryPredictor( 37 | mhc_model=mhc_model, 38 | only_novel_epitopes=False) 39 | 40 | df = predictor.predict_from_variants( 41 | variants=cancer_test_variants, 42 | 
gene_expression_dict={ 43 | gene_id: DEFAULT_FPKM 44 | for variant in cancer_test_variants 45 | for gene_id in variant.gene_ids 46 | }) 47 | 48 | assert "gene_expression" in df.columns, \ 49 | "gene_expression missing from %s" % (df.columns,) 50 | assert(df["gene_expression"] == DEFAULT_FPKM).all(), \ 51 | "Invalid FPKM values in DataFrame gene_expression column" 52 | -------------------------------------------------------------------------------- /test/test_effect_expression_filters.py: -------------------------------------------------------------------------------- 1 | 2 | from .data import ( 3 | cancer_test_variants, 4 | cancer_test_variant_gene_ids, 5 | cancer_test_variant_transcript_ids 6 | ) 7 | from topiary.filters import apply_effect_expression_filters 8 | 9 | cancer_test_effects = cancer_test_variants.effects() 10 | 11 | DEFAULT_FPKM = 1.0 12 | 13 | # associate every gene ID with 1.0 FPKM 14 | gene_expression_dict = { 15 | gene_id: DEFAULT_FPKM 16 | for gene_id in cancer_test_variant_gene_ids 17 | } 18 | 19 | # associate every transcript with 1.0 FPKM 20 | transcript_expression_dict = { 21 | transcript_id: DEFAULT_FPKM 22 | for transcript_id in cancer_test_variant_transcript_ids 23 | } 24 | 25 | 26 | def test_apply_effect_gene_expression_below_threshold(): 27 | filtered = apply_effect_expression_filters( 28 | cancer_test_effects, 29 | gene_expression_dict=gene_expression_dict, 30 | gene_expression_threshold=2 * DEFAULT_FPKM, 31 | transcript_expression_dict=None, 32 | transcript_expression_threshold=None) 33 | assert len(filtered) == 0, \ 34 | "All variants should have been filtered out but got: %s" % (filtered,) 35 | 36 | def test_apply_effect_gene_expression_above_threshold(): 37 | filtered = apply_effect_expression_filters( 38 | cancer_test_effects, 39 | gene_expression_dict=gene_expression_dict, 40 | gene_expression_threshold=0.5 * DEFAULT_FPKM, 41 | transcript_expression_dict=None, 42 | transcript_expression_threshold=None) 43 | assert len(filtered) == 
len(cancer_test_effects), \ 44 | "Expected %s effects but got %s" % (len( 45 | cancer_test_effects), len(filtered)) 46 | 47 | def test_apply_effect_gene_expression_equal_threshold(): 48 | # expect genes with expression at threshold to NOT get filtered 49 | filtered = apply_effect_expression_filters( 50 | cancer_test_effects, 51 | gene_expression_dict=gene_expression_dict, 52 | gene_expression_threshold=DEFAULT_FPKM, 53 | transcript_expression_dict=None, 54 | transcript_expression_threshold=None) 55 | assert len(filtered) == len(cancer_test_effects), \ 56 | "Expected %s effects but got %s" % (len( 57 | cancer_test_effects), len(filtered)) 58 | 59 | def test_apply_effect_transcript_expression_below_threshold(): 60 | filtered = apply_effect_expression_filters( 61 | cancer_test_effects, 62 | gene_expression_dict=None, 63 | gene_expression_threshold=None, 64 | transcript_expression_dict=transcript_expression_dict, 65 | transcript_expression_threshold=2 * DEFAULT_FPKM) 66 | assert len(filtered) == 0, \ 67 | "All effects should have been filtered out but got: %s" % (filtered,) 68 | 69 | def test_apply_effect_transcript_expression_above_threshold(): 70 | filtered = apply_effect_expression_filters( 71 | cancer_test_effects, 72 | gene_expression_dict=None, 73 | gene_expression_threshold=None, 74 | transcript_expression_dict=transcript_expression_dict, 75 | transcript_expression_threshold=0.5 * DEFAULT_FPKM) 76 | assert len(filtered) == len(cancer_test_effects), \ 77 | "Expected %s effects but got %s" % ( 78 | len(cancer_test_effects), len(filtered)) 79 | 80 | def test_apply_effect_transcript_expression_equal_threshold(): 81 | # expect transcripts with expression at threshold to NOT be filtered 82 | filtered = apply_effect_expression_filters( 83 | cancer_test_effects, 84 | gene_expression_dict=None, 85 | gene_expression_threshold=None, 86 | transcript_expression_dict=transcript_expression_dict, 87 | transcript_expression_threshold=DEFAULT_FPKM) 88 | assert len(filtered) == 
len(cancer_test_effects), \ 89 | "Expected %s effects but got %s" % ( 90 | len(cancer_test_effects), len(filtered)) 91 | -------------------------------------------------------------------------------- /test/test_epitopes_from_commandline_args.py: -------------------------------------------------------------------------------- 1 | from nose.tools import eq_ 2 | 3 | from topiary.cli.args import arg_parser, predict_epitopes_from_args 4 | 5 | from .data import cancer_test_variants 6 | 7 | 8 | def test_cancer_epitopes_from_args(): 9 | epitope_lengths = [9, 10] 10 | alleles = ["HLA-A*02:01", "C0701"] 11 | args_list = [ 12 | "--mhc-predictor", "netmhc", 13 | "--mhc-epitope-lengths", ",".join(str(x) for x in epitope_lengths), 14 | "--mhc-alleles", ",".join(alleles), 15 | "--genome", "GRCh38", 16 | "--only-novel-epitopes", 17 | ] 18 | for variant in cancer_test_variants: 19 | args_list.append("--variant") 20 | args_list.append(str(variant.contig)) 21 | args_list.append(str(variant.start)) 22 | args_list.append(variant.ref) 23 | args_list.append(variant.alt) 24 | 25 | parsed_args = arg_parser.parse_args(args_list) 26 | epitope_predictions = predict_epitopes_from_args(parsed_args) 27 | expected_number_of_epitopes = 0 28 | for epitope_length in epitope_lengths: 29 | expected_number_of_epitopes += epitope_length * len(cancer_test_variants) * len(alleles) 30 | eq_(len(epitope_predictions), expected_number_of_epitopes) 31 | -------------------------------------------------------------------------------- /test/test_load_cufflinks_fpkm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """ 16 | test_cufflinks : Test that we can correctly load Cufflinks tracking files which 17 | contain the estimated expression levels of genes and isoforms (computed from 18 | RNA-Seq reads). 19 | """ 20 | 21 | 22 | from __future__ import print_function, division, absolute_import 23 | 24 | from topiary.rna import load_cufflinks_dataframe 25 | 26 | from nose.tools import eq_ 27 | 28 | from .data import data_path 29 | 30 | def test_load_cufflinks_genes(): 31 | genes_df = load_cufflinks_dataframe( 32 | data_path("genes.fpkm_tracking"), 33 | drop_lowdata=True, 34 | drop_hidata=True, 35 | drop_failed=True, 36 | drop_novel=False) 37 | gene_ids = set(genes_df.id) 38 | expected_gene_ids = { 39 | "ENSG00000240361", 40 | "ENSG00000268020", 41 | "ENSG00000186092", 42 | "ENSG00000269308", 43 | "CUFF.1", 44 | "CUFF.2", 45 | "CUFF.3", 46 | "CUFF.4", 47 | "CUFF.5" 48 | } 49 | eq_(gene_ids, expected_gene_ids) 50 | 51 | def test_load_cufflinks_genes_drop_novel(): 52 | genes_df = load_cufflinks_dataframe( 53 | data_path("genes.fpkm_tracking"), 54 | drop_lowdata=True, 55 | drop_hidata=True, 56 | drop_failed=True, 57 | drop_novel=True) 58 | gene_ids = set(genes_df.id) 59 | expected_gene_ids = { 60 | "ENSG00000240361", 61 | "ENSG00000268020", 62 | "ENSG00000186092", 63 | "ENSG00000269308", 64 | } 65 | eq_(gene_ids, expected_gene_ids) 66 | 67 | 68 | def test_load_cufflinks_isoforms(): 69 | transcripts_df = load_cufflinks_dataframe( 70 | data_path("isoforms.fpkm_tracking"), 71 | drop_lowdata=True, 72 | drop_hidata=True, 73 | 
drop_failed=True, 74 | drop_novel=False) 75 | transcript_ids = set(transcripts_df.id) 76 | expected_transcript_ids = { 77 | "ENST00000492842", 78 | "ENST00000594647", 79 | "ENST00000335137", 80 | "ENST00000417324", 81 | "ENST00000461467", 82 | "ENST00000518655", 83 | "CUFF.7604.1", 84 | } 85 | eq_(transcript_ids, expected_transcript_ids) 86 | 87 | def test_load_cufflinks_isoforms_drop_novel(): 88 | transcripts_df = load_cufflinks_dataframe( 89 | data_path("isoforms.fpkm_tracking"), 90 | drop_lowdata=True, 91 | drop_hidata=True, 92 | drop_failed=True, 93 | drop_novel=True) 94 | transcript_ids = set(transcripts_df.id) 95 | expected_transcript_ids = { 96 | "ENST00000492842", 97 | "ENST00000594647", 98 | "ENST00000335137", 99 | "ENST00000417324", 100 | "ENST00000461467", 101 | "ENST00000518655", 102 | } 103 | eq_(transcript_ids, expected_transcript_ids) 104 | -------------------------------------------------------------------------------- /test/test_load_stringtie_gtf_fpkm.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division, absolute_import 2 | 3 | from topiary.rna import load_transcript_fpkm_dict_from_gtf 4 | 5 | from nose.tools import eq_ 6 | 7 | from .data import data_path 8 | 9 | 10 | def test_load_stringtie_gtf_transcripts(): 11 | transcript_fpkms = load_transcript_fpkm_dict_from_gtf( 12 | data_path("B16-StringTie-chr1-subset.gtf")) 13 | transcript_ids = set(transcript_fpkms.keys()) 14 | expected_fpkms_dict = { 15 | "ENSMUST00000192505": 0.125126, 16 | "ENSMUST00000191939": 0.680062, 17 | "ENSMUST00000182774": 0.054028, 18 | } 19 | expected_transcript_ids = set(expected_fpkms_dict.keys()) 20 | eq_(expected_transcript_ids, transcript_ids) 21 | for transcript_id, fpkm in expected_fpkms_dict.items(): 22 | eq_(fpkm, transcript_fpkms[transcript_id]) 23 | -------------------------------------------------------------------------------- /test/test_mutant_epitope_predictions_class1.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-2017. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from __future__ import print_function, division, absolute_import 17 | 18 | from mhctools import NetMHCpan 19 | from nose.tools import eq_, raises 20 | from pyensembl import ensembl_grch37 21 | from topiary import TopiaryPredictor 22 | from varcode import Variant, VariantCollection 23 | 24 | # TODO: find out about these variants, 25 | # what do we expect from them? Are they SNVs? 
26 | variants = VariantCollection([ 27 | Variant( 28 | contig=10, 29 | start=100018900, 30 | ref='C', 31 | alt='T', 32 | ensembl=ensembl_grch37), 33 | Variant( 34 | contig=11, 35 | start=32861682, 36 | ref='G', 37 | alt='A', 38 | ensembl=ensembl_grch37)]) 39 | 40 | alleles = [ 41 | 'A02:01', 42 | 'a0204', 43 | 'B*07:02', 44 | 'HLA-B14:02', 45 | 'HLA-C*07:02', 46 | 'hla-c07:01' 47 | ] 48 | 49 | mhc_model = NetMHCpan( 50 | alleles=alleles, 51 | default_peptide_lengths=[9]) 52 | 53 | 54 | def test_epitope_prediction_without_padding(): 55 | output_without_padding = TopiaryPredictor( 56 | mhc_model=mhc_model, 57 | only_novel_epitopes=True).predict_from_variants(variants=variants) 58 | # one prediction for each variant * number of alleles 59 | strong_binders = output_without_padding[output_without_padding.affinity <= 500] 60 | eq_(len(strong_binders), 5) 61 | 62 | @raises(ValueError) 63 | def test_epitope_prediction_with_invalid_padding(): 64 | TopiaryPredictor( 65 | mhc_model=mhc_model, 66 | padding_around_mutation=7).predict_from_variants(variants=variants) 67 | 68 | 69 | @raises(ValueError) 70 | def test_epitope_prediction_with_invalid_zero_padding():  # NOTE(review): duplicates the previous test — it also passes padding_around_mutation=7, so the "zero padding" case is never exercised; a literal 0 may be treated as falsy by check_padding_around_mutation and silently replaced by the default padding instead of raising — confirm intent before changing the value 71 | TopiaryPredictor( 72 | mhc_model=mhc_model, 73 | padding_around_mutation=7).predict_from_variants(variants=variants) 74 | 75 | 76 | def test_epitope_prediction_with_valid_padding(): 77 | predictor = TopiaryPredictor( 78 | mhc_model=mhc_model, 79 | padding_around_mutation=8, 80 | only_novel_epitopes=True) 81 | output_with_padding = predictor.predict_from_variants(variants=variants) 82 | # 6 alleles * 2 mutations * 9 distinct windows = 108 83 | eq_(len(output_with_padding), 108) 84 | -------------------------------------------------------------------------------- /test/test_mutant_epitope_predictions_class2.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015.
Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from __future__ import print_function, division, absolute_import 17 | 18 | from mhctools import NetMHCIIpan 19 | from nose.tools import eq_ 20 | from pyensembl import ensembl_grch37 21 | from topiary import TopiaryPredictor 22 | from varcode import Variant, VariantCollection 23 | 24 | # TODO: find out about these variants, 25 | # what do we expect from them? Are they SNVs? 
26 | variants = VariantCollection([ 27 | Variant( 28 | contig=10, 29 | start=100018900, 30 | ref='C', 31 | alt='T', 32 | ensembl=ensembl_grch37), 33 | Variant( 34 | contig=11, 35 | start=32861682, 36 | ref='G', 37 | alt='A', 38 | ensembl=ensembl_grch37)]) 39 | 40 | alleles = [ 41 | "HLA-DPA1*01:05/DPB1*100:01", 42 | "DRB10102" 43 | ] 44 | 45 | mhc_model = NetMHCIIpan( 46 | alleles=alleles, 47 | default_peptide_lengths=[15, 16]) 48 | 49 | def test_netmhcii_pan_epitopes(): 50 | epitope_predictions = TopiaryPredictor( 51 | mhc_model=mhc_model, 52 | only_novel_epitopes=True).predict_from_variants(variants=variants) 53 | 54 | # expect (15 + 16 mutant peptides) * (2 alleles) * 2 variants = 55 | # 124 total epitope predictions 56 | eq_(len(epitope_predictions), 124) 57 | unique_alleles = set(epitope_predictions.allele) 58 | assert len(unique_alleles) == 2, \ 59 | "Expected 2 unique alleles, got %s" % (unique_alleles,) 60 | unique_lengths = set(epitope_predictions.peptide_length) 61 | assert unique_lengths == {15, 16}, \ 62 | "Expected epitopes of length 15 and 16 but got lengths %s" % (unique_lengths,) 63 | -------------------------------------------------------------------------------- /test/test_padding.py: -------------------------------------------------------------------------------- 1 | from nose.tools import eq_, assert_raises 2 | from topiary import check_padding_around_mutation 3 | 4 | def test_default_padding(): 5 | # expect padding to be one less than the largest epitope length 6 | eq_(check_padding_around_mutation(None, [8, 9, 10]), 9) 7 | 8 | def test_invalid_padding(): 9 | # padding is insufficient for the epitope lengths given 10 | with assert_raises(ValueError): 11 | check_padding_around_mutation(2, [9]) 12 | -------------------------------------------------------------------------------- /test/test_peptide_mutation_interval.py: -------------------------------------------------------------------------------- 1 | from nose.tools import eq_, assert_raises 2 
| from topiary import peptide_mutation_interval 3 | 4 | def test_peptide_mutation_interval_middle(): 5 | start, end = peptide_mutation_interval( 6 | peptide_start_in_protein=10, 7 | peptide_length=9, 8 | mutation_start_in_protein=11, 9 | mutation_end_in_protein=12) 10 | eq_(start, 1) 11 | eq_(end, 2) 12 | 13 | 14 | def test_peptide_mutation_interval_start(): 15 | start, end = peptide_mutation_interval( 16 | peptide_start_in_protein=10, 17 | peptide_length=9, 18 | mutation_start_in_protein=7, 19 | mutation_end_in_protein=12) 20 | eq_(start, 0) 21 | eq_(end, 2) 22 | 23 | def test_peptide_mutation_interval_end(): 24 | start, end = peptide_mutation_interval( 25 | peptide_start_in_protein=10, 26 | peptide_length=9, 27 | mutation_start_in_protein=18, 28 | mutation_end_in_protein=20) 29 | eq_(start, 8) 30 | eq_(end, 9) 31 | 32 | def test_peptide_mutation_interval_deletion(): 33 | start, end = peptide_mutation_interval( 34 | peptide_start_in_protein=10, 35 | peptide_length=9, 36 | mutation_start_in_protein=15, 37 | mutation_end_in_protein=15) 38 | eq_(start, 5) 39 | eq_(end, 5) 40 | 41 | 42 | def test_peptide_mutation_interval_no_overlap_before(): 43 | with assert_raises(ValueError): 44 | peptide_mutation_interval( 45 | peptide_start_in_protein=10, 46 | peptide_length=9, 47 | mutation_start_in_protein=5, 48 | mutation_end_in_protein=6) 49 | 50 | def test_peptide_mutation_interval_no_overlap_after(): 51 | with assert_raises(ValueError): 52 | peptide_mutation_interval( 53 | peptide_start_in_protein=10, 54 | peptide_length=9, 55 | mutation_start_in_protein=25, 56 | mutation_end_in_protein=26) 57 | -------------------------------------------------------------------------------- /test/test_rna_helpers.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from topiary.rna.cufflinks import parse_locus_column 3 | from nose.tools import eq_ 4 | 5 | 6 | def test_parse_locus_column_with_chr(): 7 | """ 8 | 
test_parse_locus_column_with_chr: Test that 'chr' prefix from 9 | chromosome names gets correctly dropped 10 | """ 11 | df = pd.DataFrame({"locus": ["chr1:10-20", "chrX:30-40"]}) 12 | loci = df["locus"] 13 | chromosomes, starts, ends = parse_locus_column(loci) 14 | eq_(list(chromosomes), ["1", "X"]) 15 | eq_(list(starts), [10, 30]) 16 | eq_(list(ends), [20, 40]) 17 | 18 | 19 | def test_parse_locus_column_without_chr(): 20 | """ 21 | test_parse_locus_column_without_chr: Test that chromosome names can be 22 | parsed without 'chr' prefix 23 | """ 24 | df = pd.DataFrame({"locus": ["1:10-20", "X:30-40"]}) 25 | loci = df["locus"] 26 | chromosomes, starts, ends = parse_locus_column(loci) 27 | eq_(list(chromosomes), ["1", "X"]) 28 | eq_(list(starts), [10, 30]) 29 | eq_(list(ends), [20, 40]) 30 | -------------------------------------------------------------------------------- /test/test_variant_expression_filters.py: -------------------------------------------------------------------------------- 1 | 2 | from topiary.filters import apply_variant_expression_filters 3 | 4 | from .data import ( 5 | cancer_test_variants, 6 | cancer_test_variant_gene_ids, 7 | cancer_test_variant_transcript_ids, 8 | ) 9 | 10 | DEFAULT_FPKM = 1.0 11 | 12 | # associate every gene ID with 1.0 FPKM 13 | gene_expression_dict = { 14 | gene_id: DEFAULT_FPKM 15 | for gene_id in cancer_test_variant_gene_ids 16 | } 17 | 18 | # associate every transcript with 1.0 FPKM 19 | transcript_expression_dict = { 20 | transcript_id: DEFAULT_FPKM 21 | for transcript_id in cancer_test_variant_transcript_ids 22 | } 23 | 24 | def test_apply_variant_gene_expression_below_threshold(): 25 | filtered = apply_variant_expression_filters( 26 | cancer_test_variants, 27 | gene_expression_dict=gene_expression_dict, 28 | gene_expression_threshold=2 * DEFAULT_FPKM, 29 | transcript_expression_dict=None, 30 | transcript_expression_threshold=None) 31 | assert len(filtered) == 0, \ 32 | "All variants should have been filtered out but 
def test_apply_variant_gene_expression_below_threshold():
    # threshold above every gene's FPKM: nothing should survive
    kept = apply_variant_expression_filters(
        cancer_test_variants,
        gene_expression_dict=gene_expression_dict,
        gene_expression_threshold=2 * DEFAULT_FPKM,
        transcript_expression_dict=None,
        transcript_expression_threshold=None)
    assert len(kept) == 0, \
        "All variants should have been filtered out but got: %s" % (kept,)


def test_apply_variant_gene_expression_above_threshold():
    # threshold below every gene's FPKM: everything should survive
    kept = apply_variant_expression_filters(
        cancer_test_variants,
        gene_expression_dict=gene_expression_dict,
        gene_expression_threshold=0.5 * DEFAULT_FPKM,
        transcript_expression_dict=None,
        transcript_expression_threshold=None)
    n_expected = len(cancer_test_variants)
    n_kept = len(kept)
    assert n_kept == n_expected, \
        "Expected %s variants but got %s" % (n_expected, n_kept)


def test_apply_variant_transcript_expression_below_threshold():
    # threshold above every transcript's FPKM: nothing should survive
    kept = apply_variant_expression_filters(
        cancer_test_variants,
        gene_expression_dict=None,
        gene_expression_threshold=None,
        transcript_expression_dict=transcript_expression_dict,
        transcript_expression_threshold=2 * DEFAULT_FPKM)
    assert len(kept) == 0, \
        "All variants should have been filtered out but got: %s" % (kept,)


def test_apply_variant_transcript_expression_above_threshold():
    # threshold below every transcript's FPKM: everything should survive
    kept = apply_variant_expression_filters(
        cancer_test_variants,
        gene_expression_dict=None,
        gene_expression_threshold=None,
        transcript_expression_dict=transcript_expression_dict,
        transcript_expression_threshold=0.5 * DEFAULT_FPKM)
    n_expected = len(cancer_test_variants)
    n_kept = len(kept)
    assert n_kept == n_expected, \
        "Expected %s variants but got %s" % (n_expected, n_kept)
"peptide_mutation_interval", 16 | "protein_subsequences_around_mutations", 17 | ] 18 | -------------------------------------------------------------------------------- /topiary/cli/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /topiary/cli/args.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | """ 16 | Common commandline arguments used by scripts 17 | """ 18 | 19 | from __future__ import print_function, division, absolute_import 20 | 21 | from argparse import ArgumentParser 22 | from mhctools.cli import add_mhc_args, mhc_binding_predictor_from_args 23 | from varcode.cli import add_variant_args, variant_collection_from_args 24 | 25 | from .filtering import add_filter_args 26 | from .rna import ( 27 | add_rna_args, 28 | rna_gene_expression_dict_from_args, 29 | rna_transcript_expression_dict_from_args, 30 | ) 31 | from .sequence import add_sequence_args 32 | from .errors import add_error_args 33 | from .outputs import add_output_args 34 | from .protein_changes import add_protein_change_args 35 | from ..predictor import TopiaryPredictor 36 | 37 | def create_arg_parser( 38 | rna=True, 39 | mhc=True, 40 | variants=True, 41 | protein_changes=True, 42 | filters=True, 43 | sequence_options=True, 44 | error_options=True, 45 | output=True): 46 | arg_parser = ArgumentParser() 47 | if rna: 48 | add_rna_args(arg_parser) 49 | if mhc: 50 | add_mhc_args(arg_parser) 51 | if variants: 52 | add_variant_args(arg_parser) 53 | if protein_changes: 54 | add_protein_change_args(arg_parser) 55 | if filters: 56 | add_filter_args(arg_parser) 57 | if sequence_options: 58 | add_sequence_args(arg_parser) 59 | if error_options: 60 | add_error_args(arg_parser) 61 | if output: 62 | add_output_args(arg_parser) 63 | return arg_parser 64 | 65 | # keeping global instance for backwards compatibility with existing code 66 | arg_parser = create_arg_parser() 67 | 68 | def predict_epitopes_from_args(args): 69 | """ 70 | Returns an epitope collection from the given commandline arguments. 
def predict_epitopes_from_args(args):
    """
    Returns an epitope collection from the given commandline arguments.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed commandline arguments for Topiary
    """
    # build inputs in the same order as before: MHC model, variants,
    # then the (optional) expression dictionaries
    binding_model = mhc_binding_predictor_from_args(args)
    variant_collection = variant_collection_from_args(args)
    gene_fpkm_dict = rna_gene_expression_dict_from_args(args)
    transcript_fpkm_dict = rna_transcript_expression_dict_from_args(args)

    predictor = TopiaryPredictor(
        mhc_model=binding_model,
        padding_around_mutation=args.padding_around_mutation,
        ic50_cutoff=args.ic50_cutoff,
        percentile_cutoff=args.percentile_cutoff,
        min_transcript_expression=args.rna_min_transcript_expression,
        min_gene_expression=args.rna_min_gene_expression,
        only_novel_epitopes=args.only_novel_epitopes,
        raise_on_error=not args.skip_variant_errors)
    return predictor.predict_from_variants(
        variants=variant_collection,
        transcript_expression_dict=transcript_fpkm_dict,
        gene_expression_dict=gene_fpkm_dict)
14 | 15 | """ 16 | Commandline arguments related to error handling 17 | """ 18 | 19 | from __future__ import print_function, division, absolute_import 20 | 21 | def add_error_args(arg_parser): 22 | error_group = arg_parser.add_argument_group( 23 | title="Errors", 24 | description="Options for error handling") 25 | 26 | error_group.add_argument( 27 | "--skip-variant-errors", 28 | default=False, 29 | action="store_true", 30 | help="Skip variants which cause runtime errors of any kind") 31 | 32 | return error_group 33 | -------------------------------------------------------------------------------- /topiary/cli/filtering.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | """ 17 | Commandline arguments related to epitope filtering 18 | """ 19 | 20 | from __future__ import print_function, division, absolute_import 21 | 22 | def add_filter_args(arg_parser): 23 | filter_group = arg_parser.add_argument_group( 24 | title="Filtering Options", 25 | description="Criteria for removing epitopes from results") 26 | 27 | filter_group.add_argument( 28 | "--ic50-cutoff", 29 | help="Drop epitopes with predicted IC50 nM affinity above this value", 30 | default=None, 31 | type=float) 32 | 33 | filter_group.add_argument( 34 | "--percentile-cutoff", 35 | help="Drop epitopes with predicted IC50 percentile rank above this value", 36 | default=None, 37 | type=float) 38 | 39 | filter_group.add_argument( 40 | "--only-novel-epitopes", 41 | help="".join([ 42 | "Drop epitopes which do not contain mutated residues or occur ", 43 | "in the self-ligandome."]), 44 | default=False, 45 | action="store_true") 46 | 47 | filter_group.add_argument( 48 | "--wildtype-ligandome-directory", 49 | help="".join([ 50 | "Directory of 'self' ligand peptide sets, in files named ", 51 | "by allele (e.g. 'A0201'). Any predicted mutant epitope which ", 52 | "is in the files associated with the given alleles is treated as ", 53 | "wildtype (non-mutated)."])) 54 | return filter_group 55 | -------------------------------------------------------------------------------- /topiary/cli/outputs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """ 16 | Common commandline arguments for output files 17 | """ 18 | 19 | from __future__ import print_function, division, absolute_import 20 | 21 | import logging 22 | 23 | def add_output_args(arg_parser): 24 | output_group = arg_parser.add_argument_group( 25 | title="Output", 26 | description="How and where to write results") 27 | 28 | output_group.add_argument( 29 | "--output-csv", 30 | default=None, 31 | help="Path to output CSV file") 32 | 33 | output_group.add_argument( 34 | "--output-html", 35 | default=None, 36 | help="Path to output HTML file") 37 | 38 | output_group.add_argument( 39 | "--output-csv-sep", 40 | default=",", 41 | help="Separator for CSV file") 42 | 43 | output_group.add_argument( 44 | "--subset-output-columns", 45 | nargs="*") 46 | 47 | output_group.add_argument( 48 | "--rename-output-column", 49 | nargs=2, 50 | action="append", 51 | help=( 52 | "Rename original column (first parameter) to new" 53 | " name (second parameter)")) 54 | 55 | output_group.add_argument( 56 | "--print-columns", 57 | default=False, 58 | action="store_true", 59 | help="Print columns before writing data to file(s)") 60 | 61 | return output_group 62 | 63 | def write_outputs( 64 | df, 65 | args, 66 | print_df_before_filtering=False, 67 | print_df_after_filtering=False): 68 | if print_df_before_filtering: 69 | print(df) 70 | 71 | if args.subset_output_columns: 72 | subset_columns = [] 73 | for column in args.subset_output_columns: 74 | if column not in df.columns: 75 | logging.warn( 76 | "Invalid column name '%s', 
def write_outputs(
        df,
        args,
        print_df_before_filtering=False,
        print_df_after_filtering=False):
    """
    Write a DataFrame of epitope predictions to the requested output
    file(s), after applying the column subset/rename options.

    Parameters
    ----------
    df : pandas.DataFrame
        Epitope predictions; not modified by this function.

    args : argparse.Namespace
        Parsed arguments containing the options added by add_output_args.

    print_df_before_filtering : bool
        Print the DataFrame before column subsetting/renaming.

    print_df_after_filtering : bool
        Print the DataFrame after column subsetting/renaming.
    """
    if print_df_before_filtering:
        print(df)

    if args.subset_output_columns:
        subset_columns = []
        for column in args.subset_output_columns:
            if column not in df.columns:
                # logging.warn is a deprecated alias (removed in Python 3.13)
                logging.warning(
                    "Invalid column name '%s', available: %s" % (
                        column, list(df.columns)))
            else:
                subset_columns.append(column)
        df = df[subset_columns]

    if args.rename_output_column:
        for (old_name, new_name) in args.rename_output_column:
            if old_name not in df.columns:
                logging.warning(
                    "Can't rename column '%s' since it doesn't exist, available: %s" % (
                        old_name, list(df.columns)))
            else:
                # rename on a copy rather than mutating the caller's
                # DataFrame (previous inplace=True leaked the rename
                # back to the caller when no subset was requested)
                df = df.rename(columns={old_name: new_name})

    if print_df_after_filtering:
        print(df)

    if args.print_columns:
        print("Columns:")
        for column in df.columns:
            print("-- %s" % column)

    if args.output_csv:
        print("Saving %s..." % args.output_csv)
        df.to_csv(
            args.output_csv,
            index=True,
            index_label="#",
            sep=args.output_csv_sep)

    if args.output_html:
        print("Saving %s..." % args.output_html)
        df.to_html(args.output_html, index=True)
def add_protein_change_args(arg_parser):
    """
    Register options for specifying protein changes directly
    (without an associated genomic variant).

    Parameters
    ----------
    arg_parser : argparse.ArgumentParser

    Returns the parser itself (unlike the other add_*_args helpers,
    which return the argument group).
    """
    group = arg_parser.add_argument_group(
        title="Protein Changes",
        description="Input protein changes without associated genomic variants")
    group.add_argument(
        "--protein-change",
        nargs=2,
        action="append",
        default=[],
        help="Protein modification without genomic variant (e.g. EGFR T790M)")
    return arg_parser

def genome_from_args(args):
    """
    Return the reference genome named by --genome, or GRCh38 when the
    genome is expected to be inferred from the input file(s).
    """
    if args.genome:
        return infer_genome(args.genome)
    # no genome specified, assume it can be inferred from the file(s)
    # we're loading
    return ensembl_grch38
def transcript_sort_key(transcript):
    """
    Sort key ordering transcripts by decreasing protein length, then
    decreasing transcript (nucleotide) sequence length, then name.

    The two lengths are negated so a single ascending sort puts the
    longest sequences first while names still sort alphabetically
    (TP53-001 before TP53-002) — `reverse=True` would flip the name
    ordering too.
    """
    protein_len = len(transcript.protein_sequence)
    nucleotide_len = len(transcript.sequence)
    return (-protein_len, -nucleotide_len, transcript.name)

def best_transcript(transcripts):
    """
    Given a non-empty set of coding transcripts, choose the one with the
    longest protein sequence; break ties by total transcript sequence
    length (including UTRs) and then by transcript name.
    """
    assert len(transcripts) > 0
    # min() with the same key returns the first minimal element,
    # identical to sorted(...)[0]
    return min(transcripts, key=transcript_sort_key)
def protein_change_effects_from_args(args):
    """
    Build a varcode EffectCollection of Substitution effects from the
    (gene name, protein change) pairs given via --protein-change.

    For each change (e.g. "T790M") every protein coding transcript of
    the gene is inspected; only transcripts whose protein sequence
    actually contains the reference amino acids at the given position
    are candidates, and the effect is attached to the longest such
    transcript (see best_transcript). Unparseable changes and unknown
    gene names are warned about and skipped.

    Parameters
    ----------
    args : argparse.Namespace

    Returns varcode.EffectCollection
    """
    genome = genome_from_args(args)
    valid_gene_names = set(genome.gene_names())
    # e.g. "T790M" -> ("T", "790", "M"); fullmatch (rather than match)
    # rejects inputs with trailing junk such as "T790M?" instead of
    # silently ignoring the extra characters
    substitution_regex = re.compile(r"([A-Z]+)([0-9]+)([A-Z]+)")
    effects = []
    for gene_name, protein_change_string in args.protein_change:
        match_obj = substitution_regex.fullmatch(protein_change_string)
        if match_obj is None:
            # logging.warn is a deprecated alias (removed in Python 3.13)
            logging.warning(
                "Unable to parse protein modification: '%s'" % protein_change_string)
            continue

        ref, base1_pos, alt = match_obj.groups()
        base1_pos = int(base1_pos)

        if gene_name not in valid_gene_names:
            logging.warning("Invalid gene name '%s' in protein modification: '%s'" % (
                gene_name, protein_change_string))
            continue

        candidate_transcripts = []
        for candidate_gene in genome.genes_by_name(gene_name):
            for candidate_transcript in candidate_gene.transcripts:
                if not candidate_transcript.is_protein_coding:
                    continue
                protein_sequence = candidate_transcript.protein_sequence
                if protein_sequence is None:
                    continue
                if len(protein_sequence) < (base1_pos + len(ref) - 1):
                    # protein sequence too short for this modification
                    # e.g. EGFR T790M can't happen in an EGFR transcript
                    # with only 789 amino acids
                    continue

                seq_at_pos = protein_sequence[base1_pos - 1: base1_pos + len(ref) - 1]
                if seq_at_pos != ref:
                    # if this transcript doesn't have the same reference amino
                    # acids as the change then skip it and use a different
                    # transcript
                    continue
                candidate_transcripts.append(candidate_transcript)
        if len(candidate_transcripts) > 0:
            transcript = best_transcript(candidate_transcripts)
            effects.append(Substitution(
                variant=None,
                transcript=transcript,
                aa_ref=ref,
                aa_alt=alt,
                aa_mutation_start_offset=base1_pos - 1))
    return EffectCollection(effects)
14 | 15 | """ 16 | Common commandline arguments for filtering by gene/transcript expression 17 | """ 18 | 19 | from __future__ import print_function, division, absolute_import 20 | 21 | from ..rna import ( 22 | load_cufflinks_fpkm_dict, 23 | load_transcript_fpkm_dict_from_gtf 24 | ) 25 | 26 | def add_rna_args(arg_parser): 27 | rna_group = arg_parser.add_argument_group( 28 | title="RNA-Seq", 29 | description="Transcript and gene abundance quantification") 30 | 31 | rna_group.add_argument( 32 | "--rna-transcript-fpkm-tracking-file", 33 | help="".join([ 34 | "Cufflinks tracking file (FPKM estimates for Ensembl transcripts). ", 35 | "Used both for expression filtering and selecting the most abundant ", 36 | "transcript to use for determining a mutant protein sequence."])) 37 | 38 | rna_group.add_argument( 39 | "--rna-transcript-fpkm-gtf-file", 40 | help="".join([ 41 | "GTF file containing FPKM estimates for Ensembl transcripts.", 42 | "Used both for expression filtering and selecting the most abundant ", 43 | "transcript to use for determining a mutant protein sequence."])) 44 | 45 | rna_group.add_argument( 46 | "--rna-min-transcript-expression", 47 | help="Minimum FPKM for transcript expression", 48 | default=0.0, 49 | type=float) 50 | 51 | rna_group.add_argument( 52 | "--rna-gene-fpkm-tracking-file", 53 | help="Cufflinks tracking file (FPKM estimates for Ensembl genes)", 54 | required=False) 55 | 56 | rna_group.add_argument( 57 | "--rna-min-gene-expression", 58 | help="Minimum FPKM for gene expression", 59 | default=0.0, 60 | type=float) 61 | 62 | return rna_group 63 | 64 | def rna_gene_expression_dict_from_args(args): 65 | """ 66 | Returns a dictionary mapping Ensembl gene IDs to FPKM expression values 67 | or None if neither Cufflinks tracking file nor StringTie GTF file specified 68 | in the commandline arguments. 
def rna_gene_expression_dict_from_args(args):
    """
    Returns a dictionary mapping Ensembl gene IDs to FPKM expression values
    or None if no Cufflinks gene tracking file was specified in the
    commandline arguments.

    Note: unlike transcript expression, gene expression can currently only
    be loaded from a Cufflinks tracking file (no StringTie GTF support).
    """
    if args.rna_gene_fpkm_tracking_file:
        return load_cufflinks_fpkm_dict(args.rna_gene_fpkm_tracking_file)
    return None

def rna_transcript_expression_dict_from_args(args):
    """
    Returns a dictionary mapping Ensembl transcript IDs to FPKM expression
    values or None if neither Cufflinks tracking file nor StringTie GTF file
    were specified.
    """
    if args.rna_transcript_fpkm_tracking_file:
        return load_cufflinks_fpkm_dict(args.rna_transcript_fpkm_tracking_file)
    if args.rna_transcript_fpkm_gtf_file:
        return load_transcript_fpkm_dict_from_gtf(
            args.rna_transcript_fpkm_gtf_file)
    return None
18 | 19 | Example usage: 20 | topiary \ 21 | --mhc-predictor netmhcpan 22 | --mhc-alleles-file HLA.txt 23 | --vcf somatic.vcf 24 | --rna-gene-fpkm-file genes.fpkm_tracking 25 | --rna-transcript-fpkm-file isoforms.fpkm_tracking 26 | --filter-ic50 500 27 | --filter-percentile 2 28 | --output results.csv 29 | """ 30 | 31 | from __future__ import print_function, division, absolute_import 32 | 33 | import sys 34 | 35 | from .args import arg_parser, predict_epitopes_from_args 36 | 37 | from .outputs import write_outputs 38 | 39 | 40 | def parse_args(args_list=None): 41 | if args_list is None: 42 | args_list = sys.argv[1:] 43 | return arg_parser.parse_args(args_list) 44 | 45 | def main(args_list=None): 46 | """ 47 | Script entry-point to predict neo-epitopes from genomic variants using 48 | Topiary. 49 | """ 50 | args = parse_args(args_list) 51 | print("Topiary commandline arguments:") 52 | print(args) 53 | df = predict_epitopes_from_args(args) 54 | write_outputs(df, args) 55 | print("Total count: %d" % len(df)) 56 | -------------------------------------------------------------------------------- /topiary/cli/sequence.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | """ 17 | Commandline arguments related to translated variant protein sequences. 
18 | """ 19 | 20 | from __future__ import print_function, division, absolute_import 21 | 22 | def add_sequence_args(arg_parser): 23 | sequence_group = arg_parser.add_argument_group( 24 | title="Protein Sequence Options", 25 | description="Parameters related to the mutant protein sequence") 26 | 27 | sequence_group.add_argument( 28 | "--padding-around-mutation", 29 | default=None, 30 | help="".join([ 31 | "How many extra amino acids to include on either side of a mutation.", 32 | "Default is determined by epitope lengths but can be overridden to ", 33 | "predict wildtype epitopes in a larger context around a mutant residue.", 34 | ]), 35 | type=int) 36 | 37 | return sequence_group 38 | -------------------------------------------------------------------------------- /topiary/filters.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | """ 16 | Helper functions for filtering variants, effects, and epitope predictions 17 | """ 18 | 19 | from __future__ import print_function, division, absolute_import 20 | import logging 21 | 22 | from varcode import NonsilentCodingMutation 23 | 24 | def apply_filter( 25 | filter_fn, 26 | collection, 27 | result_fn=None, 28 | filter_name="", 29 | collection_name=""): 30 | """ 31 | Apply filter to effect collection and print number of dropped elements 32 | 33 | Parameters 34 | ---------- 35 | """ 36 | n_before = len(collection) 37 | filtered = [x for x in collection if filter_fn(x)] 38 | n_after = len(filtered) 39 | if not collection_name: 40 | collection_name = collection.__class__.__name__ 41 | logging.info( 42 | "%s filtering removed %d/%d entries of %s", 43 | filter_name, 44 | (n_before - n_after), 45 | n_before, 46 | collection_name) 47 | return result_fn(filtered) if result_fn else collection.__class__(filtered) 48 | 49 | def filter_silent_and_noncoding_effects(effects): 50 | """ 51 | Keep only variant effects which result in modified proteins. 
def filter_silent_and_noncoding_effects(effects):
    """
    Keep only variant effects which result in modified proteins.

    Parameters
    ----------
    effects : varcode.EffectCollection
        Predicted effects; anything that is not a NonsilentCodingMutation
        (silent, intronic, noncoding, &c) is dropped.
    """
    return apply_filter(
        filter_fn=lambda effect: isinstance(effect, NonsilentCodingMutation),
        collection=effects,
        result_fn=effects.clone_with_new_elements,
        filter_name="Silent mutation")


def apply_variant_expression_filters(
        variants,
        gene_expression_dict,
        gene_expression_threshold,
        transcript_expression_dict,
        transcript_expression_threshold):
    """
    Filter a collection of variants by gene and transcript expression thresholds

    A variant is kept if any of its overlapping genes (or transcripts)
    meets the corresponding threshold; IDs missing from a dictionary are
    treated as 0.0 expression. Either dictionary may be None/empty to
    skip that filter entirely.

    Parameters
    ----------
    variants : varcode.VariantCollection

    gene_expression_dict : dict
        Maps Ensembl gene IDs to FPKM expression values (or None).

    gene_expression_threshold : float
        Minimum FPKM for a gene to count as expressed.

    transcript_expression_dict : dict
        Maps Ensembl transcript IDs to FPKM expression values (or None).

    transcript_expression_threshold : float
        Minimum FPKM for a transcript to count as expressed.
    """
    if gene_expression_dict:
        variants = apply_filter(
            lambda variant: any(
                gene_expression_dict.get(gene_id, 0.0) >=
                gene_expression_threshold
                for gene_id in variant.gene_ids
            ),
            variants,
            result_fn=variants.clone_with_new_elements,
            filter_name="Variant gene expression (min=%0.4f)" % gene_expression_threshold)
    if transcript_expression_dict:
        variants = apply_filter(
            lambda variant: any(
                transcript_expression_dict.get(transcript_id, 0.0) >=
                transcript_expression_threshold
                for transcript_id in variant.transcript_ids
            ),
            variants,
            result_fn=variants.clone_with_new_elements,
            filter_name=(
                "Variant transcript expression (min=%0.4f)" % (
                    transcript_expression_threshold,)))
    return variants
def apply_effect_expression_filters(
        effects,
        gene_expression_dict,
        gene_expression_threshold,
        transcript_expression_dict,
        transcript_expression_threshold):
    """
    Filter collection of varcode effects by given gene
    and transcript expression thresholds.

    Unlike the variant filters, each effect is associated with a single
    gene and transcript, so the thresholds apply to those IDs directly
    (missing IDs count as 0.0 expression). Either dictionary may be
    None/empty to skip that filter.

    Parameters
    ----------
    effects : varcode.EffectCollection

    gene_expression_dict : dict
        Maps Ensembl gene IDs to FPKM expression values (or None).

    gene_expression_threshold : float
        Minimum FPKM for the effect's gene.

    transcript_expression_dict : dict
        Maps Ensembl transcript IDs to FPKM expression values (or None).

    transcript_expression_threshold : float
        Minimum FPKM for the effect's transcript.
    """
    if gene_expression_dict:
        effects = apply_filter(
            lambda effect: (
                gene_expression_dict.get(effect.gene_id, 0.0) >=
                gene_expression_threshold),
            effects,
            result_fn=effects.clone_with_new_elements,
            # label normalized to "min=" for consistency with the other
            # expression filters' log messages
            filter_name="Effect gene expression (min=%0.4f)" % gene_expression_threshold)

    if transcript_expression_dict:
        effects = apply_filter(
            lambda effect: (
                transcript_expression_dict.get(effect.transcript_id, 0.0) >=
                transcript_expression_threshold
            ),
            effects,
            result_fn=effects.clone_with_new_elements,
            filter_name=(
                "Effect transcript expression (min=%0.4f)" % (
                    transcript_expression_threshold,)))
    return effects
from __future__ import print_function, division, absolute_import

import logging

from collections import OrderedDict


from .filters import (
    apply_effect_expression_filters,
    apply_variant_expression_filters,
    filter_silent_and_noncoding_effects,
)
from .sequence_helpers import (
    protein_subsequences_around_mutations,
    check_padding_around_mutation,
    contains_mutant_residues,
    peptide_mutation_interval,
)

class TopiaryPredictor(object):
    # Predicts neo-epitopes by running an MHC binding model over mutant
    # protein subsequences and filtering the results.
    def __init__(
            self,
            mhc_model,
            padding_around_mutation=None,
            ic50_cutoff=None,
            percentile_cutoff=None,
            min_gene_expression=0.0,
            min_transcript_expression=0.0,
            only_novel_epitopes=False,
            raise_on_error=True):
        """
        Parameters
        ----------
        mhc_model : mhctools.BasePredictor
            Any instance of a peptide-MHC binding affinity predictor

        padding_around_mutation : int
            How many residues surrounding a mutation to consider including in a
            candidate epitope. Default is the minimum size necessary for epitope
            length of the mhc model.

        min_gene_expression : float, optional
            If gene expression values are provided, only keep effects on
            genes with expression above this threshold.

        min_transcript_expression : float, optional
            If transcript expression values are provided, only keep effects on
            transcripts with expression above this threshold.

        ic50_cutoff : float, optional
            Maximum predicted IC50 value for a peptide to be considered a binder.

        percentile_cutoff : float, optional
            Maximum percentile rank of IC50 values for a peptide to be considered
            a binder.

        only_novel_epitopes : bool, optional
            If True, then drop peptides which don't contain a mutation.
            TODO: make this also check that peptide doesn't occur elsewhere in
            the reference ligandome

        raise_on_error : bool
            Raise an exception if error is encountered or skip
            the variant or peptide which generated the error.
        """
        self.mhc_model = mhc_model
        # validated/derived padding; falls back to what the MHC model's
        # default peptide lengths require
        self.padding_around_mutation = check_padding_around_mutation(
            given_padding=padding_around_mutation,
            epitope_lengths=self.mhc_model.default_peptide_lengths)
        self.ic50_cutoff = ic50_cutoff
        self.percentile_cutoff = percentile_cutoff
        self.min_transcript_expression = min_transcript_expression
        self.min_gene_expression = min_gene_expression
        self.only_novel_epitopes = only_novel_epitopes
        self.raise_on_error = raise_on_error

    def predict_from_named_sequences(
            self, name_to_sequence_dict):
        """
        Run the MHC model over all subsequences of the named input
        sequences and normalize the resulting column names.

        Parameters
        ----------
        name_to_sequence_dict : (str->str) dict
            Dictionary mapping sequence names to amino acid sequences

        Returns pandas.DataFrame with the following columns:
            - source_sequence_name
            - peptide
            - peptide_offset
            - peptide_length
            - allele
            - affinity
            - percentile_rank
            - prediction_method_name
        """
        df = self.mhc_model.predict_subsequences_dataframe(name_to_sequence_dict)
        # rename mhctools' generic column names to Topiary's convention
        return df.rename(
            columns={
                "length": "peptide_length",
                "offset": "peptide_offset"})
118 | 119 | Parameters 120 | ---------- 121 | sequences : list of str 122 | Multiple amino acid sequences (without any names or IDs) 123 | 124 | Returns DataFrame with the following fields: 125 | - source_sequence 126 | - peptide 127 | - peptide_offset 128 | - peptide_length 129 | - allele 130 | - affinity 131 | - percentile_rank 132 | - prediction_method_name 133 | """ 134 | # make each sequence its own unique ID 135 | sequence_dict = { 136 | seq: seq 137 | for seq in sequences 138 | } 139 | df = self.predict_from_named_sequences(sequence_dict) 140 | return df.rename(columns={"source_sequence_name": "source_sequence"}) 141 | 142 | def predict_from_mutation_effects( 143 | self, 144 | effects, 145 | transcript_expression_dict=None, 146 | gene_expression_dict=None): 147 | """Given a Varcode.EffectCollection of predicted protein effects, 148 | return predicted epitopes around each mutation. 149 | 150 | Parameters 151 | ---------- 152 | effects : Varcode.EffectCollection 153 | 154 | transcript_expression_dict : dict 155 | Dictionary mapping transcript IDs to RNA expression estimates. Used 156 | both for transcript expression filtering and for selecting the 157 | most abundant transcript for a particular variant. If omitted then 158 | transcript selection is done using priority of variant effects and 159 | transcript length. 
160 | 161 | gene_expression_dict : dict, optional 162 | Dictionary mapping gene IDs to RNA expression estimates 163 | 164 | Returns DataFrame with the following columns: 165 | - variant 166 | - gene 167 | - gene_id 168 | - transcript_id 169 | - transcript_name 170 | - effect 171 | - effect_type 172 | - peptide 173 | - peptide_offset 174 | - peptide_length 175 | - allele 176 | - affinity 177 | - percentile_rank 178 | - prediction_method_name 179 | - contains_mutant_residues 180 | - mutation_start_in_peptide 181 | - mutation_end_in_peptide 182 | 183 | Optionall will also include the following columns if corresponding 184 | expression dictionary inputs are provided: 185 | - gene_expression 186 | - transcript_expression 187 | """ 188 | 189 | # we only care about effects which impact the coding sequence of a 190 | # protein 191 | effects = filter_silent_and_noncoding_effects(effects) 192 | 193 | effects = apply_effect_expression_filters( 194 | effects, 195 | transcript_expression_dict=transcript_expression_dict, 196 | transcript_expression_threshold=self.min_transcript_expression, 197 | gene_expression_dict=gene_expression_dict, 198 | gene_expression_threshold=self.min_gene_expression) 199 | 200 | # group by variants, so that we end up with only one mutant 201 | # sequence per mutation 202 | variant_effect_groups = effects.groupby_variant() 203 | 204 | if len(variant_effect_groups) == 0: 205 | logging.warn("No candidates for MHC binding prediction") 206 | return [] 207 | 208 | if transcript_expression_dict: 209 | # if expression data is available, then for each variant 210 | # keep the effect annotation for the most abundant transcript 211 | top_effects = [ 212 | variant_effects.top_expression_effect( 213 | transcript_expression_dict) 214 | for variant_effects in variant_effect_groups.values() 215 | ] 216 | else: 217 | # if no transcript abundance data is available, then 218 | # for each variant keep the effect with the most significant 219 | # predicted effect on the 
protein sequence, along with using 220 | # transcript/CDS length as a tie-breaker for effects with the same 221 | # priority. 222 | top_effects = [ 223 | variant_effects.top_priority_effect() 224 | for variant_effects in variant_effect_groups.values() 225 | ] 226 | 227 | # 1) dictionary mapping varcode effect objects to subsequences 228 | # around each mutation 229 | # 2) dictionary mapping varcode effect to start offset of subsequence 230 | # within the full mutant protein sequence 231 | effect_to_subsequence_dict, effect_to_offset_dict = \ 232 | protein_subsequences_around_mutations( 233 | effects=top_effects, 234 | padding_around_mutation=self.padding_around_mutation) 235 | 236 | # since we know that each set of variant effects has been 237 | # reduced to a single 'top priority' effect, we can uniquely 238 | # identify each variant sequence by its original genomic variant 239 | variant_string_to_effect_dict = { 240 | effect.variant.short_description: effect 241 | for effect in effect_to_subsequence_dict.keys() 242 | } 243 | variant_string_to_subsequence_dict = { 244 | effect.variant.short_description: subseq 245 | for (effect, subseq) in effect_to_subsequence_dict.items() 246 | } 247 | variant_string_to_offset_dict = { 248 | effect.variant.short_description: subseq_offset 249 | for (effect, subseq_offset) in effect_to_offset_dict.items() 250 | } 251 | df = self.predict_from_named_sequences(variant_string_to_subsequence_dict) 252 | logging.info("MHC predictor returned %d peptide binding predictions" % ( 253 | len(df))) 254 | 255 | # since we used variant descrptions as the name of each sequence 256 | # let's rename that column to be more informative 257 | df = df.rename(columns={"source_sequence_name": "variant"}) 258 | 259 | # adjust offset to be relative to start of protein, rather 260 | # than whatever subsequence we used for prediction 261 | def compute_peptide_offset_relative_to_protein(row): 262 | subsequence_offset = 
variant_string_to_offset_dict[row.variant] 263 | return row.peptide_offset + subsequence_offset 264 | 265 | df["peptide_offset"] = df.apply( 266 | compute_peptide_offset_relative_to_protein, 267 | axis=1) 268 | 269 | if self.ic50_cutoff: 270 | df = df[df.affinity <= self.ic50_cutoff] 271 | logging.info("Kept %d predictions after filtering affinity <= %f" % ( 272 | len(df), self.ic50_cutoff)) 273 | 274 | if self.percentile_cutoff: 275 | df = df[df.percentile_rank <= self.percentile_cutoff] 276 | logging.info("Kept %d predictions after filtering percentile <= %f" % ( 277 | len(df), self.percentile_cutoff)) 278 | 279 | extra_columns = OrderedDict([ 280 | ('gene', []), 281 | ('gene_id', []), 282 | ('transcript_id', []), 283 | ('transcript_name', []), 284 | ('effect', []), 285 | ('effect_type', []), 286 | ('contains_mutant_residues', []), 287 | ('mutation_start_in_peptide', []), 288 | ('mutation_end_in_peptide', []), 289 | ]) 290 | if gene_expression_dict is not None: 291 | extra_columns["gene_expression"] = [] 292 | if transcript_expression_dict is not None: 293 | extra_columns["transcript_expression"] = [] 294 | 295 | for _, row in df.iterrows(): 296 | effect = variant_string_to_effect_dict[row.variant] 297 | mutation_start_in_protein = effect.aa_mutation_start_offset 298 | mutation_end_in_protein = effect.aa_mutation_end_offset 299 | peptide_length = len(row.peptide) 300 | is_mutant = contains_mutant_residues( 301 | peptide_start_in_protein=row.peptide_offset, 302 | peptide_length=peptide_length, 303 | mutation_start_in_protein=mutation_start_in_protein, 304 | mutation_end_in_protein=mutation_end_in_protein) 305 | if is_mutant: 306 | mutation_start_in_peptide, mutation_end_in_peptide = peptide_mutation_interval( 307 | peptide_start_in_protein=row.peptide_offset, 308 | peptide_length=peptide_length, 309 | mutation_start_in_protein=mutation_start_in_protein, 310 | mutation_end_in_protein=mutation_end_in_protein) 311 | else: 312 | mutation_start_in_peptide = 
mutation_end_in_peptide = None 313 | 314 | extra_columns["gene"].append(effect.gene_name) 315 | gene_id = effect.gene_id 316 | extra_columns["gene_id"].append(gene_id) 317 | if gene_expression_dict is not None: 318 | extra_columns["gene_expression"].append( 319 | gene_expression_dict.get(gene_id, 0.0)) 320 | 321 | transcript_id = effect.transcript_id 322 | extra_columns["transcript_id"].append(transcript_id) 323 | extra_columns["transcript_name"].append(effect.transcript_name) 324 | if transcript_expression_dict is not None: 325 | extra_columns["transcript_expression"].append( 326 | transcript_expression_dict.get(transcript_id, 0.0)) 327 | 328 | extra_columns["effect"].append(effect.short_description) 329 | extra_columns["effect_type"].append(effect.__class__.__name__) 330 | 331 | extra_columns["contains_mutant_residues"].append(is_mutant) 332 | extra_columns["mutation_start_in_peptide"].append(mutation_start_in_peptide) 333 | extra_columns["mutation_end_in_peptide"].append(mutation_end_in_peptide) 334 | 335 | for col, values in extra_columns.items(): 336 | df[col] = values 337 | 338 | # TODO: add extra boolean field 339 | # novel = is_mutant | not_in_reference 340 | # Requires keeping a quick lookup structure for all peptides in 341 | # the reference proteome 342 | if self.only_novel_epitopes: 343 | df = df[df.contains_mutant_residues] 344 | 345 | return df 346 | 347 | def predict_from_variants( 348 | self, 349 | variants, 350 | transcript_expression_dict=None, 351 | gene_expression_dict=None): 352 | """ 353 | Predict epitopes from a Variant collection, filtering options, and 354 | optional gene and transcript expression data. 355 | 356 | Parameters 357 | ---------- 358 | variants : varcode.VariantCollection 359 | 360 | transcript_expression_dict : dict 361 | Maps from Ensembl transcript IDs to FPKM expression values. 362 | 363 | gene_expression_dict : dict, optional 364 | Maps from Ensembl gene IDs to FPKM expression values. 
365 | 366 | Returns DataFrame with the following columns: 367 | - variant 368 | - gene 369 | - gene_id 370 | - transcript_id 371 | - transcript_name 372 | - effect 373 | - effect_type 374 | - peptide 375 | - peptide_offset 376 | - peptide_length 377 | - allele 378 | - affinity 379 | - percentile_rank 380 | - prediction_method_name 381 | - contains_mutant_residues 382 | - mutation_start_in_peptide 383 | - mutation_end_in_peptide 384 | 385 | Optionall will also include the following columns if corresponding 386 | expression dictionary inputs are provided: 387 | - gene_expression 388 | - transcript_expression 389 | """ 390 | # pre-filter variants by checking if any of the genes or 391 | # transcripts they overlap have sufficient expression. 392 | # I'm tolerating the redundancy of this code since it's much cheaper 393 | # to filter a variant *before* trying to predict its impact/effect 394 | # on the protein sequence. 395 | variants = apply_variant_expression_filters( 396 | variants, 397 | transcript_expression_dict=transcript_expression_dict, 398 | transcript_expression_threshold=self.min_transcript_expression, 399 | gene_expression_dict=gene_expression_dict, 400 | gene_expression_threshold=self.min_gene_expression) 401 | 402 | effects = variants.effects(raise_on_error=self.raise_on_error) 403 | 404 | return self.predict_from_mutation_effects( 405 | effects=effects, 406 | transcript_expression_dict=transcript_expression_dict, 407 | gene_expression_dict=gene_expression_dict) 408 | -------------------------------------------------------------------------------- /topiary/rna/__init__.py: -------------------------------------------------------------------------------- 1 | from .cufflinks import ( 2 | load_cufflinks_dataframe, 3 | load_cufflinks_dict, 4 | load_cufflinks_fpkm_dict, 5 | ) 6 | from .gtf import load_transcript_fpkm_dict_from_gtf 7 | 8 | __all__ = [ 9 | "load_cufflinks_dataframe", 10 | "load_cufflinks_dict", 11 | "load_cufflinks_fpkm_dict", 12 | 
"load_transcript_fpkm_dict_from_gtf", 13 | ] 14 | -------------------------------------------------------------------------------- /topiary/rna/common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import print_function, division, absolute_import 16 | 17 | import re 18 | 19 | def infer_delimiter(filename, comment_char="#", n_lines=3): 20 | """ 21 | Given a file which contains data separated by one of the following: 22 | - commas 23 | - tabs 24 | - spaces 25 | Return the most likely separator by sniffing the first few lines 26 | of the file's contents. 
27 | """ 28 | lines = [] 29 | with open(filename, "r") as f: 30 | for line in f: 31 | if line.startswith(comment_char): 32 | continue 33 | if len(lines) < n_lines: 34 | lines.append(line) 35 | else: 36 | break 37 | if len(lines) < n_lines: 38 | raise ValueError( 39 | "Not enough lines in %s to infer delimiter" % filename) 40 | candidate_delimiters = ["\t", ",", "\s+"] 41 | for candidate_delimiter in candidate_delimiters: 42 | counts = [len(re.split(candidate_delimiter, line)) for line in lines] 43 | first_line_count = counts[0] 44 | if all(c == first_line_count for c in counts) and first_line_count > 1: 45 | return candidate_delimiter 46 | raise ValueError("Could not determine delimiter for %s" % filename) 47 | 48 | 49 | def check_required_columns(df, filename, required_columns): 50 | """ 51 | Ensure that all required columns are present in the given dataframe, 52 | otherwise raise an exception. 53 | """ 54 | available_columns = set(df.columns) 55 | for column_name in required_columns: 56 | if column_name not in available_columns: 57 | raise ValueError("FPKM tracking file %s missing column '%s'" % ( 58 | filename, 59 | column_name)) 60 | -------------------------------------------------------------------------------- /topiary/rna/cufflinks.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
def parse_locus_column(loci):
    """
    Split a Series of locus strings like "chr1:132-394" (or "1:132-394")
    into three Series: chromosome names (without any "chr" prefix),
    integer start positions, and integer end positions.
    """
    # FIX: all patterns are now raw strings; "\d" in a plain string literal
    # is an invalid escape sequence (SyntaxWarning on Python >= 3.12)
    # capture all characters before ':' (drop 'chr' if present)
    chromosomes = loci.str.extract(r"(?:chr)?([^:]*):.*", expand=False)
    # capture all characters after e.g. 'chr1:', which look like '132-394'
    ranges = loci.str.extract(r"(?:chr)?[^:]*:(.*)", expand=False)
    # capture all numbers before the dash
    starts = ranges.str.extract(r"(\d*)-\d*", expand=False).astype(int)
    # capture all numbers after the dash
    ends = ranges.str.extract(r"\d*-(\d*)", expand=False).astype(int)
    return chromosomes, starts, ends
"genes.tracking_fpkm" 71 | 72 | id_column : str, optional 73 | 74 | fpkm_column : str, optional 75 | 76 | status_column : str, optional 77 | Name of column which indicates the FPKM estimate status. The column 78 | name is typically "FPKM_status". Possible contained within this column 79 | will be OK, FAIL, LOWDATA, HIDATA. 80 | 81 | locus_column : str, optional 82 | 83 | gene_names_column : str, optional 84 | 85 | drop_failed : bool, optional 86 | Drop rows whose FPKM status is "FAIL" (default=True) 87 | 88 | drop_lowdata : bool, optional 89 | Drop rows whose FPKM status is "LOWDATA", meaning that Cufflinks thought 90 | there were too few reads to accurately estimate the FPKM (default=False) 91 | 92 | drop_hidata : bool, optional 93 | Drop rows whose FPKM status is "HIDATA", meaning that too many 94 | fragments aligned to a feature for Cufflinks to process. Dropping 95 | the most expressed genes seems like a stupid idea so: default=False 96 | 97 | replace_hidata_fpkm_value : float, optional 98 | If drop_hidata=False, the HIDATA entries will still have an FPKM=0.0, 99 | this argument lets you replace the FPKM with some known constant. 100 | 101 | drop_nonchromosomal_loci : bool, optional 102 | Drop rows whose location isn't on a canonical chromosome 103 | i.e. 
doesn't start with "chr" (default=False) 104 | 105 | drop_novel : bool, optional 106 | Drop genes or isoforms that aren't found in Ensembl (default = False) 107 | 108 | sep : str, optional 109 | Separator between data fields in the FPKM tracking file 110 | (default is to infer whether the file uses comma or whitespace) 111 | 112 | Returns DataFrame with columns: 113 | id : str 114 | novel : bool 115 | fpkm : float 116 | chr : str 117 | start : int 118 | end : int 119 | gene_names : str list 120 | """ 121 | if sep is None: 122 | sep = infer_delimiter(filename) 123 | 124 | df = pd.read_csv(filename, sep=sep, engine="c") 125 | 126 | required_columns = { 127 | status_column, 128 | locus_column, 129 | id_column, 130 | gene_names_column, 131 | fpkm_column 132 | } 133 | check_required_columns(df, filename, required_columns) 134 | 135 | for flag, status_value in [ 136 | (drop_failed, "FAIL"), 137 | (drop_lowdata, "LOWDATA"), 138 | (drop_hidata, "HIDATA")]: 139 | mask = df[status_column] == status_value 140 | mask_count = mask.sum() 141 | total_count = len(df) 142 | if flag and mask_count > 0: 143 | verb_str = "Dropping" 144 | df = df[~mask] 145 | else: 146 | verb_str = "Keeping" 147 | logging.info( 148 | "%s %d/%d entries from %s with status=%s", 149 | verb_str, 150 | mask_count, 151 | total_count, 152 | filename, 153 | status_value) 154 | 155 | if drop_nonchromosomal_loci: 156 | loci = df[locus_column] 157 | chromosomal_loci = loci.str.startswith("chr") 158 | n_dropped = (~chromosomal_loci).sum() 159 | if n_dropped > 0: 160 | logging.info("Dropping %d/%d non-chromosomal loci from %s" % ( 161 | n_dropped, len(df), filename)) 162 | df = df[chromosomal_loci] 163 | 164 | if replace_hidata_fpkm_value: 165 | hidata_mask = df[status_column] == "HIDATA" 166 | n_hidata = hidata_mask.sum() 167 | logging.info( 168 | "Setting FPKM=%s for %d/%d entries with status=HIDATA", 169 | replace_hidata_fpkm_value, 170 | n_hidata, 171 | len(df)) 172 | df[fpkm_column][hidata_mask] = 
replace_hidata_fpkm_value 173 | 174 | if len(df) == 0: 175 | raise ValueError("Empty FPKM tracking file: %s" % filename) 176 | 177 | ids = df[id_column] 178 | known = ids.str.startswith("ENS") 179 | 180 | if known.sum() == 0: 181 | raise ValueError("No Ensembl IDs found in %s" % filename) 182 | 183 | if drop_novel: 184 | n_dropped = (~known).sum() 185 | if n_dropped > 0: 186 | logging.info( 187 | "Dropping %d/%d novel entries from %s", 188 | n_dropped, 189 | len(df), 190 | filename) 191 | df = df[known] 192 | known = np.ones(len(df), dtype='bool') 193 | 194 | loci = df[locus_column] 195 | chromosomes, starts, ends = parse_locus_column(df[locus_column]) 196 | 197 | # gene names are given either as "-" or a comma separated list 198 | # e.g. "BRAF1,PFAM2" 199 | gene_names_strings = df[gene_names_column].copy() 200 | gene_names_strings[gene_names_strings == "-"] = "" 201 | # split each entry into a list of zero or more strings 202 | gene_names_lists = gene_names_strings.str.split(",") 203 | 204 | return pd.DataFrame({ 205 | "id": df[id_column], 206 | "novel": ~known, 207 | "fpkm": df[fpkm_column], 208 | "chr": chromosomes, 209 | "start": starts, 210 | "end": ends, 211 | "gene_names": gene_names_lists 212 | }) 213 | 214 | 215 | def load_cufflinks_dict(*args, **kwargs): 216 | """ 217 | Returns dictionary mapping feature identifier (either transcript or gene ID) 218 | to a DataFrame row with fields: 219 | id : str 220 | novel : bool 221 | fpkm : float 222 | chr : str 223 | start : int 224 | end : int 225 | gene_names : str list 226 | """ 227 | return { 228 | row.id: row 229 | for (_, row) 230 | in load_cufflinks_dataframe(*args, **kwargs).iterrows() 231 | } 232 | 233 | 234 | def load_cufflinks_fpkm_dict(*args, **kwargs): 235 | """ 236 | Returns dictionary mapping feature identifier (either transcript or gene ID) 237 | to FPKM expression value. 
238 | """ 239 | return { 240 | row.id: row.fpkm 241 | for (_, row) 242 | in load_cufflinks_dataframe(*args, **kwargs).iterrows() 243 | } 244 | -------------------------------------------------------------------------------- /topiary/rna/gtf.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-2018. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import print_function, division, absolute_import 16 | 17 | import logging 18 | 19 | import gtfparse 20 | 21 | 22 | def _get_gtf_column(column_name, gtf_path, df): 23 | """ 24 | Helper function which returns a dictionary column or raises an ValueError 25 | abou the absence of that column in a GTF file. 26 | """ 27 | if column_name in df.columns: 28 | return list(df[column_name]) 29 | 30 | else: 31 | raise ValueError( 32 | "Missing '%s' in columns of %s, available: %s" % ( 33 | column_name, 34 | gtf_path, 35 | list(df.columns))) 36 | 37 | def load_transcript_fpkm_dict_from_gtf( 38 | gtf_path, 39 | transcript_id_column_name="reference_id", 40 | fpkm_column_name="FPKM", 41 | feature_column_name="feature"): 42 | """ 43 | Load a GTF file generated by StringTie which contains transcript-level 44 | quantification of abundance. Returns a dictionary mapping Ensembl 45 | IDs of transcripts to FPKM values. 
46 | """ 47 | df = gtfparse.read_gtf( 48 | gtf_path, 49 | column_converters={fpkm_column_name: float}) 50 | transcript_ids = _get_gtf_column(transcript_id_column_name, gtf_path, df) 51 | fpkm_values = _get_gtf_column(fpkm_column_name, gtf_path, df) 52 | features = _get_gtf_column(feature_column_name, gtf_path, df) 53 | logging.info("Loaded %d rows from %s" % (len(transcript_ids), gtf_path)) 54 | logging.info("Found %s transcript entries" % sum( 55 | feature == "transcript" for feature in features)) 56 | result = { 57 | transcript_id: float(fpkm) 58 | for (transcript_id, fpkm, feature) 59 | in zip(transcript_ids, fpkm_values, features) 60 | if ( 61 | (transcript_id is not None) and 62 | (len(transcript_id) > 0) and 63 | (feature == "transcript") 64 | ) 65 | } 66 | logging.info("Keeping %d transcript rows with reference IDs" % ( 67 | len(result),)) 68 | return result 69 | -------------------------------------------------------------------------------- /topiary/sequence_helpers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
def protein_subsequences_around_mutations(effects, padding_around_mutation):
    """
    For each effect that has a mutant protein sequence, slice out the
    subsequence surrounding the mutation, extended on both sides by
    `padding_around_mutation` residues.

    Returns two dictionaries keyed by effect: one mapping each effect to
    its extracted subsequence, and one mapping each effect to that
    subsequence's start offset within the full mutant protein.
    """
    subsequences = {}
    start_offsets = {}
    for effect in effects:
        full_sequence = effect.mutant_protein_sequence
        # silent or unpredictable effects have no mutant protein sequence;
        # they are omitted from both result dictionaries
        if not full_sequence:
            continue
        mutation_start = effect.aa_mutation_start_offset
        mutation_end = effect.aa_mutation_end_offset
        subsequence_start = max(0, mutation_start - padding_around_mutation)
        # some pseudogenes have stop codons in the reference sequence; trim
        # the subsequence so it never includes the stop character '*'
        stop_index = full_sequence.find("*")
        if stop_index == -1:
            stop_index = len(full_sequence)
        subsequence_end = min(
            stop_index,
            mutation_end + padding_around_mutation)
        subsequences[effect] = full_sequence[subsequence_start:subsequence_end]
        start_offsets[effect] = subsequence_start
    return subsequences, start_offsets
57 | """ 58 | min_required_padding = max(epitope_lengths) - 1 59 | if not given_padding: 60 | return min_required_padding 61 | else: 62 | require_integer(given_padding, "Padding around mutation") 63 | if given_padding < min_required_padding: 64 | raise ValueError( 65 | "Padding around mutation %d cannot be less than %d " 66 | "for epitope lengths %s" % ( 67 | given_padding, 68 | min_required_padding, 69 | epitope_lengths)) 70 | return given_padding 71 | 72 | def contains_mutant_residues( 73 | peptide_start_in_protein, 74 | peptide_length, 75 | mutation_start_in_protein, 76 | mutation_end_in_protein): 77 | peptide_end_in_protein = peptide_start_in_protein + peptide_length - 1 78 | return ( 79 | peptide_start_in_protein < mutation_end_in_protein and 80 | peptide_end_in_protein >= mutation_start_in_protein 81 | ) 82 | 83 | def peptide_mutation_interval( 84 | peptide_start_in_protein, 85 | peptide_length, 86 | mutation_start_in_protein, 87 | mutation_end_in_protein): 88 | """ 89 | Half-open interval of mutated residues in the peptide, determined 90 | from the mutation interval in the original protein sequence. 91 | 92 | Parameters 93 | ---------- 94 | peptide_start_in_protein : int 95 | Position of the first peptide residue within the protein 96 | (starting from 0) 97 | 98 | peptide_length : int 99 | 100 | mutation_start_in_protein : int 101 | Position of the first mutated residue starting from 0. In the case of a 102 | deletion, the position where the first residue had been. 103 | 104 | mutation_end_in_protein : int 105 | Position of the last mutated residue in the mutant protein. In the case 106 | of a deletion, this is equal to the mutation_start_in_protein. 
def peptide_mutation_interval(
        peptide_start_in_protein,
        peptide_length,
        mutation_start_in_protein,
        mutation_end_in_protein):
    """
    Return the half-open interval [start, end) of mutated residue offsets
    within a peptide, given the mutation's interval within the original
    protein sequence.

    Parameters
    ----------
    peptide_start_in_protein : int
        Position of the first peptide residue within the protein
        (starting from 0)

    peptide_length : int

    mutation_start_in_protein : int
        Position of the first mutated residue starting from 0. In the case of a
        deletion, the position where the first residue had been.

    mutation_end_in_protein : int
        Position of the last mutated residue in the mutant protein. In the case
        of a deletion, this is equal to the mutation_start_in_protein.

    Raises ValueError if the peptide lies entirely before or entirely
    after the mutated interval.
    """
    if peptide_start_in_protein > mutation_end_in_protein:
        raise ValueError("Peptide starts after mutation")
    if peptide_start_in_protein + peptide_length < mutation_start_in_protein:
        raise ValueError("Peptide ends before mutation")

    def clamp_to_peptide(protein_position):
        # translate a protein coordinate into peptide coordinates,
        # clamped to the peptide's [0, peptide_length] range so the
        # result is a valid half-open boundary
        relative_offset = protein_position - peptide_start_in_protein
        return min(peptide_length, max(0, relative_offset))

    return (
        clamp_to_peptide(mutation_start_in_protein),
        clamp_to_peptide(mutation_end_in_protein))