├── .gitignore ├── .travis.yml ├── LICENSE.txt ├── MANIFEST.in ├── README.md ├── deploy.sh ├── develop.sh ├── lint.sh ├── pepdata ├── __init__.py ├── amino_acid.py ├── amino_acid_alphabet.py ├── amino_acid_properties.py ├── blosum.py ├── chou_fasman.py ├── common.py ├── iedb │ ├── __init__.py │ ├── alleles.py │ ├── columns.py │ ├── common.py │ ├── memoize.py │ ├── mhc.py │ └── tcell.py ├── matrices │ ├── BLOSUM30 │ ├── BLOSUM50 │ ├── BLOSUM62 │ ├── __init__.py │ ├── amino_acid_properties.txt │ ├── helix_vs_coil.txt │ ├── helix_vs_strand.txt │ ├── pmbec.mat │ └── strand_vs_coil.txt ├── peptide_vectorizer.py ├── pmbec.py ├── reduced_alphabet.py ├── residue_contact_energies.py ├── static_data.py └── version.py ├── pylintrc ├── requirements.txt ├── setup.py ├── test.sh └── tests ├── test_amino_acids.py ├── test_blosum.py ├── test_iedb_alleles.py ├── test_iedb_mhc.py ├── test_iedb_tcell.py ├── test_ngram.py └── test_pmbec.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | __pycache__ 21 | 22 | # Installer logs 23 | pip-log.txt 24 | 25 | # Unit test / coverage reports 26 | .coverage 27 | .tox 28 | nosetests.xml 29 | 30 | # Translations 31 | *.mo 32 | 33 | # Mr Developer 34 | .mr.developer.cfg 35 | .project 36 | .pydevproject 37 | 38 | *.csv 39 | *.fa 40 | *.faa 41 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false # Use container-based infrastructure 2 | language: python 3 | python: 4 | - "2.7" 5 | - "3.6" 6 | git: 7 | # don't need the default depth of 50 8 | # but don't want to use a depth of 1 since that affects 9 | # whether jobs run when you have multiple commits 
queued 10 | # https://github.com/travis-ci/travis-ci/issues/4575 11 | depth: 10 12 | cache: 13 | pip: true 14 | before_install: 15 | # download different versions of mini-conda for py2 vs. py3 16 | - | 17 | if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then 18 | wget https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh; 19 | else 20 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; 21 | fi 22 | - bash miniconda.sh -b -p $HOME/miniconda 23 | - export PATH="$HOME/miniconda/bin:$PATH" 24 | # reset the shell's lookup table for program name to path mappings 25 | - hash -r 26 | - conda config --set always_yes yes --set changeps1 no 27 | - conda update -q conda 28 | # Useful for debugging any issues with conda 29 | - conda info -a 30 | - python --version 31 | install: 32 | - > 33 | conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION 34 | numpy scipy nose pandas pandoc 35 | - source activate test-environment 36 | - pip install pypandoc pylint 37 | - pip install -r requirements.txt 38 | - pip install . 39 | - pip install coveralls 40 | script: 41 | - ./lint.sh 42 | - nosetests test --with-coverage --cover-package=pepdata 43 | after_success: 44 | coveralls 45 | deploy: 46 | provider: pypi 47 | user: openvax 48 | distributions: sdist 49 | password: 50 | secure: "adaJvYZ6lDNqhf4jwrI3tsNVymL54yfKl8ymQPUYaL2yK75MaTurfoqqHEt31FXiZNUvwOz+o0i9GHoGRhoVlHKNoe/bN6f69qkNZNW/YC4b061/kPOdzpdpwFzrxXE9Zr6KPsbnGNGcJXzga9rd7XTh8Y34VDylylb5bhYmTC0=" 51 | on: 52 | branch: master 53 | condition: $TRAVIS_PYTHON_VERSION = "2.7" 54 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include pepdata/matrices * 2 | recursive-include pepdata/data *csv 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | Build Status 3 | 4 | 5 | Coverage Status 6 | 7 | 8 | PyPI 9 | 10 | 11 | PepData 12 | ======= 13 | 14 | Formerly a repository for diverse peptide datasets, now only contains the [Immune Epitope Database](http://www.iedb.org) and a variety of amino acid property matrices. This package 15 | will probably be eventually split and the IEDB portions placed into something 16 | named `pyiedb`. 17 | 18 | **Amino Acid Properties** 19 | 20 | The `amino_acid` module contains a variety of physical/chemical properties for both single amino residues and interactions between pairs of residues. 
# Release script: lint, test, build and upload the package, then tag the
# release in git.
#
# Every step is chained with `&&`, so the first failing command stops the
# whole release; nothing is uploaded or tagged unless linting, tests and
# the build all succeed.
#
# The tag name is whatever `python3 pepdata/version.py` writes to stdout
# (presumably the package version string -- confirm against version.py).
./lint.sh && \
./test.sh && \
python3 -m pip install --upgrade build && \
python3 -m pip install --upgrade twine && \
rm -rf dist && \
python3 -m build && \
git --version && \
python3 -m twine upload dist/* && \
git tag "$(python3 pepdata/version.py)" && \
git push --tags
#!/bin/bash
# Run pylint in errors-only mode over the package and its test suite.
set -o errexit
# Without pipefail a failing `find` (e.g. a missing directory) is masked by
# the exit status of `xargs`, and the script would report success anyway.
set -o pipefail

# The test suite lives in `tests/`; the previous `test` path did not exist,
# so the tests were silently skipped. NUL-delimited names keep paths with
# spaces intact.
find pepdata tests -name '*.py' -print0 \
    | xargs -0 pylint \
    --errors-only \
    --disable=print-statement

echo 'Passes pylint check'
class AminoAcid(object):
    """
    An amino acid (or wildcard token) described by its names and letter code.

    Parameters
    ----------
    full_name : str
        Full name, e.g. "Alanine".

    short_name : str
        Short (three letter) name, e.g. "Ala".

    letter : str
        One letter code, e.g. "A". Equality and hashing use only this field.

    contains : collection of str, optional
        Letters this token stands for. When omitted (or empty) it defaults
        to a list holding just `letter`.
    """
    def __init__(
            self, full_name, short_name, letter, contains=None):
        self.letter = letter
        self.full_name = full_name
        self.short_name = short_name
        if not contains:
            contains = [letter]
        self.contains = contains

    def __str__(self):
        # Arguments must follow the order of the placeholders in the format
        # string: full_name, short_name, letter, contains.
        return (
            ("AminoAcid(full_name='%s', short_name='%s', letter='%s', "
             "contains=%s)") % (
                self.full_name, self.short_name, self.letter, self.contains))

    def __repr__(self):
        return str(self)

    def __eq__(self, other):
        return other.__class__ is AminoAcid and self.letter == other.letter

    def __ne__(self, other):
        # Python 2 does not derive != from ==, so define it explicitly.
        return not self.__eq__(other)

    def __hash__(self):
        # Defining __eq__ alone makes the class unhashable on Python 3;
        # hash on the same field that equality compares.
        return hash(self.letter)
12 | 13 | 14 | """ 15 | Quantify amino acids by their physical/chemical properties 16 | """ 17 | 18 | from __future__ import print_function, division, absolute_import 19 | 20 | import numpy as np 21 | 22 | from .amino_acid import AminoAcid 23 | 24 | canonical_amino_acids = [ 25 | AminoAcid("Alanine", "Ala", "A"), 26 | AminoAcid("Arginine", "Arg", "R"), 27 | AminoAcid("Asparagine","Asn", "N"), 28 | AminoAcid("Aspartic Acid", "Asp", "D"), 29 | AminoAcid("Cysteine", "Cys", "C"), 30 | AminoAcid("Glutamic Acid", "Glu", "E"), 31 | AminoAcid("Glutamine", "Gln", "Q"), 32 | AminoAcid("Glycine", "Gly", "G"), 33 | AminoAcid("Histidine", "His", "H"), 34 | AminoAcid("Isoleucine", "Ile", "I"), 35 | AminoAcid("Leucine", "Leu", "L"), 36 | AminoAcid("Lysine", "Lys", "K"), 37 | AminoAcid("Methionine", "Met", "M"), 38 | AminoAcid("Phenylalanine", "Phe", "F"), 39 | AminoAcid("Proline", "Pro", "P"), 40 | AminoAcid("Serine", "Ser", "S"), 41 | AminoAcid("Threonine", "Thr", "T"), 42 | AminoAcid("Tryptophan", "Trp", "W"), 43 | AminoAcid("Tyrosine", "Tyr", "Y"), 44 | AminoAcid("Valine", "Val", "V") 45 | ] 46 | 47 | canonical_amino_acid_letters = [aa.letter for aa in canonical_amino_acids] 48 | 49 | ### 50 | # Post-translation modifications commonly detected by mass-spec 51 | ### 52 | 53 | # TODO: figure out three letter codes for modified AAs 54 | 55 | modified_amino_acids = [ 56 | AminoAcid("Phospho-Serine", "Sep", "s"), 57 | AminoAcid("Phospho-Threonine", "???", "t"), 58 | AminoAcid("Phospho-Tyrosine", "???", "y"), 59 | AminoAcid("Cystine", "???", "c"), 60 | AminoAcid("Methionine sulfoxide", "???", "m"), 61 | AminoAcid("Pyroglutamate", "???", "q"), 62 | AminoAcid("Pyroglutamic acid", "???", "n"), 63 | ] 64 | 65 | ### 66 | # Amino acid tokens which represent multiple canonical amino acids 67 | ### 68 | wildcard_amino_acids = [ 69 | AminoAcid("Unknown", "Xaa", "X", contains=set(canonical_amino_acid_letters)), 70 | AminoAcid("Asparagine-or-Aspartic-Acid", "Asx", "B", contains={"D", "N"}), 71 
| AminoAcid("Glutamine-or-Glutamic-Acid", "Glx", "Z", contains={"E", "Q"}), 72 | AminoAcid("Leucine-or-Isoleucine", "Xle", "J", contains={"I", "L"}) 73 | ] 74 | 75 | ### 76 | # Canonical amino acids + wilcard tokens 77 | ### 78 | 79 | canonical_amino_acids_with_unknown = canonical_amino_acids + wildcard_amino_acids 80 | 81 | 82 | ### 83 | # Rare amino acids which aren't considered part of the core 20 "canonical" 84 | ### 85 | 86 | rare_amino_acids = [ 87 | AminoAcid("Selenocysteine", "Sec", "U"), 88 | AminoAcid("Pyrrolysine", "Pyl", "O"), 89 | ] 90 | 91 | ### 92 | # Extended amino acids + wildcard tokens 93 | ### 94 | 95 | extended_amino_acids = canonical_amino_acids + rare_amino_acids + wildcard_amino_acids 96 | extended_amino_acid_letters = [ 97 | aa.letter for aa in extended_amino_acids 98 | ] 99 | extended_amino_acids_with_unknown_names = [ 100 | aa.full_name for aa in extended_amino_acids 101 | ] 102 | 103 | 104 | amino_acid_letter_indices = { 105 | c: i for (i, c) in 106 | enumerate(extended_amino_acid_letters) 107 | } 108 | 109 | 110 | amino_acid_letter_pairs = [ 111 | "%s%s" % (x, y) 112 | for y in extended_amino_acids 113 | for x in extended_amino_acids 114 | ] 115 | 116 | 117 | amino_acid_name_indices = { 118 | aa_name: i for (i, aa_name) 119 | in enumerate(extended_amino_acids_with_unknown_names) 120 | } 121 | 122 | amino_acid_pair_positions = { 123 | pair: i for (i, pair) in enumerate(amino_acid_letter_pairs) 124 | } 125 | 126 | def index_to_full_name(idx): 127 | return extended_amino_acids[idx].full_name 128 | 129 | def index_to_short_name(idx): 130 | return extended_amino_acids[idx].short_name 131 | 132 | def index_to_letter(idx): 133 | return extended_amino_acids[idx] 134 | 135 | def letter_to_index(x): 136 | """ 137 | Convert from an amino acid's letter code to its position index 138 | """ 139 | assert x in amino_acid_letter_indices, "Unknown amino acid: %s" % x 140 | return amino_acid_letter_indices[x] 141 | 142 | def peptide_to_indices(xs): 143 | 
return [amino_acid_letter_indices[x] for x in xs] 144 | 145 | def letter_to_short_name(x): 146 | return index_to_short_name(letter_to_index(x)) 147 | 148 | def peptide_to_short_amino_acid_names(xs): 149 | return [amino_acid_letter_indices[x] for x in xs] 150 | 151 | def dict_to_amino_acid_matrix(d, alphabet=canonical_amino_acids): 152 | n_aa = len(d) 153 | result_matrix = np.zeros((n_aa, n_aa), dtype="float32") 154 | for i, aa_row in enumerate(alphabet): 155 | d_row = d[aa_row.letter] 156 | for j, aa_col in enumerate(alphabet): 157 | value = d_row[aa_col.letter] 158 | result_matrix[i, j] = value 159 | return result_matrix 160 | 161 | -------------------------------------------------------------------------------- /pepdata/amino_acid_properties.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 
"""
Quantify amino acids by their physical/chemical properties
"""

from __future__ import print_function, division, absolute_import

from .amino_acid_alphabet import letter_to_index


def aa_dict_to_positional_list(aa_property_dict):
    """
    Convert a dictionary mapping amino acid letters to values into a list
    of 20 values ordered by each letter's position index.

    Raises AssertionError if a letter maps to an index outside 0..19 or if
    any of the 20 positions is left unfilled.
    """
    n_positions = 20
    value_list = [None] * n_positions
    for letter, value in aa_property_dict.items():
        idx = letter_to_index(letter)
        assert idx >= 0, \
            "Negative index %d for amino acid %s" % (idx, letter)
        assert idx < n_positions, \
            "Index %d for amino acid %s exceeds %d positions" % (
                idx, letter, n_positions)
        value_list[idx] = value
    assert all(elt is not None for elt in value_list), \
        "Missing amino acids in:\n%s" % aa_property_dict.keys()
    return value_list


def parse_property_table(table_string):
    """
    Parse a table whose rows look like "<value> <letter> <name>" into a
    dictionary mapping each letter to its value as a float.

    Blank lines are skipped and fields may be separated by any run of
    whitespace (spaces or tabs). Fields after the second are ignored.

    Raises AssertionError if a row has fewer than two fields or a letter
    appears more than once.
    """
    value_dict = {}
    for line in table_string.splitlines():
        line = line.strip()
        if not line:
            continue
        # split() with no argument collapses runs of whitespace and also
        # accepts tabs, unlike split(" ")
        fields = line.split()
        assert len(fields) >= 2, \
            "Expected a value and a letter but got: " + line
        value, letter = fields[:2]
        assert letter not in value_dict, "Repeated amino acid " + line
        value_dict[letter] = float(value)
    return value_dict


"""
Amino acids property tables copied from CRASP website
"""

hydropathy = parse_property_table("""
1.80000 A ALA
-4.5000 R ARG
-3.5000 N ASN
-3.5000 D ASP
2.50000 C CYS
-3.5000 Q GLN
-3.5000 E GLU
-0.4000 G GLY
-3.2000 H HIS
4.50000 I ILE
3.80000 L LEU
-3.9000 K LYS
1.90000 M MET
2.80000 F PHE
-1.6000 P PRO
-0.8000 S SER
-0.7000 T THR
-0.9000 W TRP
-1.3000 Y TYR
4.20000 V VAL
""")

volume = parse_property_table("""
91.5000 A ALA
202.0000 R ARG
135.2000 N ASN
124.5000 D ASP
118.0000 C CYS
161.1000 Q GLN
155.1000 E GLU
66.40000 G GLY
167.3000 H HIS
168.8000 I ILE
167.9000 L LEU
171.3000 K LYS
170.8000 M MET
203.4000 F PHE
129.3000 P PRO
99.10000 S SER
122.1000 T THR
237.6000 W TRP
203.6000 Y TYR
141.7000 V VAL
""")

polarity = parse_property_table("""
0.0000 A ALA
52.000 R ARG
3.3800 N ASN
40.700 D ASP
1.4800 C CYS
3.5300 Q GLN
49.910 E GLU
0.0000 G GLY
51.600 H HIS
0.1500 I ILE
0.4500 L LEU
49.500 K LYS
1.4300 M MET
0.3500 F PHE
1.5800 P PRO
1.6700 S SER
1.6600 T THR
2.1000 W TRP
1.6100 Y TYR
0.1300 V VAL
""")

pK_side_chain = parse_property_table("""
0.0000 A ALA
12.480 R ARG
0.0000 N ASN
3.6500 D ASP
8.1800 C CYS
0.0000 Q GLN
4.2500 E GLU
0.0000 G GLY
6.0000 H HIS
0.0000 I ILE
0.0000 L LEU
10.530 K LYS
0.0000 M MET
0.0000 F PHE
0.0000 P PRO
0.0000 S SER
0.0000 T THR
0.0000 W TRP
10.700 Y TYR
0.0000 V VAL
""")

prct_exposed_residues = parse_property_table("""
15.0000 A ALA
67.0000 R ARG
49.0000 N ASN
50.0000 D ASP
5.00000 C CYS
56.0000 Q GLN
55.0000 E GLU
10.0000 G GLY
34.0000 H HIS
13.0000 I ILE
16.0000 L LEU
85.0000 K LYS
20.0000 M MET
10.0000 F PHE
45.0000 P PRO
32.0000 S SER
32.0000 T THR
17.0000 W TRP
41.0000 Y TYR
14.0000 V VAL
""")

hydrophilicity = parse_property_table("""
-0.5000 A ALA
3.00000 R ARG
0.20000 N ASN
3.00000 D ASP
-1.0000 C CYS
0.20000 Q GLN
3.00000 E GLU
0.00000 G GLY
-0.5000 H HIS
-1.8000 I ILE
-1.8000 L LEU
3.00000 K LYS
-1.3000 M MET
-2.5000 F PHE
0.00000 P PRO
0.30000 S SER
-0.4000 T THR
-3.4000 W TRP
-2.3000 Y TYR
-1.5000 V VAL
""")

accessible_surface_area = parse_property_table("""
27.8000 A ALA
94.7000 R ARG
60.1000 N ASN
60.6000 D ASP
15.5000 C CYS
68.7000 Q GLN
68.2000 E GLU
24.5000 G GLY
50.7000 H HIS
22.8000 I ILE
27.6000 L LEU
103.000 K LYS
33.5000 M MET
25.5000 F PHE
51.5000 P PRO
42.0000 S SER
45.0000 T THR
34.7000 W TRP
55.2000 Y TYR
23.7000 V VAL
""")

local_flexibility = parse_property_table("""
705.42000 A ALA
1484.2800 R ARG
513.46010 N ASN
34.960000 D ASP
2412.5601 C CYS
1087.8300 Q GLN
1158.6600 E GLU
33.180000 G GLY
1637.1300 H HIS
5979.3701 I ILE
4985.7300 L LEU
699.69000 K LYS
4491.6602 M MET
5203.8599 F PHE
431.96000 P PRO
174.76000 S SER
601.88000 T THR
6374.0698 W TRP
4291.1001 Y TYR
4474.4199 V VAL
""")

accessible_surface_area_folded = parse_property_table("""
31.5000 A ALA
93.8000 R ARG
62.2000 N ASN
60.9000 D ASP
13.9000 C CYS
74.0000 Q GLN
72.3000 E GLU
25.2000 G GLY
46.7000 H HIS
23.0000 I ILE
29.0000 L LEU
110.300 K LYS
30.5000 M MET
28.7000 F PHE
53.7000 P PRO
44.2000 S SER
46.0000 T THR
41.7000 W TRP
59.1000 Y TYR
23.5000 V VAL
""")

refractivity = parse_property_table("""
4.34000 A ALA
26.6600 R ARG
13.2800 N ASN
12.0000 D ASP
35.7700 C CYS
17.5600 Q GLN
17.2600 E GLU
0.00000 G GLY
21.8100 H HIS
19.0600 I ILE
18.7800 L LEU
21.2900 K LYS
21.6400 M MET
29.4000 F PHE
10.9300 P PRO
6.35000 S SER
11.0100 T THR
42.5300 W TRP
31.5300 Y TYR
13.9200 V VAL
""")


# NOTE(review): a few entries (e.g. ALA 70.079, PRO 97.177) look slightly
# off from commonly tabulated residue masses -- verify against the original
# data source before relying on them. Values are left exactly as found.
mass = parse_property_table("""
70.079 A ALA
156.188 R ARG
114.104 N ASN
115.089 D ASP
103.144 C CYS
128.131 Q GLN
129.116 E GLU
57.052 G GLY
137.142 H HIS
113.160 I ILE
113.160 L LEU
128.174 K LYS
131.198 M MET
147.177 F PHE
97.177 P PRO
87.078 S SER
101.105 T THR
186.213 W TRP
163.170 Y TYR
99.133 V VAL
""")

###
# Values copied from:
# "Solvent accessibility of AA in known protein structures"
# http://prowl.rockefeller.edu/aainfo/access.htm
###
"""
Solvent accessibility of AA in known protein structures

Figure 1.

S 0.70 0.20 0.10
T 0.71 0.16 0.13
A 0.48 0.35 0.17
G 0.51 0.36 0.13
P 0.78 0.13 0.09
C 0.32 0.54 0.14
D 0.81 0.09 0.10
E 0.93 0.04 0.03
Q 0.81 0.10 0.09
N 0.82 0.10 0.08
L 0.41 0.49 0.10
I 0.39 0.47 0.14
V 0.40 0.50 0.10
M 0.44 0.20 0.36
F 0.42 0.42 0.16
Y 0.67 0.20 0.13
W 0.49 0.44 0.07
K 0.93 0.02 0.05
R 0.84 0.05 0.11
H 0.66 0.19 0.15
"""

# First numeric column of the table quoted above, keyed by letter
solvent_exposed_area = dict(
    S=0.70,
    T=0.71,
    A=0.48,
    G=0.51,
    P=0.78,
    C=0.32,
    D=0.81,
    E=0.93,
    Q=0.81,
    N=0.82,
    L=0.41,
    I=0.39,
    V=0.40,
    M=0.44,
    F=0.42,
    Y=0.67,
    W=0.49,
    K=0.93,
    R=0.84,
    H=0.66,
)
def parse_blosum_table(table, coeff_type=int, key_type='row'):
    """
    Parse a table of pairwise amino acid coefficients (e.g. BLOSUM50).

    The first non-comment, non-blank line is the header of column labels;
    every following line is a row label followed by one coefficient per
    column.

    Parameters
    ----------
    table : str
        Full text of the matrix. Lines starting with "#" and blank lines
        are ignored; carriage returns are stripped.

    coeff_type : callable
        Converter applied to every coefficient string (default int).

    key_type : {'row', 'pair', 'pair_string'}
        Shape of the result:
        'row' -> coeffs[x][y], 'pair' -> coeffs[(x, y)],
        'pair_string' -> coeffs[x + y].

    Returns
    -------
    dict mapping amino acid labels (in the chosen key shape) to coefficients

    Raises
    ------
    ValueError
        If key_type is unknown, the table has no content, the header has
        fewer than 20 labels, or a row has fewer than 20 coefficients or
        more coefficients than the header has labels.
    """
    # Validate up front instead of asserting inside the innermost loop
    # (asserts disappear under `python -O`).
    if key_type not in ('row', 'pair', 'pair_string'):
        raise ValueError("Unknown key type: %s" % key_type)

    lines = []
    for line in table.split("\n"):
        # drop comments
        if line.startswith("#"):
            continue
        # drop CR endline characters
        line = line.replace("\r", "")
        # skip empty (or whitespace-only) lines
        if line.strip():
            lines.append(line)

    if not lines:
        raise ValueError("Expected a header line but the table is empty")

    labels = lines[0].split()
    if len(labels) < 20:
        raise ValueError(
            "Expected 20+ amino acids but first line '%s' has %d fields" % (
                lines[0],
                len(labels)))

    coeffs = {}
    for line in lines[1:]:
        fields = line.split()
        if len(fields) < 21:
            raise ValueError(
                "Expected AA and 20+ coefficients but '%s' has %d fields" % (
                    line, len(fields)))
        x = fields[0]
        coeff_strings = fields[1:]
        if len(coeff_strings) > len(labels):
            raise ValueError(
                "Row '%s' has %d coefficients but header has %d labels" % (
                    x, len(coeff_strings), len(labels)))
        for y, coeff_str in zip(labels, coeff_strings):
            coeff = coeff_type(coeff_str)
            if key_type == 'pair':
                coeffs[(x, y)] = coeff
            elif key_type == 'pair_string':
                coeffs[x + y] = coeff
            else:
                coeffs.setdefault(x, {})[y] = coeff
    return coeffs
73 | blosum62_dict = parse_blosum_table(f.read()) 74 | blosum62_matrix = dict_to_amino_acid_matrix(blosum62_dict) 75 | 76 | -------------------------------------------------------------------------------- /pepdata/chou_fasman.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | from __future__ import print_function, division, absolute_import 14 | 15 | from .amino_acid_alphabet import amino_acid_name_indices 16 | 17 | # Chou-Fasman of structural properties from 18 | # http://prowl.rockefeller.edu/aainfo/chou.htm 19 | chou_fasman_table = """ 20 | Alanine 142 83 66 0.06 0.076 0.035 0.058 21 | Arginine 98 93 95 0.070 0.106 0.099 0.085 22 | Aspartic Acid 101 54 146 0.147 0.110 0.179 0.081 23 | Asparagine 67 89 156 0.161 0.083 0.191 0.091 24 | Cysteine 70 119 119 0.149 0.050 0.117 0.128 25 | Glutamic Acid 151 037 74 0.056 0.060 0.077 0.064 26 | Glutamine 111 110 98 0.074 0.098 0.037 0.098 27 | Glycine 57 75 156 0.102 0.085 0.190 0.152 28 | Histidine 100 87 95 0.140 0.047 0.093 0.054 29 | Isoleucine 108 160 47 0.043 0.034 0.013 0.056 30 | Leucine 121 130 59 0.061 0.025 0.036 0.070 31 | Lysine 114 74 101 0.055 0.115 0.072 0.095 32 | Methionine 145 105 60 0.068 0.082 0.014 0.055 33 | Phenylalanine 113 138 60 0.059 0.041 0.065 0.065 34 | Proline 57 55 152 0.102 0.301 0.034 0.068 35 | Serine 77 75 143 0.120 0.139 0.125 0.106 36 | Threonine 83 119 96 0.086 
def parse_chou_fasman(table):
    """
    Parse the Chou-Fasman table into three per-amino-acid score dicts.

    Each non-blank line holds an amino acid name (one word, or two words
    ending in "Acid") followed by numeric columns. Only the first three
    numeric columns are read, as integers: alpha helix, beta sheet and
    turn. Any further columns are ignored.

    Parameters
    ----------
    table : str
        Whitespace-separated table, one amino acid per line.

    Returns
    -------
    (alpha_helix_score_dict, beta_sheet_score_dict, turn_score_dict)
        Each dict is keyed by the value that `amino_acid_name_indices`
        maps the amino acid name to.

    Raises
    ------
    ValueError
        If a line has too few columns, names an unknown amino acid, or the
        table does not cover exactly 20 distinct amino acids.
    """
    alpha_helix_score_dict = {}
    beta_sheet_score_dict = {}
    turn_score_dict = {}

    for line in table.split("\n"):
        # split() with no argument also copes with tabs and repeated blanks
        fields = line.split()
        if not fields:
            continue

        # "Aspartic Acid" / "Glutamic Acid" occupy two whitespace-separated
        # tokens; fold them back into a single name.
        if len(fields) > 1 and fields[1] == 'Acid':
            name = fields[0] + " " + fields[1]
            fields = fields[1:]
        else:
            name = fields[0]

        if len(fields) < 4:
            raise ValueError(
                "Expected name and 3+ scores but got '%s'" % line.strip())
        if name not in amino_acid_name_indices:
            raise ValueError("Invalid amino acid name %s" % name)

        letter = amino_acid_name_indices[name]
        alpha_helix_score_dict[letter] = int(fields[1])
        beta_sheet_score_dict[letter] = int(fields[2])
        turn_score_dict[letter] = int(fields[3])

    # All three dicts share the same keys, so one size check covers them.
    if len(alpha_helix_score_dict) != 20:
        raise ValueError(
            "Expected scores for 20 amino acids but got %d" % (
                len(alpha_helix_score_dict),))
    return alpha_helix_score_dict, beta_sheet_score_dict, turn_score_dict
def transform_peptide(peptide, property_dict):
    """
    Look up every residue of a single peptide in `property_dict`.

    Returns a numpy array with one entry per residue, in sequence order.
    A residue missing from `property_dict` raises KeyError.
    """
    values = []
    for residue in peptide:
        values.append(property_dict[residue])
    return np.array(values)


def transform_peptides(peptides, property_dict):
    """
    Look up every residue of every peptide in `property_dict`.

    Returns a numpy array built from one row of property values per
    peptide, in the order the peptides were given.
    """
    rows = []
    for sequence in peptides:
        row = []
        for residue in sequence:
            row.append(property_dict[residue])
        rows.append(row)
    return np.array(rows)
ALLELE_XML_FILENAME = "MhcAlleleNames.xml"
ALLELE_XML_URL = "http://www.iedb.org/doc/MhcAlleleNameList.zip"
ALLELE_XML_DECOMPRESS = True


def local_path(force_download=False):
    """Downloads allele database from IEDB, returns local path to XML file."""
    return cache.fetch(
        filename=ALLELE_XML_FILENAME,
        url=ALLELE_XML_URL,
        decompress=ALLELE_XML_DECOMPRESS,
        force=force_download)


def delete():
    """Deletes local XML file"""
    path = cache.local_path(
        filename=ALLELE_XML_FILENAME,
        url=ALLELE_XML_URL,
        decompress=ALLELE_XML_DECOMPRESS)
    os.remove(path)


Allele = namedtuple("Allele", [
    "name",
    "mhc_class",
    "locus",
    "organism",
    "synonyms"
])


def _child_text(element, tag):
    """Return the text of the first child called `tag`, or None if absent."""
    child = element.find(tag)
    # Compare with None explicitly: an Element without children is falsy.
    return None if child is None else child.text


@memoize
def load_alleles():
    """Parses the IEDB MhcAlleleName XML file and returns a list of Allele
    namedtuple objects containing information about each allele's HLA
    class and source organism.

    Entries lacking either a "DisplayedRestriction" or a "Class" element
    are skipped. "Locus" and organism are None when missing. `synonyms` is
    a set of the comma-separated, stripped, non-empty names found in the
    entry's "Synonyms" elements.
    """
    # `import xml` alone does not load the xml.etree.ElementTree submodule,
    # so import it explicitly where it is used.
    from xml.etree import ElementTree

    result = []
    tree = ElementTree.parse(local_path())
    for allele in tree.iterfind("MhcAlleleName"):
        name_element = allele.find("DisplayedRestriction")
        mhc_class_element = allele.find("Class")
        # need at least a name and an HLA class
        if name_element is None or mhc_class_element is None:
            continue

        synonyms = set()
        for synonym_element in allele.iterfind("Synonyms"):
            # An empty <Synonyms/> element has text None.
            for synonym in (synonym_element.text or "").split(","):
                synonym = synonym.strip()
                if synonym:
                    synonyms.add(synonym)

        # NOTE(review): the previous code only looked for "Organsim", which
        # looks like a misspelling of "Organism"; accept either spelling
        # until the tag name is confirmed against the IEDB XML.
        organism = _child_text(allele, "Organism")
        if organism is None:
            organism = _child_text(allele, "Organsim")

        result.append(Allele(
            name=name_element.text,
            mhc_class=mhc_class_element.text,
            locus=_child_text(allele, "Locus"),
            organism=organism,
            synonyms=synonyms))
    return result


@memoize
def load_alleles_dict():
    """Create a dictionary mapping each unique allele name to a namedtuple
    containing information about that allele's class, locus, species, &c.

    Both the displayed name and every synonym of an allele are used as
    keys; when several alleles share a key, the one loaded last wins.
    """
    result = {}
    for allele in load_alleles():
        for name in {allele.name}.union(allele.synonyms):
            result[name] = allele
    return result
def find(
        df: pd.DataFrame,
        group_candidates: list[str],
        column_candidates: list[str]) -> pd.Series | None:
    """
    Try to find a column that contains a combination of the two candidate lists.

    Motivation: format for MHC ligand CSV used to have:
        epitope_key = ("Epitope", "Description")
        mhc_allele_key = ("MHC", "Allele Name")
        mhc_class_key = ("MHC", "MHC allele class")
        mhc_assay_key = ("Assay", "Method/Technique")

    Now it's:
        epitope_key = ("Epitope", "Name")
        mhc_allele_key = ("MHC Restriction", "Name")
        mhc_class_key = ("MHC Restriction", "Class")
        mhc_assay_key = ("Assay", "Method")

    ...who knows what it will be next!

    Parameters
    ----------
    df
        DataFrame whose column labels are (group, column) pairs, as
        produced by reading a CSV with a two-row header.

    group_candidates
        Substrings to look for (case-insensitively) in the group name.

    column_candidates
        Substrings to look for (case-insensitively) in the column name.

    Returns
    -------
    The column whose (group, column) label matches some pair of candidates
    and has the shortest combined label length, or None if nothing
    matches. Ties go to the match found first, trying candidates in the
    order given.

    Raises
    ------
    ValueError
        If a column label of `df` is not a 2-tuple.
    """
    groups = [s.lower() for s in group_candidates]
    names = [s.lower() for s in column_candidates]
    if not groups or not names:
        return None

    # Validate and lower-case every column label once rather than once per
    # candidate combination.
    labelled = []
    for pair in df.columns:
        if not (isinstance(pair, tuple) and len(pair) == 2):
            raise ValueError(
                "Expected (group, column) column labels but got %r" % (pair,))
        group, col = pair
        labelled.append((pair, str(group).lower(), str(col).lower()))

    possible_matches = []
    for a in groups:
        for b in names:
            for pair, group, col in labelled:
                if a in group and b in col:
                    possible_matches.append(pair)

    if not possible_matches:
        return None

    # get the shortest match: the label with the least text beyond the
    # candidate substrings is the most specific one. min() keeps the first
    # of several equally short labels.
    best = min(
        possible_matches,
        key=lambda pair: len(str(pair[0])) + len(str(pair[1])))
    return df[best]


MHC_GROUP_CANDIDATES: list[str] = ["MHC", "MHC Restriction"]
EPITOPE_GROUP_CANDIDATES: list[str] = ["Epitope"]
ASSAY_GROUP_CANDIDATES: list[str] = ["Assay"]
HOST_GROUP_CANDIDATES: list[str] = ["Host"]


def get_mhc_allele(
        df: pd.DataFrame,
        group_candidates: list[str] = MHC_GROUP_CANDIDATES,
        column_candidates: list[str] = ["Allele", "Allele name", "Name"]) -> pd.Series | None:
    """Find the MHC allele name column; None if no column matches."""
    return find(df, group_candidates, column_candidates)


def get_mhc_class(
        df: pd.DataFrame,
        group_candidates: list[str] = MHC_GROUP_CANDIDATES,
        column_candidates: list[str] = ["Class", "MHC allele class"]) -> pd.Series | None:
    """Find the MHC class column; None if no column matches."""
    return find(df, group_candidates, column_candidates)


def get_mhc_assay(
        df: pd.DataFrame,
        group_candidates: list[str] = ASSAY_GROUP_CANDIDATES,
        column_candidates: list[str] = ["method"]) -> pd.Series | None:
    """Find the assay method column; None if no column matches."""
    return find(df, group_candidates, column_candidates)


def get_epitope_name(
        df: pd.DataFrame,
        group_candidates: list[str] = EPITOPE_GROUP_CANDIDATES,
        column_candidates: list[str] = ["name"]) -> pd.Series | None:
    """Find the epitope name column; None if no column matches."""
    return find(df, group_candidates, column_candidates)


def get_epitope_type(
        df: pd.DataFrame,
        group_candidates: list[str] = EPITOPE_GROUP_CANDIDATES,
        column_candidates: list[str] = ["Object Type", "Type"]) -> pd.Series | None:
    """Find the epitope object type column; None if no column matches."""
    return find(df, group_candidates, column_candidates)


def get_epitope_modifications(
        df: pd.DataFrame,
        group_candidates: list[str] = EPITOPE_GROUP_CANDIDATES,
        column_candidates: list[str] = ["Modified Residue(s)"]) -> pd.Series | None:
    """Find the epitope modified residues column; None if no column matches."""
    return find(df, group_candidates, column_candidates)


def get_epitope_IRI(
        df: pd.DataFrame,
        group_candidates: list[str] = EPITOPE_GROUP_CANDIDATES,
        column_candidates: list[str] = ["Epitope IRI"]) -> pd.Series | None:
    """Find the epitope IRI column; None if no column matches."""
    return find(df, group_candidates, column_candidates)


def get_epitope_source_molecule(
        df: pd.DataFrame,
        group_candidates: list[str] = EPITOPE_GROUP_CANDIDATES,
        column_candidates: list[str] = ["Source Molecule"]) -> pd.Series | None:
    """Find the epitope source molecule column; None if no column matches."""
    return find(df, group_candidates, column_candidates)


def get_epitope_source_molecule_iri(
        df: pd.DataFrame,
        group_candidates: list[str] = EPITOPE_GROUP_CANDIDATES,
        column_candidates: list[str] = ["Source Molecule IRI"]) -> pd.Series | None:
    """Find the epitope source molecule IRI column; None if no column matches."""
    return find(df, group_candidates, column_candidates)


def get_epitope_source_organism(
        df: pd.DataFrame,
        group_candidates: list[str] = EPITOPE_GROUP_CANDIDATES,
        column_candidates: list[str] = ["Source Organism"]) -> pd.Series | None:
    """Find the epitope source organism column; None if no column matches."""
    return find(df, group_candidates, column_candidates)


def get_epitope_source_organism_iri(
        df: pd.DataFrame,
        group_candidates: list[str] = EPITOPE_GROUP_CANDIDATES,
        column_candidates: list[str] = ["Source Organism IRI"]) -> pd.Series | None:
    """Find the epitope source organism IRI column; None if no column matches."""
    return find(df, group_candidates, column_candidates)


def get_assay_method(
        df: pd.DataFrame,
        group_candidates: list[str] = ASSAY_GROUP_CANDIDATES,
        column_candidates: list[str] = ["Method", "Method/Technique"]) -> pd.Series | None:
    """Find the assay method column; None if no column matches."""
    return find(df, group_candidates, column_candidates)


def get_assay_response_measured(
        df: pd.DataFrame,
        group_candidates: list[str] = ASSAY_GROUP_CANDIDATES,
        column_candidates: list[str] = ["Response measured"]) -> pd.Series | None:
    """Find the assay response measured column; None if no column matches."""
    return find(df, group_candidates, column_candidates)


def get_assay_units(
        df: pd.DataFrame,
        group_candidates: list[str] = ASSAY_GROUP_CANDIDATES,
        column_candidates: list[str] = ["Units"]) -> pd.Series | None:
    """Find the assay units column; None if no column matches."""
    return find(df, group_candidates, column_candidates)


def get_assay_qualitative(
        df: pd.DataFrame,
        group_candidates: list[str] = ASSAY_GROUP_CANDIDATES,
        column_candidates: list[str] = ["Qualitative Measurement"]) -> pd.Series | None:
    """Find the qualitative measurement column; None if no column matches."""
    return find(df, group_candidates, column_candidates)


def get_assay_num_tested(
        df: pd.DataFrame,
        group_candidates: list[str] = ASSAY_GROUP_CANDIDATES,
        column_candidates: list[str] = ["Number of Subjects Tested"]) -> pd.Series | None:
    """Find the number-of-subjects-tested column; None if no column matches."""
    return find(df, group_candidates, column_candidates)


def get_assay_num_responded(
        df: pd.DataFrame,
        group_candidates: list[str] = ASSAY_GROUP_CANDIDATES,
        column_candidates: list[str] = ["Number of Subjects Responded"]) -> pd.Series | None:
    """Find the number-of-subjects-responded column; None if no column matches."""
    return find(df, group_candidates, column_candidates)


def get_host_name(
        df: pd.DataFrame,
        group_candidates: list[str] = HOST_GROUP_CANDIDATES,
        column_candidates: list[str] = ["Name"]) -> pd.Series | None:
    """Find the host name column; None if no column matches."""
    return find(df, group_candidates, column_candidates)
def _hashable_or_repr(value):
    """Return `value` if it can be hashed, otherwise its repr() string."""
    try:
        hash(value)
    except TypeError:
        # Unhashable objects (lists, dicts, ...) are keyed by their repr.
        return repr(value)
    return value


def _prepare_memoization_key(args, kwargs):
    """
    Make a tuple of arguments which can be used as a key
    for a memoized function's lookup_table. If some object can't be hashed
    then use its __repr__ instead.

    The key is a pair (positional, keyword). Keeping the two apart stops a
    positional argument such as ("x", 1) from colliding with the keyword
    argument x=1, and sorting the keyword names makes f(a=1, b=2) and
    f(b=2, a=1) share one cache entry.
    """
    positional = tuple(_hashable_or_repr(arg) for arg in args)
    keyword = tuple(
        (name, _hashable_or_repr(kwargs[name])) for name in sorted(kwargs))
    return positional, keyword


def memoize(fn):
    """
    Decorator which caches the results of `fn`, keyed by its arguments.

    The cache lives as long as the wrapped function and is never evicted.
    Repeated calls return the very same result object, so callers should
    not mutate a returned value in place.
    """
    lookup_table = {}

    @wraps(fn)
    def wrapped_fn(*args, **kwargs):
        key = _prepare_memoization_key(args, kwargs)
        if key not in lookup_table:
            lookup_table[key] = fn(*args, **kwargs)
        return lookup_table[key]

    return wrapped_fn
MHC_URL = "https://www.iedb.org/downloader.php?file_name=doc/mhc_ligand_full_single_file.zip"
MHC_LOCAL_FILENAME = "mhc_ligand_full.csv"
MHC_DECOMPRESS = True


def download(force=False):
    """Fetch the IEDB MHC ligand CSV into the local cache, return its path."""
    return cache.fetch(
        filename=MHC_LOCAL_FILENAME,
        url=MHC_URL,
        decompress=MHC_DECOMPRESS,
        force=force)


def local_path(auto_download=True):
    """
    Return the cached path of the MHC ligand CSV.

    If the file is not there it is downloaded when `auto_download` is
    true; otherwise ValueError is raised.
    """
    path = cache.local_path(
        filename=MHC_LOCAL_FILENAME,
        url=MHC_URL,
        decompress=MHC_DECOMPRESS)
    if not os.path.exists(path):
        if auto_download:
            return download()
        raise ValueError(
            ("MHC data file %s does not exist locally,"
             " call pepdata.iedb.mhc.download() to get a copy from IEDB") % path)
    return path


def delete():
    """Remove the cached MHC ligand CSV, if there is one."""
    # Ask the cache for the path directly: going through local_path()
    # would download the file just to delete it again.
    path = cache.local_path(
        filename=MHC_LOCAL_FILENAME,
        url=MHC_URL,
        decompress=MHC_DECOMPRESS)
    if os.path.exists(path):
        os.remove(path)


@memoize
def load_dataframe(
        mhc_class: int | str | None = None,  # 1, 2, or None for neither
        hla: str | None = None,
        exclude_hla: str | None = None,
        human_only: bool = False,
        peptide_length: int | None = None,
        assay_method: str | None = None,
        only_standard_amino_acids: bool = True,
        warn_bad_lines: bool = True,
        nrows: int | None = None):
    """
    Load IEDB MHC data without aggregating multiple entries for the same epitope

    Results are memoized per argument combination, so the returned
    DataFrame is shared between callers using the same arguments.

    Parameters
    ----------
    mhc_class
        Restrict to MHC Class I or Class II (or None for neither).
        Accepts 1 / 2 or "I" / "II".

    hla
        Restrict results to specific HLA type used in assay (regex pattern)

    exclude_hla
        Regex pattern to exclude certain HLA types

    human_only
        Restrict to human samples (default False)

    peptide_length
        Restrict epitopes to amino acid strings of given length

    assay_method
        Limit to assay methods which contain the given string

    only_standard_amino_acids
        Drop sequences which use non-standard amino acids, anything outside
        the core 20, such as X or U (default = True)

    warn_bad_lines
        The full MHC ligand dataset seems to contain several dozen lines with
        too many fields. This currently results in a lot of warning messages
        from Pandas, which you can turn off with this option (default = True)

    nrows
        Don't load the full IEDB dataset but instead read only the first nrows

    Raises
    ------
    ValueError
        If `mhc_class` is not one of the accepted values or
        `peptide_length` is negative.
    """
    # Validate arguments before the expensive download / parse.
    class_label = None
    if mhc_class is not None:
        class_label = {1: "I", 2: "II", "I": "I", "II": "II"}.get(mhc_class)
        if class_label is None:
            raise ValueError("Invalid MHC class: %s" % (mhc_class,))
    if peptide_length and peptide_length < 0:
        raise ValueError("Invalid peptide length: %s" % (peptide_length,))

    df = pd.read_csv(
        local_path(),
        header=[0, 1],
        skipinitialspace=True,
        nrows=nrows,
        low_memory=False,
        on_bad_lines='warn' if warn_bad_lines else 'skip',
        encoding="latin-1")

    # Sometimes the IEDB seems to put in an extra comma in the
    # header line, which creates an unnamed column of NaNs.
    # To deal with this, drop any columns which are all NaN
    df = df.dropna(axis=1, how="all")

    n = len(df)

    mhc_group_key = "MHC Restriction"
    epitope_group_key = "Epitope"
    epitope_column_key = (epitope_group_key, "Name")
    mhc_allele_column_key = (mhc_group_key, "Name")

    epitopes = df[epitope_column_key].str.upper()
    df[epitope_column_key] = epitopes

    null_epitope_seq = epitopes.isnull()
    n_null = null_epitope_seq.sum()
    if n_null > 0:
        logging.info("Dropping %d null sequences", n_null)

    mask = ~null_epitope_seq

    if only_standard_amino_acids:
        # if have rare or unknown amino acids, drop the sequence
        bad_epitope_seq = epitopes.str.contains(bad_amino_acids, na=False)
        n_bad = bad_epitope_seq.sum()
        if n_bad > 0:
            logging.info("Dropping %d bad sequences", n_bad)

        mask &= ~bad_epitope_seq

    if human_only:
        # na=False: a missing allele name must not count as human
        # (NaN cast to bool would be True).
        mask &= df[mhc_allele_column_key].str.startswith("HLA", na=False)

    if class_label is not None:
        mask &= df[mhc_group_key]["Class"] == class_label

    if hla:
        mask &= df[mhc_allele_column_key].str.contains(hla, na=False)

    if exclude_hla:
        mask &= ~(df[mhc_allele_column_key].str.contains(exclude_hla, na=False))

    if assay_method:
        # na=False keeps the mask boolean when the method is missing
        mask &= df["Assay"]["Method"].str.contains(assay_method, na=False)

    if peptide_length:
        mask &= epitopes.str.len() == peptide_length

    df = df[mask].copy()

    logging.info("Returning %d / %d entries after filtering", len(df), n)

    return df
TCELL_COMPACT_FILENAME = "tcell_full.csv"
TCELL_COMPACT_URL = "http://www.iedb.org/downloader.php?file_name=doc/tcell_full_v3.zip"
TCELL_COMPACT_DECOMPRESS = True


def download(force=False):
    """Fetch the IEDB T-cell CSV into the local cache, return its path."""
    return cache.fetch(
        filename=TCELL_COMPACT_FILENAME,
        url=TCELL_COMPACT_URL,
        decompress=TCELL_COMPACT_DECOMPRESS,
        force=force)


def local_path(auto_download=True):
    """
    Return the cached path of the T-cell CSV.

    If the file is not there it is downloaded when `auto_download` is
    true; otherwise ValueError is raised.
    """
    path = cache.local_path(
        filename=TCELL_COMPACT_FILENAME,
        url=TCELL_COMPACT_URL,
        decompress=TCELL_COMPACT_DECOMPRESS)
    if not os.path.exists(path):
        if auto_download:
            return download()
        raise ValueError(
            ("Local file %s does not exist, call"
             " pepdata.iedb.tcell.download()") % path)
    return path


def delete():
    """Remove the cached T-cell CSV, if there is one."""
    # Ask the cache for the path directly: going through local_path()
    # would download the file just to delete it again.
    path = cache.local_path(
        filename=TCELL_COMPACT_FILENAME,
        url=TCELL_COMPACT_URL,
        decompress=TCELL_COMPACT_DECOMPRESS)
    if os.path.exists(path):
        os.remove(path)


def _require_column(series, description):
    """Return `series`, raising ValueError if the column lookup found nothing."""
    if series is None:
        raise ValueError(
            "Could not find %s column in IEDB T-cell data" % description)
    return series


@memoize
def load_dataframe(
        mhc_class: int | str | None = None,  # 1, 2, or None for neither
        mhc_pattern: str | None = None,
        exclude_mhc: str | None = None,
        human_only: bool = False,
        peptide_length: int | None = None,
        assay_method: str | None = None,
        only_standard_amino_acids: bool = True,
        reduced_alphabet: dict | None = None,  # 20 letter AA strings -> simpler alphabet
        nrows: int | None = None):
    """
    Load IEDB T-cell data without aggregating multiple entries for same epitope

    Results are memoized per argument combination, so the returned
    DataFrame is shared between callers using the same arguments.

    Parameters
    ----------
    mhc_class: {None, 1, 2}
        Restrict to MHC Class I or Class II (or None for neither).
        The strings "I" and "II" are accepted as well.

    mhc_pattern: regex pattern, optional
        Restrict results to specific MHC used in assay

    exclude_mhc: regex pattern, optional
        Exclude certain MHC allele patterns

    human_only: bool
        Restrict to human samples (default False)

    peptide_length: int, optional
        Restrict epitopes to amino acid strings of given length

    assay_method: string, optional
        Only collect results with assay methods containing the given string

    only_standard_amino_acids : bool, optional
        Drop sequences which use non-standard amino acids, anything outside
        the core 20, such as X or U (default = True)

    reduced_alphabet: dictionary, optional
        Remap amino acid letters to some other alphabet. Letters missing
        from the dictionary are left unchanged.

    nrows: int, optional
        Don't load the full IEDB dataset but instead read only the first nrows

    Raises
    ------
    ValueError
        If `mhc_class` is invalid, `peptide_length` is negative, or a
        column needed by one of the requested filters cannot be found.
    """
    # Normalise the requested class into its own local so the parameter is
    # never confused with a column of the data.
    class_label = None
    if mhc_class is not None:
        # since MHC classes can be specified as either strings ("I") or
        # integers, standardize them to be strings
        if mhc_class == 1:
            class_label = "I"
        elif mhc_class == 2:
            class_label = "II"
        else:
            class_label = mhc_class
        if class_label not in {"I", "II"}:
            raise ValueError("Invalid MHC class: %s" % mhc_class)
    if peptide_length and peptide_length < 0:
        raise ValueError("Invalid peptide length: %s" % (peptide_length,))

    df = pd.read_csv(
        local_path(),
        header=[0, 1],
        skipinitialspace=True,
        nrows=nrows,
        low_memory=False,
        on_bad_lines='warn',
        encoding="latin-1")

    # Sometimes the IEDB seems to put in an extra comma in the
    # header line, which creates an unnamed column of NaNs.
    # To deal with this, drop any columns which are all NaN
    df = df.dropna(axis=1, how="all")

    n = len(df)

    epitopes = _require_column(get_epitope_name(df), "epitope name")

    null_epitope_seq = epitopes.isnull()
    n_null = null_epitope_seq.sum()
    if n_null > 0:
        logging.info("Dropping %d null sequences", n_null)

    mask = ~null_epitope_seq

    if only_standard_amino_acids:
        # if have rare or unknown amino acids, drop the sequence
        bad_epitope_seq = epitopes.str.contains(bad_amino_acids, na=False)
        n_bad = bad_epitope_seq.sum()
        if n_bad > 0:
            logging.info("Dropping %d bad sequences", n_bad)

        mask &= ~bad_epitope_seq

    if human_only:
        host = _require_column(get_host_name(df), "host name")
        mask &= host.str.startswith('Homo sapiens', na=False)

    if class_label is not None or mhc_pattern or exclude_mhc:
        mhc_alleles = _require_column(get_mhc_allele(df), "MHC allele")

        if class_label is not None:
            # Decide the class of each row by looking its allele name up
            # in the IEDB allele table; unknown alleles are dropped.
            allele_dict = load_alleles_dict()
            mhc_class_mask = []
            for allele_name in mhc_alleles:
                allele_object = allele_dict.get(allele_name)
                mhc_class_mask.append(
                    allele_object is not None and
                    allele_object.mhc_class == class_label)
            mask &= np.array(mhc_class_mask, dtype=bool)

        if mhc_pattern:
            mask &= mhc_alleles.str.contains(mhc_pattern, na=False)

        if exclude_mhc:
            mask &= ~(mhc_alleles.str.contains(exclude_mhc, na=False))

    if assay_method:
        assay_methods = _require_column(get_assay_method(df), "assay method")
        mask &= assay_methods.str.contains(assay_method, na=False)

    if peptide_length:
        mask &= epitopes.str.len() == peptide_length

    df = df[mask].copy()

    if reduced_alphabet:
        # NOTE(review): assumes get_epitope_name() returns the column
        # selected from df, so that .name is that column's key -- confirm.
        epitope_key = epitopes.name
        df[epitope_key] = df[epitope_key].map(
            lambda seq: "".join(reduced_alphabet.get(aa, aa) for aa in seq))

    logging.info("Returning %d / %d entries after filtering", len(df), n)
    return df
-5 -3 -2 -2 -3 -7 -3 -1 0 -3 -5 -3 20 -2 5 -1 -7 22 | X 0 -1 -2 -1 -1 -1 -1 -1 0 0 0 0 0 -1 0 -1 0 0 0 -2 -1 -1 0 -7 23 | Y -4 -3 -6 -1 -2 3 -3 0 -1 -1 3 -1 -4 -2 -1 0 -2 -1 1 5 -1 9 -2 -7 24 | Z 0 0 0 0 5 -4 -2 0 -3 1 -1 -1 -1 0 4 0 -1 -1 -3 -1 0 -2 4 -7 25 | * -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 1 26 | -------------------------------------------------------------------------------- /pepdata/matrices/BLOSUM50: -------------------------------------------------------------------------------- 1 | A R N D C Q E G H I L K M F P S T W Y V 2 | A 5 -2 -1 -2 -1 -1 -1 0 -2 -1 -2 -1 -1 -3 -1 1 0 -3 -2 0 3 | R -2 7 -1 -2 -4 1 0 -3 0 -4 -3 3 -2 -3 -3 -1 -1 -3 -1 -3 4 | N -1 -1 7 2 -2 0 0 0 1 -3 -4 0 -2 -4 -2 1 0 -4 -2 -3 5 | D -2 -2 2 8 -4 0 2 -1 -1 -4 -4 -1 -4 -5 -1 0 -1 -5 -3 -4 6 | C -1 -4 -2 -4 13 -3 -3 -3 -3 -2 -2 -3 -2 -2 -4 -1 -1 -5 -3 -1 7 | Q -1 1 0 0 -3 7 2 -2 1 -3 -2 2 0 -4 -1 0 -1 -1 -1 -3 8 | E -1 0 0 2 -3 2 6 -3 0 -4 -3 1 -2 -3 -1 -1 -1 -3 -2 -3 9 | G 0 -3 0 -1 -3 -2 -3 8 -2 -4 -4 -2 -3 -4 -2 0 -2 -3 -3 -4 10 | H -2 0 1 -1 -3 1 0 -2 10 -4 -3 0 -1 -1 -2 -1 -2 -3 2 -4 11 | I -1 -4 -3 -4 -2 -3 -4 -4 -4 5 2 -3 2 0 -3 -3 -1 -3 -1 4 12 | L -2 -3 -4 -4 -2 -2 -3 -4 -3 2 5 -3 3 1 -4 -3 -1 -2 -1 1 13 | K -1 3 0 -1 -3 2 1 -2 0 -3 -3 6 -2 -4 -1 0 -1 -3 -2 -3 14 | M -1 -2 -2 -4 -2 0 -2 -3 -1 2 3 -2 7 0 -3 -2 -1 -1 0 1 15 | F -3 -3 -4 -5 -2 -4 -3 -4 -1 0 1 -4 0 8 -4 -3 -2 1 4 -1 16 | P -1 -3 -2 -1 -4 -1 -1 -2 -2 -3 -4 -1 -3 -4 10 -1 -1 -4 -3 -3 17 | S 1 -1 1 0 -1 0 -1 0 -1 -3 -3 0 -2 -3 -1 5 2 -4 -2 -2 18 | T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 2 5 -3 -2 0 19 | W -3 -3 -4 -5 -5 -1 -3 -3 -3 -3 -2 -3 -1 1 -4 -4 -4 15 2 -3 20 | Y -2 -1 -2 -3 -3 -1 -2 -3 2 -1 -1 -2 0 4 -3 -2 -2 2 8 -1 21 | V 0 -3 -3 -4 -1 -3 -3 -4 -4 4 1 -3 1 -1 -3 -2 0 -3 -1 5 -------------------------------------------------------------------------------- /pepdata/matrices/BLOSUM62: -------------------------------------------------------------------------------- 1 | # 
Entries for the BLOSUM62 matrix at a scale of ln(2)/2.0. 2 | A R N D C Q E G H I L K M F P S T W Y V B J Z X * 3 | A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 -1 -1 -4 4 | R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 -2 0 -1 -4 5 | N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 4 -3 0 -1 -4 6 | D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 -3 1 -1 -4 7 | C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -1 -3 -1 -4 8 | Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 -2 4 -1 -4 9 | E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 -3 4 -1 -4 10 | G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -4 -2 -1 -4 11 | H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 -3 0 -1 -4 12 | I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 3 -3 -1 -4 13 | L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 3 -3 -1 -4 14 | K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 -3 1 -1 -4 15 | M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 2 -1 -1 -4 16 | F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 0 -3 -1 -4 17 | P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -3 -1 -1 -4 18 | S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 -2 0 -1 -4 19 | T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 -1 -1 -4 20 | W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -2 -2 -1 -4 21 | Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -1 -2 -1 -4 22 | V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 2 -2 -1 -4 23 | B -2 -1 4 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 -3 0 -1 -4 24 | J -1 -2 -3 -3 -1 -2 -3 -4 -3 3 3 -3 2 0 -3 -2 -1 -2 -1 2 -3 3 -3 -1 -4 25 | Z -1 0 0 1 -3 4 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -2 -2 -2 0 -3 4 -1 -4 26 | X -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -4 27 | * -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1 28 
| -------------------------------------------------------------------------------- /pepdata/matrices/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/pepdata/cae7b52f668e3111848c9263474d7aa2bacf8274/pepdata/matrices/__init__.py -------------------------------------------------------------------------------- /pepdata/matrices/amino_acid_properties.txt: -------------------------------------------------------------------------------- 1 | http://www.sgi.sscc.ru/mgs/programs/crasp/texts/AA_Properties.htm 2 | 3 | 1. Free energy of transfer to surface (Bull & Breese 1974) 4 | Arch. Bioch. Biophys. 1974 161 665-670 5 | 610.0000 A ALA 6 | 690.0000 R ARG 7 | 890.0000 N ASN 8 | 610.0000 D ASP 9 | 360.0000 C CYS 10 | 970.0000 Q GLN 11 | 510.0000 E GLU 12 | 810.0000 G GLY 13 | 690.0000 H HIS 14 | -1450.0000 I ILE 15 | -1650.0000 L LEU 16 | 460.0000 K LYS 17 | -660.0000 M MET 18 | -1520.0000 F PHE 19 | -170.0000 P PRO 20 | 420.0000 S SER 21 | 290.0000 T THR 22 | -1200.0000 W TRP 23 | -1430.0000 Y TYR 24 | -750.0000 V VAL 25 | 26 | 2. Surrounding hydrophobicity in alpha-helix (Ponnuswamy et al. 1980) 27 | Biochim.Biophys.Acta(1980) 623 301-316 28 | 13.6500 A ALA 29 | 11.2800 R ARG 30 | 12.2400 N ASN 31 | 10.9800 D ASP 32 | 14.4900 C CYS 33 | 11.3000 Q GLN 34 | 12.5500 E GLU 35 | 15.3600 G GLY 36 | 11.5900 H HIS 37 | 14.6300 I ILE 38 | 14.0100 L LEU 39 | 11.9600 K LYS 40 | 13.4000 M MET 41 | 14.0800 F PHE 42 | 11.5100 P PRO 43 | 11.2600 S SER 44 | 13.0000 T THR 45 | 12.0600 W TRP 46 | 12.6400 Y TYR 47 | 12.8800 V VAL 48 | 49 | 3. Surrounding hydrophobicity in beta-sheet (Ponnuswamy et al. 
1980) 50 | Biochim.Biophys.Acta(1980) 623 301-316 51 | 14.6000 A ALA 52 | 13.2400 R ARG 53 | 11.7900 N ASN 54 | 13.7800 D ASP 55 | 15.9000 C CYS 56 | 12.0200 Q GLN 57 | 13.5900 E GLU 58 | 14.1800 G GLY 59 | 15.3500 H HIS 60 | 14.1000 I ILE 61 | 16.4900 L LEU 62 | 13.2800 K LYS 63 | 16.2300 M MET 64 | 14.1800 F PHE 65 | 14.1000 P PRO 66 | 13.3600 S SER 67 | 14.5000 T THR 68 | 13.9000 W TRP 69 | 14.7600 Y TYR 70 | 16.3000 V VAL 71 | 72 | 4. Surrounding hydrophobicity in beta-turn (Ponnuswamy et al. 1980) 73 | Biochim.Biophys.Acta(1980) 623 301-316 74 | 10.6700 A ALA 75 | 11.0500 R ARG 76 | 10.8500 N ASN 77 | 10.2100 D ASP 78 | 14.1500 C CYS 79 | 11.7100 Q GLN 80 | 11.7100 E GLU 81 | 10.9500 G GLY 82 | 12.0700 H HIS 83 | 12.9500 I ILE 84 | 13.0700 L LEU 85 | 9.9300 K LYS 86 | 15.0000 M MET 87 | 13.2700 F PHE 88 | 10.6200 P PRO 89 | 11.1800 S SER 90 | 10.5300 T THR 91 | 11.4100 W TRP 92 | 11.5200 Y TYR 93 | 13.8600 V VAL 94 | 95 | 5. Accessibility reduction ratio (Ponnuswamy et al. 1980) 96 | Biochim.Biophys.Acta(1980) 623 301-316 97 | 3.7000 A ALA 98 | 2.5300 R ARG 99 | 2.1200 N ASN 100 | 2.6000 D ASP 101 | 3.0300 C CYS 102 | 2.7000 Q GLN 103 | 3.3000 E GLU 104 | 3.1300 G GLY 105 | 3.5700 H HIS 106 | 7.6900 I ILE 107 | 5.8800 L LEU 108 | 1.7900 K LYS 109 | 5.2100 M MET 110 | 6.6000 F PHE 111 | 2.1200 P PRO 112 | 2.4300 S SER 113 | 2.6000 T THR 114 | 6.2500 W TRP 115 | 3.0300 Y TYR 116 | 7.1400 V VAL 117 | 118 | 6. Average number of surrounding residues (Ponnuswamy et al. 1980) 119 | Biochim.Biophys.Acta(1980) 623 301-316 120 | 6.0500 A ALA 121 | 5.7000 R ARG 122 | 5.0400 N ASN 123 | 4.9500 D ASP 124 | 7.8600 C CYS 125 | 5.4500 Q GLN 126 | 5.1000 E GLU 127 | 6.1600 G GLY 128 | 5.8000 H HIS 129 | 7.5100 I ILE 130 | 7.3700 L LEU 131 | 4.8800 K LYS 132 | 6.3900 M MET 133 | 6.6200 F PHE 134 | 5.6500 P PRO 135 | 5.5300 S SER 136 | 5.8100 T THR 137 | 6.9800 W TRP 138 | 6.7300 Y TYR 139 | 7.6200 V VAL 140 | 141 | 7. Volume (Chothia 1984) 142 | Annu. Rev. Biochem. 
(1984) 53 537-572 143 | 91.5000 A ALA 144 | 202.0000 R ARG 145 | 135.2000 N ASN 146 | 124.5000 D ASP 147 | 118.0000 C CYS 148 | 161.1000 Q GLN 149 | 155.1000 E GLU 150 | 66.4000 G GLY 151 | 167.3000 H HIS 152 | 168.8000 I ILE 153 | 167.9000 L LEU 154 | 171.3000 K LYS 155 | 170.8000 M MET 156 | 203.4000 F PHE 157 | 129.3000 P PRO 158 | 99.1000 S SER 159 | 122.1000 T THR 160 | 237.6000 W TRP 161 | 203.6000 Y TYR 162 | 141.7000 V VAL 163 | 164 | 8. Local flexibility (Ragone et al. 1989) 165 | Prot. Engineering(1989) 2 497-504 166 | 705.4200 A ALA 167 | 1484.2800 R ARG 168 | 513.4601 N ASN 169 | 34.9600 D ASP 170 | 2412.5601 C CYS 171 | 1087.8300 Q GLN 172 | 1158.6600 E GLU 173 | 33.1800 G GLY 174 | 1637.1300 H HIS 175 | 5979.3701 I ILE 176 | 4985.7300 L LEU 177 | 699.6900 K LYS 178 | 4491.6602 M MET 179 | 5203.8599 F PHE 180 | 431.9600 P PRO 181 | 174.7600 S SER 182 | 601.8800 T THR 183 | 6374.0698 W TRP 184 | 4291.1001 Y TYR 185 | 4474.4199 V VAL 186 | 187 | 9. Flexibility (Bhaskaran and Ponnuswamy 1988) 188 | Int. J. Peptide Protein Res.(1988) 32 241-255 189 | 0.3570 A ALA 190 | 0.5290 R ARG 191 | 0.4630 N ASN 192 | 0.5110 D ASP 193 | 0.3460 C CYS 194 | 0.4930 Q GLN 195 | 0.4970 E GLU 196 | 0.5440 G GLY 197 | 0.3230 H HIS 198 | 0.4620 I ILE 199 | 0.3650 L LEU 200 | 0.4660 K LYS 201 | 0.2950 M MET 202 | 0.3140 F PHE 203 | 0.5090 P PRO 204 | 0.5070 S SER 205 | 0.4440 T THR 206 | 0.3050 W TRP 207 | 0.4200 Y TYR 208 | 0.3860 V VAL 209 | 210 | 10. Flexibility for no rigid neighbours (Karplus & Schulz 1985) 211 | Naturwissenschaften (1985) 72 212-213 212 | 1.0410 A ALA 213 | 1.0380 R ARG 214 | 1.1170 N ASN 215 | 1.0330 D ASP 216 | 0.9600 C CYS 217 | 1.1650 Q GLN 218 | 1.0940 E GLU 219 | 1.1420 G GLY 220 | 0.9820 H HIS 221 | 1.0020 I ILE 222 | 0.9670 L LEU 223 | 1.0930 K LYS 224 | 0.9470 M MET 225 | 0.9300 F PHE 226 | 1.0550 P PRO 227 | 1.1690 S SER 228 | 1.0730 T THR 229 | 0.9250 W TRP 230 | 0.9610 Y TYR 231 | 0.9820 V VAL 232 | 233 | 11. 
Flexibility for one rigid neighbour (Karplus & Schulz 1985) 234 | Naturwissenschaften (1985) 72 212-213 235 | 0.9460 A ALA 236 | 1.0280 R ARG 237 | 1.0060 N ASN 238 | 1.0890 D ASP 239 | 0.8780 C CYS 240 | 1.0280 Q GLN 241 | 1.0360 E GLU 242 | 1.0420 G GLY 243 | 0.9520 H HIS 244 | 0.8920 I ILE 245 | 0.9610 L LEU 246 | 1.0820 K LYS 247 | 0.8620 M MET 248 | 0.9120 F PHE 249 | 1.0850 P PRO 250 | 1.0480 S SER 251 | 1.0510 T THR 252 | 0.9170 W TRP 253 | 0.9300 Y TYR 254 | 0.9270 V VAL 255 | 256 | 12. Average accessibility surface area (Janin et al. 1978) 257 | J. Mol. Biol. (1978) 125 357-386 258 | 27.8000 A ALA 259 | 94.7000 R ARG 260 | 60.1000 N ASN 261 | 60.6000 D ASP 262 | 15.5000 C CYS 263 | 68.7000 Q GLN 264 | 68.2000 E GLU 265 | 24.5000 G GLY 266 | 50.7000 H HIS 267 | 22.8000 I ILE 268 | 27.6000 L LEU 269 | 103.000 K LYS 270 | 33.5000 M MET 271 | 25.5000 F PHE 272 | 51.5000 P PRO 273 | 42.0000 S SER 274 | 45.0000 T THR 275 | 34.7000 W TRP 276 | 55.2000 Y TYR 277 | 23.7000 V VAL 278 | 279 | 13. Flexibility for two rigid neighbours (Karplus & Schulz 1985) 280 | Naturwissenschaften (1985) 72 212-213 281 | 0.8920 A ALA 282 | 0.9010 R ARG 283 | 0.9300 N ASN 284 | 0.9320 D ASP 285 | 0.9250 C CYS 286 | 0.8850 Q GLN 287 | 0.9330 E GLU 288 | 0.9230 G GLY 289 | 0.8940 H HIS 290 | 0.8720 I ILE 291 | 0.9210 L LEU 292 | 1.0570 K LYS 293 | 0.8040 M MET 294 | 0.9140 F PHE 295 | 0.9320 P PRO 296 | 0.9230 S SER 297 | 0.9340 T THR 298 | 0.8030 W TRP 299 | 0.8370 Y TYR 300 | 0.9130 V VAL 301 | 302 | 14. Hydrophobicity (Eisenberg et al. 1984) 303 | J. Mol. Biol. (1984) 179 125-142 304 | 0.2500 A ALA 305 | -1.7600 R ARG 306 | -0.6400 N ASN 307 | -0.7200 D ASP 308 | 0.0400 C CYS 309 | -0.6900 Q GLN 310 | -0.6200 E GLU 311 | 0.1600 G GLY 312 | -0.4000 H HIS 313 | 0.7300 I ILE 314 | 0.5300 L LEU 315 | -1.1000 K LYS 316 | 0.2600 M MET 317 | 0.6100 F PHE 318 | -0.0700 P PRO 319 | -0.2600 S SER 320 | -0.1800 T THR 321 | 0.3700 W TRP 322 | 0.0200 Y TYR 323 | 0.5400 V VAL 324 | 325 | 15. 
Accessible surface area in the standard state (Rose et al. 1985) 326 | Science (1985) 229 834-838 327 | 118.1000 A ALA 328 | 256.0000 R ARG 329 | 165.5000 N ASN 330 | 158.7000 D ASP 331 | 146.1000 C CYS 332 | 193.2000 Q GLN 333 | 186.2000 E GLU 334 | 88.1000 G GLY 335 | 202.5000 H HIS 336 | 181.0000 I ILE 337 | 193.1000 L LEU 338 | 225.8000 K LYS 339 | 203.4000 M MET 340 | 222.8000 F PHE 341 | 146.8000 P PRO 342 | 129.8000 S SER 343 | 152.5000 T THR 344 | 266.3000 W TRP 345 | 236.8000 Y TYR 346 | 164.5000 V VAL 347 | 348 | 16. Average accessible surface area in folded proteins (Rose et al. 1985) 349 | Science (1985) 229 834-838 350 | 31.5000 A ALA 351 | 93.8000 R ARG 352 | 62.2000 N ASN 353 | 60.9000 D ASP 354 | 13.9000 C CYS 355 | 74.0000 Q GLN 356 | 72.3000 E GLU 357 | 25.2000 G GLY 358 | 46.7000 H HIS 359 | 23.0000 I ILE 360 | 29.0000 L LEU 361 | 110.300 K LYS 362 | 30.5000 M MET 363 | 28.7000 F PHE 364 | 53.7000 P PRO 365 | 44.2000 S SER 366 | 46.0000 T THR 367 | 41.7000 W TRP 368 | 59.1000 Y TYR 369 | 23.5000 V VAL 370 | 371 | 17. Average surrounding hydrophobicity (Manavalan and Ponnuswamy 1978) 372 | Nature (1978) 275 673-674 373 | 12.9700 A ALA 374 | 11.7200 R ARG 375 | 11.4200 N ASN 376 | 10.8500 D ASP 377 | 14.6400 C CYS 378 | 11.7600 Q GLN 379 | 11.8900 E GLU 380 | 12.4300 G GLY 381 | 12.1600 H HIS 382 | 15.6700 I ILE 383 | 14.9000 L LEU 384 | 11.3600 K LYS 385 | 14.3900 M MET 386 | 14.0000 F PHE 387 | 11.3700 P PRO 388 | 11.2300 S SER 389 | 11.6900 T THR 390 | 13.9300 W TRP 391 | 13.4200 Y TYR 392 | 15.7100 V VAL 393 | 394 | 18. Hydrophilicity (Hopp and Woods 1981) 395 | Proc. Natl. Acad. Sci.
USA (1981) 78 3824-3828 396 | -0.5000 A ALA 397 | 3.0000 R ARG 398 | 0.2000 N ASN 399 | 3.0000 D ASP 400 | -1.0000 C CYS 401 | 0.2000 Q GLN 402 | 3.0000 E GLU 403 | 0.0000 G GLY 404 | -0.5000 H HIS 405 | -1.8000 I ILE 406 | -1.8000 L LEU 407 | 3.0000 K LYS 408 | -1.3000 M MET 409 | -2.5000 F PHE 410 | 0.0000 P PRO 411 | 0.3000 S SER 412 | -0.4000 T THR 413 | -3.4000 W TRP 414 | -2.3000 Y TYR 415 | -1.5000 V VAL 416 | 417 | 19. Hydropathy (Kyte and Doolittle 1982) 418 | J. Mol. Biol. (1982) 157 105-132 419 | 1.8000 A ALA 420 | -4.5000 R ARG 421 | -3.5000 N ASN 422 | -3.5000 D ASP 423 | 2.5000 C CYS 424 | -3.5000 Q GLN 425 | -3.5000 E GLU 426 | -0.4000 G GLY 427 | -3.2000 H HIS 428 | 4.5000 I ILE 429 | 3.8000 L LEU 430 | -3.9000 K LYS 431 | 1.9000 M MET 432 | 2.8000 F PHE 433 | -1.6000 P PRO 434 | -0.8000 S SER 435 | -0.7000 T THR 436 | -0.9000 W TRP 437 | -1.3000 Y TYR 438 | 4.2000 V VAL 439 | 440 | 20. Hydrophilicity from HPLC (Parker et al. 1986) 441 | Biochemistry (1986) 25 5425-5432 442 | 2.1000 A ALA 443 | 4.2000 R ARG 444 | 7.0000 N ASN 445 | 10.000 D ASP 446 | 1.4000 C CYS 447 | 6.0000 Q GLN 448 | 7.8000 E GLU 449 | 5.7000 G GLY 450 | 2.1000 H HIS 451 | -8.000 I ILE 452 | -9.200 L LEU 453 | 5.7000 K LYS 454 | -4.200 M MET 455 | -9.200 F PHE 456 | 2.1000 P PRO 457 | 6.5000 S SER 458 | 5.2000 T THR 459 | -10.00 W TRP 460 | -1.900 Y TYR 461 | -3.700 V VAL 462 | 463 | 21. Hydrophobicity (Jones 1975) 464 | J.theor.Biol.(1975) 50 167-183 465 | 0.8700 A ALA 466 | 0.8500 R ARG 467 | 0.0900 N ASN 468 | 0.6600 D ASP 469 | 1.5200 C CYS 470 | 0.0000 Q GLN 471 | 0.6700 E GLU 472 | 0.1000 G GLY 473 | 0.8700 H HIS 474 | 3.1500 I ILE 475 | 2.1700 L LEU 476 | 1.6400 K LYS 477 | 1.6700 M MET 478 | 2.8700 F PHE 479 | 2.7700 P PRO 480 | 0.0700 S SER 481 | 0.0700 T THR 482 | 3.7700 W TRP 483 | 2.6700 Y TYR 484 | 1.8700 V VAL 485 | 486 | 22. 
Refractivity (Jones 1975) 487 | J.theor.Biol.(1975) 50 167-183 488 | 4.3400 A ALA 489 | 26.6600 R ARG 490 | 13.2800 N ASN 491 | 12.0000 D ASP 492 | 35.7700 C CYS 493 | 17.5600 Q GLN 494 | 17.2600 E GLU 495 | 0.0000 G GLY 496 | 21.8100 H HIS 497 | 19.0600 I ILE 498 | 18.7800 L LEU 499 | 21.2900 K LYS 500 | 21.6400 M MET 501 | 29.4000 F PHE 502 | 10.9300 P PRO 503 | 6.3500 S SER 504 | 11.0100 T THR 505 | 42.5300 W TRP 506 | 31.5300 Y TYR 507 | 13.9200 V VAL 508 | 509 | 23. Percentage of buried residues (Janin et al. 1978) 510 | J. Mol. Biol. (1978) 125 357-386 511 | 51.0000 A ALA 512 | 5.0000 R ARG 513 | 22.0000 N ASN 514 | 19.0000 D ASP 515 | 74.0000 C CYS 516 | 16.0000 Q GLN 517 | 16.0000 E GLU 518 | 52.0000 G GLY 519 | 34.0000 H HIS 520 | 66.0000 I ILE 521 | 60.0000 L LEU 522 | 3.0000 K LYS 523 | 52.0000 M MET 524 | 58.0000 F PHE 525 | 25.0000 P PRO 526 | 35.0000 S SER 527 | 30.0000 T THR 528 | 49.0000 W TRP 529 | 24.0000 Y TYR 530 | 64.0000 V VAL 531 | 532 | 24. Normalized frequency of alpha-helix with weights (Levitt 1978) 533 | Biochemistry (1978) 17 4277-4285 534 | 1.2900 A ALA 535 | 0.9600 R ARG 536 | 0.9000 N ASN 537 | 1.0400 D ASP 538 | 1.1100 C CYS 539 | 1.2700 Q GLN 540 | 1.4400 E GLU 541 | 0.5600 G GLY 542 | 1.2200 H HIS 543 | 0.9700 I ILE 544 | 1.3000 L LEU 545 | 1.2300 K LYS 546 | 1.4700 M MET 547 | 1.0700 F PHE 548 | 0.5200 P PRO 549 | 0.8200 S SER 550 | 0.8200 T THR 551 | 0.9900 W TRP 552 | 0.7200 Y TYR 553 | 0.9100 V VAL 554 | 555 | 25. Normalized frequency of beta-sheet with weights (Levitt 1978) 556 | Biochemistry (1978) 17 4277-4285 557 | 0.9000 A ALA 558 | 0.9900 R ARG 559 | 0.7600 N ASN 560 | 0.7200 D ASP 561 | 0.7400 C CYS 562 | 0.8000 Q GLN 563 | 0.7500 E GLU 564 | 0.9200 G GLY 565 | 1.0800 H HIS 566 | 1.4500 I ILE 567 | 1.0200 L LEU 568 | 0.7700 K LYS 569 | 0.9700 M MET 570 | 1.3200 F PHE 571 | 0.6400 P PRO 572 | 0.9500 S SER 573 | 1.2100 T THR 574 | 1.1400 W TRP 575 | 1.2500 Y TYR 576 | 1.4900 V VAL 577 | 578 | 26. 
Normalized frequency for reverse turn with weights (Levitt 1978) 579 | Biochemistry (1978) 17 4277-4285 580 | 0.7800 A ALA 581 | 0.8800 R ARG 582 | 1.2800 N ASN 583 | 1.4100 D ASP 584 | 0.8000 C CYS 585 | 0.9700 Q GLN 586 | 1.0000 E GLU 587 | 1.6400 G GLY 588 | 0.6900 H HIS 589 | 0.5100 I ILE 590 | 0.5900 L LEU 591 | 0.9600 K LYS 592 | 0.3900 M MET 593 | 0.5800 F PHE 594 | 1.9100 P PRO 595 | 1.3300 S SER 596 | 1.0300 T THR 597 | 0.7500 W TRP 598 | 1.0500 Y TYR 599 | 0.4700 V VAL 600 | 601 | 27. Percentage of exposed residues (Janin et al. 1978) 602 | J. Mol. Biol. (1978) 125 357-386 603 | 15.0000 A ALA 604 | 67.0000 R ARG 605 | 49.0000 N ASN 606 | 50.0000 D ASP 607 | 5.0000 C CYS 608 | 56.0000 Q GLN 609 | 55.0000 E GLU 610 | 10.0000 G GLY 611 | 34.0000 H HIS 612 | 13.0000 I ILE 613 | 16.0000 L LEU 614 | 85.0000 K LYS 615 | 20.0000 M MET 616 | 10.0000 F PHE 617 | 45.0000 P PRO 618 | 32.0000 S SER 619 | 32.0000 T THR 620 | 17.0000 W TRP 621 | 41.0000 Y TYR 622 | 14.0000 V VAL 623 | 624 | 28. Hydrophobic index (Ponnuswamy et al. 1980) 625 | Biochim.Biophys.Acta(1980) 623 301-316 626 | 0.8700 A ALA 627 | 0.8500 R ARG 628 | 0.0900 N ASN 629 | 0.6600 D ASP 630 | 1.5200 C CYS 631 | 0.0000 Q GLN 632 | 0.6700 E GLU 633 | 0.1000 G GLY 634 | 0.8000 H HIS 635 | 3.1500 I ILE 636 | 2.1700 L LEU 637 | 1.6400 K LYS 638 | 1.6700 M MET 639 | 2.8700 F PHE 640 | 2.7700 P PRO 641 | 0.0700 S SER 642 | 0.0700 T THR 643 | 3.7700 W TRP 644 | 2.6700 Y TYR 645 | 1.8700 V VAL 646 | 647 | 29. Hydrophobicity in folded form (Ponnuswamy et al. 
1980) 648 | Biochim.Biophys.Acta(1980) 623 301-316 649 | 12.2800 A ALA 650 | 11.4900 R ARG 651 | 11.0000 N ASN 652 | 10.9700 D ASP 653 | 14.9300 C CYS 654 | 11.2800 Q GLN 655 | 11.1900 E GLU 656 | 12.0100 G GLY 657 | 12.8400 H HIS 658 | 14.7700 I ILE 659 | 14.1000 L LEU 660 | 10.8000 K LYS 661 | 14.3300 M MET 662 | 13.4300 F PHE 663 | 11.1900 P PRO 664 | 11.2600 S SER 665 | 11.6500 T THR 666 | 12.9500 W TRP 667 | 13.2900 Y TYR 668 | 15.0700 V VAL 669 | 670 | 30. Hydrophobicity in unfolded form (Ponnuswamy et al. 1980) 671 | Biochim.Biophys.Acta(1980) 623 301-316 672 | 4.6600 A ALA 673 | 4.6800 R ARG 674 | 4.8700 N ASN 675 | 4.7900 D ASP 676 | 4.0000 C CYS 677 | 4.6100 Q GLN 678 | 4.8100 E GLU 679 | 4.7000 G GLY 680 | 4.9900 H HIS 681 | 4.7800 I ILE 682 | 4.7300 L LEU 683 | 5.0800 K LYS 684 | 4.5000 M MET 685 | 4.4400 F PHE 686 | 4.5500 P PRO 687 | 4.3300 S SER 688 | 4.7500 T THR 689 | 4.5400 W TRP 690 | 4.7600 Y TYR 691 | 4.6900 V VAL 692 | 693 | 31. Hydrophobicity gain (Ponnuswamy et al. 1980) 694 | Biochim.Biophys.Acta(1980) 623 301-316 695 | 2.6300 A ALA 696 | 2.4500 R ARG 697 | 2.2700 N ASN 698 | 2.2900 D ASP 699 | 3.3600 C CYS 700 | 2.4500 Q GLN 701 | 2.3100 E GLU 702 | 2.5500 G GLY 703 | 2.5700 H HIS 704 | 3.0800 I ILE 705 | 2.9800 L LEU 706 | 2.1200 K LYS 707 | 3.1800 M MET 708 | 3.0200 F PHE 709 | 2.4600 P PRO 710 | 2.6000 S SER 711 | 2.5500 T THR 712 | 2.8500 W TRP 713 | 2.7900 Y TYR 714 | 3.2100 V VAL 715 | 716 | 32. Polarity (Ponnuswamy et al. 1980) 717 | Biochim.Biophys.Acta(1980) 623 301-316 718 | 0.0000 A ALA 719 | 52.000 R ARG 720 | 3.3800 N ASN 721 | 40.700 D ASP 722 | 1.4800 C CYS 723 | 3.5300 Q GLN 724 | 49.910 E GLU 725 | 0.0000 G GLY 726 | 51.600 H HIS 727 | 0.1500 I ILE 728 | 0.4500 L LEU 729 | 49.500 K LYS 730 | 1.4300 M MET 731 | 0.3500 F PHE 732 | 1.5800 P PRO 733 | 1.6700 S SER 734 | 1.6600 T THR 735 | 2.1000 W TRP 736 | 1.6100 Y TYR 737 | 0.1300 V VAL 738 | 739 | 33. Average isotopic mass (Biemann, 1990) 740 | Meth.Enzymol.
(1990) v.193, p.888 741 | 70.079 A ALA 742 | 156.188 R ARG 743 | 114.104 N ASN 744 | 115.089 D ASP 745 | 103.144 C CYS 746 | 128.131 Q GLN 747 | 129.116 E GLU 748 | 57.052 G GLY 749 | 137.142 H HIS 750 | 113.160 I ILE 751 | 113.160 L LEU 752 | 128.174 K LYS 753 | 131.198 M MET 754 | 147.177 F PHE 755 | 97.177 P PRO 756 | 87.078 S SER 757 | 101.105 T THR 758 | 186.213 W TRP 759 | 163.170 Y TYR 760 | 99.133 V VAL 761 | 762 | 34. Isoelectric point (Zimmerman et al., 1968) 763 | J. Theor. Biol. (1968) v.21, p.170-201 764 | 6.0000 A ALA 765 | 10.7600 R ARG 766 | 5.4100 N ASN 767 | 2.7700 D ASP 768 | 5.0500 C CYS 769 | 5.6500 Q GLN 770 | 3.2200 E GLU 771 | 5.9700 G GLY 772 | 7.5900 H HIS 773 | 6.0200 I ILE 774 | 5.9800 L LEU 775 | 9.7400 K LYS 776 | 5.7400 M MET 777 | 5.4800 F PHE 778 | 6.3000 P PRO 779 | 5.6800 S SER 780 | 5.6600 T THR 781 | 5.8900 W TRP 782 | 5.6600 Y TYR 783 | 5.9600 V VAL 784 | 785 | 35. pK of side chain (White et al., 1978) 786 | White A,Handler P,Smith EL,Hill RL,Lehman IR;Principles of Biochemistry;1978 787 | 0.0000 A ALA 788 | 12.480 R ARG 789 | 0.0000 N ASN 790 | 3.6500 D ASP 791 | 8.1800 C CYS 792 | 0.0000 Q GLN 793 | 4.2500 E GLU 794 | 0.0000 G GLY 795 | 6.0000 H HIS 796 | 0.0000 I ILE 797 | 0.0000 L LEU 798 | 10.530 K LYS 799 | 0.0000 M MET 800 | 0.0000 F PHE 801 | 0.0000 P PRO 802 | 0.0000 S SER 803 | 0.0000 T THR 804 | 0.0000 W TRP 805 | 10.700 Y TYR 806 | 0.0000 V VAL 807 | 808 | 36. Energy of transfer from water to ethanol kcal/mol (Nozaki & Tanford, 1971) 809 | J.Biol.Chem.
(1971) 246 2211-2217 810 | -0.5000 A ALA 811 | -0.1000 R ARG 812 | 0.1100 N ASN 813 | -0.2000 D ASP 814 | -0.5000 C CYS 815 | 0.2000 Q GLN 816 | -0.3000 E GLU 817 | 0.0000 G GLY 818 | -0.4000 H HIS 819 | -2.0000 I ILE 820 | -2.0000 L LEU 821 | -0.3000 K LYS 822 | -1.3000 M MET 823 | -2.5000 F PHE 824 | -1.0000 P PRO 825 | -0.2000 S SER 826 | -0.4000 T THR 827 | -3.0000 W TRP 828 | -2.2000 Y TYR 829 | -1.5000 V VAL 830 | -------------------------------------------------------------------------------- /pepdata/matrices/helix_vs_coil.txt: -------------------------------------------------------------------------------- 1 | #H ZHAC000103 2 | #D Environment-dependent residue contact energies (rows = helix, cols = coil) 3 | #R PMID:10706611 4 | #A Zhang, C. and Kim, S.H. 5 | #T Environment-dependent residue contact energies for proteins 6 | #J Proc. Natl. Acad. Sci. USA 97, 2550-2555 (2000) 7 | #M rows = ARNDCQEGHILKMFPSTWYV, cols = ARNDCQEGHILKMFPSTWYV 8 | 0.12 1.17 0.84 0.90 -0.81 1.16 1.44 0.10 0.69 -0.81 -0.78 1.16 -0.22 -0.67 0.61 0.47 0.36 -0.72 -0.37 -0.43 9 | 0.98 1.65 1.16 0.60 -0.21 1.26 1.12 1.09 1.16 -0.04 -0.09 2.37 0.47 -0.04 1.22 1.05 0.92 -0.09 0.06 0.32 10 | 0.69 1.16 1.16 1.22 -0.06 1.23 1.45 0.96 0.88 0.26 0.12 1.48 0.32 0.03 1.14 0.73 0.62 0.62 0.53 0.23 11 | 0.90 0.40 1.06 1.45 0.58 1.88 2.18 1.13 0.69 0.43 0.65 0.95 0.75 0.33 1.41 0.39 0.54 -0.10 0.12 0.77 12 | -0.83 0.10 0.40 0.12 -2.65 -0.24 0.96 -0.26 -0.26 -1.61 -1.77 0.80 -1.02 -1.47 -0.31 -0.31 -0.49 -1.30 -0.98 -1.62 13 | 1.13 1.10 1.28 1.37 0.14 1.62 1.84 1.29 1.31 0.05 -0.05 1.50 0.41 0.20 1.14 0.86 0.62 0.45 0.31 0.48 14 | 1.33 0.91 1.33 1.60 0.31 1.60 1.93 1.62 1.01 0.33 0.38 1.12 0.82 0.55 1.54 0.78 0.54 0.23 0.52 0.86 15 | -0.22 0.72 0.27 0.47 -0.95 0.42 1.39 -0.23 0.40 -0.48 -0.81 1.04 -0.62 -0.36 0.41 0.23 -0.04 -0.71 0.08 -0.35 16 | 0.47 0.81 0.95 0.51 -1.56 0.90 0.89 0.86 0.20 -0.43 -0.48 1.31 -0.63 -0.41 0.56 0.40 0.28 -0.20 -0.22 -0.21 17 | -0.58 0.17 0.61 0.46 -1.17 0.24 0.80 0.04 
-0.16 -1.64 -1.66 0.87 -0.89 -1.56 -0.27 0.02 -0.32 -1.40 -1.13 -1.36 18 | -0.44 0.20 0.50 0.71 -1.56 0.11 0.82 0.28 -0.15 -1.67 -1.62 0.72 -0.96 -1.55 0.02 0.19 -0.09 -1.46 -0.95 -1.32 19 | 1.07 2.48 1.75 0.98 0.42 1.68 1.04 1.31 1.39 0.41 0.29 2.95 0.98 0.27 1.63 1.51 1.48 0.32 0.60 0.64 20 | -0.22 0.65 0.76 0.88 -0.95 0.68 1.92 0.27 0.31 -1.32 -1.04 1.02 -0.57 -1.60 0.07 0.47 0.04 -1.29 -0.85 -0.82 21 | -0.33 -0.06 0.42 0.42 -1.90 0.25 0.64 0.12 -0.01 -1.64 -1.50 0.58 -1.36 -1.77 -0.30 0.02 0.04 -1.41 -1.36 -1.34 22 | 0.78 1.30 1.31 1.27 -0.04 1.44 1.71 0.69 0.84 0.05 0.15 1.68 0.38 0.27 1.05 1.19 0.83 -0.24 0.23 0.12 23 | 0.46 1.07 1.04 0.73 -0.31 1.47 1.23 0.57 0.58 -0.11 -0.24 1.37 0.08 -0.34 0.76 0.51 0.48 -0.04 0.47 0.18 24 | 0.50 0.90 0.75 0.91 -0.26 1.03 1.25 0.55 0.55 -0.20 -0.26 1.42 0.50 -0.22 0.88 0.69 0.56 0.41 0.11 -0.15 25 | -0.41 -0.06 -0.19 0.32 -0.79 -0.14 0.58 0.07 -0.62 -1.58 -1.16 0.18 -1.03 -1.33 -0.56 0.15 -0.19 -1.83 -0.67 -0.92 26 | -0.22 -0.07 0.52 0.46 -0.87 0.38 0.59 0.40 -0.17 -1.29 -1.15 0.83 -0.98 -1.16 -0.16 0.34 -0.12 -0.79 -0.77 -0.78 27 | -0.51 0.49 0.48 0.67 -1.40 0.66 0.63 -0.06 0.28 -1.25 -1.50 1.14 -0.93 -1.36 -0.04 0.10 -0.01 -1.11 -0.82 -1.14 28 | 29 | -------------------------------------------------------------------------------- /pepdata/matrices/helix_vs_strand.txt: -------------------------------------------------------------------------------- 1 | #H ZHAC000102 2 | #D Environment-dependent residue contact energies (rows = helix, cols = strand) 3 | #R PMID:10706611 4 | #A Zhang, C. and Kim, S.H. 5 | #T Environment-dependent residue contact energies for proteins 6 | #J Proc. Natl. Acad. Sci. 
USA 97, 2550-2555 (2000) 7 | #M rows = ARNDCQEGHILKMFPSTWYV, cols = ARNDCQEGHILKMFPSTWYV 8 | -0.94 1.26 0.55 0.76 -1.54 1.14 1.57 -0.78 0.44 -1.59 -1.64 1.91 -0.90 -1.49 0.28 0.20 -0.04 -0.92 -0.75 -1.45 9 | 0.56 1.79 2.31 0.79 -0.67 2.54 0.72 1.09 0.94 -0.01 0.01 3.68 0.89 -0.05 1.37 0.83 1.35 0.00 0.33 0.44 10 | 0.59 2.21 1.82 0.77 -0.90 0.46 3.06 -0.16 0.63 -0.33 0.20 2.43 0.99 0.63 0.54 0.24 0.63 0.11 -0.19 0.23 11 | 0.66 0.76 0.76 1.19 -0.21 1.66 2.22 0.29 0.57 0.59 0.79 1.13 1.41 0.49 1.70 1.03 1.19 1.85 0.18 0.86 12 | -1.75 0.78 -1.00 0.32 -3.64 0.48 0.87 -1.67 -0.62 -2.77 -2.32 0.19 -1.22 -2.67 -1.62 -0.83 -1.14 -0.52 -1.94 -2.35 13 | 0.33 2.15 1.22 1.26 1.37 1.17 2.56 0.92 1.02 0.11 0.00 2.58 0.79 -0.26 0.53 1.19 1.11 0.21 0.39 0.15 14 | 0.82 1.05 2.18 2.11 0.01 2.42 2.58 1.15 0.97 0.20 0.31 1.31 1.25 0.12 2.00 1.09 1.13 0.58 0.31 0.39 15 | -0.40 0.95 0.03 0.14 -1.00 0.34 0.99 -1.32 0.13 -1.40 -1.36 1.58 -0.90 -1.41 0.82 -0.27 0.21 -0.59 -1.27 -1.09 16 | -0.75 2.19 0.13 0.68 -1.37 1.98 1.13 0.01 1.52 -0.83 -0.58 2.26 -0.82 -1.01 0.53 -0.17 0.02 -49.00 -0.61 -0.56 17 | -1.99 0.25 -0.20 1.00 -2.44 -0.12 0.88 -1.54 -0.05 -2.64 -2.33 0.75 -1.85 -2.46 -1.06 -0.59 -0.65 -1.82 -1.88 -2.45 18 | -2.02 0.34 -0.04 0.13 -2.29 0.24 0.73 -1.27 -0.46 -2.53 -2.44 0.67 -1.80 -2.28 -1.29 -0.40 -0.34 -1.76 -1.66 -2.26 19 | 0.60 3.11 2.23 1.06 0.50 1.80 1.65 0.82 1.25 0.10 0.34 3.51 0.98 -0.21 1.15 2.09 1.30 -0.14 0.28 0.13 20 | -1.54 -0.06 -0.63 1.76 -2.51 0.14 0.72 -1.74 0.07 -2.27 -2.22 1.27 -1.77 -1.87 0.34 -0.02 -0.21 -0.93 -1.54 -1.81 21 | -2.12 0.33 -0.70 0.17 -2.30 -0.59 0.26 -1.60 -0.88 -2.53 -2.44 -0.42 -1.83 -2.68 -1.40 -0.82 -0.61 -1.63 -1.83 -2.25 22 | 0.63 2.43 -0.19 1.31 -1.63 1.46 1.91 0.08 1.11 -0.20 0.47 1.94 -0.34 0.15 0.57 0.00 1.15 0.06 0.26 -0.06 23 | -0.41 0.88 1.02 1.04 -0.21 1.27 0.94 0.04 0.75 -0.48 -0.67 2.28 0.45 -0.92 0.75 0.50 0.96 0.22 -0.19 -0.54 24 | -0.32 1.48 0.35 0.43 -1.44 0.38 1.36 -0.38 0.20 -1.14 -1.00 1.38 -0.35 -0.97 -0.05 -0.16 0.29 
-0.53 -0.76 -0.73 25 | -1.85 0.45 -0.03 0.80 -1.64 -0.23 0.11 -0.95 0.67 -1.58 -2.13 0.61 -1.75 -1.59 -1.07 -0.34 -0.40 -1.29 -1.27 -1.79 26 | -0.88 -0.20 -0.29 0.14 -1.31 0.09 0.71 -0.56 -0.57 -1.66 -1.38 1.40 -1.60 -1.97 -0.73 -0.32 -0.37 -1.40 -0.96 -1.38 27 | -1.74 0.85 0.24 0.72 -2.25 0.45 0.81 -1.29 -0.24 -2.46 -2.38 0.37 -1.21 -2.16 -1.00 -0.10 -0.57 -1.34 -1.52 -2.31 28 | -------------------------------------------------------------------------------- /pepdata/matrices/pmbec.mat: -------------------------------------------------------------------------------- 1 | A C D E F G H I K L M N P Q R S T V W Y 2 | A 0.322860152036 0.0113750373506 -0.0156239175966 -0.00259952715456 -0.0508792185716 0.0382679273874 -0.0832539299638 -0.00196691041626 -0.0103729638696 -0.042393907322 -0.0651042403697 -0.0853704925231 0.0757409633086 -0.0483151514798 -0.0136431408498 0.038455041596 0.0520376087986 0.081101427454 -0.125564718844 -0.0747500389698 3 | C 0.0113750373506 0.100680270274 0.0102951033136 0.0147570340938 0.0345785831581 0.00933463557214 -0.00750101609651 0.00476007239717 -0.0459237939975 -0.0182998264075 -0.0155971113182 0.0021128481374 -0.00860770840682 -0.0309903425175 -0.0482562439545 -0.0217965163697 -0.0227322740574 -0.0154276574266 0.0412325888637 0.00600631739163 4 | D -0.0156239175966 0.0102951033136 0.157208255034 0.0724279735923 -0.0189545540921 -0.00870389879389 -0.0180188107498 -0.0283467966687 -0.0634240071162 -0.0279979457557 -0.0241192288182 0.0194310374127 0.042784078891 0.000437307476866 -0.0591268568576 -0.0104660502173 0.00656101264316 -0.0193560886308 0.00415097887978 -0.0191575919464 5 | E -0.00259952715456 0.0147570340938 0.0724279735923 0.131775168933 -0.00519060032543 -0.00547805492393 -0.0335600821273 -0.0135417817213 -0.069471604426 0.00353800457557 -0.017166710134 0.00534055417468 0.022589833552 0.0281404974641 -0.0697402405064 -0.0172364513778 -0.0054830504799 -0.00806269508269 -0.00791955104235 -0.0231187170833 6 | F -0.0508792185716 
0.0345785831581 -0.0189545540921 -0.00519060032543 0.259179996995 -0.00445131805782 -0.00639743486807 0.0628717025094 -0.049227253611 0.0488666377736 0.0315353570161 -0.0223593028205 -0.0919732521492 -0.0930189756622 -0.0626297946351 -0.0868415233743 -0.0777292391855 -0.015794520965 0.0625957490761 0.0858189617896 7 | G 0.0382679273874 0.00933463557214 -0.00870389879389 -0.00547805492393 -0.00445131805782 0.122499934434 -0.025558278086 -0.0207027221208 -0.0137316756786 -0.0326424665142 -0.0264215095016 -0.00403752148352 0.0094352664965 0.00425299048772 -0.0232280105465 0.0304312733191 0.00861853592388 -0.0127217072682 -0.0246539147339 -0.0205094859119 8 | H -0.0832539299638 -0.00750101609651 -0.0180188107498 -0.0335600821273 -0.00639743486807 -0.025558278086 0.207657765989 -0.0888505073496 0.0761447053198 -0.0351727012494 -0.000760393877348 0.0353903619255 -0.0682087048807 -0.00886454093107 0.109052662874 0.00938179429131 -0.0234122309305 -0.0870188771708 0.0123622841944 0.0365879336865 9 | I -0.00196691041626 0.00476007239717 -0.0283467966687 -0.0135417817213 0.0628717025094 -0.0207027221208 -0.0888505073496 0.27773187827 -0.0381642025534 0.0886112938313 0.0551293441776 -0.0593694184462 -0.039207153398 -0.0626883806129 -0.110160438997 -0.0618078497671 -0.0339233811197 0.091300054417 0.00138488610169 -0.0230596885329 10 | K -0.0103729638696 -0.0459237939975 -0.0634240071162 -0.069471604426 -0.049227253611 -0.0137316756786 0.0761447053198 -0.0381642025534 0.273355694189 -0.0177282663533 -0.00817300753785 -0.0339854863013 -0.0484016323395 -0.0331603641198 0.21516548555 0.00476731287861 -0.0331318604828 -0.0400367780545 -0.0598522551401 -0.00464804635586 11 | L -0.042393907322 -0.0182998264075 -0.0279979457557 0.00353800457557 0.0488666377736 -0.0326424665142 -0.0351727012494 0.0886112938313 -0.0177282663533 0.162738321535 0.0750528874999 -0.0111666419731 -0.051023845781 -0.00134001844501 -0.074934598492 -0.0584956357369 -0.031311528799 0.0449678806271 
-0.00754567267671 -0.0137219703366 12 | M -0.0651042403697 -0.0155971113182 -0.0241192288182 -0.017166710134 0.0315353570161 -0.0264215095016 -0.000760393877348 0.0551293441776 -0.00817300753785 0.0750528874999 0.156957428383 0.00753829785887 -0.091647674076 0.00190198496329 -0.0257018542091 -0.0295349216339 -0.0454820084051 -0.0120310888206 0.0210041287765 0.0126203200265 13 | N -0.0853704925231 0.0021128481374 0.0194310374127 0.00534055417468 -0.0223593028205 -0.00403752148352 0.0353903619255 -0.0593694184462 -0.0339854863013 -0.0111666419731 0.00753829785887 0.151487423988 -0.0106077901881 0.0413965183445 -0.0338327997913 0.0170820288313 -0.00295174153884 -0.0436807942705 0.0296409813073 -0.00205806264345 14 | P 0.0757409633086 -0.00860770840682 0.042784078891 0.022589833552 -0.0919732521492 0.0094352664965 -0.0682087048807 -0.039207153398 -0.0484016323395 -0.051023845781 -0.091647674076 -0.0106077901881 0.354629507834 0.0481497903134 -0.0377142358446 -0.00687173098621 0.0199181111388 0.0225294243984 -0.0525069717881 -0.0890062760945 15 | Q -0.0483151514798 -0.0309903425175 0.000437307476866 0.0281404974641 -0.0930189756622 0.00425299048772 -0.00886454093107 -0.0626883806129 -0.0331603641198 -0.00134001844501 0.00190198496329 0.0413965183445 0.0481497903134 0.177175171536 0.00715630304762 0.0357241930907 0.027467611659 -0.032780800211 -0.0118972341632 -0.0487465602402 16 | R -0.0136431408498 -0.0482562439545 -0.0591268568576 -0.0697402405064 -0.0626297946351 -0.0232280105465 0.109052662874 -0.110160438997 0.21516548555 -0.074934598492 -0.0257018542091 -0.0338327997913 -0.0377142358446 0.00715630304762 0.389022190137 0.0204288367942 -0.0408668326839 -0.0934989556047 -0.0557627605155 0.00827128508526 17 | S 0.038455041596 -0.0217965163697 -0.0104660502173 -0.0172364513778 -0.0868415233743 0.0304312733191 0.00938179429131 -0.0618078497671 0.00476731287861 -0.0584956357369 -0.0295349216339 0.0170820288313 -0.00687173098621 0.0357241930907 0.0204288367942 
0.161573840097 0.0839261885951 -0.00816241136786 -0.0444334801409 -0.0561239385213 18 | T 0.0520376087986 -0.0227322740574 0.00656101264316 -0.0054830504799 -0.0777292391855 0.00861853592388 -0.0234122309305 -0.0339233811197 -0.0331318604828 -0.031311528799 -0.0454820084051 -0.00295174153884 0.0199181111388 0.027467611659 -0.0408668326839 0.0839261885951 0.142525860495 0.0493244941272 -0.0264928932645 -0.0468623824337 19 | V 0.081101427454 -0.0154276574266 -0.0193560886308 -0.00806269508269 -0.015794520965 -0.0127217072682 -0.0870188771708 0.091300054417 -0.0400367780545 0.0449678806271 -0.0120310888206 -0.0436807942705 0.0225294243984 -0.032780800211 -0.0934989556047 -0.00816241136786 0.0493244941272 0.172778293246 -0.0289445753682 -0.0444846240282 20 | W -0.125564718844 0.0412325888637 0.00415097887978 -0.00791955104235 0.0625957490761 -0.0246539147339 0.0123622841944 0.00138488610169 -0.0598522551401 -0.00754567267671 0.0210041287765 0.0296409813073 -0.0525069717881 -0.0118972341632 -0.0557627605155 -0.0444334801409 -0.0264928932645 -0.0289445753682 0.194048086876 0.0791543436022 21 | Y -0.0747500389698 0.00600631739163 -0.0191575919464 -0.0231187170833 0.0858189617896 -0.0205094859119 0.0365879336865 -0.0230596885329 -0.00464804635586 -0.0137219703366 0.0126203200265 -0.00205806264345 -0.0890062760945 -0.0487465602402 0.00827128508526 -0.0561239385213 -0.0468623824337 -0.0444846240282 0.0791543436022 0.237788221516 22 | -------------------------------------------------------------------------------- /pepdata/matrices/strand_vs_coil.txt: -------------------------------------------------------------------------------- 1 | #H ZHAC000105 2 | #D Environment-dependent residue contact energies (rows = strand, cols = coil) 3 | #R PMID:10706611 4 | #A Zhang, C. and Kim, S.H. 5 | #T Environment-dependent residue contact energies for proteins 6 | #J Proc. Natl. Acad. Sci. 
USA 97, 2550-2555 (2000) 7 | #M rows = ARNDCQEGHILKMFPSTWYV, cols = ARNDCQEGHILKMFPSTWYV 8 | -0.57 0.47 0.30 0.62 -1.60 0.45 0.61 -0.24 0.07 -1.64 -1.63 0.62 -1.03 -1.55 -0.11 -0.10 -0.34 -1.44 -0.39 -1.55 9 | 0.23 0.79 0.76 0.39 -0.41 0.92 0.76 0.52 0.51 -0.30 0.13 1.58 0.88 -0.07 0.60 0.65 0.37 0.14 0.32 0.17 10 | -0.28 0.74 0.57 0.87 -0.68 0.52 1.00 -0.07 0.32 -0.31 -0.08 0.87 0.29 -0.17 0.57 0.11 0.19 0.04 0.24 -0.23 11 | 0.15 -0.25 0.46 0.69 -0.46 0.41 1.34 0.56 -0.51 -0.23 0.27 0.59 0.60 -0.38 1.02 0.08 0.05 -0.48 0.02 0.34 12 | -1.19 -0.46 0.21 0.51 -3.30 0.26 0.20 -1.03 -0.72 -1.55 -1.71 0.27 -1.24 -1.70 -0.50 -0.55 -0.97 -0.67 -1.26 -1.62 13 | 0.63 1.18 0.92 1.37 -0.30 0.93 1.27 0.56 0.91 -0.28 -0.11 0.98 0.15 -0.30 0.64 0.88 0.68 -0.44 0.66 0.15 14 | 0.97 0.89 1.37 1.89 0.30 1.25 2.34 0.98 0.58 0.20 0.50 0.67 1.23 0.58 1.26 0.95 1.06 0.04 0.87 0.48 15 | -0.64 0.12 0.27 0.31 -1.37 0.38 0.98 -0.40 -0.12 -1.58 -1.40 0.78 -0.46 -1.38 -0.21 0.05 -0.26 -1.41 -0.61 -1.13 16 | -0.02 0.75 0.68 0.14 -0.58 0.73 0.84 0.41 -0.64 -0.75 0.03 1.46 -0.16 -0.49 0.52 0.31 -0.11 -1.00 -0.58 0.03 17 | -0.94 -0.14 0.31 0.26 -1.70 0.07 0.46 -0.37 -0.50 -1.88 -1.79 0.84 -0.99 -1.82 -0.47 -0.05 -0.54 -1.65 -1.09 -1.64 18 | -0.76 0.32 0.43 0.25 -1.63 0.22 0.68 -0.17 -0.40 -1.84 -1.70 0.47 -1.06 -1.76 -0.39 0.09 -0.42 -1.81 -1.15 -1.64 19 | 1.02 1.99 1.18 0.59 0.08 1.10 0.60 0.61 0.95 0.24 0.34 2.69 0.97 -0.03 1.23 1.07 0.83 0.00 0.26 0.36 20 | -0.16 0.83 0.47 0.92 -1.63 0.36 0.71 -0.20 0.90 -1.00 -1.12 1.55 -0.31 -1.35 -0.01 0.34 0.20 -1.70 -0.60 -0.79 21 | -0.70 0.03 0.63 0.15 -1.26 0.29 0.35 -0.11 -0.36 -1.73 -1.55 0.71 -0.97 -1.55 -0.28 -0.09 -0.32 -1.23 -0.91 -1.30 22 | 0.17 0.50 0.60 0.67 -1.31 0.50 0.94 0.02 -0.45 -1.26 -0.91 1.08 0.83 -0.87 0.63 0.31 0.26 -0.50 -0.55 -0.79 23 | -0.06 0.99 0.73 0.86 -0.89 0.85 0.67 0.08 0.06 -0.22 -0.29 0.94 -0.08 -0.41 0.67 0.33 0.13 -1.01 0.13 -0.24 24 | 0.26 0.93 0.70 0.87 -0.78 0.58 1.20 0.12 0.52 -0.30 -0.24 1.11 0.01 -0.08 0.65 0.47 
0.41 -0.31 0.12 -0.32 25 | -0.03 -0.11 0.27 0.66 -1.50 0.65 0.50 -0.12 -0.32 -1.13 -1.01 0.52 -1.08 -1.04 -0.32 -0.03 -0.10 -0.67 -0.73 -0.64 26 | -0.44 0.20 0.20 0.20 -1.26 0.16 0.10 -0.21 -0.52 -1.26 -1.30 0.60 -0.76 -1.17 -0.42 0.05 -0.27 -1.20 -0.75 -0.84 27 | -0.83 0.20 0.48 0.62 -1.44 0.17 0.73 -0.12 -0.26 -1.64 -1.59 0.52 -0.70 -1.55 -0.28 0.12 -0.17 -1.16 -0.85 -1.42 -------------------------------------------------------------------------------- /pepdata/peptide_vectorizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2014-2016. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
class _AlphabetReducer(object):
    """
    Picklable callable which rewrites a peptide string letter by letter
    using a reduced amino acid alphabet (a dictionary mapping each residue
    letter to the letter representing its group).
    """
    def __init__(self, reduced_alphabet):
        self.reduced_alphabet = reduced_alphabet

    def __call__(self, s):
        return "".join([self.reduced_alphabet[si] for si in s])


def make_count_vectorizer(reduced_alphabet, max_ngram):
    """
    Create an unfitted CountVectorizer which counts character n-grams.

    Parameters
    ----------
    reduced_alphabet : dict or None
        Mapping from amino acid letter to reduced alphabet letter, applied
        to every input string before counting. None disables the mapping.

    max_ngram : int
        Longest n-gram to count; all lengths from 1 to max_ngram are used.
    """
    if reduced_alphabet is None:
        preprocessor = None
    else:
        # A lambda here would make the vectorizer (and any PeptideVectorizer
        # holding it) impossible to pickle, so use a module-level callable.
        preprocessor = _AlphabetReducer(reduced_alphabet)

    return CountVectorizer(
        analyzer='char',
        ngram_range=(1, max_ngram),
        # np.float was only an alias of the builtin float (i.e. float64)
        # and has been removed from recent NumPy releases.
        dtype=np.float64,
        preprocessor=preprocessor)


class PeptideVectorizer(object):
    """
    Make n-gram frequency vectors from peptide sequences
    """
    def __init__(
            self,
            max_ngram=1,
            normalize_row=True,
            reduced_alphabet=None,
            training_already_reduced=False):
        self.reduced_alphabet = reduced_alphabet
        self.max_ngram = max_ngram
        self.normalize_row = normalize_row
        self.training_already_reduced = training_already_reduced
        # created by fit / fit_transform
        self.count_vectorizer = None

    def __getstate__(self):
        return {
            'reduced_alphabet': self.reduced_alphabet,
            'count_vectorizer': self.count_vectorizer,
            'training_already_reduced': self.training_already_reduced,
            'normalize_row': self.normalize_row,
            'max_ngram': self.max_ngram,
        }

    def _to_dense(self, X_sparse):
        """
        Convert sparse n-gram counts to a dense result, L1-normalizing each
        row when `normalize_row` is set.
        """
        X = X_sparse.todense()
        if self.normalize_row:
            # NOTE(review): newer scikit-learn releases appear to reject
            # np.matrix input, so hand `normalize` a plain ndarray view.
            X = normalize(np.asarray(X), norm='l1')
        return X

    def fit_transform(self, amino_acid_strings):
        """
        Learn the n-gram vocabulary from the given peptide strings and
        return their (optionally row-normalized) n-gram count vectors.
        """
        self.count_vectorizer = \
            make_count_vectorizer(self.reduced_alphabet, self.max_ngram)

        if self.training_already_reduced:
            # Training strings are already in the reduced alphabet: fit a
            # vectorizer without a preprocessor and copy its vocabulary onto
            # the one which will reduce later inputs.
            c = make_count_vectorizer(None, self.max_ngram)
            X = c.fit_transform(amino_acid_strings)
            self.count_vectorizer.vocabulary_ = c.vocabulary_
        else:
            X = self.count_vectorizer.fit_transform(amino_acid_strings)
        return self._to_dense(X)

    def fit(self, amino_acid_strings):
        """
        Learn the n-gram vocabulary; returns self so calls can be chained.
        """
        self.fit_transform(amino_acid_strings)
        return self

    def transform(self, amino_acid_strings):
        """
        Vectorize peptide strings using the vocabulary learned by `fit`.
        """
        assert self.count_vectorizer, "Must call 'fit' before 'transform'"
        return self._to_dense(
            self.count_vectorizer.transform(amino_acid_strings))
def read_pmbec_coefficients(
        key_type='row',
        verbose=True,
        filename=join(MATRIX_DIR, 'pmbec.mat')):
    """
    Load the PMBEC coefficient matrix from a text file into a dictionary.

    The first non-empty line of the file lists the 20 column residues; every
    following non-empty line holds a row letter followed by 20 values.

    Parameters
    ------------

    filename : str
        Location of PMBEC coefficient matrix

    key_type : str
        'row' : every key is a single amino acid,
            which maps to a dictionary for that row
        'pair' : every key is a tuple of amino acids
        'pair_string' : every key is a string of two amino acid characters

    verbose : bool
        Print the header line and the residue letters parsed from it
    """
    d = {}

    def store_by_row(row_letter, col_letter, value):
        d.setdefault(row_letter, {})[col_letter] = value

    def store_by_pair(row_letter, col_letter, value):
        d[(row_letter, col_letter)] = value

    def store_by_pair_string(row_letter, col_letter, value):
        d["%s%s" % (row_letter, col_letter)] = value

    # choose how each coefficient gets stored before touching the file
    if key_type == 'row':
        add_pair = store_by_row
    elif key_type == 'pair':
        add_pair = store_by_pair
    else:
        assert key_type == 'pair_string', \
            "Invalid dictionary key type: %s" % key_type
        add_pair = store_by_pair_string

    with open(filename, 'r') as f:
        contents = f.read()
    lines = [line for line in contents.split('\n') if line]

    header = lines[0]
    if verbose:
        print(header)
    # column labels are the single-character tokens of the header
    residues = [token for token in header.split() if len(token) == 1]
    assert len(residues) == 20
    if verbose:
        print(residues)

    for line in lines[1:]:
        cols = [token for token in line.split(' ') if token and token != '\t']
        assert len(cols) == 21, "Expected 20 values + letter, got %s" % cols
        row_letter = cols[0]
        for i, col in enumerate(cols[1:]):
            add_pair(row_letter, residues[i], float(col))
    return d

# dictionary of PMBEC coefficient accessed like pmbec_dict["V"]["R"]
pmbec_dict = read_pmbec_coefficients(key_type="row")
pmbec_matrix = dict_to_amino_acid_matrix(pmbec_dict)
def dict_from_list(groups):
    """
    Map every amino acid letter to the first letter of the group containing
    it. If a letter occurs in several groups, the last group wins.
    """
    return {
        letter: group[0]
        for group in groups
        for letter in group
    }


gbmr4 = dict_from_list(["ADKERNTSQ", "YFLIVMCWH", "G", "P"])

sdm12 = dict_from_list([
    "A", "D", "KER", "N", "TSQ", "YF", "LIVM", "C", "W", "H", "G", "P"
])

hsdm17 = dict_from_list([
    "A", "D", "KE", "R", "N", "T", "S", "Q", "Y",
    "F", "LIV", "M", "C", "W", "H", "G", "P"
])

# Other alphabets from
# http://bio.math-inf.uni-greifswald.de/viscose/html/alphabets.html

# hydrophilic vs. hydrophobic
hp2 = dict_from_list(["AGTSNQDEHRKP", "CMFILVWY"])

murphy10 = dict_from_list([
    "LVIM", "C", "A", "G", "ST", "P", "FYW", "EDNQ", "KR", "H"
])

alex6 = dict_from_list(["C", "G", "P", "FYW", "AVILM", "STNQRHKDE"])

aromatic2 = dict_from_list(["FHWY", "ADKERNTSQLIVMCGP"])

hp_vs_aromatic = dict_from_list(["H", "CMILV", "FWY", "ADKERNTSQGP"])
def parse_interaction_table(table, amino_acid_order="ARNDCQEGHILKMFPSTWYV"):
    """
    Parse a 20x20 table of residue interaction coefficients.

    Parameters
    ----------
    table : str
        Text with one matrix row per line. Blank lines and lines starting
        with "#" are ignored. Values within a row may be separated by any
        run of whitespace.

    amino_acid_order : str
        Amino acid letters labelling the rows and columns, in table order.

    Returns a nested dictionary accessed like d[row_letter][col_letter].

    Raises AssertionError if the table does not have 20 rows of 20 values.
    """
    lines = [l.strip() for l in table.strip().split("\n")]
    lines = [l for l in lines if len(l) > 0 and not l.startswith("#")]
    assert len(lines) == 20, "Malformed amino acid interaction table"
    d = {}
    for i, line in enumerate(lines):
        # split() without arguments treats any run of spaces or tabs as one
        # separator, so no separate whitespace-collapsing pass is needed
        coeff_strings = line.split()
        assert len(coeff_strings) == 20, \
            "Malformed row in amino acid interaction table"
        x = amino_acid_order[i]
        d[x] = {}
        for j, coeff_str in enumerate(coeff_strings):
            d[x][amino_acid_order[j]] = float(coeff_str)
    return d


def transpose_interaction_dict(d):
    """
    Swap the row and column keys of a nested interaction dictionary, so
    that transposed[x][y] == d[y][x] for all canonical amino acid letters.
    """
    transposed = {}
    for x in canonical_amino_acid_letters:
        transposed[x] = {}
        for y in canonical_amino_acid_letters:
            transposed[x][y] = d[y][x]
    return transposed
Helix 66 | strand_vs_helix_dict = transpose_interaction_dict(helix_vs_strand_dict) 67 | strand_vs_helix_array = dict_to_amino_acid_matrix(strand_vs_helix_dict) 68 | 69 | with open(join(MATRIX_DIR, 'helix_vs_coil.txt'), 'r') as f: 70 | # Helix vs. Coil 71 | helix_vs_coil_dict = parse_interaction_table(f.read()) 72 | helix_vs_coil_array = dict_to_amino_acid_matrix(helix_vs_coil_dict) 73 | 74 | # Coil vs. Helix 75 | coil_vs_helix_dict = transpose_interaction_dict(helix_vs_coil_dict) 76 | coil_vs_helix_array = dict_to_amino_acid_matrix(coil_vs_helix_dict) 77 | -------------------------------------------------------------------------------- /pepdata/static_data.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 
__version__ = "1.2.0"


def print_version():
    """Print the package version to stdout, prefixed with the letter v."""
    version_string = f"v{__version__}"
    print(version_string)


if __name__ == "__main__":
    print_version()
readme_dir = os.path.dirname(__file__)
readme_path = os.path.join(readme_dir, 'README.md')

# Loading the README is best-effort: packaging still works without it.
# Catch Exception rather than using a bare except so that KeyboardInterrupt
# and SystemExit are not swallowed.
try:
    with open(readme_path, 'r') as f:
        readme_markdown = f.read()
except Exception:
    print("Failed to load README file")
    readme_markdown = ""

# Conversion to reStructuredText is also best-effort; it needs the optional
# pypandoc package, so fall back to the raw markdown on any failure.
try:
    import pypandoc
    # prefer convert_text (the supported API for in-memory strings) and fall
    # back to the legacy convert function on old pypandoc versions
    convert = getattr(pypandoc, 'convert_text', None) or pypandoc.convert
    readme_restructured = convert(readme_markdown, to='rst', format='md')
except Exception:
    readme_restructured = readme_markdown
    print("Conversion of long_description from markdown to reStructuredText failed, skipping...")

# Read the version string from the package source, located relative to this
# file so that setup.py does not depend on the current working directory.
with open(os.path.join(readme_dir, 'pepdata', '__init__.py'), 'r') as f:
    version_match = re.search(
        r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]',
        f.read(),
        re.MULTILINE)
if version_match is None:
    raise RuntimeError("Unable to find __version__ in pepdata/__init__.py")
version = version_match.group(1)

if __name__ == '__main__':
    setup(
        name='pepdata',
        version=version,
        description="Immunological peptide datasets and amino acid properties",
        author="Alex Rubinsteyn",
        author_email="alex.rubinsteyn@mssm.edu",
        url="https://github.com/openvax/pepdata",
        license="http://www.apache.org/licenses/LICENSE-2.0.html",
        classifiers=[
            'Development Status :: 3 - Alpha',
            'Environment :: Console',
            'Operating System :: OS Independent',
            'Intended Audience :: Science/Research',
            'License :: OSI Approved :: Apache Software License',
            'Programming Language :: Python',
            'Topic :: Scientific/Engineering :: Bio-Informatics',
        ],
        install_requires=[
            'numpy>=1.7',
            'scipy>=0.9',
            'pandas>=0.17',
            'scikit-learn>=0.14.1',
            'progressbar33',
            'biopython>=1.65',
            'datacache>=0.4.4',
            'lxml',
        ],
        long_description=readme_restructured,
        # exclude expects a sequence of package name patterns; a bare string
        # would be treated as a sequence of single-character names
        packages=find_packages(
            exclude=["test", "test.*", "tests", "tests.*"]),
        include_package_data=True
    )
def _check_allele(name, expected_mhc_class, expected_locus):
    """
    Look up an allele by name in the IEDB allele dictionary and check its
    MHC class and locus.
    """
    allele = iedb.alleles.load_alleles_dict()[name]
    eq_(allele.mhc_class, expected_mhc_class)
    eq_(allele.locus, expected_locus)


def test_iedb_human_class1_allele():
    _check_allele("HLA-C*07:02", "I", "C")


def test_iedb_human_class2_allele():
    _check_allele("HLA-DRA*01:01/DRB1*04:04", "II", "DR")


def test_iedb_mouse_class1_allele():
    _check_allele("H-2-Ds", "I", "D")


def test_iedb_mouse_class2_allele():
    _check_allele("H-2-IAq", "II", "IA")
def test_mhc_hla_a2():
    """
    IEDB MHC: Test that HLA restriction actually decreases number of results and
    that regular expression patterns are being used correctly
    """
    # row counts for: no restriction, each A2 spelling, and both combined
    n_all = len(iedb.mhc.load_dataframe(nrows=1000))
    n_a2_1 = len(iedb.mhc.load_dataframe(hla='HLA-A2', nrows=1000))
    n_a2_2 = len(iedb.mhc.load_dataframe(hla=r'HLA-A\*02', nrows=1000))
    n_a2_combined = len(
        iedb.mhc.load_dataframe(hla=r'HLA-A2|HLA-A\*02', nrows=1000))

    assert n_a2_1 < n_all
    assert n_a2_2 < n_all
    assert n_a2_combined <= n_a2_1 + n_a2_2, \
        "Expected %d <= %d + %d" % (n_a2_combined, n_a2_1, n_a2_2)
def test_tcell_hla_restrict_a24():
    """
    IEDB T-cell: Test that HLA restriction actually decreases
    number of results and that regular expression patterns
    are being used correctly
    """
    df_all = iedb.tcell.load_dataframe(nrows=1000)
    df_a24_1 = iedb.tcell.load_dataframe(hla='HLA-A24', nrows=1000)
    df_a24_2 = iedb.tcell.load_dataframe(hla=r'HLA-A\*24', nrows=1000)
    df_a24_combined = \
        iedb.tcell.load_dataframe(hla=r'HLA-A24|HLA-A\*24', nrows=1000)
    assert len(df_a24_1) < len(df_all)
    assert len(df_a24_2) < len(df_all)
    assert len(df_a24_combined) <= \
        len(df_a24_1) + len(df_a24_2), \
        "Expected %d <= %d + %d" % \
        (len(df_a24_combined), len(df_a24_1), len(df_a24_2))


def test_tcell_hla_exclude_a0201():
    """
    Test that excluding HLA allele A*02:01
    actually returns a DataFrame not containing
    that allele
    """
    df_all = iedb.tcell.load_dataframe(nrows=1000)
    assert (df_all['MHC']['Allele Name'] == "HLA-A*02:01").any()

    # raw string: "\*" is not a valid escape sequence in a normal string
    # literal, so the non-raw form triggers an invalid-escape warning
    df_exclude = iedb.tcell.load_dataframe(
        nrows=1000,
        exclude_hla=r"HLA-A\*02:01")

    n_A0201_entries = (df_exclude['MHC']['Allele Name'] == "HLA-A*02:01").sum()
    assert n_A0201_entries == 0, \
        ("Not supposed to contain HLA-A*02:01, "
         " but found %d rows of that allele") % n_A0201_entries
-------------------------------------------------------------------------------- 1 | # Copyright (c) 2014. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import print_function, division, absolute_import 16 | from six.moves import cPickle 17 | 18 | from pepdata import PeptideVectorizer 19 | 20 | # isoforms of two different proteins a, b 21 | 22 | a1 = ( 23 | "MSPHPTALLGLVLCLAQTIHTQEEDLPRPSISAEPGTVIPLGSHVTFVCRGPVGVQTFRLERESRSTYND" 24 | "TEDVSQASPSESEARFRIDSVSEGNAGPYRCIYYKPPKWSEQSDYLELLVKETSGGPDSPDTEPGSSAGPT" 25 | "QRPSDNSHNEHAPASQGLKAEHLYILIGVSVVFLFCLLLLVLFCLHRQNQIKQGPPRSKDEEQKPQQRPDL" 26 | "AVDVLERTADKATVNGLPEKDRETDTSALAAGSSQEVTYAQLDHWALTQRTARAVSPQSTKPMAESITYAA" 27 | "VARH" 28 | ) 29 | 30 | a2 = ( 31 | "MSLMVVSMACVGFFLLQGAWPHEGVHRKPSLLAHPGPLVKSEETVILQCWSDVRFEHFLLHREGKYKDTLH" 32 | "LIGEHHDGVSKANFSIGPMMQDLAGTYRCYGSVTHSPYQLSAPSDPLDIVITGLYEKPSLSAQPGPTVLAG" 33 | "ESVTLSCSSRSSYDMYHLSREGEAHERRFSAGPKVNGTFQADFPLGPATHGGTYRCFGSFRDSPYEWSNSS" 34 | "DPLLVSVTGNPSNSWPSPTEPSSKTGNPRHLHVLIGTSVVKIPFTILLFFLLHRWCSNKKNAAVMDQEPAG" 35 | "NRTVNSEDSDEQDHQEVSYA" 36 | ) 37 | 38 | a3 = ( 39 | "MSLMVVSMACVGFFLLEGPWPHVGGQDKPFLSAWPGTVVSEGQHVTLQCRSRLGFNEFSLSKEDGMPVPEL" 40 | "YNRIFRNSFLMGPVTPAHAGTYRCCSSHPHSPTGWSAPSNPVVIMVTGVHRKPSLLAHPGPLVKSEETVIL" 41 | "QCWSDVRFEHFLLHREGKYKDTLHLIGEHHDGVSKANFSIGPMMQDLAGTYRCYGSVTHSPYQLSAPSDPL" 42 | "DIVITGLYEKPSLSAQPGPTVLAGESVTLSCSSRSSYDMYHLSREGEAHERRFSAGPKVNGTFQADFPLGP" 43 | 
"ATHGGTYRCFGSFRDSPYEWSNSSDPLLVSVTAFLSVKSSGHKYIY" 44 | ) 45 | 46 | A = [a1, a2, a3] 47 | 48 | b1 = ( 49 | "MPKGRAGSLPTTSIGWRFQLWFLGLTCPERHLARRLKNNSFYPFVQQEPNVFVLEYYLDTLWKGMLLFII" 50 | "SVVLVSFSSLREVQKQETWVFLVYGVGVGLWLVISSLPRRRLVLNHTRGVYHFSIQGRTVCQGPLHLVYV" 51 | "RLALSSDAHGRCFFHLVLGGHRLEPLVLVQLSEHYEQMEYLGRYIARKLNINYFDYLATSYRHVVRHWPP" 52 | "PGAGTVMGKSPMGHKPSSSQSSLEV" 53 | ) 54 | 55 | b2 = ( 56 | "MPKGRAGSLPTTSIGWRFQLWFLGLTCPERHLARRLKNNSFYPFVQQEPNVFVLEYYLDTLWKGMLLFII" 57 | "SVVLVSFSSLREVQKQETWVFLVYGVGVGLWLVISSLPRRRLVLNHTRGVYHFSIQGRTVCQGPLHLVYV" 58 | "RLALSSDAHGRCFFHLVLGGHRLEPLVLVQLSEHYEQMEYLGRYIARKLNINYFDYLATSYRHVVRHWPPP" 59 | "GAGTVMGKSPMGHKPSSSQSSLEV" 60 | ) 61 | 62 | B = [b1, b2] 63 | -------------------------------------------------------------------------------- /tests/test_pmbec.py: -------------------------------------------------------------------------------- 1 | from pepdata.pmbec import ( 2 | pmbec_dict, 3 | pmbec_matrix, 4 | ) 5 | 6 | def test_pmbec(): 7 | pass 8 | --------------------------------------------------------------------------------