├── .gitignore
├── .travis.yml
├── LICENSE.txt
├── MANIFEST.in
├── README.md
├── deploy.sh
├── develop.sh
├── lint.sh
├── pepdata
├── __init__.py
├── amino_acid.py
├── amino_acid_alphabet.py
├── amino_acid_properties.py
├── blosum.py
├── chou_fasman.py
├── common.py
├── iedb
│ ├── __init__.py
│ ├── alleles.py
│ ├── columns.py
│ ├── common.py
│ ├── memoize.py
│ ├── mhc.py
│ └── tcell.py
├── matrices
│ ├── BLOSUM30
│ ├── BLOSUM50
│ ├── BLOSUM62
│ ├── __init__.py
│ ├── amino_acid_properties.txt
│ ├── helix_vs_coil.txt
│ ├── helix_vs_strand.txt
│ ├── pmbec.mat
│ └── strand_vs_coil.txt
├── peptide_vectorizer.py
├── pmbec.py
├── reduced_alphabet.py
├── residue_contact_energies.py
├── static_data.py
└── version.py
├── pylintrc
├── requirements.txt
├── setup.py
├── test.sh
└── tests
├── test_amino_acids.py
├── test_blosum.py
├── test_iedb_alleles.py
├── test_iedb_mhc.py
├── test_iedb_tcell.py
├── test_ngram.py
└── test_pmbec.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.py[cod]
2 |
3 | # C extensions
4 | *.so
5 |
6 | # Packages
7 | *.egg
8 | *.egg-info
9 | dist
10 | build
11 | eggs
12 | parts
13 | bin
14 | var
15 | sdist
16 | develop-eggs
17 | .installed.cfg
18 | lib
19 | lib64
20 | __pycache__
21 |
22 | # Installer logs
23 | pip-log.txt
24 |
25 | # Unit test / coverage reports
26 | .coverage
27 | .tox
28 | nosetests.xml
29 |
30 | # Translations
31 | *.mo
32 |
33 | # Mr Developer
34 | .mr.developer.cfg
35 | .project
36 | .pydevproject
37 |
38 | *.csv
39 | *.fa
40 | *.faa
41 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | sudo: false # Use container-based infrastructure
2 | language: python
3 | python:
4 | - "2.7"
5 | - "3.6"
6 | git:
7 | # don't need the default depth of 50
8 | # but don't want to use a depth of 1 since that affects
9 | # whether jobs run when you have multiple commits queued
10 | # https://github.com/travis-ci/travis-ci/issues/4575
11 | depth: 10
12 | cache:
13 | pip: true
14 | before_install:
15 | # download different versions of mini-conda for py2 vs. py3
16 | - |
17 | if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then
18 | wget https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh;
19 | else
20 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh;
21 | fi
22 | - bash miniconda.sh -b -p $HOME/miniconda
23 | - export PATH="$HOME/miniconda/bin:$PATH"
24 | # reset the shell's lookup table for program name to path mappings
25 | - hash -r
26 | - conda config --set always_yes yes --set changeps1 no
27 | - conda update -q conda
28 | # Useful for debugging any issues with conda
29 | - conda info -a
30 | - python --version
31 | install:
32 | - >
33 | conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION
34 | numpy scipy nose pandas pandoc
35 | - source activate test-environment
36 | - pip install pypandoc pylint
37 | - pip install -r requirements.txt
38 | - pip install .
39 | - pip install coveralls
40 | script:
41 | - ./lint.sh
42 | - nosetests test --with-coverage --cover-package=pepdata
43 | after_success:
44 | coveralls
45 | deploy:
46 | provider: pypi
47 | user: openvax
48 | distributions: sdist
49 | password:
50 | secure: "adaJvYZ6lDNqhf4jwrI3tsNVymL54yfKl8ymQPUYaL2yK75MaTurfoqqHEt31FXiZNUvwOz+o0i9GHoGRhoVlHKNoe/bN6f69qkNZNW/YC4b061/kPOdzpdpwFzrxXE9Zr6KPsbnGNGcJXzga9rd7XTh8Y34VDylylb5bhYmTC0="
51 | on:
52 | branch: master
53 | condition: $TRAVIS_PYTHON_VERSION = "2.7"
54 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include pepdata/matrices *
2 | recursive-include pepdata/data *csv
3 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 | PepData
12 | =======
13 |
14 | PepData was formerly a repository for diverse peptide datasets; it now only contains the [Immune Epitope Database](http://www.iedb.org) and a variety of amino acid property matrices. This package
15 | will probably be split eventually, with the IEDB portions moved into a package
16 | named `pyiedb`.
17 |
18 | **Amino Acid Properties**
19 |
20 | The `amino_acid` module contains a variety of physical/chemical properties for both single amino acid residues and interactions between pairs of residues.
21 |
22 | Single residue feature tables are parsed into `StringTransformer` objects, which can be treated as dictionaries or will vectorize a string when you call their method `transform_string`.
23 |
24 | Examples of single residue features:
25 | - `hydropathy`
26 | - `volume`
27 | - `polarity`
28 | - `pK_side_chain`
29 | - `prct_exposed_residues`
30 | - `hydrophilicity`
31 | - `accessible_surface_area`
32 | - `refractivity`
33 | - `local_flexibility`
34 | - `accessible_surface_area_folded`
35 | - `alpha_helix_score` (Chou-Fasman)
36 | - `beta_sheet_score` (Chou-Fasman)
37 | - `turn_score` (Chou-Fasman)
38 |
39 | Pairwise interaction tables are parsed into nested dictionaries, so that the interaction between amino acids `x` and `y` can be determined from `d[x][y]`.
40 |
41 | Pairwise interaction dictionaries:
42 | - `strand_vs_coil` (and its transpose `coil_vs_strand`)
43 | - `helix_vs_strand` (and its transpose `strand_vs_helix`)
44 | - `helix_vs_coil` (and its transpose `coil_vs_helix`)
45 | - `blosum30`
46 | - `blosum50`
47 | - `blosum62`
48 |
49 | There is also a function to parse the coefficients of the [PMBEC similarity matrix](http://www.biomedcentral.com/1471-2105/10/394), though this currently lives in the separate `pmbec` module.
50 |
51 |
52 |
--------------------------------------------------------------------------------
/deploy.sh:
--------------------------------------------------------------------------------
1 | ./lint.sh && \
2 | ./test.sh && \
3 | python3 -m pip install --upgrade build && \
4 | python3 -m pip install --upgrade twine && \
5 | rm -rf dist && \
6 | python3 -m build && \
7 | git --version && \
8 | python3 -m twine upload dist/* && \
9 | git tag "$(python3 pepdata/version.py)" && \
10 | git push --tags
--------------------------------------------------------------------------------
/develop.sh:
--------------------------------------------------------------------------------
1 | set -e
2 |
3 | pip install -e .
4 |
--------------------------------------------------------------------------------
/lint.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Run pylint in errors-only mode over the package and its unit tests.
set -o errexit
# Without pipefail only the exit status of the last command in the pipeline
# (xargs/pylint) is checked, so a failing `find` (e.g. a missing directory)
# would go unnoticed.
set -o pipefail

# The unit tests live in `tests/` (not `test/`).
find pepdata tests -name '*.py' \
    | xargs pylint \
    --errors-only \
    --disable=print-statement

echo 'Passes pylint check'
10 |
--------------------------------------------------------------------------------
/pepdata/__init__.py:
--------------------------------------------------------------------------------
1 | from .amino_acid_alphabet import (
2 | AminoAcid,
3 | canonical_amino_acids,
4 | canonical_amino_acid_letters,
5 | extended_amino_acids,
6 | extended_amino_acid_letters,
7 | amino_acid_letter_indices,
8 | amino_acid_name_indices,
9 | )
10 | from .peptide_vectorizer import PeptideVectorizer
11 | from .version import __version__
12 | from . import iedb
13 |
14 |
15 |
16 | __all__ = [
17 | "iedb",
18 | "AminoAcid",
19 | "canonical_amino_acids",
20 | "canonical_amino_acid_letters",
21 | "extended_amino_acids",
22 | "extended_amino_acid_letters",
23 | "amino_acid_letter_indices",
24 | "amino_acid_name_indices",
25 | "PeptideVectorizer",
26 | ]
27 |
--------------------------------------------------------------------------------
/pepdata/amino_acid.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 |
14 | from __future__ import print_function, division, absolute_import
15 |
class AminoAcid(object):
    """
    A single amino acid (or a token standing for several amino acids).

    Parameters
    ----------
    full_name : str
        Full name, e.g. "Alanine"

    short_name : str
        Abbreviated name, e.g. "Ala"

    letter : str
        Single-letter code, e.g. "A"

    contains : collection of str, optional
        Letters this token stands for. When omitted (or empty) it defaults
        to a list containing only `letter`.

    Two AminoAcid objects compare equal when both are exactly of class
    AminoAcid and have the same letter; the hash is derived from the letter
    so that equal objects hash equally.
    """

    def __init__(
            self, full_name, short_name, letter, contains=None):
        self.letter = letter
        self.full_name = full_name
        self.short_name = short_name
        if not contains:
            contains = [letter]
        self.contains = contains

    def __str__(self):
        # The argument order must match the field names in the format string.
        return (
            ("AminoAcid(full_name='%s', short_name='%s', letter='%s', "
             "contains=%s)") % (
                self.full_name, self.short_name, self.letter, self.contains))

    def __repr__(self):
        return str(self)

    def __eq__(self, other):
        return other.__class__ is AminoAcid and self.letter == other.letter

    def __ne__(self, other):
        # Python 2 does not derive != from ==
        return not self.__eq__(other)

    def __hash__(self):
        # Defining __eq__ alone makes instances unhashable on Python 3;
        # hash on the same field that equality uses.
        return hash(self.letter)
37 |
--------------------------------------------------------------------------------
/pepdata/amino_acid_alphabet.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 |
14 | """
15 | Quantify amino acids by their physical/chemical properties
16 | """
17 |
18 | from __future__ import print_function, division, absolute_import
19 |
20 | import numpy as np
21 |
22 | from .amino_acid import AminoAcid
23 |
# The 20 canonical amino acids as (full name, short name, letter) records.
# Order matters: this list is the prefix of `extended_amino_acids`, so a
# letter's position here is also its value in `amino_acid_letter_indices`,
# and it is the default row/column order of `dict_to_amino_acid_matrix`.
canonical_amino_acids = [
    AminoAcid("Alanine", "Ala", "A"),
    AminoAcid("Arginine", "Arg", "R"),
    AminoAcid("Asparagine","Asn", "N"),
    AminoAcid("Aspartic Acid", "Asp", "D"),
    AminoAcid("Cysteine", "Cys", "C"),
    AminoAcid("Glutamic Acid", "Glu", "E"),
    AminoAcid("Glutamine", "Gln", "Q"),
    AminoAcid("Glycine", "Gly", "G"),
    AminoAcid("Histidine", "His", "H"),
    AminoAcid("Isoleucine", "Ile", "I"),
    AminoAcid("Leucine", "Leu", "L"),
    AminoAcid("Lysine", "Lys", "K"),
    AminoAcid("Methionine", "Met", "M"),
    AminoAcid("Phenylalanine", "Phe", "F"),
    AminoAcid("Proline", "Pro", "P"),
    AminoAcid("Serine", "Ser", "S"),
    AminoAcid("Threonine", "Thr", "T"),
    AminoAcid("Tryptophan", "Trp", "W"),
    AminoAcid("Tyrosine", "Tyr", "Y"),
    AminoAcid("Valine", "Val", "V")
]

# Single-letter codes, in the same order as `canonical_amino_acids`
canonical_amino_acid_letters = [aa.letter for aa in canonical_amino_acids]
48 |
###
# Post-translational modifications commonly detected by mass-spec
###

# TODO: figure out three letter codes for modified AAs

# Modified residues use lower-case letters; "???" marks a short name that is
# still unknown (see TODO above). This list is not part of
# `extended_amino_acids`, so these letters have no position index.
# NOTE(review): "Pyroglutamate" and "Pyroglutamic acid" appear as two
# separate entries ("q" and "n") -- confirm which modifications are intended.
modified_amino_acids = [
    AminoAcid("Phospho-Serine", "Sep", "s"),
    AminoAcid("Phospho-Threonine", "???", "t"),
    AminoAcid("Phospho-Tyrosine", "???", "y"),
    AminoAcid("Cystine", "???", "c"),
    AminoAcid("Methionine sulfoxide", "???", "m"),
    AminoAcid("Pyroglutamate", "???", "q"),
    AminoAcid("Pyroglutamic acid", "???", "n"),
]

###
# Amino acid tokens which represent multiple canonical amino acids
###
# `contains` holds the set of canonical letters each token may stand for.
wildcard_amino_acids = [
    AminoAcid("Unknown", "Xaa", "X", contains=set(canonical_amino_acid_letters)),
    AminoAcid("Asparagine-or-Aspartic-Acid", "Asx", "B", contains={"D", "N"}),
    AminoAcid("Glutamine-or-Glutamic-Acid", "Glx", "Z", contains={"E", "Q"}),
    AminoAcid("Leucine-or-Isoleucine", "Xle", "J", contains={"I", "L"})
]

###
# Canonical amino acids + wildcard tokens
###

canonical_amino_acids_with_unknown = canonical_amino_acids + wildcard_amino_acids


###
# Rare amino acids which aren't considered part of the core 20 "canonical"
###

rare_amino_acids = [
    AminoAcid("Selenocysteine", "Sec", "U"),
    AminoAcid("Pyrrolysine", "Pyl", "O"),
]
90 |
###
# Extended alphabet: canonical + rare amino acids + wildcard tokens
###

extended_amino_acids = (
    canonical_amino_acids + rare_amino_acids + wildcard_amino_acids
)

# Letters and full names, both in the same order as `extended_amino_acids`
extended_amino_acid_letters = [aa.letter for aa in extended_amino_acids]
extended_amino_acids_with_unknown_names = [
    aa.full_name for aa in extended_amino_acids]


# Maps each letter to its position in `extended_amino_acid_letters`
amino_acid_letter_indices = dict(
    (letter, position)
    for (position, letter) in enumerate(extended_amino_acid_letters)
)
108 |
109 |
# All two-letter strings over the extended alphabet. The first letter (x)
# varies fastest, the second letter (y) slowest.
# Iterate over the *letters*: formatting AminoAcid objects with %s would
# concatenate their full string representations instead of producing
# two-character pairs such as "AR".
amino_acid_letter_pairs = [
    "%s%s" % (x, y)
    for y in extended_amino_acid_letters
    for x in extended_amino_acid_letters
]
115 |
116 |
# Maps each full amino acid name to its position in the extended alphabet
amino_acid_name_indices = dict(
    (full_name, position)
    for (position, full_name)
    in enumerate(extended_amino_acids_with_unknown_names)
)

# Maps each entry of `amino_acid_letter_pairs` to its position in that list
amino_acid_pair_positions = dict(
    (letter_pair, position)
    for (position, letter_pair) in enumerate(amino_acid_letter_pairs)
)
125 |
def index_to_full_name(idx):
    """
    Look up the full name of the amino acid stored at position `idx`
    of `extended_amino_acids`.
    """
    amino_acid = extended_amino_acids[idx]
    return amino_acid.full_name
128 |
def index_to_short_name(idx):
    """
    Look up the short (abbreviated) name of the amino acid stored at
    position `idx` of `extended_amino_acids`.
    """
    amino_acid = extended_amino_acids[idx]
    return amino_acid.short_name
131 |
def index_to_letter(idx):
    """
    Convert from a position index in `extended_amino_acids` to that amino
    acid's single-letter code.

    Returns the `letter` attribute (not the AminoAcid object itself), in
    line with `index_to_full_name` and `index_to_short_name`.
    """
    return extended_amino_acids[idx].letter
134 |
def letter_to_index(x):
    """
    Map an amino acid letter code to its position index, as stored in
    `amino_acid_letter_indices`.

    Fails an assertion with "Unknown amino acid: <x>" for letters which
    are not in that table.
    """
    is_known = x in amino_acid_letter_indices
    assert is_known, "Unknown amino acid: %s" % x
    return amino_acid_letter_indices[x]
141 |
def peptide_to_indices(xs):
    """
    Translate each letter of `xs` into its position index via
    `amino_acid_letter_indices`; unknown letters raise KeyError.
    """
    indices = []
    for residue in xs:
        indices.append(amino_acid_letter_indices[residue])
    return indices
144 |
def letter_to_short_name(x):
    """
    Convert an amino acid letter code to its short name by first
    resolving the letter's position index.
    """
    position = letter_to_index(x)
    return index_to_short_name(position)
147 |
def peptide_to_short_amino_acid_names(xs):
    """
    Convert each letter of the peptide `xs` into the short name of the
    corresponding amino acid (e.g. "A" -> "Ala").

    Returns a list of short names, one per letter. Unknown letters raise
    KeyError.
    """
    # Look up the index first, then the record's short name; returning the
    # bare index would make this a duplicate of `peptide_to_indices`.
    return [
        extended_amino_acids[amino_acid_letter_indices[x]].short_name
        for x in xs
    ]
150 |
def dict_to_amino_acid_matrix(d, alphabet=canonical_amino_acids):
    """
    Convert a nested dictionary of pairwise values into a square matrix.

    Parameters
    ----------
    d : dict
        Nested mapping such that d[x][y] is the value for the letters x, y.
        It may contain extra keys outside of `alphabet`; those are ignored.

    alphabet : sequence of AminoAcid
        Determines the row/column order of the result (via each element's
        `letter` attribute).

    Returns a float32 array of shape (len(alphabet), len(alphabet)).
    Raises KeyError if a letter of the alphabet is missing from `d`.
    """
    letters = [aa.letter for aa in alphabet]
    # Size the matrix by the alphabet, not by len(d): a dictionary with
    # extra keys would otherwise leave trailing all-zero rows and columns.
    n_aa = len(letters)
    result_matrix = np.zeros((n_aa, n_aa), dtype="float32")
    for i, row_letter in enumerate(letters):
        d_row = d[row_letter]
        for j, col_letter in enumerate(letters):
            result_matrix[i, j] = d_row[col_letter]
    return result_matrix
160 |
161 |
--------------------------------------------------------------------------------
/pepdata/amino_acid_properties.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | from __future__ import print_function, division, absolute_import
14 |
15 | from .amino_acid_alphabet import letter_to_index
16 |
17 | """
18 | Quantify amino acids by their physical/chemical properties
19 | """
20 |
21 |
def aa_dict_to_positional_list(aa_property_dict):
    """
    Arrange the values of a letter -> value dictionary into a 20 element
    list, placing each value at the position given by `letter_to_index`.

    Asserts that every index falls within [0, 20) and that none of the
    20 slots is left unfilled.
    """
    n_slots = 20
    ordered_values = [None] * n_slots
    for aa_letter, aa_value in aa_property_dict.items():
        position = letter_to_index(aa_letter)
        assert position >= 0
        assert position < n_slots
        ordered_values[position] = aa_value
    has_gap = any(entry is None for entry in ordered_values)
    assert not has_gap, \
        "Missing amino acids in:\n%s" % aa_property_dict.keys()
    return ordered_values
32 |
def parse_property_table(table_string):
    """
    Parse a table whose non-blank lines look like "<value> <letter> ..."
    into a dictionary mapping each letter to its value as a float.

    Fields are separated by one or more spaces and anything after the
    second field is ignored. Asserts that every non-blank line has at
    least two fields and that no letter occurs twice.
    """
    parsed = {}
    for raw_line in table_string.splitlines():
        row = raw_line.strip()
        if row:
            tokens = [token for token in row.split(" ") if token.strip()]
            assert len(tokens) >= 2
            value, letter = tokens[:2]
            assert letter not in parsed, "Repeated amino acid " + row
            parsed[letter] = float(value)
    return parsed
46 |
47 |
48 | """
49 | Amino acids property tables copied from CRASP website
50 | """
51 |
52 | hydropathy = parse_property_table("""
53 | 1.80000 A ALA
54 | -4.5000 R ARG
55 | -3.5000 N ASN
56 | -3.5000 D ASP
57 | 2.50000 C CYS
58 | -3.5000 Q GLN
59 | -3.5000 E GLU
60 | -0.4000 G GLY
61 | -3.2000 H HIS
62 | 4.50000 I ILE
63 | 3.80000 L LEU
64 | -3.9000 K LYS
65 | 1.90000 M MET
66 | 2.80000 F PHE
67 | -1.6000 P PRO
68 | -0.8000 S SER
69 | -0.7000 T THR
70 | -0.9000 W TRP
71 | -1.3000 Y TYR
72 | 4.20000 V VAL
73 | """)
74 |
75 | volume = parse_property_table("""
76 | 91.5000 A ALA
77 | 202.0000 R ARG
78 | 135.2000 N ASN
79 | 124.5000 D ASP
80 | 118.0000 C CYS
81 | 161.1000 Q GLN
82 | 155.1000 E GLU
83 | 66.40000 G GLY
84 | 167.3000 H HIS
85 | 168.8000 I ILE
86 | 167.9000 L LEU
87 | 171.3000 K LYS
88 | 170.8000 M MET
89 | 203.4000 F PHE
90 | 129.3000 P PRO
91 | 99.10000 S SER
92 | 122.1000 T THR
93 | 237.6000 W TRP
94 | 203.6000 Y TYR
95 | 141.7000 V VAL
96 | """)
97 |
98 | polarity = parse_property_table("""
99 | 0.0000 A ALA
100 | 52.000 R ARG
101 | 3.3800 N ASN
102 | 40.700 D ASP
103 | 1.4800 C CYS
104 | 3.5300 Q GLN
105 | 49.910 E GLU
106 | 0.0000 G GLY
107 | 51.600 H HIS
108 | 0.1500 I ILE
109 | 0.4500 L LEU
110 | 49.500 K LYS
111 | 1.4300 M MET
112 | 0.3500 F PHE
113 | 1.5800 P PRO
114 | 1.6700 S SER
115 | 1.6600 T THR
116 | 2.1000 W TRP
117 | 1.6100 Y TYR
118 | 0.1300 V VAL
119 | """)
120 |
121 | pK_side_chain = parse_property_table("""
122 | 0.0000 A ALA
123 | 12.480 R ARG
124 | 0.0000 N ASN
125 | 3.6500 D ASP
126 | 8.1800 C CYS
127 | 0.0000 Q GLN
128 | 4.2500 E GLU
129 | 0.0000 G GLY
130 | 6.0000 H HIS
131 | 0.0000 I ILE
132 | 0.0000 L LEU
133 | 10.530 K LYS
134 | 0.0000 M MET
135 | 0.0000 F PHE
136 | 0.0000 P PRO
137 | 0.0000 S SER
138 | 0.0000 T THR
139 | 0.0000 W TRP
140 | 10.700 Y TYR
141 | 0.0000 V VAL
142 | """)
143 |
144 | prct_exposed_residues = parse_property_table("""
145 | 15.0000 A ALA
146 | 67.0000 R ARG
147 | 49.0000 N ASN
148 | 50.0000 D ASP
149 | 5.00000 C CYS
150 | 56.0000 Q GLN
151 | 55.0000 E GLU
152 | 10.0000 G GLY
153 | 34.0000 H HIS
154 | 13.0000 I ILE
155 | 16.0000 L LEU
156 | 85.0000 K LYS
157 | 20.0000 M MET
158 | 10.0000 F PHE
159 | 45.0000 P PRO
160 | 32.0000 S SER
161 | 32.0000 T THR
162 | 17.0000 W TRP
163 | 41.0000 Y TYR
164 | 14.0000 V VAL
165 | """)
166 |
167 | hydrophilicity = parse_property_table("""
168 | -0.5000 A ALA
169 | 3.00000 R ARG
170 | 0.20000 N ASN
171 | 3.00000 D ASP
172 | -1.0000 C CYS
173 | 0.20000 Q GLN
174 | 3.00000 E GLU
175 | 0.00000 G GLY
176 | -0.5000 H HIS
177 | -1.8000 I ILE
178 | -1.8000 L LEU
179 | 3.00000 K LYS
180 | -1.3000 M MET
181 | -2.5000 F PHE
182 | 0.00000 P PRO
183 | 0.30000 S SER
184 | -0.4000 T THR
185 | -3.4000 W TRP
186 | -2.3000 Y TYR
187 | -1.5000 V VAL
188 | """)
189 |
190 | accessible_surface_area = parse_property_table("""
191 | 27.8000 A ALA
192 | 94.7000 R ARG
193 | 60.1000 N ASN
194 | 60.6000 D ASP
195 | 15.5000 C CYS
196 | 68.7000 Q GLN
197 | 68.2000 E GLU
198 | 24.5000 G GLY
199 | 50.7000 H HIS
200 | 22.8000 I ILE
201 | 27.6000 L LEU
202 | 103.000 K LYS
203 | 33.5000 M MET
204 | 25.5000 F PHE
205 | 51.5000 P PRO
206 | 42.0000 S SER
207 | 45.0000 T THR
208 | 34.7000 W TRP
209 | 55.2000 Y TYR
210 | 23.7000 V VAL
211 | """)
212 |
213 | local_flexibility = parse_property_table("""
214 | 705.42000 A ALA
215 | 1484.2800 R ARG
216 | 513.46010 N ASN
217 | 34.960000 D ASP
218 | 2412.5601 C CYS
219 | 1087.8300 Q GLN
220 | 1158.6600 E GLU
221 | 33.180000 G GLY
222 | 1637.1300 H HIS
223 | 5979.3701 I ILE
224 | 4985.7300 L LEU
225 | 699.69000 K LYS
226 | 4491.6602 M MET
227 | 5203.8599 F PHE
228 | 431.96000 P PRO
229 | 174.76000 S SER
230 | 601.88000 T THR
231 | 6374.0698 W TRP
232 | 4291.1001 Y TYR
233 | 4474.4199 V VAL
234 | """)
235 |
# Accessible surface area (folded state, per the variable name) for each of
# the 20 standard residues.
# Each row is "<value> <one-letter code> <three-letter code>".
# NOTE(review): units and source are not stated in this file -- confirm.
accessible_surface_area_folded = parse_property_table("""
31.5000 A ALA
93.8000 R ARG
62.2000 N ASN
60.9000 D ASP
13.9000 C CYS
74.0000 Q GLN
72.3000 E GLU
25.2000 G GLY
46.7000 H HIS
23.0000 I ILE
29.0000 L LEU
110.300 K LYS
30.5000 M MET
28.7000 F PHE
53.7000 P PRO
44.2000 S SER
46.0000 T THR
41.7000 W TRP
59.1000 Y TYR
23.5000 V VAL
""")
258 |
# Refractivity value for each of the 20 standard residues.
# Each row is "<value> <one-letter code> <three-letter code>".
# NOTE(review): units and source are not stated in this file -- confirm.
refractivity = parse_property_table("""
4.34000 A ALA
26.6600 R ARG
13.2800 N ASN
12.0000 D ASP
35.7700 C CYS
17.5600 Q GLN
17.2600 E GLU
0.00000 G GLY
21.8100 H HIS
19.0600 I ILE
18.7800 L LEU
21.2900 K LYS
21.6400 M MET
29.4000 F PHE
10.9300 P PRO
6.35000 S SER
11.0100 T THR
42.5300 W TRP
31.5300 Y TYR
13.9200 V VAL
""")
281 |
282 |
# Average residue mass (amino acid minus one water) for each of the 20
# standard residues. Each row is "<mass> <one-letter code> <three-letter code>".
#
# Three entries were corrected from apparent transcription typos so that they
# agree with the standard average residue masses:
#   ALA 70.079  -> 71.079   (C3H5NO)
#   PRO 97.177  -> 97.117   (C5H7NO)
#   TYR 163.170 -> 163.176  (C9H9NO2)
# NOTE(review): CYS and MET differ from some references in the last digit
# (103.144 vs 103.145, 131.198 vs 131.199); left as-is pending confirmation.
mass = parse_property_table("""
71.079 A ALA
156.188 R ARG
114.104 N ASN
115.089 D ASP
103.144 C CYS
128.131 Q GLN
129.116 E GLU
57.052 G GLY
137.142 H HIS
113.160 I ILE
113.160 L LEU
128.174 K LYS
131.198 M MET
147.177 F PHE
97.117 P PRO
87.078 S SER
101.105 T THR
186.213 W TRP
163.176 Y TYR
99.133 V VAL
""")
305 |
306 | ###
307 | # Values copied from:
308 | # "Solvent accessibility of AA in known protein structures"
309 | # http://prowl.rockefeller.edu/aainfo/access.htm
310 | ###
311 | """
312 | Solvent accessibility of AA in known protein structures
313 |
314 | Figure 1.
315 |
316 | S 0.70 0.20 0.10
317 | T 0.71 0.16 0.13
318 | A 0.48 0.35 0.17
319 | G 0.51 0.36 0.13
320 | P 0.78 0.13 0.09
321 | C 0.32 0.54 0.14
322 | D 0.81 0.09 0.10
323 | E 0.93 0.04 0.03
324 | Q 0.81 0.10 0.09
325 | N 0.82 0.10 0.08
326 | L 0.41 0.49 0.10
327 | I 0.39 0.47 0.14
328 | V 0.40 0.50 0.10
329 | M 0.44 0.20 0.36
330 | F 0.42 0.42 0.16
331 | Y 0.67 0.20 0.13
332 | W 0.49 0.44 0.07
333 | K 0.93 0.02 0.05
334 | R 0.84 0.05 0.11
335 | H 0.66 0.19 0.15
336 | """
337 |
# One value per residue, keyed by one-letter code. The numbers are the first
# numeric column of the solvent-accessibility table quoted above.
solvent_exposed_area = {
    "S": 0.70,
    "T": 0.71,
    "A": 0.48,
    "G": 0.51,
    "P": 0.78,
    "C": 0.32,
    "D": 0.81,
    "E": 0.93,
    "Q": 0.81,
    "N": 0.82,
    "L": 0.41,
    "I": 0.39,
    "V": 0.40,
    "M": 0.44,
    "F": 0.42,
    "Y": 0.67,
    "W": 0.49,
    "K": 0.93,
    "R": 0.84,
    "H": 0.66,
}
360 |
--------------------------------------------------------------------------------
/pepdata/blosum.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | from __future__ import print_function, division, absolute_import
14 |
15 | from os.path import join
16 |
17 | from .static_data import MATRIX_DIR
18 |
19 | from .amino_acid_alphabet import dict_to_amino_acid_matrix
20 |
def parse_blosum_table(table, coeff_type=int, key_type='row'):
    """
    Parse a table of pairwise amino acid coefficients (e.g. BLOSUM50).

    The first non-comment, non-blank line is the header listing the column
    labels; every following line is a row label followed by one coefficient
    per column.

    Parameters
    ----------
    table : str
        Full text of the matrix file. Lines starting with "#" and blank
        lines are ignored; both "\\n" and "\\r\\n" line endings are accepted.

    coeff_type : callable
        Converts each coefficient string to a number (default int).

    key_type : {'row', 'pair', 'pair_string'}
        Shape of the result:
        'row' gives ``{x: {y: coeff}}``, 'pair' gives ``{(x, y): coeff}``
        and 'pair_string' gives ``{x + y: coeff}``.

    Returns
    -------
    dict of coefficients, keyed according to `key_type`.

    Raises
    ------
    ValueError
        If `key_type` is unknown, the table is empty, the header has fewer
        than 20 labels, or a row has fewer than 20 coefficients or more
        coefficients than there are header labels.
    """
    # Validate once up front instead of asserting per coefficient: asserts
    # are stripped under `python -O`, which silently fell back to 'row'.
    if key_type not in ('row', 'pair', 'pair_string'):
        raise ValueError("Unknown key type: %s" % key_type)

    # splitlines() handles "\n" and "\r\n"; stripping lets whitespace-only
    # lines be skipped rather than being parsed as malformed rows.
    lines = [line.strip() for line in table.splitlines()]
    lines = [line for line in lines if line and not line.startswith("#")]

    if not lines:
        raise ValueError("Coefficient table is empty")

    labels = lines[0].split()

    if len(labels) < 20:
        raise ValueError(
            "Expected 20+ amino acids but first line '%s' has %d fields" % (
                lines[0],
                len(labels)))

    coeffs = {}
    for line in lines[1:]:
        fields = line.split()
        if len(fields) < 21:
            raise ValueError(
                "Expected AA and 20+ coefficients but '%s' has %d fields" % (
                    line, len(fields)))
        x = fields[0]
        values = fields[1:]
        if len(values) > len(labels):
            raise ValueError(
                "Row '%s' has %d coefficients but header has only %d labels" % (
                    line, len(values), len(labels)))
        for y, coeff_str in zip(labels, values):
            coeff = coeff_type(coeff_str)
            if key_type == 'pair':
                coeffs[(x, y)] = coeff
            elif key_type == 'pair_string':
                coeffs[x + y] = coeff
            else:
                coeffs.setdefault(x, {})[y] = coeff
    return coeffs
62 |
63 |
def _load_blosum(filename):
    """Read one matrix file from MATRIX_DIR; return (nested dict, matrix)."""
    with open(join(MATRIX_DIR, filename), 'r') as matrix_file:
        coeff_dict = parse_blosum_table(matrix_file.read())
    return coeff_dict, dict_to_amino_acid_matrix(coeff_dict)


# Each matrix is exposed both as a dict-of-dicts and in the form produced by
# dict_to_amino_acid_matrix. Files are read at import time.
blosum30_dict, blosum30_matrix = _load_blosum('BLOSUM30')
blosum50_dict, blosum50_matrix = _load_blosum('BLOSUM50')
blosum62_dict, blosum62_matrix = _load_blosum('BLOSUM62')
75 |
76 |
--------------------------------------------------------------------------------
/pepdata/chou_fasman.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | from __future__ import print_function, division, absolute_import
14 |
15 | from .amino_acid_alphabet import amino_acid_name_indices
16 |
# Chou-Fasman table of structural properties from
# http://prowl.rockefeller.edu/aainfo/chou.htm
#
# Each row is an amino acid name (one or two words) followed by numeric
# columns. parse_chou_fasman reads the first three integers as the alpha
# helix, beta sheet and turn scores; the trailing four decimal columns are
# not read by that parser.
# NOTE(review): meaning of the four decimal columns is not stated here.
chou_fasman_table = """
Alanine 142 83 66 0.06 0.076 0.035 0.058
Arginine 98 93 95 0.070 0.106 0.099 0.085
Aspartic Acid 101 54 146 0.147 0.110 0.179 0.081
Asparagine 67 89 156 0.161 0.083 0.191 0.091
Cysteine 70 119 119 0.149 0.050 0.117 0.128
Glutamic Acid 151 037 74 0.056 0.060 0.077 0.064
Glutamine 111 110 98 0.074 0.098 0.037 0.098
Glycine 57 75 156 0.102 0.085 0.190 0.152
Histidine 100 87 95 0.140 0.047 0.093 0.054
Isoleucine 108 160 47 0.043 0.034 0.013 0.056
Leucine 121 130 59 0.061 0.025 0.036 0.070
Lysine 114 74 101 0.055 0.115 0.072 0.095
Methionine 145 105 60 0.068 0.082 0.014 0.055
Phenylalanine 113 138 60 0.059 0.041 0.065 0.065
Proline 57 55 152 0.102 0.301 0.034 0.068
Serine 77 75 143 0.120 0.139 0.125 0.106
Threonine 83 119 96 0.086 0.108 0.065 0.079
Tryptophan 108 137 96 0.077 0.013 0.064 0.167
Tyrosine 69 147 114 0.082 0.065 0.114 0.125
Valine 106 170 50 0.062 0.048 0.028 0.053
"""
41 |
42 |
def parse_chou_fasman(table):
    """
    Split a Chou-Fasman text table into three per-residue score dictionaries.

    Every non-blank line must start with an amino acid name (optionally
    followed by the word "Acid") and then at least three integers: the
    alpha helix, beta sheet and turn scores. Names are translated through
    `amino_acid_name_indices` to obtain the dictionary keys.

    Returns a tuple (alpha_helix_scores, beta_sheet_scores, turn_scores).
    Asserts that every name is known and that each result has 20 entries.
    """
    helix_scores, sheet_scores, turn_scores = {}, {}, {}

    for row in table.split("\n"):
        tokens = [tok for tok in row.split(" ") if tok.strip()]
        if not tokens:
            continue

        # "Aspartic Acid" / "Glutamic Acid" span two tokens; fold them into
        # one name and shift so the scores again start at index 1.
        if tokens[1] == 'Acid':
            name = " ".join(tokens[:2])
            tokens = tokens[1:]
        else:
            name = tokens[0]

        assert name in amino_acid_name_indices, "Invalid amino acid name %s" % name
        key = amino_acid_name_indices[name]
        alpha, beta, turn = [int(tokens[i]) for i in (1, 2, 3)]
        helix_scores[key] = alpha
        sheet_scores[key] = beta
        turn_scores[key] = turn

    for scores in (helix_scores, sheet_scores, turn_scores):
        assert len(scores) == 20
    return helix_scores, sheet_scores, turn_scores
72 |
# Parse the embedded table once at import time and expose the three score
# dictionaries as module-level names.
alpha_helix_score, beta_sheet_score, turn_score = \
    parse_chou_fasman(chou_fasman_table)
75 |
--------------------------------------------------------------------------------
/pepdata/common.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2014-2016. Mount Sinai School of Medicine
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | from __future__ import print_function, division, absolute_import
17 |
18 | import numpy as np
19 |
def transform_peptide(peptide, property_dict):
    """Return a NumPy array holding `property_dict[aa]` for each residue.

    Raises KeyError if a residue of `peptide` is missing from
    `property_dict`.
    """
    values = []
    for residue in peptide:
        values.append(property_dict[residue])
    return np.array(values)
22 |
def transform_peptides(peptides, property_dict):
    """Return a NumPy array with one row of property values per peptide.

    Each residue is looked up in `property_dict`; a missing residue raises
    KeyError.
    """
    rows = []
    for peptide in peptides:
        rows.append([property_dict[residue] for residue in peptide])
    return np.array(rows)
27 |
28 |
--------------------------------------------------------------------------------
/pepdata/iedb/__init__.py:
--------------------------------------------------------------------------------
1 | from . import (
2 | alleles,
3 | mhc,
4 | tcell
5 | )
6 |
7 | __all__ = [
8 | "alleles",
9 | "mhc",
10 | "tcell",
11 | ]
--------------------------------------------------------------------------------
/pepdata/iedb/alleles.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | from __future__ import print_function, division, absolute_import
14 | from collections import namedtuple
15 | import os
16 | import xml
17 |
18 | from .common import cache
19 | from .memoize import memoize
20 |
# Arguments handed to the shared `cache` object by local_path() and delete():
# the local file name, the remote archive URL, and whether the download is
# decompressed.
ALLELE_XML_FILENAME = "MhcAlleleNames.xml"
ALLELE_XML_URL = "http://www.iedb.org/doc/MhcAlleleNameList.zip"
ALLELE_XML_DECOMPRESS = True
24 |
def local_path(force_download=False):
    """Fetch the IEDB allele database through the cache.

    Returns whatever `cache.fetch` returns for the allele XML file;
    `force_download` is forwarded as its `force` argument.
    """
    fetch_kwargs = dict(
        filename=ALLELE_XML_FILENAME,
        url=ALLELE_XML_URL,
        decompress=ALLELE_XML_DECOMPRESS,
        force=force_download,
    )
    return cache.fetch(**fetch_kwargs)
32 |
def delete():
    """Remove the locally cached allele XML file.

    The path is resolved with `cache.local_path`; `os.remove` raises if the
    file is not there.
    """
    xml_path = cache.local_path(
        decompress=ALLELE_XML_DECOMPRESS,
        url=ALLELE_XML_URL,
        filename=ALLELE_XML_FILENAME)
    os.remove(xml_path)
40 |
# Record describing one MHC allele parsed from the IEDB XML file.
# Fields, in order: name, mhc_class, locus, organism, synonyms.
Allele = namedtuple(
    "Allele",
    "name mhc_class locus organism synonyms")
48 |
@memoize
def load_alleles():
    """Parse the IEDB MhcAlleleName XML file into a list of Allele tuples.

    Entries lacking a "DisplayedRestriction" (name) or "Class" element are
    skipped. For every remaining entry the allele name, MHC class, locus,
    organism and set of comma-separated synonyms are collected; locus and
    organism are None when the corresponding element is absent.

    Returns
    -------
    list of Allele
    """
    # `import xml` on its own does not load the ElementTree submodule, so
    # `xml.etree.ElementTree` only resolved if another module had already
    # imported it. Import it explicitly here.
    from xml.etree import ElementTree

    result = []
    tree = ElementTree.parse(local_path())
    for allele in tree.iterfind("MhcAlleleName"):
        name_element = allele.find("DisplayedRestriction")
        mhc_class_element = allele.find("Class")
        # need at least a name and an MHC class
        if name_element is None or mhc_class_element is None:
            continue

        synonyms = set()
        for synonym_element in allele.iterfind("Synonyms"):
            # An empty element has text None; treat it as "no synonyms".
            for synonym in (synonym_element.text or "").split(","):
                synonym = synonym.strip()
                if synonym:
                    synonyms.add(synonym)

        # NOTE(review): the original code looked up "Organsim" only, which
        # looks like a typo for "Organism" -- confirm against the IEDB XML.
        # The misspelled tag is kept as a fallback so nothing that worked
        # before stops working.
        organism_element = allele.find("Organism")
        if organism_element is None:
            organism_element = allele.find("Organsim")
        organism = None if organism_element is None else organism_element.text

        locus_element = allele.find("Locus")
        locus = None if locus_element is None else locus_element.text

        result.append(Allele(
            name=name_element.text,
            mhc_class=mhc_class_element.text,
            locus=locus,
            organism=organism,
            synonyms=synonyms))
    return result
92 |
@memoize
def load_alleles_dict():
    """Build a lookup from every allele name or synonym to its Allele tuple.

    Alleles are processed in the order returned by `load_alleles`; when two
    alleles share a name or synonym the later one wins.
    """
    lookup = {}
    for allele in load_alleles():
        all_names = {allele.name}.union(allele.synonyms)
        lookup.update(dict.fromkeys(all_names, allele))
    return lookup
104 |
--------------------------------------------------------------------------------
/pepdata/iedb/columns.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | from __future__ import annotations
14 |
15 |
16 | import pandas as pd
17 |
def find(
        df : pd.DataFrame,
        group_candidates : list[str],
        column_candidates : list[str]) -> pd.Series | None:
    """
    Try to find a column that contains a combination of the two candidate lists.

    Motivation: format for MHC ligand CSV used to have:
        epitope_key = ("Epitope", "Description")
        mhc_allele_key = ("MHC", "Allele Name")
        mhc_class_key = ("MHC", "MHC allele class")
        mhc_assay_key = ("Assay", "Method/Technique")

    Now it's:
        epitope_key = ("Epitope", "Name")
        mhc_allele_key = ("MHC Restriction", "Name")
        mhc_class_key = ("MHC Restriction", "Class")
        mhc_assay_key = ("Assay", "Method")

    ...who knows what it will be next!

    Parameters
    ----------
    df
        DataFrame whose column labels are (group, column) tuples.

    group_candidates
        Substrings to look for (case-insensitively) in the group label.

    column_candidates
        Substrings to look for (case-insensitively) in the column label.

    Returns
    -------
    The column whose label matches at least one group candidate and at least
    one column candidate, or None if no label matches. When several labels
    match, the one with the shortest combined label text is chosen; ties go
    to the label that appears first in `df.columns`.

    Raises
    ------
    ValueError
        If a column label is not a 2-tuple.
    """
    group_candidates = [s.lower() for s in group_candidates]
    column_candidates = [s.lower() for s in column_candidates]

    possible_matches = []
    for pair in df.columns:
        if not (isinstance(pair, tuple) and len(pair) == 2):
            raise ValueError(
                "Expected (group, column) label pairs but got %r" % (pair,))
        group, col = (str(level).lower() for level in pair)
        group_hit = any(a in group for a in group_candidates)
        column_hit = any(b in col for b in column_candidates)
        if group_hit and column_hit:
            possible_matches.append(pair)

    if not possible_matches:
        return None

    # Prefer the shortest label: a loose substring such as "name" also hits
    # longer labels like "Source Molecule Name", and the tightest match is
    # the most likely intended column. min() keeps the first of equal keys.
    best = min(
        possible_matches,
        key=lambda pair: len(str(pair[0])) + len(str(pair[1])))
    return df[best]
52 |
53 |
# Default `group_candidates` lists used by the get_* helpers below; each is
# matched case-insensitively as a substring of the first level of a column
# label by find().
MHC_GROUP_CANDIDATES : list[str] = ["MHC", "MHC Restriction"]
EPITOPE_GROUP_CANDIDATES : list[str] = ["Epitope"]
ASSAY_GROUP_CANDIDATES : list[str] = ["Assay"]
HOST_GROUP_CANDIDATES : list[str] = ["Host"]
58 |
def get_mhc_allele(
        df : pd.DataFrame,
        group_candidates : list[str] = MHC_GROUP_CANDIDATES,
        column_candidates : list[str] = ["Allele", "Allele name", "Name"]) -> pd.Series | None:
    """Look up the MHC allele column by delegating to `find`."""
    return find(
        df=df,
        group_candidates=group_candidates,
        column_candidates=column_candidates)
64 |
65 |
def get_mhc_class(
        df : pd.DataFrame,
        group_candidates : list[str] = MHC_GROUP_CANDIDATES,
        column_candidates : list[str] = ["Class", "MHC allele class"]) -> pd.Series | None:
    """Look up the MHC class column by delegating to `find`."""
    return find(
        df=df,
        group_candidates=group_candidates,
        column_candidates=column_candidates)
71 |
72 |
def get_mhc_assay(
        df : pd.DataFrame,
        group_candidates : list[str] = ASSAY_GROUP_CANDIDATES,
        column_candidates : list[str] = ["method"]) -> pd.Series | None:
    """Look up the assay method column by delegating to `find`.

    `df` is annotated as a DataFrame (it was previously annotated as a
    Series) because it is passed straight to `find`, which reads
    `df.columns`, and to match the sibling get_* helpers.
    """
    return find(df, group_candidates, column_candidates)
78 |
79 |
def get_epitope_name(
        df : pd.DataFrame,
        group_candidates : list[str] = EPITOPE_GROUP_CANDIDATES,
        column_candidates : list[str] = ["name"]) -> pd.Series | None:
    """Look up the epitope name column by delegating to `find`."""
    return find(
        df=df,
        group_candidates=group_candidates,
        column_candidates=column_candidates)
85 |
86 |
def get_epitope_type(
        df : pd.DataFrame,
        group_candidates : list[str] = EPITOPE_GROUP_CANDIDATES,
        column_candidates : list[str] = ["Object Type", "Type"]) -> pd.Series | None:
    """Look up the epitope type column by delegating to `find`."""
    return find(
        df=df,
        group_candidates=group_candidates,
        column_candidates=column_candidates)
92 |
def get_epitope_modifications(
        df : pd.DataFrame,
        group_candidates : list[str] = EPITOPE_GROUP_CANDIDATES,
        column_candidates : list[str] = ["Modified Residue(s)"]) -> pd.Series | None:
    """Look up the epitope modified-residues column by delegating to `find`."""
    return find(
        df=df,
        group_candidates=group_candidates,
        column_candidates=column_candidates)
98 |
99 |
def get_epitope_IRI(
        df : pd.DataFrame,
        group_candidates : list[str] = EPITOPE_GROUP_CANDIDATES,
        column_candidates : list[str] = ["Epitope IRI"]) -> pd.Series | None:
    """Look up the epitope IRI column by delegating to `find`."""
    return find(
        df=df,
        group_candidates=group_candidates,
        column_candidates=column_candidates)
105 |
106 |
def get_epitope_source_molecule(
        df : pd.DataFrame,
        group_candidates : list[str] = EPITOPE_GROUP_CANDIDATES,
        column_candidates : list[str] = ["Source Molecule"]) -> pd.Series | None:
    """Look up the epitope source-molecule column by delegating to `find`.

    `column_candidates` now carries the same `list[str]` annotation as the
    sibling get_* helpers.
    """
    return find(df, group_candidates, column_candidates)
112 |
def get_epitope_source_molecule_iri(
        df : pd.DataFrame,
        group_candidates : list[str] = EPITOPE_GROUP_CANDIDATES,
        column_candidates : list[str] = ["Source Molecule IRI"]) -> pd.Series | None:
    """Look up the epitope source-molecule IRI column by delegating to `find`."""
    return find(
        df=df,
        group_candidates=group_candidates,
        column_candidates=column_candidates)
118 |
119 |
def get_epitope_source_organism(
        df : pd.DataFrame,
        group_candidates : list[str] = EPITOPE_GROUP_CANDIDATES,
        column_candidates : list[str] = ["Source Organism"]) -> pd.Series | None:
    """Look up the epitope source-organism column by delegating to `find`."""
    return find(
        df=df,
        group_candidates=group_candidates,
        column_candidates=column_candidates)
125 |
126 |
def get_epitope_source_organism_iri(
        df : pd.DataFrame,
        group_candidates : list[str] = EPITOPE_GROUP_CANDIDATES,
        column_candidates : list[str] = ["Source Organism IRI"]) -> pd.Series | None:
    """Look up the epitope source-organism IRI column by delegating to `find`."""
    return find(
        df=df,
        group_candidates=group_candidates,
        column_candidates=column_candidates)
132 |
def get_assay_method(
        df : pd.DataFrame,
        group_candidates : list[str] = ASSAY_GROUP_CANDIDATES,
        column_candidates : list[str] = ["Method", "Method/Technique"]) -> pd.Series | None:
    """Look up the assay method column by delegating to `find`."""
    return find(
        df=df,
        group_candidates=group_candidates,
        column_candidates=column_candidates)
138 |
def get_assay_response_measured(
        df : pd.DataFrame,
        group_candidates : list[str] = ASSAY_GROUP_CANDIDATES,
        column_candidates : list[str] = ["Response measured"]) -> pd.Series | None:
    """Look up the assay response-measured column by delegating to `find`."""
    return find(
        df=df,
        group_candidates=group_candidates,
        column_candidates=column_candidates)
144 |
145 |
def get_assay_units(
        df : pd.DataFrame,
        group_candidates : list[str] = ASSAY_GROUP_CANDIDATES,
        column_candidates : list[str] = ["Units"]) -> pd.Series | None:
    """Look up the assay units column by delegating to `find`."""
    return find(
        df=df,
        group_candidates=group_candidates,
        column_candidates=column_candidates)
151 |
152 |
def get_assay_qualitative(
        df : pd.DataFrame,
        group_candidates : list[str] = ASSAY_GROUP_CANDIDATES,
        column_candidates : list[str] = ["Qualitative Measurement"]) -> pd.Series | None:
    """Look up the assay qualitative-measurement column by delegating to `find`."""
    return find(
        df=df,
        group_candidates=group_candidates,
        column_candidates=column_candidates)
158 |
def get_assay_num_tested(
        df : pd.DataFrame,
        group_candidates : list[str] = ASSAY_GROUP_CANDIDATES,
        column_candidates : list[str] = ["Number of Subjects Tested"]) -> pd.Series | None:
    """Look up the number-of-subjects-tested column by delegating to `find`."""
    return find(
        df=df,
        group_candidates=group_candidates,
        column_candidates=column_candidates)
164 |
def get_assay_num_responded(
        df : pd.DataFrame,
        group_candidates : list[str] = ASSAY_GROUP_CANDIDATES,
        column_candidates : list[str] = ["Number of Subjects Responded"]) -> pd.Series | None:
    """Look up the number-of-subjects-responded column by delegating to `find`."""
    return find(
        df=df,
        group_candidates=group_candidates,
        column_candidates=column_candidates)
170 |
171 |
def get_host_name(
        df : pd.DataFrame,
        group_candidates : list[str] = HOST_GROUP_CANDIDATES,
        column_candidates : list[str] = ["Name"]) -> pd.Series | None:
    """Look up the host name column by delegating to `find`."""
    return find(
        df=df,
        group_candidates=group_candidates,
        column_candidates=column_candidates)
177 |
--------------------------------------------------------------------------------
/pepdata/iedb/common.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | from __future__ import annotations
14 |
15 | import datacache
16 |
# Shared datacache handle used by the IEDB loaders to fetch and locate files.
cache = datacache.Cache("pepdata")

# Regex alternation of residue letters treated as non-standard; the MHC and
# T-cell loaders pass it to `str.contains` to drop matching epitopes.
bad_amino_acids = 'U|X|J|B|Z'
20 |
--------------------------------------------------------------------------------
/pepdata/iedb/memoize.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | from __future__ import print_function, division, absolute_import
14 |
15 | from functools import wraps
16 |
17 | def _prepare_memoization_key(args, kwargs):
18 | """
19 | Make a tuple of arguments which can be used as a key
20 | for a memoized function's lookup_table. If some object can't be hashed
21 | then used its __repr__ instead.
22 | """
23 | key_list = []
24 | for arg in args:
25 | try:
26 | hash(arg)
27 | key_list.append(arg)
28 | except:
29 | key_list.append(repr(arg))
30 | for (k, v) in kwargs.items():
31 | try:
32 | hash(k)
33 | hash(v)
34 | key_list.append((k, v))
35 | except:
36 | key_list.append((repr(k), repr(v)))
37 | return tuple(key_list)
38 |
def memoize(fn):
    """Decorator that caches `fn`'s results, keyed by its call arguments.

    The key is produced by `_prepare_memoization_key`; the cache lives for
    as long as the decorated function does and is never evicted.
    """
    results = {}

    @wraps(fn)
    def wrapped_fn(*args, **kwargs):
        cache_key = _prepare_memoization_key(args, kwargs)
        if cache_key in results:
            return results[cache_key]
        # First call with these arguments: compute, remember, return.
        results[cache_key] = fn(*args, **kwargs)
        return results[cache_key]

    return wrapped_fn
50 |
--------------------------------------------------------------------------------
/pepdata/iedb/mhc.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | from __future__ import print_function, division, absolute_import
14 | import logging
15 | import os
16 |
17 | import pandas as pd
18 |
19 | from .memoize import memoize
20 | from .common import bad_amino_acids, cache
21 |
22 |
# Arguments handed to the shared `cache` object by download() and
# local_path(): the remote archive URL, the local file name, and whether the
# download is decompressed.
MHC_URL = "https://www.iedb.org/downloader.php?file_name=doc/mhc_ligand_full_single_file.zip"
MHC_LOCAL_FILENAME = "mhc_ligand_full.csv"
MHC_DECOMPRESS = True
26 |
def download(force=False):
    """Fetch the IEDB MHC ligand file through the cache.

    Returns whatever `cache.fetch` returns; `force` is forwarded to it
    unchanged.
    """
    fetch_kwargs = dict(
        filename=MHC_LOCAL_FILENAME,
        url=MHC_URL,
        decompress=MHC_DECOMPRESS,
        force=force,
    )
    return cache.fetch(**fetch_kwargs)
33 |
def local_path(auto_download=True):
    """Return the local path of the IEDB MHC ligand CSV.

    Parameters
    ----------
    auto_download : bool
        When the file is not present locally, download it (default True).

    Raises
    ------
    ValueError
        If the file is missing and `auto_download` is False.
    """
    path = cache.local_path(
        filename=MHC_LOCAL_FILENAME,
        url=MHC_URL,
        decompress=MHC_DECOMPRESS)
    if os.path.exists(path):
        return path
    if auto_download:
        return download()
    # The message previously pointed at `pepdata.mhc.download()`, which does
    # not exist; this module lives at pepdata.iedb.mhc.
    raise ValueError(
        ("MHC data file %s does not exist locally,"
         " call pepdata.iedb.mhc.download() to get a copy from IEDB") % path)
46 |
def delete():
    """Remove the locally cached MHC ligand CSV, if present.

    The path is resolved directly from the cache rather than through
    local_path(), whose default auto-download would fetch a missing file
    only for it to be removed again. A missing file is a no-op.
    """
    path = cache.local_path(
        filename=MHC_LOCAL_FILENAME,
        url=MHC_URL,
        decompress=MHC_DECOMPRESS)
    if os.path.exists(path):
        os.remove(path)
49 |
@memoize
def load_dataframe(
        mhc_class : int | None = None,  # 1, 2, or None for neither
        hla : str | None = None,
        exclude_hla : str | None = None,
        human_only : bool = False,
        peptide_length : int | None = None,
        assay_method : str | None = None,
        only_standard_amino_acids : bool = True,
        warn_bad_lines : bool = True,
        nrows : int | None = None):
    """
    Load IEDB MHC data without aggregating multiple entries for the same epitope

    Parameters
    ----------
    mhc_class
        Restrict to MHC Class I or Class II (or None for neither)

    hla
        Restrict results to specific HLA type used in assay (regex pattern)

    exclude_hla
        Regex pattern to exclude certain HLA types

    human_only
        Restrict to human samples (default False)

    peptide_length
        Restrict epitopes to amino acid strings of given length

    assay_method
        Limit to assay methods which contain the given string

    only_standard_amino_acids
        Drop sequences which use non-standard amino acids, anything outside
        the core 20, such as X or U (default = True)

    warn_bad_lines
        The full MHC ligand dataset seems to contain several dozen lines with
        too many fields. This currently results in a lot of warning messages
        from Pandas, which you can turn off with this option (default = True)

    nrows
        Don't load the full IEDB dataset but instead read only the first nrows

    Returns
    -------
    A copy of the rows that pass every requested filter. Rows whose epitope
    name is null are always dropped, and epitope names are upper-cased.

    Raises
    ------
    ValueError
        If `peptide_length` is negative.
    """
    df = pd.read_csv(
        local_path(),
        header=[0, 1],
        skipinitialspace=True,
        nrows=nrows,
        low_memory=False,
        on_bad_lines='warn' if warn_bad_lines else 'skip',
        encoding="latin-1")

    # Sometimes the IEDB seems to put in an extra comma in the
    # header line, which creates an unnamed column of NaNs.
    # To deal with this, drop any columns which are all NaN
    df = df.dropna(axis=1, how="all")

    n = len(df)

    mhc_group_key = "MHC Restriction"
    epitope_group_key = "Epitope"
    epitope_column_key = (epitope_group_key, "Name")

    mhc_allele_column_key = (mhc_group_key, "Name")

    epitopes = df[epitope_column_key] = df[epitope_column_key].str.upper()

    null_epitope_seq = epitopes.isnull()
    n_null = null_epitope_seq.sum()
    if n_null > 0:
        logging.info("Dropping %d null sequences", n_null)

    mask = ~null_epitope_seq

    if only_standard_amino_acids:
        # if have rare or unknown amino acids, drop the sequence
        bad_epitope_seq = \
            epitopes.str.contains(bad_amino_acids, na=False).astype("bool")
        n_bad = bad_epitope_seq.sum()
        if n_bad > 0:
            logging.info("Dropping %d bad sequences", n_bad)

        mask &= ~bad_epitope_seq

    if human_only:
        # na=False: without it a missing allele name yields NaN, which
        # astype("bool") turns into True and so passes the human filter.
        mask &= df[mhc_allele_column_key].str.startswith(
            "HLA", na=False).astype("bool")

    if mhc_class == 1:
        mask &= df[mhc_group_key]["Class"] == "I"
    elif mhc_class == 2:
        mask &= df[mhc_group_key]["Class"] == "II"

    if hla:
        mask &= df[mhc_allele_column_key].str.contains(hla, na=False)

    if exclude_hla:
        mask &= ~(df[mhc_allele_column_key].str.contains(exclude_hla, na=False))

    if assay_method:
        # na=False keeps the mask strictly boolean when the method is missing
        mask &= df["Assay"]["Method"].str.contains(assay_method, na=False)

    if peptide_length:
        if peptide_length < 0:
            raise ValueError(
                "peptide_length must be positive, got %s" % (peptide_length,))
        mask &= df[epitope_column_key].str.len() == peptide_length

    df = df[mask].copy()

    logging.info("Returning %d / %d entries after filtering", len(df), n)

    return df
165 |
--------------------------------------------------------------------------------
/pepdata/iedb/tcell.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 |
14 | from __future__ import print_function, division, absolute_import
15 | import logging
16 | import os
17 |
18 | import numpy as np
19 | import pandas as pd
20 |
21 |
22 | from .alleles import load_alleles_dict
23 | from .memoize import memoize
24 | from .common import bad_amino_acids, cache
25 | from .columns import (
26 | get_assay_method,
27 | get_assay_num_tested,
28 | get_assay_response_measured,
29 | get_assay_units,
30 | get_host_name,
31 | get_mhc_allele,
32 | get_mhc_assay,
33 | get_mhc_class,
34 | get_epitope_source_organism,
35 | get_epitope_type,
36 | get_epitope_name,
37 |
38 | )
39 |
# Arguments handed to the shared `cache` object by download() and
# local_path(): the local file name, the remote archive URL, and whether the
# download is decompressed.
TCELL_COMPACT_FILENAME = "tcell_full.csv"
TCELL_COMPACT_URL = "http://www.iedb.org/downloader.php?file_name=doc/tcell_full_v3.zip"
TCELL_COMPACT_DECOMPRESS = True
43 |
def download(force=False):
    """Fetch the IEDB T-cell file through the cache.

    Returns whatever `cache.fetch` returns; `force` is forwarded to it
    unchanged.
    """
    fetch_kwargs = dict(
        filename=TCELL_COMPACT_FILENAME,
        url=TCELL_COMPACT_URL,
        decompress=TCELL_COMPACT_DECOMPRESS,
        force=force,
    )
    return cache.fetch(**fetch_kwargs)
50 |
def local_path(auto_download=True):
    """Return the local path of the IEDB T-cell CSV.

    If the file is missing it is downloaded when `auto_download` is true;
    otherwise a ValueError is raised.
    """
    csv_path = cache.local_path(
        filename=TCELL_COMPACT_FILENAME,
        url=TCELL_COMPACT_URL,
        decompress=TCELL_COMPACT_DECOMPRESS)
    if os.path.exists(csv_path):
        return csv_path
    if auto_download:
        return download()
    message = ("Local file %s does not exist, call"
               " pepdata.iedb.tcell.download()") % csv_path
    raise ValueError(message)
63 |
def delete():
    """Remove the locally cached T-cell CSV, if present.

    The path is resolved directly from the cache rather than through
    local_path(), whose default auto-download would fetch a missing file
    only for it to be removed again. A missing file is a no-op.
    """
    path = cache.local_path(
        filename=TCELL_COMPACT_FILENAME,
        url=TCELL_COMPACT_URL,
        decompress=TCELL_COMPACT_DECOMPRESS)
    if os.path.exists(path):
        os.remove(path)
66 |
@memoize
def load_dataframe(
        mhc_class: int | str | None = None,  # 1, 2, "I", "II" or None for no filter
        mhc_pattern: str | None = None,
        exclude_mhc: str | None = None,
        human_only: bool = False,
        peptide_length: int | None = None,
        assay_method: str | None = None,
        only_standard_amino_acids: bool = True,
        reduced_alphabet: dict | None = None,  # 20 letter AA strings -> simpler alphabet
        nrows: int | None = None):
    """
    Load IEDB T-cell data without aggregating multiple entries for same epitope

    Parameters
    ----------
    mhc_class : {None, 1, 2, "I", "II"}
        Restrict to MHC Class I or Class II (or None for no restriction).
        The integers 1 and 2 are converted to "I" and "II".

    mhc_pattern : regex pattern, optional
        Restrict results to specific MHC used in assay

    exclude_mhc : regex pattern, optional
        Exclude certain MHC allele patterns

    human_only : bool
        Restrict to human samples (default False)

    peptide_length : int, optional
        Restrict epitopes to amino acid strings of given length.
        None or 0 disables the filter.

    assay_method : string, optional
        Only collect results with assay methods containing the given string
        (matched with pandas `str.contains`)

    only_standard_amino_acids : bool, optional
        Drop sequences which use non-standard amino acids, anything outside
        the core 20, such as X or U (default = True)

    reduced_alphabet : dictionary, optional
        Remap amino acid letters to some other alphabet.
        NOTE: accepted for interface compatibility but not applied by
        this function.

    nrows : int, optional
        Don't load the full IEDB dataset but instead read only the first nrows

    Returns
    -------
    pandas.DataFrame containing only the rows which pass every requested
    filter; columns which are entirely NaN are dropped.

    Raises
    ------
    ValueError
        If `mhc_class` is not one of the accepted values or if
        `peptide_length` is negative.
    """
    # Validate the arguments before doing any file I/O
    if mhc_class is not None:
        # since MHC classes can be specified as either strings ("I") or
        # integers, standardize them to be strings
        if mhc_class == 1:
            mhc_class = "I"
        elif mhc_class == 2:
            mhc_class = "II"
        if mhc_class not in {"I", "II"}:
            raise ValueError("Invalid MHC class: %s" % mhc_class)

    if peptide_length is not None and peptide_length < 0:
        raise ValueError("Invalid peptide length: %s" % peptide_length)

    path = local_path()
    df = pd.read_csv(
        path,
        header=[0, 1],
        skipinitialspace=True,
        nrows=nrows,
        low_memory=False,
        on_bad_lines='warn',
        encoding="latin-1")

    # The column Series get names of their own so that they never shadow
    # the filter arguments of this function.
    mhc_alleles = get_mhc_allele(df)
    epitopes = get_epitope_name(df)
    organism = get_host_name(df)
    assay_methods = get_assay_method(df)

    # Sometimes the IEDB seems to put in an extra comma in the
    # header line, which creates an unnamed column of NaNs.
    # To deal with this, drop any columns which are all NaN
    df = df.dropna(axis=1, how="all")

    n = len(df)

    null_epitope_seq = epitopes.isnull()
    n_null = null_epitope_seq.sum()

    if n_null > 0:
        logging.info("Dropping %d null sequences", n_null)

    mask = ~null_epitope_seq

    if only_standard_amino_acids:
        # if have rare or unknown amino acids, drop the sequence
        bad_epitope_seq = \
            epitopes.str.contains(bad_amino_acids, na=False).astype("bool")
        n_bad = bad_epitope_seq.sum()
        if n_bad > 0:
            logging.info("Dropping %d bad sequences", n_bad)

        mask &= ~bad_epitope_seq

    if human_only:
        mask &= organism.str.startswith('Homo sapiens', na=False).astype('bool')

    if mhc_class is not None:
        # Keep only rows whose allele name is known to the allele
        # dictionary and belongs to the requested class.
        allele_dict = load_alleles_dict()
        mhc_class_mask = []
        for allele_name in mhc_alleles:
            allele_object = allele_dict.get(allele_name)
            mhc_class_mask.append(
                bool(allele_object) and allele_object.mhc_class == mhc_class)
        # explicit dtype keeps the array boolean even when there are no rows
        mask &= np.array(mhc_class_mask, dtype=bool)

    # The patterns are matched against the allele string, so they can
    # match known alleles such as "HLA-A*02:01",
    # broader groupings such as "HLA-A2"
    # and unknown alleles of the MHC-1 listed either as
    #  "HLA-Class I,allele undetermined"
    #  or
    #  "Class I,allele undetermined"
    if mhc_pattern:
        mask &= mhc_alleles.str.contains(mhc_pattern, na=False)

    if exclude_mhc:
        mask &= ~(mhc_alleles.str.contains(exclude_mhc, na=False))

    if assay_method:
        # na=False: rows without an assay method never match
        mask &= assay_methods.str.contains(assay_method, na=False)

    if peptide_length:
        mask &= epitopes.str.len() == peptide_length

    df = df[mask]

    logging.info("Returning %d / %d entries after filtering", len(df), n)
    return df
202 |
--------------------------------------------------------------------------------
/pepdata/matrices/BLOSUM30:
--------------------------------------------------------------------------------
1 | A B C D E F G H I K L M N P Q R S T V W X Y Z *
2 | A 4 0 -3 0 0 -2 0 -2 0 0 -1 1 0 -1 1 -1 1 1 1 -5 0 -4 0 -7
3 | B 0 5 -2 5 0 -3 0 -2 -2 0 -1 -2 4 -2 -1 -2 0 0 -2 -5 -1 -3 0 -7
4 | C -3 -2 17 -3 1 -3 -4 -5 -2 -3 0 -2 -1 -3 -2 -2 -2 -2 -2 -2 -2 -6 0 -7
5 | D 0 5 -3 9 1 -5 -1 -2 -4 0 -1 -3 1 -1 -1 -1 0 -1 -2 -4 -1 -1 0 -7
6 | E 0 0 1 1 6 -4 -2 0 -3 2 -1 -1 -1 1 2 -1 0 -2 -3 -1 -1 -2 5 -7
7 | F -2 -3 -3 -5 -4 10 -3 -3 0 -1 2 -2 -1 -4 -3 -1 -1 -2 1 1 -1 3 -4 -7
8 | G 0 0 -4 -1 -2 -3 8 -3 -1 -1 -2 -2 0 -1 -2 -2 0 -2 -3 1 -1 -3 -2 -7
9 | H -2 -2 -5 -2 0 -3 -3 14 -2 -2 -1 2 -1 1 0 -1 -1 -2 -3 -5 -1 0 0 -7
10 | I 0 -2 -2 -4 -3 0 -1 -2 6 -2 2 1 0 -3 -2 -3 -1 0 4 -3 0 -1 -3 -7
11 | K 0 0 -3 0 2 -1 -1 -2 -2 4 -2 2 0 1 0 1 0 -1 -2 -2 0 -1 1 -7
12 | L -1 -1 0 -1 -1 2 -2 -1 2 -2 4 2 -2 -3 -2 -2 -2 0 1 -2 0 3 -1 -7
13 | M 1 -2 -2 -3 -1 -2 -2 2 1 2 2 6 0 -4 -1 0 -2 0 0 -3 0 -1 -1 -7
14 | N 0 4 -1 1 -1 -1 0 -1 0 0 -2 0 8 -3 -1 -2 0 1 -2 -7 0 -4 -1 -7
15 | P -1 -2 -3 -1 1 -4 -1 1 -3 1 -3 -4 -3 11 0 -1 -1 0 -4 -3 -1 -2 0 -7
16 | Q 1 -1 -2 -1 2 -3 -2 0 -2 0 -2 -1 -1 0 8 3 -1 0 -3 -1 0 -1 4 -7
17 | R -1 -2 -2 -1 -1 -1 -2 -1 -3 1 -2 0 -2 -1 3 8 -1 -3 -1 0 -1 0 0 -7
18 | S 1 0 -2 0 0 -1 0 -1 -1 0 -2 -2 0 -1 -1 -1 4 2 -1 -3 0 -2 -1 -7
19 | T 1 0 -2 -1 -2 -2 -2 -2 0 -1 0 0 1 0 0 -3 2 5 1 -5 0 -1 -1 -7
20 | V 1 -2 -2 -2 -3 1 -3 -3 4 -2 1 0 -2 -4 -3 -1 -1 1 5 -3 0 1 -3 -7
21 | W -5 -5 -2 -4 -1 1 1 -5 -3 -2 -2 -3 -7 -3 -1 0 -3 -5 -3 20 -2 5 -1 -7
22 | X 0 -1 -2 -1 -1 -1 -1 -1 0 0 0 0 0 -1 0 -1 0 0 0 -2 -1 -1 0 -7
23 | Y -4 -3 -6 -1 -2 3 -3 0 -1 -1 3 -1 -4 -2 -1 0 -2 -1 1 5 -1 9 -2 -7
24 | Z 0 0 0 0 5 -4 -2 0 -3 1 -1 -1 -1 0 4 0 -1 -1 -3 -1 0 -2 4 -7
25 | * -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 1
26 |
--------------------------------------------------------------------------------
/pepdata/matrices/BLOSUM50:
--------------------------------------------------------------------------------
1 | A R N D C Q E G H I L K M F P S T W Y V
2 | A 5 -2 -1 -2 -1 -1 -1 0 -2 -1 -2 -1 -1 -3 -1 1 0 -3 -2 0
3 | R -2 7 -1 -2 -4 1 0 -3 0 -4 -3 3 -2 -3 -3 -1 -1 -3 -1 -3
4 | N -1 -1 7 2 -2 0 0 0 1 -3 -4 0 -2 -4 -2 1 0 -4 -2 -3
5 | D -2 -2 2 8 -4 0 2 -1 -1 -4 -4 -1 -4 -5 -1 0 -1 -5 -3 -4
6 | C -1 -4 -2 -4 13 -3 -3 -3 -3 -2 -2 -3 -2 -2 -4 -1 -1 -5 -3 -1
7 | Q -1 1 0 0 -3 7 2 -2 1 -3 -2 2 0 -4 -1 0 -1 -1 -1 -3
8 | E -1 0 0 2 -3 2 6 -3 0 -4 -3 1 -2 -3 -1 -1 -1 -3 -2 -3
9 | G 0 -3 0 -1 -3 -2 -3 8 -2 -4 -4 -2 -3 -4 -2 0 -2 -3 -3 -4
10 | H -2 0 1 -1 -3 1 0 -2 10 -4 -3 0 -1 -1 -2 -1 -2 -3 2 -4
11 | I -1 -4 -3 -4 -2 -3 -4 -4 -4 5 2 -3 2 0 -3 -3 -1 -3 -1 4
12 | L -2 -3 -4 -4 -2 -2 -3 -4 -3 2 5 -3 3 1 -4 -3 -1 -2 -1 1
13 | K -1 3 0 -1 -3 2 1 -2 0 -3 -3 6 -2 -4 -1 0 -1 -3 -2 -3
14 | M -1 -2 -2 -4 -2 0 -2 -3 -1 2 3 -2 7 0 -3 -2 -1 -1 0 1
15 | F -3 -3 -4 -5 -2 -4 -3 -4 -1 0 1 -4 0 8 -4 -3 -2 1 4 -1
16 | P -1 -3 -2 -1 -4 -1 -1 -2 -2 -3 -4 -1 -3 -4 10 -1 -1 -4 -3 -3
17 | S 1 -1 1 0 -1 0 -1 0 -1 -3 -3 0 -2 -3 -1 5 2 -4 -2 -2
18 | T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 2 5 -3 -2 0
19 | W -3 -3 -4 -5 -5 -1 -3 -3 -3 -3 -2 -3 -1 1 -4 -4 -4 15 2 -3
20 | Y -2 -1 -2 -3 -3 -1 -2 -3 2 -1 -1 -2 0 4 -3 -2 -2 2 8 -1
21 | V 0 -3 -3 -4 -1 -3 -3 -4 -4 4 1 -3 1 -1 -3 -2 0 -3 -1 5
--------------------------------------------------------------------------------
/pepdata/matrices/BLOSUM62:
--------------------------------------------------------------------------------
1 | # Entries for the BLOSUM62 matrix at a scale of ln(2)/2.0.
2 | A R N D C Q E G H I L K M F P S T W Y V B J Z X *
3 | A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 -1 -1 -4
4 | R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 -2 0 -1 -4
5 | N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 4 -3 0 -1 -4
6 | D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 -3 1 -1 -4
7 | C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -1 -3 -1 -4
8 | Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 -2 4 -1 -4
9 | E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 -3 4 -1 -4
10 | G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -4 -2 -1 -4
11 | H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 -3 0 -1 -4
12 | I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 3 -3 -1 -4
13 | L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 3 -3 -1 -4
14 | K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 -3 1 -1 -4
15 | M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 2 -1 -1 -4
16 | F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 0 -3 -1 -4
17 | P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -3 -1 -1 -4
18 | S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 -2 0 -1 -4
19 | T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 -1 -1 -4
20 | W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -2 -2 -1 -4
21 | Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -1 -2 -1 -4
22 | V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 2 -2 -1 -4
23 | B -2 -1 4 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 -3 0 -1 -4
24 | J -1 -2 -3 -3 -1 -2 -3 -4 -3 3 3 -3 2 0 -3 -2 -1 -2 -1 2 -3 3 -3 -1 -4
25 | Z -1 0 0 1 -3 4 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -2 -2 -2 0 -3 4 -1 -4
26 | X -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -4
27 | * -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1
28 |
--------------------------------------------------------------------------------
/pepdata/matrices/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openvax/pepdata/cae7b52f668e3111848c9263474d7aa2bacf8274/pepdata/matrices/__init__.py
--------------------------------------------------------------------------------
/pepdata/matrices/amino_acid_properties.txt:
--------------------------------------------------------------------------------
1 | http://www.sgi.sscc.ru/mgs/programs/crasp/texts/AA_Properties.htm
2 |
3 | 1. Free energy of transfer to surface (Bull & Breese 1974)
4 | Arch. Bioch. Biophys. 1974 161 665-670
5 | 610.0000 A ALA
6 | 690.0000 R ARG
7 | 890.0000 N ASN
8 | 610.0000 D ASP
9 | 360.0000 C CYS
10 | 970.0000 Q GLN
11 | 510.0000 E GLU
12 | 810.0000 G GLY
13 | 690.0000 H HIS
14 | -1450.0000 I ILE
15 | -1650.0000 L LEU
16 | 460.0000 K LYS
17 | -660.0000 M MET
18 | -1520.0000 F PHE
19 | -170.0000 P PRO
20 | 420.0000 S SER
21 | 290.0000 T THR
22 | -1200.0000 W TRP
23 | -1430.0000 Y TYR
24 | -750.0000 V VAL
25 |
26 | 2. Surrounding hydrophobicity in alpha-helix (Ponnuswamy et al. 1980)
27 | Biochim.Biophys.Acta(1980) 623 301-316
28 | 13.6500 A ALA
29 | 11.2800 R ARG
30 | 12.2400 N ASN
31 | 10.9800 D ASP
32 | 14.4900 C CYS
33 | 11.3000 Q GLN
34 | 12.5500 E GLU
35 | 15.3600 G GLY
36 | 11.5900 H HIS
37 | 14.6300 I ILE
38 | 14.0100 L LEU
39 | 11.9600 K LYS
40 | 13.4000 M MET
41 | 14.0800 F PHE
42 | 11.5100 P PRO
43 | 11.2600 S SER
44 | 13.0000 T THR
45 | 12.0600 W TRP
46 | 12.6400 Y TYR
47 | 12.8800 V VAL
48 |
49 | 3. Surrounding hydrophobicity in beta-sheet (Ponnuswamy et al. 1980)
50 | Biochim.Biophys.Acta(1980) 623 301-316
51 | 14.6000 A ALA
52 | 13.2400 R ARG
53 | 11.7900 N ASN
54 | 13.7800 D ASP
55 | 15.9000 C CYS
56 | 12.0200 Q GLN
57 | 13.5900 E GLU
58 | 14.1800 G GLY
59 | 15.3500 H HIS
60 | 14.1000 I ILE
61 | 16.4900 L LEU
62 | 13.2800 K LYS
63 | 16.2300 M MET
64 | 14.1800 F PHE
65 | 14.1000 P PRO
66 | 13.3600 S SER
67 | 14.5000 T THR
68 | 13.9000 W TRP
69 | 14.7600 Y TYR
70 | 16.3000 V VAL
71 |
72 | 4. Surrounding hydrophobicity in beta-turn (Ponnuswamy et al. 1980)
73 | Biochim.Biophys.Acta(1980) 623 301-316
74 | 10.6700 A ALA
75 | 11.0500 R ARG
76 | 10.8500 N ASN
77 | 10.2100 D ASP
78 | 14.1500 C CYS
79 | 11.7100 Q GLN
80 | 11.7100 E GLU
81 | 10.9500 G GLY
82 | 12.0700 H HIS
83 | 12.9500 I ILE
84 | 13.0700 L LEU
85 | 9.9300 K LYS
86 | 15.0000 M MET
87 | 13.2700 F PHE
88 | 10.6200 P PRO
89 | 11.1800 S SER
90 | 10.5300 T THR
91 | 11.4100 W TRP
92 | 11.5200 Y TYR
93 | 13.8600 V VAL
94 |
95 | 5. Accessibility reduction ratio (Ponnuswamy et al. 1980)
96 | Biochim.Biophys.Acta(1980) 623 301-316
97 | 3.7000 A ALA
98 | 2.5300 R ARG
99 | 2.1200 N ASN
100 | 2.6000 D ASP
101 | 3.0300 C CYS
102 | 2.7000 Q GLN
103 | 3.3000 E GLU
104 | 3.1300 G GLY
105 | 3.5700 H HIS
106 | 7.6900 I ILE
107 | 5.8800 L LEU
108 | 1.7900 K LYS
109 | 5.2100 M MET
110 | 6.6000 F PHE
111 | 2.1200 P PRO
112 | 2.4300 S SER
113 | 2.6000 T THR
114 | 6.2500 W TRP
115 | 3.0300 Y TYR
116 | 7.1400 V VAL
117 |
118 | 6. Average number of surrounding residues (Ponnuswamy et al. 1980)
119 | Biochim.Biophys.Acta(1980) 623 301-316
120 | 6.0500 A ALA
121 | 5.7000 R ARG
122 | 5.0400 N ASN
123 | 4.9500 D ASP
124 | 7.8600 C CYS
125 | 5.4500 Q GLN
126 | 5.1000 E GLU
127 | 6.1600 G GLY
128 | 5.8000 H HIS
129 | 7.5100 I ILE
130 | 7.3700 L LEU
131 | 4.8800 K LYS
132 | 6.3900 M MET
133 | 6.6200 F PHE
134 | 5.6500 P PRO
135 | 5.5300 S SER
136 | 5.8100 T THR
137 | 6.9800 W TRP
138 | 6.7300 Y TYR
139 | 7.6200 V VAL
140 |
141 | 7. Volume (Chothia 1984)
142 | Annu. Rev. Biochem. (1984) 53 537-572
143 | 91.5000 A ALA
144 | 202.0000 R ARG
145 | 135.2000 N ASN
146 | 124.5000 D ASP
147 | 118.0000 C CYS
148 | 161.1000 Q GLN
149 | 155.1000 E GLU
150 | 66.4000 G GLY
151 | 167.3000 H HIS
152 | 168.8000 I ILE
153 | 167.9000 L LEU
154 | 171.3000 K LYS
155 | 170.8000 M MET
156 | 203.4000 F PHE
157 | 129.3000 P PRO
158 | 99.1000 S SER
159 | 122.1000 T THR
160 | 237.6000 W TRP
161 | 203.6000 Y TYR
162 | 141.7000 V VAL
163 |
164 | 8. Local flexibility (Ragone et al. 1989)
165 | Prot. Engineering(1989) 2 497-504
166 | 705.4200 A ALA
167 | 1484.2800 R ARG
168 | 513.4601 N ASN
169 | 34.9600 D ASP
170 | 2412.5601 C CYS
171 | 1087.8300 Q GLN
172 | 1158.6600 E GLU
173 | 33.1800 G GLY
174 | 1637.1300 H HIS
175 | 5979.3701 I ILE
176 | 4985.7300 L LEU
177 | 699.6900 K LYS
178 | 4491.6602 M MET
179 | 5203.8599 F PHE
180 | 431.9600 P PRO
181 | 174.7600 S SER
182 | 601.8800 T THR
183 | 6374.0698 W TRP
184 | 4291.1001 Y TYR
185 | 4474.4199 V VAL
186 |
187 | 9. Flexibility (Bhaskaran and Ponnuswamy 1988)
188 | Int. J. Peptide Protein Res.(1988) 32 241-255
189 | 0.3570 A ALA
190 | 0.5290 R ARG
191 | 0.4630 N ASN
192 | 0.5110 D ASP
193 | 0.3460 C CYS
194 | 0.4930 Q GLN
195 | 0.4970 E GLU
196 | 0.5440 G GLY
197 | 0.3230 H HIS
198 | 0.4620 I ILE
199 | 0.3650 L LEU
200 | 0.4660 K LYS
201 | 0.2950 M MET
202 | 0.3140 F PHE
203 | 0.5090 P PRO
204 | 0.5070 S SER
205 | 0.4440 T THR
206 | 0.3050 W TRP
207 | 0.4200 Y TYR
208 | 0.3860 V VAL
209 |
210 | 10. Flexibility for no rigid neighbours (Karplus & Schulz 1985)
211 | Naturwissenschaften (1985) 72 212-213
212 | 1.0410 A ALA
213 | 1.0380 R ARG
214 | 1.1170 N ASN
215 | 1.0330 D ASP
216 | 0.9600 C CYS
217 | 1.1650 Q GLN
218 | 1.0940 E GLU
219 | 1.1420 G GLY
220 | 0.9820 H HIS
221 | 1.0020 I ILE
222 | 0.9670 L LEU
223 | 1.0930 K LYS
224 | 0.9470 M MET
225 | 0.9300 F PHE
226 | 1.0550 P PRO
227 | 1.1690 S SER
228 | 1.0730 T THR
229 | 0.9250 W TRP
230 | 0.9610 Y TYR
231 | 0.9820 V VAL
232 |
233 | 11. Flexibility for one rigid neighbour (Karplus & Schulz 1985)
234 | Naturwissenschaften (1985) 72 212-213
235 | 0.9460 A ALA
236 | 1.0280 R ARG
237 | 1.0060 N ASN
238 | 1.0890 D ASP
239 | 0.8780 C CYS
240 | 1.0280 Q GLN
241 | 1.0360 E GLU
242 | 1.0420 G GLY
243 | 0.9520 H HIS
244 | 0.8920 I ILE
245 | 0.9610 L LEU
246 | 1.0820 K LYS
247 | 0.8620 M MET
248 | 0.9120 F PHE
249 | 1.0850 P PRO
250 | 1.0480 S SER
251 | 1.0510 T THR
252 | 0.9170 W TRP
253 | 0.9300 Y TYR
254 | 0.9270 V VAL
255 |
256 | 12. Average accessibility surface area (Janin et al. 1978)
257 | J. Mol. Biol. (1978) 125 357-386
258 | 27.8000 A ALA
259 | 94.7000 R ARG
260 | 60.1000 N ASN
261 | 60.6000 D ASP
262 | 15.5000 C CYS
263 | 68.7000 Q GLN
264 | 68.2000 E GLU
265 | 24.5000 G GLY
266 | 50.7000 H HIS
267 | 22.8000 I ILE
268 | 27.6000 L LEU
269 | 103.000 K LYS
270 | 33.5000 M MET
271 | 25.5000 F PHE
272 | 51.5000 P PRO
273 | 42.0000 S SER
274 | 45.0000 T THR
275 | 34.7000 W TRP
276 | 55.2000 Y TYR
277 | 23.7000 V VAL
278 |
279 | 13. Flexibility for two rigid neighbours (Karplus & Schulz 1985)
280 | Naturwissenschaften (1985) 72 212-213
281 | 0.8920 A ALA
282 | 0.9010 R ARG
283 | 0.9300 N ASN
284 | 0.9320 D ASP
285 | 0.9250 C CYS
286 | 0.8850 Q GLN
287 | 0.9330 E GLU
288 | 0.9230 G GLY
289 | 0.8940 H HIS
290 | 0.8720 I ILE
291 | 0.9210 L LEU
292 | 1.0570 K LYS
293 | 0.8040 M MET
294 | 0.9140 F PHE
295 | 0.9320 P PRO
296 | 0.9230 S SER
297 | 0.9340 T THR
298 | 0.8030 W TRP
299 | 0.8370 Y TYR
300 | 0.9130 V VAL
301 |
302 | 14. Hydrophobicity (Eisenberg et al. 1984)
303 | J. Mol. Biol. (1984) 179 125-142
304 | 0.2500 A ALA
305 | -1.7600 R ARG
306 | -0.6400 N ASN
307 | -0.7200 D ASP
308 | 0.0400 C CYS
309 | -0.6900 Q GLN
310 | -0.6200 E GLU
311 | 0.1600 G GLY
312 | -0.4000 H HIS
313 | 0.7300 I ILE
314 | 0.5300 L LEU
315 | -1.1000 K LYS
316 | 0.2600 M MET
317 | 0.6100 F PHE
318 | -0.0700 P PRO
319 | -0.2600 S SER
320 | -0.1800 T THR
321 | 0.3700 W TRP
322 | 0.0200 Y TYR
323 | 0.5400 V VAL
324 |
325 | 15. Accessible surface area in the standard state (Rose et al. 1985)
326 | Science (1985) 229 834-838
327 | 118.1000 A ALA
328 | 256.0000 R ARG
329 | 165.5000 N ASN
330 | 158.7000 D ASP
331 | 146.1000 C CYS
332 | 193.2000 Q GLN
333 | 186.2000 E GLU
334 | 88.1000 G GLY
335 | 202.5000 H HIS
336 | 181.0000 I ILE
337 | 193.1000 L LEU
338 | 225.8000 K LYS
339 | 203.4000 M MET
340 | 222.8000 F PHE
341 | 146.8000 P PRO
342 | 129.8000 S SER
343 | 152.5000 T THR
344 | 266.3000 W TRP
345 | 236.8000 Y TYR
346 | 164.5000 V VAL
347 |
348 | 16. Average accessible surface area in folded proteins (Rose et al. 1985)
349 | Science (1985) 229 834-838
350 | 31.5000 A ALA
351 | 93.8000 R ARG
352 | 62.2000 N ASN
353 | 60.9000 D ASP
354 | 13.9000 C CYS
355 | 74.0000 Q GLN
356 | 72.3000 E GLU
357 | 25.2000 G GLY
358 | 46.7000 H HIS
359 | 23.0000 I ILE
360 | 29.0000 L LEU
361 | 110.300 K LYS
362 | 30.5000 M MET
363 | 28.7000 F PHE
364 | 53.7000 P PRO
365 | 44.2000 S SER
366 | 46.0000 T THR
367 | 41.7000 W TRP
368 | 59.1000 Y TYR
369 | 23.5000 V VAL
370 |
371 | 17. Average surrounding hydrophobicity (Manavalan and Ponnuswamy 1978)
372 | Nature (1978) 275 673-674
373 | 12.9700 A ALA
374 | 11.7200 R ARG
375 | 11.4200 N ASN
376 | 10.8500 D ASP
377 | 14.6400 C CYS
378 | 11.7600 Q GLN
379 | 11.8900 E GLU
380 | 12.4300 G GLY
381 | 12.1600 H HIS
382 | 15.6700 I ILE
383 | 14.9000 L LEU
384 | 11.3600 K LYS
385 | 14.3900 M MET
386 | 14.0000 F PHE
387 | 11.3700 P PRO
388 | 11.2300 S SER
389 | 11.6900 T THR
390 | 13.9300 W TRP
391 | 13.4200 Y TYR
392 | 15.7100 V VAL
393 |
394 | 18. Hydrophilicity (Hopp and Woods 1981)
395 | Proc. Natl. Acad. Sci. USA (1981) 78 3824-3828
396 | -0.5000 A ALA
397 | 3.0000 R ARG
398 | 0.2000 N ASN
399 | 3.0000 D ASP
400 | -1.0000 C CYS
401 | 0.2000 Q GLN
402 | 3.0000 E GLU
403 | 0.0000 G GLY
404 | -0.5000 H HIS
405 | -1.8000 I ILE
406 | -1.8000 L LEU
407 | 3.0000 K LYS
408 | -1.3000 M MET
409 | -2.5000 F PHE
410 | 0.0000 P PRO
411 | 0.3000 S SER
412 | -0.4000 T THR
413 | -3.4000 W TRP
414 | -2.3000 Y TYR
415 | -1.5000 V VAL
416 |
417 | 19. Hydropathy (Kyte and Doolittle 1982)
418 | J. Mol. Biol. (1982) 157 105-132
419 | 1.8000 A ALA
420 | -4.5000 R ARG
421 | -3.5000 N ASN
422 | -3.5000 D ASP
423 | 2.5000 C CYS
424 | -3.5000 Q GLN
425 | -3.5000 E GLU
426 | -0.4000 G GLY
427 | -3.2000 H HIS
428 | 4.5000 I ILE
429 | 3.8000 L LEU
430 | -3.9000 K LYS
431 | 1.9000 M MET
432 | 2.8000 F PHE
433 | -1.6000 P PRO
434 | -0.8000 S SER
435 | -0.7000 T THR
436 | -0.9000 W TRP
437 | -1.3000 Y TYR
438 | 4.2000 V VAL
439 |
440 | 20. Hydrophilicity from HPLC (Parker et al. 1986)
441 | Biochemistry (1986) 25 5425-5432
442 | 2.1000 A ALA
443 | 4.2000 R ARG
444 | 7.0000 N ASN
445 | 10.000 D ASP
446 | 1.4000 C CYS
447 | 6.0000 Q GLN
448 | 7.8000 E GLU
449 | 5.7000 G GLY
450 | 2.1000 H HIS
451 | -8.000 I ILE
452 | -9.200 L LEU
453 | 5.7000 K LYS
454 | -4.200 M MET
455 | -9.200 F PHE
456 | 2.1000 P PRO
457 | 6.5000 S SER
458 | 5.2000 T THR
459 | -10.00 W TRP
460 | -1.900 Y TYR
461 | -3.700 V VAL
462 |
463 | 21. Hydrophobicity (Jones 1975)
464 | J.theor.Biol.(1975) 50 167-183
465 | 0.8700 A ALA
466 | 0.8500 R ARG
467 | 0.0900 N ASN
468 | 0.6600 D ASP
469 | 1.5200 C CYS
470 | 0.0000 Q GLN
471 | 0.6700 E GLU
472 | 0.1000 G GLY
473 | 0.8700 H HIS
474 | 3.1500 I ILE
475 | 2.1700 L LEU
476 | 1.6400 K LYS
477 | 1.6700 M MET
478 | 2.8700 F PHE
479 | 2.7700 P PRO
480 | 0.0700 S SER
481 | 0.0700 T THR
482 | 3.7700 W TRP
483 | 2.6700 Y TYR
484 | 1.8700 V VAL
485 |
486 | 22. Refractivity (Jones 1975)
487 | J.theor.Biol.(1975) 50 167-183
488 | 4.3400 A ALA
489 | 26.6600 R ARG
490 | 13.2800 N ASN
491 | 12.0000 D ASP
492 | 35.7700 C CYS
493 | 17.5600 Q GLN
494 | 17.2600 E GLU
495 | 0.0000 G GLY
496 | 21.8100 H HIS
497 | 19.0600 I ILE
498 | 18.7800 L LEU
499 | 21.2900 K LYS
500 | 21.6400 M MET
501 | 29.4000 F PHE
502 | 10.9300 P PRO
503 | 6.3500 S SER
504 | 11.0100 T THR
505 | 42.5300 W TRP
506 | 31.5300 Y TYR
507 | 13.9200 V VAL
508 |
509 | 23. Percentage of buried residues (Janin et al. 1978)
510 | J. Mol. Biol. (1978) 125 357-386
511 | 51.0000 A ALA
512 | 5.0000 R ARG
513 | 22.0000 N ASN
514 | 19.0000 D ASP
515 | 74.0000 C CYS
516 | 16.0000 Q GLN
517 | 16.0000 E GLU
518 | 52.0000 G GLY
519 | 34.0000 H HIS
520 | 66.0000 I ILE
521 | 60.0000 L LEU
522 | 3.0000 K LYS
523 | 52.0000 M MET
524 | 58.0000 F PHE
525 | 25.0000 P PRO
526 | 35.0000 S SER
527 | 30.0000 T THR
528 | 49.0000 W TRP
529 | 24.0000 Y TYR
530 | 64.0000 V VAL
531 |
532 | 24. Normalized frequency of alpha-helix with weights (Levitt 1978)
533 | Biochemistry (1978) 17 4277-4285
534 | 1.2900 A ALA
535 | 0.9600 R ARG
536 | 0.9000 N ASN
537 | 1.0400 D ASP
538 | 1.1100 C CYS
539 | 1.2700 Q GLN
540 | 1.4400 E GLU
541 | 0.5600 G GLY
542 | 1.2200 H HIS
543 | 0.9700 I ILE
544 | 1.3000 L LEU
545 | 1.2300 K LYS
546 | 1.4700 M MET
547 | 1.0700 F PHE
548 | 0.5200 P PRO
549 | 0.8200 S SER
550 | 0.8200 T THR
551 | 0.9900 W TRP
552 | 0.7200 Y TYR
553 | 0.9100 V VAL
554 |
555 | 25. Normalized frequency of beta-sheet with weights (Levitt 1978)
556 | Biochemistry (1978) 17 4277-4285
557 | 0.9000 A ALA
558 | 0.9900 R ARG
559 | 0.7600 N ASN
560 | 0.7200 D ASP
561 | 0.7400 C CYS
562 | 0.8000 Q GLN
563 | 0.7500 E GLU
564 | 0.9200 G GLY
565 | 1.0800 H HIS
566 | 1.4500 I ILE
567 | 1.0200 L LEU
568 | 0.7700 K LYS
569 | 0.9700 M MET
570 | 1.3200 F PHE
571 | 0.6400 P PRO
572 | 0.9500 S SER
573 | 1.2100 T THR
574 | 1.1400 W TRP
575 | 1.2500 Y TYR
576 | 1.4900 V VAL
577 |
578 | 26. Normalized frequency for reverse turn with weights (Levitt 1978)
579 | Biochemistry (1978) 17 4277-4285
580 | 0.7800 A ALA
581 | 0.8800 R ARG
582 | 1.2800 N ASN
583 | 1.4100 D ASP
584 | 0.8000 C CYS
585 | 0.9700 Q GLN
586 | 1.0000 E GLU
587 | 1.6400 G GLY
588 | 0.6900 H HIS
589 | 0.5100 I ILE
590 | 0.5900 L LEU
591 | 0.9600 K LYS
592 | 0.3900 M MET
593 | 0.5800 F PHE
594 | 1.9100 P PRO
595 | 1.3300 S SER
596 | 1.0300 T THR
597 | 0.7500 W TRP
598 | 1.0500 Y TYR
599 | 0.4700 V VAL
600 |
601 | 27. Percentage of exposed residues (Janin et al. 1978)
602 | J. Mol. Biol. (1978) 125 357-386
603 | 15.0000 A ALA
604 | 67.0000 R ARG
605 | 49.0000 N ASN
606 | 50.0000 D ASP
607 | 5.0000 C CYS
608 | 56.0000 Q GLN
609 | 55.0000 E GLU
610 | 10.0000 G GLY
611 | 34.0000 H HIS
612 | 13.0000 I ILE
613 | 16.0000 L LEU
614 | 85.0000 K LYS
615 | 20.0000 M MET
616 | 10.0000 F PHE
617 | 45.0000 P PRO
618 | 32.0000 S SER
619 | 32.0000 T THR
620 | 17.0000 W TRP
621 | 41.0000 Y TYR
622 | 14.0000 V VAL
623 |
624 | 28. Hydrophobic index (Ponnuswamy et al. 1980)
625 | Biochim.Biophys.Acta(1980) 623 301-316
626 | 0.8700 A ALA
627 | 0.8500 R ARG
628 | 0.0900 N ASN
629 | 0.6600 D ASP
630 | 1.5200 C CYS
631 | 0.0000 Q GLN
632 | 0.6700 E GLU
633 | 0.1000 G GLY
634 | 0.8000 H HIS
635 | 3.1500 I ILE
636 | 2.1700 L LEU
637 | 1.6400 K LYS
638 | 1.6700 M MET
639 | 2.8700 F PHE
640 | 2.7700 P PRO
641 | 0.0700 S SER
642 | 0.0700 T THR
643 | 3.7700 W TRP
644 | 2.6700 Y TYR
645 | 1.8700 V VAL
646 |
647 | 29. Hydrophobicity in folded form (Ponnuswamy et al. 1980)
648 | Biochim.Biophys.Acta(1980) 623 301-316
649 | 12.2800 A ALA
650 | 11.4900 R ARG
651 | 11.0000 N ASN
652 | 10.9700 D ASP
653 | 14.9300 C CYS
654 | 11.2800 Q GLN
655 | 11.1900 E GLU
656 | 12.0100 G GLY
657 | 12.8400 H HIS
658 | 14.7700 I ILE
659 | 14.1000 L LEU
660 | 10.8000 K LYS
661 | 14.3300 M MET
662 | 13.4300 F PHE
663 | 11.1900 P PRO
664 | 11.2600 S SER
665 | 11.6500 T THR
666 | 12.9500 W TRP
667 | 13.2900 Y TYR
668 | 15.0700 V VAL
669 |
670 | 30. Hydrophobicity in unfolded form (Ponnuswamy et al. 1980)
671 | Biochim.Biophys.Acta(1980) 623 301-316
672 | 4.6600 A ALA
673 | 4.6800 R ARG
674 | 4.8700 N ASN
675 | 4.7900 D ASP
676 | 4.0000 C CYS
677 | 4.6100 Q GLN
678 | 4.8100 E GLU
679 | 4.7000 G GLY
680 | 4.9900 H HIS
681 | 4.7800 I ILE
682 | 4.7300 L LEU
683 | 5.0800 K LYS
684 | 4.5000 M MET
685 | 4.4400 F PHE
686 | 4.5500 P PRO
687 | 4.3300 S SER
688 | 4.7500 T THR
689 | 4.5400 W TRP
690 | 4.7600 Y TYR
691 | 4.6900 V VAL
692 |
693 | 31. Hydrophobicity gain (Ponnuswamy et al. 1980)
694 | Biochim.Biophys.Acta(1980) 623 301-316
695 | 2.6300 A ALA
696 | 2.4500 R ARG
697 | 2.2700 N ASN
698 | 2.2900 D ASP
699 | 3.3600 C CYS
700 | 2.4500 Q GLN
701 | 2.3100 E GLU
702 | 2.5500 G GLY
703 | 2.5700 H HIS
704 | 3.0800 I ILE
705 | 2.9800 L LEU
706 | 2.1200 K LYS
707 | 3.1800 M MET
708 | 3.0200 F PHE
709 | 2.4600 P PRO
710 | 2.6000 S SER
711 | 2.5500 T THR
712 | 2.8500 W TRP
713 | 2.7900 Y TYR
714 | 3.2100 V VAL
715 |
716 | 32. Polarity (Ponnuswamy et al. 1980)
717 | Biochim.Biophys.Acta(1980) 623 301-316
718 | 0.0000 A ALA
719 | 52.000 R ARG
720 | 3.3800 N ASN
721 | 40.700 D ASP
722 | 1.4800 C CYS
723 | 3.5300 Q GLN
724 | 49.910 E GLU
725 | 0.0000 G GLY
726 | 51.600 H HIS
727 | 0.1500 I ILE
728 | 0.4500 L LEU
729 | 49.500 K LYS
730 | 1.4300 M MET
731 | 0.3500 F PHE
732 | 1.5800 P PRO
733 | 1.6700 S SER
734 | 1.6600 T THR
735 | 2.1000 W TRP
736 | 1.6100 Y TYR
737 | 0.1300 V VAL
738 |
739 | 33. Average isotopic mass (Biemann, 1990)
740 | Meth.Enzymol. (1990) v.193, p.888
741 | 70.079 A ALA
742 | 156.188 R ARG
743 | 114.104 N ASN
744 | 115.089 D ASP
745 | 103.144 C CYS
746 | 128.131 Q GLN
747 | 129.116 E GLU
748 | 57.052 G GLY
749 | 137.142 H HIS
750 | 113.160 I ILE
751 | 113.160 L LEU
752 | 128.174 K LYS
753 | 131.198 M MET
754 | 147.177 F PHE
755 | 97.177 P PRO
756 | 87.078 S SER
757 | 101.105 T THR
758 | 186.213 W TRP
759 | 163.170 Y TYR
760 | 99.133 V VAL
761 |
762 | 34.Isoelectric point (Zimmerman et al., 1968)
763 | J. Theor. Biol. (1968) v.21, p.170-201
764 | 6.0000 A ALA
765 | 10.7600 R ARG
766 | 5.4100 N ASN
767 | 2.7700 D ASP
768 | 5.0500 C CYS
769 | 5.6500 Q GLN
770 | 3.2200 E GLU
771 | 5.9700 G GLY
772 | 7.5900 H HIS
773 | 6.0200 I ILE
774 | 5.9800 L LEU
775 | 9.7400 K LYS
776 | 5.7400 M MET
777 | 5.4800 F PHE
778 | 6.3000 P PRO
779 | 5.6800 S SER
780 | 5.6600 T THR
781 | 5.8900 W TRP
782 | 5.6600 Y TYR
783 | 5.9600 V VAL
784 |
785 | 35. pK of side chain (White et al., 1978)
786 | White A,Handler P,Smith EL,Hill RL,Lehman IR;Principles of Biochemistry;1978
787 | 0.0000 A ALA
788 | 12.480 R ARG
789 | 0.0000 N ASN
790 | 3.6500 D ASP
791 | 8.1800 C CYS
792 | 0.0000 Q GLN
793 | 4.2500 E GLU
794 | 0.0000 G GLY
795 | 6.0000 H HIS
796 | 0.0000 I ILE
797 | 0.0000 L LEU
798 | 10.530 K LYS
799 | 0.0000 M MET
800 | 0.0000 F PHE
801 | 0.0000 P PRO
802 | 0.0000 S SER
803 | 0.0000 T THR
804 | 0.0000 W TRP
805 | 10.700 Y TYR
806 | 0.0000 V VAL
807 |
808 | 36. Energy of transfer from water to ethanol kcal/mol (Nozaki & Tanford, 1971)
809 | J.Biol.Chem. (1971) 246 2211-2217
810 | -0.5000 A ALA
811 | -0.1000 R ARG
812 | 0.1100 N ASN
813 | -0.2000 D ASP
814 | -0.5000 C CYS
815 | 0.2000 Q GLN
816 | -0.3000 E GLU
817 | 0.0000 G GLY
818 | -0.4000 H HIS
819 | -2.0000 I ILE
820 | -2.0000 L LEU
821 | -0.3000 K LYS
822 | -1.3000 M MET
823 | -2.5000 F PHE
824 | -1.0000 P PRO
825 | -0.2000 S SER
826 | -0.4000 T THR
827 | -3.0000 W TRP
828 | -2.2000 Y TYR
829 | -1.5000 V VAL
830 |
--------------------------------------------------------------------------------
/pepdata/matrices/helix_vs_coil.txt:
--------------------------------------------------------------------------------
1 | #H ZHAC000103
2 | #D Environment-dependent residue contact energies (rows = helix, cols = coil)
3 | #R PMID:10706611
4 | #A Zhang, C. and Kim, S.H.
5 | #T Environment-dependent residue contact energies for proteins
6 | #J Proc. Natl. Acad. Sci. USA 97, 2550-2555 (2000)
7 | #M rows = ARNDCQEGHILKMFPSTWYV, cols = ARNDCQEGHILKMFPSTWYV
8 | 0.12 1.17 0.84 0.90 -0.81 1.16 1.44 0.10 0.69 -0.81 -0.78 1.16 -0.22 -0.67 0.61 0.47 0.36 -0.72 -0.37 -0.43
9 | 0.98 1.65 1.16 0.60 -0.21 1.26 1.12 1.09 1.16 -0.04 -0.09 2.37 0.47 -0.04 1.22 1.05 0.92 -0.09 0.06 0.32
10 | 0.69 1.16 1.16 1.22 -0.06 1.23 1.45 0.96 0.88 0.26 0.12 1.48 0.32 0.03 1.14 0.73 0.62 0.62 0.53 0.23
11 | 0.90 0.40 1.06 1.45 0.58 1.88 2.18 1.13 0.69 0.43 0.65 0.95 0.75 0.33 1.41 0.39 0.54 -0.10 0.12 0.77
12 | -0.83 0.10 0.40 0.12 -2.65 -0.24 0.96 -0.26 -0.26 -1.61 -1.77 0.80 -1.02 -1.47 -0.31 -0.31 -0.49 -1.30 -0.98 -1.62
13 | 1.13 1.10 1.28 1.37 0.14 1.62 1.84 1.29 1.31 0.05 -0.05 1.50 0.41 0.20 1.14 0.86 0.62 0.45 0.31 0.48
14 | 1.33 0.91 1.33 1.60 0.31 1.60 1.93 1.62 1.01 0.33 0.38 1.12 0.82 0.55 1.54 0.78 0.54 0.23 0.52 0.86
15 | -0.22 0.72 0.27 0.47 -0.95 0.42 1.39 -0.23 0.40 -0.48 -0.81 1.04 -0.62 -0.36 0.41 0.23 -0.04 -0.71 0.08 -0.35
16 | 0.47 0.81 0.95 0.51 -1.56 0.90 0.89 0.86 0.20 -0.43 -0.48 1.31 -0.63 -0.41 0.56 0.40 0.28 -0.20 -0.22 -0.21
17 | -0.58 0.17 0.61 0.46 -1.17 0.24 0.80 0.04 -0.16 -1.64 -1.66 0.87 -0.89 -1.56 -0.27 0.02 -0.32 -1.40 -1.13 -1.36
18 | -0.44 0.20 0.50 0.71 -1.56 0.11 0.82 0.28 -0.15 -1.67 -1.62 0.72 -0.96 -1.55 0.02 0.19 -0.09 -1.46 -0.95 -1.32
19 | 1.07 2.48 1.75 0.98 0.42 1.68 1.04 1.31 1.39 0.41 0.29 2.95 0.98 0.27 1.63 1.51 1.48 0.32 0.60 0.64
20 | -0.22 0.65 0.76 0.88 -0.95 0.68 1.92 0.27 0.31 -1.32 -1.04 1.02 -0.57 -1.60 0.07 0.47 0.04 -1.29 -0.85 -0.82
21 | -0.33 -0.06 0.42 0.42 -1.90 0.25 0.64 0.12 -0.01 -1.64 -1.50 0.58 -1.36 -1.77 -0.30 0.02 0.04 -1.41 -1.36 -1.34
22 | 0.78 1.30 1.31 1.27 -0.04 1.44 1.71 0.69 0.84 0.05 0.15 1.68 0.38 0.27 1.05 1.19 0.83 -0.24 0.23 0.12
23 | 0.46 1.07 1.04 0.73 -0.31 1.47 1.23 0.57 0.58 -0.11 -0.24 1.37 0.08 -0.34 0.76 0.51 0.48 -0.04 0.47 0.18
24 | 0.50 0.90 0.75 0.91 -0.26 1.03 1.25 0.55 0.55 -0.20 -0.26 1.42 0.50 -0.22 0.88 0.69 0.56 0.41 0.11 -0.15
25 | -0.41 -0.06 -0.19 0.32 -0.79 -0.14 0.58 0.07 -0.62 -1.58 -1.16 0.18 -1.03 -1.33 -0.56 0.15 -0.19 -1.83 -0.67 -0.92
26 | -0.22 -0.07 0.52 0.46 -0.87 0.38 0.59 0.40 -0.17 -1.29 -1.15 0.83 -0.98 -1.16 -0.16 0.34 -0.12 -0.79 -0.77 -0.78
27 | -0.51 0.49 0.48 0.67 -1.40 0.66 0.63 -0.06 0.28 -1.25 -1.50 1.14 -0.93 -1.36 -0.04 0.10 -0.01 -1.11 -0.82 -1.14
28 |
29 |
--------------------------------------------------------------------------------
/pepdata/matrices/helix_vs_strand.txt:
--------------------------------------------------------------------------------
1 | #H ZHAC000102
2 | #D Environment-dependent residue contact energies (rows = helix, cols = strand)
3 | #R PMID:10706611
4 | #A Zhang, C. and Kim, S.H.
5 | #T Environment-dependent residue contact energies for proteins
6 | #J Proc. Natl. Acad. Sci. USA 97, 2550-2555 (2000)
7 | #M rows = ARNDCQEGHILKMFPSTWYV, cols = ARNDCQEGHILKMFPSTWYV
8 | -0.94 1.26 0.55 0.76 -1.54 1.14 1.57 -0.78 0.44 -1.59 -1.64 1.91 -0.90 -1.49 0.28 0.20 -0.04 -0.92 -0.75 -1.45
9 | 0.56 1.79 2.31 0.79 -0.67 2.54 0.72 1.09 0.94 -0.01 0.01 3.68 0.89 -0.05 1.37 0.83 1.35 0.00 0.33 0.44
10 | 0.59 2.21 1.82 0.77 -0.90 0.46 3.06 -0.16 0.63 -0.33 0.20 2.43 0.99 0.63 0.54 0.24 0.63 0.11 -0.19 0.23
11 | 0.66 0.76 0.76 1.19 -0.21 1.66 2.22 0.29 0.57 0.59 0.79 1.13 1.41 0.49 1.70 1.03 1.19 1.85 0.18 0.86
12 | -1.75 0.78 -1.00 0.32 -3.64 0.48 0.87 -1.67 -0.62 -2.77 -2.32 0.19 -1.22 -2.67 -1.62 -0.83 -1.14 -0.52 -1.94 -2.35
13 | 0.33 2.15 1.22 1.26 1.37 1.17 2.56 0.92 1.02 0.11 0.00 2.58 0.79 -0.26 0.53 1.19 1.11 0.21 0.39 0.15
14 | 0.82 1.05 2.18 2.11 0.01 2.42 2.58 1.15 0.97 0.20 0.31 1.31 1.25 0.12 2.00 1.09 1.13 0.58 0.31 0.39
15 | -0.40 0.95 0.03 0.14 -1.00 0.34 0.99 -1.32 0.13 -1.40 -1.36 1.58 -0.90 -1.41 0.82 -0.27 0.21 -0.59 -1.27 -1.09
16 | -0.75 2.19 0.13 0.68 -1.37 1.98 1.13 0.01 1.52 -0.83 -0.58 2.26 -0.82 -1.01 0.53 -0.17 0.02 -49.00 -0.61 -0.56
17 | -1.99 0.25 -0.20 1.00 -2.44 -0.12 0.88 -1.54 -0.05 -2.64 -2.33 0.75 -1.85 -2.46 -1.06 -0.59 -0.65 -1.82 -1.88 -2.45
18 | -2.02 0.34 -0.04 0.13 -2.29 0.24 0.73 -1.27 -0.46 -2.53 -2.44 0.67 -1.80 -2.28 -1.29 -0.40 -0.34 -1.76 -1.66 -2.26
19 | 0.60 3.11 2.23 1.06 0.50 1.80 1.65 0.82 1.25 0.10 0.34 3.51 0.98 -0.21 1.15 2.09 1.30 -0.14 0.28 0.13
20 | -1.54 -0.06 -0.63 1.76 -2.51 0.14 0.72 -1.74 0.07 -2.27 -2.22 1.27 -1.77 -1.87 0.34 -0.02 -0.21 -0.93 -1.54 -1.81
21 | -2.12 0.33 -0.70 0.17 -2.30 -0.59 0.26 -1.60 -0.88 -2.53 -2.44 -0.42 -1.83 -2.68 -1.40 -0.82 -0.61 -1.63 -1.83 -2.25
22 | 0.63 2.43 -0.19 1.31 -1.63 1.46 1.91 0.08 1.11 -0.20 0.47 1.94 -0.34 0.15 0.57 0.00 1.15 0.06 0.26 -0.06
23 | -0.41 0.88 1.02 1.04 -0.21 1.27 0.94 0.04 0.75 -0.48 -0.67 2.28 0.45 -0.92 0.75 0.50 0.96 0.22 -0.19 -0.54
24 | -0.32 1.48 0.35 0.43 -1.44 0.38 1.36 -0.38 0.20 -1.14 -1.00 1.38 -0.35 -0.97 -0.05 -0.16 0.29 -0.53 -0.76 -0.73
25 | -1.85 0.45 -0.03 0.80 -1.64 -0.23 0.11 -0.95 0.67 -1.58 -2.13 0.61 -1.75 -1.59 -1.07 -0.34 -0.40 -1.29 -1.27 -1.79
26 | -0.88 -0.20 -0.29 0.14 -1.31 0.09 0.71 -0.56 -0.57 -1.66 -1.38 1.40 -1.60 -1.97 -0.73 -0.32 -0.37 -1.40 -0.96 -1.38
27 | -1.74 0.85 0.24 0.72 -2.25 0.45 0.81 -1.29 -0.24 -2.46 -2.38 0.37 -1.21 -2.16 -1.00 -0.10 -0.57 -1.34 -1.52 -2.31
28 |
--------------------------------------------------------------------------------
/pepdata/matrices/pmbec.mat:
--------------------------------------------------------------------------------
1 | A C D E F G H I K L M N P Q R S T V W Y
2 | A 0.322860152036 0.0113750373506 -0.0156239175966 -0.00259952715456 -0.0508792185716 0.0382679273874 -0.0832539299638 -0.00196691041626 -0.0103729638696 -0.042393907322 -0.0651042403697 -0.0853704925231 0.0757409633086 -0.0483151514798 -0.0136431408498 0.038455041596 0.0520376087986 0.081101427454 -0.125564718844 -0.0747500389698
3 | C 0.0113750373506 0.100680270274 0.0102951033136 0.0147570340938 0.0345785831581 0.00933463557214 -0.00750101609651 0.00476007239717 -0.0459237939975 -0.0182998264075 -0.0155971113182 0.0021128481374 -0.00860770840682 -0.0309903425175 -0.0482562439545 -0.0217965163697 -0.0227322740574 -0.0154276574266 0.0412325888637 0.00600631739163
4 | D -0.0156239175966 0.0102951033136 0.157208255034 0.0724279735923 -0.0189545540921 -0.00870389879389 -0.0180188107498 -0.0283467966687 -0.0634240071162 -0.0279979457557 -0.0241192288182 0.0194310374127 0.042784078891 0.000437307476866 -0.0591268568576 -0.0104660502173 0.00656101264316 -0.0193560886308 0.00415097887978 -0.0191575919464
5 | E -0.00259952715456 0.0147570340938 0.0724279735923 0.131775168933 -0.00519060032543 -0.00547805492393 -0.0335600821273 -0.0135417817213 -0.069471604426 0.00353800457557 -0.017166710134 0.00534055417468 0.022589833552 0.0281404974641 -0.0697402405064 -0.0172364513778 -0.0054830504799 -0.00806269508269 -0.00791955104235 -0.0231187170833
6 | F -0.0508792185716 0.0345785831581 -0.0189545540921 -0.00519060032543 0.259179996995 -0.00445131805782 -0.00639743486807 0.0628717025094 -0.049227253611 0.0488666377736 0.0315353570161 -0.0223593028205 -0.0919732521492 -0.0930189756622 -0.0626297946351 -0.0868415233743 -0.0777292391855 -0.015794520965 0.0625957490761 0.0858189617896
7 | G 0.0382679273874 0.00933463557214 -0.00870389879389 -0.00547805492393 -0.00445131805782 0.122499934434 -0.025558278086 -0.0207027221208 -0.0137316756786 -0.0326424665142 -0.0264215095016 -0.00403752148352 0.0094352664965 0.00425299048772 -0.0232280105465 0.0304312733191 0.00861853592388 -0.0127217072682 -0.0246539147339 -0.0205094859119
8 | H -0.0832539299638 -0.00750101609651 -0.0180188107498 -0.0335600821273 -0.00639743486807 -0.025558278086 0.207657765989 -0.0888505073496 0.0761447053198 -0.0351727012494 -0.000760393877348 0.0353903619255 -0.0682087048807 -0.00886454093107 0.109052662874 0.00938179429131 -0.0234122309305 -0.0870188771708 0.0123622841944 0.0365879336865
9 | I -0.00196691041626 0.00476007239717 -0.0283467966687 -0.0135417817213 0.0628717025094 -0.0207027221208 -0.0888505073496 0.27773187827 -0.0381642025534 0.0886112938313 0.0551293441776 -0.0593694184462 -0.039207153398 -0.0626883806129 -0.110160438997 -0.0618078497671 -0.0339233811197 0.091300054417 0.00138488610169 -0.0230596885329
10 | K -0.0103729638696 -0.0459237939975 -0.0634240071162 -0.069471604426 -0.049227253611 -0.0137316756786 0.0761447053198 -0.0381642025534 0.273355694189 -0.0177282663533 -0.00817300753785 -0.0339854863013 -0.0484016323395 -0.0331603641198 0.21516548555 0.00476731287861 -0.0331318604828 -0.0400367780545 -0.0598522551401 -0.00464804635586
11 | L -0.042393907322 -0.0182998264075 -0.0279979457557 0.00353800457557 0.0488666377736 -0.0326424665142 -0.0351727012494 0.0886112938313 -0.0177282663533 0.162738321535 0.0750528874999 -0.0111666419731 -0.051023845781 -0.00134001844501 -0.074934598492 -0.0584956357369 -0.031311528799 0.0449678806271 -0.00754567267671 -0.0137219703366
12 | M -0.0651042403697 -0.0155971113182 -0.0241192288182 -0.017166710134 0.0315353570161 -0.0264215095016 -0.000760393877348 0.0551293441776 -0.00817300753785 0.0750528874999 0.156957428383 0.00753829785887 -0.091647674076 0.00190198496329 -0.0257018542091 -0.0295349216339 -0.0454820084051 -0.0120310888206 0.0210041287765 0.0126203200265
13 | N -0.0853704925231 0.0021128481374 0.0194310374127 0.00534055417468 -0.0223593028205 -0.00403752148352 0.0353903619255 -0.0593694184462 -0.0339854863013 -0.0111666419731 0.00753829785887 0.151487423988 -0.0106077901881 0.0413965183445 -0.0338327997913 0.0170820288313 -0.00295174153884 -0.0436807942705 0.0296409813073 -0.00205806264345
14 | P 0.0757409633086 -0.00860770840682 0.042784078891 0.022589833552 -0.0919732521492 0.0094352664965 -0.0682087048807 -0.039207153398 -0.0484016323395 -0.051023845781 -0.091647674076 -0.0106077901881 0.354629507834 0.0481497903134 -0.0377142358446 -0.00687173098621 0.0199181111388 0.0225294243984 -0.0525069717881 -0.0890062760945
15 | Q -0.0483151514798 -0.0309903425175 0.000437307476866 0.0281404974641 -0.0930189756622 0.00425299048772 -0.00886454093107 -0.0626883806129 -0.0331603641198 -0.00134001844501 0.00190198496329 0.0413965183445 0.0481497903134 0.177175171536 0.00715630304762 0.0357241930907 0.027467611659 -0.032780800211 -0.0118972341632 -0.0487465602402
16 | R -0.0136431408498 -0.0482562439545 -0.0591268568576 -0.0697402405064 -0.0626297946351 -0.0232280105465 0.109052662874 -0.110160438997 0.21516548555 -0.074934598492 -0.0257018542091 -0.0338327997913 -0.0377142358446 0.00715630304762 0.389022190137 0.0204288367942 -0.0408668326839 -0.0934989556047 -0.0557627605155 0.00827128508526
17 | S 0.038455041596 -0.0217965163697 -0.0104660502173 -0.0172364513778 -0.0868415233743 0.0304312733191 0.00938179429131 -0.0618078497671 0.00476731287861 -0.0584956357369 -0.0295349216339 0.0170820288313 -0.00687173098621 0.0357241930907 0.0204288367942 0.161573840097 0.0839261885951 -0.00816241136786 -0.0444334801409 -0.0561239385213
18 | T 0.0520376087986 -0.0227322740574 0.00656101264316 -0.0054830504799 -0.0777292391855 0.00861853592388 -0.0234122309305 -0.0339233811197 -0.0331318604828 -0.031311528799 -0.0454820084051 -0.00295174153884 0.0199181111388 0.027467611659 -0.0408668326839 0.0839261885951 0.142525860495 0.0493244941272 -0.0264928932645 -0.0468623824337
19 | V 0.081101427454 -0.0154276574266 -0.0193560886308 -0.00806269508269 -0.015794520965 -0.0127217072682 -0.0870188771708 0.091300054417 -0.0400367780545 0.0449678806271 -0.0120310888206 -0.0436807942705 0.0225294243984 -0.032780800211 -0.0934989556047 -0.00816241136786 0.0493244941272 0.172778293246 -0.0289445753682 -0.0444846240282
20 | W -0.125564718844 0.0412325888637 0.00415097887978 -0.00791955104235 0.0625957490761 -0.0246539147339 0.0123622841944 0.00138488610169 -0.0598522551401 -0.00754567267671 0.0210041287765 0.0296409813073 -0.0525069717881 -0.0118972341632 -0.0557627605155 -0.0444334801409 -0.0264928932645 -0.0289445753682 0.194048086876 0.0791543436022
21 | Y -0.0747500389698 0.00600631739163 -0.0191575919464 -0.0231187170833 0.0858189617896 -0.0205094859119 0.0365879336865 -0.0230596885329 -0.00464804635586 -0.0137219703366 0.0126203200265 -0.00205806264345 -0.0890062760945 -0.0487465602402 0.00827128508526 -0.0561239385213 -0.0468623824337 -0.0444846240282 0.0791543436022 0.237788221516
22 |
--------------------------------------------------------------------------------
/pepdata/matrices/strand_vs_coil.txt:
--------------------------------------------------------------------------------
1 | #H ZHAC000105
2 | #D Environment-dependent residue contact energies (rows = strand, cols = coil)
3 | #R PMID:10706611
4 | #A Zhang, C. and Kim, S.H.
5 | #T Environment-dependent residue contact energies for proteins
6 | #J Proc. Natl. Acad. Sci. USA 97, 2550-2555 (2000)
7 | #M rows = ARNDCQEGHILKMFPSTWYV, cols = ARNDCQEGHILKMFPSTWYV
8 | -0.57 0.47 0.30 0.62 -1.60 0.45 0.61 -0.24 0.07 -1.64 -1.63 0.62 -1.03 -1.55 -0.11 -0.10 -0.34 -1.44 -0.39 -1.55
9 | 0.23 0.79 0.76 0.39 -0.41 0.92 0.76 0.52 0.51 -0.30 0.13 1.58 0.88 -0.07 0.60 0.65 0.37 0.14 0.32 0.17
10 | -0.28 0.74 0.57 0.87 -0.68 0.52 1.00 -0.07 0.32 -0.31 -0.08 0.87 0.29 -0.17 0.57 0.11 0.19 0.04 0.24 -0.23
11 | 0.15 -0.25 0.46 0.69 -0.46 0.41 1.34 0.56 -0.51 -0.23 0.27 0.59 0.60 -0.38 1.02 0.08 0.05 -0.48 0.02 0.34
12 | -1.19 -0.46 0.21 0.51 -3.30 0.26 0.20 -1.03 -0.72 -1.55 -1.71 0.27 -1.24 -1.70 -0.50 -0.55 -0.97 -0.67 -1.26 -1.62
13 | 0.63 1.18 0.92 1.37 -0.30 0.93 1.27 0.56 0.91 -0.28 -0.11 0.98 0.15 -0.30 0.64 0.88 0.68 -0.44 0.66 0.15
14 | 0.97 0.89 1.37 1.89 0.30 1.25 2.34 0.98 0.58 0.20 0.50 0.67 1.23 0.58 1.26 0.95 1.06 0.04 0.87 0.48
15 | -0.64 0.12 0.27 0.31 -1.37 0.38 0.98 -0.40 -0.12 -1.58 -1.40 0.78 -0.46 -1.38 -0.21 0.05 -0.26 -1.41 -0.61 -1.13
16 | -0.02 0.75 0.68 0.14 -0.58 0.73 0.84 0.41 -0.64 -0.75 0.03 1.46 -0.16 -0.49 0.52 0.31 -0.11 -1.00 -0.58 0.03
17 | -0.94 -0.14 0.31 0.26 -1.70 0.07 0.46 -0.37 -0.50 -1.88 -1.79 0.84 -0.99 -1.82 -0.47 -0.05 -0.54 -1.65 -1.09 -1.64
18 | -0.76 0.32 0.43 0.25 -1.63 0.22 0.68 -0.17 -0.40 -1.84 -1.70 0.47 -1.06 -1.76 -0.39 0.09 -0.42 -1.81 -1.15 -1.64
19 | 1.02 1.99 1.18 0.59 0.08 1.10 0.60 0.61 0.95 0.24 0.34 2.69 0.97 -0.03 1.23 1.07 0.83 0.00 0.26 0.36
20 | -0.16 0.83 0.47 0.92 -1.63 0.36 0.71 -0.20 0.90 -1.00 -1.12 1.55 -0.31 -1.35 -0.01 0.34 0.20 -1.70 -0.60 -0.79
21 | -0.70 0.03 0.63 0.15 -1.26 0.29 0.35 -0.11 -0.36 -1.73 -1.55 0.71 -0.97 -1.55 -0.28 -0.09 -0.32 -1.23 -0.91 -1.30
22 | 0.17 0.50 0.60 0.67 -1.31 0.50 0.94 0.02 -0.45 -1.26 -0.91 1.08 0.83 -0.87 0.63 0.31 0.26 -0.50 -0.55 -0.79
23 | -0.06 0.99 0.73 0.86 -0.89 0.85 0.67 0.08 0.06 -0.22 -0.29 0.94 -0.08 -0.41 0.67 0.33 0.13 -1.01 0.13 -0.24
24 | 0.26 0.93 0.70 0.87 -0.78 0.58 1.20 0.12 0.52 -0.30 -0.24 1.11 0.01 -0.08 0.65 0.47 0.41 -0.31 0.12 -0.32
25 | -0.03 -0.11 0.27 0.66 -1.50 0.65 0.50 -0.12 -0.32 -1.13 -1.01 0.52 -1.08 -1.04 -0.32 -0.03 -0.10 -0.67 -0.73 -0.64
26 | -0.44 0.20 0.20 0.20 -1.26 0.16 0.10 -0.21 -0.52 -1.26 -1.30 0.60 -0.76 -1.17 -0.42 0.05 -0.27 -1.20 -0.75 -0.84
27 | -0.83 0.20 0.48 0.62 -1.44 0.17 0.73 -0.12 -0.26 -1.64 -1.59 0.52 -0.70 -1.55 -0.28 0.12 -0.17 -1.16 -0.85 -1.42
--------------------------------------------------------------------------------
/pepdata/peptide_vectorizer.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2014-2016. Mount Sinai School of Medicine
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | from __future__ import print_function, division, absolute_import
17 |
18 | import numpy as np
19 | from sklearn.feature_extraction.text import CountVectorizer
20 | from sklearn.preprocessing import normalize
21 |
def make_count_vectorizer(reduced_alphabet, max_ngram):
    """
    Build an unfitted character n-gram CountVectorizer for peptide strings.

    Parameters
    ----------
    reduced_alphabet : dict or None
        Mapping from each amino acid letter to the letter representing its
        reduced group. When given, every sequence is translated through this
        mapping before n-grams are counted (a letter missing from the mapping
        raises KeyError). None leaves preprocessing to CountVectorizer.

    max_ngram : int
        Longest n-gram to count; all lengths from 1 to max_ngram are used.

    Returns
    -------
    CountVectorizer configured to produce float64 counts.
    """
    if reduced_alphabet is None:
        preprocessor = None
    else:
        preprocessor = lambda s: "".join([reduced_alphabet[si] for si in s])

    return CountVectorizer(
        analyzer='char',
        ngram_range=(1, max_ngram),
        # np.float was only an alias of the builtin float (float64) and has
        # been removed from NumPy, so spell the dtype out explicitly.
        dtype=np.float64,
        preprocessor=preprocessor)
33 |
class PeptideVectorizer(object):
    """
    Make n-gram frequency vectors from peptide sequences

    Parameters
    ----------
    max_ngram : int
        Longest character n-gram to count (lengths 1..max_ngram are used).

    normalize_row : bool
        If True, every row of the count matrix is L1-normalized.

    reduced_alphabet : dict or None
        Optional mapping from amino acid letter to reduced-alphabet letter,
        applied to each sequence before counting.

    training_already_reduced : bool
        If True, the sequences given to fit/fit_transform are assumed to be
        already expressed in the reduced alphabet, so the vocabulary is
        learned without applying `reduced_alphabet` to them.
    """
    def __init__(
            self,
            max_ngram=1,
            normalize_row=True,
            reduced_alphabet=None,
            training_already_reduced=False):
        self.reduced_alphabet = reduced_alphabet
        self.max_ngram = max_ngram
        self.normalize_row = normalize_row
        self.training_already_reduced = training_already_reduced
        # Created by fit_transform; None means "not fitted yet".
        self.count_vectorizer = None

    def __getstate__(self):
        """Return the attributes to pickle as a plain dictionary."""
        return {
            'reduced_alphabet': self.reduced_alphabet,
            'count_vectorizer': self.count_vectorizer,
            'training_already_reduced': self.training_already_reduced,
            'normalize_row': self.normalize_row,
            'max_ngram': self.max_ngram,
        }

    def _maybe_normalize(self, X):
        """L1-normalize the rows of X when normalize_row is set."""
        if self.normalize_row:
            # .todense() yields an np.matrix, which recent scikit-learn
            # versions refuse to validate; hand normalize() a plain ndarray.
            X = normalize(np.asarray(X), norm='l1')
        return X

    def fit_transform(self, amino_acid_strings):
        """
        Learn the n-gram vocabulary from the given sequences and return
        their (optionally row-normalized) dense count matrix.
        """
        self.count_vectorizer = \
            make_count_vectorizer(self.reduced_alphabet, self.max_ngram)

        if self.training_already_reduced:
            # Training data is already in the reduced alphabet: learn the
            # vocabulary with a vectorizer that does no alphabet reduction,
            # then share it with the reducing vectorizer used by transform().
            # NOTE(review): `c` has no custom preprocessor, so
            # CountVectorizer's default preprocessing applies while fitting,
            # whereas self.count_vectorizer uses the reduced_alphabet
            # preprocessor verbatim - confirm the learned vocabulary matches
            # the n-grams transform() produces.
            c = make_count_vectorizer(None, self.max_ngram)
            X = c.fit_transform(amino_acid_strings).todense()
            self.count_vectorizer.vocabulary_ = c.vocabulary_
        else:
            c = self.count_vectorizer
            X = c.fit_transform(amino_acid_strings).todense()

        return self._maybe_normalize(X)

    def fit(self, amino_acid_strings):
        """
        Learn the n-gram vocabulary from the given sequences.

        Returns self so calls can be chained (previously returned None).
        """
        self.fit_transform(amino_acid_strings)
        return self

    def transform(self, amino_acid_strings):
        """
        Vectorize sequences with the vocabulary learned by fit/fit_transform.

        Raises AssertionError if the vectorizer has not been fitted.
        """
        assert self.count_vectorizer, "Must call 'fit' before 'transform'"
        X = self.count_vectorizer.transform(amino_acid_strings).todense()
        return self._maybe_normalize(X)
84 |
--------------------------------------------------------------------------------
/pepdata/pmbec.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2014-2016. Mount Sinai School of Medicine
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from __future__ import print_function, division, absolute_import
16 | from os.path import join
17 |
18 | from .static_data import MATRIX_DIR
19 |
20 | from .amino_acid_alphabet import dict_to_amino_acid_matrix
21 |
def read_pmbec_coefficients(
        key_type='row',
        verbose=True,
        filename=join(MATRIX_DIR, 'pmbec.mat')):
    """
    Read a PMBEC coefficient matrix from a whitespace-delimited text file.

    The first non-blank line is a header of 20 single-letter residue names;
    each following line holds a row letter followed by 20 numeric values.

    Parameters
    ------------

    filename : str
        Location of PMBEC coefficient matrix

    key_type : str
        'row' : every key is a single amino acid,
            which maps to a dictionary for that row
        'pair' : every key is a tuple of amino acids
        'pair_string' : every key is a string of two amino acid characters

    verbose : bool
        Print the header line and the parsed residue list as we read them

    Returns
    -------
    dict keyed as selected by `key_type`, with float values.

    Raises
    ------
    AssertionError
        If key_type is invalid, the header does not name 20 residues, or a
        row does not contain a letter plus 20 values.
    """
    d = {}
    if key_type == 'row':
        def add_pair(row_letter, col_letter, value):
            d.setdefault(row_letter, {})[col_letter] = value
    elif key_type == 'pair':
        def add_pair(row_letter, col_letter, value):
            d[(row_letter, col_letter)] = value

    else:
        assert key_type == 'pair_string', \
            "Invalid dictionary key type: %s" % key_type

        def add_pair(row_letter, col_letter, value):
            d["%s%s" % (row_letter, col_letter)] = value

    with open(filename, 'r') as f:
        # Drop blank and whitespace-only lines so stray trailing whitespace
        # does not get mistaken for a malformed row.
        lines = [line for line in f.read().split('\n') if line.strip()]
        header = lines[0]
        if verbose:
            print(header)
        residues = [x for x in header.split() if len(x) == 1]
        assert len(residues) == 20
        if verbose:
            print(residues)
        for line in lines[1:]:
            # split() with no argument accepts any run of spaces or tabs,
            # matching how the header line is tokenized.
            cols = line.split()
            assert len(cols) == 21, "Expected 20 values + letter, got %s" % cols
            row_letter = cols[0]
            for col_letter, col in zip(residues, cols[1:]):
                add_pair(row_letter, col_letter, float(col))
    return d
85 |
# dictionary of PMBEC coefficient accessed like pmbec_dict["V"]["R"]
# NOTE: read_pmbec_coefficients defaults to verbose=True, so importing this
# module prints the matrix header and the residue list to stdout.
pmbec_dict = read_pmbec_coefficients(key_type="row")
# Same coefficients passed through dict_to_amino_acid_matrix (imported from
# amino_acid_alphabet); presumably a 2D array indexed by residue - TODO confirm.
pmbec_matrix = dict_to_amino_acid_matrix(pmbec_dict)
89 |
--------------------------------------------------------------------------------
/pepdata/reduced_alphabet.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2014-2018. Mount Sinai School of Medicine
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """
16 | Amino acid groupings from
17 | 'Reduced amino acid alphabets improve the sensitivity...' by
18 | Peterson, Kondev, et al.
19 | http://www.rpgroup.caltech.edu/publications/Peterson2008.pdf
20 | """
21 | from __future__ import print_function, division, absolute_import
22 |
def dict_from_list(groups):
    """
    Map every amino acid letter to the representative of its group.

    Each element of `groups` is a sequence of letters whose first letter
    stands for the whole group. If a letter occurs in several groups, the
    last group containing it wins.
    """
    return {
        letter: members[0]
        for members in groups
        for letter in members
    }
29 |
# Each alphabet below is a dict mapping an amino acid letter to the first
# letter of the group it belongs to (see dict_from_list).

# 4 groups
gbmr4 = dict_from_list(["ADKERNTSQ", "YFLIVMCWH", "G", "P"])

# 12 groups
sdm12 = dict_from_list([
    "A", "D", "KER", "N", "TSQ", "YF", "LIVM", "C", "W", "H", "G", "P"
])

# 17 groups
hsdm17 = dict_from_list([
    "A", "D", "KE", "R", "N", "T", "S", "Q", "Y",
    "F", "LIV", "M", "C", "W", "H", "G", "P"
])

"""
Other alphabets from
http://bio.math-inf.uni-greifswald.de/viscose/html/alphabets.html
"""

# hydrophilic vs. hydrophobic
hp2 = dict_from_list(["AGTSNQDEHRKP", "CMFILVWY"])

# 10 groups
murphy10 = dict_from_list([
    "LVIM", "C", "A", "G", "ST", "P", "FYW", "EDNQ", "KR", "H"
])

# 6 groups
alex6 = dict_from_list(["C", "G", "P", "FYW", "AVILM", "STNQRHKDE"])

# 2 groups: F/H/W/Y vs. every other residue
aromatic2 = dict_from_list(["FHWY", "ADKERNTSQLIVMCGP"])

# 4 groups: H on its own, CMILV, FWY, and the remaining residues
hp_vs_aromatic = dict_from_list(["H", "CMILV", "FWY", "ADKERNTSQGP"])
58 |
--------------------------------------------------------------------------------
/pepdata/residue_contact_energies.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | from __future__ import print_function, division, absolute_import
14 |
15 | from os.path import join
16 |
17 | from .amino_acid_alphabet import canonical_amino_acid_letters, dict_to_amino_acid_matrix
18 | from .static_data import MATRIX_DIR
19 |
20 |
def parse_interaction_table(table, amino_acid_order="ARNDCQEGHILKMFPSTWYV"):
    """
    Parse a 20x20 whitespace-delimited table of residue interaction values.

    Parameters
    ----------
    table : str
        Text of the table. Blank lines and lines starting with "#" are
        ignored; every remaining line must hold 20 numbers separated by
        any amount of spaces or tabs.

    amino_acid_order : str
        Letters labelling the rows and the columns, in file order.

    Returns
    -------
    dict of dict, such that d[row_letter][col_letter] is a float.

    Raises
    ------
    AssertionError
        If the table does not have exactly 20 data rows of 20 values each.
    ValueError
        If an entry cannot be converted to float.
    """
    lines = [line.strip() for line in table.split("\n")]
    lines = [line for line in lines if line and not line.startswith("#")]
    assert len(lines) == 20, "Malformed amino acid interaction table"
    d = {}
    for i, line in enumerate(lines):
        # split() with no argument collapses runs of spaces and tabs itself,
        # so no separate (and potentially non-terminating) whitespace
        # squeezing pass over the whole table is required.
        coeff_strings = line.split()
        assert len(coeff_strings) == 20, \
            "Malformed row in amino acid interaction table"
        x = amino_acid_order[i]
        d[x] = {}
        for j, coeff_str in enumerate(coeff_strings):
            d[x][amino_acid_order[j]] = float(coeff_str)
    return d
41 |
def transpose_interaction_dict(d):
    """
    Swap the row and column roles of a nested interaction dictionary.

    Returns a new dict in which result[x][y] == d[y][x] for every pair of
    letters in canonical_amino_acid_letters; `d` itself is not modified.
    """
    letters = canonical_amino_acid_letters
    return {
        row: {col: d[col][row] for col in letters}
        for row in letters
    }
49 |
50 |
# Strand vs. Coil: parsed from the bundled table
with open(join(MATRIX_DIR, 'strand_vs_coil.txt'), 'r') as f:
    strand_vs_coil_dict = parse_interaction_table(f.read())
strand_vs_coil_array = dict_to_amino_acid_matrix(strand_vs_coil_dict)

# Coil vs. Strand: transpose of the table above
coil_vs_strand_dict = transpose_interaction_dict(strand_vs_coil_dict)
coil_vs_strand_array = dict_to_amino_acid_matrix(coil_vs_strand_dict)

# Helix vs. Strand: parsed from the bundled table
with open(join(MATRIX_DIR, 'helix_vs_strand.txt'), 'r') as f:
    helix_vs_strand_dict = parse_interaction_table(f.read())
helix_vs_strand_array = dict_to_amino_acid_matrix(helix_vs_strand_dict)

# Strand vs. Helix: transpose of the table above
strand_vs_helix_dict = transpose_interaction_dict(helix_vs_strand_dict)
strand_vs_helix_array = dict_to_amino_acid_matrix(strand_vs_helix_dict)

# Helix vs. Coil: parsed from the bundled table
with open(join(MATRIX_DIR, 'helix_vs_coil.txt'), 'r') as f:
    helix_vs_coil_dict = parse_interaction_table(f.read())
helix_vs_coil_array = dict_to_amino_acid_matrix(helix_vs_coil_dict)

# Coil vs. Helix: transpose of the table above
coil_vs_helix_dict = transpose_interaction_dict(helix_vs_coil_dict)
coil_vs_helix_array = dict_to_amino_acid_matrix(coil_vs_helix_dict)
77 |
--------------------------------------------------------------------------------
/pepdata/static_data.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 |
14 | from __future__ import print_function, division, absolute_import
15 | from os.path import dirname, realpath, join
16 |
# Directory containing this module, with symbolic links resolved.
PACKAGE_DIR = dirname(realpath(__file__))
# "matrices" subdirectory of the package, where the bundled data tables
# read by other modules (e.g. pmbec.mat) are looked up.
MATRIX_DIR = join(PACKAGE_DIR, 'matrices')
19 |
--------------------------------------------------------------------------------
/pepdata/version.py:
--------------------------------------------------------------------------------
__version__ = "1.2.0"


def print_version():
    """Print the package version to stdout, prefixed with "v"."""
    # %-formatting instead of an f-string keeps this module importable on
    # interpreters without f-string support (Python < 3.6), in line with the
    # __future__-based compatibility used by the rest of the package.
    print("v%s" % __version__)

if __name__ == "__main__":
    print_version()
--------------------------------------------------------------------------------
/pylintrc:
--------------------------------------------------------------------------------
1 | [TYPECHECK]
2 | # Without ignoring this, we get errors like:
3 | # E:249,20: Module 'numpy' has no 'nan' member (no-member)
4 | ignored-modules = numpy,numpy.random
5 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy>=1.7
2 | scipy>=0.9
3 | pandas>=0.17
4 | scikit-learn>=0.14.1
5 | progressbar33
6 | biopython>=1.65
7 | datacache>=0.4.4
8 | lxml
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2014-2018. Mount Sinai School of Medicine
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | from __future__ import print_function, division, absolute_import
17 | import os
18 | import re
19 |
20 | from setuptools import setup, find_packages
21 |
# Directory containing this script; the README and the package sources are
# located relative to it rather than to the current working directory.
readme_dir = os.path.dirname(__file__)
readme_path = os.path.join(readme_dir, 'README.md')

# Loading the README is best-effort: fall back to an empty description, but
# do not swallow KeyboardInterrupt/SystemExit the way a bare except would.
try:
    with open(readme_path, 'r') as f:
        readme_markdown = f.read()
except Exception:
    print("Failed to load README file")
    readme_markdown = ""

# Converting to reStructuredText is optional: pypandoc may be missing or
# may fail, in which case the markdown text is used unchanged.
try:
    import pypandoc
    readme_restructured = pypandoc.convert(readme_markdown, to='rst', format='md')
except Exception:
    readme_restructured = readme_markdown
    print("Conversion of long_description from markdown to reStructuredText failed, skipping...")

# Read __version__ out of the package without importing it, so that setup
# does not require the package's dependencies to be installed.
with open(os.path.join(readme_dir, 'pepdata', '__init__.py'), 'r') as f:
    version_match = re.search(
        r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]',
        f.read(),
        re.MULTILINE)
if version_match is None:
    raise RuntimeError("Unable to find __version__ in pepdata/__init__.py")
version = version_match.group(1)

if __name__ == '__main__':
    setup(
        name='pepdata',
        version=version,
        description="Immunological peptide datasets and amino acid properties",
        author="Alex Rubinsteyn",
        author_email="alex.rubinsteyn@mssm.edu",
        url="https://github.com/openvax/pepdata",
        license="http://www.apache.org/licenses/LICENSE-2.0.html",
        classifiers=[
            'Development Status :: 3 - Alpha',
            'Environment :: Console',
            'Operating System :: OS Independent',
            'Intended Audience :: Science/Research',
            'License :: OSI Approved :: Apache Software License',
            'Programming Language :: Python',
            'Topic :: Scientific/Engineering :: Bio-Informatics',
        ],
        install_requires=[
            'numpy>=1.7',
            'scipy>=0.9',
            'pandas>=0.17',
            'scikit-learn>=0.14.1',
            'progressbar33',
            'biopython>=1.65',
            'datacache>=0.4.4',
            'lxml',
        ],
        long_description=readme_restructured,
        # exclude takes a sequence of package-name patterns; a bare string
        # would be iterated character by character and exclude nothing useful.
        packages=find_packages(exclude=["test", "tests", "tests.*"]),
        include_package_data=True
    )
77 |
--------------------------------------------------------------------------------
/test.sh:
--------------------------------------------------------------------------------
1 | pytest --cov=pepdata/ --cov-report=term-missing tests
2 |
3 |
4 |
--------------------------------------------------------------------------------
/tests/test_amino_acids.py:
--------------------------------------------------------------------------------
1 | from nose.tools import eq_
2 | from pepdata.amino_acid_alphabet import (
3 | canonical_amino_acids,
4 | canonical_amino_acid_letters,
5 | extended_amino_acids,
6 | extended_amino_acid_letters,
7 | )
8 |
def test_canonical_amino_acids():
    """The canonical amino acid collection has exactly 20 entries."""
    n_canonical = len(canonical_amino_acids)
    assert n_canonical == 20
11 |
def test_canonical_amino_acids_letters():
    """Canonical letters: 20 of them, no "X", same order as the amino acid objects."""
    letters = canonical_amino_acid_letters
    assert len(letters) == 20
    assert "X" not in letters
    letters_from_objects = [amino_acid.letter for amino_acid in canonical_amino_acids]
    eq_(letters_from_objects, letters)
17 |
def test_extended_amino_acids():
    """The extended amino acid collection has more than 20 entries."""
    n_extended = len(extended_amino_acids)
    assert n_extended > 20
20 |
def test_extended_amino_acids_letters():
    """Extended letters: more than 20, include "X" and "J", same order as the objects."""
    letters = extended_amino_acid_letters
    assert len(letters) > 20
    assert "X" in letters
    assert "J" in letters
    letters_from_objects = [amino_acid.letter for amino_acid in extended_amino_acids]
    eq_(letters_from_objects, letters)
27 |
--------------------------------------------------------------------------------
/tests/test_blosum.py:
--------------------------------------------------------------------------------
1 | from pepdata.blosum import (
2 | blosum30_dict,
3 | blosum30_matrix,
4 | blosum50_dict,
5 | blosum50_matrix,
6 | blosum62_dict,
7 | blosum62_matrix
8 | )
9 |
def test_blosum30():
    """Placeholder with no assertions; only the module-level import of the BLOSUM30 names is exercised."""
12 |
def test_blosum50():
    """Placeholder with no assertions; only the module-level import of the BLOSUM50 names is exercised."""
15 |
def test_blosum62():
    """Placeholder with no assertions; only the module-level import of the BLOSUM62 names is exercised."""
--------------------------------------------------------------------------------
/tests/test_iedb_alleles.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 |
14 | from __future__ import print_function, division, absolute_import
15 |
16 | from nose.tools import eq_
17 |
18 | from pepdata import iedb
19 |
def test_iedb_human_class1_allele():
    """HLA-C*07:02 is reported as MHC class I at locus C."""
    alleles_by_name = iedb.alleles.load_alleles_dict()
    hla_c = alleles_by_name["HLA-C*07:02"]
    eq_(hla_c.mhc_class, "I")
    eq_(hla_c.locus, "C")
25 |
def test_iedb_human_class2_allele():
    """HLA-DRA*01:01/DRB1*04:04 is reported as MHC class II at locus DR."""
    alleles_by_name = iedb.alleles.load_alleles_dict()
    hla_dr = alleles_by_name["HLA-DRA*01:01/DRB1*04:04"]
    eq_(hla_dr.mhc_class, "II")
    eq_(hla_dr.locus, "DR")
31 |
32 |
def test_iedb_mouse_class1_allele():
    """H-2-Ds is reported as MHC class I at locus D."""
    alleles_by_name = iedb.alleles.load_alleles_dict()
    h2_ds = alleles_by_name["H-2-Ds"]
    eq_(h2_ds.mhc_class, "I")
    eq_(h2_ds.locus, "D")
38 |
def test_iedb_mouse_class2_allele():
    """H-2-IAq is reported as MHC class II at locus IA."""
    alleles_by_name = iedb.alleles.load_alleles_dict()
    h2_iaq = alleles_by_name["H-2-IAq"]
    eq_(h2_iaq.mhc_class, "II")
    eq_(h2_iaq.locus, "IA")
44 |
--------------------------------------------------------------------------------
/tests/test_iedb_mhc.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2014. Mount Sinai School of Medicine
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from __future__ import print_function, division, absolute_import
16 |
17 | from pepdata import iedb
18 |
def test_mhc_hla_a2():
    """
    IEDB MHC: Test that HLA restriction actually decreases number of results and
    that regular expression patterns are being used correctly
    """
    # Row counts for the unrestricted load and for each HLA pattern,
    # loaded in the same order as before.
    n_all = len(iedb.mhc.load_dataframe(nrows=1000))
    n_a2_serotype = len(iedb.mhc.load_dataframe(hla='HLA-A2', nrows=1000))
    n_a2_gene = len(iedb.mhc.load_dataframe(hla=r'HLA-A\*02', nrows=1000))
    n_a2_either = len(
        iedb.mhc.load_dataframe(hla=r'HLA-A2|HLA-A\*02', nrows=1000))

    assert n_a2_serotype < n_all
    assert n_a2_gene < n_all
    # The alternation pattern cannot match more rows than both patterns combined.
    assert n_a2_either <= n_a2_serotype + n_a2_gene, \
        "Expected %d <= %d + %d" % (n_a2_either, n_a2_serotype, n_a2_gene)
33 |
--------------------------------------------------------------------------------
/tests/test_iedb_tcell.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | from pepdata import iedb
14 |
def test_tcell_hla_restrict_a24():
    """
    IEDB T-cell: filtering on an HLA pattern should return fewer rows than
    the unfiltered load, and an alternation of two patterns should not
    return more rows than the two patterns do separately
    """
    load = iedb.tcell.load_dataframe

    # Every load is capped at the same number of rows so the counts compare.
    n_all = len(load(nrows=1000))
    n_plain = len(load(hla='HLA-A24', nrows=1000))
    n_starred = len(load(hla=r'HLA-A\*24', nrows=1000))
    n_either = len(load(hla=r'HLA-A24|HLA-A\*24', nrows=1000))

    assert n_plain < n_all
    assert n_starred < n_all
    # The alternation can at most match the union of the two single patterns.
    assert n_either <= n_plain + n_starred, \
        "Expected %d <= %d + %d" % (n_either, n_plain, n_starred)
32 |
def test_tcell_hla_exclude_a0201():
    """
    Test that excluding HLA allele A*02:01
    actually returns a DataFrame not containing
    that allele
    """
    allele = "HLA-A*02:01"

    df_all = iedb.tcell.load_dataframe(nrows=1000)
    # Sanity check: the unfiltered sample has to contain the allele,
    # otherwise the exclusion assertion below would pass vacuously.
    assert (df_all['MHC']['Allele Name'] == allele).any()

    # Raw string: the backslash is meant for the pattern (a literal "*"),
    # not for Python. In a non-raw literal "\*" is an invalid string escape
    # that emits DeprecationWarning (3.6+) / SyntaxWarning (3.12+).
    # NOTE(review): presumably exclude_hla is matched as a regular
    # expression like the hla argument - confirm in iedb.tcell.
    df_exclude = iedb.tcell.load_dataframe(
        nrows=1000,
        exclude_hla=r"HLA-A\*02:01")

    n_A0201_entries = (df_exclude['MHC']['Allele Name'] == allele).sum()
    assert n_A0201_entries == 0, \
        ("Not supposed to contain HLA-A*02:01, "
         " but found %d rows of that allele") % n_A0201_entries
50 |
--------------------------------------------------------------------------------
/tests/test_ngram.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2014. Mount Sinai School of Medicine
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from __future__ import print_function, division, absolute_import
16 | from six.moves import cPickle
17 |
18 | from pepdata import PeptideVectorizer
19 |
# Isoform sequences of two different proteins, "a" and "b". Each sequence is
# written as consecutive chunks that are joined without any separator.

a1 = "".join([
    "MSPHPTALLGLVLCLAQTIHTQEEDLPRPSISAEPGTVIPLGSHVTFVCRGPVGVQTFRLERESRSTYND",
    "TEDVSQASPSESEARFRIDSVSEGNAGPYRCIYYKPPKWSEQSDYLELLVKETSGGPDSPDTEPGSSAGPT",
    "QRPSDNSHNEHAPASQGLKAEHLYILIGVSVVFLFCLLLLVLFCLHRQNQIKQGPPRSKDEEQKPQQRPDL",
    "AVDVLERTADKATVNGLPEKDRETDTSALAAGSSQEVTYAQLDHWALTQRTARAVSPQSTKPMAESITYAA",
    "VARH",
])

a2 = "".join([
    "MSLMVVSMACVGFFLLQGAWPHEGVHRKPSLLAHPGPLVKSEETVILQCWSDVRFEHFLLHREGKYKDTLH",
    "LIGEHHDGVSKANFSIGPMMQDLAGTYRCYGSVTHSPYQLSAPSDPLDIVITGLYEKPSLSAQPGPTVLAG",
    "ESVTLSCSSRSSYDMYHLSREGEAHERRFSAGPKVNGTFQADFPLGPATHGGTYRCFGSFRDSPYEWSNSS",
    "DPLLVSVTGNPSNSWPSPTEPSSKTGNPRHLHVLIGTSVVKIPFTILLFFLLHRWCSNKKNAAVMDQEPAG",
    "NRTVNSEDSDEQDHQEVSYA",
])

a3 = "".join([
    "MSLMVVSMACVGFFLLEGPWPHVGGQDKPFLSAWPGTVVSEGQHVTLQCRSRLGFNEFSLSKEDGMPVPEL",
    "YNRIFRNSFLMGPVTPAHAGTYRCCSSHPHSPTGWSAPSNPVVIMVTGVHRKPSLLAHPGPLVKSEETVIL",
    "QCWSDVRFEHFLLHREGKYKDTLHLIGEHHDGVSKANFSIGPMMQDLAGTYRCYGSVTHSPYQLSAPSDPL",
    "DIVITGLYEKPSLSAQPGPTVLAGESVTLSCSSRSSYDMYHLSREGEAHERRFSAGPKVNGTFQADFPLGP",
    "ATHGGTYRCFGSFRDSPYEWSNSSDPLLVSVTAFLSVKSSGHKYIY",
])

# All isoforms of protein "a".
A = [a1, a2, a3]

# NOTE(review): b1 and b2 differ only in where the chunks are split; once
# joined they appear to be the same string - confirm whether two distinct
# isoforms were intended.
b1 = "".join([
    "MPKGRAGSLPTTSIGWRFQLWFLGLTCPERHLARRLKNNSFYPFVQQEPNVFVLEYYLDTLWKGMLLFII",
    "SVVLVSFSSLREVQKQETWVFLVYGVGVGLWLVISSLPRRRLVLNHTRGVYHFSIQGRTVCQGPLHLVYV",
    "RLALSSDAHGRCFFHLVLGGHRLEPLVLVQLSEHYEQMEYLGRYIARKLNINYFDYLATSYRHVVRHWPP",
    "PGAGTVMGKSPMGHKPSSSQSSLEV",
])

b2 = "".join([
    "MPKGRAGSLPTTSIGWRFQLWFLGLTCPERHLARRLKNNSFYPFVQQEPNVFVLEYYLDTLWKGMLLFII",
    "SVVLVSFSSLREVQKQETWVFLVYGVGVGLWLVISSLPRRRLVLNHTRGVYHFSIQGRTVCQGPLHLVYV",
    "RLALSSDAHGRCFFHLVLGGHRLEPLVLVQLSEHYEQMEYLGRYIARKLNINYFDYLATSYRHVVRHWPPP",
    "GAGTVMGKSPMGHKPSSSQSSLEV",
])

# All isoforms of protein "b".
B = [b1, b2]
63 |
--------------------------------------------------------------------------------
/tests/test_pmbec.py:
--------------------------------------------------------------------------------
1 | from pepdata.pmbec import (
2 | pmbec_dict,
3 | pmbec_matrix,
4 | )
5 |
def test_pmbec():
    """
    Placeholder test with no assertions; it only passes trivially once the
    module-level import from pepdata.pmbec has succeeded
    """
    return None
8 |
--------------------------------------------------------------------------------