├── .github
│   └── workflows
│       ├── compile_huggingface.yml
│       └── test.yml
├── .gitignore
├── LICENSE
├── README.md
├── huggingface
│   ├── README.md
│   ├── combine_files.py
│   ├── huggingface_code.py
│   ├── huggingface_wrapper.py
│   ├── print_colab_dropdown.py
│   ├── push_to_hub.py
│   └── requirements.txt
├── metl
│   ├── __init__.py
│   ├── encode.py
│   ├── main.py
│   ├── models.py
│   ├── relative_attention.py
│   ├── structure.py
│   ├── test.py
│   ├── test2.py
│   ├── test3.py
│   └── test4.py
├── notebooks
│   └── inference.ipynb
├── pdbs
│   ├── 1gfl_cm.pdb
│   ├── 2qmt_p.pdb
│   ├── 6qji_p_trunc_2022.pdb
│   ├── AF-P60484-F1-model_v4_p.pdb
│   ├── AF-P62993-F1-model_v4_trunc_p.pdb
│   ├── AF-Q6SJ61-F1-model_v4_p.pdb
│   ├── pab1_cm.pdb
│   └── ube4b_cm.pdb
├── requirements.txt
├── setup.cfg
└── setup.py
/.github/workflows/compile_huggingface.yml:
--------------------------------------------------------------------------------
1 | name: Compiling Huggingface Wrapper
2 | on: [push, workflow_dispatch]
3 | jobs:
4 | Combine-File:
5 | runs-on: ubuntu-latest
6 | env:
7 | HF_TOKEN: ${{ secrets.HF_TOKEN }}
8 | steps:
9 | - uses: actions/checkout@v4
10 | with:
11 | ref: 'main'
12 | - name: Set up Python
13 | uses: actions/setup-python@v5
14 | with:
15 | python-version: '3.9'
16 | - name: Upgrade pip
17 | run: pip install --upgrade pip
18 | - name: Install dependencies
19 | run: pip install -r huggingface/requirements.txt
20 | - name: Install torch cpu only
21 | run: pip install torch==2.1.0+cpu --index-url https://download.pytorch.org/whl/cpu
22 | - name: Combining Files
23 | run: python huggingface/combine_files.py -o huggingface/huggingface_wrapper.py
24 | - name: Formatting generated code
25 | run: |
26 | python -m black huggingface/huggingface_wrapper.py
27 | python -m isort huggingface/huggingface_wrapper.py
28 | - name: Push to hub
29 | run: python huggingface/push_to_hub.py
30 |
31 |
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | name: Tests
2 | on:
3 | - push
4 | - pull_request
5 | jobs:
6 |
7 | test:
8 | name: Test pre-trained models
9 | runs-on: ${{ matrix.os }}
10 | strategy:
11 | max-parallel: 4
12 | fail-fast: false
13 | matrix:
14 | os:
15 | - ubuntu-latest
16 | - windows-latest
17 | - macos-latest
18 | python-version:
19 | - '3.9'
20 | - '3.12'
21 |
22 | steps:
23 | - name: Checkout repository
24 | uses: actions/checkout@v4
25 | - name: Install Python
26 | uses: actions/setup-python@v5
27 | with:
28 | python-version: ${{ matrix.python-version }}
29 | cache: 'pip' # caching pip dependencies
30 | - name: Install dependencies
31 | run: |
32 | python -m pip install --upgrade pip
33 | pip install -r requirements.txt
34 | pip install .
35 | pip list
36 | - name: Test METL-G
37 | run: python metl/test.py
38 | - name: Test 1D low-N METL-L avGFP
39 | run: python metl/test2.py
40 | - name: Test 3D low-N METL-L avGFP
41 | run: python metl/test3.py
42 | - name: Test METL-L GB1
43 | run: python metl/test4.py
44 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 | # PyCharm project settings
132 | .idea
133 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Sam Gelman
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Pretrained METL models
2 | [Tests](https://github.com/gitter-lab/metl-pretrained/actions/workflows/test.yml)
3 | [DOI 10.5281/zenodo.10819499](https://zenodo.org/doi/10.5281/zenodo.10819499)
4 |
5 | This repository contains pretrained METL [models](https://zenodo.org/doi/10.5281/zenodo.11051644) with minimal dependencies.
6 | For more information, please see the [metl](https://github.com/gitter-lab/metl) repository and our manuscript:
7 |
8 | [Biophysics-based protein language models for protein engineering](https://doi.org/10.1101/2024.03.15.585128).
9 | Sam Gelman, Bryce Johnson, Chase Freschlin, Arnav Sharma, Sameer D'Costa, John Peters, Anthony Gitter+, Philip A Romero+.
10 | *bioRxiv*, 2024. doi:10.1101/2024.03.15.585128
11 | \+ denotes equal contribution.
12 |
13 | # Getting started
14 | 1. Create a conda environment (or use an existing one): `conda create --name myenv python=3.9`
15 | 2. Activate the conda environment: `conda activate myenv`
16 | 3. Clone this repository
17 | 4. Navigate to the cloned repository `cd metl-pretrained`
18 | 5. Install the package with `pip install .`
19 | 6. Import the package in your script with `import metl`
20 | 7. Load a pretrained model using `model, data_encoder = metl.get_from_uuid(uuid)` or one of the other loading functions (see examples below)
21 | - `model` is a PyTorch model loaded with the pre-trained weights
22 | - `data_encoder` is a helper object that can be used to encode sequences and variants to be fed into the model
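
A minimal sketch of steps 6 and 7 (see the Examples section below for full inference code):

```python
import metl

# steps 6-7: download, cache, and load a model by its identifier
model, data_encoder = metl.get_from_ident("metl-g-20m-1d")
```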
23 |
24 | # Available models
25 | Model checkpoints are available to download from [Zenodo](https://zenodo.org/doi/10.5281/zenodo.11051644).
26 | Once you have a checkpoint downloaded, you can load it into a PyTorch model using `metl.get_from_checkpoint()`.
27 | Alternatively, you can use `metl.get_from_uuid()` or `metl.get_from_ident()` to automatically download, cache, and load the model based on the model identifier or UUID.
28 | See the examples below.
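
As a quick illustration of `metl.get_from_checkpoint()` (the local path below is hypothetical):

```python
import metl

# load from a local checkpoint file downloaded from Zenodo (illustrative path)
model, data_encoder = metl.get_from_checkpoint("checkpoints/METL-G-20M-1D-D72M9aEp.pt")
```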
29 |
30 | ## Source models
31 | Source models predict Rosetta energy terms.
32 |
33 | ### Global source models
34 |
35 | | Identifier | UUID | Params | RPE | Output | Description | Download |
36 | |-----------------|------------|--------|-----|------------------|-------------|--------------------------------------------------------------------------------------------|
37 | | `METL-G-20M-1D` | `D72M9aEp` | 20M | 1D | Rosetta energies | METL-G | [Download](https://zenodo.org/records/14908509/files/METL-G-20M-1D-D72M9aEp.pt?download=1) |
38 | | `METL-G-20M-3D` | `Nr9zCKpR` | 20M | 3D | Rosetta energies | METL-G | [Download](https://zenodo.org/records/14908509/files/METL-G-20M-3D-Nr9zCKpR.pt?download=1) |
39 | | `METL-G-50M-1D` | `auKdzzwX` | 50M | 1D | Rosetta energies | METL-G | [Download](https://zenodo.org/records/14908509/files/METL-G-50M-1D-auKdzzwX.pt?download=1) |
40 | | `METL-G-50M-3D` | `6PSAzdfv` | 50M | 3D | Rosetta energies | METL-G | [Download](https://zenodo.org/records/14908509/files/METL-G-50M-3D-6PSAzdfv.pt?download=1) |
41 |
42 | ### Local source models
43 |
44 | | Identifier | UUID | Protein | Params | RPE | Output | Description | Download |
45 | |--------------------------|------------|-----|--------|-----|------------------|-------------|-----------------------------------------------------------------------------------------------------|
46 | | `METL-L-2M-1D-GFP` | `8gMPQJy4` | GFP | 2M | 1D | Rosetta energies | METL-L | [Download](https://zenodo.org/records/14908509/files/METL-L-2M-1D-GFP-8gMPQJy4.pt?download=1) |
47 | | `METL-L-2M-3D-GFP` | `Hr4GNHws` | GFP | 2M | 3D | Rosetta energies | METL-L | [Download](https://zenodo.org/records/14908509/files/METL-L-2M-3D-GFP-Hr4GNHws.pt?download=1) |
48 | | `METL-L-2M-1D-DLG4_2022` | `8iFoiYw2` | DLG4 | 2M | 1D | Rosetta energies | METL-L | [Download](https://zenodo.org/records/14908509/files/METL-L-2M-1D-DLG4_2022-8iFoiYw2.pt?download=1) |
49 | | `METL-L-2M-3D-DLG4_2022` | `kt5DdWTa` | DLG4 | 2M | 3D | Rosetta energies | METL-L | [Download](https://zenodo.org/records/14908509/files/METL-L-2M-3D-DLG4_2022-kt5DdWTa.pt?download=1) |
50 | | `METL-L-2M-1D-GB1` | `DMfkjVzT` | GB1 | 2M | 1D | Rosetta energies | METL-L | [Download](https://zenodo.org/records/14908509/files/METL-L-2M-1D-GB1-DMfkjVzT.pt?download=1) |
51 | | `METL-L-2M-3D-GB1` | `epegcFiH` | GB1 | 2M | 3D | Rosetta energies | METL-L | [Download](https://zenodo.org/records/14908509/files/METL-L-2M-3D-GB1-epegcFiH.pt?download=1) |
52 | | `METL-L-2M-1D-GRB2` | `kS3rUS7h` | GRB2 | 2M | 1D | Rosetta energies | METL-L | [Download](https://zenodo.org/records/14908509/files/METL-L-2M-1D-GRB2-kS3rUS7h.pt?download=1) |
53 | | `METL-L-2M-3D-GRB2` | `X7w83g6S` | GRB2 | 2M | 3D | Rosetta energies | METL-L | [Download](https://zenodo.org/records/14908509/files/METL-L-2M-3D-GRB2-X7w83g6S.pt?download=1) |
54 | | `METL-L-2M-1D-Pab1` | `UKebCQGz` | Pab1 | 2M | 1D | Rosetta energies | METL-L | [Download](https://zenodo.org/records/14908509/files/METL-L-2M-1D-Pab1-UKebCQGz.pt?download=1) |
55 | | `METL-L-2M-3D-Pab1` | `2rr8V4th` | Pab1 | 2M | 3D | Rosetta energies | METL-L | [Download](https://zenodo.org/records/14908509/files/METL-L-2M-3D-Pab1-2rr8V4th.pt?download=1) |
56 | | `METL-L-2M-1D-PTEN` | `CEMSx7ZC` | PTEN | 2M | 1D | Rosetta energies | METL-L | [Download](https://zenodo.org/records/14908509/files/METL-L-2M-1D-PTEN-CEMSx7ZC.pt?download=1) |
57 | | `METL-L-2M-3D-PTEN` | `PjxR5LW7` | PTEN | 2M | 3D | Rosetta energies | METL-L | [Download](https://zenodo.org/records/14908509/files/METL-L-2M-3D-PTEN-PjxR5LW7.pt?download=1) |
58 | | `METL-L-2M-1D-TEM-1` | `PREhfC22` | TEM-1 | 2M | 1D | Rosetta energies | METL-L | [Download](https://zenodo.org/records/14908509/files/METL-L-2M-1D-TEM-1-PREhfC22.pt?download=1) |
59 | | `METL-L-2M-3D-TEM-1` | `9ASvszux` | TEM-1 | 2M | 3D | Rosetta energies | METL-L | [Download](https://zenodo.org/records/14908509/files/METL-L-2M-3D-TEM-1-9ASvszux.pt?download=1) |
60 | | `METL-L-2M-1D-Ube4b` | `HscFFkAb` | Ube4b | 2M | 1D | Rosetta energies | METL-L | [Download](https://zenodo.org/records/14908509/files/METL-L-2M-1D-Ube4b-HscFFkAb.pt?download=1) |
61 | | `METL-L-2M-3D-Ube4b` | `H48oiNZN` | Ube4b | 2M | 3D | Rosetta energies | METL-L | [Download](https://zenodo.org/records/14908509/files/METL-L-2M-3D-Ube4b-H48oiNZN.pt?download=1) |
62 |
63 |
64 |
65 | These models output a length-55 vector corresponding to the following energy terms (in order):
66 |
67 |
68 | <details><summary>Expand to see energy terms</summary>
69 |
70 |
71 | ```
72 | total_score
73 | fa_atr
74 | fa_dun
75 | fa_elec
76 | fa_intra_rep
77 | fa_intra_sol_xover4
78 | fa_rep
79 | fa_sol
80 | hbond_bb_sc
81 | hbond_lr_bb
82 | hbond_sc
83 | hbond_sr_bb
84 | lk_ball_wtd
85 | omega
86 | p_aa_pp
87 | pro_close
88 | rama_prepro
89 | ref
90 | yhh_planarity
91 | buried_all
92 | buried_np
93 | contact_all
94 | contact_buried_core
95 | contact_buried_core_boundary
96 | degree
97 | degree_core
98 | degree_core_boundary
99 | exposed_hydrophobics
100 | exposed_np_AFIMLWVY
101 | exposed_polars
102 | exposed_total
103 | one_core_each
104 | pack
105 | res_count_buried_core
106 | res_count_buried_core_boundary
107 | res_count_buried_np_core
108 | res_count_buried_np_core_boundary
109 | ss_contributes_core
110 | ss_mis
111 | total_hydrophobic
112 | total_hydrophobic_AFILMVWY
113 | total_sasa
114 | two_core_each
115 | unsat_hbond
116 | centroid_total_score
117 | cbeta
118 | cenpack
119 | env
120 | hs_pair
121 | pair
122 | rg
123 | rsigma
124 | sheet
125 | ss_pair
126 | vdw
127 | ```
128 |
129 | </details>
130 |
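To map the output vector back to term names, here is a minimal sketch (the term list is abbreviated and should be extended with the full 55-term list above, in order):

```python
import metl
import torch

model, data_encoder = metl.get_from_ident("metl-g-20m-1d")
encoded_seqs = data_encoder.encode_sequences(["SMART"])

model.eval()
with torch.no_grad():
    predictions = model(torch.tensor(encoded_seqs))

# pair each predicted value with its energy term name
terms = ["total_score", "fa_atr", "fa_dun", "fa_elec"]  # abbreviated list
for name, value in zip(terms, predictions[0].tolist()):
    print(name, value)
```
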
131 | ### Function-specific source models for GB1
132 |
133 | The GB1 experimental data measured the binding interaction between GB1 variants and Immunoglobulin G (IgG).
134 | To match this experimentally characterized function, we implemented a Rosetta pipeline to model the GB1-IgG complex and compute 17 attributes related to energy changes upon binding.
135 | We pretrained a standard METL-Local model and a modified METL-Bind model, which additionally incorporates the IgG binding attributes into its pretraining tasks.
136 |
137 | | Identifier | UUID | Protein | Params | RPE | Output | Description | Download |
138 | |--------------------------------|------------|---------|--------|-----|-------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------|
139 | | `METL-BIND-2M-3D-GB1-STANDARD` | `K6mw24Rg` | GB1 | 2M | 3D | Standard Rosetta energies | Trained for the function-specific synthetic data experiment, but only trained on the standard energy terms, to use as a baseline. Should perform similarly to `METL-L-2M-3D-GB1`. | [Download](https://zenodo.org/records/14908509/files/METL-BIND-2M-3D-GB1-STANDARD-K6mw24Rg.pt?download=1) |
140 | | `METL-BIND-2M-3D-GB1-BINDING` | `Bo5wn2SG` | GB1 | 2M | 3D | Standard + binding Rosetta energies | Trained on both the standard energy terms and the binding-specific energy terms. | [Download](https://zenodo.org/records/14908509/files/METL-BIND-2M-3D-GB1-BINDING-Bo5wn2SG.pt?download=1) |
141 |
142 |
143 | `METL-BIND-2M-3D-GB1-BINDING` predicts the standard energy terms listed above as well as the following binding energy terms (in order):
144 |
145 |
146 |
147 | <details><summary>Expand to see binding energy terms</summary>
148 |
149 |
150 | ```
151 | complex_normalized
152 | dG_cross
153 | dG_cross/dSASAx100
154 | dG_separated
155 | dG_separated/dSASAx100
156 | dSASA_hphobic
157 | dSASA_int
158 | dSASA_polar
159 | delta_unsatHbonds
160 | hbond_E_fraction
161 | hbonds_int
162 | nres_int
163 | per_residue_energy_int
164 | side1_normalized
165 | side1_score
166 | side2_normalized
167 | side2_score
168 | ```
169 | </details>
170 |
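A minimal sketch of loading the binding model by its identifier (since it uses 3D RPEs, inference additionally requires `pdb_fn="pdbs/2qmt_p.pdb"` and GB1-length input sequences):

```python
import metl

# load METL-Bind by its identifier; it predicts both the standard and
# binding energy terms listed above
model, data_encoder = metl.get_from_ident("metl-bind-2m-3d-gb1-binding")
model.eval()
```
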
171 | ## Target models
172 | Target models are fine-tuned source models that predict functional scores from experimental sequence-function data.
173 |
174 | ### Global target models
175 |
176 | These models were trained on 80% of the experimental sequence-function data.
177 |
178 | | DMS Dataset | Identifier | UUID | RPE | Output | Description | Download |
179 | |----------------|------------|-------------|-----|------------------|-----------------------------------------------------|-------------------------------------------------------------------------------------------------------------------|
180 | | GFP | `None` | `PeT2D92j` | 1D | Functional score | METL-Global finetuned on the GFP dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-GFP-PeT2D92j.pt?download=1) |
181 | | GFP | `None` | `6JBzHpkQ` | 3D | Functional score | METL-Global finetuned on the GFP dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-GFP-6JBzHpkQ.pt?download=1) |
182 | | DLG4-Abundance | `None` | `4Rh3WCbG` | 1D | Functional score | METL-Global finetuned on the DLG4-Abundance dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-DLG4_2022-ABUNDANCE-4Rh3WCbG.pt?download=1) |
183 | | DLG4-Abundance | `None` | `RBtqxzvu` | 3D | Functional score | METL-Global finetuned on the DLG4-Abundance dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-DLG4_2022-ABUNDANCE-RBtqxzvu.pt?download=1) |
184 | | DLG4-Binding | `None` | `4xbuC5y7` | 1D | Functional score | METL-Global finetuned on the DLG4-Binding dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-DLG4_2022-BINDING-4xbuC5y7.pt?download=1) |
185 | | DLG4-Binding | `None` | `BuvxgE2x` | 3D | Functional score | METL-Global finetuned on the DLG4-Binding dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-DLG4_2022-BINDING-BuvxgE2x.pt?download=1) |
186 | | GB1 | `None` | `dAndZfJ4` | 1D | Functional score | METL-Global finetuned on the GB1 dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-GB1-dAndZfJ4.pt?download=1) |
187 | | GB1 | `None` | `9vSB3DRM` | 3D | Functional score | METL-Global finetuned on the GB1 dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-GB1-9vSB3DRM.pt?download=1) |
188 | | GRB2-Abundance | `None` | `HenDpDWe` | 1D | Functional score | METL-Global finetuned on the GRB2-Abundance dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-GRB2-ABUNDANCE-HenDpDWe.pt?download=1) |
189 | | GRB2-Abundance | `None` | `dDoCCvfr` | 3D | Functional score | METL-Global finetuned on the GRB2-Abundance dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-GRB2-ABUNDANCE-dDoCCvfr.pt?download=1) |
190 | | GRB2-Binding | `None` | `cvnycE5Q` | 1D | Functional score | METL-Global finetuned on the GRB2-Binding dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-GRB2-BINDING-cvnycE5Q.pt?download=1) |
191 | | GRB2-Binding | `None` | `jYesS9Ki` | 3D | Functional score | METL-Global finetuned on the GRB2-Binding dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-GRB2-BINDING-jYesS9Ki.pt?download=1) |
192 | | Pab1 | `None` | `ho54gxzv` | 1D | Functional score | METL-Global finetuned on the Pab1 dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-Pab1-ho54gxzv.pt?download=1) |
193 | | Pab1 | `None` | `jhbL2FeB` | 3D | Functional score | METL-Global finetuned on the Pab1 dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-Pab1-jhbL2FeB.pt?download=1) |
194 | | PTEN-Abundance | `None` | `UEuMtmfx` | 1D | Functional score | METL-Global finetuned on the PTEN-Abundance dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-PTEN-ABUNDANCE-UEuMtmfx.pt?download=1) |
195 | | PTEN-Abundance | `None` | `eJPPQYEW` | 3D | Functional score | METL-Global finetuned on the PTEN-Abundance dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-PTEN-ABUNDANCE-eJPPQYEW.pt?download=1) |
196 | | PTEN-Activity | `None` | `U3X8mSeT` | 1D | Functional score | METL-Global finetuned on the PTEN-Activity dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-PTEN-ACTIVITY-U3X8mSeT.pt?download=1) |
197 | | PTEN-Activity | `None` | `4gqYnW6V` | 3D | Functional score | METL-Global finetuned on the PTEN-Activity dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-PTEN-ACTIVITY-4gqYnW6V.pt?download=1) |
198 | | TEM-1 | `None` | `ELL4GGQq` | 1D | Functional score | METL-Global finetuned on the TEM-1 dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-TEM-1-ELL4GGQq.pt?download=1) |
199 | | TEM-1 | `None` | `K6BjsWXm` | 3D | Functional score | METL-Global finetuned on the TEM-1 dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-TEM-1-K6BjsWXm.pt?download=1) |
200 | | Ube4b | `None` | `BAWw23vW` | 1D | Functional score | METL-Global finetuned on the Ube4b dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-Ube4b-BAWw23vW.pt?download=1) |
201 | | Ube4b | `None` | `G9piq6WH` | 3D | Functional score | METL-Global finetuned on the Ube4b dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-Ube4b-G9piq6WH.pt?download=1) |
202 |
203 | ### Local target models
204 |
205 | These models were trained on 80% of the experimental sequence-function data.
206 |
207 | | DMS Dataset | Identifier | UUID | RPE | Output | Description | Download |
208 | |----------------|------------|----------|-----|------------------|----------------------------------------------------|---------------------------------------------------------------------------------------------------------------|
209 | | GFP | `None` | `HaUuRwfE` | 1D | Functional score | METL-Local finetuned on the GFP dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-1D-GFP-HaUuRwfE.pt?download=1) |
210 | | GFP | `None` | `LWEY95Yb` | 3D | Functional score | METL-Local finetuned on the GFP dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-3D-GFP-LWEY95Yb.pt?download=1) |
211 | | DLG4-Abundance | `None` | `RMFA6dnX` | 1D | Functional score | METL-Local finetuned on the DLG4-Abundance dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-1D-DLG4_2022-ABUNDANCE-RMFA6dnX.pt?download=1) |
212 | | DLG4-Abundance | `None` | `V3uTtXVe` | 3D | Functional score | METL-Local finetuned on the DLG4-Abundance dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-3D-DLG4_2022-ABUNDANCE-V3uTtXVe.pt?download=1) |
213 | | DLG4-Binding | `None` | `YdzBYWHs` | 1D | Functional score | METL-Local finetuned on the DLG4-Binding dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-1D-DLG4_2022-BINDING-YdzBYWHs.pt?download=1) |
214 | | DLG4-Binding | `None` | `iu6ZahPw` | 3D | Functional score | METL-Local finetuned on the DLG4-Binding dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-3D-DLG4_2022-BINDING-iu6ZahPw.pt?download=1) |
215 | | GB1 | `None` | `Pgcseywk` | 1D | Functional score | METL-Local finetuned on the GB1 dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-1D-GB1-Pgcseywk.pt?download=1) |
216 | | GB1 | `None` | `UvMMdsq4` | 3D | Functional score | METL-Local finetuned on the GB1 dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-3D-GB1-UvMMdsq4.pt?download=1) |
217 | | GRB2-Abundance | `None` | `VNpi9Zjt` | 1D | Functional score | METL-Local finetuned on the GRB2-Abundance dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-1D-GRB2-ABUNDANCE-VNpi9Zjt.pt?download=1) |
218 | | GRB2-Abundance | `None` | `PqBMjXkA` | 3D | Functional score | METL-Local finetuned on the GRB2-Abundance dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-3D-GRB2-ABUNDANCE-PqBMjXkA.pt?download=1) |
219 | | GRB2-Binding | `None` | `Z59BhUaE` | 1D | Functional score | METL-Local finetuned on the GRB2-Binding dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-1D-GRB2-BINDING-Z59BhUaE.pt?download=1) |
220 | | GRB2-Binding | `None` | `VwcRN6UB` | 3D | Functional score | METL-Local finetuned on the GRB2-Binding dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-3D-GRB2-BINDING-VwcRN6UB.pt?download=1) |
221 | | Pab1 | `None` | `TdjCzoQQ` | 1D | Functional score | METL-Local finetuned on the Pab1 dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-1D-Pab1-TdjCzoQQ.pt?download=1) |
222 | | Pab1 | `None` | `5SjoLx3y` | 3D | Functional score | METL-Local finetuned on the Pab1 dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-3D-Pab1-5SjoLx3y.pt?download=1) |
223 | | PTEN-Abundance | `None` | `oUScGeHo` | 1D | Functional score | METL-Local finetuned on the PTEN-Abundance dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-2M-1D-PTEN-ABUNDANCE-oUScGeHo.pt?download=1) |
224 | | PTEN-Abundance | `None` | `DhuasDEr` | 3D | Functional score | METL-Local finetuned on the PTEN-Abundance dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-2M-3D-PTEN-ABUNDANCE-DhuasDEr.pt?download=1) |
225 | | PTEN-Activity | `None` | `m9UsG7dq` | 1D | Functional score | METL-Local finetuned on the PTEN-Activity dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-2M-1D-PTEN-ACTIVITY-m9UsG7dq.pt?download=1) |
226 | | PTEN-Activity | `None` | `8Vi7ENcC` | 3D | Functional score | METL-Local finetuned on the PTEN-Activity dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-2M-3D-PTEN-ACTIVITY-8Vi7ENcC.pt?download=1) |
227 | | TEM-1 | `None` | `64ncFxBR` | 1D | Functional score | METL-Local finetuned on the TEM-1 dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-1D-TEM-1-64ncFxBR.pt?download=1) |
228 | | TEM-1 | `None` | `PncvgiJU` | 3D | Functional score | METL-Local finetuned on the TEM-1 dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-3D-TEM-1-PncvgiJU.pt?download=1) |
229 | | Ube4b | `None` | `e9uhhnAv` | 1D | Functional score | METL-Local finetuned on the Ube4b dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-1D-Ube4b-e9uhhnAv.pt?download=1) |
230 | | Ube4b | `None` | `NfbZL7jK` | 3D | Functional score | METL-Local finetuned on the Ube4b dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-3D-Ube4b-NfbZL7jK.pt?download=1) |
231 |
232 |
233 | ### GFP design experiment target models
234 |
235 | | DMS Dataset | Identifier | UUID | RPE | Output | Description | Download |
236 | |:------------|------------|------------|-----|------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------|
237 | | GFP | `None` | `YoQkzoLD` | 1D | Functional score | The `METL-L-2M-1D-GFP` model, fine-tuned on 64 examples from the GFP DMS dataset. This model was used for the GFP design experiment described in the manuscript. | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-2M-1D-GFP-YoQkzoLD.pt?download=1) |
238 | | GFP | `None` | `PEkeRuxb` | 3D | Functional score | The `METL-L-2M-3D-GFP` model, fine-tuned on 64 examples from the GFP DMS dataset. This model was used for the GFP design experiment described in the manuscript. | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-2M-3D-GFP-PEkeRuxb.pt?download=1) |
239 |
240 |
241 | # 3D Relative Position Embeddings
242 |
243 | METL uses relative position embeddings (RPEs) based on 3D protein structure.
244 | Our implementation of relative position embeddings is similar to the approach described by [Shaw et al](https://aclanthology.org/N18-2074/).
245 | However, instead of using the default 1D sequence-based distances, we calculate relative distances based on a graph of the 3D protein structure.
246 | These 3D RPEs enable the transformer to use 3D distances between amino acid residues as the positional signal when calculating attention.
247 | When using 3D RPEs, the model requires a protein structure in the form of a PDB file, corresponding to the wild-type protein or base protein of the input variant sequence.
248 |
249 | Our testing showed that 3D RPEs improve performance for METL-Global models but do not make a difference for METL-Local models.
250 | We provide both 1D and 3D models in this repository. The 1D models do not require the PDB structure as an additional input.
251 |
252 | The [pdbs](pdbs) directory contains PDB files corresponding to the experimental datasets we evaluated. These can be used with the 3D RPE models listed above.
253 |
254 | | DMS Dataset | PDB File |
255 | |----------------|-----------------------------------------------------------------------------|
256 | | GFP | [`1gfl_cm.pdb`](pdbs/1gfl_cm.pdb) |
257 | | DLG4-Abundance | [`6qji_p_trunc_2022.pdb`](pdbs/6qji_p_trunc_2022.pdb) |
258 | | DLG4-Binding | [`6qji_p_trunc_2022.pdb`](pdbs/6qji_p_trunc_2022.pdb) |
259 | | GB1 | [`2qmt_p.pdb`](pdbs/2qmt_p.pdb) |
260 | | GRB2-Abundance | [`AF-P62993-F1-model_v4_trunc_p.pdb`](pdbs/AF-P62993-F1-model_v4_trunc_p.pdb) |
261 | | GRB2-Binding | [`AF-P62993-F1-model_v4_trunc_p.pdb`](pdbs/AF-P62993-F1-model_v4_trunc_p.pdb) |
262 | | Pab1 | [`pab1_cm.pdb`](pdbs/pab1_cm.pdb) |
263 | | PTEN-Abundance | [`AF-P60484-F1-model_v4_p.pdb`](pdbs/AF-P60484-F1-model_v4_p.pdb) |
264 | | PTEN-Activity | [`AF-P60484-F1-model_v4_p.pdb`](pdbs/AF-P60484-F1-model_v4_p.pdb) |
265 | | TEM-1 | [`AF-Q6SJ61-F1-model_v4_p.pdb`](pdbs/AF-Q6SJ61-F1-model_v4_p.pdb) |
266 | | Ube4b | [`ube4b_cm.pdb`](pdbs/ube4b_cm.pdb) |
267 |
268 | # Examples
269 |
270 | ## METL source model
271 |
272 | METL source models are assigned identifiers that can be used to load the model with `metl.get_from_ident()`.
273 |
274 | This example:
275 | - Automatically downloads and caches `METL-G-20M-1D` using `metl.get_from_ident("metl-g-20m-1d")`.
276 | - Encodes a pair of dummy amino acid sequences using `data_encoder.encode_sequences()`.
277 | - Runs the sequences through the model and prints the predicted Rosetta energies.
278 |
279 | _Todo: show how to extract the METL representation at different layers of the network_
280 |
281 | ```python
282 | import metl
283 | import torch
284 |
285 | model, data_encoder = metl.get_from_ident("metl-g-20m-1d")
286 |
287 | # these are amino acid sequences
288 | # make sure all the sequences are the same length
289 | dummy_sequences = ["SMART", "MAGIC"]
290 | encoded_seqs = data_encoder.encode_sequences(dummy_sequences)
291 |
292 | # set model to eval mode
293 | model.eval()
294 | # no need to compute gradients for inference
295 | with torch.no_grad():
296 | predictions = model(torch.tensor(encoded_seqs))
297 |
298 | print(predictions)
299 | ```
300 |
301 | If you are using a model with 3D relative position embeddings, you will need to provide the PDB structure of the wild-type or base protein.
302 |
303 | ```python
304 | predictions = model(torch.tensor(encoded_seqs), pdb_fn="../path/to/file.pdb")
305 | ```
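
For a complete, self-contained sketch, here is a 3D model paired with its matching PDB file from the table above (this assumes you run from the repository root so the relative path `pdbs/1gfl_cm.pdb` resolves):

```python
import metl
import torch

# the fine-tuned GFP design model with 3D RPEs, paired with the GFP structure
model, data_encoder = metl.get_from_uuid(uuid="PEkeRuxb")

# the GFP wild-type sequence
wt = "SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQ" \
     "HDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKN" \
     "GIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK"
encoded_variants = data_encoder.encode_variants(wt, ["E3K,G102S"])

model.eval()
with torch.no_grad():
    predictions = model(torch.tensor(encoded_variants), pdb_fn="pdbs/1gfl_cm.pdb")

print(predictions)
```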
306 |
307 |
308 | ## METL target model
309 |
310 | METL target models can be loaded using the model's UUID and `metl.get_from_uuid()`.
311 |
312 | This example:
313 | - Automatically downloads and caches `YoQkzoLD` using `metl.get_from_uuid(uuid="YoQkzoLD")`.
314 | - Encodes several variants specified in variant notation. A wild-type sequence is needed to encode variants.
315 | - Runs the sequences through the model and prints the predicted DMS scores.
316 |
317 | ```python
318 | import metl
319 | import torch
320 |
321 | model, data_encoder = metl.get_from_uuid(uuid="YoQkzoLD")
322 |
323 | # the GFP wild-type sequence
324 | wt = "SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQ" \
325 | "HDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKN" \
326 | "GIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK"
327 |
328 | # some example GFP variants to compute the scores for
329 | variants = ["E3K,G102S",
330 | "T36P,S203T,K207R",
331 | "V10A,D19G,F25S,E113V"]
332 |
333 | encoded_variants = data_encoder.encode_variants(wt, variants)
334 |
335 | # set model to eval mode
336 | model.eval()
337 | # no need to compute gradients for inference
338 | with torch.no_grad():
339 | predictions = model(torch.tensor(encoded_variants))
340 |
341 | print(predictions)
342 |
343 | ```
344 |
--------------------------------------------------------------------------------
/huggingface/README.md:
--------------------------------------------------------------------------------
1 | This directory maintains the 🤗 Hugging Face support for METL.
2 |
3 | It contains a few files that facilitate uploading the wrapper to 🤗. First, `combine_files.py` takes all of the files in the `metl` directory, excluding files whose names contain `test` or `_.py` (such as `__init__.py`), and combines them into a single file. `combine_files.py` also appends the Hugging Face wrapper code itself (stored in `huggingface_code.py`) to the bottom of the script.
4 |
5 | The combined script is then formatted and pushed to 🤗 by the `push_to_hub.py` script, run automatically via GitHub Actions. Short comments at the top of each file repeat these responsibilities.
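
Once uploaded, the wrapper can be loaded from the Hub. A minimal sketch mirroring what `print_colab_dropdown.py` does (requires `transformers` and network access):

```python
from transformers import AutoModel

# trust_remote_code executes the generated huggingface_wrapper.py from the Hub
metl = AutoModel.from_pretrained("gitter-lab/METL", trust_remote_code=True)
metl.load_from_ident("metl-l-2m-3d-gb1")  # then load pretrained weights by identifier
```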
--------------------------------------------------------------------------------
/huggingface/combine_files.py:
--------------------------------------------------------------------------------
1 | """
2 | This script combines all of the files in the metl directory into one file so that it can be uploaded automatically to huggingface.
3 |
4 | Files ending with _.py and that contain test in the filename will not be included. This script automatically generates the required imports from the files as well.
5 |
6 | Regardless of changes to metl, as long as necessary files that may be added don't contain test or _.py, this should work as intended.
7 | """
8 |
9 | import argparse
10 | import os
11 |
12 | def main(output_path: str):
13 | imports = set()
14 | code = []
15 | metl_imports = set()
16 | for file in os.listdir('./metl'):
17 | if '.py' in file and '_.py' not in file and 'test' not in file:
18 | with open(f'./metl/{file}', 'r') as f:
19 | file_text = f.readlines()
20 | for line in file_text:
21 | line_for_compare = line.strip()
22 | if 'import ' in line_for_compare and 'metl.' not in line_for_compare:
23 | imports.add(line_for_compare)
24 | elif 'import ' in line_for_compare and 'metl.' in line_for_compare:
25 | if 'as' in line_for_compare:
26 | metl_imports.add(line_for_compare)
27 | else:
28 |                     code.append(line.rstrip('\n'))  # strip only the trailing newline
29 |
30 | code = '\n'.join(code)
31 | imports = '\n'.join(imports)
32 |
33 | for line in metl_imports:
34 | import_name = line.split('as')[-1].strip()
35 | code = code.replace(f'{import_name}.', '')
36 |
37 | huggingface_import = 'from transformers import PretrainedConfig, PreTrainedModel'
38 | delimiter = '$>'
39 |
40 | with open('./huggingface/huggingface_code.py', 'r') as f:
41 | contents = f.read()
42 | delim_location = contents.find(delimiter)
43 | cut_contents = contents[delim_location+len(delimiter):]
44 |
45 | with open(output_path, 'w') as f:
46 | f.write(f'{huggingface_import}\n{imports}\n{code}\n{cut_contents}')
47 |
48 | def parse_args():
49 | parser = argparse.ArgumentParser(description="Compile huggingface wrapper")
50 | parser.add_argument("-o", type=str, help="Output filepath", default='./huggingface_wrapper.py')
51 |
52 | args = parser.parse_args()
53 |
54 | args.o = os.path.abspath(args.o)
55 | return args
56 |
57 | if __name__ == "__main__":
58 | args = parse_args()
59 | main(args.o)
60 |
--------------------------------------------------------------------------------
/huggingface/huggingface_code.py:
--------------------------------------------------------------------------------
1 | """
2 | This file contains the actual wrapper for METL.
3 | Above the delimiter for this file: #\$\> we have included imports and shell functions
4 | which prevent python (and other linters) from complaining this file has errors.
5 | """
6 |
7 |
8 | from transformers import PretrainedConfig, PreTrainedModel
9 |
10 | def get_from_uuid():
11 | pass
12 |
13 | def get_from_ident():
14 | pass
15 |
16 | def get_from_checkpoint():
17 | pass
18 |
19 | IDENT_UUID_MAP = ""
20 | UUID_URL_MAP = ""
21 |
22 | # Everything above the delimiter below is chopped off by combine_files.py.
23 |
24 | #$>
25 | # Huggingface code
26 |
27 | class METLConfig(PretrainedConfig):
28 | IDENT_UUID_MAP = IDENT_UUID_MAP
29 | UUID_URL_MAP = UUID_URL_MAP
30 | model_type = "METL"
31 |
32 | def __init__(
33 | self,
34 | id:str = None,
35 | **kwargs,
36 | ):
37 | self.id = id
38 | super().__init__(**kwargs)
39 |
40 | class METLModel(PreTrainedModel):
41 | config_class = METLConfig
42 | def __init__(self, config:METLConfig):
43 | super().__init__(config)
44 | self.model = None
45 | self.encoder = None
46 | self.config = config
47 |
48 | def forward(self, X, pdb_fn=None):
49 | if pdb_fn:
50 | return self.model(X, pdb_fn=pdb_fn)
51 | return self.model(X)
52 |
53 | def load_from_uuid(self, id):
54 | if id:
55 |             assert id in self.config.UUID_URL_MAP, "ID given does not reference a valid METL model in the UUID_URL_MAP"
56 | self.config.id = id
57 |
58 | self.model, self.encoder = get_from_uuid(self.config.id)
59 |
60 | def load_from_ident(self, id):
61 | if id:
62 | id = id.lower()
63 | assert id in self.config.IDENT_UUID_MAP, "ID given does not reference a valid METL model in the IDENT_UUID_MAP"
64 | self.config.id = id
65 |
66 | self.model, self.encoder = get_from_ident(self.config.id)
67 |
68 | def get_from_checkpoint(self, checkpoint_path):
69 | self.model, self.encoder = get_from_checkpoint(checkpoint_path)
70 |
--------------------------------------------------------------------------------
/huggingface/print_colab_dropdown.py:
--------------------------------------------------------------------------------
1 | """
2 | Utility script for generating a list that can be pasted into the google colab when more models are uploaded to zenodo and added to the METL IDENT_UUID_MAP.
3 |
4 | This pulls from huggingface, so wait for that action to finish first before running this script and uploading the colab notebook.
5 | """
6 |
7 | from transformers import AutoModel
8 |
9 | def main():
10 | metl = AutoModel.from_pretrained('gitter-lab/METL', trust_remote_code=True)
11 | start = "# @param ["
12 | metl_keys = [f'"{key}"' for key in metl.config.IDENT_UUID_MAP.keys()]
13 | keys = ','.join(metl_keys)
14 | end = f'{keys}]'
15 | print(start + end)
16 |
17 | if __name__ == "__main__":
18 | main()
--------------------------------------------------------------------------------
/huggingface/push_to_hub.py:
--------------------------------------------------------------------------------
1 | """
2 | Basic minimal script for uploading the generated file from combine_files.py onto huggingface.
3 | Requires the action to have access to the HF_TOKEN secret in the repository.
4 | """
5 |
6 | from huggingface_wrapper import METLConfig, METLModel
7 | from huggingface_hub import login
8 | import os
9 | from transformers import AutoModel, AutoConfig
10 | import torch
11 |
12 | def main():
13 | API_KEY = os.getenv('HF_TOKEN')
14 | login(API_KEY)
15 |
16 | config = METLConfig()
17 | model = METLModel(config)
18 | model.model = torch.nn.Linear(1, 1)
19 |
20 | AutoConfig.register("METL", METLConfig)
21 | AutoModel.register(METLConfig, METLModel)
22 |
23 | model.register_for_auto_class()
24 | config.register_for_auto_class()
25 |
26 | model.push_to_hub('gitter-lab/METL')
27 | config.push_to_hub('gitter-lab/METL')
28 |
29 | if __name__ == "__main__":
30 | main()
--------------------------------------------------------------------------------
/huggingface/requirements.txt:
--------------------------------------------------------------------------------
1 | huggingface-hub==0.30.2
2 | transformers==4.51.3
3 | numpy>=1.23.2
4 | networkx>=2.6.3
5 | scipy>=1.9.1
6 | biopandas>=0.2.7
7 | isort
8 | black
9 |
--------------------------------------------------------------------------------
/metl/__init__.py:
--------------------------------------------------------------------------------
1 | from .main import *
2 | __version__ = "0.1"
3 |
--------------------------------------------------------------------------------
/metl/encode.py:
--------------------------------------------------------------------------------
1 | """ Encodes data in different formats """
2 | from enum import Enum, auto
3 |
4 | import numpy as np
5 |
6 |
7 | class Encoding(Enum):
8 | INT_SEQS = auto()
9 | ONE_HOT = auto()
10 |
11 |
12 | class DataEncoder:
13 | chars = ["*", "A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"]
14 | num_chars = len(chars)
15 | mapping = {c: i for i, c in enumerate(chars)}
16 |
17 | def __init__(self, encoding: Encoding = Encoding.INT_SEQS):
18 | self.encoding = encoding
19 |
20 | def _encode_from_int_seqs(self, seq_ints):
21 | if self.encoding == Encoding.INT_SEQS:
22 | return seq_ints
23 | elif self.encoding == Encoding.ONE_HOT:
24 | one_hot = np.eye(self.num_chars)[seq_ints]
25 | return one_hot.astype(np.float32)
26 |
27 | def encode_sequences(self, char_seqs):
28 | seq_ints = []
29 | for char_seq in char_seqs:
30 | int_seq = [self.mapping[c] for c in char_seq]
31 | seq_ints.append(int_seq)
32 | seq_ints = np.array(seq_ints).astype(int)
33 | return self._encode_from_int_seqs(seq_ints)
34 |
35 | def encode_variants(self, wt, variants):
36 | # convert wild type seq to integer encoding
37 | wt_int = np.zeros(len(wt), dtype=np.uint8)
38 | for i, c in enumerate(wt):
39 | wt_int[i] = self.mapping[c]
40 |
41 | # tile the wild-type seq
42 | seq_ints = np.tile(wt_int, (len(variants), 1))
43 |
44 | for i, variant in enumerate(variants):
45 | # special handling if we want to encode the wild-type seq (it's already correct!)
46 | if variant == "_wt":
47 | continue
48 |
49 | # variants are a list of mutations [mutation1, mutation2, ....]
50 | variant = variant.split(",")
51 | for mutation in variant:
52 |                 # mutations are in the form <wt_aa><0-indexed position><replacement_aa>, e.g. E3K
53 | position = int(mutation[1:-1])
54 | replacement = self.mapping[mutation[-1]]
55 | seq_ints[i, position] = replacement
56 |
57 | seq_ints = seq_ints.astype(int)
58 | return self._encode_from_int_seqs(seq_ints)
59 |
--------------------------------------------------------------------------------
/metl/main.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.hub
3 |
4 | import metl.models as models
5 | from metl.encode import DataEncoder, Encoding
6 |
7 | UUID_URL_MAP = {
8 | # global source models
9 | "D72M9aEp": "https://zenodo.org/records/14908509/files/METL-G-20M-1D-D72M9aEp.pt?download=1",
10 | "Nr9zCKpR": "https://zenodo.org/records/14908509/files/METL-G-20M-3D-Nr9zCKpR.pt?download=1",
11 | "auKdzzwX": "https://zenodo.org/records/14908509/files/METL-G-50M-1D-auKdzzwX.pt?download=1",
12 | "6PSAzdfv": "https://zenodo.org/records/14908509/files/METL-G-50M-3D-6PSAzdfv.pt?download=1",
13 |
14 | # local source models
15 | "8gMPQJy4": "https://zenodo.org/records/14908509/files/METL-L-2M-1D-GFP-8gMPQJy4.pt?download=1",
16 | "Hr4GNHws": "https://zenodo.org/records/14908509/files/METL-L-2M-3D-GFP-Hr4GNHws.pt?download=1",
17 | "8iFoiYw2": "https://zenodo.org/records/14908509/files/METL-L-2M-1D-DLG4_2022-8iFoiYw2.pt?download=1",
18 | "kt5DdWTa": "https://zenodo.org/records/14908509/files/METL-L-2M-3D-DLG4_2022-kt5DdWTa.pt?download=1",
19 | "DMfkjVzT": "https://zenodo.org/records/14908509/files/METL-L-2M-1D-GB1-DMfkjVzT.pt?download=1",
20 | "epegcFiH": "https://zenodo.org/records/14908509/files/METL-L-2M-3D-GB1-epegcFiH.pt?download=1",
21 | "kS3rUS7h": "https://zenodo.org/records/14908509/files/METL-L-2M-1D-GRB2-kS3rUS7h.pt?download=1",
22 | "X7w83g6S": "https://zenodo.org/records/14908509/files/METL-L-2M-3D-GRB2-X7w83g6S.pt?download=1",
23 | "UKebCQGz": "https://zenodo.org/records/14908509/files/METL-L-2M-1D-Pab1-UKebCQGz.pt?download=1",
24 | "2rr8V4th": "https://zenodo.org/records/14908509/files/METL-L-2M-3D-Pab1-2rr8V4th.pt?download=1",
25 | "PREhfC22": "https://zenodo.org/records/14908509/files/METL-L-2M-1D-TEM-1-PREhfC22.pt?download=1",
26 | "9ASvszux": "https://zenodo.org/records/14908509/files/METL-L-2M-3D-TEM-1-9ASvszux.pt?download=1",
27 | "HscFFkAb": "https://zenodo.org/records/14908509/files/METL-L-2M-1D-Ube4b-HscFFkAb.pt?download=1",
28 | "H48oiNZN": "https://zenodo.org/records/14908509/files/METL-L-2M-3D-Ube4b-H48oiNZN.pt?download=1",
29 | "CEMSx7ZC": "https://zenodo.org/records/14908509/files/METL-L-2M-1D-PTEN-CEMSx7ZC.pt?download=1",
30 | "PjxR5LW7": "https://zenodo.org/records/14908509/files/METL-L-2M-3D-PTEN-PjxR5LW7.pt?download=1",
31 |
32 | # metl bind source models
33 | "K6mw24Rg": "https://zenodo.org/records/14908509/files/METL-BIND-2M-3D-GB1-STANDARD-K6mw24Rg.pt?download=1",
34 | "Bo5wn2SG": "https://zenodo.org/records/14908509/files/METL-BIND-2M-3D-GB1-BINDING-Bo5wn2SG.pt?download=1",
35 |
36 | # finetuned models from GFP design experiment
37 | "YoQkzoLD": "https://zenodo.org/records/14908509/files/FT-METL-L-2M-1D-GFP-YoQkzoLD.pt?download=1",
38 | "PEkeRuxb": "https://zenodo.org/records/14908509/files/FT-METL-L-2M-3D-GFP-PEkeRuxb.pt?download=1",
39 |
40 | # new finetuned GLOBAL models
41 | "4Rh3WCbG": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-DLG4_2022-ABUNDANCE-4Rh3WCbG.pt?download=1",
42 | "4xbuC5y7": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-DLG4_2022-BINDING-4xbuC5y7.pt?download=1",
43 | "dAndZfJ4": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-GB1-dAndZfJ4.pt?download=1",
44 | "PeT2D92j": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-GFP-PeT2D92j.pt?download=1",
45 | "HenDpDWe": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-GRB2-ABUNDANCE-HenDpDWe.pt?download=1",
46 | "cvnycE5Q": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-GRB2-BINDING-cvnycE5Q.pt?download=1",
47 | "ho54gxzv": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-Pab1-ho54gxzv.pt?download=1",
48 | "UEuMtmfx": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-PTEN-ABUNDANCE-UEuMtmfx.pt?download=1",
49 | "U3X8mSeT": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-PTEN-ACTIVITY-U3X8mSeT.pt?download=1",
50 | "ELL4GGQq": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-TEM-1-ELL4GGQq.pt?download=1",
51 | "BAWw23vW": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-Ube4b-BAWw23vW.pt?download=1",
52 | "RBtqxzvu": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-DLG4_2022-ABUNDANCE-RBtqxzvu.pt?download=1",
53 | "BuvxgE2x": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-DLG4_2022-BINDING-BuvxgE2x.pt?download=1",
54 | "9vSB3DRM": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-GB1-9vSB3DRM.pt?download=1",
55 | "6JBzHpkQ": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-GFP-6JBzHpkQ.pt?download=1",
56 | "dDoCCvfr": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-GRB2-ABUNDANCE-dDoCCvfr.pt?download=1",
57 | "jYesS9Ki": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-GRB2-BINDING-jYesS9Ki.pt?download=1",
58 | "jhbL2FeB": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-Pab1-jhbL2FeB.pt?download=1",
59 | "eJPPQYEW": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-PTEN-ABUNDANCE-eJPPQYEW.pt?download=1",
60 | "4gqYnW6V": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-PTEN-ACTIVITY-4gqYnW6V.pt?download=1",
61 | "K6BjsWXm": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-TEM-1-K6BjsWXm.pt?download=1",
62 | "G9piq6WH": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-Ube4b-G9piq6WH.pt?download=1",
63 |
64 | # finetuned LOCAL models
65 | "RMFA6dnX": "https://zenodo.org/records/14908509/files/FT-METL-L-1D-DLG4_2022-ABUNDANCE-RMFA6dnX.pt?download=1",
66 | "YdzBYWHs": "https://zenodo.org/records/14908509/files/FT-METL-L-1D-DLG4_2022-BINDING-YdzBYWHs.pt?download=1",
67 | "Pgcseywk": "https://zenodo.org/records/14908509/files/FT-METL-L-1D-GB1-Pgcseywk.pt?download=1",
68 | "HaUuRwfE": "https://zenodo.org/records/14908509/files/FT-METL-L-1D-GFP-HaUuRwfE.pt?download=1",
69 | "VNpi9Zjt": "https://zenodo.org/records/14908509/files/FT-METL-L-1D-GRB2-ABUNDANCE-VNpi9Zjt.pt?download=1",
70 | "Z59BhUaE": "https://zenodo.org/records/14908509/files/FT-METL-L-1D-GRB2-BINDING-Z59BhUaE.pt?download=1",
71 | "TdjCzoQQ": "https://zenodo.org/records/14908509/files/FT-METL-L-1D-Pab1-TdjCzoQQ.pt?download=1",
72 | "64ncFxBR": "https://zenodo.org/records/14908509/files/FT-METL-L-1D-TEM-1-64ncFxBR.pt?download=1",
73 | "e9uhhnAv": "https://zenodo.org/records/14908509/files/FT-METL-L-1D-Ube4b-e9uhhnAv.pt?download=1",
74 | "oUScGeHo": "https://zenodo.org/records/14908509/files/FT-METL-L-2M-1D-PTEN-ABUNDANCE-oUScGeHo.pt?download=1",
75 | "m9UsG7dq": "https://zenodo.org/records/14908509/files/FT-METL-L-2M-1D-PTEN-ACTIVITY-m9UsG7dq.pt?download=1",
76 | "DhuasDEr": "https://zenodo.org/records/14908509/files/FT-METL-L-2M-3D-PTEN-ABUNDANCE-DhuasDEr.pt?download=1",
77 | "8Vi7ENcC": "https://zenodo.org/records/14908509/files/FT-METL-L-2M-3D-PTEN-ACTIVITY-8Vi7ENcC.pt?download=1",
78 | "V3uTtXVe": "https://zenodo.org/records/14908509/files/FT-METL-L-3D-DLG4_2022-ABUNDANCE-V3uTtXVe.pt?download=1",
79 | "iu6ZahPw": "https://zenodo.org/records/14908509/files/FT-METL-L-3D-DLG4_2022-BINDING-iu6ZahPw.pt?download=1",
80 | "UvMMdsq4": "https://zenodo.org/records/14908509/files/FT-METL-L-3D-GB1-UvMMdsq4.pt?download=1",
81 | "LWEY95Yb": "https://zenodo.org/records/14908509/files/FT-METL-L-3D-GFP-LWEY95Yb.pt?download=1",
82 | "PqBMjXkA": "https://zenodo.org/records/14908509/files/FT-METL-L-3D-GRB2-ABUNDANCE-PqBMjXkA.pt?download=1",
83 | "VwcRN6UB": "https://zenodo.org/records/14908509/files/FT-METL-L-3D-GRB2-BINDING-VwcRN6UB.pt?download=1",
84 | "5SjoLx3y": "https://zenodo.org/records/14908509/files/FT-METL-L-3D-Pab1-5SjoLx3y.pt?download=1",
85 | "PncvgiJU": "https://zenodo.org/records/14908509/files/FT-METL-L-3D-TEM-1-PncvgiJU.pt?download=1",
86 | "NfbZL7jK": "https://zenodo.org/records/14908509/files/FT-METL-L-3D-Ube4b-NfbZL7jK.pt?download=1"
87 |
88 | }
89 |
90 | IDENT_UUID_MAP = {
91 | # the keys should be all lowercase
92 | "metl-g-20m-1d": "D72M9aEp",
93 | "metl-g-20m-3d": "Nr9zCKpR",
94 | "metl-g-50m-1d": "auKdzzwX",
95 | "metl-g-50m-3d": "6PSAzdfv",
96 |
97 | # GFP local source models
98 | "metl-l-2m-1d-gfp": "8gMPQJy4",
99 | "metl-l-2m-3d-gfp": "Hr4GNHws",
100 |
101 | # DLG4 local source models
102 | "metl-l-2m-1d-dlg4_2022": "8iFoiYw2",
103 | "metl-l-2m-3d-dlg4_2022": "kt5DdWTa",
104 |
105 | # GB1 local source models
106 | "metl-l-2m-1d-gb1": "DMfkjVzT",
107 | "metl-l-2m-3d-gb1": "epegcFiH",
108 |
109 | # GRB2 local source models
110 | "metl-l-2m-1d-grb2": "kS3rUS7h",
111 | "metl-l-2m-3d-grb2": "X7w83g6S",
112 |
113 | # Pab1 local source models
114 | "metl-l-2m-1d-pab1": "UKebCQGz",
115 | "metl-l-2m-3d-pab1": "2rr8V4th",
116 |
117 | # PTEN local source models
118 | "metl-l-2m-1d-pten": "CEMSx7ZC",
119 | "metl-l-2m-3d-pten": "PjxR5LW7",
120 |
121 | # TEM-1 local source models
122 | "metl-l-2m-1d-tem-1": "PREhfC22",
123 | "metl-l-2m-3d-tem-1": "9ASvszux",
124 |
125 | # Ube4b local source models
126 | "metl-l-2m-1d-ube4b": "HscFFkAb",
127 | "metl-l-2m-3d-ube4b": "H48oiNZN",
128 |
129 | # METL-Bind for GB1
130 | "metl-bind-2m-3d-gb1-standard": "K6mw24Rg",
131 | "metl-bind-2m-3d-gb1-binding": "Bo5wn2SG",
132 |
133 | # GFP design models, giving them an ident
134 | "metl-l-2m-1d-gfp-ft-design": "YoQkzoLD",
135 | "metl-l-2m-3d-gfp-ft-design": "PEkeRuxb",
136 |
137 | }
138 |
139 |
140 | def download_checkpoint(uuid):
141 | ckpt = torch.hub.load_state_dict_from_url(UUID_URL_MAP[uuid],
142 | map_location="cpu", file_name=f"{uuid}.pt")
143 | state_dict = ckpt["state_dict"]
144 | hyper_parameters = ckpt["hyper_parameters"]
145 |
146 | return state_dict, hyper_parameters
147 |
148 |
149 | def _get_data_encoding(hparams):
150 | if "encoding" in hparams and hparams["encoding"] == "int_seqs":
151 | encoding = Encoding.INT_SEQS
152 | elif "encoding" in hparams and hparams["encoding"] == "one_hot":
153 | encoding = Encoding.ONE_HOT
154 | elif (("encoding" in hparams and hparams["encoding"] == "auto") or "encoding" not in hparams) and \
155 | hparams["model_name"] in ["transformer_encoder"]:
156 | encoding = Encoding.INT_SEQS
157 | else:
158 | raise ValueError("Detected unsupported encoding in hyperparameters")
159 |
160 | return encoding
161 |
162 |
163 | def load_model_and_data_encoder(state_dict, hparams):
164 | model = models.Model[hparams["model_name"]].cls(**hparams)
165 | model.load_state_dict(state_dict)
166 |
167 | data_encoder = DataEncoder(_get_data_encoding(hparams))
168 |
169 | return model, data_encoder
170 |
171 |
172 | def get_from_uuid(uuid):
173 | if uuid in UUID_URL_MAP:
174 | state_dict, hparams = download_checkpoint(uuid)
175 | return load_model_and_data_encoder(state_dict, hparams)
176 | else:
177 | raise ValueError(f"UUID {uuid} not found in UUID_URL_MAP")
178 |
179 |
180 | def get_from_ident(ident):
181 | ident = ident.lower()
182 | if ident in IDENT_UUID_MAP:
183 | state_dict, hparams = download_checkpoint(IDENT_UUID_MAP[ident])
184 | return load_model_and_data_encoder(state_dict, hparams)
185 | else:
186 | raise ValueError(f"Identifier {ident} not found in IDENT_UUID_MAP")
187 |
188 |
189 | def get_from_checkpoint(ckpt_fn):
190 | ckpt = torch.load(ckpt_fn, map_location="cpu")
191 | state_dict = ckpt["state_dict"]
192 | hyper_parameters = ckpt["hyper_parameters"]
193 | return load_model_and_data_encoder(state_dict, hyper_parameters)
194 |
--------------------------------------------------------------------------------
/metl/models.py:
--------------------------------------------------------------------------------
1 | import collections
2 | import math
3 | from argparse import ArgumentParser
4 | import enum
5 | from os.path import isfile
6 | from typing import List, Tuple, Optional
7 |
8 | import torch
9 | import torch.nn as nn
10 | import torch.nn.functional as F
11 | from torch import Tensor
12 |
13 | import metl.relative_attention as ra
14 |
15 |
16 | def reset_parameters_helper(m: nn.Module):
17 | """ helper function for resetting model parameters, meant to be used with model.apply() """
18 |
19 | # the PyTorch MultiHeadAttention has a private function _reset_parameters()
20 | # other layers have a public reset_parameters()... go figure
21 | reset_parameters = getattr(m, "reset_parameters", None)
22 | reset_parameters_private = getattr(m, "_reset_parameters", None)
23 |
24 | if callable(reset_parameters) and callable(reset_parameters_private):
25 | raise RuntimeError("Module has both public and private methods for resetting parameters. "
26 | "This is unexpected... probably should just call the public one.")
27 |
28 | if callable(reset_parameters):
29 | m.reset_parameters()
30 |
31 | if callable(reset_parameters_private):
32 | m._reset_parameters()
33 |
34 |
35 | class SequentialWithArgs(nn.Sequential):
36 | def forward(self, x, **kwargs):
37 | for module in self:
38 | if isinstance(module, ra.RelativeTransformerEncoder) or isinstance(module, SequentialWithArgs):
39 | # for relative transformer encoders, pass in kwargs (pdb_fn)
40 | x = module(x, **kwargs)
41 | else:
42 |                 # for all other modules, don't pass in kwargs
43 | x = module(x)
44 | return x
45 |
46 |
47 | class PositionalEncoding(nn.Module):
48 | # originally from https://pytorch.org/tutorials/beginner/transformer_tutorial.html
49 | # they have since updated their implementation, but it is functionally equivalent
50 | def __init__(self, d_model, dropout=0.1, max_len=5000):
51 | super(PositionalEncoding, self).__init__()
52 | self.dropout = nn.Dropout(p=dropout)
53 |
54 | pe = torch.zeros(max_len, d_model)
55 | position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
56 | div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
57 | pe[:, 0::2] = torch.sin(position * div_term)
58 | pe[:, 1::2] = torch.cos(position * div_term)
59 | # note the implementation on Pytorch's website expects [seq_len, batch_size, embedding_dim]
60 | # however our data is in [batch_size, seq_len, embedding_dim] (i.e. batch_first)
61 | # fixed by changing pe = pe.unsqueeze(0).transpose(0, 1) to pe = pe.unsqueeze(0)
62 | # also down below, changing our indexing into the position encoding to reflect new dimensions
63 | # pe = pe.unsqueeze(0).transpose(0, 1)
64 | pe = pe.unsqueeze(0)
65 | self.register_buffer('pe', pe)
66 |
67 | def forward(self, x, **kwargs):
68 | # note the implementation on Pytorch's website expects [seq_len, batch_size, embedding_dim]
69 | # however our data is in [batch_size, seq_len, embedding_dim] (i.e. batch_first)
70 | # fixed by changing x = x + self.pe[:x.size(0)] to x = x + self.pe[:, :x.size(1), :]
71 | # x = x + self.pe[:x.size(0), :]
72 | x = x + self.pe[:, :x.size(1), :]
73 | return self.dropout(x)
74 |
75 |
76 | class ScaledEmbedding(nn.Module):
77 | # https://pytorch.org/tutorials/beginner/translation_transformer.html
78 | # a helper function for embedding that scales by sqrt(d_model) in the forward()
79 |     # makes it so we don't have to do the scaling in the main AttnModel forward()
80 |
81 | # todo: be aware of embedding scaling factor
82 | # regarding the scaling factor, it's unclear exactly what the purpose is and whether it is needed
83 | # there are several theories on why it is used, and it shows up in all the transformer reference implementations
84 | # https://datascience.stackexchange.com/questions/87906/transformer-model-why-are-word-embeddings-scaled-before-adding-positional-encod
85 | # 1. Has something to do with weight sharing between the embedding and the decoder output
86 | # 2. Scales up the embeddings so the signal doesn't get overwhelmed when adding the absolute positional encoding
87 | # 3. It cancels out with the scaling factor in scaled dot product attention, and helps make the model robust
88 | # to the choice of embedding_len
89 | # 4. It's not actually needed
90 |
91 | # Regarding #1, not really sure about this. In section 3.4 of attention is all you need,
92 | # that's where they state they multiply the embedding weights by sqrt(d_model), and the context is that they
93 | # are sharing the same weight matrix between the two embedding layers and the pre-softmax linear transformation.
94 | # there may be a reason that we want those weights scaled differently for the embedding layers vs. the linear
95 | # transformation. It might have something to do with the scale at which embedding weights are initialized
96 | # is more appropriate for the decoder linear transform vs how they are used in the attention function. Might have
97 | # something to do with computing the correct next-token probabilities. Overall, I'm really not sure about this,
98 | # but we aren't using a decoder anyway. So if this is the reason, then we don't need to perform the multiply.
99 |
100 | # Regarding #2, it seems like in one implementation of transformers (fairseq), the sinusoidal positional encoding
101 |     # has a range of (-1.0, 1.0), but the word embeddings are initialized with mean 0 and s.d. embedding_dim ** -0.5,
102 | # which for embedding_dim=512, is a range closer to (-0.10, 0.10). Thus, the positional embedding would overwhelm
103 | # the word embeddings when they are added together. The scaling factor increases the signal of the word embeddings.
104 | # for embedding_dim=512, it scales word embeddings by 22, increasing range of the word embeddings to (-2.2, 2.2).
105 | # link to fairseq implementation, search for nn.init to see them do the initialization
106 | # https://fairseq.readthedocs.io/en/v0.7.1/_modules/fairseq/models/transformer.html
107 | #
108 | # For PyTorch, PyTorch initializes nn.Embedding with a standard normal distribution mean 0, variance 1: N(0,1).
109 | # this puts the range for the word embeddings around (-3, 3). the pytorch implementation for positional encoding
110 | # also has a range of (-1.0, 1.0). So already, these are much closer in scale, and it doesn't seem like we need
111 |     # to increase the scale of the word embeddings. However, the PyTorch example still multiplies by the scaling factor.
112 |     # it's unclear whether this is just a carryover that is not actually needed, or if there is a different reason
113 | #
114 | # EDIT! I just realized that even though nn.Embedding defaults to a range of around (-3, 3), the PyTorch
115 | # transformer example actually re-initializes them using a uniform distribution in the range of (-0.1, 0.1)
116 | # that makes it very similar to the fairseq implementation, so the scaling factor that PyTorch uses actually would
117 | # bring the word embedding and positional encodings much closer in scale. So this could be the reason why pytorch
118 | # does it
119 |
120 | # Regarding #3, I don't think so. Firstly, does it actually cancel there? Secondly, the purpose of the scaling
121 | # factor in scaled dot product attention, according to attention is all you need, is to counteract dot products
122 |     # that are very high in magnitude due to the choice of a large embedding length (aka d_k). The problem with high magnitude
123 | # dot products is that potentially, the softmax is pushed into regions where it has extremely small gradients,
124 | # making learning difficult. If the scaling factor in the embedding was meant to counteract the scaling factor in
125 | # scaled dot product attention, then what would be the point of doing all that?
126 |
127 |     # Regarding #4, I don't think the scaling will have any effect in practice; it's probably not needed
128 |
129 | # Overall, I think #2 is the most likely reason why this scaling is performed. In theory, I think
130 | # even if the scaling wasn't performed, the network might learn to up-scale the word embedding weights to increase
131 | # word embedding signal vs. the position signal on its own. Another question I have is why not just initialize
132 | # the embedding weights to have higher initial values? Why put it in the range (-0.1, 0.1)?
133 | #
134 |     # The fact that most implementations have this scaling concerns me; it makes me think I might be missing something.
135 | # For our purposes, we can train a couple models to see if scaling has any positive or negative effect.
136 | # Still need to think about potential effects of this scaling on relative position embeddings.
137 |
138 | def __init__(self, num_embeddings: int, embedding_dim: int, scale: bool):
139 | super(ScaledEmbedding, self).__init__()
140 | self.embedding = nn.Embedding(num_embeddings, embedding_dim)
141 | self.emb_size = embedding_dim
142 | self.embed_scale = math.sqrt(self.emb_size)
143 |
144 | self.scale = scale
145 |
146 | self.init_weights()
147 |
148 | def init_weights(self):
149 | # todo: not sure why PyTorch example initializes weights like this
150 | # might have something to do with word embedding scaling factor (see above)
151 | # could also just try the default weight initialization for nn.Embedding()
152 | init_range = 0.1
153 | self.embedding.weight.data.uniform_(-init_range, init_range)
154 |
155 | def forward(self, tokens: Tensor, **kwargs):
156 | if self.scale:
157 | return self.embedding(tokens.long()) * self.embed_scale
158 | else:
159 | return self.embedding(tokens.long())
160 |
161 |
162 | class FCBlock(nn.Module):
163 | """ a fully connected block with options for batchnorm and dropout
164 | can extend in the future with option for different activation, etc """
165 |
166 | def __init__(self,
167 | in_features: int,
168 | num_hidden_nodes: int = 64,
169 | use_batchnorm: bool = False,
170 | use_layernorm: bool = False,
171 | norm_before_activation: bool = False,
172 | use_dropout: bool = False,
173 | dropout_rate: float = 0.2,
174 | activation: str = "relu"):
175 |
176 | super().__init__()
177 |
178 | if use_batchnorm and use_layernorm:
179 | raise ValueError("Only one of use_batchnorm or use_layernorm can be set to True")
180 |
181 | self.use_batchnorm = use_batchnorm
182 | self.use_dropout = use_dropout
183 | self.use_layernorm = use_layernorm
184 | self.norm_before_activation = norm_before_activation
185 |
186 | self.fc = nn.Linear(in_features=in_features, out_features=num_hidden_nodes)
187 |
188 | self.activation = get_activation_fn(activation, functional=False)
189 |
190 | if use_batchnorm:
191 | self.norm = nn.BatchNorm1d(num_hidden_nodes)
192 |
193 | if use_layernorm:
194 | self.norm = nn.LayerNorm(num_hidden_nodes)
195 |
196 | if use_dropout:
197 | self.dropout = nn.Dropout(p=dropout_rate)
198 |
199 | def forward(self, x, **kwargs):
200 | x = self.fc(x)
201 |
202 | # norm can be before or after activation, using flag
203 | if (self.use_batchnorm or self.use_layernorm) and self.norm_before_activation:
204 | x = self.norm(x)
205 |
206 | x = self.activation(x)
207 |
208 |         # batchnorm applied after activation; there is some discussion on this online
209 | if (self.use_batchnorm or self.use_layernorm) and not self.norm_before_activation:
210 | x = self.norm(x)
211 |
212 | # dropout being applied last
213 | if self.use_dropout:
214 | x = self.dropout(x)
215 |
216 | return x
217 |
218 |
219 | class TaskSpecificPredictionLayers(nn.Module):
220 | """ Constructs num_tasks [dense(num_hidden_nodes)+relu+dense(1)] layers, each independently transforming input
221 | into a single output node. All num_tasks outputs are then concatenated into a single tensor. """
222 |
223 | # todo: the independent layers are run in sequence rather than in parallel, causing a slowdown that
224 | # scales with the number of tasks. might be able to run in parallel by hacking convolution operation
225 | # https://stackoverflow.com/questions/58374980/run-multiple-models-of-an-ensemble-in-parallel-with-pytorch
226 | # https://github.com/pytorch/pytorch/issues/54147
227 | # https://github.com/pytorch/pytorch/issues/36459
228 |
229 | def __init__(self,
230 | num_tasks: int,
231 | in_features: int,
232 | num_hidden_nodes: int = 64,
233 | use_batchnorm: bool = False,
234 | use_dropout: bool = False,
235 | dropout_rate: float = 0.2,
236 | activation: str = "relu"):
237 |
238 | super().__init__()
239 |
240 | # each task-specific layer outputs a single node,
241 | # which can be combined with torch.cat into prediction vector
242 | self.task_specific_pred_layers = nn.ModuleList()
243 | for i in range(num_tasks):
244 | layers = [FCBlock(in_features=in_features,
245 | num_hidden_nodes=num_hidden_nodes,
246 | use_batchnorm=use_batchnorm,
247 | use_dropout=use_dropout,
248 | dropout_rate=dropout_rate,
249 | activation=activation),
250 | nn.Linear(in_features=num_hidden_nodes, out_features=1)]
251 | self.task_specific_pred_layers.append(nn.Sequential(*layers))
252 |
253 | def forward(self, x, **kwargs):
254 | # run each task-specific layer and concatenate outputs into a single output vector
255 | task_specific_outputs = []
256 | for layer in self.task_specific_pred_layers:
257 | task_specific_outputs.append(layer(x))
258 |
259 | output = torch.cat(task_specific_outputs, dim=1)
260 | return output
261 |
262 |
263 | class GlobalAveragePooling(nn.Module):
264 | """ helper class for global average pooling """
265 |
266 | def __init__(self, dim=1):
267 | super().__init__()
268 | # our data is in [batch_size, sequence_length, embedding_length]
269 | # with global pooling, we want to pool over the sequence dimension (dim=1)
270 | self.dim = dim
271 |
272 | def forward(self, x, **kwargs):
273 | return torch.mean(x, dim=self.dim)
274 |
275 |
276 | class CLSPooling(nn.Module):
277 | """ helper class for CLS token extraction """
278 |
279 | def __init__(self, cls_position=0):
280 | super().__init__()
281 |
282 | # the position of the CLS token in the sequence dimension
283 | # currently, the CLS token is in the first position, but may move it to the last position
284 | self.cls_position = cls_position
285 |
286 | def forward(self, x, **kwargs):
287 | # assumes input is in [batch_size, sequence_len, embedding_len]
288 | # thus sequence dimension is dimension 1
289 | return x[:, self.cls_position, :]
290 |
291 |
292 | class TransformerEncoderWrapper(nn.TransformerEncoder):
293 | """ wrapper around PyTorch's TransformerEncoder that re-initializes layer parameters,
294 | so each transformer encoder layer has a different initialization """
295 |
296 |     # todo: PyTorch is changing its transformer API... check up on it and see if there is a better way
297 | def __init__(self, encoder_layer, num_layers, norm=None, reset_params=True):
298 | super().__init__(encoder_layer, num_layers, norm)
299 | if reset_params:
300 | self.apply(reset_parameters_helper)
301 |
302 |
303 | class AttnModel(nn.Module):
304 | # https://pytorch.org/tutorials/beginner/transformer_tutorial.html
305 |
306 | @staticmethod
307 | def add_model_specific_args(parent_parser):
308 | parser = ArgumentParser(parents=[parent_parser], add_help=False)
309 |
310 | parser.add_argument('--pos_encoding', type=str, default="absolute",
311 | choices=["none", "absolute", "relative", "relative_3D"],
312 | help="what type of positional encoding to use")
313 | parser.add_argument('--pos_encoding_dropout', type=float, default=0.1,
314 |                             help="how much dropout to use in positional encoding, for pos_encoding==absolute")
315 | parser.add_argument('--clipping_threshold', type=int, default=3,
316 | help="clipping threshold for relative position embedding, for relative and relative_3D")
317 | parser.add_argument('--contact_threshold', type=int, default=7,
318 | help="threshold, in angstroms, for contact map, for relative_3D")
319 | parser.add_argument('--embedding_len', type=int, default=128)
320 | parser.add_argument('--num_heads', type=int, default=2)
321 | parser.add_argument('--num_hidden', type=int, default=64)
322 | parser.add_argument('--num_enc_layers', type=int, default=2)
323 | parser.add_argument('--enc_layer_dropout', type=float, default=0.1)
324 | parser.add_argument('--use_final_encoder_norm', action="store_true", default=False)
325 |
326 | parser.add_argument('--global_average_pooling', action="store_true", default=False)
327 | parser.add_argument('--cls_pooling', action="store_true", default=False)
328 |
329 | parser.add_argument('--use_task_specific_layers', action="store_true", default=False,
330 |                             help="mutually exclusive with use_final_hidden_layer; takes priority over use_final_hidden_layer"
331 | " if both flags are set")
332 | parser.add_argument('--task_specific_hidden_nodes', type=int, default=64)
333 | parser.add_argument('--use_final_hidden_layer', action="store_true", default=False)
334 | parser.add_argument('--final_hidden_size', type=int, default=64)
335 | parser.add_argument('--use_final_hidden_layer_norm', action="store_true", default=False)
336 | parser.add_argument('--final_hidden_layer_norm_before_activation', action="store_true", default=False)
337 | parser.add_argument('--use_final_hidden_layer_dropout', action="store_true", default=False)
338 | parser.add_argument('--final_hidden_layer_dropout_rate', type=float, default=0.2)
339 |
340 | parser.add_argument('--activation', type=str, default="relu",
341 | help="activation function used for all activations in the network")
342 | return parser
343 |
344 | def __init__(self,
345 | # data args
346 | num_tasks: int,
347 | aa_seq_len: int,
348 | num_tokens: int,
349 | # transformer encoder model args
350 | pos_encoding: str = "absolute",
351 | pos_encoding_dropout: float = 0.1,
352 | clipping_threshold: int = 3,
353 | contact_threshold: int = 7,
354 | pdb_fns: List[str] = None,
355 | embedding_len: int = 64,
356 | num_heads: int = 2,
357 | num_hidden: int = 64,
358 | num_enc_layers: int = 2,
359 | enc_layer_dropout: float = 0.1,
360 | use_final_encoder_norm: bool = False,
361 | # pooling to fixed-length representation
362 | global_average_pooling: bool = True,
363 | cls_pooling: bool = False,
364 | # prediction layers
365 | use_task_specific_layers: bool = False,
366 | task_specific_hidden_nodes: int = 64,
367 | use_final_hidden_layer: bool = False,
368 | final_hidden_size: int = 64,
369 | use_final_hidden_layer_norm: bool = False,
370 | final_hidden_layer_norm_before_activation: bool = False,
371 | use_final_hidden_layer_dropout: bool = False,
372 | final_hidden_layer_dropout_rate: float = 0.2,
373 | # activation function
374 | activation: str = "relu",
375 | *args, **kwargs):
376 |
377 | super().__init__()
378 |
379 | # store embedding length for use in the forward function
380 | self.embedding_len = embedding_len
381 | self.aa_seq_len = aa_seq_len
382 |
383 | # build up layers
384 | layers = collections.OrderedDict()
385 |
386 | # amino acid embedding
387 | layers["embedder"] = ScaledEmbedding(num_embeddings=num_tokens, embedding_dim=embedding_len, scale=True)
388 |
389 | # absolute positional encoding
390 | if pos_encoding == "absolute":
391 | layers["pos_encoder"] = PositionalEncoding(embedding_len, dropout=pos_encoding_dropout, max_len=512)
392 |
393 | # transformer encoder layer for none or absolute positional encoding
394 | if pos_encoding in ["none", "absolute"]:
395 | encoder_layer = torch.nn.TransformerEncoderLayer(d_model=embedding_len,
396 | nhead=num_heads,
397 | dim_feedforward=num_hidden,
398 | dropout=enc_layer_dropout,
399 | activation=get_activation_fn(activation),
400 | norm_first=True,
401 | batch_first=True)
402 |
403 | # layer norm that is used after the transformer encoder layers
404 |             # if norm_first is False, this is *redundant* and not needed
405 | # but if norm_first is True, this can be used to normalize outputs from
406 | # the transformer encoder before inputting to the final fully connected layer
407 | encoder_norm = None
408 | if use_final_encoder_norm:
409 | encoder_norm = nn.LayerNorm(embedding_len)
410 |
411 | layers["tr_encoder"] = TransformerEncoderWrapper(encoder_layer=encoder_layer,
412 | num_layers=num_enc_layers,
413 | norm=encoder_norm)
414 |
415 | # transformer encoder layer for relative position encoding
416 | elif pos_encoding in ["relative", "relative_3D"]:
417 | relative_encoder_layer = ra.RelativeTransformerEncoderLayer(d_model=embedding_len,
418 | nhead=num_heads,
419 | pos_encoding=pos_encoding,
420 | clipping_threshold=clipping_threshold,
421 | contact_threshold=contact_threshold,
422 | pdb_fns=pdb_fns,
423 | dim_feedforward=num_hidden,
424 | dropout=enc_layer_dropout,
425 | activation=get_activation_fn(activation),
426 | norm_first=True)
427 |
428 | encoder_norm = None
429 | if use_final_encoder_norm:
430 | encoder_norm = nn.LayerNorm(embedding_len)
431 |
432 | layers["tr_encoder"] = ra.RelativeTransformerEncoder(encoder_layer=relative_encoder_layer,
433 | num_layers=num_enc_layers,
434 | norm=encoder_norm)
435 |
436 | # GLOBAL AVERAGE POOLING OR CLS TOKEN
437 | # set up the layers and output shapes (i.e. input shapes for the pred layer)
438 | if global_average_pooling:
439 | # pool over the sequence dimension
440 | layers["avg_pooling"] = GlobalAveragePooling(dim=1)
441 | pred_layer_input_features = embedding_len
442 | elif cls_pooling:
443 | layers["cls_pooling"] = CLSPooling(cls_position=0)
444 | pred_layer_input_features = embedding_len
445 | else:
446 | # no global average pooling or CLS token
447 | # sequence dimension is still there, just flattened
448 | layers["flatten"] = nn.Flatten()
449 | pred_layer_input_features = embedding_len * aa_seq_len
450 |
451 | # PREDICTION
452 | if use_task_specific_layers:
453 | # task specific prediction layers (nonlinear transform for each task)
454 | layers["prediction"] = TaskSpecificPredictionLayers(num_tasks=num_tasks,
455 | in_features=pred_layer_input_features,
456 | num_hidden_nodes=task_specific_hidden_nodes,
457 | activation=activation)
458 | elif use_final_hidden_layer:
459 | # combined prediction linear (linear transform for each task)
460 | layers["fc1"] = FCBlock(in_features=pred_layer_input_features,
461 | num_hidden_nodes=final_hidden_size,
462 | use_batchnorm=False,
463 | use_layernorm=use_final_hidden_layer_norm,
464 | norm_before_activation=final_hidden_layer_norm_before_activation,
465 | use_dropout=use_final_hidden_layer_dropout,
466 | dropout_rate=final_hidden_layer_dropout_rate,
467 | activation=activation)
468 |
469 | layers["prediction"] = nn.Linear(in_features=final_hidden_size, out_features=num_tasks)
470 | else:
471 | layers["prediction"] = nn.Linear(in_features=pred_layer_input_features, out_features=num_tasks)
472 |
473 | # FINAL MODEL
474 | self.model = SequentialWithArgs(layers)
475 |
476 | def forward(self, x, **kwargs):
477 | return self.model(x, **kwargs)
478 |
479 |
480 | class Transpose(nn.Module):
481 | """ helper layer to swap data from (batch, seq, channels) to (batch, channels, seq)
482 |         used as a helper in the convolutional network, since PyTorch convolutions default to channels-first """
483 |
484 | def __init__(self, dims: Tuple[int, ...] = (1, 2)):
485 | super().__init__()
486 | self.dims = dims
487 |
488 | def forward(self, x, **kwargs):
489 | x = x.transpose(*self.dims).contiguous()
490 | return x
491 |
492 |
493 | def conv1d_out_shape(seq_len, kernel_size, stride=1, pad=0, dilation=1):
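    # worked example: seq_len=56, kernel_size=7, stride=1, pad=0, dilation=1 -> (56 + 0 - 6 - 1) // 1 + 1 = 50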
494 |     return ((seq_len + (2 * pad) - (dilation * (kernel_size - 1)) - 1) // stride) + 1
495 |
496 |
497 | class ConvBlock(nn.Module):
498 | def __init__(self,
499 | in_channels: int,
500 | out_channels: int,
501 | kernel_size: int,
502 | dilation: int = 1,
503 | padding: str = "same",
504 | use_batchnorm: bool = False,
505 | use_layernorm: bool = False,
506 | norm_before_activation: bool = False,
507 | use_dropout: bool = False,
508 | dropout_rate: float = 0.2,
509 | activation: str = "relu"):
510 |
511 | super().__init__()
512 |
513 | if use_batchnorm and use_layernorm:
514 | raise ValueError("Only one of use_batchnorm or use_layernorm can be set to True")
515 |
516 | self.use_batchnorm = use_batchnorm
517 | self.use_layernorm = use_layernorm
518 | self.norm_before_activation = norm_before_activation
519 | self.use_dropout = use_dropout
520 |
521 | self.conv = nn.Conv1d(in_channels=in_channels,
522 | out_channels=out_channels,
523 | kernel_size=kernel_size,
524 | padding=padding,
525 | dilation=dilation)
526 |
527 | self.activation = get_activation_fn(activation, functional=False)
528 |
529 | if use_batchnorm:
530 | self.norm = nn.BatchNorm1d(out_channels)
531 |
532 | if use_layernorm:
533 | self.norm = nn.LayerNorm(out_channels)
534 |
535 | if use_dropout:
536 | self.dropout = nn.Dropout(p=dropout_rate)
537 |
538 | def forward(self, x, **kwargs):
539 | x = self.conv(x)
540 |
541 | # norm can be before or after activation, using flag
542 | if self.use_batchnorm and self.norm_before_activation:
543 | x = self.norm(x)
544 | elif self.use_layernorm and self.norm_before_activation:
545 | x = self.norm(x.transpose(1, 2)).transpose(1, 2)
546 |
547 | x = self.activation(x)
548 |
549 |         # batchnorm applied after activation; there is some discussion on this online
550 | if self.use_batchnorm and not self.norm_before_activation:
551 | x = self.norm(x)
552 | elif self.use_layernorm and not self.norm_before_activation:
553 | x = self.norm(x.transpose(1, 2)).transpose(1, 2)
554 |
555 |         # dropout applied after batchnorm; there is some discussion on this online
556 | if self.use_dropout:
557 | x = self.dropout(x)
558 |
559 | return x
560 |
561 |
562 | class ConvModel2(nn.Module):
563 | """ convolutional source model that supports padded inputs, pooling, etc """
564 |
565 | @staticmethod
566 | def add_model_specific_args(parent_parser):
567 | parser = ArgumentParser(parents=[parent_parser], add_help=False)
568 | parser.add_argument('--use_embedding', action="store_true", default=False)
569 | parser.add_argument('--embedding_len', type=int, default=128)
570 |
571 | parser.add_argument('--num_conv_layers', type=int, default=1)
572 | parser.add_argument('--kernel_sizes', type=int, nargs="+", default=[7])
573 | parser.add_argument('--out_channels', type=int, nargs="+", default=[128])
574 | parser.add_argument('--dilations', type=int, nargs="+", default=[1])
575 | parser.add_argument('--padding', type=str, default="valid", choices=["valid", "same"])
576 | parser.add_argument('--use_conv_layer_norm', action="store_true", default=False)
577 | parser.add_argument('--conv_layer_norm_before_activation', action="store_true", default=False)
578 | parser.add_argument('--use_conv_layer_dropout', action="store_true", default=False)
579 | parser.add_argument('--conv_layer_dropout_rate', type=float, default=0.2)
580 |
581 | parser.add_argument('--global_average_pooling', action="store_true", default=False)
582 |
583 | parser.add_argument('--use_task_specific_layers', action="store_true", default=False)
584 | parser.add_argument('--task_specific_hidden_nodes', type=int, default=64)
585 | parser.add_argument('--use_final_hidden_layer', action="store_true", default=False)
586 | parser.add_argument('--final_hidden_size', type=int, default=64)
587 | parser.add_argument('--use_final_hidden_layer_norm', action="store_true", default=False)
588 | parser.add_argument('--final_hidden_layer_norm_before_activation', action="store_true", default=False)
589 | parser.add_argument('--use_final_hidden_layer_dropout', action="store_true", default=False)
590 | parser.add_argument('--final_hidden_layer_dropout_rate', type=float, default=0.2)
591 |
592 | parser.add_argument('--activation', type=str, default="relu",
593 | help="activation function used for all activations in the network")
594 |
595 | return parser
596 |
597 | def __init__(self,
598 | # data
599 | num_tasks: int,
600 | aa_seq_len: int,
601 | aa_encoding_len: int,
602 | num_tokens: int,
603 | # convolutional model args
604 | use_embedding: bool = False,
605 | embedding_len: int = 64,
606 | num_conv_layers: int = 1,
607 | kernel_sizes: List[int] = (7,),
608 | out_channels: List[int] = (128,),
609 | dilations: List[int] = (1,),
610 | padding: str = "valid",
611 | use_conv_layer_norm: bool = False,
612 | conv_layer_norm_before_activation: bool = False,
613 | use_conv_layer_dropout: bool = False,
614 | conv_layer_dropout_rate: float = 0.2,
615 | # pooling
616 | global_average_pooling: bool = True,
617 | # prediction layers
618 | use_task_specific_layers: bool = False,
619 | task_specific_hidden_nodes: int = 64,
620 | use_final_hidden_layer: bool = False,
621 | final_hidden_size: int = 64,
622 | use_final_hidden_layer_norm: bool = False,
623 | final_hidden_layer_norm_before_activation: bool = False,
624 | use_final_hidden_layer_dropout: bool = False,
625 | final_hidden_layer_dropout_rate: float = 0.2,
626 | # activation function
627 | activation: str = "relu",
628 | *args, **kwargs):
629 |
630 | super(ConvModel2, self).__init__()
631 |
632 | # build up the layers
633 | layers = collections.OrderedDict()
634 |
635 | # amino acid embedding
636 | if use_embedding:
637 | layers["embedder"] = ScaledEmbedding(num_embeddings=num_tokens, embedding_dim=embedding_len, scale=False)
638 |
639 | # transpose the input to match PyTorch's expected format
640 | layers["transpose"] = Transpose(dims=(1, 2))
641 |
642 | # build up the convolutional layers
643 | for layer_num in range(num_conv_layers):
644 | # determine the number of input channels for the first convolutional layer
645 | if layer_num == 0 and use_embedding:
646 | # for the first convolutional layer, the in_channels is the embedding_len
647 | in_channels = embedding_len
648 | elif layer_num == 0 and not use_embedding:
649 | # for the first convolutional layer, the in_channels is the aa_encoding_len
650 | in_channels = aa_encoding_len
651 | else:
652 | in_channels = out_channels[layer_num - 1]
653 |
654 | layers[f"conv{layer_num}"] = ConvBlock(in_channels=in_channels,
655 | out_channels=out_channels[layer_num],
656 | kernel_size=kernel_sizes[layer_num],
657 | dilation=dilations[layer_num],
658 | padding=padding,
659 | use_batchnorm=False,
660 | use_layernorm=use_conv_layer_norm,
661 | norm_before_activation=conv_layer_norm_before_activation,
662 | use_dropout=use_conv_layer_dropout,
663 | dropout_rate=conv_layer_dropout_rate,
664 | activation=activation)
665 |
666 | # handle transition from convolutional layers to fully connected layer
667 | # either use global average pooling or flatten
668 | # take into consideration whether we are using valid or same padding
669 | if global_average_pooling:
670 | # global average pooling (mean across the seq len dimension)
671 | # the seq len dimensions is the last dimension (batch_size, num_filters, seq_len)
672 | layers["avg_pooling"] = GlobalAveragePooling(dim=-1)
673 | # the prediction layers will take num_filters input features
674 | pred_layer_input_features = out_channels[-1]
675 |
676 | else:
677 | # no global average pooling. flatten instead.
678 | layers["flatten"] = nn.Flatten()
679 | # calculate the final output len of the convolutional layers
680 | # and the number of input features for the prediction layers
681 | if padding == "valid":
682 | # valid padding (aka no padding) results in shrinking length in progressive layers
683 | conv_out_len = conv1d_out_shape(aa_seq_len, kernel_size=kernel_sizes[0], dilation=dilations[0])
684 | for layer_num in range(1, num_conv_layers):
685 | conv_out_len = conv1d_out_shape(conv_out_len,
686 | kernel_size=kernel_sizes[layer_num],
687 | dilation=dilations[layer_num])
688 | pred_layer_input_features = conv_out_len * out_channels[-1]
689 | else:
690 | # padding == "same"
691 | pred_layer_input_features = aa_seq_len * out_channels[-1]
692 |
693 | # prediction layer
694 | if use_task_specific_layers:
695 | layers["prediction"] = TaskSpecificPredictionLayers(num_tasks=num_tasks,
696 | in_features=pred_layer_input_features,
697 | num_hidden_nodes=task_specific_hidden_nodes,
698 | activation=activation)
699 |
700 | # final hidden layer (with potential additional dropout)
701 | elif use_final_hidden_layer:
702 | layers["fc1"] = FCBlock(in_features=pred_layer_input_features,
703 | num_hidden_nodes=final_hidden_size,
704 | use_batchnorm=False,
705 | use_layernorm=use_final_hidden_layer_norm,
706 | norm_before_activation=final_hidden_layer_norm_before_activation,
707 | use_dropout=use_final_hidden_layer_dropout,
708 | dropout_rate=final_hidden_layer_dropout_rate,
709 | activation=activation)
710 | layers["prediction"] = nn.Linear(in_features=final_hidden_size, out_features=num_tasks)
711 |
712 | else:
713 | layers["prediction"] = nn.Linear(in_features=pred_layer_input_features, out_features=num_tasks)
714 |
715 | self.model = nn.Sequential(layers)
716 |
717 | def forward(self, x, **kwargs):
718 | output = self.model(x)
719 | return output
720 |
721 |
722 | class ConvModel(nn.Module):
723 | """ a convolutional network with convolutional layers followed by a fully connected layer """
724 |
725 | @staticmethod
726 | def add_model_specific_args(parent_parser):
727 | parser = ArgumentParser(parents=[parent_parser], add_help=False)
728 | parser.add_argument('--num_conv_layers', type=int, default=1)
729 | parser.add_argument('--kernel_sizes', type=int, nargs="+", default=[7])
730 | parser.add_argument('--out_channels', type=int, nargs="+", default=[128])
731 | parser.add_argument('--padding', type=str, default="valid", choices=["valid", "same"])
732 | parser.add_argument('--use_final_hidden_layer', action="store_true",
733 | help="whether to use a final hidden layer")
734 | parser.add_argument('--final_hidden_size', type=int, default=128,
735 | help="number of nodes in the final hidden layer")
736 | parser.add_argument('--use_dropout', action="store_true",
737 | help="whether to use dropout in the final hidden layer")
738 | parser.add_argument('--dropout_rate', type=float, default=0.2,
739 | help="dropout rate in the final hidden layer")
740 | parser.add_argument('--use_task_specific_layers', action="store_true", default=False)
741 | parser.add_argument('--task_specific_hidden_nodes', type=int, default=64)
742 | return parser
743 |
744 | def __init__(self,
745 | num_tasks: int,
746 | aa_seq_len: int,
747 | aa_encoding_len: int,
748 | num_conv_layers: int = 1,
749 | kernel_sizes: List[int] = (7,),
750 | out_channels: List[int] = (128,),
751 | padding: str = "valid",
752 | use_final_hidden_layer: bool = True,
753 | final_hidden_size: int = 128,
754 | use_dropout: bool = False,
755 | dropout_rate: float = 0.2,
756 | use_task_specific_layers: bool = False,
757 | task_specific_hidden_nodes: int = 64,
758 | *args, **kwargs):
759 |
760 | super(ConvModel, self).__init__()
761 |
762 | # set up the model as a Sequential block (less to do in forward())
763 | layers = collections.OrderedDict()
764 |
765 | layers["transpose"] = Transpose(dims=(1, 2))
766 |
767 | for layer_num in range(num_conv_layers):
768 |             # for the first convolutional layer, the in_channels is the aa_encoding_len
769 | in_channels = aa_encoding_len if layer_num == 0 else out_channels[layer_num - 1]
770 |
771 | layers["conv{}".format(layer_num)] = nn.Sequential(
772 | nn.Conv1d(in_channels=in_channels,
773 | out_channels=out_channels[layer_num],
774 | kernel_size=kernel_sizes[layer_num],
775 | padding=padding),
776 | nn.ReLU()
777 | )
778 |
779 | layers["flatten"] = nn.Flatten()
780 |
781 | # calculate the final output len of the convolutional layers
782 | # and the number of input features for the prediction layers
783 | if padding == "valid":
784 | # valid padding (aka no padding) results in shrinking length in progressive layers
785 | conv_out_len = conv1d_out_shape(aa_seq_len, kernel_size=kernel_sizes[0])
786 | for layer_num in range(1, num_conv_layers):
787 | conv_out_len = conv1d_out_shape(conv_out_len, kernel_size=kernel_sizes[layer_num])
788 | next_dim = conv_out_len * out_channels[-1]
789 | elif padding == "same":
790 | next_dim = aa_seq_len * out_channels[-1]
791 | else:
792 | raise ValueError("unexpected value for padding: {}".format(padding))
793 |
794 | # final hidden layer (with potential additional dropout)
795 | if use_final_hidden_layer:
796 | layers["fc1"] = FCBlock(in_features=next_dim,
797 | num_hidden_nodes=final_hidden_size,
798 | use_batchnorm=False,
799 | use_dropout=use_dropout,
800 | dropout_rate=dropout_rate)
801 | next_dim = final_hidden_size
802 |
803 | # final prediction layer
804 | # either task specific nonlinear layers or a single linear layer
805 | if use_task_specific_layers:
806 | layers["prediction"] = TaskSpecificPredictionLayers(num_tasks=num_tasks,
807 | in_features=next_dim,
808 | num_hidden_nodes=task_specific_hidden_nodes)
809 | else:
810 | layers["prediction"] = nn.Linear(in_features=next_dim, out_features=num_tasks)
811 |
812 | self.model = nn.Sequential(layers)
813 |
814 | def forward(self, x, **kwargs):
815 | output = self.model(x)
816 | return output
817 |
818 |
819 | class FCModel(nn.Module):
820 |
821 | @staticmethod
822 | def add_model_specific_args(parent_parser):
823 | parser = ArgumentParser(parents=[parent_parser], add_help=False)
824 | parser.add_argument('--num_layers', type=int, default=1)
825 | parser.add_argument('--num_hidden', nargs="+", type=int, default=[128])
826 | parser.add_argument('--use_batchnorm', action="store_true", default=False)
827 | parser.add_argument('--use_layernorm', action="store_true", default=False)
828 | parser.add_argument('--norm_before_activation', action="store_true", default=False)
829 | parser.add_argument('--use_dropout', action="store_true", default=False)
830 | parser.add_argument('--dropout_rate', type=float, default=0.2)
831 | return parser
832 |
833 | def __init__(self,
834 | num_tasks: int,
835 | seq_encoding_len: int,
836 | num_layers: int = 1,
837 | num_hidden: List[int] = (128,),
838 | use_batchnorm: bool = False,
839 | use_layernorm: bool = False,
840 | norm_before_activation: bool = False,
841 | use_dropout: bool = False,
842 | dropout_rate: float = 0.2,
843 | activation: str = "relu",
844 | *args, **kwargs):
845 | super().__init__()
846 |
847 | # set up the model as a Sequential block (less to do in forward())
848 | layers = collections.OrderedDict()
849 |
850 | # flatten inputs as this is all fully connected
851 | layers["flatten"] = nn.Flatten()
852 |
853 | # build up the variable number of hidden layers (fully connected + ReLU + dropout (if set))
854 | for layer_num in range(num_layers):
855 | # for the first layer (layer_num == 0), in_features is determined by given input
856 | # for subsequent layers, the in_features is the previous layer's num_hidden
857 | in_features = seq_encoding_len if layer_num == 0 else num_hidden[layer_num - 1]
858 |
859 | layers["fc{}".format(layer_num)] = FCBlock(in_features=in_features,
860 | num_hidden_nodes=num_hidden[layer_num],
861 | use_batchnorm=use_batchnorm,
862 | use_layernorm=use_layernorm,
863 | norm_before_activation=norm_before_activation,
864 | use_dropout=use_dropout,
865 | dropout_rate=dropout_rate,
866 | activation=activation)
867 |
868 | # finally, the linear output layer
869 | in_features = num_hidden[-1] if num_layers > 0 else seq_encoding_len
870 | layers["output"] = nn.Linear(in_features=in_features, out_features=num_tasks)
871 |
872 | self.model = nn.Sequential(layers)
873 |
874 | def forward(self, x, **kwargs):
875 | output = self.model(x)
876 | return output
877 |
878 |
879 | class LRModel(nn.Module):
880 | """ a simple linear model """
881 |
882 | def __init__(self, num_tasks, seq_encoding_len, *args, **kwargs):
883 | super().__init__()
884 |
885 | self.model = nn.Sequential(
886 | nn.Flatten(),
887 | nn.Linear(seq_encoding_len, out_features=num_tasks))
888 |
889 | def forward(self, x, **kwargs):
890 | output = self.model(x)
891 | return output
892 |
893 |
894 | class TransferModel(nn.Module):
895 | """ transfer learning model """
896 |
897 | @staticmethod
898 | def add_model_specific_args(parent_parser):
899 |
900 | def none_or_int(value: str):
901 | return None if value.lower() == "none" else int(value)
902 |
903 | p = ArgumentParser(parents=[parent_parser], add_help=False)
904 |
905 | # for model set up
906 | p.add_argument('--pretrained_ckpt_path', type=str, default=None)
907 |
908 | # where to cut off the backbone
909 | p.add_argument("--backbone_cutoff", type=none_or_int, default=-1,
910 | help="where to cut off the backbone. can be a negative int, indexing back from "
911 | "pretrained_model.model.model. a value of -1 would chop off the backbone prediction head. "
912 |                             "a value of -2 chops the prediction head and FC layer. a value of -3 chops "
913 | "the above, as well as the global average pooling layer. all depends on architecture.")
914 |
915 | p.add_argument("--pred_layer_input_features", type=int, default=None,
916 | help="if None, number of features will be determined based on backbone_cutoff and standard "
917 | "architecture. otherwise, specify the number of input features for the prediction layer")
918 |
919 | # top net args
920 | p.add_argument("--top_net_type", type=str, default="linear", choices=["linear", "nonlinear", "sklearn"])
921 | p.add_argument("--top_net_hidden_nodes", type=int, default=256)
922 | p.add_argument("--top_net_use_batchnorm", action="store_true")
923 | p.add_argument("--top_net_use_dropout", action="store_true")
924 | p.add_argument("--top_net_dropout_rate", type=float, default=0.1)
925 |
926 | return p
927 |
928 | def __init__(self,
929 | # pretrained model
930 | pretrained_ckpt_path: Optional[str] = None,
931 | pretrained_hparams: Optional[dict] = None,
932 | backbone_cutoff: Optional[int] = -1,
933 | # top net
934 | pred_layer_input_features: Optional[int] = None,
935 | top_net_type: str = "linear",
936 | top_net_hidden_nodes: int = 256,
937 | top_net_use_batchnorm: bool = False,
938 | top_net_use_dropout: bool = False,
939 | top_net_dropout_rate: float = 0.1,
940 | *args, **kwargs):
941 |
942 | super().__init__()
943 |
944 | # error checking: if pretrained_ckpt_path is None, then pretrained_hparams must be specified
945 | if pretrained_ckpt_path is None and pretrained_hparams is None:
946 | raise ValueError("Either pretrained_ckpt_path or pretrained_hparams must be specified")
947 |
948 | # note: pdb_fns is loaded from transfer model arguments rather than original source model hparams
949 | # if pdb_fns is specified as a kwarg, pass it on for structure-based RPE
950 | # otherwise, can just set pdb_fns to None, and structure-based RPE will handle new PDBs on the fly
951 | pdb_fns = kwargs["pdb_fns"] if "pdb_fns" in kwargs else None
952 |
953 | # generate a fresh backbone using pretrained_hparams if specified
954 | # otherwise load the backbone from the pretrained checkpoint
955 | # we prioritize pretrained_hparams over pretrained_ckpt_path because
956 | # pretrained_hparams will only really be specified if we are loading from a DMSTask checkpoint
957 | # meaning the TransferModel has already been fine-tuned on DMS data, and we are likely loading
958 | # weights from that finetuning (including weights for the backbone)
959 | # whereas if pretrained_hparams is not specified but pretrained_ckpt_path is, then we are
960 | # likely finetuning the TransferModel for the first time, and we need the pretrained weights for the
961 | # backbone from the RosettaTask checkpoint
962 | if pretrained_hparams is not None:
963 | # pretrained_hparams will only be specified if we are loading from a DMSTask checkpoint
964 | pretrained_hparams["pdb_fns"] = pdb_fns
965 | pretrained_model = Model[pretrained_hparams["model_name"]].cls(**pretrained_hparams)
966 | self.pretrained_hparams = pretrained_hparams
967 | else:
968 | # not supported in metl-pretrained
969 | raise NotImplementedError("Loading pretrained weights from RosettaTask checkpoint not supported")
970 |
971 | layers = collections.OrderedDict()
972 |
973 | # set the backbone to all layers except the last layer (the pre-trained prediction layer)
974 | if backbone_cutoff is None:
975 | layers["backbone"] = SequentialWithArgs(*list(pretrained_model.model.children()))
976 | else:
977 | layers["backbone"] = SequentialWithArgs(*list(pretrained_model.model.children())[0:backbone_cutoff])
978 |
979 | if top_net_type == "sklearn":
980 |             # sklearn top net doesn't require any more layers, just return the model for the repr layer
981 | self.model = SequentialWithArgs(layers)
982 | return
983 |
984 | # figure out dimensions of input into the prediction layer
985 | if pred_layer_input_features is None:
986 |             # todo: can make this more robust by checking the pretrained_model.hparams for use_final_hidden_layer,
987 | # global_average_pooling, etc. then can determine what the layer will be based on backbone_cutoff.
988 | # currently, assumes that pretrained_model uses global average pooling and a final_hidden_layer
989 | if backbone_cutoff is None:
990 | # no backbone cutoff... use the full network (including tasks) as the backbone
991 | pred_layer_input_features = self.pretrained_hparams["num_tasks"]
992 | elif backbone_cutoff == -1:
993 | pred_layer_input_features = self.pretrained_hparams["final_hidden_size"]
994 | elif backbone_cutoff == -2:
995 | pred_layer_input_features = self.pretrained_hparams["embedding_len"]
996 | elif backbone_cutoff == -3:
997 | pred_layer_input_features = self.pretrained_hparams["embedding_len"] * kwargs["aa_seq_len"]
998 | else:
999 | raise ValueError("can't automatically determine pred_layer_input_features for given backbone_cutoff")
1000 |
1001 | layers["flatten"] = nn.Flatten(start_dim=1)
1002 |
1003 | # create a new prediction layer on top of the backbone
1004 | if top_net_type == "linear":
1005 | # linear layer for prediction
1006 | layers["prediction"] = nn.Linear(in_features=pred_layer_input_features, out_features=1)
1007 | elif top_net_type == "nonlinear":
1008 | # fully connected with hidden layer
1009 | fc_block = FCBlock(in_features=pred_layer_input_features,
1010 | num_hidden_nodes=top_net_hidden_nodes,
1011 | use_batchnorm=top_net_use_batchnorm,
1012 | use_dropout=top_net_use_dropout,
1013 | dropout_rate=top_net_dropout_rate)
1014 |
1015 | pred_layer = nn.Linear(in_features=top_net_hidden_nodes, out_features=1)
1016 |
1017 | layers["prediction"] = SequentialWithArgs(fc_block, pred_layer)
1018 | else:
1019 | raise ValueError("Unexpected type of top net layer: {}".format(top_net_type))
1020 |
1021 | self.model = SequentialWithArgs(layers)
1022 |
1023 | def forward(self, x, **kwargs):
1024 | return self.model(x, **kwargs)
1025 |
1026 |
1027 | def get_activation_fn(activation, functional=True):
1028 | if activation == "relu":
1029 | return F.relu if functional else nn.ReLU()
1030 | elif activation == "gelu":
1031 | return F.gelu if functional else nn.GELU()
1032 |     elif activation == "silu" or activation == "swish":
1033 | return F.silu if functional else nn.SiLU()
1034 | elif activation == "leaky_relu" or activation == "lrelu":
1035 | return F.leaky_relu if functional else nn.LeakyReLU()
1036 | else:
1037 | raise RuntimeError("unknown activation: {}".format(activation))
1038 |
1039 |
1040 | class Model(enum.Enum):
1041 | def __new__(cls, *args, **kwds):
1042 | value = len(cls.__members__) + 1
1043 | obj = object.__new__(cls)
1044 | obj._value_ = value
1045 | return obj
1046 |
1047 | def __init__(self, cls, transfer_model):
1048 | self.cls = cls
1049 | self.transfer_model = transfer_model
1050 |
1051 | linear = LRModel, False
1052 | fully_connected = FCModel, False
1053 | cnn = ConvModel, False
1054 | cnn2 = ConvModel2, False
1055 | transformer_encoder = AttnModel, False
1056 | transfer_model = TransferModel, True
1057 |
1058 |
1059 | def main():
1060 | pass
1061 |
1062 |
1063 | if __name__ == "__main__":
1064 | main()
1065 |
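# a quick instantiation sketch for the models above (illustrative; all sizes below are arbitrary):
#
#   import torch
#   import metl.models as models
#
#   # look up the class via the Model enum, mirroring load_model_and_data_encoder in main.py
#   model = models.Model["transformer_encoder"].cls(num_tasks=1, aa_seq_len=56, num_tokens=21)
#   tokens = torch.randint(0, 21, (4, 56))  # batch of 4 integer-encoded sequences
#   preds = model(tokens)                   # shape (4, 1): one prediction per task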
--------------------------------------------------------------------------------
/metl/relative_attention.py:
--------------------------------------------------------------------------------
1 | """ implementation of transformer encoder with relative attention
2 | references:
3 | - https://medium.com/@_init_/how-self-attention-with-relative-position-representations-works-28173b8c245a
4 | - https://pytorch.org/docs/stable/_modules/torch/nn/modules/transformer.html#TransformerEncoderLayer
5 | - https://github.com/evelinehong/Transformer_Relative_Position_PyTorch/blob/master/relative_position.py
6 | - https://github.com/jiezouguihuafu/ClassicalModelreproduced/blob/main/Transformer/transfor_rpe.py
7 | """
8 |
9 | import copy
10 | from os.path import basename, dirname, join, isfile
11 | from typing import Optional, Union
12 |
13 | import torch
14 | import torch.nn as nn
15 | import torch.nn.functional as F
16 | from torch import Tensor
17 | from torch.nn import Linear, Dropout, LayerNorm
18 | import time
19 | import networkx as nx
20 |
21 | import metl.structure as structure
22 | import metl.models as models
23 |
24 |
25 | class RelativePosition3D(nn.Module):
26 | """ Contact map-based relative position embeddings """
27 |
28 | # need to compute a bucket_mtx for each structure
29 | # need to know which bucket_mtx to use when grabbing the embeddings in forward()
30 | # - on init, get a list of all PDB files we will be using
31 | # - use a dictionary to store PDB files --> bucket_mtxs
32 | # - forward() gets a new arg: the pdb file, which indexes into the dictionary to grab the right bucket_mtx
33 | def __init__(self,
34 | embedding_len: int,
35 | contact_threshold: int,
36 | clipping_threshold: int,
37 | pdb_fns: Optional[Union[str, list, tuple]] = None,
38 | default_pdb_dir: str = "data/pdb_files"):
39 |
40 | # preferably, pdb_fns contains full paths to the PDBs, but if just the PDB filename is given
41 | # then it defaults to the path data/pdb_files/
42 | super().__init__()
43 | self.embedding_len = embedding_len
44 | self.clipping_threshold = clipping_threshold
45 | self.contact_threshold = contact_threshold
46 | self.default_pdb_dir = default_pdb_dir
47 |
48 | # dummy buffer for getting correct device for on-the-fly bucket matrix generation
49 | self.register_buffer("dummy_buffer", torch.empty(0), persistent=False)
50 |
51 | # for 3D-based positions, the number of embeddings is generally the number of buckets
52 | # for contact map-based distances, that is clipping_threshold + 1
53 | num_embeddings = clipping_threshold + 1
54 |
55 | # this is the embedding lookup table E_r
56 | self.embeddings_table = nn.Embedding(num_embeddings, embedding_len)
57 |
58 | # set up pdb_fns that were passed in on init (can also be set up during runtime in forward())
59 | # todo: i'm using a hacky workaround to move the bucket_mtxs to the correct device
60 | # i tried to make it more efficient by registering bucket matrices as buffers, but i was
61 | # having problems with DDP syncing the buffers across processes
62 | self.bucket_mtxs = {}
63 | self.bucket_mtxs_device = self.dummy_buffer.device
64 | self._init_pdbs(pdb_fns)
65 |
66 | def forward(self, pdb_fn):
67 | # compute matrix R by grabbing the embeddings from the embeddings lookup table
68 | embeddings = self.embeddings_table(self._get_bucket_mtx(pdb_fn))
69 | return embeddings
70 |
71 | # def _get_bucket_mtx(self, pdb_fn):
72 | # """ retrieve a bucket matrix given the pdb_fn.
73 | # if the pdb_fn was provided at init or has already been computed, then the bucket matrix will be
74 | # retrieved from the object buffer. if the bucket matrix has not been computed yet, it will be here """
75 | # pdb_attr = self._pdb_key(pdb_fn)
76 | # if hasattr(self, pdb_attr):
77 | # return getattr(self, pdb_attr)
78 | # else:
79 | # # encountering a new PDB at runtime... process it
80 | # # todo: if there's a new PDB at runtime, it will be initialized separately in each instance
81 | # # of RelativePosition3D, for each layer. It would be more efficient to have a global
82 | # # bucket_mtx registry... perhaps in the RelativeTransformerEncoder class, that can be passed through
83 | # self._init_pdb(pdb_fn)
84 | # return getattr(self, pdb_attr)
85 |
86 | def _move_bucket_mtxs(self, device):
87 | for k, v in self.bucket_mtxs.items():
88 | self.bucket_mtxs[k] = v.to(device)
89 | self.bucket_mtxs_device = device
90 |
91 | def _get_bucket_mtx(self, pdb_fn):
92 | """ retrieve a bucket matrix given the pdb_fn.
93 | if the pdb_fn was provided at init or has already been computed, then the bucket matrix will be
94 | retrieved from the bucket_mtxs dictionary. else, it will be computed now on-the-fly """
95 |
96 | # ensure that all the bucket matrices are on the same device as the nn.Embedding
97 | if self.bucket_mtxs_device != self.dummy_buffer.device:
98 | self._move_bucket_mtxs(self.dummy_buffer.device)
99 |
100 | pdb_attr = self._pdb_key(pdb_fn)
101 | if pdb_attr in self.bucket_mtxs:
102 | return self.bucket_mtxs[pdb_attr]
103 | else:
104 | # encountering a new PDB at runtime... process it
105 | # todo: if there's a new PDB at runtime, it will be initialized separately in each instance
106 | # of RelativePosition3D, for each layer. It would be more efficient to have a global
107 | # bucket_mtx registry... perhaps in the RelativeTransformerEncoder class, that can be passed through
108 | self._init_pdb(pdb_fn)
109 | return self.bucket_mtxs[pdb_attr]
110 |
111 | # def _set_bucket_mtx(self, pdb_fn, bucket_mtx):
112 | # """ store a bucket matrix as a buffer """
113 | # # if PyTorch ever implements a BufferDict, we could use it here efficiently
114 | # # there is also BufferDict from https://botorch.org/api/_modules/botorch/utils/torch.html
115 | # # would just need to modify it to have an option for persistent=False
116 | # bucket_mtx = bucket_mtx.to(self.dummy_buffer.device)
117 | #
118 | # self.register_buffer(self._pdb_key(pdb_fn), bucket_mtx, persistent=False)
119 |
120 | def _set_bucket_mtx(self, pdb_fn, bucket_mtx):
121 | """ store a bucket matrix in the bucket dict """
122 |
123 | # move the bucket_mtx to the same device that the other bucket matrices are on
124 | bucket_mtx = bucket_mtx.to(self.bucket_mtxs_device)
125 |
126 | self.bucket_mtxs[self._pdb_key(pdb_fn)] = bucket_mtx
127 |
128 | @staticmethod
129 | def _pdb_key(pdb_fn):
130 | """ return a unique key for the given pdb_fn, used to map unique PDBs """
131 | # note this key does NOT currently support PDBs with the same basename but different paths
132 |         # assumes every PDB filename is in the format <name>.pdb
133 |         # should be compatible with being a class attribute, as it is used as a pytorch buffer name
134 | return f"pdb_{basename(pdb_fn).split('.')[0]}"
135 |
136 | def _init_pdbs(self, pdb_fns):
137 | start = time.time()
138 |
139 | if pdb_fns is None:
140 | # nothing to initialize if pdb_fns is None
141 | return
142 |
143 | # make sure pdb_fns is a list
144 | if not isinstance(pdb_fns, list) and not isinstance(pdb_fns, tuple):
145 | pdb_fns = [pdb_fns]
146 |
147 | # init each pdb fn in the list
148 | for pdb_fn in pdb_fns:
149 | self._init_pdb(pdb_fn)
150 |
151 |         print("Initialized PDB bucket matrices in: {:.3f} seconds".format(time.time() - start))
152 |
153 | def _init_pdb(self, pdb_fn):
154 | """ process a pdb file for use with structure-based relative attention """
155 | # if pdb_fn is not a full path, default to the path data/pdb_files/
156 | if dirname(pdb_fn) == "":
157 | # handle the case where the pdb file is in the current working directory
158 | # if there is a PDB file in the cwd.... then just use it as is. otherwise, append the default.
159 | if not isfile(pdb_fn):
160 | pdb_fn = join(self.default_pdb_dir, pdb_fn)
161 |
162 | # create a structure graph from the pdb_fn and contact threshold
163 | cbeta_mtx = structure.cbeta_distance_matrix(pdb_fn)
164 | structure_graph = structure.dist_thresh_graph(cbeta_mtx, self.contact_threshold)
165 |
166 | # bucket_mtx indexes into the embedding lookup table to create the final distance matrix
167 | bucket_mtx = self._compute_bucket_mtx(structure_graph)
168 |
169 | self._set_bucket_mtx(pdb_fn, bucket_mtx)
170 |
171 | def _compute_bucketed_neighbors(self, structure_graph, source_node):
172 |         """ gets the bucketed neighbors from the given source node and structure graph """
173 | if self.clipping_threshold < 0:
174 | raise ValueError("Clipping threshold must be >= 0")
175 |
176 | sspl = _inv_dict(nx.single_source_shortest_path_length(structure_graph, source_node))
177 |
178 | if self.clipping_threshold is not None:
179 | num_buckets = 1 + self.clipping_threshold
180 | sspl = _combine_d(sspl, self.clipping_threshold, num_buckets - 1)
181 |
182 | return sspl
183 |
184 | def _compute_bucket_mtx(self, structure_graph):
185 | """ get the bucket_mtx for the given structure_graph
186 | calls _get_bucketed_neighbors for every node in the structure_graph """
187 | num_residues = len(list(structure_graph))
188 |
189 | # index into the embedding lookup table to create the final distance matrix
190 | bucket_mtx = torch.zeros(num_residues, num_residues, dtype=torch.long)
191 |
192 | for node_num in sorted(list(structure_graph)):
193 | bucketed_neighbors = self._compute_bucketed_neighbors(structure_graph, node_num)
194 |
195 | for bucket_num, neighbors in bucketed_neighbors.items():
196 | bucket_mtx[node_num, neighbors] = bucket_num
197 |
198 | return bucket_mtx
199 |
200 |
201 | class RelativePosition(nn.Module):
202 | """ creates the embedding lookup table E_r and computes R
203 |         note this is a plain nn.Module; it registers a dummy buffer ("dummy_buffer") so the
204 |         correct device for the range vectors can be determined in forward(), avoiding the
205 |         need to inherit from pl.LightningModule just for self.device """
206 |
207 | def __init__(self, embedding_len: int, clipping_threshold: int):
208 | """
209 | embedding_len: the length of the embedding, may be d_model, or d_model // num_heads for multihead
210 | clipping_threshold: the maximum relative position, referred to as k by Shaw et al.
211 | """
212 | super().__init__()
213 | self.embedding_len = embedding_len
214 | self.clipping_threshold = clipping_threshold
215 | # for sequence-based distances, the number of embeddings is 2*k+1, where k is the clipping threshold
216 | num_embeddings = 2 * clipping_threshold + 1
217 |
218 | # this is the embedding lookup table E_r
219 | self.embeddings_table = nn.Embedding(num_embeddings, embedding_len)
220 |
221 | # for getting the correct device for range vectors in forward
222 | self.register_buffer("dummy_buffer", torch.empty(0), persistent=False)
223 |
224 | def forward(self, length_q, length_k):
225 | # supports different length sequences, but in self-attention length_q and length_k are the same
226 | range_vec_q = torch.arange(length_q, device=self.dummy_buffer.device)
227 | range_vec_k = torch.arange(length_k, device=self.dummy_buffer.device)
228 |
229 | # this sets up the standard sequence-based distance matrix for relative positions
230 | # the current position is 0, positions to the right are +1, +2, etc, and to the left -1, -2, etc
231 | distance_mat = range_vec_k[None, :] - range_vec_q[:, None]
232 | distance_mat_clipped = torch.clamp(distance_mat, -self.clipping_threshold, self.clipping_threshold)
233 |
234 | # convert to indices, indexing into the embedding table
235 | final_mat = (distance_mat_clipped + self.clipping_threshold).long()
236 |
237 | # compute matrix R by grabbing the embeddings from the embedding lookup table
238 | embeddings = self.embeddings_table(final_mat)
239 |
240 | return embeddings
241 |
242 |
243 | class RelativeMultiHeadAttention(nn.Module):
244 | def __init__(self, embed_dim, num_heads, dropout, pos_encoding, clipping_threshold, contact_threshold, pdb_fns):
245 | """
246 | Multi-head attention with relative position embeddings. Input data should be in batch_first format.
247 | :param embed_dim: aka d_model, aka hid_dim
248 | :param num_heads: number of heads
249 | :param dropout: how much dropout for scaled dot product attention
250 |
251 |         :param pos_encoding: what type of positional encoding to use, relative or relative_3D
252 | :param clipping_threshold: clipping threshold for relative position embedding
253 | :param contact_threshold: for relative_3D, the threshold in angstroms for the contact map
254 | :param pdb_fns: pdb file(s) to set up the relative position object
255 |
256 | """
257 | super().__init__()
258 |
259 | assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"
260 |
261 | # model dimensions
262 | self.embed_dim = embed_dim
263 | self.num_heads = num_heads
264 | self.head_dim = embed_dim // num_heads
265 |
266 | # pos encoding stuff
267 | self.pos_encoding = pos_encoding
268 | self.clipping_threshold = clipping_threshold
269 | self.contact_threshold = contact_threshold
270 | if pdb_fns is not None and not isinstance(pdb_fns, list):
271 | pdb_fns = [pdb_fns]
272 | self.pdb_fns = pdb_fns
273 |
274 | # relative position embeddings for use with keys and values
275 | # Shaw et al. uses relative position information for both keys and values
276 | # Huang et al. only uses it for the keys, which is probably enough
277 | if pos_encoding == "relative":
278 | self.relative_position_k = RelativePosition(self.head_dim, self.clipping_threshold)
279 | self.relative_position_v = RelativePosition(self.head_dim, self.clipping_threshold)
280 | elif pos_encoding == "relative_3D":
281 | self.relative_position_k = RelativePosition3D(self.head_dim, self.contact_threshold,
282 | self.clipping_threshold, self.pdb_fns)
283 | self.relative_position_v = RelativePosition3D(self.head_dim, self.contact_threshold,
284 | self.clipping_threshold, self.pdb_fns)
285 | else:
286 | raise ValueError("unrecognized pos_encoding: {}".format(pos_encoding))
287 |
288 | # WQ, WK, and WV from attention is all you need
289 | # note these default to bias=True, same as PyTorch implementation
290 | self.q_proj = nn.Linear(embed_dim, embed_dim)
291 | self.k_proj = nn.Linear(embed_dim, embed_dim)
292 | self.v_proj = nn.Linear(embed_dim, embed_dim)
293 |
294 | # WO from attention is all you need
295 | # used for the final projection when computing multi-head attention
296 | # PyTorch uses NonDynamicallyQuantizableLinear instead of Linear to avoid triggering an obscure
297 | # error quantizing the model https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/linear.py#L122
298 | # todo: if quantizing the model, explore if the above is a concern for us
299 | self.out_proj = nn.Linear(embed_dim, embed_dim)
300 |
301 | # dropout for scaled dot product attention
302 | self.dropout = nn.Dropout(dropout)
303 |
304 | # scaling factor for scaled dot product attention
305 | scale = torch.sqrt(torch.FloatTensor([self.head_dim]))
306 | # persistent=False if you don't want to save it inside state_dict
307 | self.register_buffer('scale', scale)
308 |
309 | # toggles meant to be set directly by user
310 | self.need_weights = False
311 | self.average_attn_weights = True
312 |
313 | def _compute_attn_weights(self, query, key, len_q, len_k, batch_size, mask, pdb_fn):
314 |         """ computes the attention weights (a "compatibility function" of queries with corresponding keys) """
315 |
316 | # calculate the first term in the numerator attn1, which is Q*K
317 | # todo: pytorch reshapes q,k and v to 3 dimensions (similar to how r_q2 is below)
318 | # is that functionally equivalent to what we're doing? is their way faster?
319 | # r_q1 = [batch_size, num_heads, len_q, head_dim]
320 | r_q1 = query.view(batch_size, len_q, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
321 | # todo: we could directly permute r_k1 to [batch_size, num_heads, head_dim, len_k]
322 | # to make it compatible for matrix multiplication with r_q1, instead of 2-step approach
323 | # r_k1 = [batch_size, num_heads, len_k, head_dim]
324 | r_k1 = key.view(batch_size, len_k, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
325 | # attn1 = [batch_size, num_heads, len_q, len_k]
326 | attn1 = torch.matmul(r_q1, r_k1.permute(0, 1, 3, 2))
327 |
328 | # calculate the second term in the numerator attn2, which is Q*R
329 | # r_q2 = [query_len, batch_size * num_heads, head_dim]
330 | r_q2 = query.permute(1, 0, 2).contiguous().view(len_q, batch_size * self.num_heads, self.head_dim)
331 |
332 | # todo: support multiple different PDB base structures per batch
333 | # one option:
334 | # - require batches to be all the same protein
335 | # - add argument to forward() to accept the PDB file for the protein in the batch
336 | # - then we just pass in the PDB file to relative position's forward()
337 | # to support multiple different structures per batch:
338 | # - add argument to forward() to accept PDB files, one for each item in batch
339 |         #   - make corresponding changes in the relative_position object to return R for each structure
340 |         #   - note: if there are a lot of different structures, and the sequence lengths are long,
341 |         #     this could be memory prohibitive because R (rel_pos_k) can take up a lot of mem for long seqs
342 |         #   - adjust the attn2 calculation to factor in the multiple different R matrices.
343 |         #     the way to do this might have to be to do multiple matmuls, one for each structure.
344 |         #     basically, would split up r_q2 into several matrices grouped by structure, and then
345 |         #     multiply with corresponding R, then combine back into the exact same order of the original r_q2
346 |         #     note: this may be computationally intensive (splitting, more matrix multiplies, joining)
347 |         #     another option would be to create views(?), repeating the different Rs so we can do
348 |         #     a matrix multiply directly with r_q2
349 | # - would shapes be affected if there was padding in the queries, keys, values?
350 |
351 | if self.pos_encoding == "relative":
352 | # rel_pos_k = [len_q, len_k, head_dim]
353 | rel_pos_k = self.relative_position_k(len_q, len_k)
354 | elif self.pos_encoding == "relative_3D":
355 |             # rel_pos_k = [seq_len, seq_len, head_dim], where seq_len comes from the PDB structure
356 | rel_pos_k = self.relative_position_k(pdb_fn)
357 | else:
358 | raise ValueError("unrecognized pos_encoding: {}".format(self.pos_encoding))
359 |
360 | # the matmul basically computes the dot product between each input position’s query vector and
361 | # its corresponding relative position embeddings across all input sequences in the heads and batch
362 | # attn2 = [batch_size * num_heads, len_q, len_k]
363 | attn2 = torch.matmul(r_q2, rel_pos_k.transpose(1, 2)).transpose(0, 1)
364 | # attn2 = [batch_size, num_heads, len_q, len_k]
365 | attn2 = attn2.contiguous().view(batch_size, self.num_heads, len_q, len_k)
366 |
367 | # calculate attention weights
368 | attn_weights = (attn1 + attn2) / self.scale
369 |
370 | # apply mask if given
371 | if mask is not None:
372 | # todo: pytorch uses float("-inf") instead of -1e10
373 | attn_weights = attn_weights.masked_fill(mask == 0, -1e10)
374 |
375 |         # softmax gives us the final attention weights
376 | attn_weights = torch.softmax(attn_weights, dim=-1)
377 | # attn_weights = [batch_size, num_heads, len_q, len_k]
378 | attn_weights = self.dropout(attn_weights)
379 |
380 | return attn_weights
381 |
382 | def _compute_avg_val(self, value, len_q, len_k, len_v, attn_weights, batch_size, pdb_fn):
383 | # todo: add option to not factor in relative position embeddings in value calculation
384 | # calculate the first term, the attn*values
385 | # r_v1 = [batch_size, num_heads, len_v, head_dim]
386 | r_v1 = value.view(batch_size, len_v, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
387 | # avg1 = [batch_size, num_heads, len_q, head_dim]
388 | avg1 = torch.matmul(attn_weights, r_v1)
389 |
390 | # calculate the second term, the attn*R
391 | # similar to how relative embeddings are factored in the attention weights calculation
392 | if self.pos_encoding == "relative":
393 | # rel_pos_v = [query_len, value_len, head_dim]
394 | rel_pos_v = self.relative_position_v(len_q, len_v)
395 | elif self.pos_encoding == "relative_3D":
396 |             # rel_pos_v = [seq_len, seq_len, head_dim], where seq_len comes from the PDB structure
397 | rel_pos_v = self.relative_position_v(pdb_fn)
398 | else:
399 | raise ValueError("unrecognized pos_encoding: {}".format(self.pos_encoding))
400 |
401 |         # r_attn_weights = [len_q, batch_size * num_heads, len_k] (note len_k == len_v)
402 | r_attn_weights = attn_weights.permute(2, 0, 1, 3).contiguous().view(len_q, batch_size * self.num_heads, len_k)
403 | avg2 = torch.matmul(r_attn_weights, rel_pos_v)
404 | # avg2 = [batch_size, num_heads, len_q, head_dim]
405 | avg2 = avg2.transpose(0, 1).contiguous().view(batch_size, self.num_heads, len_q, self.head_dim)
406 |
407 | # calculate avg value
408 | x = avg1 + avg2 # [batch_size, num_heads, len_q, head_dim]
409 | x = x.permute(0, 2, 1, 3).contiguous() # [batch_size, len_q, num_heads, head_dim]
410 | # x = [batch_size, len_q, embed_dim]
411 | x = x.view(batch_size, len_q, self.embed_dim)
412 |
413 | return x
414 |
415 | def forward(self, query, key, value, pdb_fn=None, mask=None):
416 | # query = [batch_size, q_len, embed_dim]
417 | # key = [batch_size, k_len, embed_dim]
418 |         # value = [batch_size, v_len, embed_dim]
419 | batch_size = query.shape[0]
420 | len_k, len_q, len_v = (key.shape[1], query.shape[1], value.shape[1])
421 |
422 | # in projection (multiply inputs by WQ, WK, WV)
423 | query = self.q_proj(query)
424 | key = self.k_proj(key)
425 | value = self.v_proj(value)
426 |
427 | # first compute the attention weights, then multiply with values
428 | # attn = [batch size, num_heads, len_q, len_k]
429 | attn_weights = self._compute_attn_weights(query, key, len_q, len_k, batch_size, mask, pdb_fn)
430 |
431 | # take weighted average of values (weighted by attention weights)
432 | attn_output = self._compute_avg_val(value, len_q, len_k, len_v, attn_weights, batch_size, pdb_fn)
433 |
434 | # output projection
435 | # attn_output = [batch_size, len_q, embed_dim]
436 | attn_output = self.out_proj(attn_output)
437 |
438 | if self.need_weights:
439 | # return attention weights in addition to attention
440 | # average the weights over the heads (to get overall attention)
441 | # attn_weights = [batch_size, len_q, len_k]
442 | if self.average_attn_weights:
443 | attn_weights = attn_weights.sum(dim=1) / self.num_heads
444 | return {"attn_output": attn_output, "attn_weights": attn_weights}
445 | else:
446 | return attn_output
447 |
448 |
449 | class RelativeTransformerEncoderLayer(nn.Module):
450 | """
451 | d_model: the number of expected features in the input (required).
452 | nhead: the number of heads in the MultiHeadAttention models (required).
453 | clipping_threshold: the clipping threshold for relative position embeddings
454 | dim_feedforward: the dimension of the feedforward network model (default=2048).
455 | dropout: the dropout value (default=0.1).
456 | activation: the activation function of the intermediate layer, can be a string
457 | ("relu" or "gelu") or a unary callable. Default: relu
458 | layer_norm_eps: the eps value in layer normalization components (default=1e-5).
459 | norm_first: if ``True``, layer norm is done prior to attention and feedforward
460 | operations, respectively. Otherwise, it's done after. Default: ``False`` (after).
461 | """
462 |
463 |     # __constants__ is a TorchScript (torch.jit) hint marking these attributes as constants that won't change
464 | __constants__ = ['batch_first', 'norm_first']
465 |
466 | def __init__(self,
467 | d_model,
468 | nhead,
469 | pos_encoding="relative",
470 | clipping_threshold=3,
471 | contact_threshold=7,
472 | pdb_fns=None,
473 | dim_feedforward=2048,
474 | dropout=0.1,
475 | activation=F.relu,
476 | layer_norm_eps=1e-5,
477 | norm_first=False) -> None:
478 |
479 | self.batch_first = True
480 |
481 | super(RelativeTransformerEncoderLayer, self).__init__()
482 |
483 | self.self_attn = RelativeMultiHeadAttention(d_model, nhead, dropout,
484 | pos_encoding, clipping_threshold, contact_threshold, pdb_fns)
485 |
486 | # feed forward model
487 | self.linear1 = Linear(d_model, dim_feedforward)
488 | self.dropout = Dropout(dropout)
489 | self.linear2 = Linear(dim_feedforward, d_model)
490 |
491 | self.norm_first = norm_first
492 | self.norm1 = LayerNorm(d_model, eps=layer_norm_eps)
493 | self.norm2 = LayerNorm(d_model, eps=layer_norm_eps)
494 | self.dropout1 = Dropout(dropout)
495 | self.dropout2 = Dropout(dropout)
496 |
497 | # Legacy string support for activation function.
498 | if isinstance(activation, str):
499 | self.activation = models.get_activation_fn(activation)
500 | else:
501 | self.activation = activation
502 |
503 | def forward(self, src: Tensor, pdb_fn=None) -> Tensor:
504 | x = src
505 | if self.norm_first:
506 | x = x + self._sa_block(self.norm1(x), pdb_fn=pdb_fn)
507 | x = x + self._ff_block(self.norm2(x))
508 | else:
509 |             x = self.norm1(x + self._sa_block(x, pdb_fn=pdb_fn))
510 | x = self.norm2(x + self._ff_block(x))
511 |
512 | return x
513 |
514 | # self-attention block
515 | def _sa_block(self, x: Tensor, pdb_fn=None) -> Tensor:
516 | x = self.self_attn(x, x, x, pdb_fn=pdb_fn)
517 | if isinstance(x, dict):
518 | # handle the case where we are returning attention weights
519 | x = x["attn_output"]
520 | return self.dropout1(x)
521 |
522 | # feed forward block
523 | def _ff_block(self, x: Tensor) -> Tensor:
524 | x = self.linear2(self.dropout(self.activation(self.linear1(x))))
525 | return self.dropout2(x)
526 |
527 |
528 | class RelativeTransformerEncoder(nn.Module):
529 | def __init__(self, encoder_layer, num_layers, norm=None, reset_params=True):
530 | super(RelativeTransformerEncoder, self).__init__()
531 | # using get_clones means all layers have the same initialization
532 | # this is also a problem in PyTorch's TransformerEncoder implementation, which this is based on
533 | # todo: PyTorch is changing its transformer API... check up on and see if there is a better way
534 | self.layers = _get_clones(encoder_layer, num_layers)
535 | self.num_layers = num_layers
536 | self.norm = norm
537 |
538 | # important because get_clones means all layers have same initialization
539 | # should recursively reset parameters for all submodules
540 | if reset_params:
541 | self.apply(models.reset_parameters_helper)
542 |
543 | def forward(self, src: Tensor, pdb_fn=None) -> Tensor:
544 | output = src
545 |
546 | for mod in self.layers:
547 | output = mod(output, pdb_fn=pdb_fn)
548 |
549 | if self.norm is not None:
550 | output = self.norm(output)
551 |
552 | return output
553 |
554 |
555 | def _get_clones(module, num_clones):
556 | return nn.ModuleList([copy.deepcopy(module) for _ in range(num_clones)])
557 |
558 |
559 | def _inv_dict(d):
560 | """ helper function for contact map-based position embeddings """
561 | inv = dict()
562 | for k, v in d.items():
563 | # collect dict keys into lists based on value
564 | inv.setdefault(v, list()).append(k)
565 | for k, v in inv.items():
566 | # put in sorted order
567 | inv[k] = sorted(v)
568 | return inv
569 |
570 |
571 | def _combine_d(d, threshold, combined_key):
572 | """ helper function for contact map-based position embeddings
573 | d is a dictionary with ints as keys and lists as values.
574 | for all keys >= threshold, this function combines the values of those keys into a single list """
575 | out_d = {}
576 | for k, v in d.items():
577 | if k < threshold:
578 | out_d[k] = v
579 | elif k >= threshold:
580 | if combined_key not in out_d:
581 | out_d[combined_key] = v
582 | else:
583 | out_d[combined_key] += v
584 | if combined_key in out_d:
585 | out_d[combined_key] = sorted(out_d[combined_key])
586 | return out_d
587 |
--------------------------------------------------------------------------------
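
A note on the two pieces above: in _compute_attn_weights, attn1 + attn2 scaled by
sqrt(head_dim) implements Shaw et al.'s relative attention, i.e.
e_ij = (q_i . k_j + q_i . r_ij) / sqrt(d_head). For the 3D variant, r_ij is looked up
via bucket_mtx: _inv_dict groups residues by shortest-path distance from a source node,
and _combine_d merges every distance >= clipping_threshold into one final bucket. A
minimal runnable sketch of that bucketing, using an illustrative 4-residue path graph
and a clipping threshold of 1 (neither value is from the repo):

    import networkx as nx

    g = nx.path_graph(4)  # residues 0-1-2-3 connected in a chain
    sspl = nx.single_source_shortest_path_length(g, 0)  # {0: 0, 1: 1, 2: 2, 3: 3}

    # invert: hop distance -> list of residues at that distance (what _inv_dict does)
    inv = {}
    for node, dist in sspl.items():
        inv.setdefault(dist, []).append(node)

    # clip: merge all buckets >= threshold into a single bucket (what _combine_d does)
    threshold = 1
    clipped = {}
    for dist, nodes in inv.items():
        clipped.setdefault(min(dist, threshold), []).extend(nodes)

    print(clipped)  # {0: [0], 1: [1, 2, 3]} -> row 0 of bucket_mtx would be [0, 1, 1, 1]
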
/metl/structure.py:
--------------------------------------------------------------------------------
1 | import os
2 | from os.path import isfile
3 | from enum import Enum, auto
4 |
5 | import numpy as np
6 | from scipy.spatial.distance import cdist
7 | import networkx as nx
8 | from biopandas.pdb import PandasPdb
9 |
10 |
11 | class GraphType(Enum):
12 | LINEAR = auto()
13 | COMPLETE = auto()
14 | DISCONNECTED = auto()
15 | DIST_THRESH = auto()
16 | DIST_THRESH_SHUFFLED = auto()
17 |
18 |
19 | def save_graph(g, fn):
20 | """ Saves graph to file """
21 | nx.write_gexf(g, fn)
22 |
23 |
24 | def load_graph(fn):
25 | """ Loads graph from file """
26 | g = nx.read_gexf(fn, node_type=int)
27 | return g
28 |
29 |
30 | def shuffle_nodes(g, seed=7):
31 | """ Shuffles the nodes of the given graph and returns a copy of the shuffled graph """
32 | # get the list of nodes in this graph
33 | nodes = g.nodes()
34 |
35 | # create a permuted list of nodes
36 | np.random.seed(seed)
37 | nodes_shuffled = np.random.permutation(nodes)
38 |
39 | # create a dictionary mapping from old node label to new node label
40 | mapping = {n: ns for n, ns in zip(nodes, nodes_shuffled)}
41 |
42 | g_shuffled = nx.relabel_nodes(g, mapping, copy=True)
43 |
44 | return g_shuffled
45 |
46 |
47 | def linear_graph(num_residues):
48 | """ Creates a linear graph where each node is connected to its sequence neighbor in order """
49 | g = nx.Graph()
50 | g.add_nodes_from(np.arange(0, num_residues))
51 | for i in range(num_residues-1):
52 | g.add_edge(i, i+1)
53 | return g
54 |
55 |
56 | def complete_graph(num_residues):
57 | """ Creates a graph where each node is connected to all other nodes"""
58 | g = nx.complete_graph(num_residues)
59 | return g
60 |
61 |
62 | def disconnected_graph(num_residues):
63 | g = nx.Graph()
64 | g.add_nodes_from(np.arange(0, num_residues))
65 | return g
66 |
67 |
68 | def dist_thresh_graph(dist_mtx, threshold):
69 | """ Creates undirected graph based on a distance threshold """
70 | g = nx.Graph()
71 | g.add_nodes_from(np.arange(0, dist_mtx.shape[0]))
72 |
73 | # loop through each residue
74 | for rn1 in range(len(dist_mtx)):
75 | # find all residues that are within threshold distance of current
76 | rns_within_threshold = np.where(dist_mtx[rn1] < threshold)[0]
77 |
78 | # add edges from current residue to those that are within threshold
79 | for rn2 in rns_within_threshold:
80 | # don't add self edges
81 | if rn1 != rn2:
82 | g.add_edge(rn1, rn2)
83 | return g
84 |
85 |
86 | def ordered_adjacency_matrix(g):
87 | """ returns the adjacency matrix ordered by node label in increasing order as a numpy array """
88 | node_order = sorted(g.nodes())
89 |     adj_mtx = nx.to_numpy_array(g, nodelist=node_order)  # to_numpy_matrix was removed in networkx 3.0
90 | return np.asarray(adj_mtx).astype(np.float32)
91 |
92 |
93 | def cbeta_distance_matrix(pdb_fn, start=0, end=None):
94 |     # note: start and end do not refer to pdb residue numbers
95 |     # they index residues by their order of appearance in the pdb file
96 |
97 | # read the pdb file into a biopandas object
98 | ppdb = PandasPdb().read_pdb(pdb_fn)
99 |
100 | # group by residue number
101 | # important to specify sort=True so that group keys (residue number) are in order
102 | # the reason is we loop through group keys below, and assume that residues are in order
103 | # the pandas function has sort=True by default, but we specify it anyway because it is important
104 | grouped = ppdb.df["ATOM"].groupby("residue_number", sort=True)
105 |
106 | # a list of coords for the cbeta or calpha of each residue
107 | coords = []
108 |
109 | # loop through each residue and find the coordinates of cbeta
110 | for i, (residue_number, values) in enumerate(grouped):
111 |
112 | # skip residues not in the range
113 | end_index = (len(grouped) if end is None else end)
114 | if i not in range(start, end_index):
115 | continue
116 |
117 | residue_group = grouped.get_group(residue_number)
118 |
119 | atom_names = residue_group["atom_name"]
120 | if "CB" in atom_names.values:
121 | # print("Using CB...")
122 | atom_name = "CB"
123 | elif "CA" in atom_names.values:
124 | # print("Using CA...")
125 | atom_name = "CA"
126 | else:
127 | raise ValueError("Couldn't find CB or CA for residue {}".format(residue_number))
128 |
129 | # get the coordinates of cbeta (or calpha)
130 | coords.append(
131 | residue_group[residue_group["atom_name"] == atom_name][["x_coord", "y_coord", "z_coord"]].values[0])
132 |
133 | # stack the coords into a numpy array where each row has the x,y,z coords for a different residue
134 | coords = np.stack(coords)
135 |
136 | # compute pairwise euclidean distance between all cbetas
137 | dist_mtx = cdist(coords, coords, metric="euclidean")
138 |
139 | return dist_mtx
140 |
141 |
142 | def get_neighbors(g, nodes):
143 | """ returns a list (set) of neighbors of all given nodes """
144 | neighbors = set()
145 | for n in nodes:
146 | neighbors.update(g.neighbors(n))
147 | return sorted(list(neighbors))
148 |
149 |
150 | def gen_graph(graph_type, res_dist_mtx, dist_thresh=7, shuffle_seed=7, graph_save_dir=None, save=False):
151 | """ generate the specified structure graph using the specified residue distance matrix """
152 | if graph_type is GraphType.LINEAR:
153 | g = linear_graph(len(res_dist_mtx))
154 | save_fn = None if not save else os.path.join(graph_save_dir, "linear.graph")
155 |
156 | elif graph_type is GraphType.COMPLETE:
157 | g = complete_graph(len(res_dist_mtx))
158 | save_fn = None if not save else os.path.join(graph_save_dir, "complete.graph")
159 |
160 | elif graph_type is GraphType.DISCONNECTED:
161 | g = disconnected_graph(len(res_dist_mtx))
162 | save_fn = None if not save else os.path.join(graph_save_dir, "disconnected.graph")
163 |
164 | elif graph_type is GraphType.DIST_THRESH:
165 | g = dist_thresh_graph(res_dist_mtx, dist_thresh)
166 | save_fn = None if not save else os.path.join(graph_save_dir, "dist_thresh_{}.graph".format(dist_thresh))
167 |
168 | elif graph_type is GraphType.DIST_THRESH_SHUFFLED:
169 | g = dist_thresh_graph(res_dist_mtx, dist_thresh)
170 | g = shuffle_nodes(g, seed=shuffle_seed)
171 | save_fn = None if not save else \
172 | os.path.join(graph_save_dir, "dist_thresh_{}_shuffled_r{}.graph".format(dist_thresh, shuffle_seed))
173 |
174 | else:
175 | raise ValueError("Graph type {} is not implemented".format(graph_type))
176 |
177 | if save:
178 | if isfile(save_fn):
179 | print("err: graph already exists: {}. to overwrite, delete the existing file first".format(save_fn))
180 | else:
181 | os.makedirs(graph_save_dir, exist_ok=True)
182 | save_graph(g, save_fn)
183 |
184 | return g
185 |
--------------------------------------------------------------------------------
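
The functions above chain into a short pipeline: read a PDB file, compute the pairwise
C-beta (falling back to C-alpha) distance matrix, then threshold it into an undirected
contact graph. A minimal sketch, assuming the package and its requirements are
installed and the script is run from the repo root:

    from metl import structure

    # pairwise residue distances in angstroms, from one of the bundled PDB files
    dist_mtx = structure.cbeta_distance_matrix("pdbs/2qmt_p.pdb")

    # residue pairs closer than 7 angstroms become edges
    g = structure.dist_thresh_graph(dist_mtx, 7)
    print(g.number_of_nodes(), g.number_of_edges())
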
/metl/test.py:
--------------------------------------------------------------------------------
1 | import metl
2 | import torch
3 |
4 |
5 | def main():
6 | model, data_encoder = metl.get_from_ident("metl-g-20m-1d")
7 |
8 | # make sure all the sequences are the same length
9 | amino_acid_sequences = ["SMART", "MAGIC"]
10 | encoded_seqs = data_encoder.encode_sequences(amino_acid_sequences)
11 |
12 | # set model to eval mode
13 | model.eval()
14 | # no need to compute gradients for inference
15 | with torch.no_grad():
16 | predictions = model(torch.tensor(encoded_seqs))
17 |
18 | print(predictions)
19 |
20 |
21 | if __name__ == "__main__":
22 | main()
23 |
--------------------------------------------------------------------------------
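
As the comment in test.py notes, every sequence in a batch must be the same length.
One way to handle mixed-length inputs is to group them into same-length batches before
encoding; a minimal illustrative sketch (the grouping helper is not part of the repo):

    from collections import defaultdict

    sequences = ["SMART", "MAGIC", "MAGICAL"]
    batches = defaultdict(list)
    for seq in sequences:
        batches[len(seq)].append(seq)

    # {5: ['SMART', 'MAGIC'], 7: ['MAGICAL']} -> encode and run each group separately
    print(dict(batches))
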
/metl/test2.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import metl
3 |
4 |
5 | def main():
6 | # "YoQkzoLD" is a METL-L (2M, 1D) [GFP] model that was fine-tuned on 64 examples from the avGFP DMS dataset
7 | model, data_encoder = metl.get_from_uuid(uuid="YoQkzoLD")
8 |
9 | # the GFP wild-type sequence
10 | wt = "SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQ" \
11 | "HDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKN" \
12 | "GIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK"
13 |
14 | # some example GFP variants to compute the scores for
15 | variants = ["E3K,G102S",
16 | "T36P,S203T,K207R",
17 | "V10A,D19G,F25S,E113V"]
18 |
19 | encoded_variants = data_encoder.encode_variants(wt, variants)
20 |
21 | # set model to eval mode
22 | model.eval()
23 | # no need to compute gradients for inference
24 | with torch.no_grad():
25 | predictions = model(torch.tensor(encoded_variants))
26 |
27 | print(predictions)
28 |
29 |
30 | if __name__ == "__main__":
31 | main()
32 |
--------------------------------------------------------------------------------
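
The variant strings above use 0-based positions into the wild-type sequence: "E3K"
asserts wt[3] == "E" and substitutes "K" (and indeed wt[3] is "E" in the GFP sequence
above). A minimal sketch of applying that convention, as an illustration rather than
the encoder's actual implementation:

    def apply_variant(wt, variant):
        """ apply a comma-separated variant string like "E3K,G102S" to wt """
        seq = list(wt)
        for mut in variant.split(","):
            orig_aa, pos, new_aa = mut[0], int(mut[1:-1]), mut[-1]
            assert seq[pos] == orig_aa, "wild-type mismatch at position {}".format(pos)
            seq[pos] = new_aa
        return "".join(seq)
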
/metl/test3.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import metl
3 |
4 |
5 | def main():
6 | # this is a 3D RPE model, which requires a PDB file matching the WT sequence
7 | model, data_encoder = metl.get_from_uuid(uuid="PEkeRuxb")
8 |
9 | # the GFP wild-type sequence
10 | wt = "SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQ" \
11 | "HDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKN" \
12 | "GIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK"
13 |
14 | # some example GFP variants to compute the scores for
15 | variants = ["E3K,G102S",
16 | "T36P,S203T,K207R",
17 | "V10A,D19G,F25S,E113V"]
18 |
19 | encoded_variants = data_encoder.encode_variants(wt, variants)
20 |
21 | # set model to eval mode
22 | model.eval()
23 | # no need to compute gradients for inference
24 | with torch.no_grad():
25 | predictions = model(torch.tensor(encoded_variants), pdb_fn="pdbs/1gfl_cm.pdb")
26 |
27 | print(predictions)
28 |
29 |
30 | if __name__ == "__main__":
31 | main()
32 |
--------------------------------------------------------------------------------
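
A note on pdb_fn: per the path-resolution logic in relative_attention.py, a path to an
existing file is used as is, while a bare filename that is not found on disk is looked
up in the model's default PDB directory. Continuing from the script above (whether a
given structure is bundled in that default directory is an assumption to verify):

    # explicit path, used as is because the file exists
    predictions = model(torch.tensor(encoded_variants), pdb_fn="pdbs/1gfl_cm.pdb")

    # bare filename, resolved against the default PDB directory if not found on disk
    predictions = model(torch.tensor(encoded_variants), pdb_fn="1gfl_cm.pdb")
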
/metl/test4.py:
--------------------------------------------------------------------------------
1 | import metl
2 | import torch
3 |
4 |
5 | def main():
6 | model, data_encoder = metl.get_from_ident("METL-L-2M-3D-GB1")
7 |
8 | # the GB1 WT sequence
9 | wt = "MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTE"
10 |
11 |     # some example GB1 variants for which to compute the METL-Local Rosetta score estimates
12 | variants = ["T17P,T54F",
13 | "V28L,F51A",
14 | "T17P,V28L,F51A,T54F"]
15 |
16 | encoded_variants = data_encoder.encode_variants(wt, variants)
17 |
18 | # set model to eval mode
19 | model.eval()
20 | # no need to compute gradients for inference
21 | with torch.no_grad():
22 | predictions = model(torch.tensor(encoded_variants), pdb_fn="pdbs/2qmt_p.pdb")
23 | print(predictions)
24 |
25 | # can also input full sequences
26 | sequences = ["MPYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTE",
27 | "MPAKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTE",
28 | "MGEKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTE"]
29 | encoded_sequences = data_encoder.encode_sequences(sequences)
30 | model.eval()
31 | with torch.no_grad():
32 | predictions = model(torch.tensor(encoded_sequences), pdb_fn="pdbs/2qmt_p.pdb")
33 | print(predictions)
34 |
35 | # can also use the 1D model which doesn't require a PDB file
36 | model, data_encoder = metl.get_from_ident("METL-L-2M-1D-GB1")
37 | variants = ["T17P,T54F",
38 | "V28L,F51A",
39 | "T17P,V28L,F51A,T54F"]
40 | encoded_variants = data_encoder.encode_variants(wt, variants)
41 | model.eval()
42 | with torch.no_grad():
43 | predictions = model(torch.tensor(encoded_variants))
44 | print(predictions)
45 |
46 |
47 | if __name__ == "__main__":
48 | main()
49 |
--------------------------------------------------------------------------------
/notebooks/inference.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Inference with METL-Global"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 10,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import torch\n",
17 | "import torchextractor as tx\n",
18 | "import torchinfo\n",
19 | "\n",
20 | "import metl"
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {},
26 | "source": [
27 | "# Load a METL-G model"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 5,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "model, data_encoder = metl.get_from_ident(\"METL-G-20M-1D\")"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 7,
42 | "metadata": {},
43 | "outputs": [
44 | {
45 | "name": "stdout",
46 | "output_type": "stream",
47 | "text": [
48 | "===============================================================================================\n",
49 | "Layer (type (var_name)) Param #\n",
50 | "===============================================================================================\n",
51 | "AttnModel (AttnModel) --\n",
52 | "├─SequentialWithArgs (model) --\n",
53 | "│ └─ScaledEmbedding (embedder) --\n",
54 | "│ │ └─Embedding (embedding) 10,752\n",
55 | "│ └─RelativeTransformerEncoder (tr_encoder) --\n",
56 | "│ │ └─ModuleList (layers) --\n",
57 | "│ │ │ └─RelativeTransformerEncoderLayer (0) 3,154,560\n",
58 | "│ │ │ └─RelativeTransformerEncoderLayer (1) 3,154,560\n",
59 | "│ │ │ └─RelativeTransformerEncoderLayer (2) 3,154,560\n",
60 | "│ │ │ └─RelativeTransformerEncoderLayer (3) 3,154,560\n",
61 | "│ │ │ └─RelativeTransformerEncoderLayer (4) 3,154,560\n",
62 | "│ │ │ └─RelativeTransformerEncoderLayer (5) 3,154,560\n",
63 | "│ │ └─LayerNorm (norm) 1,024\n",
64 | "│ └─GlobalAveragePooling (avg_pooling) --\n",
65 | "│ └─FCBlock (fc1) --\n",
66 | "│ │ └─Linear (fc) 262,656\n",
67 | "│ │ └─ReLU (activation) --\n",
68 | "│ │ └─LayerNorm (norm) 1,024\n",
69 | "│ │ └─Dropout (dropout) --\n",
70 | "│ └─Linear (prediction) 28,215\n",
71 | "===============================================================================================\n",
72 | "Total params: 19,231,031\n",
73 | "Trainable params: 19,231,031\n",
74 | "Non-trainable params: 0\n",
75 | "===============================================================================================\n"
76 | ]
77 | }
78 | ],
79 | "source": [
80 | "summary = torchinfo.summary(model, depth=4, verbose=1, row_settings=[\"var_names\"])"
81 | ]
82 | },
83 | {
84 | "cell_type": "markdown",
85 | "metadata": {},
86 | "source": [
87 | "# Set up representation extraction\n",
88 | "For METL-Global models, I recommend using the representation immediately after the GlobalAveragePooling (avg_pooling) layer. For METL-Local models, I recommend using the representation immediately after the final fully connected layer (fc1). "
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": 12,
94 | "metadata": {},
95 | "outputs": [],
96 | "source": [
97 | "return_layers = [\n",
98 | " \"model.avg_pooling\",\n",
99 | "]\n",
100 | "\n",
101 | "extractor = tx.Extractor(model.eval(), return_layers)"
102 | ]
103 | },
104 | {
105 | "cell_type": "markdown",
106 | "metadata": {},
107 | "source": [
108 | "# Test a couple sequences"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": 25,
114 | "metadata": {},
115 | "outputs": [],
116 | "source": [
117 | "# note: make sure all the sequences in a batch are the same length\n",
118 | "amino_acid_sequences = [\"SMART\", \"MAGIC\"]\n",
119 | "encoded_seqs = data_encoder.encode_sequences(amino_acid_sequences)\n",
120 | "\n",
121 | "with torch.no_grad():\n",
122 | " model_out, intermediate_out = extractor(torch.tensor(encoded_seqs))"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": 29,
128 | "metadata": {},
129 | "outputs": [
130 | {
131 | "data": {
132 | "text/plain": [
133 | "torch.Size([2, 55])"
134 | ]
135 | },
136 | "execution_count": 29,
137 | "metadata": {},
138 | "output_type": "execute_result"
139 | }
140 | ],
141 | "source": [
142 | "# model_out contains the final output of the model (Rosetta energy term predictions)\n",
143 | "# there are 55 energy terms, the first one is total_score \n",
144 | "# they are listed in order on the main README\n",
145 | "model_out.shape"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": 32,
151 | "metadata": {},
152 | "outputs": [
153 | {
154 | "data": {
155 | "text/plain": [
156 | "torch.Size([2, 512])"
157 | ]
158 | },
159 | "execution_count": 32,
160 | "metadata": {},
161 | "output_type": "execute_result"
162 | }
163 | ],
164 | "source": [
165 | "# intermediate_out is a dictionary containing intermediate outputs \n",
166 | "# for all the return_layers specified above\n",
167 | "# METL-G has an embedding dimension of 512, thus outputs will be 512\n",
168 | "intermediate_out[\"model.avg_pooling\"].shape"
169 | ]
170 | },
171 | {
172 | "cell_type": "markdown",
173 | "metadata": {},
174 | "source": [
175 | "# Additional notes\n",
176 | "The above will retrieve a length 512 sequence-level representation immediately following the global average pooling layer, which takes the average of residue-level representations. \n",
177 | "\n",
178 | "If you want, you can also get the residue-level representations. You can also play around with the sequence-level representation from after the FC layer, although I haven't had as much success with this representation for my tasks (too specific to the Rosetta energies?). You may have more luck with it, though. "
179 | ]
180 | },
181 | {
182 | "cell_type": "code",
183 | "execution_count": 34,
184 | "metadata": {},
185 | "outputs": [],
186 | "source": [
187 | "# the above will retrieve a length 512 sequence-level representation\n",
188 | "# you can also get a representation for each residue\n",
189 | "\n",
190 | "return_layers = [\n",
191 | " \"model.tr_encoder\", # residue-level representation\n",
192 | " \"model.avg_pooling\", # sequence-level representation following avg pooling\n",
193 | " \"model.fc1\", # sequence-level representation following the final fully connected layer\n",
194 | "]\n",
195 | "\n",
196 | "extractor = tx.Extractor(model.eval(), return_layers)\n",
197 | "\n",
198 | "amino_acid_sequences = [\"SMART\", \"MAGIC\"]\n",
199 | "encoded_seqs = data_encoder.encode_sequences(amino_acid_sequences)\n",
200 | "\n",
201 | "with torch.no_grad():\n",
202 | " model_out, intermediate_out = extractor(torch.tensor(encoded_seqs))"
203 | ]
204 | },
205 | {
206 | "cell_type": "code",
207 | "execution_count": 42,
208 | "metadata": {},
209 | "outputs": [
210 | {
211 | "name": "stdout",
212 | "output_type": "stream",
213 | "text": [
214 | "Layer: model.tr_encoder\n",
215 | "Output shape: torch.Size([2, 5, 512])\n",
216 | "\n",
217 | "Layer: model.avg_pooling\n",
218 | "Output shape: torch.Size([2, 512])\n",
219 | "\n",
220 | "Layer: model.fc1\n",
221 | "Output shape: torch.Size([2, 512])\n"
222 | ]
223 | }
224 | ],
225 | "source": [
226 | "for k, v in intermediate_out.items():\n",
227 | " print(\"Layer: {}\\nOutput shape: {}\\n\".format(k, v.shape))"
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": null,
233 | "metadata": {},
234 | "outputs": [],
235 | "source": []
236 | }
237 | ],
238 | "metadata": {
239 | "kernelspec": {
240 | "display_name": "Python 3 (ipykernel)",
241 | "language": "python",
242 | "name": "python3"
243 | },
244 | "language_info": {
245 | "codemirror_mode": {
246 | "name": "ipython",
247 | "version": 3
248 | },
249 | "file_extension": ".py",
250 | "mimetype": "text/x-python",
251 | "name": "python",
252 | "nbconvert_exporter": "python",
253 | "pygments_lexer": "ipython3",
254 | "version": "3.9.16"
255 | }
256 | },
257 | "nbformat": 4,
258 | "nbformat_minor": 4
259 | }
260 |
--------------------------------------------------------------------------------
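
Once extracted, the pooled representations are ordinary tensors and can serve as
features for a downstream model. A minimal sketch continuing from the notebook's
intermediate_out; scikit-learn and the example labels are assumptions, not part of
this repo:

    import numpy as np
    from sklearn.linear_model import Ridge

    X = intermediate_out["model.avg_pooling"].numpy()  # [n_sequences, 512] features
    y = np.array([0.7, 1.3])  # hypothetical labels, one per input sequence
    reg = Ridge(alpha=1.0).fit(X, y)
    print(reg.predict(X))
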
/pdbs/pab1_cm.pdb:
--------------------------------------------------------------------------------
1 | ATOM 1 N GLY A 1 -14.422 25.734 -5.746 1.00 0.00 N
2 | ATOM 2 CA GLY A 1 -15.203 24.662 -5.115 1.00 0.00 C
3 | ATOM 3 C GLY A 1 -14.487 23.322 -5.215 1.00 0.00 C
4 | ATOM 4 O GLY A 1 -15.061 22.336 -5.691 1.00 0.00 O
5 | ATOM 10 N ASN A 2 -13.214 23.298 -4.818 1.00 0.00 N
6 | ATOM 11 CA ASN A 2 -12.380 22.099 -4.924 1.00 0.00 C
7 | ATOM 12 C ASN A 2 -12.321 21.350 -3.595 1.00 0.00 C
8 | ATOM 13 O ASN A 2 -11.749 21.848 -2.626 1.00 0.00 O
9 | ATOM 14 CB ASN A 2 -11.004 22.475 -5.456 1.00 0.00 C
10 | ATOM 15 CG ASN A 2 -10.105 21.294 -5.801 1.00 0.00 C
11 | ATOM 16 OD1 ASN A 2 -10.309 20.145 -5.399 1.00 0.00 O
12 | ATOM 17 ND2 ASN A 2 -9.088 21.602 -6.581 1.00 0.00 N
13 | ATOM 24 N ILE A 3 -12.987 20.198 -3.552 1.00 0.00 N
14 | ATOM 25 CA ILE A 3 -13.111 19.388 -2.339 1.00 0.00 C
15 | ATOM 26 C ILE A 3 -11.955 18.383 -2.279 1.00 0.00 C
16 | ATOM 27 O ILE A 3 -11.637 17.704 -3.269 1.00 0.00 O
17 | ATOM 28 CB ILE A 3 -14.514 18.692 -2.294 1.00 0.00 C
18 | ATOM 29 CG1 ILE A 3 -14.771 17.939 -0.981 1.00 0.00 C
19 | ATOM 30 CG2 ILE A 3 -14.647 17.722 -3.425 1.00 0.00 C
20 | ATOM 31 CD1 ILE A 3 -16.274 17.589 -0.797 1.00 0.00 C
21 | ATOM 43 N PHE A 4 -11.283 18.341 -1.122 1.00 0.00 N
22 | ATOM 44 CA PHE A 4 -10.155 17.436 -0.920 1.00 0.00 C
23 | ATOM 45 C PHE A 4 -10.661 16.137 -0.366 1.00 0.00 C
24 | ATOM 46 O PHE A 4 -11.352 16.110 0.657 1.00 0.00 O
25 | ATOM 47 CB PHE A 4 -9.103 18.009 0.017 1.00 0.00 C
26 | ATOM 48 CG PHE A 4 -7.943 17.071 0.172 1.00 0.00 C
27 | ATOM 49 CD1 PHE A 4 -7.063 16.930 -0.865 1.00 0.00 C
28 | ATOM 50 CD2 PHE A 4 -7.728 16.332 1.329 1.00 0.00 C
29 | ATOM 51 CE1 PHE A 4 -5.991 16.087 -0.788 1.00 0.00 C
30 | ATOM 52 CE2 PHE A 4 -6.636 15.480 1.408 1.00 0.00 C
31 | ATOM 53 CZ PHE A 4 -5.773 15.360 0.346 1.00 0.00 C
32 | ATOM 63 N ILE A 5 -10.339 15.058 -1.047 1.00 0.00 N
33 | ATOM 64 CA ILE A 5 -10.849 13.748 -0.712 1.00 0.00 C
34 | ATOM 65 C ILE A 5 -9.733 12.867 -0.152 1.00 0.00 C
35 | ATOM 66 O ILE A 5 -8.807 12.510 -0.882 1.00 0.00 O
36 | ATOM 67 CB ILE A 5 -11.419 13.233 -2.043 1.00 0.00 C
37 | ATOM 68 CG1 ILE A 5 -12.474 14.279 -2.540 1.00 0.00 C
38 | ATOM 69 CG2 ILE A 5 -12.001 11.881 -1.904 1.00 0.00 C
39 | ATOM 70 CD1 ILE A 5 -12.858 14.133 -3.925 1.00 0.00 C
40 | ATOM 82 N LYS A 6 -9.826 12.496 1.130 1.00 0.00 N
41 | ATOM 83 CA LYS A 6 -8.783 11.737 1.831 1.00 0.00 C
42 | ATOM 84 C LYS A 6 -9.218 10.329 2.259 1.00 0.00 C
43 | ATOM 85 O LYS A 6 -10.316 10.149 2.795 1.00 0.00 O
44 | ATOM 86 CB LYS A 6 -8.356 12.530 3.067 1.00 0.00 C
45 | ATOM 87 CG LYS A 6 -7.219 11.941 3.897 1.00 0.00 C
46 | ATOM 88 CD LYS A 6 -6.844 12.874 5.046 1.00 0.00 C
47 | ATOM 89 CE LYS A 6 -5.710 12.297 5.888 1.00 0.00 C
48 | ATOM 90 NZ LYS A 6 -5.333 13.202 7.016 1.00 0.00 N
49 | ATOM 104 N ASN A 7 -8.274 9.366 2.191 1.00 0.00 N
50 | ATOM 105 CA ASN A 7 -8.491 7.952 2.533 1.00 0.00 C
51 | ATOM 106 C ASN A 7 -9.432 7.386 1.496 1.00 0.00 C
52 | ATOM 107 O ASN A 7 -10.439 6.750 1.793 1.00 0.00 O
53 | ATOM 108 CB ASN A 7 -9.065 7.762 3.932 1.00 0.00 C
54 | ATOM 109 CG ASN A 7 -8.816 6.355 4.498 1.00 0.00 C
55 | ATOM 110 OD1 ASN A 7 -7.731 5.784 4.300 1.00 0.00 O
56 | ATOM 111 ND2 ASN A 7 -9.792 5.803 5.180 1.00 0.00 N
57 | ATOM 118 N LEU A 8 -9.085 7.686 0.259 1.00 0.00 N
58 | ATOM 119 CA LEU A 8 -9.833 7.261 -0.890 1.00 0.00 C
59 | ATOM 120 C LEU A 8 -9.372 5.874 -1.287 1.00 0.00 C
60 | ATOM 121 O LEU A 8 -8.217 5.662 -1.658 1.00 0.00 O
61 | ATOM 122 CB LEU A 8 -9.618 8.298 -2.008 1.00 0.00 C
62 | ATOM 123 CG LEU A 8 -10.363 8.119 -3.325 1.00 0.00 C
63 | ATOM 124 CD1 LEU A 8 -11.871 8.223 -3.085 1.00 0.00 C
64 | ATOM 125 CD2 LEU A 8 -9.900 9.222 -4.304 1.00 0.00 C
65 | ATOM 137 N HIS A 9 -10.299 4.929 -1.229 1.00 0.00 N
66 | ATOM 138 CA HIS A 9 -10.009 3.530 -1.488 1.00 0.00 C
67 | ATOM 139 C HIS A 9 -9.318 3.361 -2.853 1.00 0.00 C
68 | ATOM 140 O HIS A 9 -9.773 3.957 -3.832 1.00 0.00 O
69 | ATOM 141 CB HIS A 9 -11.318 2.733 -1.423 1.00 0.00 C
70 | ATOM 142 CG HIS A 9 -11.162 1.273 -1.530 1.00 0.00 C
71 | ATOM 143 ND1 HIS A 9 -10.964 0.628 -2.740 1.00 0.00 N
72 | ATOM 144 CD2 HIS A 9 -11.178 0.308 -0.583 1.00 0.00 C
73 | ATOM 145 CE1 HIS A 9 -10.849 -0.680 -2.513 1.00 0.00 C
74 | ATOM 146 NE2 HIS A 9 -10.982 -0.890 -1.218 1.00 0.00 N
75 | ATOM 154 N PRO A 10 -8.247 2.530 -2.961 1.00 0.00 N
76 | ATOM 155 CA PRO A 10 -7.463 2.245 -4.160 1.00 0.00 C
77 | ATOM 156 C PRO A 10 -8.216 1.897 -5.437 1.00 0.00 C
78 | ATOM 157 O PRO A 10 -7.713 2.181 -6.521 1.00 0.00 O
79 | ATOM 158 CB PRO A 10 -6.614 1.049 -3.718 1.00 0.00 C
80 | ATOM 159 CG PRO A 10 -6.414 1.247 -2.247 1.00 0.00 C
81 | ATOM 160 CD PRO A 10 -7.703 1.838 -1.744 1.00 0.00 C
82 | ATOM 168 N ASP A 11 -9.407 1.294 -5.349 1.00 0.00 N
83 | ATOM 169 CA ASP A 11 -10.095 0.943 -6.588 1.00 0.00 C
84 | ATOM 170 C ASP A 11 -10.914 2.105 -7.152 1.00 0.00 C
85 | ATOM 171 O ASP A 11 -11.527 1.994 -8.226 1.00 0.00 O
86 | ATOM 172 CB ASP A 11 -10.995 -0.273 -6.379 1.00 0.00 C
87 | ATOM 173 CG ASP A 11 -10.216 -1.581 -6.127 1.00 0.00 C
88 | ATOM 174 OD1 ASP A 11 -9.095 -1.715 -6.563 1.00 0.00 O
89 | ATOM 175 OD2 ASP A 11 -10.768 -2.435 -5.480 1.00 0.00 O
90 | ATOM 180 N ILE A 12 -10.948 3.237 -6.458 1.00 0.00 N
91 | ATOM 181 CA ILE A 12 -11.752 4.320 -6.974 1.00 0.00 C
92 | ATOM 182 C ILE A 12 -10.954 5.125 -7.965 1.00 0.00 C
93 | ATOM 183 O ILE A 12 -10.210 6.034 -7.606 1.00 0.00 O
94 | ATOM 184 CB ILE A 12 -12.266 5.232 -5.836 1.00 0.00 C
95 | ATOM 185 CG1 ILE A 12 -13.108 4.367 -4.867 1.00 0.00 C
96 | ATOM 186 CG2 ILE A 12 -13.089 6.433 -6.421 1.00 0.00 C
97 | ATOM 187 CD1 ILE A 12 -13.488 5.021 -3.574 1.00 0.00 C
98 | ATOM 199 N ASP A 13 -11.156 4.805 -9.232 1.00 0.00 N
99 | ATOM 200 CA ASP A 13 -10.457 5.490 -10.307 1.00 0.00 C
100 | ATOM 201 C ASP A 13 -11.192 6.799 -10.549 1.00 0.00 C
101 | ATOM 202 O ASP A 13 -12.200 7.063 -9.887 1.00 0.00 O
102 | ATOM 203 CB ASP A 13 -10.423 4.601 -11.563 1.00 0.00 C
103 | ATOM 204 CG ASP A 13 -9.331 4.969 -12.606 1.00 0.00 C
104 | ATOM 205 OD1 ASP A 13 -8.920 6.114 -12.647 1.00 0.00 O
105 | ATOM 206 OD2 ASP A 13 -8.950 4.110 -13.350 1.00 0.00 O
106 | ATOM 211 N ASN A 14 -10.728 7.614 -11.486 1.00 0.00 N
107 | ATOM 212 CA ASN A 14 -11.371 8.901 -11.714 1.00 0.00 C
108 | ATOM 213 C ASN A 14 -12.826 8.782 -12.134 1.00 0.00 C
109 | ATOM 214 O ASN A 14 -13.651 9.603 -11.737 1.00 0.00 O
110 | ATOM 215 CB ASN A 14 -10.595 9.740 -12.705 1.00 0.00 C
111 | ATOM 216 CG ASN A 14 -11.203 11.111 -12.864 1.00 0.00 C
112 | ATOM 217 OD1 ASN A 14 -11.506 11.787 -11.870 1.00 0.00 O
113 | ATOM 218 ND2 ASN A 14 -11.375 11.534 -14.091 1.00 0.00 N
114 | ATOM 225 N LYS A 15 -13.160 7.765 -12.927 1.00 0.00 N
115 | ATOM 226 CA LYS A 15 -14.541 7.618 -13.370 1.00 0.00 C
116 | ATOM 227 C LYS A 15 -15.434 7.408 -12.156 1.00 0.00 C
117 | ATOM 228 O LYS A 15 -16.485 8.036 -12.018 1.00 0.00 O
118 | ATOM 229 CB LYS A 15 -14.683 6.453 -14.354 1.00 0.00 C
119 | ATOM 230 CG LYS A 15 -15.999 6.439 -15.154 1.00 0.00 C
120 | ATOM 231 CD LYS A 15 -17.105 5.605 -14.486 1.00 0.00 C
121 | ATOM 232 CE LYS A 15 -18.334 5.496 -15.397 1.00 0.00 C
122 | ATOM 233 NZ LYS A 15 -19.468 4.780 -14.736 1.00 0.00 N
123 | ATOM 247 N ALA A 16 -15.019 6.484 -11.298 1.00 0.00 N
124 | ATOM 248 CA ALA A 16 -15.738 6.163 -10.083 1.00 0.00 C
125 | ATOM 249 C ALA A 16 -15.829 7.367 -9.147 1.00 0.00 C
126 | ATOM 250 O ALA A 16 -16.873 7.578 -8.507 1.00 0.00 O
127 | ATOM 251 CB ALA A 16 -15.056 5.009 -9.387 1.00 0.00 C
128 | ATOM 257 N LEU A 17 -14.757 8.174 -9.078 1.00 0.00 N
129 | ATOM 258 CA LEU A 17 -14.784 9.324 -8.193 1.00 0.00 C
130 | ATOM 259 C LEU A 17 -15.820 10.310 -8.717 1.00 0.00 C
131 | ATOM 260 O LEU A 17 -16.582 10.895 -7.940 1.00 0.00 O
132 | ATOM 261 CB LEU A 17 -13.412 10.018 -8.112 1.00 0.00 C
133 | ATOM 262 CG LEU A 17 -13.282 11.165 -7.060 1.00 0.00 C
134 | ATOM 263 CD1 LEU A 17 -13.501 10.569 -5.683 1.00 0.00 C
135 | ATOM 264 CD2 LEU A 17 -11.899 11.825 -7.130 1.00 0.00 C
136 | ATOM 276 N TYR A 18 -15.842 10.498 -10.042 1.00 0.00 N
137 | ATOM 277 CA TYR A 18 -16.803 11.367 -10.690 1.00 0.00 C
138 | ATOM 278 C TYR A 18 -18.218 10.915 -10.372 1.00 0.00 C
139 | ATOM 279 O TYR A 18 -19.041 11.723 -9.953 1.00 0.00 O
140 | ATOM 280 CB TYR A 18 -16.594 11.399 -12.202 1.00 0.00 C
141 | ATOM 281 CG TYR A 18 -17.645 12.191 -12.929 1.00 0.00 C
142 | ATOM 282 CD1 TYR A 18 -17.522 13.559 -13.084 1.00 0.00 C
143 | ATOM 283 CD2 TYR A 18 -18.763 11.535 -13.429 1.00 0.00 C
144 | ATOM 284 CE1 TYR A 18 -18.511 14.255 -13.747 1.00 0.00 C
145 | ATOM 285 CE2 TYR A 18 -19.742 12.237 -14.077 1.00 0.00 C
146 | ATOM 286 CZ TYR A 18 -19.619 13.592 -14.237 1.00 0.00 C
147 | ATOM 287 OH TYR A 18 -20.597 14.303 -14.892 1.00 0.00 O
148 | ATOM 297 N ASP A 19 -18.526 9.619 -10.559 1.00 0.00 N
149 | ATOM 298 CA ASP A 19 -19.887 9.156 -10.271 1.00 0.00 C
150 | ATOM 299 C ASP A 19 -20.270 9.381 -8.811 1.00 0.00 C
151 | ATOM 300 O ASP A 19 -21.415 9.730 -8.503 1.00 0.00 O
152 | ATOM 301 CB ASP A 19 -20.059 7.658 -10.564 1.00 0.00 C
153 | ATOM 302 CG ASP A 19 -20.136 7.255 -12.050 1.00 0.00 C
154 | ATOM 303 OD1 ASP A 19 -20.287 8.093 -12.902 1.00 0.00 O
155 | ATOM 304 OD2 ASP A 19 -20.085 6.061 -12.307 1.00 0.00 O
156 | ATOM 309 N THR A 20 -19.306 9.191 -7.909 1.00 0.00 N
157 | ATOM 310 CA THR A 20 -19.524 9.361 -6.480 1.00 0.00 C
158 | ATOM 311 C THR A 20 -19.883 10.802 -6.126 1.00 0.00 C
159 | ATOM 312 O THR A 20 -20.820 11.049 -5.367 1.00 0.00 O
160 | ATOM 313 CB THR A 20 -18.253 8.949 -5.689 1.00 0.00 C
161 | ATOM 314 OG1 THR A 20 -17.973 7.556 -5.924 1.00 0.00 O
162 | ATOM 315 CG2 THR A 20 -18.432 9.208 -4.168 1.00 0.00 C
163 | ATOM 323 N PHE A 21 -19.127 11.754 -6.666 1.00 0.00 N
164 | ATOM 324 CA PHE A 21 -19.326 13.162 -6.370 1.00 0.00 C
165 | ATOM 325 C PHE A 21 -20.297 13.952 -7.265 1.00 0.00 C
166 | ATOM 326 O PHE A 21 -20.857 14.948 -6.805 1.00 0.00 O
167 | ATOM 327 CB PHE A 21 -17.958 13.833 -6.328 1.00 0.00 C
168 | ATOM 328 CG PHE A 21 -17.184 13.527 -5.060 1.00 0.00 C
169 | ATOM 329 CD1 PHE A 21 -16.546 12.317 -4.880 1.00 0.00 C
170 | ATOM 330 CD2 PHE A 21 -17.068 14.475 -4.057 1.00 0.00 C
171 | ATOM 331 CE1 PHE A 21 -15.861 12.040 -3.728 1.00 0.00 C
172 | ATOM 332 CE2 PHE A 21 -16.357 14.195 -2.921 1.00 0.00 C
173 | ATOM 333 CZ PHE A 21 -15.767 12.971 -2.759 1.00 0.00 C
174 | ATOM 343 N SER A 22 -20.563 13.513 -8.507 1.00 0.00 N
175 | ATOM 344 CA SER A 22 -21.421 14.276 -9.438 1.00 0.00 C
176 | ATOM 345 C SER A 22 -22.840 14.409 -8.914 1.00 0.00 C
177 | ATOM 346 O SER A 22 -23.579 15.328 -9.278 1.00 0.00 O
178 | ATOM 347 CB SER A 22 -21.476 13.630 -10.809 1.00 0.00 C
179 | ATOM 348 OG SER A 22 -22.234 12.456 -10.781 1.00 0.00 O
180 | ATOM 354 N VAL A 23 -23.205 13.525 -8.002 1.00 0.00 N
181 | ATOM 355 CA VAL A 23 -24.507 13.511 -7.364 1.00 0.00 C
182 | ATOM 356 C VAL A 23 -24.765 14.811 -6.588 1.00 0.00 C
183 | ATOM 357 O VAL A 23 -25.915 15.119 -6.276 1.00 0.00 O
184 | ATOM 358 CB VAL A 23 -24.627 12.276 -6.441 1.00 0.00 C
185 | ATOM 359 CG1 VAL A 23 -23.788 12.451 -5.222 1.00 0.00 C
186 | ATOM 360 CG2 VAL A 23 -26.085 12.039 -6.048 1.00 0.00 C
187 | ATOM 370 N PHE A 24 -23.703 15.557 -6.245 1.00 0.00 N
188 | ATOM 371 CA PHE A 24 -23.847 16.787 -5.486 1.00 0.00 C
189 | ATOM 372 C PHE A 24 -23.802 18.026 -6.370 1.00 0.00 C
190 | ATOM 373 O PHE A 24 -23.863 19.143 -5.862 1.00 0.00 O
191 | ATOM 374 CB PHE A 24 -22.712 16.890 -4.470 1.00 0.00 C
192 | ATOM 375 CG PHE A 24 -22.697 15.738 -3.568 1.00 0.00 C
193 | ATOM 376 CD1 PHE A 24 -21.676 14.819 -3.638 1.00 0.00 C
194 | ATOM 377 CD2 PHE A 24 -23.717 15.523 -2.700 1.00 0.00 C
195 | ATOM 378 CE1 PHE A 24 -21.681 13.715 -2.848 1.00 0.00 C
196 | ATOM 379 CE2 PHE A 24 -23.735 14.430 -1.907 1.00 0.00 C
197 | ATOM 380 CZ PHE A 24 -22.710 13.512 -1.975 1.00 0.00 C
198 | ATOM 390 N GLY A 25 -23.727 17.846 -7.686 1.00 0.00 N
199 | ATOM 391 CA GLY A 25 -23.649 18.967 -8.611 1.00 0.00 C
200 | ATOM 392 C GLY A 25 -22.585 18.738 -9.674 1.00 0.00 C
201 | ATOM 393 O GLY A 25 -21.672 17.933 -9.504 1.00 0.00 O
202 | ATOM 397 N ASP A 26 -22.686 19.484 -10.769 1.00 0.00 N
203 | ATOM 398 CA ASP A 26 -21.756 19.335 -11.883 1.00 0.00 C
204 | ATOM 399 C ASP A 26 -20.306 19.573 -11.494 1.00 0.00 C
205 | ATOM 400 O ASP A 26 -19.990 20.458 -10.690 1.00 0.00 O
206 | ATOM 401 CB ASP A 26 -22.149 20.267 -13.029 1.00 0.00 C
207 | ATOM 402 CG ASP A 26 -23.427 19.804 -13.758 1.00 0.00 C
208 | ATOM 403 OD1 ASP A 26 -23.857 18.695 -13.542 1.00 0.00 O
209 | ATOM 404 OD2 ASP A 26 -23.955 20.561 -14.522 1.00 0.00 O
210 | ATOM 409 N ILE A 27 -19.440 18.762 -12.100 1.00 0.00 N
211 | ATOM 410 CA ILE A 27 -17.995 18.758 -11.893 1.00 0.00 C
212 | ATOM 411 C ILE A 27 -17.248 19.320 -13.092 1.00 0.00 C
213 | ATOM 412 O ILE A 27 -17.507 18.940 -14.230 1.00 0.00 O
214 | ATOM 413 CB ILE A 27 -17.516 17.325 -11.563 1.00 0.00 C
215 | ATOM 414 CG1 ILE A 27 -18.169 16.884 -10.236 1.00 0.00 C
216 | ATOM 415 CG2 ILE A 27 -15.974 17.227 -11.515 1.00 0.00 C
217 | ATOM 416 CD1 ILE A 27 -18.007 15.418 -9.892 1.00 0.00 C
218 | ATOM 428 N LEU A 28 -16.340 20.255 -12.818 1.00 0.00 N
219 | ATOM 429 CA LEU A 28 -15.535 20.902 -13.846 1.00 0.00 C
220 | ATOM 430 C LEU A 28 -14.346 20.016 -14.171 1.00 0.00 C
221 | ATOM 431 O LEU A 28 -13.986 19.816 -15.333 1.00 0.00 O
222 | ATOM 432 CB LEU A 28 -14.979 22.225 -13.299 1.00 0.00 C
223 | ATOM 433 CG LEU A 28 -15.983 23.294 -12.891 1.00 0.00 C
224 | ATOM 434 CD1 LEU A 28 -15.221 24.412 -12.185 1.00 0.00 C
225 | ATOM 435 CD2 LEU A 28 -16.746 23.825 -14.100 1.00 0.00 C
226 | ATOM 447 N SER A 29 -13.738 19.485 -13.112 1.00 0.00 N
227 | ATOM 448 CA SER A 29 -12.551 18.639 -13.225 1.00 0.00 C
228 | ATOM 449 C SER A 29 -12.321 17.756 -12.009 1.00 0.00 C
229 | ATOM 450 O SER A 29 -12.553 18.172 -10.874 1.00 0.00 O
230 | ATOM 451 CB SER A 29 -11.321 19.482 -13.463 1.00 0.00 C
231 | ATOM 452 OG SER A 29 -10.178 18.675 -13.512 1.00 0.00 O
232 | ATOM 458 N SER A 30 -11.853 16.538 -12.232 1.00 0.00 N
233 | ATOM 459 CA SER A 30 -11.551 15.635 -11.125 1.00 0.00 C
234 | ATOM 460 C SER A 30 -10.371 14.735 -11.441 1.00 0.00 C
235 | ATOM 461 O SER A 30 -10.023 14.531 -12.611 1.00 0.00 O
236 | ATOM 462 CB SER A 30 -12.765 14.793 -10.793 1.00 0.00 C
237 | ATOM 463 OG SER A 30 -13.117 13.966 -11.871 1.00 0.00 O
238 | ATOM 469 N LYS A 31 -9.748 14.213 -10.382 1.00 0.00 N
239 | ATOM 470 CA LYS A 31 -8.615 13.307 -10.539 1.00 0.00 C
240 | ATOM 471 C LYS A 31 -8.281 12.483 -9.299 1.00 0.00 C
241 | ATOM 472 O LYS A 31 -8.668 12.813 -8.171 1.00 0.00 O
242 | ATOM 473 CB LYS A 31 -7.371 14.102 -10.963 1.00 0.00 C
243 | ATOM 474 CG LYS A 31 -6.900 15.139 -9.938 1.00 0.00 C
244 | ATOM 475 CD LYS A 31 -5.709 15.940 -10.456 1.00 0.00 C
245 | ATOM 476 CE LYS A 31 -5.215 16.947 -9.418 1.00 0.00 C
246 | ATOM 477 NZ LYS A 31 -4.072 17.761 -9.931 1.00 0.00 N
247 | ATOM 491 N ILE A 32 -7.489 11.436 -9.519 1.00 0.00 N
248 | ATOM 492 CA ILE A 32 -6.959 10.585 -8.458 1.00 0.00 C
249 | ATOM 493 C ILE A 32 -5.485 10.867 -8.352 1.00 0.00 C
250 | ATOM 494 O ILE A 32 -4.791 10.889 -9.369 1.00 0.00 O
251 | ATOM 495 CB ILE A 32 -7.114 9.087 -8.789 1.00 0.00 C
252 | ATOM 496 CG1 ILE A 32 -8.554 8.771 -9.152 1.00 0.00 C
253 | ATOM 497 CG2 ILE A 32 -6.615 8.198 -7.598 1.00 0.00 C
254 | ATOM 498 CD1 ILE A 32 -9.572 9.045 -8.116 1.00 0.00 C
255 | ATOM 510 N ALA A 33 -4.994 11.121 -7.154 1.00 0.00 N
256 | ATOM 511 CA ALA A 33 -3.574 11.357 -7.015 1.00 0.00 C
257 | ATOM 512 C ALA A 33 -2.874 10.014 -6.992 1.00 0.00 C
258 | ATOM 513 O ALA A 33 -3.216 9.168 -6.159 1.00 0.00 O
259 | ATOM 514 CB ALA A 33 -3.297 12.132 -5.757 1.00 0.00 C
260 | ATOM 520 N THR A 34 -1.910 9.812 -7.889 1.00 0.00 N
261 | ATOM 521 CA THR A 34 -1.221 8.532 -7.971 1.00 0.00 C
262 | ATOM 522 C THR A 34 0.287 8.676 -7.951 1.00 0.00 C
263 | ATOM 523 O THR A 34 0.837 9.773 -8.107 1.00 0.00 O
264 | ATOM 524 CB THR A 34 -1.572 7.780 -9.264 1.00 0.00 C
265 | ATOM 525 OG1 THR A 34 -1.027 8.485 -10.394 1.00 0.00 O
266 | ATOM 526 CG2 THR A 34 -3.081 7.659 -9.417 1.00 0.00 C
267 | ATOM 534 N ASP A 35 0.946 7.541 -7.783 1.00 0.00 N
268 | ATOM 535 CA ASP A 35 2.384 7.406 -7.836 1.00 0.00 C
269 | ATOM 536 C ASP A 35 2.793 7.280 -9.301 1.00 0.00 C
270 | ATOM 537 O ASP A 35 1.939 7.274 -10.196 1.00 0.00 O
271 | ATOM 538 CB ASP A 35 2.810 6.178 -7.010 1.00 0.00 C
272 | ATOM 539 CG ASP A 35 4.262 6.217 -6.498 1.00 0.00 C
273 | ATOM 540 OD1 ASP A 35 5.071 6.886 -7.108 1.00 0.00 O
274 | ATOM 541 OD2 ASP A 35 4.541 5.575 -5.522 1.00 0.00 O
275 | ATOM 546 N GLU A 36 4.086 7.145 -9.552 1.00 0.00 N
276 | ATOM 547 CA GLU A 36 4.606 7.073 -10.916 1.00 0.00 C
277 | ATOM 548 C GLU A 36 4.040 5.897 -11.703 1.00 0.00 C
278 | ATOM 549 O GLU A 36 3.826 5.995 -12.910 1.00 0.00 O
279 | ATOM 550 CB GLU A 36 6.132 6.973 -10.903 1.00 0.00 C
280 | ATOM 551 CG GLU A 36 6.845 8.240 -10.448 1.00 0.00 C
281 | ATOM 552 CD GLU A 36 8.344 8.095 -10.431 1.00 0.00 C
282 | ATOM 553 OE1 GLU A 36 8.820 7.012 -10.687 1.00 0.00 O
283 | ATOM 554 OE2 GLU A 36 9.013 9.065 -10.166 1.00 0.00 O
284 | ATOM 561 N ASN A 37 3.780 4.784 -11.026 1.00 0.00 N
285 | ATOM 562 CA ASN A 37 3.262 3.591 -11.678 1.00 0.00 C
286 | ATOM 563 C ASN A 37 1.732 3.509 -11.725 1.00 0.00 C
287 | ATOM 564 O ASN A 37 1.181 2.464 -12.075 1.00 0.00 O
288 | ATOM 565 CB ASN A 37 3.848 2.367 -11.011 1.00 0.00 C
289 | ATOM 566 CG ASN A 37 3.428 2.234 -9.575 1.00 0.00 C
290 | ATOM 567 OD1 ASN A 37 2.503 2.909 -9.082 1.00 0.00 O
291 | ATOM 568 ND2 ASN A 37 4.112 1.366 -8.869 1.00 0.00 N
292 | ATOM 575 N GLY A 38 1.036 4.588 -11.355 1.00 0.00 N
293 | ATOM 576 CA GLY A 38 -0.422 4.612 -11.383 1.00 0.00 C
294 | ATOM 577 C GLY A 38 -1.093 4.204 -10.069 1.00 0.00 C
295 | ATOM 578 O GLY A 38 -2.308 4.355 -9.920 1.00 0.00 O
296 | ATOM 582 N LYS A 39 -0.328 3.678 -9.116 1.00 0.00 N
297 | ATOM 583 CA LYS A 39 -0.898 3.279 -7.834 1.00 0.00 C
298 | ATOM 584 C LYS A 39 -1.445 4.483 -7.070 1.00 0.00 C
299 | ATOM 585 O LYS A 39 -0.750 5.486 -6.904 1.00 0.00 O
300 | ATOM 586 CB LYS A 39 0.151 2.550 -6.989 1.00 0.00 C
301 | ATOM 587 CG LYS A 39 -0.353 2.010 -5.653 1.00 0.00 C
302 | ATOM 588 CD LYS A 39 0.739 1.224 -4.926 1.00 0.00 C
303 | ATOM 589 CE LYS A 39 0.241 0.694 -3.586 1.00 0.00 C
304 | ATOM 590 NZ LYS A 39 1.293 -0.082 -2.865 1.00 0.00 N
305 | ATOM 604 N SER A 40 -2.677 4.388 -6.576 1.00 0.00 N
306 | ATOM 605 CA SER A 40 -3.268 5.490 -5.814 1.00 0.00 C
307 | ATOM 606 C SER A 40 -2.516 5.832 -4.540 1.00 0.00 C
308 | ATOM 607 O SER A 40 -2.092 4.945 -3.795 1.00 0.00 O
309 | ATOM 608 CB SER A 40 -4.705 5.197 -5.464 1.00 0.00 C
310 | ATOM 609 OG SER A 40 -5.210 6.183 -4.590 1.00 0.00 O
311 | ATOM 615 N LYS A 41 -2.420 7.134 -4.258 1.00 0.00 N
312 | ATOM 616 CA LYS A 41 -1.789 7.649 -3.042 1.00 0.00 C
313 | ATOM 617 C LYS A 41 -2.785 7.821 -1.893 1.00 0.00 C
314 | ATOM 618 O LYS A 41 -2.442 8.363 -0.842 1.00 0.00 O
315 | ATOM 619 CB LYS A 41 -1.045 8.957 -3.310 1.00 0.00 C
316 | ATOM 620 CG LYS A 41 0.129 8.766 -4.242 1.00 0.00 C
317 | ATOM 621 CD LYS A 41 1.075 9.969 -4.320 1.00 0.00 C
318 | ATOM 622 CE LYS A 41 0.466 11.157 -5.033 1.00 0.00 C
319 | ATOM 623 NZ LYS A 41 1.508 12.156 -5.422 1.00 0.00 N
320 | ATOM 637 N GLY A 42 -4.027 7.380 -2.104 1.00 0.00 N
321 | ATOM 638 CA GLY A 42 -5.062 7.464 -1.079 1.00 0.00 C
322 | ATOM 639 C GLY A 42 -5.860 8.764 -1.046 1.00 0.00 C
323 | ATOM 640 O GLY A 42 -6.501 9.069 -0.026 1.00 0.00 O
324 | ATOM 644 N PHE A 43 -5.795 9.567 -2.103 1.00 0.00 N
325 | ATOM 645 CA PHE A 43 -6.545 10.807 -2.121 1.00 0.00 C
326 | ATOM 646 C PHE A 43 -6.832 11.269 -3.536 1.00 0.00 C
327 | ATOM 647 O PHE A 43 -6.225 10.800 -4.506 1.00 0.00 O
328 | ATOM 648 CB PHE A 43 -5.825 11.898 -1.317 1.00 0.00 C
329 | ATOM 649 CG PHE A 43 -4.514 12.358 -1.809 1.00 0.00 C
330 | ATOM 650 CD1 PHE A 43 -4.430 13.484 -2.610 1.00 0.00 C
331 | ATOM 651 CD2 PHE A 43 -3.354 11.699 -1.457 1.00 0.00 C
332 | ATOM 652 CE1 PHE A 43 -3.210 13.949 -3.029 1.00 0.00 C
333 | ATOM 653 CE2 PHE A 43 -2.139 12.155 -1.883 1.00 0.00 C
334 | ATOM 654 CZ PHE A 43 -2.064 13.287 -2.667 1.00 0.00 C
335 | ATOM 664 N GLY A 44 -7.742 12.219 -3.642 1.00 0.00 N
336 | ATOM 665 CA GLY A 44 -8.098 12.781 -4.933 1.00 0.00 C
337 | ATOM 666 C GLY A 44 -8.795 14.115 -4.796 1.00 0.00 C
338 | ATOM 667 O GLY A 44 -8.947 14.654 -3.693 1.00 0.00 O
339 | ATOM 671 N PHE A 45 -9.214 14.649 -5.932 1.00 0.00 N
340 | ATOM 672 CA PHE A 45 -9.828 15.966 -5.944 1.00 0.00 C
341 | ATOM 673 C PHE A 45 -11.043 16.056 -6.845 1.00 0.00 C
342 | ATOM 674 O PHE A 45 -11.049 15.492 -7.949 1.00 0.00 O
343 | ATOM 675 CB PHE A 45 -8.805 16.968 -6.480 1.00 0.00 C
344 | ATOM 676 CG PHE A 45 -7.515 17.035 -5.711 1.00 0.00 C
345 | ATOM 677 CD1 PHE A 45 -6.482 16.140 -6.009 1.00 0.00 C
346 | ATOM 678 CD2 PHE A 45 -7.306 17.981 -4.733 1.00 0.00 C
347 | ATOM 679 CE1 PHE A 45 -5.291 16.184 -5.343 1.00 0.00 C
348 | ATOM 680 CE2 PHE A 45 -6.096 18.033 -4.069 1.00 0.00 C
349 | ATOM 681 CZ PHE A 45 -5.092 17.132 -4.372 1.00 0.00 C
350 | ATOM 691 N VAL A 46 -12.038 16.824 -6.403 1.00 0.00 N
351 | ATOM 692 CA VAL A 46 -13.174 17.164 -7.258 1.00 0.00 C
352 | ATOM 693 C VAL A 46 -13.446 18.664 -7.270 1.00 0.00 C
353 | ATOM 694 O VAL A 46 -13.779 19.262 -6.250 1.00 0.00 O
354 | ATOM 695 CB VAL A 46 -14.454 16.385 -6.847 1.00 0.00 C
355 | ATOM 696 CG1 VAL A 46 -15.619 16.830 -7.674 1.00 0.00 C
356 | ATOM 697 CG2 VAL A 46 -14.253 14.891 -7.087 1.00 0.00 C
357 | ATOM 707 N HIS A 47 -13.370 19.279 -8.441 1.00 0.00 N
358 | ATOM 708 CA HIS A 47 -13.638 20.697 -8.503 1.00 0.00 C
359 | ATOM 709 C HIS A 47 -15.002 20.873 -9.131 1.00 0.00 C
360 | ATOM 710 O HIS A 47 -15.209 20.557 -10.313 1.00 0.00 O
361 | ATOM 711 CB HIS A 47 -12.535 21.463 -9.250 1.00 0.00 C
362 | ATOM 712 CG HIS A 47 -12.660 22.967 -9.106 1.00 0.00 C
363 | ATOM 713 ND1 HIS A 47 -11.736 23.855 -9.621 1.00 0.00 N
364 | ATOM 714 CD2 HIS A 47 -13.586 23.717 -8.468 1.00 0.00 C
365 | ATOM 715 CE1 HIS A 47 -12.091 25.087 -9.282 1.00 0.00 C
366 | ATOM 716 NE2 HIS A 47 -13.211 25.017 -8.553 1.00 0.00 N
367 | ATOM 724 N PHE A 48 -15.935 21.330 -8.300 1.00 0.00 N
368 | ATOM 725 CA PHE A 48 -17.319 21.527 -8.658 1.00 0.00 C
369 | ATOM 726 C PHE A 48 -17.548 22.887 -9.278 1.00 0.00 C
370 | ATOM 727 O PHE A 48 -16.890 23.862 -8.924 1.00 0.00 O
371 | ATOM 728 CB PHE A 48 -18.214 21.414 -7.448 1.00 0.00 C
372 | ATOM 729 CG PHE A 48 -18.419 20.032 -6.899 1.00 0.00 C
373 | ATOM 730 CD1 PHE A 48 -17.718 19.577 -5.798 1.00 0.00 C
374 | ATOM 731 CD2 PHE A 48 -19.356 19.197 -7.470 1.00 0.00 C
375 | ATOM 732 CE1 PHE A 48 -17.977 18.329 -5.276 1.00 0.00 C
376 | ATOM 733 CE2 PHE A 48 -19.605 17.966 -6.957 1.00 0.00 C
377 | ATOM 734 CZ PHE A 48 -18.925 17.533 -5.855 1.00 0.00 C
378 | ATOM 744 N GLU A 49 -18.524 22.953 -10.176 1.00 0.00 N
379 | ATOM 745 CA GLU A 49 -18.910 24.221 -10.795 1.00 0.00 C
380 | ATOM 746 C GLU A 49 -19.373 25.262 -9.780 1.00 0.00 C
381 | ATOM 747 O GLU A 49 -19.100 26.453 -9.934 1.00 0.00 O
382 | ATOM 748 CB GLU A 49 -19.978 24.001 -11.860 1.00 0.00 C
383 | ATOM 749 CG GLU A 49 -20.380 25.282 -12.594 1.00 0.00 C
384 | ATOM 750 CD GLU A 49 -21.290 25.037 -13.742 1.00 0.00 C
385 | ATOM 751 OE1 GLU A 49 -21.524 23.902 -14.047 1.00 0.00 O
386 | ATOM 752 OE2 GLU A 49 -21.752 25.986 -14.324 1.00 0.00 O
387 | ATOM 759 N GLU A 50 -20.097 24.819 -8.752 1.00 0.00 N
388 | ATOM 760 CA GLU A 50 -20.618 25.717 -7.731 1.00 0.00 C
389 | ATOM 761 C GLU A 50 -20.099 25.346 -6.348 1.00 0.00 C
390 | ATOM 762 O GLU A 50 -19.999 24.162 -6.000 1.00 0.00 O
391 | ATOM 763 CB GLU A 50 -22.150 25.694 -7.721 1.00 0.00 C
392 | ATOM 764 CG GLU A 50 -22.806 26.198 -9.009 1.00 0.00 C
393 | ATOM 765 CD GLU A 50 -24.322 26.205 -8.935 1.00 0.00 C
394 | ATOM 766 OE1 GLU A 50 -24.845 25.747 -7.947 1.00 0.00 O
395 | ATOM 767 OE2 GLU A 50 -24.948 26.671 -9.861 1.00 0.00 O
396 | ATOM 774 N GLU A 51 -19.900 26.371 -5.513 1.00 0.00 N
397 | ATOM 775 CA GLU A 51 -19.410 26.173 -4.151 1.00 0.00 C
398 | ATOM 776 C GLU A 51 -20.411 25.364 -3.351 1.00 0.00 C
399 | ATOM 777 O GLU A 51 -20.023 24.564 -2.495 1.00 0.00 O
400 | ATOM 778 CB GLU A 51 -19.190 27.522 -3.464 1.00 0.00 C
401 | ATOM 779 CG GLU A 51 -18.103 28.374 -4.101 1.00 0.00 C
402 | ATOM 780 CD GLU A 51 -16.781 27.703 -4.163 1.00 0.00 C
403 | ATOM 781 OE1 GLU A 51 -16.315 27.196 -3.170 1.00 0.00 O
404 | ATOM 782 OE2 GLU A 51 -16.238 27.662 -5.247 1.00 0.00 O
405 | ATOM 789 N GLY A 52 -21.697 25.569 -3.643 1.00 0.00 N
406 | ATOM 790 CA GLY A 52 -22.781 24.866 -2.988 1.00 0.00 C
407 | ATOM 791 C GLY A 52 -22.695 23.356 -3.191 1.00 0.00 C
408 | ATOM 792 O GLY A 52 -23.171 22.596 -2.342 1.00 0.00 O
409 | ATOM 796 N ALA A 53 -22.167 22.908 -4.344 1.00 0.00 N
410 | ATOM 797 CA ALA A 53 -22.053 21.486 -4.609 1.00 0.00 C
411 | ATOM 798 C ALA A 53 -20.965 20.904 -3.742 1.00 0.00 C
412 | ATOM 799 O ALA A 53 -21.128 19.831 -3.151 1.00 0.00 O
413 | ATOM 800 CB ALA A 53 -21.743 21.260 -6.062 1.00 0.00 C
414 | ATOM 806 N ALA A 54 -19.846 21.628 -3.651 1.00 0.00 N
415 | ATOM 807 CA ALA A 54 -18.755 21.163 -2.813 1.00 0.00 C
416 | ATOM 808 C ALA A 54 -19.228 21.099 -1.366 1.00 0.00 C
417 | ATOM 809 O ALA A 54 -18.911 20.156 -0.638 1.00 0.00 O
418 | ATOM 810 CB ALA A 54 -17.543 22.064 -2.962 1.00 0.00 C
419 | ATOM 816 N LYS A 55 -20.026 22.091 -0.961 1.00 0.00 N
420 | ATOM 817 CA LYS A 55 -20.544 22.138 0.391 1.00 0.00 C
421 | ATOM 818 C LYS A 55 -21.406 20.914 0.670 1.00 0.00 C
422 | ATOM 819 O LYS A 55 -21.192 20.235 1.681 1.00 0.00 O
423 | ATOM 820 CB LYS A 55 -21.325 23.432 0.630 1.00 0.00 C
424 | ATOM 821 CG LYS A 55 -21.573 23.788 2.117 1.00 0.00 C
425 | ATOM 822 CD LYS A 55 -22.912 23.260 2.651 1.00 0.00 C
426 | ATOM 823 CE LYS A 55 -23.184 23.781 4.071 1.00 0.00 C
427 | ATOM 824 NZ LYS A 55 -24.448 23.229 4.647 1.00 0.00 N
428 | ATOM 838 N GLU A 56 -22.385 20.621 -0.204 1.00 0.00 N
429 | ATOM 839 CA GLU A 56 -23.239 19.464 0.042 1.00 0.00 C
430 | ATOM 840 C GLU A 56 -22.412 18.194 0.115 1.00 0.00 C
431 | ATOM 841 O GLU A 56 -22.657 17.344 0.977 1.00 0.00 O
432 | ATOM 842 CB GLU A 56 -24.325 19.286 -1.010 1.00 0.00 C
433 | ATOM 843 CG GLU A 56 -25.311 18.146 -0.650 1.00 0.00 C
434 | ATOM 844 CD GLU A 56 -26.417 17.965 -1.634 1.00 0.00 C
435 | ATOM 845 OE1 GLU A 56 -26.442 18.667 -2.606 1.00 0.00 O
436 | ATOM 846 OE2 GLU A 56 -27.234 17.105 -1.419 1.00 0.00 O
437 | ATOM 853 N ALA A 57 -21.407 18.063 -0.757 1.00 0.00 N
438 | ATOM 854 CA ALA A 57 -20.570 16.879 -0.746 1.00 0.00 C
439 | ATOM 855 C ALA A 57 -19.909 16.716 0.625 1.00 0.00 C
440 | ATOM 856 O ALA A 57 -19.759 15.591 1.101 1.00 0.00 O
441 | ATOM 857 CB ALA A 57 -19.538 16.952 -1.854 1.00 0.00 C
442 | ATOM 863 N ILE A 58 -19.530 17.818 1.296 1.00 0.00 N
443 | ATOM 864 CA ILE A 58 -18.942 17.688 2.629 1.00 0.00 C
444 | ATOM 865 C ILE A 58 -19.992 17.163 3.608 1.00 0.00 C
445 | ATOM 866 O ILE A 58 -19.756 16.195 4.342 1.00 0.00 O
446 | ATOM 867 CB ILE A 58 -18.424 19.033 3.227 1.00 0.00 C
447 | ATOM 868 CG1 ILE A 58 -17.283 19.674 2.397 1.00 0.00 C
448 | ATOM 869 CG2 ILE A 58 -17.911 18.770 4.690 1.00 0.00 C
449 | ATOM 870 CD1 ILE A 58 -15.990 18.948 2.374 1.00 0.00 C
450 | ATOM 882 N ASP A 59 -21.179 17.790 3.605 1.00 0.00 N
451 | ATOM 883 CA ASP A 59 -22.237 17.394 4.538 1.00 0.00 C
452 | ATOM 884 C ASP A 59 -22.648 15.937 4.375 1.00 0.00 C
453 | ATOM 885 O ASP A 59 -22.963 15.258 5.354 1.00 0.00 O
454 | ATOM 886 CB ASP A 59 -23.511 18.239 4.364 1.00 0.00 C
455 | ATOM 887 CG ASP A 59 -23.469 19.694 4.904 1.00 0.00 C
456 | ATOM 888 OD1 ASP A 59 -22.587 20.051 5.643 1.00 0.00 O
457 | ATOM 889 OD2 ASP A 59 -24.384 20.433 4.575 1.00 0.00 O
458 | ATOM 894 N ALA A 60 -22.659 15.468 3.132 1.00 0.00 N
459 | ATOM 895 CA ALA A 60 -23.054 14.107 2.826 1.00 0.00 C
460 | ATOM 896 C ALA A 60 -21.951 13.069 2.981 1.00 0.00 C
461 | ATOM 897 O ALA A 60 -22.196 11.994 3.527 1.00 0.00 O
462 | ATOM 898 CB ALA A 60 -23.531 14.054 1.400 1.00 0.00 C
463 | ATOM 904 N LEU A 61 -20.744 13.353 2.492 1.00 0.00 N
464 | ATOM 905 CA LEU A 61 -19.708 12.338 2.487 1.00 0.00 C
465 | ATOM 906 C LEU A 61 -18.746 12.330 3.651 1.00 0.00 C
466 | ATOM 907 O LEU A 61 -17.992 11.370 3.785 1.00 0.00 O
467 | ATOM 908 CB LEU A 61 -18.864 12.454 1.225 1.00 0.00 C
468 | ATOM 909 CG LEU A 61 -19.561 12.290 -0.119 1.00 0.00 C
469 | ATOM 910 CD1 LEU A 61 -18.555 12.528 -1.138 1.00 0.00 C
470 | ATOM 911 CD2 LEU A 61 -20.166 10.921 -0.270 1.00 0.00 C
471 | ATOM 923 N ASN A 62 -18.732 13.342 4.508 1.00 0.00 N
472 | ATOM 924 CA ASN A 62 -17.757 13.312 5.584 1.00 0.00 C
473 | ATOM 925 C ASN A 62 -18.153 12.246 6.601 1.00 0.00 C
474 | ATOM 926 O ASN A 62 -19.125 12.414 7.340 1.00 0.00 O
475 | ATOM 927 CB ASN A 62 -17.652 14.688 6.242 1.00 0.00 C
476 | ATOM 928 CG ASN A 62 -16.593 14.765 7.308 1.00 0.00 C
477 | ATOM 929 OD1 ASN A 62 -15.882 13.781 7.528 1.00 0.00 O
478 | ATOM 930 ND2 ASN A 62 -16.487 15.897 7.976 1.00 0.00 N
479 | ATOM 937 N GLY A 63 -17.421 11.125 6.602 1.00 0.00 N
480 | ATOM 938 CA GLY A 63 -17.717 9.981 7.458 1.00 0.00 C
481 | ATOM 939 C GLY A 63 -18.501 8.862 6.755 1.00 0.00 C
482 | ATOM 940 O GLY A 63 -18.898 7.879 7.386 1.00 0.00 O
483 | ATOM 944 N MET A 64 -18.732 9.006 5.455 1.00 0.00 N
484 | ATOM 945 CA MET A 64 -19.421 7.978 4.679 1.00 0.00 C
485 | ATOM 946 C MET A 64 -18.374 6.966 4.262 1.00 0.00 C
486 | ATOM 947 O MET A 64 -17.204 7.316 4.157 1.00 0.00 O
487 | ATOM 948 CB MET A 64 -20.124 8.562 3.443 1.00 0.00 C
488 | ATOM 949 CG MET A 64 -20.946 7.519 2.605 1.00 0.00 C
489 | ATOM 950 SD MET A 64 -21.829 8.206 1.188 1.00 0.00 S
490 | ATOM 951 CE MET A 64 -23.197 9.105 1.914 1.00 0.00 C
491 | ATOM 961 N LEU A 65 -18.745 5.710 4.060 1.00 0.00 N
492 | ATOM 962 CA LEU A 65 -17.758 4.761 3.559 1.00 0.00 C
493 | ATOM 963 C LEU A 65 -17.901 4.546 2.070 1.00 0.00 C
494 | ATOM 964 O LEU A 65 -19.010 4.372 1.552 1.00 0.00 O
495 | ATOM 965 CB LEU A 65 -17.863 3.403 4.268 1.00 0.00 C
496 | ATOM 966 CG LEU A 65 -17.088 3.231 5.607 1.00 0.00 C
497 | ATOM 967 CD1 LEU A 65 -17.606 4.186 6.691 1.00 0.00 C
498 | ATOM 968 CD2 LEU A 65 -17.196 1.790 6.043 1.00 0.00 C
499 | ATOM 980 N LEU A 66 -16.768 4.558 1.383 1.00 0.00 N
500 | ATOM 981 CA LEU A 66 -16.711 4.289 -0.042 1.00 0.00 C
501 | ATOM 982 C LEU A 66 -16.004 2.952 -0.152 1.00 0.00 C
502 | ATOM 983 O LEU A 66 -14.933 2.765 0.438 1.00 0.00 O
503 | ATOM 984 CB LEU A 66 -15.969 5.396 -0.810 1.00 0.00 C
504 | ATOM 985 CG LEU A 66 -16.501 6.869 -0.634 1.00 0.00 C
505 | ATOM 986 CD1 LEU A 66 -15.607 7.819 -1.460 1.00 0.00 C
506 | ATOM 987 CD2 LEU A 66 -17.964 6.988 -1.059 1.00 0.00 C
507 | ATOM 999 N ASN A 67 -16.580 2.010 -0.878 1.00 0.00 N
508 | ATOM 1000 CA ASN A 67 -16.026 0.664 -0.915 1.00 0.00 C
509 | ATOM 1001 C ASN A 67 -15.858 0.188 0.540 1.00 0.00 C
510 | ATOM 1002 O ASN A 67 -16.852 0.054 1.255 1.00 0.00 O
511 | ATOM 1003 CB ASN A 67 -14.723 0.580 -1.710 1.00 0.00 C
512 | ATOM 1004 CG ASN A 67 -14.894 0.927 -3.161 1.00 0.00 C
513 | ATOM 1005 OD1 ASN A 67 -16.025 0.971 -3.665 1.00 0.00 O
514 | ATOM 1006 ND2 ASN A 67 -13.802 1.120 -3.869 1.00 0.00 N
515 | ATOM 1013 N GLY A 68 -14.629 -0.082 0.977 1.00 0.00 N
516 | ATOM 1014 CA GLY A 68 -14.380 -0.569 2.332 1.00 0.00 C
517 | ATOM 1015 C GLY A 68 -13.747 0.448 3.294 1.00 0.00 C
518 | ATOM 1016 O GLY A 68 -13.262 0.056 4.359 1.00 0.00 O
519 | ATOM 1020 N GLN A 69 -13.685 1.734 2.925 1.00 0.00 N
520 | ATOM 1021 CA GLN A 69 -13.010 2.710 3.792 1.00 0.00 C
521 | ATOM 1022 C GLN A 69 -13.760 4.024 4.022 1.00 0.00 C
522 | ATOM 1023 O GLN A 69 -14.364 4.601 3.112 1.00 0.00 O
523 | ATOM 1024 CB GLN A 69 -11.631 3.030 3.199 1.00 0.00 C
524 | ATOM 1025 CG GLN A 69 -10.650 1.856 3.182 1.00 0.00 C
525 | ATOM 1026 CD GLN A 69 -9.327 2.211 2.535 1.00 0.00 C
526 | ATOM 1027 OE1 GLN A 69 -9.219 3.233 1.858 1.00 0.00 O
527 | ATOM 1028 NE2 GLN A 69 -8.321 1.369 2.735 1.00 0.00 N
528 | ATOM 1037 N GLU A 70 -13.690 4.517 5.264 1.00 0.00 N
529 | ATOM 1038 CA GLU A 70 -14.299 5.800 5.618 1.00 0.00 C
530 | ATOM 1039 C GLU A 70 -13.638 6.922 4.846 1.00 0.00 C
531 | ATOM 1040 O GLU A 70 -12.414 7.048 4.873 1.00 0.00 O
532 | ATOM 1041 CB GLU A 70 -14.178 6.039 7.120 1.00 0.00 C
533 | ATOM 1042 CG GLU A 70 -14.868 7.286 7.620 1.00 0.00 C
534 | ATOM 1043 CD GLU A 70 -14.755 7.443 9.107 1.00 0.00 C
535 | ATOM 1044 OE1 GLU A 70 -14.195 6.576 9.734 1.00 0.00 O
536 | ATOM 1045 OE2 GLU A 70 -15.218 8.431 9.619 1.00 0.00 O
537 | ATOM 1052 N ILE A 71 -14.430 7.770 4.198 1.00 0.00 N
538 | ATOM 1053 CA ILE A 71 -13.828 8.849 3.432 1.00 0.00 C
539 | ATOM 1054 C ILE A 71 -13.945 10.142 4.211 1.00 0.00 C
540 | ATOM 1055 O ILE A 71 -14.988 10.423 4.818 1.00 0.00 O
541 | ATOM 1056 CB ILE A 71 -14.501 9.011 2.017 1.00 0.00 C
542 | ATOM 1057 CG1 ILE A 71 -13.656 9.902 1.076 1.00 0.00 C
543 | ATOM 1058 CG2 ILE A 71 -15.911 9.656 2.090 1.00 0.00 C
544 | ATOM 1059 CD1 ILE A 71 -12.437 9.208 0.584 1.00 0.00 C
545 | ATOM 1071 N TYR A 72 -12.870 10.915 4.211 1.00 0.00 N
546 | ATOM 1072 CA TYR A 72 -12.886 12.216 4.824 1.00 0.00 C
547 | ATOM 1073 C TYR A 72 -12.804 13.276 3.757 1.00 0.00 C
548 | ATOM 1074 O TYR A 72 -11.948 13.244 2.872 1.00 0.00 O
549 | ATOM 1075 CB TYR A 72 -11.746 12.382 5.817 1.00 0.00 C
550 | ATOM 1076 CG TYR A 72 -11.642 13.797 6.310 1.00 0.00 C
551 | ATOM 1077 CD1 TYR A 72 -12.565 14.295 7.200 1.00 0.00 C
552 | ATOM 1078 CD2 TYR A 72 -10.617 14.606 5.852 1.00 0.00 C
553 | ATOM 1079 CE1 TYR A 72 -12.471 15.598 7.625 1.00 0.00 C
554 | ATOM 1080 CE2 TYR A 72 -10.518 15.892 6.284 1.00 0.00 C
555 | ATOM 1081 CZ TYR A 72 -11.443 16.398 7.165 1.00 0.00 C
556 | ATOM 1082 OH TYR A 72 -11.344 17.701 7.596 1.00 0.00 O
557 | ATOM 1092 N VAL A 73 -13.716 14.213 3.808 1.00 0.00 N
558 | ATOM 1093 CA VAL A 73 -13.696 15.265 2.822 1.00 0.00 C
559 | ATOM 1094 C VAL A 73 -13.552 16.623 3.483 1.00 0.00 C
560 | ATOM 1095 O VAL A 73 -14.253 16.935 4.448 1.00 0.00 O
561 | ATOM 1096 CB VAL A 73 -14.928 15.169 1.908 1.00 0.00 C
562 | ATOM 1097 CG1 VAL A 73 -14.859 13.907 1.018 1.00 0.00 C
563 | ATOM 1098 CG2 VAL A 73 -16.176 15.090 2.736 1.00 0.00 C
564 | ATOM 1108 N ALA A 74 -12.641 17.427 2.942 1.00 0.00 N
565 | ATOM 1109 CA ALA A 74 -12.378 18.749 3.493 1.00 0.00 C
566 | ATOM 1110 C ALA A 74 -12.659 19.872 2.485 1.00 0.00 C
567 | ATOM 1111 O ALA A 74 -12.449 19.693 1.279 1.00 0.00 O
568 | ATOM 1112 CB ALA A 74 -10.933 18.846 3.937 1.00 0.00 C
569 | ATOM 1118 N PRO A 75 -13.125 21.048 2.953 1.00 0.00 N
570 | ATOM 1119 CA PRO A 75 -13.317 22.256 2.184 1.00 0.00 C
571 | ATOM 1120 C PRO A 75 -12.095 23.155 2.331 1.00 0.00 C
572 | ATOM 1121 O PRO A 75 -11.079 22.927 1.682 1.00 0.00 O
573 | ATOM 1122 OXT PRO A 75 -12.009 23.815 3.365 1.00 0.00 O
574 | ATOM 1123 CB PRO A 75 -14.530 22.885 2.865 1.00 0.00 C
575 | ATOM 1124 CG PRO A 75 -14.374 22.496 4.336 1.00 0.00 C
576 | ATOM 1125 CD PRO A 75 -13.655 21.153 4.342 1.00 0.00 C
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | setuptools
2 | numpy>=1.23.2
3 | networkx>=2.6.3
4 | scipy>=1.9.1
5 | biopandas>=0.2.7
6 | torch>=1.11.0
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = metl-pretrained
3 | version = 0.1
4 | description = Mutational effect transfer learning pretrained models
5 | url = https://github.com/gitter-lab/metl-pretrained
6 | author = Sam Gelman
7 | author_email = sgelman2@wisc.edu
8 | license = MIT
9 |
10 | [options]
11 | packages = find:
12 | install_requires =
13 | torch
14 | numpy
15 | scipy
16 | biopandas
17 | networkx
18 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 |
3 | # All package metadata and install requirements are declared in setup.cfg;
4 | # this stub only invokes setuptools.
5 | if __name__ == '__main__':
6 |     setup()
7 |
--------------------------------------------------------------------------------