├── .github └── workflows │ ├── compile_huggingface.yml │ └── test.yml ├── .gitignore ├── LICENSE ├── README.md ├── huggingface ├── README.md ├── combine_files.py ├── huggingface_code.py ├── huggingface_wrapper.py ├── print_colab_dropdown.py ├── push_to_hub.py └── requirements.txt ├── metl ├── __init__.py ├── encode.py ├── main.py ├── models.py ├── relative_attention.py ├── structure.py ├── test.py ├── test2.py ├── test3.py └── test4.py ├── notebooks └── inference.ipynb ├── pdbs ├── 1gfl_cm.pdb ├── 2qmt_p.pdb ├── 6qji_p_trunc_2022.pdb ├── AF-P60484-F1-model_v4_p.pdb ├── AF-P62993-F1-model_v4_trunc_p.pdb ├── AF-Q6SJ61-F1-model_v4_p.pdb ├── pab1_cm.pdb └── ube4b_cm.pdb ├── requirements.txt ├── setup.cfg └── setup.py /.github/workflows/compile_huggingface.yml: -------------------------------------------------------------------------------- 1 | name: Compiling Huggingface Wrapper 2 | on: [push, workflow_dispatch] 3 | jobs: 4 | Combine-File: 5 | runs-on: ubuntu-latest 6 | env: 7 | HF_TOKEN: ${{ secrets.HF_TOKEN }} 8 | steps: 9 | - uses: actions/checkout@v4 10 | with: 11 | ref: 'main' 12 | - name: Set up Python 13 | uses: actions/setup-python@v5 14 | with: 15 | python-version: '3.9' 16 | - name: Upgrade pip 17 | run: pip install --upgrade pip 18 | - name: Install dependencies 19 | run: pip install -r huggingface/requirements.txt 20 | - name: Install torch cpu only 21 | run: pip install torch==2.1.0+cpu --index-url https://download.pytorch.org/whl/cpu 22 | - name: Combining Files 23 | run: python huggingface/combine_files.py -o huggingface/huggingface_wrapper.py 24 | - name: Formatting generated code 25 | run: | 26 | python -m black huggingface/huggingface_wrapper.py 27 | python -m isort huggingface/huggingface_wrapper.py 28 | - name: Push to hub 29 | run: python huggingface/push_to_hub.py 30 | 31 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | on: 3 | - push 4 | - pull_request 5 | jobs: 6 | 7 | test: 8 | name: Test pre-trained models 9 | runs-on: ${{ matrix.os }} 10 | strategy: 11 | max-parallel: 4 12 | fail-fast: false 13 | matrix: 14 | os: 15 | - ubuntu-latest 16 | - windows-latest 17 | - macos-latest 18 | python-version: 19 | - '3.9' 20 | - '3.12' 21 | 22 | steps: 23 | - name: Checkout repository 24 | uses: actions/checkout@v4 25 | - name: Install Python 26 | uses: actions/setup-python@v5 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | cache: 'pip' # caching pip dependencies 30 | - name: Install dependencies 31 | run: | 32 | python -m pip install --upgrade pip 33 | pip install -r requirements.txt 34 | pip install . 
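        # list the installed packages to help debug dependency issues in CI runs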
35 | pip list 36 | - name: Test METL-G 37 | run: python metl/test.py 38 | - name: Test 1D low-N METL-L avGFP 39 | run: python metl/test2.py 40 | - name: Test 3D low-N METL-L avGFP 41 | run: python metl/test3.py 42 | - name: Test METL-L GB1 43 | run: python metl/test4.py 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # PyCharm project settings 132 | .idea 133 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Sam Gelman 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Pretrained METL models 2 | [![GitHub Actions](https://github.com/gitter-lab/metl-pretrained/actions/workflows/test.yml/badge.svg)](https://github.com/gitter-lab/metl-pretrained/actions/workflows/test.yml) 3 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.10819499.svg)](https://zenodo.org/doi/10.5281/zenodo.10819499) 4 | 5 | This repository contains pretrained METL [models](https://zenodo.org/doi/10.5281/zenodo.11051644) with minimal dependencies. 6 | For more information, please see the [metl](https://github.com/gitter-lab/metl) repository and our manuscript: 7 | 8 | [Biophysics-based protein language models for protein engineering](https://doi.org/10.1101/2024.03.15.585128). 9 | Sam Gelman, Bryce Johnson, Chase Freschlin, Arnav Sharma, Sameer D'Costa, John Peters, Anthony Gitter+, Philip A Romero+. 10 | *bioRxiv*, 2024. doi:10.1101/2024.03.15.585128 11 | + denotes equal contribution. 12 | 13 | # Getting started 14 | 1. Create a conda environment (or use existing one): `conda create --name myenv python=3.9` 15 | 2. Activate conda environment `conda activate myenv` 16 | 3. Clone this repository 17 | 4. Navigate to the cloned repository `cd metl-pretrained` 18 | 5. Install the package with `pip install .` 19 | 6. Import the package in your script with `import metl` 20 | 7. 
Load a pretrained model using `model, data_encoder = metl.get_from_uuid(uuid)` or one of the other loading functions (see examples below)
21 |    - `model` is a PyTorch model loaded with the pretrained weights
22 |    - `data_encoder` is a helper object that can be used to encode sequences and variants to be fed into the model
23 | 
24 | # Available models
25 | Model checkpoints are available to download from [Zenodo](https://zenodo.org/doi/10.5281/zenodo.11051644).
26 | Once you have a checkpoint downloaded, you can load it into a PyTorch model using `metl.get_from_checkpoint()`.
27 | Alternatively, you can use `metl.get_from_uuid()` or `metl.get_from_ident()` to automatically download, cache, and load the model based on its UUID or identifier.
28 | See the examples below.
29 | 
30 | ## Source models
31 | Source models predict Rosetta energy terms.
32 | 
33 | ### Global source models
34 | 
35 | | Identifier      | UUID       | Params | RPE | Output           | Description | Download |
36 | |-----------------|------------|--------|-----|------------------|-------------|----------|
37 | | `METL-G-20M-1D` | `D72M9aEp` | 20M    | 1D  | Rosetta energies | METL-G      | [Download](https://zenodo.org/records/14908509/files/METL-G-20M-1D-D72M9aEp.pt?download=1)  |
38 | | `METL-G-20M-3D` | `Nr9zCKpR` | 20M    | 3D  | Rosetta energies | METL-G      | [Download](https://zenodo.org/records/14908509/files/METL-G-20M-3D-Nr9zCKpR.pt?download=1)  |
39 | | `METL-G-50M-1D` | `auKdzzwX` | 50M    | 1D  | Rosetta energies | METL-G      | [Download](https://zenodo.org/records/14908509/files/METL-G-50M-1D-auKdzzwX.pt?download=1)  |
40 | | `METL-G-50M-3D` | `6PSAzdfv` | 50M    | 3D  | Rosetta energies | METL-G      | [Download](https://zenodo.org/records/14908509/files/METL-G-50M-3D-6PSAzdfv.pt?download=1)  |
41 | 
42 | ### Local source models
43 | 
44 | | Identifier               | UUID       | Protein | Params | RPE | Output           | Description | Download |
45 | |--------------------------|------------|---------|--------|-----|------------------|-------------|----------|
46 | | `METL-L-2M-1D-GFP`       | `8gMPQJy4` | GFP  | 2M | 1D | Rosetta energies | METL-L | [Download](https://zenodo.org/records/14908509/files/METL-L-2M-1D-GFP-8gMPQJy4.pt?download=1)       |
47 | | `METL-L-2M-3D-GFP`       | `Hr4GNHws` | GFP  | 2M | 3D | Rosetta energies | METL-L | [Download](https://zenodo.org/records/14908509/files/METL-L-2M-3D-GFP-Hr4GNHws.pt?download=1)       |
48 | | `METL-L-2M-1D-DLG4_2022` | `8iFoiYw2` | DLG4 | 2M | 1D | Rosetta energies | METL-L | [Download](https://zenodo.org/records/14908509/files/METL-L-2M-1D-DLG4_2022-8iFoiYw2.pt?download=1) |
49 | | `METL-L-2M-3D-DLG4_2022` | `kt5DdWTa` | DLG4 | 2M | 3D | Rosetta energies | METL-L | [Download](https://zenodo.org/records/14908509/files/METL-L-2M-3D-DLG4_2022-kt5DdWTa.pt?download=1) |
50 | | `METL-L-2M-1D-GB1`       | `DMfkjVzT` | GB1  | 2M | 1D | Rosetta energies | METL-L | [Download](https://zenodo.org/records/14908509/files/METL-L-2M-1D-GB1-DMfkjVzT.pt?download=1)       |
51 | | `METL-L-2M-3D-GB1`       | `epegcFiH` | GB1  | 2M | 3D | Rosetta energies | METL-L | [Download](https://zenodo.org/records/14908509/files/METL-L-2M-3D-GB1-epegcFiH.pt?download=1)       |
52 | | `METL-L-2M-1D-GRB2`      | `kS3rUS7h` | GRB2 | 2M | 1D | Rosetta energies | METL-L | [Download](https://zenodo.org/records/14908509/files/METL-L-2M-1D-GRB2-kS3rUS7h.pt?download=1)      |
53 | | `METL-L-2M-3D-GRB2`      | `X7w83g6S` | GRB2 | 2M | 3D | Rosetta energies | METL-L | 
[Download](https://zenodo.org/records/14908509/files/METL-L-2M-3D-GRB2-X7w83g6S.pt?download=1) | 54 | | `METL-L-2M-1D-Pab1` | `UKebCQGz` | Pab1 | 2M | 1D | Rosetta energies | METL-L | [Download](https://zenodo.org/records/14908509/files/METL-L-2M-1D-Pab1-UKebCQGz.pt?download=1) | 55 | | `METL-L-2M-3D-Pab1` | `2rr8V4th` | Pab1 | 2M | 3D | Rosetta energies | METL-L | [Download](https://zenodo.org/records/14908509/files/METL-L-2M-3D-Pab1-2rr8V4th.pt?download=1) | 56 | | `METL-L-2M-1D-PTEN` | `CEMSx7ZC` | PTEN | 2M | 1D | Rosetta energies | METL-L | [Download](https://zenodo.org/records/14908509/files/METL-L-2M-1D-PTEN-CEMSx7ZC.pt?download=1) | 57 | | `METL-L-2M-3D-PTEN` | `PjxR5LW7` | PTEN | 2M | 3D | Rosetta energies | METL-L | [Download](https://zenodo.org/records/14908509/files/METL-L-2M-3D-PTEN-PjxR5LW7.pt?download=1) | 58 | | `METL-L-2M-1D-TEM-1` | `PREhfC22` | TEM-1 | 2M | 1D | Rosetta energies | METL-L | [Download](https://zenodo.org/records/14908509/files/METL-L-2M-1D-TEM-1-PREhfC22.pt?download=1) | 59 | | `METL-L-2M-3D-TEM-1` | `9ASvszux` | TEM-1 | 2M | 3D | Rosetta energies | METL-L | [Download](https://zenodo.org/records/14908509/files/METL-L-2M-3D-TEM-1-9ASvszux.pt?download=1) | 60 | | `METL-L-2M-1D-Ube4b` | `HscFFkAb` | Ube4b | 2M | 1D | Rosetta energies | METL-L | [Download](https://zenodo.org/records/14908509/files/METL-L-2M-1D-Ube4b-HscFFkAb.pt?download=1) | 61 | | `METL-L-2M-3D-Ube4b` | `H48oiNZN` | Ube4b | 2M | 3D | Rosetta energies | METL-L | [Download](https://zenodo.org/records/14908509/files/METL-L-2M-3D-Ube4b-H48oiNZN.pt?download=1) | 62 | 63 | 64 | 65 | These models will output a length 55 vector corresponding to the following energy terms (in order): 66 |
<details>
67 | <summary>
68 | Expand to see energy terms
69 | </summary>
70 | 
71 | ```
72 | total_score
73 | fa_atr
74 | fa_dun
75 | fa_elec
76 | fa_intra_rep
77 | fa_intra_sol_xover4
78 | fa_rep
79 | fa_sol
80 | hbond_bb_sc
81 | hbond_lr_bb
82 | hbond_sc
83 | hbond_sr_bb
84 | lk_ball_wtd
85 | omega
86 | p_aa_pp
87 | pro_close
88 | rama_prepro
89 | ref
90 | yhh_planarity
91 | buried_all
92 | buried_np
93 | contact_all
94 | contact_buried_core
95 | contact_buried_core_boundary
96 | degree
97 | degree_core
98 | degree_core_boundary
99 | exposed_hydrophobics
100 | exposed_np_AFIMLWVY
101 | exposed_polars
102 | exposed_total
103 | one_core_each
104 | pack
105 | res_count_buried_core
106 | res_count_buried_core_boundary
107 | res_count_buried_np_core
108 | res_count_buried_np_core_boundary
109 | ss_contributes_core
110 | ss_mis
111 | total_hydrophobic
112 | total_hydrophobic_AFILMVWY
113 | total_sasa
114 | two_core_each
115 | unsat_hbond
116 | centroid_total_score
117 | cbeta
118 | cenpack
119 | env
120 | hs_pair
121 | pair
122 | rg
123 | rsigma
124 | sheet
125 | ss_pair
126 | vdw
127 | ```
128 | </details>
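For convenience, the output vector can be zipped with these term names. Below is a minimal sketch of this, using the loading functions demonstrated in the Examples section further down; the truncated `names` list is a stand-in for the full ordered list of 55 names above:

```python
import metl
import torch

# load a source model (see the Examples section below for details)
model, data_encoder = metl.get_from_ident("metl-g-20m-1d")
model.eval()

encoded = data_encoder.encode_sequences(["SMART"])
with torch.no_grad():
    predictions = model(torch.tensor(encoded))

# pair each output with its energy term name (first few shown here;
# substitute the full ordered list of 55 names from above)
names = ["total_score", "fa_atr", "fa_dun", "fa_elec", "fa_intra_rep"]
for name, value in zip(names, predictions[0, :len(names)].tolist()):
    print(f"{name}: {value:.3f}")
```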
129 | 130 | 131 | ### Function-specific source models for GB1 132 | 133 | The GB1 experimental data measured the binding interaction between GB1 variants and Immunoglobulin G (IgG). 134 | To match this experimentally characterized function, we implemented a Rosetta pipeline to model the GB1-IgG complex and compute 17 attributes related to energy changes upon binding. 135 | We pretrained a standard METL-Local model and a modified METL-Bind model, which additionally incorporates the IgG binding attributes into its pretraining tasks. 136 | 137 | | Identifier | UUID | Protein | Params | RPE | Output | Description | Download | 138 | |--------------------------------|------------|---------|--------|-----|-------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------| 139 | | `METL-BIND-2M-3D-GB1-STANDARD` | `K6mw24Rg` | GB1 | 2M | 3D | Standard Rosetta energies | Trained for the function-specific synthetic data experiment, but only trained on the standard energy terms, to use as a baseline. Should perform similarly to `METL-L-2M-3D-GB1`. | [Download](https://zenodo.org/records/14908509/files/METL-BIND-2M-3D-GB1-STANDARD-K6mw24Rg.pt?download=1) | 140 | | `METL-BIND-2M-3D-GB1-BINDING` | `Bo5wn2SG` | GB1 | 2M | 3D | Standard + binding Rosetta energies | Trained on both the standard energy terms and the binding-specific energy terms. | [Download](https://zenodo.org/records/14908509/files/METL-BIND-2M-3D-GB1-BINDING-Bo5wn2SG.pt?download=1) | 141 | 142 | 143 | `METL-BIND-2M-3D-GB1-BINDING` predicts the standard energy terms listed above as well as the following binding energy terms (in order): 144 | 145 |
<details>
146 | <summary>
147 | Expand to see binding energy terms
148 | </summary>
149 | 
150 | ```
151 | complex_normalized
152 | dG_cross
153 | dG_cross/dSASAx100
154 | dG_separated
155 | dG_separated/dSASAx100
156 | dSASA_hphobic
157 | dSASA_int
158 | dSASA_polar
159 | delta_unsatHbonds
160 | hbond_E_fraction
161 | hbonds_int
162 | nres_int
163 | per_residue_energy_int
164 | side1_normalized
165 | side1_score
166 | side2_normalized
167 | side2_score
168 | ```
169 | </details>
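Below is a hedged sketch of using the binding model, assuming `gb1_wt` holds the GB1 wild-type sequence (left as a placeholder here) and `A24Y` is a hypothetical variant; since this is a 3D RPE model, the GB1 structure from the [pdbs](pdbs) directory is passed via `pdb_fn`:

```python
import metl
import torch

model, data_encoder = metl.get_from_ident("metl-bind-2m-3d-gb1-binding")
model.eval()

gb1_wt = "..."  # placeholder: substitute the actual GB1 wild-type sequence
encoded = data_encoder.encode_variants(gb1_wt, ["A24Y"])  # hypothetical variant

with torch.no_grad():
    # 3D RPE models need the base protein structure (see the PDB table further below)
    predictions = model(torch.tensor(encoded), pdb_fn="pdbs/2qmt_p.pdb")

# expecting the 55 standard terms followed by the 17 binding terms listed above
print(predictions.shape)
```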
170 | 171 | ## Target models 172 | Target models are fine-tuned source models that predict functional scores from experimental sequence-function data. 173 | 174 | ### Global target models 175 | 176 | These models were trained using 80% of the experimental sequence-function data as training data. 177 | 178 | | DMS Dataset | Identifier | UUID | RPE | Output | Description | Download | 179 | |----------------|------------|-------------|-----|------------------|-----------------------------------------------------|-------------------------------------------------------------------------------------------------------------------| 180 | | GFP | `None` | `PeT2D92j` | 1D | Functional score | METL-Global finetuned on the GFP dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-GFP-PeT2D92j.pt?download=1) | 181 | | GFP | `None` | `6JBzHpkQ` | 3D | Functional score | METL-Global finetuned on the GFP dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-GFP-6JBzHpkQ.pt?download=1) | 182 | | DLG4-Abundance | `None` | `4Rh3WCbG` | 1D | Functional score | METL-Global finetuned on the DLG4-Abundance dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-DLG4_2022-ABUNDANCE-4Rh3WCbG.pt?download=1) | 183 | | DLG4-Abundance | `None` | `RBtqxzvu` | 3D | Functional score | METL-Global finetuned on the DLG4-Abundance dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-DLG4_2022-ABUNDANCE-RBtqxzvu.pt?download=1) | 184 | | DLG4-Binding | `None` | `4xbuC5y7` | 1D | Functional score | METL-Global finetuned on the DLG4-Binding dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-DLG4_2022-BINDING-4xbuC5y7.pt?download=1) | 185 | | DLG4-Binding | `None` | `BuvxgE2x` | 3D | Functional score | METL-Global finetuned on the DLG4-Binding dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-DLG4_2022-BINDING-BuvxgE2x.pt?download=1) | 186 | | GB1 | `None` | `dAndZfJ4` | 1D | Functional score | METL-Global finetuned on the GB1 dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-GB1-dAndZfJ4.pt?download=1) | 187 | | GB1 | `None` | `9vSB3DRM` | 3D | Functional score | METL-Global finetuned on the GB1 dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-GB1-9vSB3DRM.pt?download=1) | 188 | | GRB2-Abundance | `None` | `HenDpDWe` | 1D | Functional score | METL-Global finetuned on the GRB2-Abundance dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-GRB2-ABUNDANCE-HenDpDWe.pt?download=1) | 189 | | GRB2-Abundance | `None` | `dDoCCvfr` | 3D | Functional score | METL-Global finetuned on the GRB2-Abundance dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-GRB2-ABUNDANCE-dDoCCvfr.pt?download=1) | 190 | | GRB2-Binding | `None` | `cvnycE5Q` | 1D | Functional score | METL-Global finetuned on the GRB2-Binding dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-GRB2-BINDING-cvnycE5Q.pt?download=1) | 191 | | GRB2-Binding | `None` | `jYesS9Ki` | 3D | Functional score | METL-Global finetuned on the GRB2-Binding dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-GRB2-BINDING-jYesS9Ki.pt?download=1) | 192 | | Pab1 | `None` | `ho54gxzv` | 1D | Functional score | METL-Global finetuned on the Pab1 dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-Pab1-ho54gxzv.pt?download=1) | 193 | | Pab1 | 
`None` | `jhbL2FeB` | 3D | Functional score | METL-Global finetuned on the Pab1 dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-Pab1-jhbL2FeB.pt?download=1) | 194 | | PTEN-Abundance | `None` | `UEuMtmfx` | 1D | Functional score | METL-Global finetuned on the PTEN-Abundance dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-PTEN-ABUNDANCE-UEuMtmfx.pt?download=1) | 195 | | PTEN-Abundance | `None` | `eJPPQYEW` | 3D | Functional score | METL-Global finetuned on the PTEN-Abundance dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-PTEN-ABUNDANCE-eJPPQYEW.pt?download=1) | 196 | | PTEN-Activity | `None` | `U3X8mSeT` | 1D | Functional score | METL-Global finetuned on the PTEN-Activity dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-PTEN-ACTIVITY-U3X8mSeT.pt?download=1) | 197 | | PTEN-Activity | `None` | `4gqYnW6V` | 3D | Functional score | METL-Global finetuned on the PTEN-Activity dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-PTEN-ACTIVITY-4gqYnW6V.pt?download=1) | 198 | | TEM-1 | `None` | `ELL4GGQq` | 1D | Functional score | METL-Global finetuned on the TEM-1 dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-TEM-1-ELL4GGQq.pt?download=1) | 199 | | TEM-1 | `None` | `K6BjsWXm` | 3D | Functional score | METL-Global finetuned on the TEM-1 dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-TEM-1-K6BjsWXm.pt?download=1) | 200 | | Ube4b | `None` | `BAWw23vW` | 1D | Functional score | METL-Global finetuned on the Ube4b dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-Ube4b-BAWw23vW.pt?download=1) | 201 | | Ube4b | `None` | `G9piq6WH` | 3D | Functional score | METL-Global finetuned on the Ube4b dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-Ube4b-G9piq6WH.pt?download=1) | 202 | 203 | ### Local target models 204 | 205 | These models were trained using 80% of the experimental sequence-function data as training data. 
206 | 207 | | DMS Dataset | Identifier | UUID | RPE | Output | Description | Download | 208 | |----------------|------------|----------|-----|------------------|----------------------------------------------------|---------------------------------------------------------------------------------------------------------------| 209 | | GFP | `None` | `HaUuRwfE` | 1D | Functional score | METL-Local finetuned on the GFP dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-1D-GFP-HaUuRwfE.pt?download=1) | 210 | | GFP | `None` | `LWEY95Yb` | 3D | Functional score | METL-Local finetuned on the GFP dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-3D-GFP-LWEY95Yb.pt?download=1) | 211 | | DLG4-Abundance | `None` | `RMFA6dnX` | 1D | Functional score | METL-Local finetuned on the DLG4-Abundance dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-1D-DLG4_2022-ABUNDANCE-RMFA6dnX.pt?download=1) | 212 | | DLG4-Abundance | `None` | `V3uTtXVe` | 3D | Functional score | METL-Local finetuned on the DLG4-Abundance dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-3D-DLG4_2022-ABUNDANCE-V3uTtXVe.pt?download=1) | 213 | | DLG4-Binding | `None` | `YdzBYWHs` | 1D | Functional score | METL-Local finetuned on the DLG4-Binding dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-1D-DLG4_2022-BINDING-YdzBYWHs.pt?download=1) | 214 | | DLG4-Binding | `None` | `iu6ZahPw` | 3D | Functional score | METL-Local finetuned on the DLG4-Binding dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-3D-DLG4_2022-BINDING-iu6ZahPw.pt?download=1) | 215 | | GB1 | `None` | `Pgcseywk` | 1D | Functional score | METL-Local finetuned on the GB1 dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-1D-GB1-Pgcseywk.pt?download=1) | 216 | | GB1 | `None` | `UvMMdsq4` | 3D | Functional score | METL-Local finetuned on the GB1 dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-3D-GB1-UvMMdsq4.pt?download=1) | 217 | | GRB2-Abundance | `None` | `VNpi9Zjt` | 1D | Functional score | METL-Local finetuned on the GRB2-Abundance dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-1D-GRB2-ABUNDANCE-VNpi9Zjt.pt?download=1) | 218 | | GRB2-Abundance | `None` | `PqBMjXkA` | 3D | Functional score | METL-Local finetuned on the GRB2-Abundance dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-3D-GRB2-ABUNDANCE-PqBMjXkA.pt?download=1) | 219 | | GRB2-Binding | `None` | `Z59BhUaE` | 1D | Functional score | METL-Local finetuned on the GRB2-Binding dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-1D-GRB2-BINDING-Z59BhUaE.pt?download=1) | 220 | | GRB2-Binding | `None` | `VwcRN6UB` | 3D | Functional score | METL-Local finetuned on the GRB2-Binding dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-3D-GRB2-BINDING-VwcRN6UB.pt?download=1) | 221 | | Pab1 | `None` | `TdjCzoQQ` | 1D | Functional score | METL-Local finetuned on the Pab1 dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-1D-Pab1-TdjCzoQQ.pt?download=1) | 222 | | Pab1 | `None` | `5SjoLx3y` | 3D | Functional score | METL-Local finetuned on the Pab1 dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-3D-Pab1-5SjoLx3y.pt?download=1) | 223 | | PTEN-Abundance | `None` | `oUScGeHo` | 1D | Functional score | METL-Local finetuned on the PTEN-Abundance dataset | 
[Download](https://zenodo.org/records/14908509/files/FT-METL-L-2M-1D-PTEN-ABUNDANCE-oUScGeHo.pt?download=1) | 224 | | PTEN-Abundance | `None` | `DhuasDEr` | 3D | Functional score | METL-Local finetuned on the PTEN-Abundance dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-2M-3D-PTEN-ABUNDANCE-DhuasDEr.pt?download=1) | 225 | | PTEN-Activity | `None` | `m9UsG7dq` | 1D | Functional score | METL-Local finetuned on the PTEN-Activity dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-2M-1D-PTEN-ACTIVITY-m9UsG7dq.pt?download=1) | 226 | | PTEN-Activity | `None` | `8Vi7ENcC` | 3D | Functional score | METL-Local finetuned on the PTEN-Activity dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-2M-3D-PTEN-ACTIVITY-8Vi7ENcC.pt?download=1) | 227 | | TEM-1 | `None` | `64ncFxBR` | 1D | Functional score | METL-Local finetuned on the TEM-1 dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-1D-TEM-1-64ncFxBR.pt?download=1) | 228 | | TEM-1 | `None` | `PncvgiJU` | 3D | Functional score | METL-Local finetuned on the TEM-1 dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-3D-TEM-1-PncvgiJU.pt?download=1) | 229 | | Ube4b | `None` | `e9uhhnAv` | 1D | Functional score | METL-Local finetuned on the Ube4b dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-1D-Ube4b-e9uhhnAv.pt?download=1) | 230 | | Ube4b | `None` | `NfbZL7jK` | 3D | Functional score | METL-Local finetuned on the Ube4b dataset | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-3D-Ube4b-NfbZL7jK.pt?download=1) | 231 | 232 | 233 | ### GFP design experiment target models 234 | 235 | | DMS Dataset | Identifier | UUID | RPE | Output | Description | Download | 236 | |:------------|------------|------------|-----|------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------| 237 | | GFP | `None` | `YoQkzoLD` | 1D | Functional score | The `METL-L-2M-1D-GFP` model, fine-tuned on 64 examples from the GFP DMS dataset. This model was used for the GFP design experiment described in the manuscript. | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-2M-1D-GFP-YoQkzoLD.pt?download=1) | 238 | | GFP | `None` | `PEkeRuxb` | 3D | Functional score | The `METL-L-2M-3D-GFP` model, fine-tuned on 64 examples from the GFP DMS dataset. This model was used for the GFP design experiment described in the manuscript. | [Download](https://zenodo.org/records/14908509/files/FT-METL-L-2M-3D-GFP-PEkeRuxb.pt?download=1) | 239 | 240 | 241 | # 3D Relative Position Embeddings 242 | 243 | METL uses relative position embeddings (RPEs) based on 3D protein structure. 244 | The implementation of relative position embeddings is similar to the original paper by [Shaw et al](https://aclanthology.org/N18-2074/). 245 | However, instead of using the default 1D sequence-based distances, we calculate relative distances based on a graph of the 3D protein structure. 246 | These 3D RPEs enable the transformer to use 3D distances between amino acid residues as the positional signal when calculating attention. 247 | When using 3D RPEs, the model requires a protein structure in the form of a PDB file, corresponding to the wild-type protein or base protein of the input variant sequence. 
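To make the graph-based distances concrete, here is an illustrative sketch of the general idea, not the implementation this repository uses (see [metl/structure.py](metl/structure.py)); the C-alpha contact graph and the 8 Å cutoff are assumptions for illustration only:

```python
import networkx as nx
import numpy as np
from biopandas.pdb import PandasPdb

def residue_hop_distances(pdb_fn: str, threshold: float = 8.0):
    """Hop counts between residues in a structure-based contact graph."""
    atoms = PandasPdb().read_pdb(pdb_fn).df["ATOM"]
    ca = atoms[atoms["atom_name"] == "CA"]
    coords = ca[["x_coord", "y_coord", "z_coord"]].to_numpy()

    # connect residues whose C-alpha atoms are within the distance threshold
    graph = nx.Graph()
    graph.add_nodes_from(range(len(coords)))
    for i in range(len(coords)):
        for j in range(i + 1, len(coords)):
            if np.linalg.norm(coords[i] - coords[j]) <= threshold:
                graph.add_edge(i, j)

    # graph hop counts replace |i - j| sequence offsets as relative positions
    return dict(nx.all_pairs_shortest_path_length(graph))
```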
248 | 
249 | Our testing showed that 3D RPEs improve performance for METL-Global models but do not make a difference for METL-Local models.
250 | We provide both 1D and 3D models in this repository. The 1D models do not require the PDB structure as an additional input.
251 | 
252 | The [pdbs](pdbs) directory contains PDB files corresponding to the experimental datasets we evaluated. These can be used with the 3D RPE models listed above.
253 | 
254 | | DMS Dataset    | PDB File |
255 | |----------------|-------------------------------------------------------------------------------|
256 | | GFP            | [`1gfl_cm.pdb`](pdbs/1gfl_cm.pdb) |
257 | | DLG4-Abundance | [`6qji_p_trunc_2022.pdb`](pdbs/6qji_p_trunc_2022.pdb) |
258 | | DLG4-Binding   | [`6qji_p_trunc_2022.pdb`](pdbs/6qji_p_trunc_2022.pdb) |
259 | | GB1            | [`2qmt_p.pdb`](pdbs/2qmt_p.pdb) |
260 | | GRB2-Abundance | [`AF-P62993-F1-model_v4_trunc_p.pdb`](pdbs/AF-P62993-F1-model_v4_trunc_p.pdb) |
261 | | GRB2-Binding   | [`AF-P62993-F1-model_v4_trunc_p.pdb`](pdbs/AF-P62993-F1-model_v4_trunc_p.pdb) |
262 | | Pab1           | [`pab1_cm.pdb`](pdbs/pab1_cm.pdb) |
263 | | PTEN-Abundance | [`AF-P60484-F1-model_v4_p.pdb`](pdbs/AF-P60484-F1-model_v4_p.pdb) |
264 | | PTEN-Activity  | [`AF-P60484-F1-model_v4_p.pdb`](pdbs/AF-P60484-F1-model_v4_p.pdb) |
265 | | TEM-1          | [`AF-Q6SJ61-F1-model_v4_p.pdb`](pdbs/AF-Q6SJ61-F1-model_v4_p.pdb) |
266 | | Ube4b          | [`ube4b_cm.pdb`](pdbs/ube4b_cm.pdb) |
267 | 
268 | # Examples
269 | 
270 | ## METL source model
271 | 
272 | METL source models are assigned identifiers that can be used to load the model with `metl.get_from_ident()`.
273 | 
274 | This example:
275 | - Automatically downloads and caches `METL-G-20M-1D` using `metl.get_from_ident("metl-g-20m-1d")`.
276 | - Encodes a pair of dummy amino acid sequences using `data_encoder.encode_sequences()`.
277 | - Runs the sequences through the model and prints the predicted Rosetta energies.
278 | 
279 | _Todo: show how to extract the METL representation at different layers of the network_
280 | 
281 | ```python
282 | import metl
283 | import torch
284 | 
285 | model, data_encoder = metl.get_from_ident("metl-g-20m-1d")
286 | 
287 | # these are amino acid sequences
288 | # make sure all the sequences are the same length
289 | dummy_sequences = ["SMART", "MAGIC"]
290 | encoded_seqs = data_encoder.encode_sequences(dummy_sequences)
291 | 
292 | # set model to eval mode
293 | model.eval()
294 | # no need to compute gradients for inference
295 | with torch.no_grad():
296 |     predictions = model(torch.tensor(encoded_seqs))
297 | 
298 | print(predictions)
299 | ```
300 | 
301 | If you are using a model with 3D relative position embeddings, you will need to provide the PDB structure of the wild-type or base protein.
302 | 
303 | ```python
304 | predictions = model(torch.tensor(encoded_seqs), pdb_fn="../path/to/file.pdb")
305 | ```
306 | 
307 | 
308 | ## METL target model
309 | 
310 | METL target models can be loaded using the model's UUID and `metl.get_from_uuid()`.
311 | 
312 | This example:
313 | - Automatically downloads and caches `YoQkzoLD` using `metl.get_from_uuid(uuid="YoQkzoLD")`.
314 | - Encodes several variants specified in variant notation (e.g. `E3K`, where positions are 0-indexed). A wild-type sequence is needed to encode variants.
315 | - Runs the sequences through the model and prints the predicted DMS scores. 
316 | 
317 | ```python
318 | import metl
319 | import torch
320 | 
321 | model, data_encoder = metl.get_from_uuid(uuid="YoQkzoLD")
322 | 
323 | # the GFP wild-type sequence
324 | wt = "SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQ" \
325 |      "HDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKN" \
326 |      "GIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK"
327 | 
328 | # some example GFP variants to compute the scores for
329 | variants = ["E3K,G102S",
330 |             "T36P,S203T,K207R",
331 |             "V10A,D19G,F25S,E113V"]
332 | 
333 | encoded_variants = data_encoder.encode_variants(wt, variants)
334 | 
335 | # set model to eval mode
336 | model.eval()
337 | # no need to compute gradients for inference
338 | with torch.no_grad():
339 |     predictions = model(torch.tensor(encoded_variants))
340 | 
341 | print(predictions)
342 | 
343 | ```
--------------------------------------------------------------------------------
/huggingface/README.md:
--------------------------------------------------------------------------------
1 | This directory maintains the 🤗 support for METL.
2 | 
3 | It contains a few files that facilitate uploading the wrapper to 🤗. First, combine_files.py takes all of the files in the metl directory, skipping any whose names contain test or _.py (e.g., __init__.py), and combines them into a single file. combine_files.py also appends the Hugging Face wrapper code itself (stored in huggingface_code.py) to the bottom of the generated script.
4 | 
5 | The generated script is then formatted and pushed to 🤗 by the push_to_hub.py script, which runs via GitHub Actions. Short comments at the top of each file repeat these responsibilities.
--------------------------------------------------------------------------------
/huggingface/combine_files.py:
--------------------------------------------------------------------------------
1 | """
2 | This script combines all of the files in the metl directory into one file so that it can be uploaded automatically to Hugging Face.
3 | 
4 | Files whose names contain _.py or test will not be included. This script also automatically generates the required imports from the files.
5 | 
6 | Regardless of future changes to metl, this should keep working as long as any newly added files that need to be included do not contain test or _.py in their names.
7 | """
8 | 
9 | import argparse
10 | import os
11 | 
12 | def main(output_path: str):
13 |     imports = set()
14 |     code = []
15 |     metl_imports = set()
16 |     for file in os.listdir('./metl'):
17 |         if '.py' in file and '_.py' not in file and 'test' not in file:
18 |             with open(f'./metl/{file}', 'r') as f:
19 |                 file_text = f.readlines()
20 |                 for line in file_text:
21 |                     line_for_compare = line.strip()
22 |                     if 'import ' in line_for_compare and 'metl.' not in line_for_compare:
23 |                         imports.add(line_for_compare)
24 |                     elif 'import ' in line_for_compare and 'metl.' 
in line_for_compare: 25 | if 'as' in line_for_compare: 26 | metl_imports.add(line_for_compare) 27 | else: 28 | code.append(line[:-1]) 29 | 30 | code = '\n'.join(code) 31 | imports = '\n'.join(imports) 32 | 33 | for line in metl_imports: 34 | import_name = line.split('as')[-1].strip() 35 | code = code.replace(f'{import_name}.', '') 36 | 37 | huggingface_import = 'from transformers import PretrainedConfig, PreTrainedModel' 38 | delimiter = '$>' 39 | 40 | with open('./huggingface/huggingface_code.py', 'r') as f: 41 | contents = f.read() 42 | delim_location = contents.find(delimiter) 43 | cut_contents = contents[delim_location+len(delimiter):] 44 | 45 | with open(output_path, 'w') as f: 46 | f.write(f'{huggingface_import}\n{imports}\n{code}\n{cut_contents}') 47 | 48 | def parse_args(): 49 | parser = argparse.ArgumentParser(description="Compile huggingface wrapper") 50 | parser.add_argument("-o", type=str, help="Output filepath", default='./huggingface_wrapper.py') 51 | 52 | args = parser.parse_args() 53 | 54 | args.o = os.path.abspath(args.o) 55 | return args 56 | 57 | if __name__ == "__main__": 58 | args = parse_args() 59 | main(args.o) 60 | -------------------------------------------------------------------------------- /huggingface/huggingface_code.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains the actual wrapper for METL. 3 | Above the delimiter for this file: #\$\> we have included imports and shell functions 4 | which prevent python (and other linters) from complaining this file has errors. 5 | """ 6 | 7 | 8 | from transformers import PretrainedConfig, PreTrainedModel 9 | 10 | def get_from_uuid(): 11 | pass 12 | 13 | def get_from_ident(): 14 | pass 15 | 16 | def get_from_checkpoint(): 17 | pass 18 | 19 | IDENT_UUID_MAP = "" 20 | UUID_URL_MAP = "" 21 | 22 | # Chop The above off. 
23 | 
24 | #$>
25 | # Huggingface code
26 | 
27 | class METLConfig(PretrainedConfig):
28 |     IDENT_UUID_MAP = IDENT_UUID_MAP
29 |     UUID_URL_MAP = UUID_URL_MAP
30 |     model_type = "METL"
31 | 
32 |     def __init__(
33 |         self,
34 |         id:str = None,
35 |         **kwargs,
36 |     ):
37 |         self.id = id
38 |         super().__init__(**kwargs)
39 | 
40 | class METLModel(PreTrainedModel):
41 |     config_class = METLConfig
42 |     def __init__(self, config:METLConfig):
43 |         super().__init__(config)
44 |         self.model = None
45 |         self.encoder = None
46 |         self.config = config
47 | 
48 |     def forward(self, X, pdb_fn=None):
49 |         if pdb_fn:
50 |             return self.model(X, pdb_fn=pdb_fn)
51 |         return self.model(X)
52 | 
53 |     def load_from_uuid(self, id):
54 |         if id:
55 |             assert id in self.config.UUID_URL_MAP, "ID given does not reference a valid METL model in the UUID_URL_MAP"
56 |             self.config.id = id
57 | 
58 |         self.model, self.encoder = get_from_uuid(self.config.id)
59 | 
60 |     def load_from_ident(self, id):
61 |         if id:
62 |             id = id.lower()
63 |             assert id in self.config.IDENT_UUID_MAP, "ID given does not reference a valid METL model in the IDENT_UUID_MAP"
64 |             self.config.id = id
65 | 
66 |         self.model, self.encoder = get_from_ident(self.config.id)
67 | 
68 |     def get_from_checkpoint(self, checkpoint_path):
69 |         self.model, self.encoder = get_from_checkpoint(checkpoint_path)
--------------------------------------------------------------------------------
/huggingface/print_colab_dropdown.py:
--------------------------------------------------------------------------------
1 | """
2 | Utility script for generating a list that can be pasted into the Google Colab notebook when more models are uploaded to Zenodo and added to the METL IDENT_UUID_MAP.
3 | 
4 | This pulls from Hugging Face, so wait for that action to finish first before running this script and uploading the Colab notebook.
5 | """
6 | 
7 | from transformers import AutoModel
8 | 
9 | def main():
10 |     metl = AutoModel.from_pretrained('gitter-lab/METL', trust_remote_code=True)
11 |     start = "# @param ["
12 |     metl_keys = [f'"{key}"' for key in metl.config.IDENT_UUID_MAP.keys()]
13 |     keys = ','.join(metl_keys)
14 |     end = f'{keys}]'
15 |     print(start + end)
16 | 
17 | if __name__ == "__main__":
18 |     main()
--------------------------------------------------------------------------------
/huggingface/push_to_hub.py:
--------------------------------------------------------------------------------
1 | """
2 | A minimal script for uploading the generated file from combine_files.py to Hugging Face.
3 | Requires the action to have access to the HF_TOKEN secret in the repository. 
4 | """ 5 | 6 | from huggingface_wrapper import METLConfig, METLModel 7 | from huggingface_hub import login 8 | import os 9 | from transformers import AutoModel, AutoConfig 10 | import torch 11 | 12 | def main(): 13 | API_KEY = os.getenv('HF_TOKEN') 14 | login(API_KEY) 15 | 16 | config = METLConfig() 17 | model = METLModel(config) 18 | model.model = torch.nn.Linear(1, 1) 19 | 20 | AutoConfig.register("METL", METLConfig) 21 | AutoModel.register(METLConfig, METLModel) 22 | 23 | model.register_for_auto_class() 24 | config.register_for_auto_class() 25 | 26 | model.push_to_hub('gitter-lab/METL') 27 | config.push_to_hub('gitter-lab/METL') 28 | 29 | if __name__ == "__main__": 30 | main() -------------------------------------------------------------------------------- /huggingface/requirements.txt: -------------------------------------------------------------------------------- 1 | huggingface-hub==0.30.2 2 | transformers==4.51.3 3 | numpy>=1.23.2 4 | networkx>=2.6.3 5 | scipy>=1.9.1 6 | biopandas>=0.2.7 7 | isort 8 | black 9 | -------------------------------------------------------------------------------- /metl/__init__.py: -------------------------------------------------------------------------------- 1 | from .main import * 2 | __version__ = "0.1" 3 | -------------------------------------------------------------------------------- /metl/encode.py: -------------------------------------------------------------------------------- 1 | """ Encodes data in different formats """ 2 | from enum import Enum, auto 3 | 4 | import numpy as np 5 | 6 | 7 | class Encoding(Enum): 8 | INT_SEQS = auto() 9 | ONE_HOT = auto() 10 | 11 | 12 | class DataEncoder: 13 | chars = ["*", "A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"] 14 | num_chars = len(chars) 15 | mapping = {c: i for i, c in enumerate(chars)} 16 | 17 | def __init__(self, encoding: Encoding = Encoding.INT_SEQS): 18 | self.encoding = encoding 19 | 20 | def _encode_from_int_seqs(self, seq_ints): 21 | if self.encoding == Encoding.INT_SEQS: 22 | return seq_ints 23 | elif self.encoding == Encoding.ONE_HOT: 24 | one_hot = np.eye(self.num_chars)[seq_ints] 25 | return one_hot.astype(np.float32) 26 | 27 | def encode_sequences(self, char_seqs): 28 | seq_ints = [] 29 | for char_seq in char_seqs: 30 | int_seq = [self.mapping[c] for c in char_seq] 31 | seq_ints.append(int_seq) 32 | seq_ints = np.array(seq_ints).astype(int) 33 | return self._encode_from_int_seqs(seq_ints) 34 | 35 | def encode_variants(self, wt, variants): 36 | # convert wild type seq to integer encoding 37 | wt_int = np.zeros(len(wt), dtype=np.uint8) 38 | for i, c in enumerate(wt): 39 | wt_int[i] = self.mapping[c] 40 | 41 | # tile the wild-type seq 42 | seq_ints = np.tile(wt_int, (len(variants), 1)) 43 | 44 | for i, variant in enumerate(variants): 45 | # special handling if we want to encode the wild-type seq (it's already correct!) 46 | if variant == "_wt": 47 | continue 48 | 49 | # variants are a list of mutations [mutation1, mutation2, ....] 
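            # e.g. the variant "E3K,G102S" splits into the mutations ["E3K", "G102S"]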
50 |             variant = variant.split(",")
51 |             for mutation in variant:
52 |                 # mutations are in the form <wild-type char><0-based position><replacement char>, e.g. E3K
53 |                 position = int(mutation[1:-1])
54 |                 replacement = self.mapping[mutation[-1]]
55 |                 seq_ints[i, position] = replacement
56 | 
57 |         seq_ints = seq_ints.astype(int)
58 |         return self._encode_from_int_seqs(seq_ints)
59 | 
--------------------------------------------------------------------------------
/metl/main.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.hub
3 | 
4 | import metl.models as models
5 | from metl.encode import DataEncoder, Encoding
6 | 
7 | UUID_URL_MAP = {
8 |     # global source models
9 |     "D72M9aEp": "https://zenodo.org/records/14908509/files/METL-G-20M-1D-D72M9aEp.pt?download=1",
10 |     "Nr9zCKpR": "https://zenodo.org/records/14908509/files/METL-G-20M-3D-Nr9zCKpR.pt?download=1",
11 |     "auKdzzwX": "https://zenodo.org/records/14908509/files/METL-G-50M-1D-auKdzzwX.pt?download=1",
12 |     "6PSAzdfv": "https://zenodo.org/records/14908509/files/METL-G-50M-3D-6PSAzdfv.pt?download=1",
13 | 
14 |     # local source models
15 |     "8gMPQJy4": "https://zenodo.org/records/14908509/files/METL-L-2M-1D-GFP-8gMPQJy4.pt?download=1",
16 |     "Hr4GNHws": "https://zenodo.org/records/14908509/files/METL-L-2M-3D-GFP-Hr4GNHws.pt?download=1",
17 |     "8iFoiYw2": "https://zenodo.org/records/14908509/files/METL-L-2M-1D-DLG4_2022-8iFoiYw2.pt?download=1",
18 |     "kt5DdWTa": "https://zenodo.org/records/14908509/files/METL-L-2M-3D-DLG4_2022-kt5DdWTa.pt?download=1",
19 |     "DMfkjVzT": "https://zenodo.org/records/14908509/files/METL-L-2M-1D-GB1-DMfkjVzT.pt?download=1",
20 |     "epegcFiH": "https://zenodo.org/records/14908509/files/METL-L-2M-3D-GB1-epegcFiH.pt?download=1",
21 |     "kS3rUS7h": "https://zenodo.org/records/14908509/files/METL-L-2M-1D-GRB2-kS3rUS7h.pt?download=1",
22 |     "X7w83g6S": "https://zenodo.org/records/14908509/files/METL-L-2M-3D-GRB2-X7w83g6S.pt?download=1",
23 |     "UKebCQGz": "https://zenodo.org/records/14908509/files/METL-L-2M-1D-Pab1-UKebCQGz.pt?download=1",
24 |     "2rr8V4th": "https://zenodo.org/records/14908509/files/METL-L-2M-3D-Pab1-2rr8V4th.pt?download=1",
25 |     "PREhfC22": "https://zenodo.org/records/14908509/files/METL-L-2M-1D-TEM-1-PREhfC22.pt?download=1",
26 |     "9ASvszux": "https://zenodo.org/records/14908509/files/METL-L-2M-3D-TEM-1-9ASvszux.pt?download=1",
27 |     "HscFFkAb": "https://zenodo.org/records/14908509/files/METL-L-2M-1D-Ube4b-HscFFkAb.pt?download=1",
28 |     "H48oiNZN": "https://zenodo.org/records/14908509/files/METL-L-2M-3D-Ube4b-H48oiNZN.pt?download=1",
29 |     "CEMSx7ZC": "https://zenodo.org/records/14908509/files/METL-L-2M-1D-PTEN-CEMSx7ZC.pt?download=1",
30 |     "PjxR5LW7": "https://zenodo.org/records/14908509/files/METL-L-2M-3D-PTEN-PjxR5LW7.pt?download=1",
31 | 
32 |     # metl bind source models
33 |     "K6mw24Rg": "https://zenodo.org/records/14908509/files/METL-BIND-2M-3D-GB1-STANDARD-K6mw24Rg.pt?download=1",
34 |     "Bo5wn2SG": "https://zenodo.org/records/14908509/files/METL-BIND-2M-3D-GB1-BINDING-Bo5wn2SG.pt?download=1",
35 | 
36 |     # finetuned models from GFP design experiment
37 |     "YoQkzoLD": "https://zenodo.org/records/14908509/files/FT-METL-L-2M-1D-GFP-YoQkzoLD.pt?download=1",
38 |     "PEkeRuxb": "https://zenodo.org/records/14908509/files/FT-METL-L-2M-3D-GFP-PEkeRuxb.pt?download=1",
39 | 
40 |     # new finetuned GLOBAL models
41 |     "4Rh3WCbG": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-DLG4_2022-ABUNDANCE-4Rh3WCbG.pt?download=1",
42 |     "4xbuC5y7": 
"https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-DLG4_2022-BINDING-4xbuC5y7.pt?download=1", 43 | "dAndZfJ4": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-GB1-dAndZfJ4.pt?download=1", 44 | "PeT2D92j": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-GFP-PeT2D92j.pt?download=1", 45 | "HenDpDWe": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-GRB2-ABUNDANCE-HenDpDWe.pt?download=1", 46 | "cvnycE5Q": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-GRB2-BINDING-cvnycE5Q.pt?download=1", 47 | "ho54gxzv": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-Pab1-ho54gxzv.pt?download=1", 48 | "UEuMtmfx": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-PTEN-ABUNDANCE-UEuMtmfx.pt?download=1", 49 | "U3X8mSeT": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-PTEN-ACTIVITY-U3X8mSeT.pt?download=1", 50 | "ELL4GGQq": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-TEM-1-ELL4GGQq.pt?download=1", 51 | "BAWw23vW": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-1D-Ube4b-BAWw23vW.pt?download=1", 52 | "RBtqxzvu": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-DLG4_2022-ABUNDANCE-RBtqxzvu.pt?download=1", 53 | "BuvxgE2x": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-DLG4_2022-BINDING-BuvxgE2x.pt?download=1", 54 | "9vSB3DRM": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-GB1-9vSB3DRM.pt?download=1", 55 | "6JBzHpkQ": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-GFP-6JBzHpkQ.pt?download=1", 56 | "dDoCCvfr": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-GRB2-ABUNDANCE-dDoCCvfr.pt?download=1", 57 | "jYesS9Ki": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-GRB2-BINDING-jYesS9Ki.pt?download=1", 58 | "jhbL2FeB": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-Pab1-jhbL2FeB.pt?download=1", 59 | "eJPPQYEW": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-PTEN-ABUNDANCE-eJPPQYEW.pt?download=1", 60 | "4gqYnW6V": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-PTEN-ACTIVITY-4gqYnW6V.pt?download=1", 61 | "K6BjsWXm": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-TEM-1-K6BjsWXm.pt?download=1", 62 | "G9piq6WH": "https://zenodo.org/records/14908509/files/FT-METL-G-20M-3D-Ube4b-G9piq6WH.pt?download=1", 63 | 64 | # finetuned LOCAL models 65 | "RMFA6dnX": "https://zenodo.org/records/14908509/files/FT-METL-L-1D-DLG4_2022-ABUNDANCE-RMFA6dnX.pt?download=1", 66 | "YdzBYWHs": "https://zenodo.org/records/14908509/files/FT-METL-L-1D-DLG4_2022-BINDING-YdzBYWHs.pt?download=1", 67 | "Pgcseywk": "https://zenodo.org/records/14908509/files/FT-METL-L-1D-GB1-Pgcseywk.pt?download=1", 68 | "HaUuRwfE": "https://zenodo.org/records/14908509/files/FT-METL-L-1D-GFP-HaUuRwfE.pt?download=1", 69 | "VNpi9Zjt": "https://zenodo.org/records/14908509/files/FT-METL-L-1D-GRB2-ABUNDANCE-VNpi9Zjt.pt?download=1", 70 | "Z59BhUaE": "https://zenodo.org/records/14908509/files/FT-METL-L-1D-GRB2-BINDING-Z59BhUaE.pt?download=1", 71 | "TdjCzoQQ": "https://zenodo.org/records/14908509/files/FT-METL-L-1D-Pab1-TdjCzoQQ.pt?download=1", 72 | "64ncFxBR": "https://zenodo.org/records/14908509/files/FT-METL-L-1D-TEM-1-64ncFxBR.pt?download=1", 73 | "e9uhhnAv": "https://zenodo.org/records/14908509/files/FT-METL-L-1D-Ube4b-e9uhhnAv.pt?download=1", 74 | "oUScGeHo": "https://zenodo.org/records/14908509/files/FT-METL-L-2M-1D-PTEN-ABUNDANCE-oUScGeHo.pt?download=1", 75 | "m9UsG7dq": 
"https://zenodo.org/records/14908509/files/FT-METL-L-2M-1D-PTEN-ACTIVITY-m9UsG7dq.pt?download=1", 76 | "DhuasDEr": "https://zenodo.org/records/14908509/files/FT-METL-L-2M-3D-PTEN-ABUNDANCE-DhuasDEr.pt?download=1", 77 | "8Vi7ENcC": "https://zenodo.org/records/14908509/files/FT-METL-L-2M-3D-PTEN-ACTIVITY-8Vi7ENcC.pt?download=1", 78 | "V3uTtXVe": "https://zenodo.org/records/14908509/files/FT-METL-L-3D-DLG4_2022-ABUNDANCE-V3uTtXVe.pt?download=1", 79 | "iu6ZahPw": "https://zenodo.org/records/14908509/files/FT-METL-L-3D-DLG4_2022-BINDING-iu6ZahPw.pt?download=1", 80 | "UvMMdsq4": "https://zenodo.org/records/14908509/files/FT-METL-L-3D-GB1-UvMMdsq4.pt?download=1", 81 | "LWEY95Yb": "https://zenodo.org/records/14908509/files/FT-METL-L-3D-GFP-LWEY95Yb.pt?download=1", 82 | "PqBMjXkA": "https://zenodo.org/records/14908509/files/FT-METL-L-3D-GRB2-ABUNDANCE-PqBMjXkA.pt?download=1", 83 | "VwcRN6UB": "https://zenodo.org/records/14908509/files/FT-METL-L-3D-GRB2-BINDING-VwcRN6UB.pt?download=1", 84 | "5SjoLx3y": "https://zenodo.org/records/14908509/files/FT-METL-L-3D-Pab1-5SjoLx3y.pt?download=1", 85 | "PncvgiJU": "https://zenodo.org/records/14908509/files/FT-METL-L-3D-TEM-1-PncvgiJU.pt?download=1", 86 | "NfbZL7jK": "https://zenodo.org/records/14908509/files/FT-METL-L-3D-Ube4b-NfbZL7jK.pt?download=1" 87 | 88 | } 89 | 90 | IDENT_UUID_MAP = { 91 | # the keys should be all lowercase 92 | "metl-g-20m-1d": "D72M9aEp", 93 | "metl-g-20m-3d": "Nr9zCKpR", 94 | "metl-g-50m-1d": "auKdzzwX", 95 | "metl-g-50m-3d": "6PSAzdfv", 96 | 97 | # GFP local source models 98 | "metl-l-2m-1d-gfp": "8gMPQJy4", 99 | "metl-l-2m-3d-gfp": "Hr4GNHws", 100 | 101 | # DLG4 local source models 102 | "metl-l-2m-1d-dlg4_2022": "8iFoiYw2", 103 | "metl-l-2m-3d-dlg4_2022": "kt5DdWTa", 104 | 105 | # GB1 local source models 106 | "metl-l-2m-1d-gb1": "DMfkjVzT", 107 | "metl-l-2m-3d-gb1": "epegcFiH", 108 | 109 | # GRB2 local source models 110 | "metl-l-2m-1d-grb2": "kS3rUS7h", 111 | "metl-l-2m-3d-grb2": "X7w83g6S", 112 | 113 | # Pab1 local source models 114 | "metl-l-2m-1d-pab1": "UKebCQGz", 115 | "metl-l-2m-3d-pab1": "2rr8V4th", 116 | 117 | # PTEN local source models 118 | "metl-l-2m-1d-pten": "CEMSx7ZC", 119 | "metl-l-2m-3d-pten": "PjxR5LW7", 120 | 121 | # TEM-1 local source models 122 | "metl-l-2m-1d-tem-1": "PREhfC22", 123 | "metl-l-2m-3d-tem-1": "9ASvszux", 124 | 125 | # Ube4b local source models 126 | "metl-l-2m-1d-ube4b": "HscFFkAb", 127 | "metl-l-2m-3d-ube4b": "H48oiNZN", 128 | 129 | # METL-Bind for GB1 130 | "metl-bind-2m-3d-gb1-standard": "K6mw24Rg", 131 | "metl-bind-2m-3d-gb1-binding": "Bo5wn2SG", 132 | 133 | # GFP design models, giving them an ident 134 | "metl-l-2m-1d-gfp-ft-design": "YoQkzoLD", 135 | "metl-l-2m-3d-gfp-ft-design": "PEkeRuxb", 136 | 137 | } 138 | 139 | 140 | def download_checkpoint(uuid): 141 | ckpt = torch.hub.load_state_dict_from_url(UUID_URL_MAP[uuid], 142 | map_location="cpu", file_name=f"{uuid}.pt") 143 | state_dict = ckpt["state_dict"] 144 | hyper_parameters = ckpt["hyper_parameters"] 145 | 146 | return state_dict, hyper_parameters 147 | 148 | 149 | def _get_data_encoding(hparams): 150 | if "encoding" in hparams and hparams["encoding"] == "int_seqs": 151 | encoding = Encoding.INT_SEQS 152 | elif "encoding" in hparams and hparams["encoding"] == "one_hot": 153 | encoding = Encoding.ONE_HOT 154 | elif (("encoding" in hparams and hparams["encoding"] == "auto") or "encoding" not in hparams) and \ 155 | hparams["model_name"] in ["transformer_encoder"]: 156 | encoding = Encoding.INT_SEQS 157 | else: 158 | raise 
ValueError("Detected unsupported encoding in hyperparameters") 159 | 160 | return encoding 161 | 162 | 163 | def load_model_and_data_encoder(state_dict, hparams): 164 | model = models.Model[hparams["model_name"]].cls(**hparams) 165 | model.load_state_dict(state_dict) 166 | 167 | data_encoder = DataEncoder(_get_data_encoding(hparams)) 168 | 169 | return model, data_encoder 170 | 171 | 172 | def get_from_uuid(uuid): 173 | if uuid in UUID_URL_MAP: 174 | state_dict, hparams = download_checkpoint(uuid) 175 | return load_model_and_data_encoder(state_dict, hparams) 176 | else: 177 | raise ValueError(f"UUID {uuid} not found in UUID_URL_MAP") 178 | 179 | 180 | def get_from_ident(ident): 181 | ident = ident.lower() 182 | if ident in IDENT_UUID_MAP: 183 | state_dict, hparams = download_checkpoint(IDENT_UUID_MAP[ident]) 184 | return load_model_and_data_encoder(state_dict, hparams) 185 | else: 186 | raise ValueError(f"Identifier {ident} not found in IDENT_UUID_MAP") 187 | 188 | 189 | def get_from_checkpoint(ckpt_fn): 190 | ckpt = torch.load(ckpt_fn, map_location="cpu") 191 | state_dict = ckpt["state_dict"] 192 | hyper_parameters = ckpt["hyper_parameters"] 193 | return load_model_and_data_encoder(state_dict, hyper_parameters) 194 | -------------------------------------------------------------------------------- /metl/models.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import math 3 | from argparse import ArgumentParser 4 | import enum 5 | from os.path import isfile 6 | from typing import List, Tuple, Optional 7 | 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | from torch import Tensor 12 | 13 | import metl.relative_attention as ra 14 | 15 | 16 | def reset_parameters_helper(m: nn.Module): 17 | """ helper function for resetting model parameters, meant to be used with model.apply() """ 18 | 19 | # the PyTorch MultiHeadAttention has a private function _reset_parameters() 20 | # other layers have a public reset_parameters()... go figure 21 | reset_parameters = getattr(m, "reset_parameters", None) 22 | reset_parameters_private = getattr(m, "_reset_parameters", None) 23 | 24 | if callable(reset_parameters) and callable(reset_parameters_private): 25 | raise RuntimeError("Module has both public and private methods for resetting parameters. " 26 | "This is unexpected... 
probably should just call the public one.")
27 | 
28 |     if callable(reset_parameters):
29 |         m.reset_parameters()
30 | 
31 |     if callable(reset_parameters_private):
32 |         m._reset_parameters()
33 | 
34 | 
35 | class SequentialWithArgs(nn.Sequential):
36 |     def forward(self, x, **kwargs):
37 |         for module in self:
38 |             if isinstance(module, ra.RelativeTransformerEncoder) or isinstance(module, SequentialWithArgs):
39 |                 # for relative transformer encoders, pass in kwargs (pdb_fn)
40 |                 x = module(x, **kwargs)
41 |             else:
42 |                 # for all modules, don't pass in kwargs
43 |                 x = module(x)
44 |         return x
45 | 
46 | 
47 | class PositionalEncoding(nn.Module):
48 |     # originally from https://pytorch.org/tutorials/beginner/transformer_tutorial.html
49 |     # they have since updated their implementation, but it is functionally equivalent
50 |     def __init__(self, d_model, dropout=0.1, max_len=5000):
51 |         super(PositionalEncoding, self).__init__()
52 |         self.dropout = nn.Dropout(p=dropout)
53 | 
54 |         pe = torch.zeros(max_len, d_model)
55 |         position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
56 |         div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
57 |         pe[:, 0::2] = torch.sin(position * div_term)
58 |         pe[:, 1::2] = torch.cos(position * div_term)
59 |         # note the implementation on PyTorch's website expects [seq_len, batch_size, embedding_dim]
60 |         # however our data is in [batch_size, seq_len, embedding_dim] (i.e. batch_first)
61 |         # fixed by changing pe = pe.unsqueeze(0).transpose(0, 1) to pe = pe.unsqueeze(0)
62 |         # also down below, changing our indexing into the position encoding to reflect new dimensions
63 |         # pe = pe.unsqueeze(0).transpose(0, 1)
64 |         pe = pe.unsqueeze(0)
65 |         self.register_buffer('pe', pe)
66 | 
67 |     def forward(self, x, **kwargs):
68 |         # note the implementation on PyTorch's website expects [seq_len, batch_size, embedding_dim]
69 |         # however our data is in [batch_size, seq_len, embedding_dim] (i.e. batch_first)
70 |         # fixed by changing x = x + self.pe[:x.size(0)] to x = x + self.pe[:, :x.size(1), :]
71 |         # x = x + self.pe[:x.size(0), :]
72 |         x = x + self.pe[:, :x.size(1), :]
73 |         return self.dropout(x)
74 | 
75 | 
76 | class ScaledEmbedding(nn.Module):
77 |     # https://pytorch.org/tutorials/beginner/translation_transformer.html
78 |     # a helper module for embedding that scales by sqrt(d_model) in the forward()
79 |     # makes it so we don't have to do the scaling in the main AttnModel forward()
80 | 
81 |     # todo: be aware of embedding scaling factor
82 |     # regarding the scaling factor, it's unclear exactly what the purpose is and whether it is needed
83 |     # there are several theories on why it is used, and it shows up in all the transformer reference implementations
84 |     # https://datascience.stackexchange.com/questions/87906/transformer-model-why-are-word-embeddings-scaled-before-adding-positional-encod
85 |     # 1. Has something to do with weight sharing between the embedding and the decoder output
86 |     # 2. Scales up the embeddings so the signal doesn't get overwhelmed when adding the absolute positional encoding
87 |     # 3. It cancels out with the scaling factor in scaled dot product attention, and helps make the model robust
88 |     #    to the choice of embedding_len
89 |     # 4. It's not actually needed
90 | 
91 |     # Regarding #1, not really sure about this. 
In section 3.4 of attention is all you need,
92 |     # that's where they state they multiply the embedding weights by sqrt(d_model), and the context is that they
93 |     # are sharing the same weight matrix between the two embedding layers and the pre-softmax linear transformation.
94 |     # there may be a reason that we want those weights scaled differently for the embedding layers vs. the linear
95 |     # transformation. It might have something to do with the scale at which embedding weights are initialized
96 |     # is more appropriate for the decoder linear transform vs how they are used in the attention function. Might have
97 |     # something to do with computing the correct next-token probabilities. Overall, I'm really not sure about this,
98 |     # but we aren't using a decoder anyway. So if this is the reason, then we don't need to perform the multiply.
99 | 
100 |     # Regarding #2, it seems like in one implementation of transformers (fairseq), the sinusoidal positional encoding
101 |     # has a range of (-1.0, 1.0), but the word embeddings are initialized with mean 0 and s.d. embedding_dim ** -0.5,
102 |     # which for embedding_dim=512, is a range closer to (-0.10, 0.10). Thus, the positional embedding would overwhelm
103 |     # the word embeddings when they are added together. The scaling factor increases the signal of the word embeddings.
104 |     # for embedding_dim=512, it scales word embeddings by 22, increasing the range of the word embeddings to (-2.2, 2.2).
105 |     # link to fairseq implementation, search for nn.init to see them do the initialization
106 |     # https://fairseq.readthedocs.io/en/v0.7.1/_modules/fairseq/models/transformer.html
107 |     #
108 |     # PyTorch, by contrast, initializes nn.Embedding with a standard normal distribution, mean 0 and variance 1: N(0, 1).
109 |     # this puts the range for the word embeddings around (-3, 3). the pytorch implementation for positional encoding
110 |     # also has a range of (-1.0, 1.0). So already, these are much closer in scale, and it doesn't seem like we need
111 |     # to increase the scale of the word embeddings. However, the PyTorch example still multiplies by the scaling factor;
112 |     # it's unclear whether this is just a carryover that is not actually needed, or if there is a different reason.
113 |     #
114 |     # EDIT! I just realized that even though nn.Embedding defaults to a range of around (-3, 3), the PyTorch
115 |     # transformer example actually re-initializes them using a uniform distribution in the range of (-0.1, 0.1).
116 |     # that makes it very similar to the fairseq implementation, so the scaling factor that PyTorch uses actually would
117 |     # bring the word embeddings and positional encodings much closer in scale. So this could be the reason why PyTorch
118 |     # does it.
119 | 
120 |     # Regarding #3, I don't think so. Firstly, does it actually cancel there? Secondly, the purpose of the scaling
121 |     # factor in scaled dot product attention, according to attention is all you need, is to counteract dot products
122 |     # that are very high in magnitude due to the choice of a large embedding length (aka d_k). The problem with high
123 |     # magnitude dot products is that potentially, the softmax is pushed into regions where it has extremely small
124 |     # gradients, making learning difficult. If the scaling factor in the embedding was meant to counteract the scaling
125 |     # factor in scaled dot product attention, then what would be the point of doing all that?
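    # a quick arithmetic check of #2 (illustrative numbers only, not from any specific run):
    #   >>> import math
    #   >>> 0.1 * math.sqrt(512)   # init range bound 0.1 times the sqrt(d_model) scale factor
    #   2.2627...
    # so uniform-initialized word embeddings in (-0.1, 0.1) land in roughly (-2.26, 2.26) after scaling,
    # comparable in magnitude to the (-1.0, 1.0) range of the sinusoidal positional encoding added to them.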
126 | 127 | # Regarding #4, I don't think the scaling will have any effects in practice, it's probably not needed 128 | 129 | # Overall, I think #2 is the most likely reason why this scaling is performed. In theory, I think 130 | # even if the scaling wasn't performed, the network might learn to up-scale the word embedding weights to increase 131 | # word embedding signal vs. the position signal on its own. Another question I have is why not just initialize 132 | # the embedding weights to have higher initial values? Why put it in the range (-0.1, 0.1)? 133 | # 134 | # The fact that most implementations have this scaling concerns me, makes me think I might be missing something. 135 | # For our purposes, we can train a couple models to see if scaling has any positive or negative effect. 136 | # Still need to think about potential effects of this scaling on relative position embeddings. 137 | 138 | def __init__(self, num_embeddings: int, embedding_dim: int, scale: bool): 139 | super(ScaledEmbedding, self).__init__() 140 | self.embedding = nn.Embedding(num_embeddings, embedding_dim) 141 | self.emb_size = embedding_dim 142 | self.embed_scale = math.sqrt(self.emb_size) 143 | 144 | self.scale = scale 145 | 146 | self.init_weights() 147 | 148 | def init_weights(self): 149 | # todo: not sure why PyTorch example initializes weights like this 150 | # might have something to do with word embedding scaling factor (see above) 151 | # could also just try the default weight initialization for nn.Embedding() 152 | init_range = 0.1 153 | self.embedding.weight.data.uniform_(-init_range, init_range) 154 | 155 | def forward(self, tokens: Tensor, **kwargs): 156 | if self.scale: 157 | return self.embedding(tokens.long()) * self.embed_scale 158 | else: 159 | return self.embedding(tokens.long()) 160 | 161 | 162 | class FCBlock(nn.Module): 163 | """ a fully connected block with options for batchnorm and dropout 164 | can extend in the future with option for different activation, etc """ 165 | 166 | def __init__(self, 167 | in_features: int, 168 | num_hidden_nodes: int = 64, 169 | use_batchnorm: bool = False, 170 | use_layernorm: bool = False, 171 | norm_before_activation: bool = False, 172 | use_dropout: bool = False, 173 | dropout_rate: float = 0.2, 174 | activation: str = "relu"): 175 | 176 | super().__init__() 177 | 178 | if use_batchnorm and use_layernorm: 179 | raise ValueError("Only one of use_batchnorm or use_layernorm can be set to True") 180 | 181 | self.use_batchnorm = use_batchnorm 182 | self.use_dropout = use_dropout 183 | self.use_layernorm = use_layernorm 184 | self.norm_before_activation = norm_before_activation 185 | 186 | self.fc = nn.Linear(in_features=in_features, out_features=num_hidden_nodes) 187 | 188 | self.activation = get_activation_fn(activation, functional=False) 189 | 190 | if use_batchnorm: 191 | self.norm = nn.BatchNorm1d(num_hidden_nodes) 192 | 193 | if use_layernorm: 194 | self.norm = nn.LayerNorm(num_hidden_nodes) 195 | 196 | if use_dropout: 197 | self.dropout = nn.Dropout(p=dropout_rate) 198 | 199 | def forward(self, x, **kwargs): 200 | x = self.fc(x) 201 | 202 | # norm can be before or after activation, using flag 203 | if (self.use_batchnorm or self.use_layernorm) and self.norm_before_activation: 204 | x = self.norm(x) 205 | 206 | x = self.activation(x) 207 | 208 | # batchnorm being applied after activation, there is some discussion on this online 209 | if (self.use_batchnorm or self.use_layernorm) and not self.norm_before_activation: 210 | x = self.norm(x) 211 | 212 | # dropout 
being applied last 213 | if self.use_dropout: 214 | x = self.dropout(x) 215 | 216 | return x 217 | 218 | 219 | class TaskSpecificPredictionLayers(nn.Module): 220 | """ Constructs num_tasks [dense(num_hidden_nodes)+relu+dense(1)] layers, each independently transforming input 221 | into a single output node. All num_tasks outputs are then concatenated into a single tensor. """ 222 | 223 | # todo: the independent layers are run in sequence rather than in parallel, causing a slowdown that 224 | # scales with the number of tasks. might be able to run in parallel by hacking convolution operation 225 | # https://stackoverflow.com/questions/58374980/run-multiple-models-of-an-ensemble-in-parallel-with-pytorch 226 | # https://github.com/pytorch/pytorch/issues/54147 227 | # https://github.com/pytorch/pytorch/issues/36459 228 | 229 | def __init__(self, 230 | num_tasks: int, 231 | in_features: int, 232 | num_hidden_nodes: int = 64, 233 | use_batchnorm: bool = False, 234 | use_dropout: bool = False, 235 | dropout_rate: float = 0.2, 236 | activation: str = "relu"): 237 | 238 | super().__init__() 239 | 240 | # each task-specific layer outputs a single node, 241 | # which can be combined with torch.cat into prediction vector 242 | self.task_specific_pred_layers = nn.ModuleList() 243 | for i in range(num_tasks): 244 | layers = [FCBlock(in_features=in_features, 245 | num_hidden_nodes=num_hidden_nodes, 246 | use_batchnorm=use_batchnorm, 247 | use_dropout=use_dropout, 248 | dropout_rate=dropout_rate, 249 | activation=activation), 250 | nn.Linear(in_features=num_hidden_nodes, out_features=1)] 251 | self.task_specific_pred_layers.append(nn.Sequential(*layers)) 252 | 253 | def forward(self, x, **kwargs): 254 | # run each task-specific layer and concatenate outputs into a single output vector 255 | task_specific_outputs = [] 256 | for layer in self.task_specific_pred_layers: 257 | task_specific_outputs.append(layer(x)) 258 | 259 | output = torch.cat(task_specific_outputs, dim=1) 260 | return output 261 | 262 | 263 | class GlobalAveragePooling(nn.Module): 264 | """ helper class for global average pooling """ 265 | 266 | def __init__(self, dim=1): 267 | super().__init__() 268 | # our data is in [batch_size, sequence_length, embedding_length] 269 | # with global pooling, we want to pool over the sequence dimension (dim=1) 270 | self.dim = dim 271 | 272 | def forward(self, x, **kwargs): 273 | return torch.mean(x, dim=self.dim) 274 | 275 | 276 | class CLSPooling(nn.Module): 277 | """ helper class for CLS token extraction """ 278 | 279 | def __init__(self, cls_position=0): 280 | super().__init__() 281 | 282 | # the position of the CLS token in the sequence dimension 283 | # currently, the CLS token is in the first position, but may move it to the last position 284 | self.cls_position = cls_position 285 | 286 | def forward(self, x, **kwargs): 287 | # assumes input is in [batch_size, sequence_len, embedding_len] 288 | # thus sequence dimension is dimension 1 289 | return x[:, self.cls_position, :] 290 | 291 | 292 | class TransformerEncoderWrapper(nn.TransformerEncoder): 293 | """ wrapper around PyTorch's TransformerEncoder that re-initializes layer parameters, 294 | so each transformer encoder layer has a different initialization """ 295 | 296 | # todo: PyTorch is changing its transformer API... 
check up on and see if there is a better way
297 |     def __init__(self, encoder_layer, num_layers, norm=None, reset_params=True):
298 |         super().__init__(encoder_layer, num_layers, norm)
299 |         if reset_params:
300 |             self.apply(reset_parameters_helper)
301 | 
302 | 
303 | class AttnModel(nn.Module):
304 |     # https://pytorch.org/tutorials/beginner/transformer_tutorial.html
305 | 
306 |     @staticmethod
307 |     def add_model_specific_args(parent_parser):
308 |         parser = ArgumentParser(parents=[parent_parser], add_help=False)
309 | 
310 |         parser.add_argument('--pos_encoding', type=str, default="absolute",
311 |                             choices=["none", "absolute", "relative", "relative_3D"],
312 |                             help="what type of positional encoding to use")
313 |         parser.add_argument('--pos_encoding_dropout', type=float, default=0.1,
314 |                             help="how much dropout to use in positional encoding, for pos_encoding==absolute")
315 |         parser.add_argument('--clipping_threshold', type=int, default=3,
316 |                             help="clipping threshold for relative position embedding, for relative and relative_3D")
317 |         parser.add_argument('--contact_threshold', type=int, default=7,
318 |                             help="threshold, in angstroms, for contact map, for relative_3D")
319 |         parser.add_argument('--embedding_len', type=int, default=128)
320 |         parser.add_argument('--num_heads', type=int, default=2)
321 |         parser.add_argument('--num_hidden', type=int, default=64)
322 |         parser.add_argument('--num_enc_layers', type=int, default=2)
323 |         parser.add_argument('--enc_layer_dropout', type=float, default=0.1)
324 |         parser.add_argument('--use_final_encoder_norm', action="store_true", default=False)
325 | 
326 |         parser.add_argument('--global_average_pooling', action="store_true", default=False)
327 |         parser.add_argument('--cls_pooling', action="store_true", default=False)
328 | 
329 |         parser.add_argument('--use_task_specific_layers', action="store_true", default=False,
330 |                             help="exclusive with use_final_hidden_layer; takes priority over use_final_hidden_layer"
331 |                                  " if both flags are set")
332 |         parser.add_argument('--task_specific_hidden_nodes', type=int, default=64)
333 |         parser.add_argument('--use_final_hidden_layer', action="store_true", default=False)
334 |         parser.add_argument('--final_hidden_size', type=int, default=64)
335 |         parser.add_argument('--use_final_hidden_layer_norm', action="store_true", default=False)
336 |         parser.add_argument('--final_hidden_layer_norm_before_activation', action="store_true", default=False)
337 |         parser.add_argument('--use_final_hidden_layer_dropout', action="store_true", default=False)
338 |         parser.add_argument('--final_hidden_layer_dropout_rate', type=float, default=0.2)
339 | 
340 |         parser.add_argument('--activation', type=str, default="relu",
341 |                             help="activation function used for all activations in the network")
342 |         return parser
343 | 
344 |     def __init__(self,
345 |                  # data args
346 |                  num_tasks: int,
347 |                  aa_seq_len: int,
348 |                  num_tokens: int,
349 |                  # transformer encoder model args
350 |                  pos_encoding: str = "absolute",
351 |                  pos_encoding_dropout: float = 0.1,
352 |                  clipping_threshold: int = 3,
353 |                  contact_threshold: int = 7,
354 |                  pdb_fns: Optional[List[str]] = None,
355 |                  embedding_len: int = 64,
356 |                  num_heads: int = 2,
357 |                  num_hidden: int = 64,
358 |                  num_enc_layers: int = 2,
359 |                  enc_layer_dropout: float = 0.1,
360 |                  use_final_encoder_norm: bool = False,
361 |                  # pooling to fixed-length representation
362 |                  global_average_pooling: bool = True,
363 |                  cls_pooling: bool = False,
364 |                  # prediction layers
365 |                  use_task_specific_layers: bool = False,
366 | 
task_specific_hidden_nodes: int = 64, 367 | use_final_hidden_layer: bool = False, 368 | final_hidden_size: int = 64, 369 | use_final_hidden_layer_norm: bool = False, 370 | final_hidden_layer_norm_before_activation: bool = False, 371 | use_final_hidden_layer_dropout: bool = False, 372 | final_hidden_layer_dropout_rate: float = 0.2, 373 | # activation function 374 | activation: str = "relu", 375 | *args, **kwargs): 376 | 377 | super().__init__() 378 | 379 | # store embedding length for use in the forward function 380 | self.embedding_len = embedding_len 381 | self.aa_seq_len = aa_seq_len 382 | 383 | # build up layers 384 | layers = collections.OrderedDict() 385 | 386 | # amino acid embedding 387 | layers["embedder"] = ScaledEmbedding(num_embeddings=num_tokens, embedding_dim=embedding_len, scale=True) 388 | 389 | # absolute positional encoding 390 | if pos_encoding == "absolute": 391 | layers["pos_encoder"] = PositionalEncoding(embedding_len, dropout=pos_encoding_dropout, max_len=512) 392 | 393 | # transformer encoder layer for none or absolute positional encoding 394 | if pos_encoding in ["none", "absolute"]: 395 | encoder_layer = torch.nn.TransformerEncoderLayer(d_model=embedding_len, 396 | nhead=num_heads, 397 | dim_feedforward=num_hidden, 398 | dropout=enc_layer_dropout, 399 | activation=get_activation_fn(activation), 400 | norm_first=True, 401 | batch_first=True) 402 | 403 | # layer norm that is used after the transformer encoder layers 404 | # if the norm_first is False, this is *redundant* and not needed 405 | # but if norm_first is True, this can be used to normalize outputs from 406 | # the transformer encoder before inputting to the final fully connected layer 407 | encoder_norm = None 408 | if use_final_encoder_norm: 409 | encoder_norm = nn.LayerNorm(embedding_len) 410 | 411 | layers["tr_encoder"] = TransformerEncoderWrapper(encoder_layer=encoder_layer, 412 | num_layers=num_enc_layers, 413 | norm=encoder_norm) 414 | 415 | # transformer encoder layer for relative position encoding 416 | elif pos_encoding in ["relative", "relative_3D"]: 417 | relative_encoder_layer = ra.RelativeTransformerEncoderLayer(d_model=embedding_len, 418 | nhead=num_heads, 419 | pos_encoding=pos_encoding, 420 | clipping_threshold=clipping_threshold, 421 | contact_threshold=contact_threshold, 422 | pdb_fns=pdb_fns, 423 | dim_feedforward=num_hidden, 424 | dropout=enc_layer_dropout, 425 | activation=get_activation_fn(activation), 426 | norm_first=True) 427 | 428 | encoder_norm = None 429 | if use_final_encoder_norm: 430 | encoder_norm = nn.LayerNorm(embedding_len) 431 | 432 | layers["tr_encoder"] = ra.RelativeTransformerEncoder(encoder_layer=relative_encoder_layer, 433 | num_layers=num_enc_layers, 434 | norm=encoder_norm) 435 | 436 | # GLOBAL AVERAGE POOLING OR CLS TOKEN 437 | # set up the layers and output shapes (i.e. 
input shapes for the pred layer) 438 | if global_average_pooling: 439 | # pool over the sequence dimension 440 | layers["avg_pooling"] = GlobalAveragePooling(dim=1) 441 | pred_layer_input_features = embedding_len 442 | elif cls_pooling: 443 | layers["cls_pooling"] = CLSPooling(cls_position=0) 444 | pred_layer_input_features = embedding_len 445 | else: 446 | # no global average pooling or CLS token 447 | # sequence dimension is still there, just flattened 448 | layers["flatten"] = nn.Flatten() 449 | pred_layer_input_features = embedding_len * aa_seq_len 450 | 451 | # PREDICTION 452 | if use_task_specific_layers: 453 | # task specific prediction layers (nonlinear transform for each task) 454 | layers["prediction"] = TaskSpecificPredictionLayers(num_tasks=num_tasks, 455 | in_features=pred_layer_input_features, 456 | num_hidden_nodes=task_specific_hidden_nodes, 457 | activation=activation) 458 | elif use_final_hidden_layer: 459 | # combined prediction linear (linear transform for each task) 460 | layers["fc1"] = FCBlock(in_features=pred_layer_input_features, 461 | num_hidden_nodes=final_hidden_size, 462 | use_batchnorm=False, 463 | use_layernorm=use_final_hidden_layer_norm, 464 | norm_before_activation=final_hidden_layer_norm_before_activation, 465 | use_dropout=use_final_hidden_layer_dropout, 466 | dropout_rate=final_hidden_layer_dropout_rate, 467 | activation=activation) 468 | 469 | layers["prediction"] = nn.Linear(in_features=final_hidden_size, out_features=num_tasks) 470 | else: 471 | layers["prediction"] = nn.Linear(in_features=pred_layer_input_features, out_features=num_tasks) 472 | 473 | # FINAL MODEL 474 | self.model = SequentialWithArgs(layers) 475 | 476 | def forward(self, x, **kwargs): 477 | return self.model(x, **kwargs) 478 | 479 | 480 | class Transpose(nn.Module): 481 | """ helper layer to swap data from (batch, seq, channels) to (batch, channels, seq) 482 | used as a helper in the convolutional network which pytorch defaults to channels-first """ 483 | 484 | def __init__(self, dims: Tuple[int, ...] 
= (1, 2)):
485 |         super().__init__()
486 |         self.dims = dims
487 | 
488 |     def forward(self, x, **kwargs):
489 |         x = x.transpose(*self.dims).contiguous()
490 |         return x
491 | 
492 | 
493 | def conv1d_out_shape(seq_len, kernel_size, stride=1, pad=0, dilation=1):
494 |     return ((seq_len + (2 * pad) - (dilation * (kernel_size - 1)) - 1) // stride) + 1  # nn.Conv1d's L_out formula
495 | 
496 | 
497 | class ConvBlock(nn.Module):
498 |     def __init__(self,
499 |                  in_channels: int,
500 |                  out_channels: int,
501 |                  kernel_size: int,
502 |                  dilation: int = 1,
503 |                  padding: str = "same",
504 |                  use_batchnorm: bool = False,
505 |                  use_layernorm: bool = False,
506 |                  norm_before_activation: bool = False,
507 |                  use_dropout: bool = False,
508 |                  dropout_rate: float = 0.2,
509 |                  activation: str = "relu"):
510 | 
511 |         super().__init__()
512 | 
513 |         if use_batchnorm and use_layernorm:
514 |             raise ValueError("Only one of use_batchnorm or use_layernorm can be set to True")
515 | 
516 |         self.use_batchnorm = use_batchnorm
517 |         self.use_layernorm = use_layernorm
518 |         self.norm_before_activation = norm_before_activation
519 |         self.use_dropout = use_dropout
520 | 
521 |         self.conv = nn.Conv1d(in_channels=in_channels,
522 |                               out_channels=out_channels,
523 |                               kernel_size=kernel_size,
524 |                               padding=padding,
525 |                               dilation=dilation)
526 | 
527 |         self.activation = get_activation_fn(activation, functional=False)
528 | 
529 |         if use_batchnorm:
530 |             self.norm = nn.BatchNorm1d(out_channels)
531 | 
532 |         if use_layernorm:
533 |             self.norm = nn.LayerNorm(out_channels)
534 | 
535 |         if use_dropout:
536 |             self.dropout = nn.Dropout(p=dropout_rate)
537 | 
538 |     def forward(self, x, **kwargs):
539 |         x = self.conv(x)
540 | 
541 |         # norm can be before or after activation, using flag
542 |         if self.use_batchnorm and self.norm_before_activation:
543 |             x = self.norm(x)
544 |         elif self.use_layernorm and self.norm_before_activation:
545 |             x = self.norm(x.transpose(1, 2)).transpose(1, 2)
546 | 
547 |         x = self.activation(x)
548 | 
549 |         # batchnorm being applied after activation, there is some discussion on this online
550 |         if self.use_batchnorm and not self.norm_before_activation:
551 |             x = self.norm(x)
552 |         elif self.use_layernorm and not self.norm_before_activation:
553 |             x = self.norm(x.transpose(1, 2)).transpose(1, 2)
554 | 
555 |         # dropout being applied after batchnorm, there is some discussion on this online
556 |         if self.use_dropout:
557 |             x = self.dropout(x)
558 | 
559 |         return x
560 | 
561 | 
562 | class ConvModel2(nn.Module):
563 |     """ convolutional source model that supports padded inputs, pooling, etc """
564 | 
565 |     @staticmethod
566 |     def add_model_specific_args(parent_parser):
567 |         parser = ArgumentParser(parents=[parent_parser], add_help=False)
568 |         parser.add_argument('--use_embedding', action="store_true", default=False)
569 |         parser.add_argument('--embedding_len', type=int, default=128)
570 | 
571 |         parser.add_argument('--num_conv_layers', type=int, default=1)
572 |         parser.add_argument('--kernel_sizes', type=int, nargs="+", default=[7])
573 |         parser.add_argument('--out_channels', type=int, nargs="+", default=[128])
574 |         parser.add_argument('--dilations', type=int, nargs="+", default=[1])
575 |         parser.add_argument('--padding', type=str, default="valid", choices=["valid", "same"])
576 |         parser.add_argument('--use_conv_layer_norm', action="store_true", default=False)
577 |         parser.add_argument('--conv_layer_norm_before_activation', action="store_true", default=False)
578 |         parser.add_argument('--use_conv_layer_dropout', action="store_true", default=False)
579 | 
parser.add_argument('--conv_layer_dropout_rate', type=float, default=0.2) 580 | 581 | parser.add_argument('--global_average_pooling', action="store_true", default=False) 582 | 583 | parser.add_argument('--use_task_specific_layers', action="store_true", default=False) 584 | parser.add_argument('--task_specific_hidden_nodes', type=int, default=64) 585 | parser.add_argument('--use_final_hidden_layer', action="store_true", default=False) 586 | parser.add_argument('--final_hidden_size', type=int, default=64) 587 | parser.add_argument('--use_final_hidden_layer_norm', action="store_true", default=False) 588 | parser.add_argument('--final_hidden_layer_norm_before_activation', action="store_true", default=False) 589 | parser.add_argument('--use_final_hidden_layer_dropout', action="store_true", default=False) 590 | parser.add_argument('--final_hidden_layer_dropout_rate', type=float, default=0.2) 591 | 592 | parser.add_argument('--activation', type=str, default="relu", 593 | help="activation function used for all activations in the network") 594 | 595 | return parser 596 | 597 | def __init__(self, 598 | # data 599 | num_tasks: int, 600 | aa_seq_len: int, 601 | aa_encoding_len: int, 602 | num_tokens: int, 603 | # convolutional model args 604 | use_embedding: bool = False, 605 | embedding_len: int = 64, 606 | num_conv_layers: int = 1, 607 | kernel_sizes: List[int] = (7,), 608 | out_channels: List[int] = (128,), 609 | dilations: List[int] = (1,), 610 | padding: str = "valid", 611 | use_conv_layer_norm: bool = False, 612 | conv_layer_norm_before_activation: bool = False, 613 | use_conv_layer_dropout: bool = False, 614 | conv_layer_dropout_rate: float = 0.2, 615 | # pooling 616 | global_average_pooling: bool = True, 617 | # prediction layers 618 | use_task_specific_layers: bool = False, 619 | task_specific_hidden_nodes: int = 64, 620 | use_final_hidden_layer: bool = False, 621 | final_hidden_size: int = 64, 622 | use_final_hidden_layer_norm: bool = False, 623 | final_hidden_layer_norm_before_activation: bool = False, 624 | use_final_hidden_layer_dropout: bool = False, 625 | final_hidden_layer_dropout_rate: float = 0.2, 626 | # activation function 627 | activation: str = "relu", 628 | *args, **kwargs): 629 | 630 | super(ConvModel2, self).__init__() 631 | 632 | # build up the layers 633 | layers = collections.OrderedDict() 634 | 635 | # amino acid embedding 636 | if use_embedding: 637 | layers["embedder"] = ScaledEmbedding(num_embeddings=num_tokens, embedding_dim=embedding_len, scale=False) 638 | 639 | # transpose the input to match PyTorch's expected format 640 | layers["transpose"] = Transpose(dims=(1, 2)) 641 | 642 | # build up the convolutional layers 643 | for layer_num in range(num_conv_layers): 644 | # determine the number of input channels for the first convolutional layer 645 | if layer_num == 0 and use_embedding: 646 | # for the first convolutional layer, the in_channels is the embedding_len 647 | in_channels = embedding_len 648 | elif layer_num == 0 and not use_embedding: 649 | # for the first convolutional layer, the in_channels is the aa_encoding_len 650 | in_channels = aa_encoding_len 651 | else: 652 | in_channels = out_channels[layer_num - 1] 653 | 654 | layers[f"conv{layer_num}"] = ConvBlock(in_channels=in_channels, 655 | out_channels=out_channels[layer_num], 656 | kernel_size=kernel_sizes[layer_num], 657 | dilation=dilations[layer_num], 658 | padding=padding, 659 | use_batchnorm=False, 660 | use_layernorm=use_conv_layer_norm, 661 | norm_before_activation=conv_layer_norm_before_activation, 
662 | use_dropout=use_conv_layer_dropout, 663 | dropout_rate=conv_layer_dropout_rate, 664 | activation=activation) 665 | 666 | # handle transition from convolutional layers to fully connected layer 667 | # either use global average pooling or flatten 668 | # take into consideration whether we are using valid or same padding 669 | if global_average_pooling: 670 | # global average pooling (mean across the seq len dimension) 671 | # the seq len dimensions is the last dimension (batch_size, num_filters, seq_len) 672 | layers["avg_pooling"] = GlobalAveragePooling(dim=-1) 673 | # the prediction layers will take num_filters input features 674 | pred_layer_input_features = out_channels[-1] 675 | 676 | else: 677 | # no global average pooling. flatten instead. 678 | layers["flatten"] = nn.Flatten() 679 | # calculate the final output len of the convolutional layers 680 | # and the number of input features for the prediction layers 681 | if padding == "valid": 682 | # valid padding (aka no padding) results in shrinking length in progressive layers 683 | conv_out_len = conv1d_out_shape(aa_seq_len, kernel_size=kernel_sizes[0], dilation=dilations[0]) 684 | for layer_num in range(1, num_conv_layers): 685 | conv_out_len = conv1d_out_shape(conv_out_len, 686 | kernel_size=kernel_sizes[layer_num], 687 | dilation=dilations[layer_num]) 688 | pred_layer_input_features = conv_out_len * out_channels[-1] 689 | else: 690 | # padding == "same" 691 | pred_layer_input_features = aa_seq_len * out_channels[-1] 692 | 693 | # prediction layer 694 | if use_task_specific_layers: 695 | layers["prediction"] = TaskSpecificPredictionLayers(num_tasks=num_tasks, 696 | in_features=pred_layer_input_features, 697 | num_hidden_nodes=task_specific_hidden_nodes, 698 | activation=activation) 699 | 700 | # final hidden layer (with potential additional dropout) 701 | elif use_final_hidden_layer: 702 | layers["fc1"] = FCBlock(in_features=pred_layer_input_features, 703 | num_hidden_nodes=final_hidden_size, 704 | use_batchnorm=False, 705 | use_layernorm=use_final_hidden_layer_norm, 706 | norm_before_activation=final_hidden_layer_norm_before_activation, 707 | use_dropout=use_final_hidden_layer_dropout, 708 | dropout_rate=final_hidden_layer_dropout_rate, 709 | activation=activation) 710 | layers["prediction"] = nn.Linear(in_features=final_hidden_size, out_features=num_tasks) 711 | 712 | else: 713 | layers["prediction"] = nn.Linear(in_features=pred_layer_input_features, out_features=num_tasks) 714 | 715 | self.model = nn.Sequential(layers) 716 | 717 | def forward(self, x, **kwargs): 718 | output = self.model(x) 719 | return output 720 | 721 | 722 | class ConvModel(nn.Module): 723 | """ a convolutional network with convolutional layers followed by a fully connected layer """ 724 | 725 | @staticmethod 726 | def add_model_specific_args(parent_parser): 727 | parser = ArgumentParser(parents=[parent_parser], add_help=False) 728 | parser.add_argument('--num_conv_layers', type=int, default=1) 729 | parser.add_argument('--kernel_sizes', type=int, nargs="+", default=[7]) 730 | parser.add_argument('--out_channels', type=int, nargs="+", default=[128]) 731 | parser.add_argument('--padding', type=str, default="valid", choices=["valid", "same"]) 732 | parser.add_argument('--use_final_hidden_layer', action="store_true", 733 | help="whether to use a final hidden layer") 734 | parser.add_argument('--final_hidden_size', type=int, default=128, 735 | help="number of nodes in the final hidden layer") 736 | parser.add_argument('--use_dropout', action="store_true", 737 
| help="whether to use dropout in the final hidden layer") 738 | parser.add_argument('--dropout_rate', type=float, default=0.2, 739 | help="dropout rate in the final hidden layer") 740 | parser.add_argument('--use_task_specific_layers', action="store_true", default=False) 741 | parser.add_argument('--task_specific_hidden_nodes', type=int, default=64) 742 | return parser 743 | 744 | def __init__(self, 745 | num_tasks: int, 746 | aa_seq_len: int, 747 | aa_encoding_len: int, 748 | num_conv_layers: int = 1, 749 | kernel_sizes: List[int] = (7,), 750 | out_channels: List[int] = (128,), 751 | padding: str = "valid", 752 | use_final_hidden_layer: bool = True, 753 | final_hidden_size: int = 128, 754 | use_dropout: bool = False, 755 | dropout_rate: float = 0.2, 756 | use_task_specific_layers: bool = False, 757 | task_specific_hidden_nodes: int = 64, 758 | *args, **kwargs): 759 | 760 | super(ConvModel, self).__init__() 761 | 762 | # set up the model as a Sequential block (less to do in forward()) 763 | layers = collections.OrderedDict() 764 | 765 | layers["transpose"] = Transpose(dims=(1, 2)) 766 | 767 | for layer_num in range(num_conv_layers): 768 | # for the first convolutional layer, the in_channels is the feature_len 769 | in_channels = aa_encoding_len if layer_num == 0 else out_channels[layer_num - 1] 770 | 771 | layers["conv{}".format(layer_num)] = nn.Sequential( 772 | nn.Conv1d(in_channels=in_channels, 773 | out_channels=out_channels[layer_num], 774 | kernel_size=kernel_sizes[layer_num], 775 | padding=padding), 776 | nn.ReLU() 777 | ) 778 | 779 | layers["flatten"] = nn.Flatten() 780 | 781 | # calculate the final output len of the convolutional layers 782 | # and the number of input features for the prediction layers 783 | if padding == "valid": 784 | # valid padding (aka no padding) results in shrinking length in progressive layers 785 | conv_out_len = conv1d_out_shape(aa_seq_len, kernel_size=kernel_sizes[0]) 786 | for layer_num in range(1, num_conv_layers): 787 | conv_out_len = conv1d_out_shape(conv_out_len, kernel_size=kernel_sizes[layer_num]) 788 | next_dim = conv_out_len * out_channels[-1] 789 | elif padding == "same": 790 | next_dim = aa_seq_len * out_channels[-1] 791 | else: 792 | raise ValueError("unexpected value for padding: {}".format(padding)) 793 | 794 | # final hidden layer (with potential additional dropout) 795 | if use_final_hidden_layer: 796 | layers["fc1"] = FCBlock(in_features=next_dim, 797 | num_hidden_nodes=final_hidden_size, 798 | use_batchnorm=False, 799 | use_dropout=use_dropout, 800 | dropout_rate=dropout_rate) 801 | next_dim = final_hidden_size 802 | 803 | # final prediction layer 804 | # either task specific nonlinear layers or a single linear layer 805 | if use_task_specific_layers: 806 | layers["prediction"] = TaskSpecificPredictionLayers(num_tasks=num_tasks, 807 | in_features=next_dim, 808 | num_hidden_nodes=task_specific_hidden_nodes) 809 | else: 810 | layers["prediction"] = nn.Linear(in_features=next_dim, out_features=num_tasks) 811 | 812 | self.model = nn.Sequential(layers) 813 | 814 | def forward(self, x, **kwargs): 815 | output = self.model(x) 816 | return output 817 | 818 | 819 | class FCModel(nn.Module): 820 | 821 | @staticmethod 822 | def add_model_specific_args(parent_parser): 823 | parser = ArgumentParser(parents=[parent_parser], add_help=False) 824 | parser.add_argument('--num_layers', type=int, default=1) 825 | parser.add_argument('--num_hidden', nargs="+", type=int, default=[128]) 826 | parser.add_argument('--use_batchnorm', action="store_true", 
default=False) 827 | parser.add_argument('--use_layernorm', action="store_true", default=False) 828 | parser.add_argument('--norm_before_activation', action="store_true", default=False) 829 | parser.add_argument('--use_dropout', action="store_true", default=False) 830 | parser.add_argument('--dropout_rate', type=float, default=0.2) 831 | return parser 832 | 833 | def __init__(self, 834 | num_tasks: int, 835 | seq_encoding_len: int, 836 | num_layers: int = 1, 837 | num_hidden: List[int] = (128,), 838 | use_batchnorm: bool = False, 839 | use_layernorm: bool = False, 840 | norm_before_activation: bool = False, 841 | use_dropout: bool = False, 842 | dropout_rate: float = 0.2, 843 | activation: str = "relu", 844 | *args, **kwargs): 845 | super().__init__() 846 | 847 | # set up the model as a Sequential block (less to do in forward()) 848 | layers = collections.OrderedDict() 849 | 850 | # flatten inputs as this is all fully connected 851 | layers["flatten"] = nn.Flatten() 852 | 853 | # build up the variable number of hidden layers (fully connected + ReLU + dropout (if set)) 854 | for layer_num in range(num_layers): 855 | # for the first layer (layer_num == 0), in_features is determined by given input 856 | # for subsequent layers, the in_features is the previous layer's num_hidden 857 | in_features = seq_encoding_len if layer_num == 0 else num_hidden[layer_num - 1] 858 | 859 | layers["fc{}".format(layer_num)] = FCBlock(in_features=in_features, 860 | num_hidden_nodes=num_hidden[layer_num], 861 | use_batchnorm=use_batchnorm, 862 | use_layernorm=use_layernorm, 863 | norm_before_activation=norm_before_activation, 864 | use_dropout=use_dropout, 865 | dropout_rate=dropout_rate, 866 | activation=activation) 867 | 868 | # finally, the linear output layer 869 | in_features = num_hidden[-1] if num_layers > 0 else seq_encoding_len 870 | layers["output"] = nn.Linear(in_features=in_features, out_features=num_tasks) 871 | 872 | self.model = nn.Sequential(layers) 873 | 874 | def forward(self, x, **kwargs): 875 | output = self.model(x) 876 | return output 877 | 878 | 879 | class LRModel(nn.Module): 880 | """ a simple linear model """ 881 | 882 | def __init__(self, num_tasks, seq_encoding_len, *args, **kwargs): 883 | super().__init__() 884 | 885 | self.model = nn.Sequential( 886 | nn.Flatten(), 887 | nn.Linear(seq_encoding_len, out_features=num_tasks)) 888 | 889 | def forward(self, x, **kwargs): 890 | output = self.model(x) 891 | return output 892 | 893 | 894 | class TransferModel(nn.Module): 895 | """ transfer learning model """ 896 | 897 | @staticmethod 898 | def add_model_specific_args(parent_parser): 899 | 900 | def none_or_int(value: str): 901 | return None if value.lower() == "none" else int(value) 902 | 903 | p = ArgumentParser(parents=[parent_parser], add_help=False) 904 | 905 | # for model set up 906 | p.add_argument('--pretrained_ckpt_path', type=str, default=None) 907 | 908 | # where to cut off the backbone 909 | p.add_argument("--backbone_cutoff", type=none_or_int, default=-1, 910 | help="where to cut off the backbone. can be a negative int, indexing back from " 911 | "pretrained_model.model.model. a value of -1 would chop off the backbone prediction head. " 912 | "a value of -2 chops the prediction head and FC layer. a value of -3 chops" 913 | "the above, as well as the global average pooling layer. 
all depends on architecture.") 914 | 915 | p.add_argument("--pred_layer_input_features", type=int, default=None, 916 | help="if None, number of features will be determined based on backbone_cutoff and standard " 917 | "architecture. otherwise, specify the number of input features for the prediction layer") 918 | 919 | # top net args 920 | p.add_argument("--top_net_type", type=str, default="linear", choices=["linear", "nonlinear", "sklearn"]) 921 | p.add_argument("--top_net_hidden_nodes", type=int, default=256) 922 | p.add_argument("--top_net_use_batchnorm", action="store_true") 923 | p.add_argument("--top_net_use_dropout", action="store_true") 924 | p.add_argument("--top_net_dropout_rate", type=float, default=0.1) 925 | 926 | return p 927 | 928 | def __init__(self, 929 | # pretrained model 930 | pretrained_ckpt_path: Optional[str] = None, 931 | pretrained_hparams: Optional[dict] = None, 932 | backbone_cutoff: Optional[int] = -1, 933 | # top net 934 | pred_layer_input_features: Optional[int] = None, 935 | top_net_type: str = "linear", 936 | top_net_hidden_nodes: int = 256, 937 | top_net_use_batchnorm: bool = False, 938 | top_net_use_dropout: bool = False, 939 | top_net_dropout_rate: float = 0.1, 940 | *args, **kwargs): 941 | 942 | super().__init__() 943 | 944 | # error checking: if pretrained_ckpt_path is None, then pretrained_hparams must be specified 945 | if pretrained_ckpt_path is None and pretrained_hparams is None: 946 | raise ValueError("Either pretrained_ckpt_path or pretrained_hparams must be specified") 947 | 948 | # note: pdb_fns is loaded from transfer model arguments rather than original source model hparams 949 | # if pdb_fns is specified as a kwarg, pass it on for structure-based RPE 950 | # otherwise, can just set pdb_fns to None, and structure-based RPE will handle new PDBs on the fly 951 | pdb_fns = kwargs["pdb_fns"] if "pdb_fns" in kwargs else None 952 | 953 | # generate a fresh backbone using pretrained_hparams if specified 954 | # otherwise load the backbone from the pretrained checkpoint 955 | # we prioritize pretrained_hparams over pretrained_ckpt_path because 956 | # pretrained_hparams will only really be specified if we are loading from a DMSTask checkpoint 957 | # meaning the TransferModel has already been fine-tuned on DMS data, and we are likely loading 958 | # weights from that finetuning (including weights for the backbone) 959 | # whereas if pretrained_hparams is not specified but pretrained_ckpt_path is, then we are 960 | # likely finetuning the TransferModel for the first time, and we need the pretrained weights for the 961 | # backbone from the RosettaTask checkpoint 962 | if pretrained_hparams is not None: 963 | # pretrained_hparams will only be specified if we are loading from a DMSTask checkpoint 964 | pretrained_hparams["pdb_fns"] = pdb_fns 965 | pretrained_model = Model[pretrained_hparams["model_name"]].cls(**pretrained_hparams) 966 | self.pretrained_hparams = pretrained_hparams 967 | else: 968 | # not supported in metl-pretrained 969 | raise NotImplementedError("Loading pretrained weights from RosettaTask checkpoint not supported") 970 | 971 | layers = collections.OrderedDict() 972 | 973 | # set the backbone to all layers except the last layer (the pre-trained prediction layer) 974 | if backbone_cutoff is None: 975 | layers["backbone"] = SequentialWithArgs(*list(pretrained_model.model.children())) 976 | else: 977 | layers["backbone"] = SequentialWithArgs(*list(pretrained_model.model.children())[0:backbone_cutoff]) 978 | 979 | if top_net_type == 
"sklearn": 980 | # sklearn top not doesn't require any more layers, just return model for the repr layer 981 | self.model = SequentialWithArgs(layers) 982 | return 983 | 984 | # figure out dimensions of input into the prediction layer 985 | if pred_layer_input_features is None: 986 | # todo: can make this more robust by checking if the pretrained_mode.hparams for use_final_hidden_layer, 987 | # global_average_pooling, etc. then can determine what the layer will be based on backbone_cutoff. 988 | # currently, assumes that pretrained_model uses global average pooling and a final_hidden_layer 989 | if backbone_cutoff is None: 990 | # no backbone cutoff... use the full network (including tasks) as the backbone 991 | pred_layer_input_features = self.pretrained_hparams["num_tasks"] 992 | elif backbone_cutoff == -1: 993 | pred_layer_input_features = self.pretrained_hparams["final_hidden_size"] 994 | elif backbone_cutoff == -2: 995 | pred_layer_input_features = self.pretrained_hparams["embedding_len"] 996 | elif backbone_cutoff == -3: 997 | pred_layer_input_features = self.pretrained_hparams["embedding_len"] * kwargs["aa_seq_len"] 998 | else: 999 | raise ValueError("can't automatically determine pred_layer_input_features for given backbone_cutoff") 1000 | 1001 | layers["flatten"] = nn.Flatten(start_dim=1) 1002 | 1003 | # create a new prediction layer on top of the backbone 1004 | if top_net_type == "linear": 1005 | # linear layer for prediction 1006 | layers["prediction"] = nn.Linear(in_features=pred_layer_input_features, out_features=1) 1007 | elif top_net_type == "nonlinear": 1008 | # fully connected with hidden layer 1009 | fc_block = FCBlock(in_features=pred_layer_input_features, 1010 | num_hidden_nodes=top_net_hidden_nodes, 1011 | use_batchnorm=top_net_use_batchnorm, 1012 | use_dropout=top_net_use_dropout, 1013 | dropout_rate=top_net_dropout_rate) 1014 | 1015 | pred_layer = nn.Linear(in_features=top_net_hidden_nodes, out_features=1) 1016 | 1017 | layers["prediction"] = SequentialWithArgs(fc_block, pred_layer) 1018 | else: 1019 | raise ValueError("Unexpected type of top net layer: {}".format(top_net_type)) 1020 | 1021 | self.model = SequentialWithArgs(layers) 1022 | 1023 | def forward(self, x, **kwargs): 1024 | return self.model(x, **kwargs) 1025 | 1026 | 1027 | def get_activation_fn(activation, functional=True): 1028 | if activation == "relu": 1029 | return F.relu if functional else nn.ReLU() 1030 | elif activation == "gelu": 1031 | return F.gelu if functional else nn.GELU() 1032 | elif activation == "silo" or activation == "swish": 1033 | return F.silu if functional else nn.SiLU() 1034 | elif activation == "leaky_relu" or activation == "lrelu": 1035 | return F.leaky_relu if functional else nn.LeakyReLU() 1036 | else: 1037 | raise RuntimeError("unknown activation: {}".format(activation)) 1038 | 1039 | 1040 | class Model(enum.Enum): 1041 | def __new__(cls, *args, **kwds): 1042 | value = len(cls.__members__) + 1 1043 | obj = object.__new__(cls) 1044 | obj._value_ = value 1045 | return obj 1046 | 1047 | def __init__(self, cls, transfer_model): 1048 | self.cls = cls 1049 | self.transfer_model = transfer_model 1050 | 1051 | linear = LRModel, False 1052 | fully_connected = FCModel, False 1053 | cnn = ConvModel, False 1054 | cnn2 = ConvModel2, False 1055 | transformer_encoder = AttnModel, False 1056 | transfer_model = TransferModel, True 1057 | 1058 | 1059 | def main(): 1060 | pass 1061 | 1062 | 1063 | if __name__ == "__main__": 1064 | main() 1065 | 
-------------------------------------------------------------------------------- /metl/relative_attention.py: -------------------------------------------------------------------------------- 1 | """ implementation of transformer encoder with relative attention 2 | references: 3 | - https://medium.com/@_init_/how-self-attention-with-relative-position-representations-works-28173b8c245a 4 | - https://pytorch.org/docs/stable/_modules/torch/nn/modules/transformer.html#TransformerEncoderLayer 5 | - https://github.com/evelinehong/Transformer_Relative_Position_PyTorch/blob/master/relative_position.py 6 | - https://github.com/jiezouguihuafu/ClassicalModelreproduced/blob/main/Transformer/transfor_rpe.py 7 | """ 8 | 9 | import copy 10 | from os.path import basename, dirname, join, isfile 11 | from typing import Optional, Union 12 | 13 | import torch 14 | import torch.nn as nn 15 | import torch.nn.functional as F 16 | from torch import Tensor 17 | from torch.nn import Linear, Dropout, LayerNorm 18 | import time 19 | import networkx as nx 20 | 21 | import metl.structure as structure 22 | import metl.models as models 23 | 24 | 25 | class RelativePosition3D(nn.Module): 26 | """ Contact map-based relative position embeddings """ 27 | 28 | # need to compute a bucket_mtx for each structure 29 | # need to know which bucket_mtx to use when grabbing the embeddings in forward() 30 | # - on init, get a list of all PDB files we will be using 31 | # - use a dictionary to store PDB files --> bucket_mtxs 32 | # - forward() gets a new arg: the pdb file, which indexes into the dictionary to grab the right bucket_mtx 33 | def __init__(self, 34 | embedding_len: int, 35 | contact_threshold: int, 36 | clipping_threshold: int, 37 | pdb_fns: Optional[Union[str, list, tuple]] = None, 38 | default_pdb_dir: str = "data/pdb_files"): 39 | 40 | # preferably, pdb_fns contains full paths to the PDBs, but if just the PDB filename is given 41 | # then it defaults to the path data/pdb_files/ 42 | super().__init__() 43 | self.embedding_len = embedding_len 44 | self.clipping_threshold = clipping_threshold 45 | self.contact_threshold = contact_threshold 46 | self.default_pdb_dir = default_pdb_dir 47 | 48 | # dummy buffer for getting correct device for on-the-fly bucket matrix generation 49 | self.register_buffer("dummy_buffer", torch.empty(0), persistent=False) 50 | 51 | # for 3D-based positions, the number of embeddings is generally the number of buckets 52 | # for contact map-based distances, that is clipping_threshold + 1 53 | num_embeddings = clipping_threshold + 1 54 | 55 | # this is the embedding lookup table E_r 56 | self.embeddings_table = nn.Embedding(num_embeddings, embedding_len) 57 | 58 | # set up pdb_fns that were passed in on init (can also be set up during runtime in forward()) 59 | # todo: i'm using a hacky workaround to move the bucket_mtxs to the correct device 60 | # i tried to make it more efficient by registering bucket matrices as buffers, but i was 61 | # having problems with DDP syncing the buffers across processes 62 | self.bucket_mtxs = {} 63 | self.bucket_mtxs_device = self.dummy_buffer.device 64 | self._init_pdbs(pdb_fns) 65 | 66 | def forward(self, pdb_fn): 67 | # compute matrix R by grabbing the embeddings from the embeddings lookup table 68 | embeddings = self.embeddings_table(self._get_bucket_mtx(pdb_fn)) 69 | return embeddings 70 | 71 | # def _get_bucket_mtx(self, pdb_fn): 72 | # """ retrieve a bucket matrix given the pdb_fn. 
73 | # if the pdb_fn was provided at init or has already been computed, then the bucket matrix will be 74 | # retrieved from the object buffer. if the bucket matrix has not been computed yet, it will be here """ 75 | # pdb_attr = self._pdb_key(pdb_fn) 76 | # if hasattr(self, pdb_attr): 77 | # return getattr(self, pdb_attr) 78 | # else: 79 | # # encountering a new PDB at runtime... process it 80 | # # todo: if there's a new PDB at runtime, it will be initialized separately in each instance 81 | # # of RelativePosition3D, for each layer. It would be more efficient to have a global 82 | # # bucket_mtx registry... perhaps in the RelativeTransformerEncoder class, that can be passed through 83 | # self._init_pdb(pdb_fn) 84 | # return getattr(self, pdb_attr) 85 | 86 | def _move_bucket_mtxs(self, device): 87 | for k, v in self.bucket_mtxs.items(): 88 | self.bucket_mtxs[k] = v.to(device) 89 | self.bucket_mtxs_device = device 90 | 91 | def _get_bucket_mtx(self, pdb_fn): 92 | """ retrieve a bucket matrix given the pdb_fn. 93 | if the pdb_fn was provided at init or has already been computed, then the bucket matrix will be 94 | retrieved from the bucket_mtxs dictionary. else, it will be computed now on-the-fly """ 95 | 96 | # ensure that all the bucket matrices are on the same device as the nn.Embedding 97 | if self.bucket_mtxs_device != self.dummy_buffer.device: 98 | self._move_bucket_mtxs(self.dummy_buffer.device) 99 | 100 | pdb_attr = self._pdb_key(pdb_fn) 101 | if pdb_attr in self.bucket_mtxs: 102 | return self.bucket_mtxs[pdb_attr] 103 | else: 104 | # encountering a new PDB at runtime... process it 105 | # todo: if there's a new PDB at runtime, it will be initialized separately in each instance 106 | # of RelativePosition3D, for each layer. It would be more efficient to have a global 107 | # bucket_mtx registry... 
perhaps in the RelativeTransformerEncoder class, that can be passed through
108 |             self._init_pdb(pdb_fn)
109 |             return self.bucket_mtxs[pdb_attr]
110 | 
111 |     # def _set_bucket_mtx(self, pdb_fn, bucket_mtx):
112 |     #     """ store a bucket matrix as a buffer """
113 |     #     # if PyTorch ever implements a BufferDict, we could use it here efficiently
114 |     #     # there is also BufferDict from https://botorch.org/api/_modules/botorch/utils/torch.html
115 |     #     # would just need to modify it to have an option for persistent=False
116 |     #     bucket_mtx = bucket_mtx.to(self.dummy_buffer.device)
117 |     #
118 |     #     self.register_buffer(self._pdb_key(pdb_fn), bucket_mtx, persistent=False)
119 | 
120 |     def _set_bucket_mtx(self, pdb_fn, bucket_mtx):
121 |         """ store a bucket matrix in the bucket dict """
122 | 
123 |         # move the bucket_mtx to the same device that the other bucket matrices are on
124 |         bucket_mtx = bucket_mtx.to(self.bucket_mtxs_device)
125 | 
126 |         self.bucket_mtxs[self._pdb_key(pdb_fn)] = bucket_mtx
127 | 
128 |     @staticmethod
129 |     def _pdb_key(pdb_fn):
130 |         """ return a unique key for the given pdb_fn, used to map unique PDBs """
131 |         # note this key does NOT currently support PDBs with the same basename but different paths
132 |         # assumes every PDB filename is in the format <pdb_name>.pdb
133 |         # should be compatible with being a class attribute, as it is used as a pytorch buffer name
134 |         # in the commented-out buffer-based implementation above
135 |         return f"pdb_{basename(pdb_fn).split('.')[0]}"
136 | 
137 |     def _init_pdbs(self, pdb_fns):
138 |         start = time.time()
139 | 
140 |         if pdb_fns is None:
141 |             # nothing to initialize if pdb_fns is None
142 |             return
143 | 
144 |         # make sure pdb_fns is a list
145 |         if not isinstance(pdb_fns, list) and not isinstance(pdb_fns, tuple):
146 |             pdb_fns = [pdb_fns]
147 | 
148 |         # init each pdb fn in the list
149 |         for pdb_fn in pdb_fns:
150 |             self._init_pdb(pdb_fn)
151 | 
152 |         print("Initialized PDB bucket matrices in: {:.3f}".format(time.time() - start))
153 | 
154 |     def _init_pdb(self, pdb_fn):
155 |         """ process a pdb file for use with structure-based relative attention """
156 |         # if pdb_fn is not a full path, default to the path data/pdb_files/
157 |         if dirname(pdb_fn) == "":
158 |             # handle the case where the pdb file is in the current working directory
159 |             # if there is a PDB file in the cwd.... then just use it as is. otherwise, append the default.
160 |             if not isfile(pdb_fn):
161 |                 pdb_fn = join(self.default_pdb_dir, pdb_fn)
162 | 
163 |         # create a structure graph from the pdb_fn and contact threshold
164 |         cbeta_mtx = structure.cbeta_distance_matrix(pdb_fn)
165 |         structure_graph = structure.dist_thresh_graph(cbeta_mtx, self.contact_threshold)
166 | 
167 |         # bucket_mtx indexes into the embedding lookup table to create the final distance matrix
168 |         bucket_mtx = self._compute_bucket_mtx(structure_graph)
169 | 
170 |         self._set_bucket_mtx(pdb_fn, bucket_mtx)
171 | 
172 |     def _compute_bucketed_neighbors(self, structure_graph, source_node):
173 |         """ gets the bucketed neighbors from the given source node and structure graph """
174 |         if self.clipping_threshold < 0:
175 |             raise ValueError("Clipping threshold must be >= 0")
176 | 
177 |         sspl = _inv_dict(nx.single_source_shortest_path_length(structure_graph, source_node))
178 | 
179 |         if self.clipping_threshold is not None:
180 |             num_buckets = 1 + self.clipping_threshold
181 |             sspl = _combine_d(sspl, self.clipping_threshold, num_buckets - 1)
182 | 
183 |         return sspl
184 | 
185 |     def _compute_bucket_mtx(self, structure_graph):
186 |         """ get the bucket_mtx for the given structure_graph
187 |             calls _compute_bucketed_neighbors for every node in the structure_graph """
188 |         num_residues = len(list(structure_graph))
189 | 
190 |         # index into the embedding lookup table to create the final distance matrix
191 |         bucket_mtx = torch.zeros(num_residues, num_residues, dtype=torch.long)
192 | 
193 |         for node_num in sorted(list(structure_graph)):
194 |             bucketed_neighbors = self._compute_bucketed_neighbors(structure_graph, node_num)
195 | 
196 |             for bucket_num, neighbors in bucketed_neighbors.items():
197 |                 bucket_mtx[node_num, neighbors] = bucket_num
198 | 
199 |         return bucket_mtx
200 | 
201 | 
202 | class RelativePosition(nn.Module):
203 |     """ creates the embedding lookup table E_r and computes R
204 |         note this is a plain nn.Module; rather than relying on pl.LightningModule's `self.device`,
205 |         it grabs the correct device from a registered dummy buffer when building the range vectors
206 |         in forward() """
207 | 
208 |     def __init__(self, embedding_len: int, clipping_threshold: int):
209 |         """
210 |         embedding_len: the length of the embedding, may be d_model, or d_model // num_heads for multihead
211 |         clipping_threshold: the maximum relative position, referred to as k by Shaw et al.
211 | """ 212 | super().__init__() 213 | self.embedding_len = embedding_len 214 | self.clipping_threshold = clipping_threshold 215 | # for sequence-based distances, the number of embeddings is 2*k+1, where k is the clipping threshold 216 | num_embeddings = 2 * clipping_threshold + 1 217 | 218 | # this is the embedding lookup table E_r 219 | self.embeddings_table = nn.Embedding(num_embeddings, embedding_len) 220 | 221 | # for getting the correct device for range vectors in forward 222 | self.register_buffer("dummy_buffer", torch.empty(0), persistent=False) 223 | 224 | def forward(self, length_q, length_k): 225 | # supports different length sequences, but in self-attention length_q and length_k are the same 226 | range_vec_q = torch.arange(length_q, device=self.dummy_buffer.device) 227 | range_vec_k = torch.arange(length_k, device=self.dummy_buffer.device) 228 | 229 | # this sets up the standard sequence-based distance matrix for relative positions 230 | # the current position is 0, positions to the right are +1, +2, etc, and to the left -1, -2, etc 231 | distance_mat = range_vec_k[None, :] - range_vec_q[:, None] 232 | distance_mat_clipped = torch.clamp(distance_mat, -self.clipping_threshold, self.clipping_threshold) 233 | 234 | # convert to indices, indexing into the embedding table 235 | final_mat = (distance_mat_clipped + self.clipping_threshold).long() 236 | 237 | # compute matrix R by grabbing the embeddings from the embedding lookup table 238 | embeddings = self.embeddings_table(final_mat) 239 | 240 | return embeddings 241 | 242 | 243 | class RelativeMultiHeadAttention(nn.Module): 244 | def __init__(self, embed_dim, num_heads, dropout, pos_encoding, clipping_threshold, contact_threshold, pdb_fns): 245 | """ 246 | Multi-head attention with relative position embeddings. Input data should be in batch_first format. 247 | :param embed_dim: aka d_model, aka hid_dim 248 | :param num_heads: number of heads 249 | :param dropout: how much dropout for scaled dot product attention 250 | 251 | :param pos_encoding: what type of positional encoding to use, relative or relative3D 252 | :param clipping_threshold: clipping threshold for relative position embedding 253 | :param contact_threshold: for relative_3D, the threshold in angstroms for the contact map 254 | :param pdb_fns: pdb file(s) to set up the relative position object 255 | 256 | """ 257 | super().__init__() 258 | 259 | assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads" 260 | 261 | # model dimensions 262 | self.embed_dim = embed_dim 263 | self.num_heads = num_heads 264 | self.head_dim = embed_dim // num_heads 265 | 266 | # pos encoding stuff 267 | self.pos_encoding = pos_encoding 268 | self.clipping_threshold = clipping_threshold 269 | self.contact_threshold = contact_threshold 270 | if pdb_fns is not None and not isinstance(pdb_fns, list): 271 | pdb_fns = [pdb_fns] 272 | self.pdb_fns = pdb_fns 273 | 274 | # relative position embeddings for use with keys and values 275 | # Shaw et al. uses relative position information for both keys and values 276 | # Huang et al. 
only uses it for the keys, which is probably enough 277 | if pos_encoding == "relative": 278 | self.relative_position_k = RelativePosition(self.head_dim, self.clipping_threshold) 279 | self.relative_position_v = RelativePosition(self.head_dim, self.clipping_threshold) 280 | elif pos_encoding == "relative_3D": 281 | self.relative_position_k = RelativePosition3D(self.head_dim, self.contact_threshold, 282 | self.clipping_threshold, self.pdb_fns) 283 | self.relative_position_v = RelativePosition3D(self.head_dim, self.contact_threshold, 284 | self.clipping_threshold, self.pdb_fns) 285 | else: 286 | raise ValueError("unrecognized pos_encoding: {}".format(pos_encoding)) 287 | 288 | # WQ, WK, and WV from attention is all you need 289 | # note these default to bias=True, same as PyTorch implementation 290 | self.q_proj = nn.Linear(embed_dim, embed_dim) 291 | self.k_proj = nn.Linear(embed_dim, embed_dim) 292 | self.v_proj = nn.Linear(embed_dim, embed_dim) 293 | 294 | # WO from attention is all you need 295 | # used for the final projection when computing multi-head attention 296 | # PyTorch uses NonDynamicallyQuantizableLinear instead of Linear to avoid triggering an obscure 297 | # error quantizing the model https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/linear.py#L122 298 | # todo: if quantizing the model, explore if the above is a concern for us 299 | self.out_proj = nn.Linear(embed_dim, embed_dim) 300 | 301 | # dropout for scaled dot product attention 302 | self.dropout = nn.Dropout(dropout) 303 | 304 | # scaling factor for scaled dot product attention 305 | scale = torch.sqrt(torch.FloatTensor([self.head_dim])) 306 | # persistent=False if you don't want to save it inside state_dict 307 | self.register_buffer('scale', scale) 308 | 309 | # toggles meant to be set directly by user 310 | self.need_weights = False 311 | self.average_attn_weights = True 312 | 313 | def _compute_attn_weights(self, query, key, len_q, len_k, batch_size, mask, pdb_fn): 314 | """ computes the attention weights (a "compatability function" of queries with corresponding keys) """ 315 | 316 | # calculate the first term in the numerator attn1, which is Q*K 317 | # todo: pytorch reshapes q,k and v to 3 dimensions (similar to how r_q2 is below) 318 | # is that functionally equivalent to what we're doing? is their way faster? 
319 |         # r_q1 = [batch_size, num_heads, len_q, head_dim]
320 |         r_q1 = query.view(batch_size, len_q, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
321 |         # todo: we could directly permute r_k1 to [batch_size, num_heads, head_dim, len_k]
322 |         #   to make it compatible for matrix multiplication with r_q1, instead of 2-step approach
323 |         # r_k1 = [batch_size, num_heads, len_k, head_dim]
324 |         r_k1 = key.view(batch_size, len_k, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
325 |         # attn1 = [batch_size, num_heads, len_q, len_k]
326 |         attn1 = torch.matmul(r_q1, r_k1.permute(0, 1, 3, 2))
327 | 
328 |         # calculate the second term in the numerator attn2, which is Q*R
329 |         # r_q2 = [query_len, batch_size * num_heads, head_dim]
330 |         r_q2 = query.permute(1, 0, 2).contiguous().view(len_q, batch_size * self.num_heads, self.head_dim)
331 | 
332 |         # todo: support multiple different PDB base structures per batch
333 |         #   one option:
334 |         #   - require batches to be all the same protein
335 |         #   - add argument to forward() to accept the PDB file for the protein in the batch
336 |         #   - then we just pass in the PDB file to relative position's forward()
337 |         #   to support multiple different structures per batch:
338 |         #   - add argument to forward() to accept PDB files, one for each item in batch
339 |         #   - make corresponding changes in relative_position object to return R for each structure
340 |         #   - note: if there are a lot of different structures, and the sequence lengths are long,
341 |         #     this could be memory prohibitive because R (rel_pos_k) can take up a lot of mem for long seqs
342 |         #   - adjust the attn2 calculation to factor in the multiple different R matrices.
343 |         #     the way to do this might have to be to do multiple matmuls, one for each structure.
344 |         #     basically, would split up r_q2 into several matrices grouped by structure, and then
345 |         #     multiply with corresponding R, then combine back into the exact same order of the original r_q2
346 |         #     note: this may be computationally intensive (splitting, more matrix multiplies, joining)
347 |         #     another option would be to create views(?), repeating the different Rs so we can do
348 |         #     a matrix multiply directly with r_q2
349 |         #   - would shapes be affected if there was padding in the queries, keys, values?
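# worked example of the sequence-based R lookup used below, assuming a hypothetical
# clipping_threshold=2 and len_q = len_k = 5 (illustrative values only):
#   distance_mat          distance_mat_clipped     final_mat (embedding indices)
#   [[ 0  1  2  3  4]     [[ 0  1  2  2  2]        [[2 3 4 4 4]
#    [-1  0  1  2  3]      [-1  0  1  2  2]         [1 2 3 4 4]
#    [-2 -1  0  1  2]      [-2 -1  0  1  2]         [0 1 2 3 4]
#    [-3 -2 -1  0  1]      [-2 -2 -1  0  1]         [0 0 1 2 3]
#    [-4 -3 -2 -1  0]]     [-2 -2 -2 -1  0]]        [0 0 0 1 2]]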
350 | 
351 |         if self.pos_encoding == "relative":
352 |             # rel_pos_k = [len_q, len_k, head_dim]
353 |             rel_pos_k = self.relative_position_k(len_q, len_k)
354 |         elif self.pos_encoding == "relative_3D":
355 |             # rel_pos_k = [sequence length (from PDB structure), head_dim]
356 |             rel_pos_k = self.relative_position_k(pdb_fn)
357 |         else:
358 |             raise ValueError("unrecognized pos_encoding: {}".format(self.pos_encoding))
359 | 
360 |         # the matmul basically computes the dot product between each input position's query vector and
361 |         # its corresponding relative position embeddings across all input sequences in the heads and batch
362 |         # attn2 = [batch_size * num_heads, len_q, len_k]
363 |         attn2 = torch.matmul(r_q2, rel_pos_k.transpose(1, 2)).transpose(0, 1)
364 |         # attn2 = [batch_size, num_heads, len_q, len_k]
365 |         attn2 = attn2.contiguous().view(batch_size, self.num_heads, len_q, len_k)
366 | 
367 |         # calculate attention weights
368 |         attn_weights = (attn1 + attn2) / self.scale
369 | 
370 |         # apply mask if given
371 |         if mask is not None:
372 |             # todo: pytorch uses float("-inf") instead of -1e10
373 |             attn_weights = attn_weights.masked_fill(mask == 0, -1e10)
374 | 
375 |         # softmax gives us the final attention weights
376 |         attn_weights = torch.softmax(attn_weights, dim=-1)
377 |         # attn_weights = [batch_size, num_heads, len_q, len_k]
378 |         attn_weights = self.dropout(attn_weights)
379 | 
380 |         return attn_weights
381 | 
382 |     def _compute_avg_val(self, value, len_q, len_k, len_v, attn_weights, batch_size, pdb_fn):
383 |         # todo: add option to not factor in relative position embeddings in value calculation
384 |         # calculate the first term, the attn*values
385 |         # r_v1 = [batch_size, num_heads, len_v, head_dim]
386 |         r_v1 = value.view(batch_size, len_v, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
387 |         # avg1 = [batch_size, num_heads, len_q, head_dim]
388 |         avg1 = torch.matmul(attn_weights, r_v1)
389 | 
390 |         # calculate the second term, the attn*R
391 |         # similar to how relative embeddings are factored in the attention weights calculation
392 |         if self.pos_encoding == "relative":
393 |             # rel_pos_v = [query_len, value_len, head_dim]
394 |             rel_pos_v = self.relative_position_v(len_q, len_v)
395 |         elif self.pos_encoding == "relative_3D":
396 |             # rel_pos_v = [sequence length (from PDB structure), head_dim]
397 |             rel_pos_v = self.relative_position_v(pdb_fn)
398 |         else:
399 |             raise ValueError("unrecognized pos_encoding: {}".format(self.pos_encoding))
400 | 
401 |         # r_attn_weights = [len_q, batch_size * num_heads, len_k] (len_k == len_v for attention)
402 |         r_attn_weights = attn_weights.permute(2, 0, 1, 3).contiguous().view(len_q, batch_size * self.num_heads, len_k)
403 |         avg2 = torch.matmul(r_attn_weights, rel_pos_v)
404 |         # avg2 = [batch_size, num_heads, len_q, head_dim]
405 |         avg2 = avg2.transpose(0, 1).contiguous().view(batch_size, self.num_heads, len_q, self.head_dim)
406 | 
407 |         # calculate avg value
408 |         x = avg1 + avg2  # [batch_size, num_heads, len_q, head_dim]
409 |         x = x.permute(0, 2, 1, 3).contiguous()  # [batch_size, len_q, num_heads, head_dim]
410 |         # x = [batch_size, len_q, embed_dim]
411 |         x = x.view(batch_size, len_q, self.embed_dim)
412 | 
413 |         return x
414 | 
415 |     def forward(self, query, key, value, pdb_fn=None, mask=None):
416 |         # query = [batch_size, q_len, embed_dim]
417 |         # key = [batch_size, k_len, embed_dim]
418 |         # value = [batch_size, v_len, embed_dim]
419 |         batch_size = query.shape[0]
420 |         len_k, len_q, len_v = (key.shape[1], query.shape[1], value.shape[1])
421 | 
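# shape walkthrough for a hypothetical self-attention call (batch_size=2, len=5,
# embed_dim=64, num_heads=8, so head_dim=8 -- illustrative values only):
#   query/key/value: [2, 5, 64] -> attn_weights: [2, 8, 5, 5] -> attn_output: [2, 5, 64]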
422 |         # in projection (multiply inputs by WQ, WK, WV)
423 |         query = self.q_proj(query)
424 |         key = self.k_proj(key)
425 |         value = self.v_proj(value)
426 | 
427 |         # first compute the attention weights, then multiply with values
428 |         # attn_weights = [batch_size, num_heads, len_q, len_k]
429 |         attn_weights = self._compute_attn_weights(query, key, len_q, len_k, batch_size, mask, pdb_fn)
430 | 
431 |         # take weighted average of values (weighted by attention weights)
432 |         attn_output = self._compute_avg_val(value, len_q, len_k, len_v, attn_weights, batch_size, pdb_fn)
433 | 
434 |         # output projection
435 |         # attn_output = [batch_size, len_q, embed_dim]
436 |         attn_output = self.out_proj(attn_output)
437 | 
438 |         if self.need_weights:
439 |             # return attention weights in addition to attention
440 |             # average the weights over the heads (to get overall attention)
441 |             # attn_weights = [batch_size, len_q, len_k]
442 |             if self.average_attn_weights:
443 |                 attn_weights = attn_weights.sum(dim=1) / self.num_heads
444 |             return {"attn_output": attn_output, "attn_weights": attn_weights}
445 |         else:
446 |             return attn_output
447 | 
448 | 
449 | class RelativeTransformerEncoderLayer(nn.Module):
450 |     """
451 |     d_model: the number of expected features in the input (required).
452 |     nhead: the number of heads in the MultiHeadAttention models (required).
453 |     clipping_threshold: the clipping threshold for relative position embeddings
454 |     dim_feedforward: the dimension of the feedforward network model (default=2048).
455 |     dropout: the dropout value (default=0.1).
456 |     activation: the activation function of the intermediate layer, can be a string
457 |         ("relu" or "gelu") or a unary callable. Default: relu
458 |     layer_norm_eps: the eps value in layer normalization components (default=1e-5).
459 |     norm_first: if ``True``, layer norm is done prior to attention and feedforward
460 |         operations, respectively. Otherwise, it's done after. Default: ``False`` (after).
461 |     """
462 | 
463 |     # __constants__ is a TorchScript (torch.jit) compilation helper; it also ensures these values don't change
464 |     __constants__ = ['batch_first', 'norm_first']
465 | 
466 |     def __init__(self,
467 |                  d_model,
468 |                  nhead,
469 |                  pos_encoding="relative",
470 |                  clipping_threshold=3,
471 |                  contact_threshold=7,
472 |                  pdb_fns=None,
473 |                  dim_feedforward=2048,
474 |                  dropout=0.1,
475 |                  activation=F.relu,
476 |                  layer_norm_eps=1e-5,
477 |                  norm_first=False) -> None:
478 | 
479 |         self.batch_first = True
480 | 
481 |         super(RelativeTransformerEncoderLayer, self).__init__()
482 | 
483 |         self.self_attn = RelativeMultiHeadAttention(d_model, nhead, dropout,
484 |                                                     pos_encoding, clipping_threshold, contact_threshold, pdb_fns)
485 | 
486 |         # feed forward model
487 |         self.linear1 = Linear(d_model, dim_feedforward)
488 |         self.dropout = Dropout(dropout)
489 |         self.linear2 = Linear(dim_feedforward, d_model)
490 | 
491 |         self.norm_first = norm_first
492 |         self.norm1 = LayerNorm(d_model, eps=layer_norm_eps)
493 |         self.norm2 = LayerNorm(d_model, eps=layer_norm_eps)
494 |         self.dropout1 = Dropout(dropout)
495 |         self.dropout2 = Dropout(dropout)
496 | 
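# for reference, norm_first selects between the two standard residual arrangements
# implemented in forward() below (SA = self-attention block, FF = feed forward block):
#   norm_first=True  (pre-norm):  x = x + SA(norm1(x)); x = x + FF(norm2(x))
#   norm_first=False (post-norm): x = norm1(x + SA(x)); x = norm2(x + FF(x))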
497 |         # Legacy string support for activation function.
498 |         if isinstance(activation, str):
499 |             self.activation = models.get_activation_fn(activation)
500 |         else:
501 |             self.activation = activation
502 | 
503 |     def forward(self, src: Tensor, pdb_fn=None) -> Tensor:
504 |         x = src
505 |         if self.norm_first:
506 |             x = x + self._sa_block(self.norm1(x), pdb_fn=pdb_fn)
507 |             x = x + self._ff_block(self.norm2(x))
508 |         else:
509 |             x = self.norm1(x + self._sa_block(x, pdb_fn=pdb_fn))
510 |             x = self.norm2(x + self._ff_block(x))
511 | 
512 |         return x
513 | 
514 |     # self-attention block
515 |     def _sa_block(self, x: Tensor, pdb_fn=None) -> Tensor:
516 |         x = self.self_attn(x, x, x, pdb_fn=pdb_fn)
517 |         if isinstance(x, dict):
518 |             # handle the case where we are returning attention weights
519 |             x = x["attn_output"]
520 |         return self.dropout1(x)
521 | 
522 |     # feed forward block
523 |     def _ff_block(self, x: Tensor) -> Tensor:
524 |         x = self.linear2(self.dropout(self.activation(self.linear1(x))))
525 |         return self.dropout2(x)
526 | 
527 | 
528 | class RelativeTransformerEncoder(nn.Module):
529 |     def __init__(self, encoder_layer, num_layers, norm=None, reset_params=True):
530 |         super(RelativeTransformerEncoder, self).__init__()
531 |         # using get_clones means all layers have the same initialization
532 |         # this is also a problem in PyTorch's TransformerEncoder implementation, which this is based on
533 |         # todo: PyTorch is changing its transformer API... check up on it and see if there is a better way
534 |         self.layers = _get_clones(encoder_layer, num_layers)
535 |         self.num_layers = num_layers
536 |         self.norm = norm
537 | 
538 |         # important because get_clones means all layers have same initialization
539 |         # should recursively reset parameters for all submodules
540 |         if reset_params:
541 |             self.apply(models.reset_parameters_helper)
542 | 
543 |     def forward(self, src: Tensor, pdb_fn=None) -> Tensor:
544 |         output = src
545 | 
546 |         for mod in self.layers:
547 |             output = mod(output, pdb_fn=pdb_fn)
548 | 
549 |         if self.norm is not None:
550 |             output = self.norm(output)
551 | 
552 |         return output
553 | 
554 | 
555 | def _get_clones(module, num_clones):
556 |     return nn.ModuleList([copy.deepcopy(module) for _ in range(num_clones)])
557 | 
558 | 
559 | def _inv_dict(d):
560 |     """ helper function for contact map-based position embeddings """
561 |     inv = dict()
562 |     for k, v in d.items():
563 |         # collect dict keys into lists based on value
564 |         inv.setdefault(v, list()).append(k)
565 |     for k, v in inv.items():
566 |         # put in sorted order
567 |         inv[k] = sorted(v)
568 |     return inv
569 | 
570 | 
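# worked example for _inv_dict (hypothetical input, chosen just for illustration):
# an input of {0: 0, 1: 1, 2: 1, 3: 2} mapping node indices to contact map distances
# inverts to {0: [0], 1: [1, 2], 2: [3]}, with each list in sorted order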
571 | def _combine_d(d, threshold, combined_key):
572 |     """ helper function for contact map-based position embeddings
573 |         d is a dictionary with ints as keys and lists as values.
574 |         for all keys >= threshold, this function combines the values of those keys into a single list """
575 |     out_d = {}
576 |     for k, v in d.items():
577 |         if k < threshold:
578 |             out_d[k] = v
579 |         elif k >= threshold:
580 |             if combined_key not in out_d:
581 |                 out_d[combined_key] = v
582 |             else:
583 |                 out_d[combined_key] += v
584 |     if combined_key in out_d:
585 |         out_d[combined_key] = sorted(out_d[combined_key])
586 |     return out_d
587 | 
--------------------------------------------------------------------------------
/metl/structure.py:
--------------------------------------------------------------------------------
1 | import os
2 | from os.path import isfile
3 | from enum import Enum, auto
4 | 
5 | import numpy as np
6 | from scipy.spatial.distance import cdist
7 | import networkx as nx
8 | from biopandas.pdb import PandasPdb
9 | 
10 | 
11 | class GraphType(Enum):
12 |     LINEAR = auto()
13 |     COMPLETE = auto()
14 |     DISCONNECTED = auto()
15 |     DIST_THRESH = auto()
16 |     DIST_THRESH_SHUFFLED = auto()
17 | 
18 | 
19 | def save_graph(g, fn):
20 |     """ Saves graph to file """
21 |     nx.write_gexf(g, fn)
22 | 
23 | 
24 | def load_graph(fn):
25 |     """ Loads graph from file """
26 |     g = nx.read_gexf(fn, node_type=int)
27 |     return g
28 | 
29 | 
30 | def shuffle_nodes(g, seed=7):
31 |     """ Shuffles the nodes of the given graph and returns a copy of the shuffled graph """
32 |     # get the list of nodes in this graph
33 |     nodes = g.nodes()
34 | 
35 |     # create a permuted list of nodes
36 |     np.random.seed(seed)
37 |     nodes_shuffled = np.random.permutation(nodes)
38 | 
39 |     # create a dictionary mapping from old node label to new node label
40 |     mapping = {n: ns for n, ns in zip(nodes, nodes_shuffled)}
41 | 
42 |     g_shuffled = nx.relabel_nodes(g, mapping, copy=True)
43 | 
44 |     return g_shuffled
45 | 
46 | 
47 | def linear_graph(num_residues):
48 |     """ Creates a linear graph where each node is connected to its sequence neighbor in order """
49 |     g = nx.Graph()
50 |     g.add_nodes_from(np.arange(0, num_residues))
51 |     for i in range(num_residues-1):
52 |         g.add_edge(i, i+1)
53 |     return g
54 | 
55 | 
56 | def complete_graph(num_residues):
57 |     """ Creates a graph where each node is connected to all other nodes """
58 |     g = nx.complete_graph(num_residues)
59 |     return g
60 | 
61 | 
62 | def disconnected_graph(num_residues):
63 |     g = nx.Graph()
64 |     g.add_nodes_from(np.arange(0, num_residues))
65 |     return g
66 | 
67 | 
68 | def dist_thresh_graph(dist_mtx, threshold):
69 |     """ Creates undirected graph based on a distance threshold """
70 |     g = nx.Graph()
71 |     g.add_nodes_from(np.arange(0, dist_mtx.shape[0]))
72 | 
73 |     # loop through each residue
74 |     for rn1 in range(len(dist_mtx)):
75 |         # find all residues that are within threshold distance of current
76 |         rns_within_threshold = np.where(dist_mtx[rn1] < threshold)[0]
77 | 
78 |         # add edges from current residue to those that are within threshold
79 |         for rn2 in rns_within_threshold:
80 |             # don't add self edges
81 |             if rn1 != rn2:
82 |                 g.add_edge(rn1, rn2)
83 |     return g
84 | 
85 | 
86 | def ordered_adjacency_matrix(g):
87 |     """ returns the adjacency matrix ordered by node label in increasing order as a numpy array """
88 |     node_order = sorted(g.nodes())
89 |     adj_mtx = nx.to_numpy_array(g, nodelist=node_order)
90 |     return np.asarray(adj_mtx).astype(np.float32)
91 | 
92 | 
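# a minimal sketch of how these helpers fit together (GraphType is defined above and
# gen_graph below in this module; the PDB path refers to a file shipped in this repository):
#   dist_mtx = cbeta_distance_matrix("pdbs/2qmt_p.pdb")
#   g = gen_graph(GraphType.DIST_THRESH, dist_mtx, dist_thresh=7)
#   adj = ordered_adjacency_matrix(g)   # [num_residues, num_residues], float32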
93 | def cbeta_distance_matrix(pdb_fn, start=0, end=None):
94 |     # note that start and end do not index by residue number
95 |     # they index by position in the pdb file's residue listing
96 | 
97 |     # read the pdb file into a biopandas object
98 |     ppdb = PandasPdb().read_pdb(pdb_fn)
99 | 
100 |     # group by residue number
101 |     # important to specify sort=True so that group keys (residue number) are in order
102 |     # the reason is we loop through group keys below, and assume that residues are in order
103 |     # the pandas function has sort=True by default, but we specify it anyway because it is important
104 |     grouped = ppdb.df["ATOM"].groupby("residue_number", sort=True)
105 | 
106 |     # a list of coords for the cbeta or calpha of each residue
107 |     coords = []
108 | 
109 |     # loop through each residue and find the coordinates of cbeta
110 |     for i, (residue_number, values) in enumerate(grouped):
111 | 
112 |         # skip residues not in the range
113 |         end_index = (len(grouped) if end is None else end)
114 |         if i not in range(start, end_index):
115 |             continue
116 | 
117 |         residue_group = values  # "values" is already the group for this residue_number
118 | 
119 |         atom_names = residue_group["atom_name"]
120 |         if "CB" in atom_names.values:
121 |             # print("Using CB...")
122 |             atom_name = "CB"
123 |         elif "CA" in atom_names.values:
124 |             # print("Using CA...")
125 |             atom_name = "CA"
126 |         else:
127 |             raise ValueError("Couldn't find CB or CA for residue {}".format(residue_number))
128 | 
129 |         # get the coordinates of cbeta (or calpha)
130 |         coords.append(
131 |             residue_group[residue_group["atom_name"] == atom_name][["x_coord", "y_coord", "z_coord"]].values[0])
132 | 
133 |     # stack the coords into a numpy array where each row has the x,y,z coords for a different residue
134 |     coords = np.stack(coords)
135 | 
136 |     # compute pairwise euclidean distance between all cbetas
137 |     dist_mtx = cdist(coords, coords, metric="euclidean")
138 | 
139 |     return dist_mtx
140 | 
141 | 
142 | def get_neighbors(g, nodes):
143 |     """ returns a sorted list of the unique neighbors of all given nodes """
144 |     neighbors = set()
145 |     for n in nodes:
146 |         neighbors.update(g.neighbors(n))
147 |     return sorted(list(neighbors))
148 | 
149 | 
150 | def gen_graph(graph_type, res_dist_mtx, dist_thresh=7, shuffle_seed=7, graph_save_dir=None, save=False):
151 |     """ generate the specified structure graph using the specified residue distance matrix """
152 |     if graph_type is GraphType.LINEAR:
153 |         g = linear_graph(len(res_dist_mtx))
154 |         save_fn = None if not save else os.path.join(graph_save_dir, "linear.graph")
155 | 
156 |     elif graph_type is GraphType.COMPLETE:
157 |         g = complete_graph(len(res_dist_mtx))
158 |         save_fn = None if not save else os.path.join(graph_save_dir, "complete.graph")
159 | 
160 |     elif graph_type is GraphType.DISCONNECTED:
161 |         g = disconnected_graph(len(res_dist_mtx))
162 |         save_fn = None if not save else os.path.join(graph_save_dir, "disconnected.graph")
163 | 
164 |     elif graph_type is GraphType.DIST_THRESH:
165 |         g = dist_thresh_graph(res_dist_mtx, dist_thresh)
166 |         save_fn = None if not save else os.path.join(graph_save_dir, "dist_thresh_{}.graph".format(dist_thresh))
167 | 
168 |     elif graph_type is GraphType.DIST_THRESH_SHUFFLED:
169 |         g = dist_thresh_graph(res_dist_mtx, dist_thresh)
170 |         g = shuffle_nodes(g, seed=shuffle_seed)
171 |         save_fn = None if not save else \
172 |             os.path.join(graph_save_dir, "dist_thresh_{}_shuffled_r{}.graph".format(dist_thresh, shuffle_seed))
173 | 
174 |     else:
175 |         raise ValueError("Graph type {} is not implemented".format(graph_type))
176 | 
177 |     if save:
178 |         if isfile(save_fn):
179 |             print("err: graph already exists: {}. 
to overwrite, delete the existing file first".format(save_fn)) 180 | else: 181 | os.makedirs(graph_save_dir, exist_ok=True) 182 | save_graph(g, save_fn) 183 | 184 | return g 185 | -------------------------------------------------------------------------------- /metl/test.py: -------------------------------------------------------------------------------- 1 | import metl 2 | import torch 3 | 4 | 5 | def main(): 6 | model, data_encoder = metl.get_from_ident("metl-g-20m-1d") 7 | 8 | # make sure all the sequences are the same length 9 | amino_acid_sequences = ["SMART", "MAGIC"] 10 | encoded_seqs = data_encoder.encode_sequences(amino_acid_sequences) 11 | 12 | # set model to eval mode 13 | model.eval() 14 | # no need to compute gradients for inference 15 | with torch.no_grad(): 16 | predictions = model(torch.tensor(encoded_seqs)) 17 | 18 | print(predictions) 19 | 20 | 21 | if __name__ == "__main__": 22 | main() 23 | -------------------------------------------------------------------------------- /metl/test2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import metl 3 | 4 | 5 | def main(): 6 | # "YoQkzoLD" is a METL-L (2M, 1D) [GFP] model that was fine-tuned on 64 examples from the avGFP DMS dataset 7 | model, data_encoder = metl.get_from_uuid(uuid="YoQkzoLD") 8 | 9 | # the GFP wild-type sequence 10 | wt = "SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQ" \ 11 | "HDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKN" \ 12 | "GIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK" 13 | 14 | # some example GFP variants to compute the scores for 15 | variants = ["E3K,G102S", 16 | "T36P,S203T,K207R", 17 | "V10A,D19G,F25S,E113V"] 18 | 19 | encoded_variants = data_encoder.encode_variants(wt, variants) 20 | 21 | # set model to eval mode 22 | model.eval() 23 | # no need to compute gradients for inference 24 | with torch.no_grad(): 25 | predictions = model(torch.tensor(encoded_variants)) 26 | 27 | print(predictions) 28 | 29 | 30 | if __name__ == "__main__": 31 | main() 32 | -------------------------------------------------------------------------------- /metl/test3.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import metl 3 | 4 | 5 | def main(): 6 | # this is a 3D RPE model, which requires a PDB file matching the WT sequence 7 | model, data_encoder = metl.get_from_uuid(uuid="PEkeRuxb") 8 | 9 | # the GFP wild-type sequence 10 | wt = "SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQ" \ 11 | "HDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKN" \ 12 | "GIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK" 13 | 14 | # some example GFP variants to compute the scores for 15 | variants = ["E3K,G102S", 16 | "T36P,S203T,K207R", 17 | "V10A,D19G,F25S,E113V"] 18 | 19 | encoded_variants = data_encoder.encode_variants(wt, variants) 20 | 21 | # set model to eval mode 22 | model.eval() 23 | # no need to compute gradients for inference 24 | with torch.no_grad(): 25 | predictions = model(torch.tensor(encoded_variants), pdb_fn="pdbs/1gfl_cm.pdb") 26 | 27 | print(predictions) 28 | 29 | 30 | if __name__ == "__main__": 31 | main() 32 | -------------------------------------------------------------------------------- /metl/test4.py: -------------------------------------------------------------------------------- 1 | import metl 2 | import torch 3 
| 4 | 
5 | def main():
6 |     model, data_encoder = metl.get_from_ident("METL-L-2M-3D-GB1")
7 | 
8 |     # the GB1 WT sequence
9 |     wt = "MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTE"
10 | 
11 |     # some example GB1 variants for which to compute the METL-Local Rosetta score estimates
12 |     variants = ["T17P,T54F",
13 |                 "V28L,F51A",
14 |                 "T17P,V28L,F51A,T54F"]
15 | 
16 |     encoded_variants = data_encoder.encode_variants(wt, variants)
17 | 
18 |     # set model to eval mode
19 |     model.eval()
20 |     # no need to compute gradients for inference
21 |     with torch.no_grad():
22 |         predictions = model(torch.tensor(encoded_variants), pdb_fn="pdbs/2qmt_p.pdb")
23 |     print(predictions)
24 | 
25 |     # can also input full sequences
26 |     sequences = ["MPYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTE",
27 |                  "MPAKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTE",
28 |                  "MGEKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTE"]
29 |     encoded_sequences = data_encoder.encode_sequences(sequences)
30 |     model.eval()
31 |     with torch.no_grad():
32 |         predictions = model(torch.tensor(encoded_sequences), pdb_fn="pdbs/2qmt_p.pdb")
33 |     print(predictions)
34 | 
35 |     # can also use the 1D model which doesn't require a PDB file
36 |     model, data_encoder = metl.get_from_ident("METL-L-2M-1D-GB1")
37 |     variants = ["T17P,T54F",
38 |                 "V28L,F51A",
39 |                 "T17P,V28L,F51A,T54F"]
40 |     encoded_variants = data_encoder.encode_variants(wt, variants)
41 |     model.eval()
42 |     with torch.no_grad():
43 |         predictions = model(torch.tensor(encoded_variants))
44 |     print(predictions)
45 | 
46 | 
47 | if __name__ == "__main__":
48 |     main()
49 | 
--------------------------------------------------------------------------------
/notebooks/inference.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Inference with METL-Global"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 10,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import torch\n",
17 | "import torchextractor as tx\n",
18 | "import torchinfo\n",
19 | "\n",
20 | "import metl"
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {},
26 | "source": [
27 | "# Load a METL-G model"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 5,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "model, data_encoder = metl.get_from_ident(\"METL-G-20M-1D\")"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 7,
42 | "metadata": {},
43 | "outputs": [
44 | {
45 | "name": "stdout",
46 | "output_type": "stream",
47 | "text": [
48 | "===============================================================================================\n",
49 | "Layer (type (var_name)) Param #\n",
50 | "===============================================================================================\n",
51 | "AttnModel (AttnModel) --\n",
52 | "├─SequentialWithArgs (model) --\n",
53 | "│ └─ScaledEmbedding (embedder) --\n",
54 | "│ │ └─Embedding (embedding) 10,752\n",
55 | "│ └─RelativeTransformerEncoder (tr_encoder) --\n",
56 | "│ │ └─ModuleList (layers) --\n",
57 | "│ │ │ └─RelativeTransformerEncoderLayer (0) 3,154,560\n",
58 | "│ │ │ └─RelativeTransformerEncoderLayer (1) 3,154,560\n",
59 | "│ │ │ └─RelativeTransformerEncoderLayer (2) 3,154,560\n",
60 | "│ │ │ └─RelativeTransformerEncoderLayer (3) 3,154,560\n",
61 | "│ │ │ └─RelativeTransformerEncoderLayer (4) 3,154,560\n",
62 | "│ │ │ 
└─RelativeTransformerEncoderLayer (5) 3,154,560\n", 63 | "│ │ └─LayerNorm (norm) 1,024\n", 64 | "│ └─GlobalAveragePooling (avg_pooling) --\n", 65 | "│ └─FCBlock (fc1) --\n", 66 | "│ │ └─Linear (fc) 262,656\n", 67 | "│ │ └─ReLU (activation) --\n", 68 | "│ │ └─LayerNorm (norm) 1,024\n", 69 | "│ │ └─Dropout (dropout) --\n", 70 | "│ └─Linear (prediction) 28,215\n", 71 | "===============================================================================================\n", 72 | "Total params: 19,231,031\n", 73 | "Trainable params: 19,231,031\n", 74 | "Non-trainable params: 0\n", 75 | "===============================================================================================\n" 76 | ] 77 | } 78 | ], 79 | "source": [ 80 | "summary = torchinfo.summary(model, depth=4, verbose=1, row_settings=[\"var_names\"])" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "# Set up representation extraction\n", 88 | "For METL-Global models, I recommend using the representation immediately after the GlobalAveragePooling (avg_pooling) layer. For METL-Local models, I recommend using the representation immediately after the final fully connected layer (fc1). " 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 12, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "return_layers = [\n", 98 | " \"model.avg_pooling\",\n", 99 | "]\n", 100 | "\n", 101 | "extractor = tx.Extractor(model.eval(), return_layers)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "# Test a couple sequences" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 25, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "# note: make sure all the sequences in a batch are the same length\n", 118 | "amino_acid_sequences = [\"SMART\", \"MAGIC\"]\n", 119 | "encoded_seqs = data_encoder.encode_sequences(amino_acid_sequences)\n", 120 | "\n", 121 | "with torch.no_grad():\n", 122 | " model_out, intermediate_out = extractor(torch.tensor(encoded_seqs))" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 29, 128 | "metadata": {}, 129 | "outputs": [ 130 | { 131 | "data": { 132 | "text/plain": [ 133 | "torch.Size([2, 55])" 134 | ] 135 | }, 136 | "execution_count": 29, 137 | "metadata": {}, 138 | "output_type": "execute_result" 139 | } 140 | ], 141 | "source": [ 142 | "# model_out contains the final output of the model (Rosetta energy term predictions)\n", 143 | "# there are 55 energy terms, the first one is total_score \n", 144 | "# they are listed in order on the main README\n", 145 | "model_out.shape" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 32, 151 | "metadata": {}, 152 | "outputs": [ 153 | { 154 | "data": { 155 | "text/plain": [ 156 | "torch.Size([2, 512])" 157 | ] 158 | }, 159 | "execution_count": 32, 160 | "metadata": {}, 161 | "output_type": "execute_result" 162 | } 163 | ], 164 | "source": [ 165 | "# intermediate_out is a dictionary containing intermediate outputs \n", 166 | "# for all the return_layers specified above\n", 167 | "# METL-G has an embedding dimension of 512, thus outputs will be 512\n", 168 | "intermediate_out[\"model.avg_pooling\"].shape" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "# Additional notes\n", 176 | "The above will retrieve a length 512 sequence-level representation immediately following the global average pooling layer, which takes the 
average of residue-level representations. \n", 177 | "\n", 178 | "If you want, you can also get the residue-representations. You can also play around with the sequence-level representation from after the FC layer, although I haven't had as much success with this representation for my tasks (too specific to the Rosetta energies?). You may have more luck with it, though. " 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 34, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "# the above will retrieve a length 512 sequence-level representation\n", 188 | "# you can also get a representation for each residue\n", 189 | "\n", 190 | "return_layers = [\n", 191 | " \"model.tr_encoder\", # residue-level representation\n", 192 | " \"model.avg_pooling\", # sequence-level representation following avg pooling\n", 193 | " \"model.fc1\", # sequence-level representation following the final fully connected layer\n", 194 | "]\n", 195 | "\n", 196 | "extractor = tx.Extractor(model.eval(), return_layers)\n", 197 | "\n", 198 | "amino_acid_sequences = [\"SMART\", \"MAGIC\"]\n", 199 | "encoded_seqs = data_encoder.encode_sequences(amino_acid_sequences)\n", 200 | "\n", 201 | "with torch.no_grad():\n", 202 | " model_out, intermediate_out = extractor(torch.tensor(encoded_seqs))" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 42, 208 | "metadata": {}, 209 | "outputs": [ 210 | { 211 | "name": "stdout", 212 | "output_type": "stream", 213 | "text": [ 214 | "Layer: model.tr_encoder\n", 215 | "Output shape: torch.Size([2, 5, 512])\n", 216 | "\n", 217 | "Layer: model.avg_pooling\n", 218 | "Output shape: torch.Size([2, 512])\n", 219 | "\n", 220 | "Layer: model.fc1\n", 221 | "Output shape: torch.Size([2, 512])\n" 222 | ] 223 | } 224 | ], 225 | "source": [ 226 | "for k, v in intermediate_out.items():\n", 227 | " print(\"Layer: {}\\nOutput shape: {}\\n\".format(k, v.shape))" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [] 236 | } 237 | ], 238 | "metadata": { 239 | "kernelspec": { 240 | "display_name": "Python 3 (ipykernel)", 241 | "language": "python", 242 | "name": "python3" 243 | }, 244 | "language_info": { 245 | "codemirror_mode": { 246 | "name": "ipython", 247 | "version": 3 248 | }, 249 | "file_extension": ".py", 250 | "mimetype": "text/x-python", 251 | "name": "python", 252 | "nbconvert_exporter": "python", 253 | "pygments_lexer": "ipython3", 254 | "version": "3.9.16" 255 | } 256 | }, 257 | "nbformat": 4, 258 | "nbformat_minor": 4 259 | } 260 | -------------------------------------------------------------------------------- /pdbs/pab1_cm.pdb: -------------------------------------------------------------------------------- 1 | ATOM 1 N GLY A 1 -14.422 25.734 -5.746 1.00 0.00 N 2 | ATOM 2 CA GLY A 1 -15.203 24.662 -5.115 1.00 0.00 C 3 | ATOM 3 C GLY A 1 -14.487 23.322 -5.215 1.00 0.00 C 4 | ATOM 4 O GLY A 1 -15.061 22.336 -5.691 1.00 0.00 O 5 | ATOM 10 N ASN A 2 -13.214 23.298 -4.818 1.00 0.00 N 6 | ATOM 11 CA ASN A 2 -12.380 22.099 -4.924 1.00 0.00 C 7 | ATOM 12 C ASN A 2 -12.321 21.350 -3.595 1.00 0.00 C 8 | ATOM 13 O ASN A 2 -11.749 21.848 -2.626 1.00 0.00 O 9 | ATOM 14 CB ASN A 2 -11.004 22.475 -5.456 1.00 0.00 C 10 | ATOM 15 CG ASN A 2 -10.105 21.294 -5.801 1.00 0.00 C 11 | ATOM 16 OD1 ASN A 2 -10.309 20.145 -5.399 1.00 0.00 O 12 | ATOM 17 ND2 ASN A 2 -9.088 21.602 -6.581 1.00 0.00 N 13 | ATOM 24 N ILE A 3 -12.987 20.198 -3.552 1.00 0.00 N 14 | ATOM 25 
CA ILE A 3 -13.111 19.388 -2.339 1.00 0.00 C 15 | ATOM 26 C ILE A 3 -11.955 18.383 -2.279 1.00 0.00 C 16 | ATOM 27 O ILE A 3 -11.637 17.704 -3.269 1.00 0.00 O 17 | ATOM 28 CB ILE A 3 -14.514 18.692 -2.294 1.00 0.00 C 18 | ATOM 29 CG1 ILE A 3 -14.771 17.939 -0.981 1.00 0.00 C 19 | ATOM 30 CG2 ILE A 3 -14.647 17.722 -3.425 1.00 0.00 C 20 | ATOM 31 CD1 ILE A 3 -16.274 17.589 -0.797 1.00 0.00 C 21 | ATOM 43 N PHE A 4 -11.283 18.341 -1.122 1.00 0.00 N 22 | ATOM 44 CA PHE A 4 -10.155 17.436 -0.920 1.00 0.00 C 23 | ATOM 45 C PHE A 4 -10.661 16.137 -0.366 1.00 0.00 C 24 | ATOM 46 O PHE A 4 -11.352 16.110 0.657 1.00 0.00 O 25 | ATOM 47 CB PHE A 4 -9.103 18.009 0.017 1.00 0.00 C 26 | ATOM 48 CG PHE A 4 -7.943 17.071 0.172 1.00 0.00 C 27 | ATOM 49 CD1 PHE A 4 -7.063 16.930 -0.865 1.00 0.00 C 28 | ATOM 50 CD2 PHE A 4 -7.728 16.332 1.329 1.00 0.00 C 29 | ATOM 51 CE1 PHE A 4 -5.991 16.087 -0.788 1.00 0.00 C 30 | ATOM 52 CE2 PHE A 4 -6.636 15.480 1.408 1.00 0.00 C 31 | ATOM 53 CZ PHE A 4 -5.773 15.360 0.346 1.00 0.00 C 32 | ATOM 63 N ILE A 5 -10.339 15.058 -1.047 1.00 0.00 N 33 | ATOM 64 CA ILE A 5 -10.849 13.748 -0.712 1.00 0.00 C 34 | ATOM 65 C ILE A 5 -9.733 12.867 -0.152 1.00 0.00 C 35 | ATOM 66 O ILE A 5 -8.807 12.510 -0.882 1.00 0.00 O 36 | ATOM 67 CB ILE A 5 -11.419 13.233 -2.043 1.00 0.00 C 37 | ATOM 68 CG1 ILE A 5 -12.474 14.279 -2.540 1.00 0.00 C 38 | ATOM 69 CG2 ILE A 5 -12.001 11.881 -1.904 1.00 0.00 C 39 | ATOM 70 CD1 ILE A 5 -12.858 14.133 -3.925 1.00 0.00 C 40 | ATOM 82 N LYS A 6 -9.826 12.496 1.130 1.00 0.00 N 41 | ATOM 83 CA LYS A 6 -8.783 11.737 1.831 1.00 0.00 C 42 | ATOM 84 C LYS A 6 -9.218 10.329 2.259 1.00 0.00 C 43 | ATOM 85 O LYS A 6 -10.316 10.149 2.795 1.00 0.00 O 44 | ATOM 86 CB LYS A 6 -8.356 12.530 3.067 1.00 0.00 C 45 | ATOM 87 CG LYS A 6 -7.219 11.941 3.897 1.00 0.00 C 46 | ATOM 88 CD LYS A 6 -6.844 12.874 5.046 1.00 0.00 C 47 | ATOM 89 CE LYS A 6 -5.710 12.297 5.888 1.00 0.00 C 48 | ATOM 90 NZ LYS A 6 -5.333 13.202 7.016 1.00 0.00 N 49 | ATOM 104 N ASN A 7 -8.274 9.366 2.191 1.00 0.00 N 50 | ATOM 105 CA ASN A 7 -8.491 7.952 2.533 1.00 0.00 C 51 | ATOM 106 C ASN A 7 -9.432 7.386 1.496 1.00 0.00 C 52 | ATOM 107 O ASN A 7 -10.439 6.750 1.793 1.00 0.00 O 53 | ATOM 108 CB ASN A 7 -9.065 7.762 3.932 1.00 0.00 C 54 | ATOM 109 CG ASN A 7 -8.816 6.355 4.498 1.00 0.00 C 55 | ATOM 110 OD1 ASN A 7 -7.731 5.784 4.300 1.00 0.00 O 56 | ATOM 111 ND2 ASN A 7 -9.792 5.803 5.180 1.00 0.00 N 57 | ATOM 118 N LEU A 8 -9.085 7.686 0.259 1.00 0.00 N 58 | ATOM 119 CA LEU A 8 -9.833 7.261 -0.890 1.00 0.00 C 59 | ATOM 120 C LEU A 8 -9.372 5.874 -1.287 1.00 0.00 C 60 | ATOM 121 O LEU A 8 -8.217 5.662 -1.658 1.00 0.00 O 61 | ATOM 122 CB LEU A 8 -9.618 8.298 -2.008 1.00 0.00 C 62 | ATOM 123 CG LEU A 8 -10.363 8.119 -3.325 1.00 0.00 C 63 | ATOM 124 CD1 LEU A 8 -11.871 8.223 -3.085 1.00 0.00 C 64 | ATOM 125 CD2 LEU A 8 -9.900 9.222 -4.304 1.00 0.00 C 65 | ATOM 137 N HIS A 9 -10.299 4.929 -1.229 1.00 0.00 N 66 | ATOM 138 CA HIS A 9 -10.009 3.530 -1.488 1.00 0.00 C 67 | ATOM 139 C HIS A 9 -9.318 3.361 -2.853 1.00 0.00 C 68 | ATOM 140 O HIS A 9 -9.773 3.957 -3.832 1.00 0.00 O 69 | ATOM 141 CB HIS A 9 -11.318 2.733 -1.423 1.00 0.00 C 70 | ATOM 142 CG HIS A 9 -11.162 1.273 -1.530 1.00 0.00 C 71 | ATOM 143 ND1 HIS A 9 -10.964 0.628 -2.740 1.00 0.00 N 72 | ATOM 144 CD2 HIS A 9 -11.178 0.308 -0.583 1.00 0.00 C 73 | ATOM 145 CE1 HIS A 9 -10.849 -0.680 -2.513 1.00 0.00 C 74 | ATOM 146 NE2 HIS A 9 -10.982 -0.890 -1.218 1.00 0.00 N 75 | ATOM 154 N PRO A 10 -8.247 2.530 -2.961 1.00 0.00 N 76 | ATOM 155 CA PRO A 10 
-7.463 2.245 -4.160 1.00 0.00 C 77 | ATOM 156 C PRO A 10 -8.216 1.897 -5.437 1.00 0.00 C 78 | ATOM 157 O PRO A 10 -7.713 2.181 -6.521 1.00 0.00 O 79 | ATOM 158 CB PRO A 10 -6.614 1.049 -3.718 1.00 0.00 C 80 | ATOM 159 CG PRO A 10 -6.414 1.247 -2.247 1.00 0.00 C 81 | ATOM 160 CD PRO A 10 -7.703 1.838 -1.744 1.00 0.00 C 82 | ATOM 168 N ASP A 11 -9.407 1.294 -5.349 1.00 0.00 N 83 | ATOM 169 CA ASP A 11 -10.095 0.943 -6.588 1.00 0.00 C 84 | ATOM 170 C ASP A 11 -10.914 2.105 -7.152 1.00 0.00 C 85 | ATOM 171 O ASP A 11 -11.527 1.994 -8.226 1.00 0.00 O 86 | ATOM 172 CB ASP A 11 -10.995 -0.273 -6.379 1.00 0.00 C 87 | ATOM 173 CG ASP A 11 -10.216 -1.581 -6.127 1.00 0.00 C 88 | ATOM 174 OD1 ASP A 11 -9.095 -1.715 -6.563 1.00 0.00 O 89 | ATOM 175 OD2 ASP A 11 -10.768 -2.435 -5.480 1.00 0.00 O 90 | ATOM 180 N ILE A 12 -10.948 3.237 -6.458 1.00 0.00 N 91 | ATOM 181 CA ILE A 12 -11.752 4.320 -6.974 1.00 0.00 C 92 | ATOM 182 C ILE A 12 -10.954 5.125 -7.965 1.00 0.00 C 93 | ATOM 183 O ILE A 12 -10.210 6.034 -7.606 1.00 0.00 O 94 | ATOM 184 CB ILE A 12 -12.266 5.232 -5.836 1.00 0.00 C 95 | ATOM 185 CG1 ILE A 12 -13.108 4.367 -4.867 1.00 0.00 C 96 | ATOM 186 CG2 ILE A 12 -13.089 6.433 -6.421 1.00 0.00 C 97 | ATOM 187 CD1 ILE A 12 -13.488 5.021 -3.574 1.00 0.00 C 98 | ATOM 199 N ASP A 13 -11.156 4.805 -9.232 1.00 0.00 N 99 | ATOM 200 CA ASP A 13 -10.457 5.490 -10.307 1.00 0.00 C 100 | ATOM 201 C ASP A 13 -11.192 6.799 -10.549 1.00 0.00 C 101 | ATOM 202 O ASP A 13 -12.200 7.063 -9.887 1.00 0.00 O 102 | ATOM 203 CB ASP A 13 -10.423 4.601 -11.563 1.00 0.00 C 103 | ATOM 204 CG ASP A 13 -9.331 4.969 -12.606 1.00 0.00 C 104 | ATOM 205 OD1 ASP A 13 -8.920 6.114 -12.647 1.00 0.00 O 105 | ATOM 206 OD2 ASP A 13 -8.950 4.110 -13.350 1.00 0.00 O 106 | ATOM 211 N ASN A 14 -10.728 7.614 -11.486 1.00 0.00 N 107 | ATOM 212 CA ASN A 14 -11.371 8.901 -11.714 1.00 0.00 C 108 | ATOM 213 C ASN A 14 -12.826 8.782 -12.134 1.00 0.00 C 109 | ATOM 214 O ASN A 14 -13.651 9.603 -11.737 1.00 0.00 O 110 | ATOM 215 CB ASN A 14 -10.595 9.740 -12.705 1.00 0.00 C 111 | ATOM 216 CG ASN A 14 -11.203 11.111 -12.864 1.00 0.00 C 112 | ATOM 217 OD1 ASN A 14 -11.506 11.787 -11.870 1.00 0.00 O 113 | ATOM 218 ND2 ASN A 14 -11.375 11.534 -14.091 1.00 0.00 N 114 | ATOM 225 N LYS A 15 -13.160 7.765 -12.927 1.00 0.00 N 115 | ATOM 226 CA LYS A 15 -14.541 7.618 -13.370 1.00 0.00 C 116 | ATOM 227 C LYS A 15 -15.434 7.408 -12.156 1.00 0.00 C 117 | ATOM 228 O LYS A 15 -16.485 8.036 -12.018 1.00 0.00 O 118 | ATOM 229 CB LYS A 15 -14.683 6.453 -14.354 1.00 0.00 C 119 | ATOM 230 CG LYS A 15 -15.999 6.439 -15.154 1.00 0.00 C 120 | ATOM 231 CD LYS A 15 -17.105 5.605 -14.486 1.00 0.00 C 121 | ATOM 232 CE LYS A 15 -18.334 5.496 -15.397 1.00 0.00 C 122 | ATOM 233 NZ LYS A 15 -19.468 4.780 -14.736 1.00 0.00 N 123 | ATOM 247 N ALA A 16 -15.019 6.484 -11.298 1.00 0.00 N 124 | ATOM 248 CA ALA A 16 -15.738 6.163 -10.083 1.00 0.00 C 125 | ATOM 249 C ALA A 16 -15.829 7.367 -9.147 1.00 0.00 C 126 | ATOM 250 O ALA A 16 -16.873 7.578 -8.507 1.00 0.00 O 127 | ATOM 251 CB ALA A 16 -15.056 5.009 -9.387 1.00 0.00 C 128 | ATOM 257 N LEU A 17 -14.757 8.174 -9.078 1.00 0.00 N 129 | ATOM 258 CA LEU A 17 -14.784 9.324 -8.193 1.00 0.00 C 130 | ATOM 259 C LEU A 17 -15.820 10.310 -8.717 1.00 0.00 C 131 | ATOM 260 O LEU A 17 -16.582 10.895 -7.940 1.00 0.00 O 132 | ATOM 261 CB LEU A 17 -13.412 10.018 -8.112 1.00 0.00 C 133 | ATOM 262 CG LEU A 17 -13.282 11.165 -7.060 1.00 0.00 C 134 | ATOM 263 CD1 LEU A 17 -13.501 10.569 -5.683 1.00 0.00 C 135 | ATOM 264 CD2 LEU A 17 -11.899 11.825 -7.130 
1.00 0.00 C 136 | ATOM 276 N TYR A 18 -15.842 10.498 -10.042 1.00 0.00 N 137 | ATOM 277 CA TYR A 18 -16.803 11.367 -10.690 1.00 0.00 C 138 | ATOM 278 C TYR A 18 -18.218 10.915 -10.372 1.00 0.00 C 139 | ATOM 279 O TYR A 18 -19.041 11.723 -9.953 1.00 0.00 O 140 | ATOM 280 CB TYR A 18 -16.594 11.399 -12.202 1.00 0.00 C 141 | ATOM 281 CG TYR A 18 -17.645 12.191 -12.929 1.00 0.00 C 142 | ATOM 282 CD1 TYR A 18 -17.522 13.559 -13.084 1.00 0.00 C 143 | ATOM 283 CD2 TYR A 18 -18.763 11.535 -13.429 1.00 0.00 C 144 | ATOM 284 CE1 TYR A 18 -18.511 14.255 -13.747 1.00 0.00 C 145 | ATOM 285 CE2 TYR A 18 -19.742 12.237 -14.077 1.00 0.00 C 146 | ATOM 286 CZ TYR A 18 -19.619 13.592 -14.237 1.00 0.00 C 147 | ATOM 287 OH TYR A 18 -20.597 14.303 -14.892 1.00 0.00 O 148 | ATOM 297 N ASP A 19 -18.526 9.619 -10.559 1.00 0.00 N 149 | ATOM 298 CA ASP A 19 -19.887 9.156 -10.271 1.00 0.00 C 150 | ATOM 299 C ASP A 19 -20.270 9.381 -8.811 1.00 0.00 C 151 | ATOM 300 O ASP A 19 -21.415 9.730 -8.503 1.00 0.00 O 152 | ATOM 301 CB ASP A 19 -20.059 7.658 -10.564 1.00 0.00 C 153 | ATOM 302 CG ASP A 19 -20.136 7.255 -12.050 1.00 0.00 C 154 | ATOM 303 OD1 ASP A 19 -20.287 8.093 -12.902 1.00 0.00 O 155 | ATOM 304 OD2 ASP A 19 -20.085 6.061 -12.307 1.00 0.00 O 156 | ATOM 309 N THR A 20 -19.306 9.191 -7.909 1.00 0.00 N 157 | ATOM 310 CA THR A 20 -19.524 9.361 -6.480 1.00 0.00 C 158 | ATOM 311 C THR A 20 -19.883 10.802 -6.126 1.00 0.00 C 159 | ATOM 312 O THR A 20 -20.820 11.049 -5.367 1.00 0.00 O 160 | ATOM 313 CB THR A 20 -18.253 8.949 -5.689 1.00 0.00 C 161 | ATOM 314 OG1 THR A 20 -17.973 7.556 -5.924 1.00 0.00 O 162 | ATOM 315 CG2 THR A 20 -18.432 9.208 -4.168 1.00 0.00 C 163 | ATOM 323 N PHE A 21 -19.127 11.754 -6.666 1.00 0.00 N 164 | ATOM 324 CA PHE A 21 -19.326 13.162 -6.370 1.00 0.00 C 165 | ATOM 325 C PHE A 21 -20.297 13.952 -7.265 1.00 0.00 C 166 | ATOM 326 O PHE A 21 -20.857 14.948 -6.805 1.00 0.00 O 167 | ATOM 327 CB PHE A 21 -17.958 13.833 -6.328 1.00 0.00 C 168 | ATOM 328 CG PHE A 21 -17.184 13.527 -5.060 1.00 0.00 C 169 | ATOM 329 CD1 PHE A 21 -16.546 12.317 -4.880 1.00 0.00 C 170 | ATOM 330 CD2 PHE A 21 -17.068 14.475 -4.057 1.00 0.00 C 171 | ATOM 331 CE1 PHE A 21 -15.861 12.040 -3.728 1.00 0.00 C 172 | ATOM 332 CE2 PHE A 21 -16.357 14.195 -2.921 1.00 0.00 C 173 | ATOM 333 CZ PHE A 21 -15.767 12.971 -2.759 1.00 0.00 C 174 | ATOM 343 N SER A 22 -20.563 13.513 -8.507 1.00 0.00 N 175 | ATOM 344 CA SER A 22 -21.421 14.276 -9.438 1.00 0.00 C 176 | ATOM 345 C SER A 22 -22.840 14.409 -8.914 1.00 0.00 C 177 | ATOM 346 O SER A 22 -23.579 15.328 -9.278 1.00 0.00 O 178 | ATOM 347 CB SER A 22 -21.476 13.630 -10.809 1.00 0.00 C 179 | ATOM 348 OG SER A 22 -22.234 12.456 -10.781 1.00 0.00 O 180 | ATOM 354 N VAL A 23 -23.205 13.525 -8.002 1.00 0.00 N 181 | ATOM 355 CA VAL A 23 -24.507 13.511 -7.364 1.00 0.00 C 182 | ATOM 356 C VAL A 23 -24.765 14.811 -6.588 1.00 0.00 C 183 | ATOM 357 O VAL A 23 -25.915 15.119 -6.276 1.00 0.00 O 184 | ATOM 358 CB VAL A 23 -24.627 12.276 -6.441 1.00 0.00 C 185 | ATOM 359 CG1 VAL A 23 -23.788 12.451 -5.222 1.00 0.00 C 186 | ATOM 360 CG2 VAL A 23 -26.085 12.039 -6.048 1.00 0.00 C 187 | ATOM 370 N PHE A 24 -23.703 15.557 -6.245 1.00 0.00 N 188 | ATOM 371 CA PHE A 24 -23.847 16.787 -5.486 1.00 0.00 C 189 | ATOM 372 C PHE A 24 -23.802 18.026 -6.370 1.00 0.00 C 190 | ATOM 373 O PHE A 24 -23.863 19.143 -5.862 1.00 0.00 O 191 | ATOM 374 CB PHE A 24 -22.712 16.890 -4.470 1.00 0.00 C 192 | ATOM 375 CG PHE A 24 -22.697 15.738 -3.568 1.00 0.00 C 193 | ATOM 376 CD1 PHE A 24 -21.676 14.819 -3.638 1.00 0.00 C 194 
| ATOM 377 CD2 PHE A 24 -23.717 15.523 -2.700 1.00 0.00 C 195 | ATOM 378 CE1 PHE A 24 -21.681 13.715 -2.848 1.00 0.00 C 196 | ATOM 379 CE2 PHE A 24 -23.735 14.430 -1.907 1.00 0.00 C 197 | ATOM 380 CZ PHE A 24 -22.710 13.512 -1.975 1.00 0.00 C 198 | ATOM 390 N GLY A 25 -23.727 17.846 -7.686 1.00 0.00 N 199 | ATOM 391 CA GLY A 25 -23.649 18.967 -8.611 1.00 0.00 C 200 | ATOM 392 C GLY A 25 -22.585 18.738 -9.674 1.00 0.00 C 201 | ATOM 393 O GLY A 25 -21.672 17.933 -9.504 1.00 0.00 O 202 | ATOM 397 N ASP A 26 -22.686 19.484 -10.769 1.00 0.00 N 203 | ATOM 398 CA ASP A 26 -21.756 19.335 -11.883 1.00 0.00 C 204 | ATOM 399 C ASP A 26 -20.306 19.573 -11.494 1.00 0.00 C 205 | ATOM 400 O ASP A 26 -19.990 20.458 -10.690 1.00 0.00 O 206 | ATOM 401 CB ASP A 26 -22.149 20.267 -13.029 1.00 0.00 C 207 | ATOM 402 CG ASP A 26 -23.427 19.804 -13.758 1.00 0.00 C 208 | ATOM 403 OD1 ASP A 26 -23.857 18.695 -13.542 1.00 0.00 O 209 | ATOM 404 OD2 ASP A 26 -23.955 20.561 -14.522 1.00 0.00 O 210 | ATOM 409 N ILE A 27 -19.440 18.762 -12.100 1.00 0.00 N 211 | ATOM 410 CA ILE A 27 -17.995 18.758 -11.893 1.00 0.00 C 212 | ATOM 411 C ILE A 27 -17.248 19.320 -13.092 1.00 0.00 C 213 | ATOM 412 O ILE A 27 -17.507 18.940 -14.230 1.00 0.00 O 214 | ATOM 413 CB ILE A 27 -17.516 17.325 -11.563 1.00 0.00 C 215 | ATOM 414 CG1 ILE A 27 -18.169 16.884 -10.236 1.00 0.00 C 216 | ATOM 415 CG2 ILE A 27 -15.974 17.227 -11.515 1.00 0.00 C 217 | ATOM 416 CD1 ILE A 27 -18.007 15.418 -9.892 1.00 0.00 C 218 | ATOM 428 N LEU A 28 -16.340 20.255 -12.818 1.00 0.00 N 219 | ATOM 429 CA LEU A 28 -15.535 20.902 -13.846 1.00 0.00 C 220 | ATOM 430 C LEU A 28 -14.346 20.016 -14.171 1.00 0.00 C 221 | ATOM 431 O LEU A 28 -13.986 19.816 -15.333 1.00 0.00 O 222 | ATOM 432 CB LEU A 28 -14.979 22.225 -13.299 1.00 0.00 C 223 | ATOM 433 CG LEU A 28 -15.983 23.294 -12.891 1.00 0.00 C 224 | ATOM 434 CD1 LEU A 28 -15.221 24.412 -12.185 1.00 0.00 C 225 | ATOM 435 CD2 LEU A 28 -16.746 23.825 -14.100 1.00 0.00 C 226 | ATOM 447 N SER A 29 -13.738 19.485 -13.112 1.00 0.00 N 227 | ATOM 448 CA SER A 29 -12.551 18.639 -13.225 1.00 0.00 C 228 | ATOM 449 C SER A 29 -12.321 17.756 -12.009 1.00 0.00 C 229 | ATOM 450 O SER A 29 -12.553 18.172 -10.874 1.00 0.00 O 230 | ATOM 451 CB SER A 29 -11.321 19.482 -13.463 1.00 0.00 C 231 | ATOM 452 OG SER A 29 -10.178 18.675 -13.512 1.00 0.00 O 232 | ATOM 458 N SER A 30 -11.853 16.538 -12.232 1.00 0.00 N 233 | ATOM 459 CA SER A 30 -11.551 15.635 -11.125 1.00 0.00 C 234 | ATOM 460 C SER A 30 -10.371 14.735 -11.441 1.00 0.00 C 235 | ATOM 461 O SER A 30 -10.023 14.531 -12.611 1.00 0.00 O 236 | ATOM 462 CB SER A 30 -12.765 14.793 -10.793 1.00 0.00 C 237 | ATOM 463 OG SER A 30 -13.117 13.966 -11.871 1.00 0.00 O 238 | ATOM 469 N LYS A 31 -9.748 14.213 -10.382 1.00 0.00 N 239 | ATOM 470 CA LYS A 31 -8.615 13.307 -10.539 1.00 0.00 C 240 | ATOM 471 C LYS A 31 -8.281 12.483 -9.299 1.00 0.00 C 241 | ATOM 472 O LYS A 31 -8.668 12.813 -8.171 1.00 0.00 O 242 | ATOM 473 CB LYS A 31 -7.371 14.102 -10.963 1.00 0.00 C 243 | ATOM 474 CG LYS A 31 -6.900 15.139 -9.938 1.00 0.00 C 244 | ATOM 475 CD LYS A 31 -5.709 15.940 -10.456 1.00 0.00 C 245 | ATOM 476 CE LYS A 31 -5.215 16.947 -9.418 1.00 0.00 C 246 | ATOM 477 NZ LYS A 31 -4.072 17.761 -9.931 1.00 0.00 N 247 | ATOM 491 N ILE A 32 -7.489 11.436 -9.519 1.00 0.00 N 248 | ATOM 492 CA ILE A 32 -6.959 10.585 -8.458 1.00 0.00 C 249 | ATOM 493 C ILE A 32 -5.485 10.867 -8.352 1.00 0.00 C 250 | ATOM 494 O ILE A 32 -4.791 10.889 -9.369 1.00 0.00 O 251 | ATOM 495 CB ILE A 32 -7.114 9.087 -8.789 1.00 0.00 C 252 | ATOM 
496 CG1 ILE A 32 -8.554 8.771 -9.152 1.00 0.00 C 253 | ATOM 497 CG2 ILE A 32 -6.615 8.198 -7.598 1.00 0.00 C 254 | ATOM 498 CD1 ILE A 32 -9.572 9.045 -8.116 1.00 0.00 C 255 | ATOM 510 N ALA A 33 -4.994 11.121 -7.154 1.00 0.00 N 256 | ATOM 511 CA ALA A 33 -3.574 11.357 -7.015 1.00 0.00 C 257 | ATOM 512 C ALA A 33 -2.874 10.014 -6.992 1.00 0.00 C 258 | ATOM 513 O ALA A 33 -3.216 9.168 -6.159 1.00 0.00 O 259 | ATOM 514 CB ALA A 33 -3.297 12.132 -5.757 1.00 0.00 C 260 | ATOM 520 N THR A 34 -1.910 9.812 -7.889 1.00 0.00 N 261 | ATOM 521 CA THR A 34 -1.221 8.532 -7.971 1.00 0.00 C 262 | ATOM 522 C THR A 34 0.287 8.676 -7.951 1.00 0.00 C 263 | ATOM 523 O THR A 34 0.837 9.773 -8.107 1.00 0.00 O 264 | ATOM 524 CB THR A 34 -1.572 7.780 -9.264 1.00 0.00 C 265 | ATOM 525 OG1 THR A 34 -1.027 8.485 -10.394 1.00 0.00 O 266 | ATOM 526 CG2 THR A 34 -3.081 7.659 -9.417 1.00 0.00 C 267 | ATOM 534 N ASP A 35 0.946 7.541 -7.783 1.00 0.00 N 268 | ATOM 535 CA ASP A 35 2.384 7.406 -7.836 1.00 0.00 C 269 | ATOM 536 C ASP A 35 2.793 7.280 -9.301 1.00 0.00 C 270 | ATOM 537 O ASP A 35 1.939 7.274 -10.196 1.00 0.00 O 271 | ATOM 538 CB ASP A 35 2.810 6.178 -7.010 1.00 0.00 C 272 | ATOM 539 CG ASP A 35 4.262 6.217 -6.498 1.00 0.00 C 273 | ATOM 540 OD1 ASP A 35 5.071 6.886 -7.108 1.00 0.00 O 274 | ATOM 541 OD2 ASP A 35 4.541 5.575 -5.522 1.00 0.00 O 275 | ATOM 546 N GLU A 36 4.086 7.145 -9.552 1.00 0.00 N 276 | ATOM 547 CA GLU A 36 4.606 7.073 -10.916 1.00 0.00 C 277 | ATOM 548 C GLU A 36 4.040 5.897 -11.703 1.00 0.00 C 278 | ATOM 549 O GLU A 36 3.826 5.995 -12.910 1.00 0.00 O 279 | ATOM 550 CB GLU A 36 6.132 6.973 -10.903 1.00 0.00 C 280 | ATOM 551 CG GLU A 36 6.845 8.240 -10.448 1.00 0.00 C 281 | ATOM 552 CD GLU A 36 8.344 8.095 -10.431 1.00 0.00 C 282 | ATOM 553 OE1 GLU A 36 8.820 7.012 -10.687 1.00 0.00 O 283 | ATOM 554 OE2 GLU A 36 9.013 9.065 -10.166 1.00 0.00 O 284 | ATOM 561 N ASN A 37 3.780 4.784 -11.026 1.00 0.00 N 285 | ATOM 562 CA ASN A 37 3.262 3.591 -11.678 1.00 0.00 C 286 | ATOM 563 C ASN A 37 1.732 3.509 -11.725 1.00 0.00 C 287 | ATOM 564 O ASN A 37 1.181 2.464 -12.075 1.00 0.00 O 288 | ATOM 565 CB ASN A 37 3.848 2.367 -11.011 1.00 0.00 C 289 | ATOM 566 CG ASN A 37 3.428 2.234 -9.575 1.00 0.00 C 290 | ATOM 567 OD1 ASN A 37 2.503 2.909 -9.082 1.00 0.00 O 291 | ATOM 568 ND2 ASN A 37 4.112 1.366 -8.869 1.00 0.00 N 292 | ATOM 575 N GLY A 38 1.036 4.588 -11.355 1.00 0.00 N 293 | ATOM 576 CA GLY A 38 -0.422 4.612 -11.383 1.00 0.00 C 294 | ATOM 577 C GLY A 38 -1.093 4.204 -10.069 1.00 0.00 C 295 | ATOM 578 O GLY A 38 -2.308 4.355 -9.920 1.00 0.00 O 296 | ATOM 582 N LYS A 39 -0.328 3.678 -9.116 1.00 0.00 N 297 | ATOM 583 CA LYS A 39 -0.898 3.279 -7.834 1.00 0.00 C 298 | ATOM 584 C LYS A 39 -1.445 4.483 -7.070 1.00 0.00 C 299 | ATOM 585 O LYS A 39 -0.750 5.486 -6.904 1.00 0.00 O 300 | ATOM 586 CB LYS A 39 0.151 2.550 -6.989 1.00 0.00 C 301 | ATOM 587 CG LYS A 39 -0.353 2.010 -5.653 1.00 0.00 C 302 | ATOM 588 CD LYS A 39 0.739 1.224 -4.926 1.00 0.00 C 303 | ATOM 589 CE LYS A 39 0.241 0.694 -3.586 1.00 0.00 C 304 | ATOM 590 NZ LYS A 39 1.293 -0.082 -2.865 1.00 0.00 N 305 | ATOM 604 N SER A 40 -2.677 4.388 -6.576 1.00 0.00 N 306 | ATOM 605 CA SER A 40 -3.268 5.490 -5.814 1.00 0.00 C 307 | ATOM 606 C SER A 40 -2.516 5.832 -4.540 1.00 0.00 C 308 | ATOM 607 O SER A 40 -2.092 4.945 -3.795 1.00 0.00 O 309 | ATOM 608 CB SER A 40 -4.705 5.197 -5.464 1.00 0.00 C 310 | ATOM 609 OG SER A 40 -5.210 6.183 -4.590 1.00 0.00 O 311 | ATOM 615 N LYS A 41 -2.420 7.134 -4.258 1.00 0.00 N 312 | ATOM 616 CA LYS A 41 -1.789 7.649 -3.042 
1.00  0.00           C
313 | ATOM    617  C   LYS A  41      -2.785   7.821  -1.893  1.00  0.00           C
314 | ATOM    618  O   LYS A  41      -2.442   8.363  -0.842  1.00  0.00           O
315 | ATOM    619  CB  LYS A  41      -1.045   8.957  -3.310  1.00  0.00           C
316 | ATOM    620  CG  LYS A  41       0.129   8.766  -4.242  1.00  0.00           C
317 | ATOM    621  CD  LYS A  41       1.075   9.969  -4.320  1.00  0.00           C
318 | ATOM    622  CE  LYS A  41       0.466  11.157  -5.033  1.00  0.00           C
319 | ATOM    623  NZ  LYS A  41       1.508  12.156  -5.422  1.00  0.00           N
320 | ATOM    637  N   GLY A  42      -4.027   7.380  -2.104  1.00  0.00           N
321 | ATOM    638  CA  GLY A  42      -5.062   7.464  -1.079  1.00  0.00           C
322 | ATOM    639  C   GLY A  42      -5.860   8.764  -1.046  1.00  0.00           C
323 | ATOM    640  O   GLY A  42      -6.501   9.069  -0.026  1.00  0.00           O
324 | ATOM    644  N   PHE A  43      -5.795   9.567  -2.103  1.00  0.00           N
325 | ATOM    645  CA  PHE A  43      -6.545  10.807  -2.121  1.00  0.00           C
326 | ATOM    646  C   PHE A  43      -6.832  11.269  -3.536  1.00  0.00           C
327 | ATOM    647  O   PHE A  43      -6.225  10.800  -4.506  1.00  0.00           O
328 | ATOM    648  CB  PHE A  43      -5.825  11.898  -1.317  1.00  0.00           C
329 | ATOM    649  CG  PHE A  43      -4.514  12.358  -1.809  1.00  0.00           C
330 | ATOM    650  CD1 PHE A  43      -4.430  13.484  -2.610  1.00  0.00           C
331 | ATOM    651  CD2 PHE A  43      -3.354  11.699  -1.457  1.00  0.00           C
332 | ATOM    652  CE1 PHE A  43      -3.210  13.949  -3.029  1.00  0.00           C
333 | ATOM    653  CE2 PHE A  43      -2.139  12.155  -1.883  1.00  0.00           C
334 | ATOM    654  CZ  PHE A  43      -2.064  13.287  -2.667  1.00  0.00           C
335 | ATOM    664  N   GLY A  44      -7.742  12.219  -3.642  1.00  0.00           N
336 | ATOM    665  CA  GLY A  44      -8.098  12.781  -4.933  1.00  0.00           C
337 | ATOM    666  C   GLY A  44      -8.795  14.115  -4.796  1.00  0.00           C
338 | ATOM    667  O   GLY A  44      -8.947  14.654  -3.693  1.00  0.00           O
339 | ATOM    671  N   PHE A  45      -9.214  14.649  -5.932  1.00  0.00           N
340 | ATOM    672  CA  PHE A  45      -9.828  15.966  -5.944  1.00  0.00           C
341 | ATOM    673  C   PHE A  45     -11.043  16.056  -6.845  1.00  0.00           C
342 | ATOM    674  O   PHE A  45     -11.049  15.492  -7.949  1.00  0.00           O
343 | ATOM    675  CB  PHE A  45      -8.805  16.968  -6.480  1.00  0.00           C
344 | ATOM    676  CG  PHE A  45      -7.515  17.035  -5.711  1.00  0.00           C
345 | ATOM    677  CD1 PHE A  45      -6.482  16.140  -6.009  1.00  0.00           C
346 | ATOM    678  CD2 PHE A  45      -7.306  17.981  -4.733  1.00  0.00           C
347 | ATOM    679  CE1 PHE A  45      -5.291  16.184  -5.343  1.00  0.00           C
348 | ATOM    680  CE2 PHE A  45      -6.096  18.033  -4.069  1.00  0.00           C
349 | ATOM    681  CZ  PHE A  45      -5.092  17.132  -4.372  1.00  0.00           C
350 | ATOM    691  N   VAL A  46     -12.038  16.824  -6.403  1.00  0.00           N
351 | ATOM    692  CA  VAL A  46     -13.174  17.164  -7.258  1.00  0.00           C
352 | ATOM    693  C   VAL A  46     -13.446  18.664  -7.270  1.00  0.00           C
353 | ATOM    694  O   VAL A  46     -13.779  19.262  -6.250  1.00  0.00           O
354 | ATOM    695  CB  VAL A  46     -14.454  16.385  -6.847  1.00  0.00           C
355 | ATOM    696  CG1 VAL A  46     -15.619  16.830  -7.674  1.00  0.00           C
356 | ATOM    697  CG2 VAL A  46     -14.253  14.891  -7.087  1.00  0.00           C
357 | ATOM    707  N   HIS A  47     -13.370  19.279  -8.441  1.00  0.00           N
358 | ATOM    708  CA  HIS A  47     -13.638  20.697  -8.503  1.00  0.00           C
359 | ATOM    709  C   HIS A  47     -15.002  20.873  -9.131  1.00  0.00           C
360 | ATOM    710  O   HIS A  47     -15.209  20.557 -10.313  1.00  0.00           O
361 | ATOM    711  CB  HIS A  47     -12.535  21.463  -9.250  1.00  0.00           C
362 | ATOM    712  CG  HIS A  47     -12.660  22.967  -9.106  1.00  0.00           C
363 | ATOM    713  ND1 HIS A  47     -11.736  23.855  -9.621  1.00  0.00           N
364 | ATOM    714  CD2 HIS A  47     -13.586  23.717  -8.468  1.00  0.00           C
365 | ATOM    715  CE1 HIS A  47     -12.091  25.087  -9.282  1.00  0.00           C
366 | ATOM    716  NE2 HIS A  47     -13.211  25.017  -8.553  1.00  0.00           N
367 | ATOM    724  N   PHE A  48     -15.935  21.330  -8.300  1.00  0.00           N
368 | ATOM    725  CA  PHE A  48     -17.319  21.527  -8.658  1.00  0.00           C
369 | ATOM    726  C   PHE A  48     -17.548  22.887  -9.278  1.00  0.00           C
370 | ATOM    727  O   PHE A  48     -16.890  23.862  -8.924  1.00  0.00           O
371 | ATOM    728  CB  PHE A  48     -18.214  21.414  -7.448  1.00  0.00           C
372 | ATOM    729  CG  PHE A  48     -18.419  20.032  -6.899  1.00  0.00           C
373 | ATOM    730  CD1 PHE A  48     -17.718  19.577  -5.798  1.00  0.00           C
374 | ATOM    731  CD2 PHE A  48     -19.356  19.197  -7.470  1.00  0.00           C
375 | ATOM    732  CE1 PHE A  48     -17.977  18.329  -5.276  1.00  0.00           C
376 | ATOM    733  CE2 PHE A  48     -19.605  17.966  -6.957  1.00  0.00           C
377 | ATOM    734  CZ  PHE A  48     -18.925  17.533  -5.855  1.00  0.00           C
378 | ATOM    744  N   GLU A  49     -18.524  22.953 -10.176  1.00  0.00           N
379 | ATOM    745  CA  GLU A  49     -18.910  24.221 -10.795  1.00  0.00           C
380 | ATOM    746  C   GLU A  49     -19.373  25.262  -9.780  1.00  0.00           C
381 | ATOM    747  O   GLU A  49     -19.100  26.453  -9.934  1.00  0.00           O
382 | ATOM    748  CB  GLU A  49     -19.978  24.001 -11.860  1.00  0.00           C
383 | ATOM    749  CG  GLU A  49     -20.380  25.282 -12.594  1.00  0.00           C
384 | ATOM    750  CD  GLU A  49     -21.290  25.037 -13.742  1.00  0.00           C
385 | ATOM    751  OE1 GLU A  49     -21.524  23.902 -14.047  1.00  0.00           O
386 | ATOM    752  OE2 GLU A  49     -21.752  25.986 -14.324  1.00  0.00           O
387 | ATOM    759  N   GLU A  50     -20.097  24.819  -8.752  1.00  0.00           N
388 | ATOM    760  CA  GLU A  50     -20.618  25.717  -7.731  1.00  0.00           C
389 | ATOM    761  C   GLU A  50     -20.099  25.346  -6.348  1.00  0.00           C
390 | ATOM    762  O   GLU A  50     -19.999  24.162  -6.000  1.00  0.00           O
391 | ATOM    763  CB  GLU A  50     -22.150  25.694  -7.721  1.00  0.00           C
392 | ATOM    764  CG  GLU A  50     -22.806  26.198  -9.009  1.00  0.00           C
393 | ATOM    765  CD  GLU A  50     -24.322  26.205  -8.935  1.00  0.00           C
394 | ATOM    766  OE1 GLU A  50     -24.845  25.747  -7.947  1.00  0.00           O
395 | ATOM    767  OE2 GLU A  50     -24.948  26.671  -9.861  1.00  0.00           O
396 | ATOM    774  N   GLU A  51     -19.900  26.371  -5.513  1.00  0.00           N
397 | ATOM    775  CA  GLU A  51     -19.410  26.173  -4.151  1.00  0.00           C
398 | ATOM    776  C   GLU A  51     -20.411  25.364  -3.351  1.00  0.00           C
399 | ATOM    777  O   GLU A  51     -20.023  24.564  -2.495  1.00  0.00           O
400 | ATOM    778  CB  GLU A  51     -19.190  27.522  -3.464  1.00  0.00           C
401 | ATOM    779  CG  GLU A  51     -18.103  28.374  -4.101  1.00  0.00           C
402 | ATOM    780  CD  GLU A  51     -16.781  27.703  -4.163  1.00  0.00           C
403 | ATOM    781  OE1 GLU A  51     -16.315  27.196  -3.170  1.00  0.00           O
404 | ATOM    782  OE2 GLU A  51     -16.238  27.662  -5.247  1.00  0.00           O
405 | ATOM    789  N   GLY A  52     -21.697  25.569  -3.643  1.00  0.00           N
406 | ATOM    790  CA  GLY A  52     -22.781  24.866  -2.988  1.00  0.00           C
407 | ATOM    791  C   GLY A  52     -22.695  23.356  -3.191  1.00  0.00           C
408 | ATOM    792  O   GLY A  52     -23.171  22.596  -2.342  1.00  0.00           O
409 | ATOM    796  N   ALA A  53     -22.167  22.908  -4.344  1.00  0.00           N
410 | ATOM    797  CA  ALA A  53     -22.053  21.486  -4.609  1.00  0.00           C
411 | ATOM    798  C   ALA A  53     -20.965  20.904  -3.742  1.00  0.00           C
412 | ATOM    799  O   ALA A  53     -21.128  19.831  -3.151  1.00  0.00           O
413 | ATOM    800  CB  ALA A  53     -21.743  21.260  -6.062  1.00  0.00           C
414 | ATOM    806  N   ALA A  54     -19.846  21.628  -3.651  1.00  0.00           N
415 | ATOM    807  CA  ALA A  54     -18.755  21.163  -2.813  1.00  0.00           C
416 | ATOM    808  C   ALA A  54     -19.228  21.099  -1.366  1.00  0.00           C
417 | ATOM    809  O   ALA A  54     -18.911  20.156  -0.638  1.00  0.00           O
418 | ATOM    810  CB  ALA A  54     -17.543  22.064  -2.962  1.00  0.00           C
419 | ATOM    816  N   LYS A  55     -20.026  22.091  -0.961  1.00  0.00           N
420 | ATOM    817  CA  LYS A  55     -20.544  22.138   0.391  1.00  0.00           C
421 | ATOM    818  C   LYS A  55     -21.406  20.914   0.670  1.00  0.00           C
422 | ATOM    819  O   LYS A  55     -21.192  20.235   1.681  1.00  0.00           O
423 | ATOM    820  CB  LYS A  55     -21.325  23.432   0.630  1.00  0.00           C
424 | ATOM    821  CG  LYS A  55     -21.573  23.788   2.117  1.00  0.00           C
425 | ATOM    822  CD  LYS A  55     -22.912  23.260   2.651  1.00  0.00           C
426 | ATOM    823  CE  LYS A  55     -23.184  23.781   4.071  1.00  0.00           C
427 | ATOM    824  NZ  LYS A  55     -24.448  23.229   4.647  1.00  0.00           N
428 | ATOM    838  N   GLU A  56     -22.385  20.621  -0.204  1.00  0.00           N
429 | ATOM    839  CA  GLU A  56     -23.239  19.464   0.042  1.00  0.00           C
430 | ATOM    840  C   GLU A  56     -22.412  18.194   0.115  1.00  0.00           C
431 | ATOM    841  O   GLU A  56     -22.657  17.344   0.977  1.00  0.00           O
432 | ATOM    842  CB  GLU A  56     -24.325  19.286  -1.010  1.00  0.00           C
433 | ATOM    843  CG  GLU A  56     -25.311  18.146  -0.650  1.00  0.00           C
434 | ATOM    844  CD  GLU A  56     -26.417  17.965  -1.634  1.00  0.00           C
435 | ATOM    845  OE1 GLU A  56     -26.442  18.667  -2.606  1.00  0.00           O
436 | ATOM    846  OE2 GLU A  56     -27.234  17.105  -1.419  1.00  0.00           O
437 | ATOM    853  N   ALA A  57     -21.407  18.063  -0.757  1.00  0.00           N
438 | ATOM    854  CA  ALA A  57     -20.570  16.879  -0.746  1.00  0.00           C
439 | ATOM    855  C   ALA A  57     -19.909  16.716   0.625  1.00  0.00           C
440 | ATOM    856  O   ALA A  57     -19.759  15.591   1.101  1.00  0.00           O
441 | ATOM    857  CB  ALA A  57     -19.538  16.952  -1.854  1.00  0.00           C
442 | ATOM    863  N   ILE A  58     -19.530  17.818   1.296  1.00  0.00           N
443 | ATOM    864  CA  ILE A  58     -18.942  17.688   2.629  1.00  0.00           C
444 | ATOM    865  C   ILE A  58     -19.992  17.163   3.608  1.00  0.00           C
445 | ATOM    866  O   ILE A  58     -19.756  16.195   4.342  1.00  0.00           O
446 | ATOM    867  CB  ILE A  58     -18.424  19.033   3.227  1.00  0.00           C
447 | ATOM    868  CG1 ILE A  58     -17.283  19.674   2.397  1.00  0.00           C
448 | ATOM    869  CG2 ILE A  58     -17.911  18.770   4.690  1.00  0.00           C
449 | ATOM    870  CD1 ILE A  58     -15.990  18.948   2.374  1.00  0.00           C
450 | ATOM    882  N   ASP A  59     -21.179  17.790   3.605  1.00  0.00           N
451 | ATOM    883  CA  ASP A  59     -22.237  17.394   4.538  1.00  0.00           C
452 | ATOM    884  C   ASP A  59     -22.648  15.937   4.375  1.00  0.00           C
453 | ATOM    885  O   ASP A  59     -22.963  15.258   5.354  1.00  0.00           O
454 | ATOM    886  CB  ASP A  59     -23.511  18.239   4.364  1.00  0.00           C
455 | ATOM    887  CG  ASP A  59     -23.469  19.694   4.904  1.00  0.00           C
456 | ATOM    888  OD1 ASP A  59     -22.587  20.051   5.643  1.00  0.00           O
457 | ATOM    889  OD2 ASP A  59     -24.384  20.433   4.575  1.00  0.00           O
458 | ATOM    894  N   ALA A  60     -22.659  15.468   3.132  1.00  0.00           N
459 | ATOM    895  CA  ALA A  60     -23.054  14.107   2.826  1.00  0.00           C
460 | ATOM    896  C   ALA A  60     -21.951  13.069   2.981  1.00  0.00           C
461 | ATOM    897  O   ALA A  60     -22.196  11.994   3.527  1.00  0.00           O
462 | ATOM    898  CB  ALA A  60     -23.531  14.054   1.400  1.00  0.00           C
463 | ATOM    904  N   LEU A  61     -20.744  13.353   2.492  1.00  0.00           N
464 | ATOM    905  CA  LEU A  61     -19.708  12.338   2.487  1.00  0.00           C
465 | ATOM    906  C   LEU A  61     -18.746  12.330   3.651  1.00  0.00           C
466 | ATOM    907  O   LEU A  61     -17.992  11.370   3.785  1.00  0.00           O
467 | ATOM    908  CB  LEU A  61     -18.864  12.454   1.225  1.00  0.00           C
468 | ATOM    909  CG  LEU A  61     -19.561  12.290  -0.119  1.00  0.00           C
469 | ATOM    910  CD1 LEU A  61     -18.555  12.528  -1.138  1.00  0.00           C
470 | ATOM    911  CD2 LEU A  61     -20.166  10.921  -0.270  1.00  0.00           C
471 | ATOM    923  N   ASN A  62     -18.732  13.342   4.508  1.00  0.00           N
472 | ATOM    924  CA  ASN A  62     -17.757  13.312   5.584  1.00  0.00           C
473 | ATOM    925  C   ASN A  62     -18.153  12.246   6.601  1.00  0.00           C
474 | ATOM    926  O   ASN A  62     -19.125  12.414   7.340  1.00  0.00           O
475 | ATOM    927  CB  ASN A  62     -17.652  14.688   6.242  1.00  0.00           C
476 | ATOM    928  CG  ASN A  62     -16.593  14.765   7.308  1.00  0.00           C
477 | ATOM    929  OD1 ASN A  62     -15.882  13.781   7.528  1.00  0.00           O
478 | ATOM    930  ND2 ASN A  62     -16.487  15.897   7.976  1.00  0.00           N
479 | ATOM    937  N   GLY A  63     -17.421  11.125   6.602  1.00  0.00           N
480 | ATOM    938  CA  GLY A  63     -17.717   9.981   7.458  1.00  0.00           C
481 | ATOM    939  C   GLY A  63     -18.501   8.862   6.755  1.00  0.00           C
482 | ATOM    940  O   GLY A  63     -18.898   7.879   7.386  1.00  0.00           O
483 | ATOM    944  N   MET A  64     -18.732   9.006   5.455  1.00  0.00           N
484 | ATOM    945  CA  MET A  64     -19.421   7.978   4.679  1.00  0.00           C
485 | ATOM    946  C   MET A  64     -18.374   6.966   4.262  1.00  0.00           C
486 | ATOM    947  O   MET A  64     -17.204   7.316   4.157  1.00  0.00           O
487 | ATOM    948  CB  MET A  64     -20.124   8.562   3.443  1.00  0.00           C
488 | ATOM    949  CG  MET A  64     -20.946   7.519   2.605  1.00  0.00           C
489 | ATOM    950  SD  MET A  64     -21.829   8.206   1.188  1.00  0.00           S
490 | ATOM    951  CE  MET A  64     -23.197   9.105   1.914  1.00  0.00           C
491 | ATOM    961  N   LEU A  65     -18.745   5.710   4.060  1.00  0.00           N
492 | ATOM    962  CA  LEU A  65     -17.758   4.761   3.559  1.00  0.00           C
493 | ATOM    963  C   LEU A  65     -17.901   4.546   2.070  1.00  0.00           C
494 | ATOM    964  O   LEU A  65     -19.010   4.372   1.552  1.00  0.00           O
495 | ATOM    965  CB  LEU A  65     -17.863   3.403   4.268  1.00  0.00           C
496 | ATOM    966  CG  LEU A  65     -17.088   3.231   5.607  1.00  0.00           C
497 | ATOM    967  CD1 LEU A  65     -17.606   4.186   6.691  1.00  0.00           C
498 | ATOM    968  CD2 LEU A  65     -17.196   1.790   6.043  1.00  0.00           C
499 | ATOM    980  N   LEU A  66     -16.768   4.558   1.383  1.00  0.00           N
500 | ATOM    981  CA  LEU A  66     -16.711   4.289  -0.042  1.00  0.00           C
501 | ATOM    982  C   LEU A  66     -16.004   2.952  -0.152  1.00  0.00           C
502 | ATOM    983  O   LEU A  66     -14.933   2.765   0.438  1.00  0.00           O
503 | ATOM    984  CB  LEU A  66     -15.969   5.396  -0.810  1.00  0.00           C
504 | ATOM    985  CG  LEU A  66     -16.501   6.869  -0.634  1.00  0.00           C
505 | ATOM    986  CD1 LEU A  66     -15.607   7.819  -1.460  1.00  0.00           C
506 | ATOM    987  CD2 LEU A  66     -17.964   6.988  -1.059  1.00  0.00           C
507 | ATOM    999  N   ASN A  67     -16.580   2.010  -0.878  1.00  0.00           N
508 | ATOM   1000  CA  ASN A  67     -16.026   0.664  -0.915  1.00  0.00           C
509 | ATOM   1001  C   ASN A  67     -15.858   0.188   0.540  1.00  0.00           C
510 | ATOM   1002  O   ASN A  67     -16.852   0.054   1.255  1.00  0.00           O
511 | ATOM   1003  CB  ASN A  67     -14.723   0.580  -1.710  1.00  0.00           C
512 | ATOM   1004  CG  ASN A  67     -14.894   0.927  -3.161  1.00  0.00           C
513 | ATOM   1005  OD1 ASN A  67     -16.025   0.971  -3.665  1.00  0.00           O
514 | ATOM   1006  ND2 ASN A  67     -13.802   1.120  -3.869  1.00  0.00           N
515 | ATOM   1013  N   GLY A  68     -14.629  -0.082   0.977  1.00  0.00           N
516 | ATOM   1014  CA  GLY A  68     -14.380  -0.569   2.332  1.00  0.00           C
517 | ATOM   1015  C   GLY A  68     -13.747   0.448   3.294  1.00  0.00           C
518 | ATOM   1016  O   GLY A  68     -13.262   0.056   4.359  1.00  0.00           O
519 | ATOM   1020  N   GLN A  69     -13.685   1.734   2.925  1.00  0.00           N
520 | ATOM   1021  CA  GLN A  69     -13.010   2.710   3.792  1.00  0.00           C
521 | ATOM   1022  C   GLN A  69     -13.760   4.024   4.022  1.00  0.00           C
522 | ATOM   1023  O   GLN A  69     -14.364   4.601   3.112  1.00  0.00           O
523 | ATOM   1024  CB  GLN A  69     -11.631   3.030   3.199  1.00  0.00           C
524 | ATOM   1025  CG  GLN A  69     -10.650   1.856   3.182  1.00  0.00           C
525 | ATOM   1026  CD  GLN A  69      -9.327   2.211   2.535  1.00  0.00           C
526 | ATOM   1027  OE1 GLN A  69      -9.219   3.233   1.858  1.00  0.00           O
527 | ATOM   1028  NE2 GLN A  69      -8.321   1.369   2.735  1.00  0.00           N
528 | ATOM   1037  N   GLU A  70     -13.690   4.517   5.264  1.00  0.00           N
529 | ATOM   1038  CA  GLU A  70     -14.299   5.800   5.618  1.00  0.00           C
530 | ATOM   1039  C   GLU A  70     -13.638   6.922   4.846  1.00  0.00           C
531 | ATOM   1040  O   GLU A  70     -12.414   7.048   4.873  1.00  0.00           O
532 | ATOM   1041  CB  GLU A  70     -14.178   6.039   7.120  1.00  0.00           C
533 | ATOM   1042  CG  GLU A  70     -14.868   7.286   7.620  1.00  0.00           C
534 | ATOM   1043  CD  GLU A  70     -14.755   7.443   9.107  1.00  0.00           C
535 | ATOM   1044  OE1 GLU A  70     -14.195   6.576   9.734  1.00  0.00           O
536 | ATOM   1045  OE2 GLU A  70     -15.218   8.431   9.619  1.00  0.00           O
537 | ATOM   1052  N   ILE A  71     -14.430   7.770   4.198  1.00  0.00           N
538 | ATOM   1053  CA  ILE A  71     -13.828   8.849   3.432  1.00  0.00           C
539 | ATOM   1054  C   ILE A  71     -13.945  10.142   4.211  1.00  0.00           C
540 | ATOM   1055  O   ILE A  71     -14.988  10.423   4.818  1.00  0.00           O
541 | ATOM   1056  CB  ILE A  71     -14.501   9.011   2.017  1.00  0.00           C
542 | ATOM   1057  CG1 ILE A  71     -13.656   9.902   1.076  1.00  0.00           C
543 | ATOM   1058  CG2 ILE A  71     -15.911   9.656   2.090  1.00  0.00           C
544 | ATOM   1059  CD1 ILE A  71     -12.437   9.208   0.584  1.00  0.00           C
545 | ATOM   1071  N   TYR A  72     -12.870  10.915   4.211  1.00  0.00           N
546 | ATOM   1072  CA  TYR A  72     -12.886  12.216   4.824  1.00  0.00           C
547 | ATOM   1073  C   TYR A  72     -12.804  13.276   3.757  1.00  0.00           C
548 | ATOM   1074  O   TYR A  72     -11.948  13.244   2.872  1.00  0.00           O
549 | ATOM   1075  CB  TYR A  72     -11.746  12.382   5.817  1.00  0.00           C
550 | ATOM   1076  CG  TYR A  72     -11.642  13.797   6.310  1.00  0.00           C
551 | ATOM   1077  CD1 TYR A  72     -12.565  14.295   7.200  1.00  0.00           C
552 | ATOM   1078  CD2 TYR A  72     -10.617  14.606   5.852  1.00  0.00           C
553 | ATOM   1079  CE1 TYR A  72     -12.471  15.598   7.625  1.00  0.00           C
554 | ATOM   1080  CE2 TYR A  72     -10.518  15.892   6.284  1.00  0.00           C
555 | ATOM   1081  CZ  TYR A  72     -11.443  16.398   7.165  1.00  0.00           C
556 | ATOM   1082  OH  TYR A  72     -11.344  17.701   7.596  1.00  0.00           O
557 | ATOM   1092  N   VAL A  73     -13.716  14.213   3.808  1.00  0.00           N
558 | ATOM   1093  CA  VAL A  73     -13.696  15.265   2.822  1.00  0.00           C
559 | ATOM   1094  C   VAL A  73     -13.552  16.623   3.483  1.00  0.00           C
560 | ATOM   1095  O   VAL A  73     -14.253  16.935   4.448  1.00  0.00           O
561 | ATOM   1096  CB  VAL A  73     -14.928  15.169   1.908  1.00  0.00           C
562 | ATOM   1097  CG1 VAL A  73     -14.859  13.907   1.018  1.00  0.00           C
563 | ATOM   1098  CG2 VAL A  73     -16.176  15.090   2.736  1.00  0.00           C
564 | ATOM   1108  N   ALA A  74     -12.641  17.427   2.942  1.00  0.00           N
565 | ATOM   1109  CA  ALA A  74     -12.378  18.749   3.493  1.00  0.00           C
566 | ATOM   1110  C   ALA A  74     -12.659  19.872   2.485  1.00  0.00           C
567 | ATOM   1111  O   ALA A  74     -12.449  19.693   1.279  1.00  0.00           O
568 | ATOM   1112  CB  ALA A  74     -10.933  18.846   3.937  1.00  0.00           C
569 | ATOM   1118  N   PRO A  75     -13.125  21.048   2.953  1.00  0.00           N
570 | ATOM   1119  CA  PRO A  75     -13.317  22.256   2.184  1.00  0.00           C
571 | ATOM   1120  C   PRO A  75     -12.095  23.155   2.331  1.00  0.00           C
572 | ATOM   1121  O   PRO A  75     -11.079  22.927   1.682  1.00  0.00           O
573 | ATOM   1122  OXT PRO A  75     -12.009  23.815   3.365  1.00  0.00           O
574 | ATOM   1123  CB  PRO A  75     -14.530  22.885   2.865  1.00  0.00           C
575 | ATOM   1124  CG  PRO A  75     -14.374  22.496   4.336  1.00  0.00           C
576 | ATOM   1125  CD  PRO A  75     -13.655  21.153   4.342  1.00  0.00           C
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | setuptools
2 | numpy>=1.23.2
3 | networkx>=2.6.3
4 | scipy>=1.9.1
5 | biopandas>=0.2.7
6 | torch>=1.11.0
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = metl-pretrained
3 | version = 0.1
4 | description = Mutational effect transfer learning pretrained models
5 | url = https://github.com/gitter-lab/metl-pretrained
6 | author = Sam Gelman
7 | author_email = sgelman2@wisc.edu
8 | license = MIT
9 |
10 | [options]
11 | packages=find:
12 | install_requires =
13 |     torch
14 |     numpy
15 |     scipy
16 |     biopandas
17 |     networkx
18 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 |
3 | if __name__ == '__main__':
4 |     setup()
5 |
--------------------------------------------------------------------------------
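
Both requirements.txt and the install_requires list in setup.cfg above pull in biopandas, which parses PDB coordinate files like the one shown into pandas data frames. The following is a minimal illustrative sketch, not code from this repository: the file path is a hypothetical example, while PandasPdb, its df attribute, and the column names are part of the standard biopandas API.

from biopandas.pdb import PandasPdb

# Parse a PDB file; read_pdb() returns the PandasPdb object itself.
# The path is a hypothetical example, not a file guaranteed by this package.
ppdb = PandasPdb().read_pdb("example_structure.pdb")

# Each ATOM record (like the lines above) becomes one row in a DataFrame,
# with fixed-width columns mapped to named fields.
atoms = ppdb.df["ATOM"]

# Select alpha-carbon coordinates, e.g. to compute pairwise residue distances.
ca = atoms.loc[atoms["atom_name"] == "CA",
               ["residue_number", "x_coord", "y_coord", "z_coord"]]
print(ca.head())

Installing the package itself with pip install . resolves the same set of dependencies through the install_requires section of setup.cfg shown above.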