├── boileroom ├── images │ ├── __init__.py │ ├── volumes.py │ └── esm.py ├── models │ ├── esm │ │ ├── __init__.py │ │ ├── linker.py │ │ ├── esm2.py │ │ └── esmfold.py │ └── __init__.py ├── convert.py ├── constants.py ├── __init__.py ├── utils.py └── base.py ├── .gitignore ├── tests ├── data │ ├── multimer-check.txt │ ├── esmfold_server_short.pdb │ └── esmfold_server_medium.pdb ├── test_utils.py ├── conftest.py └── esm │ ├── test_esm2.py │ └── test_esmfold.py ├── .github └── workflows │ ├── pypi-publish.yaml │ ├── version-bump.yaml │ └── python-checks.yaml ├── pyproject.toml ├── LICENSE ├── .pre-commit-config.yaml └── README.md /boileroom/images/__init__.py: -------------------------------------------------------------------------------- 1 | from .esm import esm_image 2 | 3 | __all__ = ["esm_image"] 4 | -------------------------------------------------------------------------------- /boileroom/images/volumes.py: -------------------------------------------------------------------------------- 1 | from modal import Volume 2 | 3 | model_weights = Volume.from_name("model-weights", create_if_missing=True) 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | boileroom.egg-info/ 3 | __pycache__/ 4 | *.pyc 5 | .pytest_cache/ 6 | .ruff_cache/ 7 | .mypy_cache/ 8 | .model_cache/ 9 | .venv/ 10 | dist/ 11 | -------------------------------------------------------------------------------- /boileroom/models/esm/__init__.py: -------------------------------------------------------------------------------- 1 | from .esmfold import ESMFold, get_esmfold 2 | from .esm2 import ESM2, get_esm2 3 | 4 | __all__ = ["ESMFold", "get_esmfold", "ESM2", "get_esm2"] 5 | -------------------------------------------------------------------------------- /boileroom/models/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .esm.esmfold import ESMFold, get_esmfold 2 | from .esm.esm2 import ESM2, get_esm2 3 | 4 | __all__ = [ 5 | "ESMFold", 6 | "ESM2", 7 | "get_esmfold", 8 | "get_esm2", 9 | ] 10 | -------------------------------------------------------------------------------- /tests/data/multimer-check.txt: -------------------------------------------------------------------------------- 1 | DRKIAGMDKGNGGTGAGMGI:AESHWCYEVQAESSNYPCLVPVKWGGNCQKDRQSPINIVTTKAKVDKKLGRFFFSGYDKKQTWTVQNNGHSVMMLLENKASISGGGLPAPYQAKQLHLHWSDLPYKGSEHSLDGEHFAMEMHIVHEKEKGTSRNVKEAQDPEDEIAVLAFLVEAGTQVNEGFQPLVEALSNIPKPEMSTTMAESSLLDLLPKEEKLRHYFRYLGSLTTPTCDEKVVWTVFREPIQLHREQILAFSQKLYYDKEQTVSMKDNVRPLQQLGQRTVIKS 2 | -------------------------------------------------------------------------------- /boileroom/images/esm.py: -------------------------------------------------------------------------------- 1 | """Modal image definition for ESM family of models.""" 2 | 3 | from modal import Image 4 | 5 | # Define the base image with all dependencies 6 | esm_image = ( 7 | Image.debian_slim(python_version="3.12") 8 | .apt_install("wget", "git") 9 | .pip_install("torch>=2.5.1,<2.7.0", "torch-tensorrt", "biotite>=1.0.1") 10 | .run_commands( 11 | "git clone https://github.com/jakublala/my_transformers.git", 12 | "cd my_transformers && pip install .", 13 | ) 14 | .env({"PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"}) 15 | ) 16 | -------------------------------------------------------------------------------- /.github/workflows/pypi-publish.yaml: -------------------------------------------------------------------------------- 1 | name: Publish Python package 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | jobs: 8 | build-and-publish: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v4 12 | 13 | - name: Set up Python 14 | uses: actions/setup-python@v5 15 | with: 16 | python-version: '3.12' 17 | 18 | - name: Install 
uv 19 | run: | 20 | curl -LsSf https://astral.sh/uv/install.sh | sh 21 | echo "$HOME/.cargo/bin" >> $GITHUB_PATH 22 | 23 | - name: Build and publish to PyPI 24 | run: | 25 | uv build --no-sources 26 | uv publish --token ${{ secrets.PYPI_API_TOKEN }} 27 | -------------------------------------------------------------------------------- /boileroom/convert.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Union 3 | from io import StringIO 4 | from biotite.structure import AtomArray 5 | from biotite.structure.io.pdb import PDBFile 6 | 7 | 8 | # TODO: for now, we can keep it in desprot, but long-term it makes sense to 9 | # have it as a default part of boileroom (as a function that can be run locally) 10 | def pdb_file_to_atomarray(pdb_path: Union[str, StringIO]) -> AtomArray: 11 | assert isinstance(pdb_path, (str, StringIO)), "pdb_path must be a string or StringIO" 12 | if isinstance(pdb_path, str): 13 | assert os.path.exists(pdb_path), "pdb_path must be a valid path" 14 | return PDBFile.read(pdb_path).get_structure(model=1) 15 | 16 | 17 | def pdb_string_to_atomarray(pdb_string: str) -> AtomArray: 18 | assert isinstance(pdb_string, str), "pdb_string must be a string" 19 | return pdb_file_to_atomarray(StringIO(pdb_string)) 20 | -------------------------------------------------------------------------------- /boileroom/constants.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | restype_1to3: Dict[str, str] = { 4 | "A": "ALA", 5 | "R": "ARG", 6 | "N": "ASN", 7 | "D": "ASP", 8 | "C": "CYS", 9 | "Q": "GLN", 10 | "E": "GLU", 11 | "G": "GLY", 12 | "H": "HIS", 13 | "I": "ILE", 14 | "L": "LEU", 15 | "K": "LYS", 16 | "M": "MET", 17 | "F": "PHE", 18 | "P": "PRO", 19 | "S": "SER", 20 | "T": "THR", 21 | "W": "TRP", 22 | "Y": "TYR", 23 | "V": "VAL", 24 | } 25 | 26 | 27 | # NB: restype_3to1 differs from Bio.PDB.protein_letters_3to1 by being a 
simple 28 | # 1-to-1 mapping of 3 letter names to one letter names. The latter contains 29 | # many more, and less common, three letter names as keys and maps many of these 30 | # to the same one letter name (including 'X' and 'U' which we don't use here). 31 | restype_3to1: Dict[str, str] = {v: k for k, v in restype_1to3.items()} 32 | -------------------------------------------------------------------------------- /boileroom/__init__.py: -------------------------------------------------------------------------------- 1 | import modal 2 | 3 | app = modal.App("boileroom") 4 | 5 | 6 | # Lazy import to avoid circular import 7 | def _import_models(): 8 | from .models import ESMFold, ESM2, get_esmfold, get_esm2 9 | 10 | return ESMFold, ESM2, get_esmfold, get_esm2 11 | 12 | 13 | # Make these available at module level 14 | def __getattr__(name): 15 | if name in ["ESMFold", "ESM2", "get_esmfold", "get_esm2"]: 16 | ESMFold, ESM2, get_esmfold, get_esm2 = _import_models() 17 | globals().update({"ESMFold": ESMFold, "ESM2": ESM2, "get_esmfold": get_esmfold, "get_esm2": get_esm2}) 18 | return globals()[name] 19 | raise AttributeError(f"module {__name__!r} has no attribute {name!r}") 20 | 21 | 22 | def __dir__(): 23 | return sorted(list(globals().keys()) + ["ESMFold", "ESM2", "get_esmfold", "get_esm2"]) 24 | 25 | 26 | __all__ = [ 27 | "ESMFold", 28 | "ESM2", 29 | "get_esmfold", 30 | "get_esm2", 31 | ] 32 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from boileroom.utils import validate_sequence, format_time 4 | 5 | 6 | def test_validate_sequence(test_sequences: dict[str, str]): 7 | """Test sequence validation.""" 8 | # Valid sequences 9 | assert validate_sequence(test_sequences["short"]) is True 10 | assert validate_sequence(test_sequences["medium"]) is True 11 | 12 | # Invalid sequences 13 | with 
pytest.raises(ValueError): 14 | validate_sequence(test_sequences["invalid"]) 15 | with pytest.raises(ValueError): 16 | validate_sequence("NOT A SEQUENCE") 17 | 18 | 19 | def test_format_time(): 20 | """Test time formatting.""" 21 | assert format_time(30) == "30s", f"Expected '30s', got {format_time(30)}" 22 | assert format_time(90) == "1m 30s", f"Expected '1m 30s', got {format_time(90)}" 23 | assert format_time(3600) == "1h", f"Expected '1h', got {format_time(3600)}" 24 | assert format_time(3661) == "1h 1m 1s", f"Expected '1h 1m 1s', got {format_time(3661)}" 25 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "boileroom" 7 | version = "0.2.1" 8 | authors = [ 9 | { name="Jakub Lála", email="jakublala@gmail.com" }, 10 | ] 11 | description = "Protein prediction models with Modal" 12 | readme = "README.md" 13 | requires-python = ">=3.11" 14 | classifiers = [ 15 | "Programming Language :: Python :: 3", 16 | "License :: OSI Approved :: MIT License", 17 | "Operating System :: OS Independent", 18 | ] 19 | dependencies = [ 20 | "modal>=0.73.12", 21 | "numpy>=2.2.2", 22 | "biotite>=1.0.1", 23 | "torch>=2.5.1,<2.7.0", 24 | "deprecated>=1.2.14", 25 | ] 26 | [project.optional-dependencies] 27 | dev = [ 28 | "pre-commit>=4.1.0", 29 | "pytest>=8.3.4", 30 | "pytest-xdist>=3.6.1", 31 | "pytest-mock>=3.14.0", 32 | ] 33 | local = [ 34 | "transformers>=4.49.0", 35 | ] 36 | 37 | [project.urls] 38 | Homepage = "https://github.com/jakublala/boileroom" 39 | 40 | [tool.hatch.metadata] 41 | allow-direct-references = true 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Jakub Lála 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/mirrors-mypy 3 | rev: v1.5.1 4 | hooks: 5 | - id: mypy 6 | exclude: &exclude_patterns | 7 | (?x)^.*\.pdb$ 8 | |^scripts/.*$ 9 | 10 | - repo: https://github.com/pre-commit/pre-commit-hooks 11 | rev: v4.4.0 12 | hooks: 13 | - id: trailing-whitespace 14 | exclude: *exclude_patterns 15 | - id: end-of-file-fixer 16 | exclude: *exclude_patterns 17 | - id: check-yaml 18 | exclude: *exclude_patterns 19 | - id: check-json 20 | exclude: *exclude_patterns 21 | - id: check-added-large-files 22 | exclude: *exclude_patterns 23 | - id: detect-aws-credentials 24 | args: ["--allow-missing-credentials"] 25 | exclude: *exclude_patterns 26 | 27 | - repo: https://github.com/charliermarsh/ruff-pre-commit 28 | rev: v0.8.6 29 | hooks: 30 | - id: ruff 31 | args: ["--fix", "--line-length", "120"] 32 | exclude: *exclude_patterns 33 | - id: ruff-format 34 | args: ["--line-length", "120"] 35 | exclude: *exclude_patterns 36 | -------------------------------------------------------------------------------- /.github/workflows/version-bump.yaml: -------------------------------------------------------------------------------- 1 | name: Require Version Bump 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | check-version: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout PR 13 | uses: actions/checkout@v4 14 | 15 | - name: Get version from PR branch 16 | id: pr_version 17 | run: | 18 | VERSION=$(grep -Po '(?<=version = ")[^"]+' pyproject.toml) 19 | echo "version=$VERSION" >> $GITHUB_OUTPUT 20 | 21 | - name: Checkout main 22 | uses: actions/checkout@v4 23 | with: 24 | ref: main 25 | 26 | - name: Get version from main 27 | id: main_version 28 | run: | 29 | VERSION=$(grep -Po '(?<=version = ")[^"]+' pyproject.toml) 30 | echo 
"version=$VERSION" >> $GITHUB_OUTPUT 31 | 32 | - name: Compare versions 33 | run: | 34 | if [ "${{ steps.pr_version.outputs.version }}" == "${{ steps.main_version.outputs.version }}" ]; then 35 | echo "❌ Version has not been bumped. Please update the version in pyproject.toml." 36 | exit 1 37 | else 38 | echo "✅ Version bump detected." 39 | fi 40 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """Pytest configuration for the boileroom package.""" 2 | 3 | import os 4 | import pathlib 5 | import pytest 6 | 7 | 8 | def pytest_addoption(parser): 9 | parser.addoption( 10 | "--backend", 11 | action="store", 12 | default="modal", 13 | choices=("modal", "local"), 14 | help="Execution backend for models in tests: modal (default) or local", 15 | ) 16 | 17 | 18 | @pytest.fixture(autouse=True, scope="session") 19 | def model_dir(): 20 | os.environ["MODEL_DIR"] = str(pathlib.Path(__file__).parent.parent / ".model_cache") 21 | 22 | 23 | @pytest.fixture 24 | def run_backend(request): 25 | mode = request.config.getoption("--backend") 26 | 27 | def select(method): 28 | # method is e.g. 
model.fold or model.embed 29 | return getattr(method, "local" if mode == "local" else "remote") 30 | 31 | return select 32 | 33 | 34 | @pytest.fixture 35 | def test_sequences() -> dict[str, str]: 36 | return { 37 | "short": "MLKNVHVLVLGAGDVGSVVVRLLEK", 38 | "medium": "MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKT", 39 | "invalid": "MALWMRLLPX123LLALWGPD", 40 | "multimer": ( 41 | "MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKT:" 42 | "MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKT" 43 | ), 44 | } 45 | 46 | 47 | @pytest.fixture 48 | def data_dir() -> pathlib.Path: 49 | return pathlib.Path(__file__).parent / "data" 50 | 51 | 52 | @pytest.fixture(params=[10, 25, 50]) 53 | def glycine_linker(request) -> str: 54 | return "G" * request.param 55 | -------------------------------------------------------------------------------- /.github/workflows/python-checks.yaml: -------------------------------------------------------------------------------- 1 | name: Python Checks 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | lint-checks: 11 | runs-on: "ubuntu-latest" 12 | steps: 13 | - uses: actions/checkout@v4 14 | 15 | - name: Set up Python 16 | uses: actions/setup-python@v5 17 | with: 18 | python-version: '3.12' 19 | 20 | - name: Install uv 21 | run: | 22 | curl -LsSf https://astral.sh/uv/install.sh | sh 23 | echo "$HOME/.cargo/bin" >> $GITHUB_PATH 24 | 25 | - name: Install dependencies 26 | run: uv sync --extra dev 27 | 28 | - name: Run pre-commit 29 | run: uv run pre-commit run --all-files 30 | 31 | unit-tests: 32 | runs-on: "ubuntu-latest" 33 | steps: 34 | - uses: actions/checkout@v4 35 | 36 | - name: Set up Python 37 | uses: actions/setup-python@v5 38 | with: 39 | python-version: '3.12' 40 | 41 | - name: Install uv 42 | run: | 43 | curl -LsSf https://astral.sh/uv/install.sh | sh 44 | echo "$HOME/.cargo/bin" >> $GITHUB_PATH 45 | 46 | - name: Install dependencies 47 | run: uv sync --extra dev 48 | 
49 | - name: Install the package 50 | run: uv pip install -e . 51 | 52 | - name: Authenticate Modal 53 | shell: bash -l {0} 54 | run: uv run modal token set --token-id ${{ secrets.MODAL_API_TOKEN_ID }} --token-secret ${{ secrets.MODAL_API_TOKEN_SECRET }} 55 | 56 | - name: Run tests 57 | shell: bash -l {0} 58 | run: uv run pytest -n auto 59 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # boileroom: serverless protein prediction models 2 | 3 | [![Python 3.12](https://img.shields.io/badge/python-3.12-blue.svg)](https://www.python.org/downloads/) 4 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 5 | [![PyPI version](https://img.shields.io/pypi/v/boileroom.svg)](https://pypi.org/project/boileroom/) 6 | [![GitHub last commit](https://img.shields.io/github/last-commit/jakublala/boileroom.svg)](https://github.com/jakublala/boileroom/commits/main) 7 | [![GitHub issues](https://img.shields.io/github/issues/jakublala/boileroom.svg)](https://github.com/jakublala/boileroom/issues) 8 | 9 | `boileroom` is a Python package that provides a unified interface to various protein prediction models, running them efficiently on Modal's serverless infrastructure. 10 | 11 | ## Features 12 | 13 | - 🚀 Serverless execution of protein models 14 | - 🔄 Unified API across different models 15 | - 🎯 Production-ready with GPU acceleration 16 | - 📦 Easy installation and deployment 17 | 18 | ## Installation 19 | 20 | 1. Install the package using pip: 21 | 22 | ```bash 23 | pip install boileroom 24 | ``` 25 | 26 | 2. 
Set up Modal credentials (if you haven't already): 27 | 28 | ```bash 29 | modal token new 30 | ``` 31 | 32 | ## Quick Start 33 | 34 | ```python 35 | from boileroom import app, ESMFold 36 | 37 | # Initialize the model 38 | model = ESMFold() 39 | 40 | # Predict structure for a protein sequence 41 | sequence = "MLKNVHVLVLGAGDVGSVVVRLLEK" 42 | with app.run(): 43 | result = model.fold.remote([sequence]) 44 | 45 | # Access prediction results 46 | coordinates = result.positions 47 | confidence = result.plddt 48 | ``` 49 | 50 | ## Available Models 51 | 52 | | Model | Status | Description | Reference | 53 | |------------|--------|------------------------------------------------|--------------------------------------------------------| 54 | | ESMFold | ✅ | Fast protein structure prediction | [Facebook (now Meta)](https://github.com/facebookresearch/esm) | 55 | | ESM-2 | ✅ | MSA-free embedding model | [Facebook (now Meta)](https://github.com/facebookresearch/esm) | 56 | 57 | ## Development 58 | 59 | 1. Clone the repository: 60 | 61 | ```bash 62 | git clone https://github.com/jakublala/boileroom 63 | cd boileroom 64 | ``` 65 | 66 | 2. Install development dependencies using `uv`: 67 | 68 | ```bash 69 | curl -LsSf https://astral.sh/uv/install.sh | sh 70 | uv python install 3.12 71 | uv sync 72 | ``` 73 | 74 | 3. Run tests: 75 | 76 | ```bash 77 | uv run pytest 78 | ``` 79 | 80 | or only one test that's more verbose and shows print statements: 81 | 82 | ```bash 83 | uv run python -m pytest tests/test_basic.py::test_esmfold_batch -v -s 84 | ``` 85 | 86 | ## License 87 | 88 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 
89 | 90 | ## Citation 91 | 92 | If you use `boileroom` in your research, please cite: 93 | 94 | ```bibtex 95 | @software{boileroom2025, 96 | author = {Lála, Jakub}, 97 | title = {boileroom: serverless protein prediction models}, 98 | year = {2025}, 99 | publisher = {GitHub}, 100 | url = {https://github.com/jakublala/boileroom} 101 | } 102 | ``` 103 | 104 | ## Acknowledgments 105 | 106 | - [Modal Labs](https://modal.com/) for the serverless infrastructure 107 | - The teams behind ESMFold, AlphaFold, and other protein prediction models 108 | -------------------------------------------------------------------------------- /boileroom/utils.py: -------------------------------------------------------------------------------- 1 | """Utility functions and constants for the BoilerRoom package.""" 2 | 3 | import os 4 | import time 5 | import logging 6 | from pathlib import Path 7 | from typing import Dict, Optional 8 | 9 | 10 | # Time constants 11 | SECONDS = 1 12 | MINUTES = 60 13 | HOURS = 60 * MINUTES 14 | 15 | # Directory constants 16 | MODEL_DIR = "/mnt/models" 17 | CACHE_DIR = os.path.expanduser("~/.cache/boileroom") 18 | 19 | # Amino acid constants 20 | AMINO_ACIDS = "ACDEFGHIKLMNPQRSTVWY" 21 | AMINO_ACID_DICT = {aa: i for i, aa in enumerate(AMINO_ACIDS)} 22 | VALID_AMINO_ACIDS = set(AMINO_ACIDS) # For faster lookups 23 | 24 | GPUS_AVAIL_ON_MODAL = ["T4", "L4", "A10G", "A100-40GB", "A100-80GB", "L40S", "H100"] 25 | 26 | 27 | def validate_sequence(sequence: str) -> bool: 28 | """Validate that a sequence contains only valid amino acids. 
29 | 30 | Args: 31 | sequence: A string of amino acids in single-letter code 32 | 33 | Returns: 34 | bool: True if sequence is valid 35 | 36 | Raises: 37 | ValueError: If sequence contains invalid characters 38 | """ 39 | sequence = sequence.replace(":", "") # remove any linkers first ":" 40 | invalid_chars = set(sequence) - VALID_AMINO_ACIDS 41 | # TODO: we should think whether there's not a cleaner way to throw an error on Modal 42 | # the traceback is otherwise quite messy and hard to debug 43 | if invalid_chars: 44 | raise ValueError(f"Invalid amino acid(s) in sequence: {sorted(invalid_chars)}") 45 | return True 46 | 47 | 48 | def ensure_cache_dir() -> Path: 49 | """Ensure the cache directory exists. 50 | 51 | Returns: 52 | Path: Path to cache directory 53 | """ 54 | cache_path = Path(CACHE_DIR) 55 | cache_path.mkdir(parents=True, exist_ok=True) 56 | return cache_path 57 | 58 | 59 | def format_time(seconds: float) -> str: 60 | """Format time in seconds to human readable string. 61 | 62 | Args: 63 | seconds: Time in seconds 64 | 65 | Returns: 66 | str: Formatted time string (e.g. "2h 30m 15s") 67 | """ 68 | hours = int(seconds // HOURS) 69 | minutes = int((seconds % HOURS) // MINUTES) 70 | secs = int(seconds % MINUTES) 71 | 72 | parts = [] 73 | if hours > 0: 74 | parts.append(f"{hours}h") 75 | if minutes > 0: 76 | parts.append(f"{minutes}m") 77 | if secs > 0 or not parts: 78 | parts.append(f"{secs}s") 79 | 80 | return " ".join(parts) 81 | 82 | 83 | def get_gpu_memory_info() -> Optional[Dict[str, int]]: 84 | """Get GPU memory information if available. 
85 | 86 | Returns: 87 | Optional[Dict[str, int]]: Dictionary with 'total' and 'free' memory in MB, 88 | or None if no GPU is available 89 | """ 90 | try: 91 | import torch 92 | 93 | if not torch.cuda.is_available(): 94 | return None 95 | 96 | device = torch.cuda.current_device() 97 | total = torch.cuda.get_device_properties(device).total_memory // (1024 * 1024) 98 | free = torch.cuda.memory_reserved(device) // (1024 * 1024) 99 | 100 | return {"total": total, "free": free, "used": total - free} 101 | except Exception as e: 102 | print(f"Error getting GPU memory info: {e}") 103 | return None 104 | 105 | 106 | class Timer: 107 | """Context manager for timing operations.""" 108 | 109 | def __init__(self, description: str): 110 | self.description = description 111 | self.duration = None 112 | 113 | def __enter__(self): 114 | self.start_time = time.perf_counter() 115 | return self 116 | 117 | def __exit__(self, *args): 118 | self.duration = time.perf_counter() - self.start_time 119 | logging.info(f"{self.description} completed in {self.duration:.2f} seconds") 120 | -------------------------------------------------------------------------------- /boileroom/models/esm/linker.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from typing import List 3 | 4 | from ...images import esm_image 5 | 6 | with esm_image.imports(): 7 | import torch 8 | import torch.nn.functional as F 9 | 10 | 11 | # --- Glycine linker and positional skip utilities --- 12 | def compute_position_ids(sequences: List[str], glycine_linker: str, position_ids_skip: int) -> torch.Tensor: 13 | """ 14 | Compute the position ids for the sequences. 15 | Parameters 16 | ---------- 17 | sequences: List of sequences, each containing chains separated by ":". 18 | glycine_linker: The glycine linker string used between chains represented as a string (e.g. "GGGG"). 19 | position_ids_skip: The number of positions to skip between chains. 
20 | Returns 21 | ------- 22 | torch.Tensor: The position ids for the sequences 23 | """ 24 | position_ids = [] 25 | for multimer_seq in sequences: 26 | multimer_position_ids = [] 27 | previous_chain_end = 0 28 | for chain_id, chain_seq in enumerate(multimer_seq.split(":")): 29 | intrachain_position_ids = np.arange(len(chain_seq)) 30 | if chain_id != 0: 31 | intrachain_position_ids = (intrachain_position_ids + (previous_chain_end + 1)) + position_ids_skip 32 | # add linker if not last chain 33 | if chain_id != len(multimer_seq.split(":")) - 1: 34 | linker_position_ids = np.arange(len(glycine_linker)) + intrachain_position_ids[-1] + 1 35 | intrachain_position_ids = np.concatenate([intrachain_position_ids, linker_position_ids]) 36 | previous_chain_end = intrachain_position_ids[-1] 37 | multimer_position_ids += intrachain_position_ids.tolist() 38 | position_ids.append(torch.tensor(multimer_position_ids)) 39 | # add padding to the position ids 40 | max_length = max(len(ids) for ids in position_ids) 41 | for i, pos_ids in enumerate(position_ids): 42 | position_ids[i] = torch.cat([pos_ids, torch.zeros(max_length - len(pos_ids), dtype=torch.long)]) 43 | return torch.stack(position_ids) 44 | 45 | 46 | def store_multimer_properties(_sequences: List[str], glycine_linker: str): 47 | """Store properties needed for multimer processing. 48 | Args: 49 | _sequences: List of sequences, each containing chains separated by ":" 50 | glycine_linker: The glycine linker string used between chains 51 | Returns: 52 | tuple[torch.Tensor, torch.Tensor, torch.Tensor]: 53 | - linker_map: tensor of shape (batch_size, sequence_length) where 0 indicates 54 | linker positions and 1 indicates chain positions 55 | - residue_index: tensor of shape (batch_size, sequence_length) containing 56 | residue indices that restart at 1 for each chain 57 | - chain_index: tensor of shape (batch_size, sequence_length) containing 58 | chain indices (0, 1, 2, etc.) 
59 | """ 60 | linker_map = [] 61 | residue_index = [] 62 | chain_index = [] 63 | assert len(_sequences) > 0, "Sequences must not be empty" 64 | for seq in _sequences: 65 | full_seq_len = len(seq.replace(":", glycine_linker)) 66 | seq_mask = torch.ones(full_seq_len, dtype=torch.long) 67 | res_index = torch.zeros(full_seq_len, dtype=torch.long) 68 | ch_index = torch.zeros(full_seq_len, dtype=torch.long) 69 | current_pos = 0 70 | chains = seq.split(":") 71 | for i, chain in enumerate(chains): 72 | ch_index[current_pos : current_pos + len(chain)] = i 73 | res_index[current_pos : current_pos + len(chain)] = torch.arange(0, len(chain)) 74 | current_pos += len(chain) 75 | if i < len(chains) - 1: 76 | seq_mask[current_pos : current_pos + len(glycine_linker)] = 0 77 | ch_index[current_pos : current_pos + len(glycine_linker)] = i 78 | res_index[current_pos : current_pos + len(glycine_linker)] = torch.arange( 79 | len(chain) + 1, len(chain) + len(glycine_linker) + 1 80 | ) 81 | current_pos += len(glycine_linker) 82 | linker_map.append(seq_mask) 83 | residue_index.append(res_index) 84 | chain_index.append(ch_index) 85 | linker_max_size = max(tensor.size(0) for tensor in linker_map) 86 | residue_index_max_size = max(tensor.size(0) for tensor in residue_index) 87 | chain_index_max_size = max(tensor.size(0) for tensor in chain_index) 88 | max_size = max(linker_max_size, residue_index_max_size, chain_index_max_size) 89 | padded_linker_map = [F.pad(tensor, (0, max_size - tensor.size(0)), value=-1) for tensor in linker_map] 90 | padded_residue_index = [F.pad(tensor, (0, max_size - tensor.size(0)), value=-1) for tensor in residue_index] 91 | padded_chain_index = [F.pad(tensor, (0, max_size - tensor.size(0)), value=-1) for tensor in chain_index] 92 | return ( 93 | torch.stack(padded_linker_map), 94 | torch.stack(padded_residue_index), 95 | torch.stack(padded_chain_index), 96 | ) 97 | 98 | 99 | def replace_glycine_linkers(sequences: List[str], glycine_linker: str) -> List[str]: 100 | 
return [multimer_seq.replace(":", glycine_linker) for multimer_seq in sequences] 101 | -------------------------------------------------------------------------------- /boileroom/base.py: -------------------------------------------------------------------------------- 1 | """Base classes and interfaces for BoilerRoom protein structure prediction models.""" 2 | 3 | import logging 4 | 5 | from abc import ABC, abstractmethod 6 | from dataclasses import dataclass 7 | from typing import Union, Sequence, Optional, Protocol, List 8 | 9 | import numpy as np 10 | 11 | from .utils import validate_sequence 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | @dataclass 17 | class PredictionMetadata: 18 | """Metadata about a protein structure prediction.""" 19 | 20 | model_name: str 21 | model_version: str 22 | prediction_time: Optional[float] # in seconds 23 | sequence_lengths: Optional[List[int]] 24 | 25 | 26 | class StructurePrediction(Protocol): 27 | """Protocol defining the minimum interface for structure prediction outputs.""" 28 | 29 | metadata: PredictionMetadata 30 | positions: np.ndarray # Atom positions 31 | pdb: Optional[list[str]] = None 32 | cif: Optional[list[str]] = None 33 | 34 | 35 | class EmbeddingPrediction(Protocol): 36 | """Protocol defining the minimum interface for embedding outputs.""" 37 | 38 | metadata: PredictionMetadata 39 | embeddings: np.ndarray # Atom positions 40 | 41 | 42 | class Algorithm(ABC): 43 | """Abstract base class for algorithms.""" 44 | 45 | DEFAULT_CONFIG: dict = {} 46 | 47 | def __init__(self, config: dict = {}) -> None: 48 | """Initialize the algorithm.""" 49 | self.config = {**self.DEFAULT_CONFIG, **config} 50 | self.name: str = self.__class__.__name__ 51 | self.version: str = "" # Should be overridden by implementations 52 | self.ready: bool = False 53 | 54 | @abstractmethod 55 | def _load(self) -> None: 56 | """Load the model and prepare it for prediction. 
57 | 58 | This method should handle: 59 | - Loading model weights 60 | - Moving model to appropriate device 61 | - Setting up any necessary preprocessing 62 | 63 | Raises: 64 | RuntimeError: If model loading fails 65 | """ 66 | raise NotImplementedError 67 | 68 | def update_config(self, config: dict) -> None: 69 | """ 70 | Update the config with the default values. 71 | 72 | This does not work with Modal and remote execution. Create a new instance instead. 73 | """ 74 | logger.warning("This does not work with Modal and remote execution. Create a new instance instead.") 75 | # TODO: Make this work smartly with remote Modal, calling _load() again, etc. and thus programmatically 76 | # updating the model if anything has changed 77 | self.config = {**self.config, **config} 78 | 79 | @staticmethod 80 | def _initialize_metadata(model_name: str, model_version: str) -> PredictionMetadata: 81 | """Initialize metadata for the prediction. 82 | 83 | Parameters 84 | ---------- 85 | model_name : str 86 | Name of the model 87 | model_version : str 88 | Version of the model 89 | 90 | Returns 91 | ------- 92 | PredictionMetadata 93 | Metadata for the prediction 94 | """ 95 | return PredictionMetadata( 96 | model_name=model_name, model_version=model_version, prediction_time=None, sequence_lengths=None 97 | ) 98 | 99 | 100 | class FoldingAlgorithm(Algorithm): 101 | """Abstract base class for protein structure prediction algorithms. 102 | 103 | This class defines the interface that all protein structure prediction models must implement. 104 | Each implementation should handle model loading, prediction, and cleanup appropriately. 
105 | 106 | Attributes: 107 | name (str): Name of the folding algorithm 108 | version (str): Version of the model being used 109 | ready (bool): Whether the model is loaded and ready for prediction 110 | """ 111 | 112 | @abstractmethod 113 | def fold(self, sequences: Union[str, Sequence[str]]) -> StructurePrediction: 114 | """Predict the structure for one or more protein sequences. 115 | 116 | Parameters 117 | ---------- 118 | sequences : Union[str, Sequence[str]] 119 | A single sequence string or list of sequence strings 120 | containing valid amino acid characters 121 | 122 | Returns 123 | ------- 124 | StructurePrediction 125 | Structure prediction output implementing the StructurePrediction protocol 126 | 127 | Raises: 128 | ValueError: If sequences are invalid 129 | RuntimeError: If prediction fails 130 | """ 131 | raise NotImplementedError 132 | 133 | def _validate_sequences(self, sequences: Union[str, Sequence[str]]) -> list[str]: 134 | """Validate input sequences and convert to list format. 135 | 136 | Parameters 137 | ---------- 138 | sequences : Union[str, Sequence[str]] 139 | Single sequence or list of sequences 140 | 141 | Returns 142 | ------- 143 | list[str] 144 | List of validated sequences 145 | 146 | Raises: 147 | ValueError: If any sequence contains invalid amino acids 148 | """ 149 | # Convert single sequence to list 150 | if isinstance(sequences, str): 151 | sequences = [sequences] 152 | 153 | # Validate each sequence and return as explicit list 154 | return [seq for seq in sequences if validate_sequence(seq)] 155 | 156 | def _compute_sequence_lengths(self, sequences: List[str]) -> List[int]: 157 | """ 158 | Compute the sequence lengths for multimer sequences. 159 | """ 160 | return [len(seq) - seq.count(":") for seq in sequences] 161 | 162 | def _prepare_multimer_sequences(self, sequences: List[str]) -> List[str]: 163 | """ 164 | Prepare multimer sequences for prediction. 165 | This method is model-specific and how they handle multimers. 
166 | 167 | Parameters 168 | ---------- 169 | sequences : List[str] 170 | List of protein sequences 171 | 172 | Returns 173 | ------- 174 | List[str] 175 | List of prepared sequences" 176 | """ 177 | raise NotImplementedError 178 | 179 | 180 | class EmbeddingAlgorithm(Algorithm): 181 | """Abstract base class for embedding algorithms.""" 182 | 183 | @abstractmethod 184 | def embed(self, sequences: Union[str, Sequence[str]]) -> EmbeddingPrediction: 185 | """Generate embeddings for one or more protein sequences. 186 | 187 | Parameters 188 | ---------- 189 | sequences : Union[str, Sequence[str]] 190 | A single sequence string or list of sequence strings 191 | containing valid amino acid characters 192 | 193 | Returns 194 | ------- 195 | EmbeddingPrediction 196 | Embedding output implementing the EmbeddingPrediction protocol 197 | 198 | Raises: 199 | ValueError: If sequences are invalid 200 | RuntimeError: If embedding generation fails 201 | """ 202 | raise NotImplementedError 203 | -------------------------------------------------------------------------------- /tests/esm/test_esm2.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | 4 | from boileroom import app, get_esm2 5 | 6 | 7 | @pytest.fixture 8 | def esm2_model_factory(): 9 | def _make_model(**kwargs): 10 | config = {**kwargs} 11 | 12 | if "15B" in config["model_name"]: 13 | model = get_esm2(gpu_type="A100-80GB", config=config) 14 | elif "3B" in config["model_name"]: 15 | model = get_esm2(gpu_type="A100-40GB", config=config) 16 | else: 17 | model = get_esm2(gpu_type="T4", config=config) 18 | 19 | return model 20 | 21 | return _make_model 22 | 23 | 24 | @pytest.mark.parametrize( 25 | "model_config", 26 | [ 27 | {"model_name": "esm2_t6_8M_UR50D", "latent_dim": 320, "num_layers": 6}, 28 | {"model_name": "esm2_t12_35M_UR50D", "latent_dim": 480, "num_layers": 12}, 29 | {"model_name": "esm2_t30_150M_UR50D", "latent_dim": 640, "num_layers": 
30}, 30 | {"model_name": "esm2_t33_650M_UR50D", "latent_dim": 1280, "num_layers": 33}, 31 | {"model_name": "esm2_t36_3B_UR50D", "latent_dim": 2560, "num_layers": 36}, 32 | # {"model_name": "esm2_t48_15B_UR50D", "latent_dim": 5120, "num_layers": 48}, 33 | ], 34 | ) 35 | def test_esm2_embed_basic(esm2_model_factory, model_config, run_backend): 36 | """Test ESM2 embedding.""" 37 | sequence = "MALWMRLLPLLALLALWGPDPAAA" 38 | 39 | with app.run(): 40 | model = esm2_model_factory(model_name=model_config["model_name"]) 41 | result = run_backend(model.embed)([sequence]) 42 | # +2 for the two extra tokens (start of sequence and end of sequence) 43 | assert result.embeddings.shape == (1, len(sequence), model_config["latent_dim"]) 44 | assert result.hidden_states is not None 45 | # +1 for the extra layer of the transformer ??? UNCLEAR WHY THIS IS THE CASE 46 | assert result.hidden_states.shape == ( 47 | 1, 48 | model_config["num_layers"] + 1, 49 | len(sequence), 50 | model_config["latent_dim"], 51 | ) 52 | del model 53 | 54 | 55 | def test_esm2_embed_hidden_states(esm2_model_factory, run_backend): 56 | """Test ESM2 embedding hidden states.""" 57 | with app.run(): 58 | sequence = "MALWMRLLPLLALLALWGPDPAAA" 59 | model = esm2_model_factory(model_name="esm2_t33_650M_UR50D", output_hidden_states=False) 60 | result = run_backend(model.embed)([sequence]) 61 | assert result.hidden_states is None 62 | del model 63 | 64 | 65 | def test_esm2_embed_multimer(esm2_model_factory, test_sequences, run_backend): 66 | """Test ESM2 embedding multimer functionality. 
67 | 68 | Tests various aspects of multimer handling: 69 | - Basic multimer embedding 70 | - Chain indices and residue indices 71 | - Padding mask 72 | - Hidden states (when enabled) 73 | - Different glycine linker lengths 74 | """ 75 | with app.run(): 76 | # Test with different glycine linker lengths 77 | for linker_length in [0, 10, 50]: 78 | model = esm2_model_factory( 79 | model_name="esm2_t33_650M_UR50D", 80 | output_hidden_states=True, 81 | glycine_linker="G" * linker_length, 82 | position_ids_skip=512, 83 | ) 84 | 85 | # Test with a simple multimer sequence 86 | sequence = test_sequences["multimer"] 87 | result = run_backend(model.embed)([sequence]) 88 | 89 | # Check basic shape 90 | expected_length = len(sequence.replace(":", "")) 91 | assert result.embeddings.shape == (1, expected_length, 1280), "Embedding shape mismatch" 92 | 93 | # Check chain indices 94 | assert result.chain_index is not None, "Chain index should be present" 95 | assert result.chain_index.shape == (1, expected_length), "Chain index shape mismatch" 96 | 97 | # First chain should be 0, second chain should be 1 98 | first_chain_length = len(sequence.split(":")[0]) 99 | assert np.all(result.chain_index[0, :first_chain_length] == 0), "First chain indices should be 0" 100 | assert np.all(result.chain_index[0, first_chain_length:] == 1), "Second chain indices should be 1" 101 | 102 | # Check residue indices 103 | assert result.residue_index is not None, "Residue index should be present" 104 | assert result.residue_index.shape == (1, expected_length), "Residue index shape mismatch" 105 | 106 | # Check hidden states 107 | assert result.hidden_states is not None, "Hidden states should be present" 108 | assert result.hidden_states.shape == (1, 34, expected_length, 1280), "Hidden states shape mismatch" 109 | 110 | # Test with a more complex multimer sequence 111 | complex_sequence = "MALWMRLLPLLALLALLAADASDASLLALWGPDPAAA:MADLLALWGPDPAAA:MALWMRLLPLLAADLLALWGPDPWGPDPAAA" 112 | result = 
run_backend(model.embed)([complex_sequence]) 113 | 114 | # Check basic shape for complex sequence 115 | expected_length = len(complex_sequence.replace(":", "")) 116 | assert result.embeddings.shape == (1, expected_length, 1280), "Complex sequence embedding shape mismatch" 117 | 118 | # Check chain indices for complex sequence 119 | assert result.chain_index.shape == (1, expected_length), "Complex sequence chain index shape mismatch" 120 | 121 | # First chain should be 0, second chain should be 1, third chain should be 2 122 | first_chain_length = len(complex_sequence.split(":")[0]) 123 | second_chain_length = len(complex_sequence.split(":")[1]) 124 | third_chain_length = len(complex_sequence.split(":")[2]) 125 | assert np.all(result.chain_index[0, :first_chain_length] == 0), "First chain indices should be 0" 126 | assert np.all( 127 | result.chain_index[0, first_chain_length : first_chain_length + second_chain_length] == 1 128 | ), "Second chain indices should be 1" 129 | assert np.all( 130 | result.chain_index[0, first_chain_length + second_chain_length :] == 2 131 | ), "Third chain indices should be 2" 132 | assert np.all( 133 | result.chain_index[0, first_chain_length + second_chain_length + third_chain_length :] == 3 134 | ), "Fourth chain indices should be 3" 135 | 136 | # Last test for a batched multimer, each sequence has different number of chains and length 137 | sequences = [ 138 | "AAA:CCC", # Very short 2-chain multimer 139 | test_sequences["short"], # Monomer (25 residues) 140 | "A" * 50 + ":" + "C" * 100 + ":" + "D" * 75, # Long 3-chain multimer with different chain lengths 141 | "M" * 10 + ":" + "K" * 10, # Small symmetric 2-chain multimer 142 | "M" * 1 + ":" + "Y" * 1, # Edge case: minimal 2-chain multimer (1 residue each) 143 | ] 144 | result = run_backend(model.embed)(sequences) 145 | assert result.embeddings.shape == ( 146 | len(sequences), 147 | max(len(seq.replace(":", "")) for seq in sequences), 148 | 1280, 149 | ), "Embedding shape mismatch" 
150 | assert result.chain_index.shape == ( 151 | len(sequences), 152 | max(len(seq.replace(":", "")) for seq in sequences), 153 | ), "Chain index shape mismatch" 154 | assert result.residue_index.shape == ( 155 | len(sequences), 156 | max(len(seq.replace(":", "")) for seq in sequences), 157 | ), "Residue index shape mismatch" 158 | assert result.hidden_states.shape == ( 159 | len(sequences), 160 | 34, 161 | max(len(seq.replace(":", "")) for seq in sequences), 162 | 1280, 163 | ), "Hidden states shape mismatch" 164 | 165 | for i, seq in enumerate(sequences): 166 | expected_length = len(seq.replace(":", "")) 167 | assert np.all(result.embeddings[i, :expected_length] != 0), "No padding should be 0" 168 | assert np.all(result.embeddings[i, expected_length:] == 0), "Padding should be 0" 169 | assert np.all(result.chain_index[i, :expected_length] != -1), "No padding should be -1" 170 | assert np.all(result.chain_index[i, expected_length:] == -1), "Padding should be -1" 171 | assert np.all(result.residue_index[i, :expected_length] != -1), "No padding should be -1" 172 | assert np.all(result.residue_index[i, expected_length:] == -1), "Padding should be -1" 173 | # Count the number of zeros in the non-padding region; allow up to 16 zeros due to possible sparsity 174 | num_zeros = np.sum(result.hidden_states[i, :, :expected_length] == 0) 175 | assert num_zeros < 16, f"Too many zeros ({num_zeros}) in non-padding hidden states" 176 | assert np.all(result.hidden_states[i, :, expected_length:] == 0), "Padding should be 0" 177 | del model 178 | -------------------------------------------------------------------------------- /boileroom/models/esm/esm2.py: -------------------------------------------------------------------------------- 1 | import modal 2 | import numpy as np 3 | import os 4 | from dataclasses import dataclass 5 | from typing import List, Union, Optional, TYPE_CHECKING 6 | 7 | import logging 8 | 9 | from ... 
import app 10 | from ...base import EmbeddingAlgorithm, EmbeddingPrediction, PredictionMetadata 11 | from ...images import esm_image 12 | from ...utils import MINUTES, MODEL_DIR, Timer 13 | from ...images.volumes import model_weights 14 | from .linker import compute_position_ids, store_multimer_properties, replace_glycine_linkers 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | if TYPE_CHECKING: 19 | from transformers.modeling_outputs import BaseModelOutputWithPoolingAndCrossAttentions 20 | 21 | 22 | # TODO: turn this into a Pydantic model instead 23 | @dataclass 24 | class ESM2Output(EmbeddingPrediction): 25 | """Output from ESM2 prediction including all model outputs.""" 26 | 27 | embeddings: np.ndarray # (batch_size, seq_len, embedding_dim) 28 | metadata: PredictionMetadata 29 | chain_index: np.ndarray # (batch_size, seq_len) 30 | residue_index: np.ndarray # (batch_size, seq_len) 31 | hidden_states: Optional[np.ndarray] = None # (batch_size, hidden_state_iter, seq_len, embedding_dim) 32 | 33 | 34 | with esm_image.imports(): 35 | import torch 36 | from transformers import EsmModel, AutoTokenizer 37 | 38 | 39 | @app.cls( 40 | image=esm_image, 41 | gpu="T4", 42 | timeout=20 * MINUTES, 43 | container_idle_timeout=10 * MINUTES, 44 | volumes={MODEL_DIR: model_weights}, 45 | ) 46 | class ESM2(EmbeddingAlgorithm): 47 | """ESM2 protein language model.""" 48 | 49 | DEFAULT_CONFIG = { 50 | "model_name": "esm2_t33_650M_UR50D", 51 | "output_hidden_states": True, 52 | # Chain linking and positioning config 53 | "glycine_linker": "", 54 | "position_ids_skip": 512, 55 | } 56 | 57 | def __init__(self, config: dict = {}) -> None: 58 | super().__init__(config) 59 | self.metadata = self._initialize_metadata( 60 | model_name="ESM-2", 61 | model_version="v4.49.0", # HuggingFace transformers version 62 | ) 63 | self.model_dir: Optional[str] = os.environ.get("MODEL_DIR", MODEL_DIR) 64 | self.tokenizer: Optional[AutoTokenizer] = None 65 | self.model: Optional[EsmModel] = None 66 
| self.assert_valid_model(config) 67 | 68 | @staticmethod 69 | def assert_valid_model(config: dict) -> None: 70 | """ 71 | Validate that the model name is supported. 72 | 73 | Available ESM-2 models: 74 | - esm2_t48_15B_UR50D: 48 layers, 5120 hidden size, 40 attention heads 75 | - esm2_t36_3B_UR50D: 36 layers, 2560 hidden size, 40 attention heads 76 | - esm2_t33_650M_UR50D: 33 layers, 1280 hidden size, 20 attention heads 77 | - esm2_t30_150M_UR50D: 30 layers, 640 hidden size, 12 attention heads 78 | - esm2_t12_35M_UR50D: 12 layers, 480 hidden size, 20 attention heads 79 | - esm2_t6_8M_UR50D: 6 layers, 320 hidden size, 20 attention heads 80 | """ 81 | models_name = [ 82 | "esm2_t48_15B_UR50D", 83 | "esm2_t36_3B_UR50D", 84 | "esm2_t33_650M_UR50D", 85 | "esm2_t30_150M_UR50D", 86 | "esm2_t12_35M_UR50D", 87 | "esm2_t6_8M_UR50D", 88 | ] 89 | assert config["model_name"] in models_name, f"Model {config['model_name']} not supported" 90 | 91 | @modal.enter() 92 | def _initialize(self) -> None: 93 | self._load() 94 | 95 | def _load(self) -> None: 96 | if self.tokenizer is None: 97 | self.tokenizer = AutoTokenizer.from_pretrained( 98 | f"facebook/{self.config['model_name']}", cache_dir=self.model_dir 99 | ) 100 | if self.model is None: 101 | self.model = EsmModel.from_pretrained(f"facebook/{self.config['model_name']}", cache_dir=self.model_dir) 102 | self.device = "cuda" if torch.cuda.is_available() else "cpu" 103 | self.model = self.model.to(self.device) 104 | self.model.eval() 105 | self.ready = True 106 | 107 | @modal.method() 108 | def embed(self, sequences: Union[str, List[str]]) -> ESM2Output: 109 | if self.tokenizer is None or self.model is None: 110 | logger.warning("Model not loaded. Forcing the model to load... 
Next time call _load() first.") 111 | self._load() 112 | assert self.tokenizer is not None and self.model is not None, "Model not loaded" 113 | 114 | logger.debug(f'Embedding {len(sequences)} sequences using {self.config["model_name"]}') 115 | 116 | # Support for glycine linker and positional skip logic (multimer) 117 | if isinstance(sequences, str): 118 | sequences = [sequences] 119 | 120 | if any(":" in seq for seq in sequences): 121 | # Multimer logic 122 | glycine_linker = self.config["glycine_linker"] 123 | multimer_properties = self._store_multimer_properties(sequences, glycine_linker) 124 | tokenized = self.tokenizer( 125 | replace_glycine_linkers(sequences, glycine_linker), 126 | return_tensors="pt", 127 | padding=True, 128 | truncation=True, 129 | ) 130 | # Add position_ids and attention_mask 131 | tokenized["position_ids"] = compute_position_ids( 132 | sequences, glycine_linker, self.config["position_ids_skip"] 133 | ) 134 | tokenized["attention_mask"] = (multimer_properties["linker_map"] == 1).to(torch.int32) 135 | else: 136 | # Monomer logic 137 | tokenized = self.tokenizer( 138 | sequences, 139 | return_tensors="pt", 140 | padding=True, 141 | truncation=True, 142 | ) 143 | multimer_properties = None 144 | tokenized = tokenized.to(self.device) 145 | tokenized["output_hidden_states"] = self.config["output_hidden_states"] 146 | 147 | with Timer("Model Inference") as timer: 148 | with torch.inference_mode(): 149 | outputs = self.model(**tokenized) 150 | 151 | outputs = self._convert_outputs(outputs, multimer_properties, timer.duration) 152 | 153 | return outputs 154 | 155 | @staticmethod 156 | def _store_multimer_properties(sequences: List[str], glycine_linker: str) -> dict[str, torch.Tensor]: 157 | linker_map, residue_index, chain_index = store_multimer_properties(sequences, glycine_linker) 158 | # Add and as effective padding 159 | batch_size = linker_map.shape[0] 160 | linker_map = torch.cat([-torch.ones(batch_size, 1), linker_map, 
-torch.ones(batch_size, 1)], dim=1) 161 | residue_index = torch.cat([-torch.ones(batch_size, 1), residue_index, -torch.ones(batch_size, 1)], dim=1) 162 | chain_index = torch.cat([-torch.ones(batch_size, 1), chain_index, -torch.ones(batch_size, 1)], dim=1) 163 | return {"linker_map": linker_map, "residue_index": residue_index, "chain_index": chain_index} 164 | 165 | def _convert_outputs( 166 | self, 167 | outputs: "BaseModelOutputWithPoolingAndCrossAttentions", 168 | multimer_properties: dict[str, torch.Tensor] | None, 169 | prediction_time: float, 170 | ) -> ESM2Output: 171 | """Convert model outputs to ESM2Output format.""" 172 | 173 | embeddings = outputs.last_hidden_state.cpu().numpy() 174 | 175 | if self.config["output_hidden_states"]: 176 | assert torch.all( 177 | outputs.hidden_states[-1] == outputs.last_hidden_state 178 | ), "Last hidden state should be the same as the output of the model" 179 | hidden_states = np.stack([h.cpu().numpy() for h in outputs.hidden_states], axis=1) 180 | else: 181 | hidden_states = None 182 | 183 | if multimer_properties is not None: 184 | # TODO: maybe add a proper MULTIMER flag? 
185 | result = self._mask_linker_region(embeddings, hidden_states, **multimer_properties) 186 | embeddings, hidden_states, chain_index_output, residue_index_output = result 187 | else: # only MONOMERs 188 | chain_index_output = np.zeros((embeddings.shape[0], embeddings.shape[1]), dtype=np.int32) 189 | residue_index_output = None # HACK: for now, but given it's only monomers, it is clear what the res ids are 190 | if hidden_states is not None: 191 | hidden_states = hidden_states[:, :, 1:-1, :] # remove the first and last token 192 | embeddings = embeddings[:, 1:-1, :] # remove the first and last token 193 | 194 | self.metadata.prediction_time = prediction_time 195 | 196 | return ESM2Output( 197 | metadata=self.metadata, 198 | embeddings=embeddings, 199 | hidden_states=hidden_states, 200 | chain_index=chain_index_output, 201 | residue_index=residue_index_output, 202 | ) 203 | 204 | def _mask_linker_region( 205 | self, 206 | embeddings: np.ndarray, 207 | hidden_states: np.ndarray, 208 | linker_map: torch.Tensor, 209 | residue_index: torch.Tensor, 210 | chain_index: torch.Tensor, 211 | ) -> tuple[np.ndarray, np.ndarray, torch.Tensor, torch.Tensor]: 212 | """ 213 | Mask the linker region in the outputs and track padding information. 
Args:
            embeddings: Per-residue embeddings, shape (batch, seq_len, hidden_dim)
            hidden_states: Per-layer hidden states, or None if not requested
            linker_map: Tensor marking residues to keep (1) vs. linker/padding (-1)
            residue_index: Per-residue index within its chain
            chain_index: Chain id for each residue
249 | 250 | Args: 251 | arrays: List of NumPy arrays to pad and stack 252 | residue_dim: Dimension to pad to match sizes 253 | batch_dim: Dimension to stack the arrays along 254 | constant_value: Value to use for padding (default: 0) 255 | 256 | Returns: 257 | Stacked and padded NumPy array 258 | """ 259 | max_size = max(arr.shape[residue_dim] for arr in arrays) 260 | padded_arrays = [] 261 | for arr in arrays: 262 | padding = [(0, 0)] * arr.ndim 263 | padding[residue_dim] = (0, max_size - arr.shape[residue_dim]) 264 | padded_arrays.append(np.pad(arr, padding, mode="constant", constant_values=constant_value)) 265 | return np.stack(padded_arrays, axis=batch_dim) 266 | 267 | # Stack embeddings along batch dimension (0) 268 | embeddings = pad_and_stack(embeddings_list, residue_dim=0, batch_dim=0) 269 | if hidden_states is not None: 270 | hidden_states = pad_and_stack(hidden_states_list, residue_dim=0, batch_dim=0) 271 | # Transpose to get correct dimension order (batch, layers, seq_len, hidden_dim) 272 | hidden_states = np.transpose(hidden_states, (0, 2, 1, 3)) 273 | chain_index = pad_and_stack(chain_index_list, residue_dim=0, batch_dim=0, constant_value=-1) 274 | residue_index = pad_and_stack(residue_index_list, residue_dim=0, batch_dim=0, constant_value=-1) 275 | 276 | return embeddings, hidden_states, chain_index, residue_index 277 | 278 | 279 | def get_esm2(gpu_type="T4", config: dict = {}): 280 | """ 281 | Note that the app will still show that's using T4, but the actual method / function call will use the correct GPU, 282 | and display accordingly in the Modal dashboard. 
283 | """ 284 | Model = ESM2.with_options(gpu=gpu_type) # type: ignore 285 | return Model(config=config) 286 | -------------------------------------------------------------------------------- /tests/data/esmfold_server_short.pdb: -------------------------------------------------------------------------------- 1 | HEADER 18-OCT-22 2 | TITLE ESMFOLD V1 PREDICTION FOR INPUT 3 | REMARK 1 4 | REMARK 1 REFERENCE 1 5 | REMARK 1 AUTH ZEMING LIN, HALIL AKIN, ROSHAN RAO, BRIAN HIE, ZHONGKAI ZHU, 6 | REMARK 1 AUTH 2 WENTING LU, NIKITA SMETANIN, ROBERT VERKUIL, ORI KABELI, 7 | REMARK 1 AUTH 3 YANIV SHMUELI, ALLAN DOS SANTOS COSTA, 8 | REMARK 1 AUTH 4 MARYAM FAZEL-ZARANDI, TOM SERCU, SALVATORE CANDIDO, 9 | REMARK 1 AUTH 5 ALEXANDER RIVES 10 | REMARK 1 TITL EVOLUTIONARY-SCALE PREDICTION OF ATOMIC LEVEL PROTEIN 11 | REMARK 1 TITL 2 STRUCTURE WITH A LANGUAGE MODEL 12 | REMARK 1 REF 13 | REMARK 1 REFN 14 | REMARK 1 PMID 15 | REMARK 1 DOI 10.1101/2022.07.20.500902 16 | REMARK 1 17 | REMARK 1 LICENSE AND DISCLAIMERS 18 | REMARK 1 ESM METAGENOMIC ATLAS DATA IS AVAILABLE UNDER 19 | REMARK 1 A CC-BY-4.0 LICENSE FOR ACADEMIC AND COMMERCIAL USE. 20 | REMARK 1 COPYRIGHT (C) META PLATFORMS, INC. ALL RIGHTS RESERVED. 21 | REMARK 1 USE OF THE ESM METAGENOMIC ATLAS DATA IS SUBJECT 22 | REMARK 1 TO THE META OPEN SOURCE TERMS OF USE AND PRIVACY POLICY. 
23 | ATOM 1 N MET A 1 -6.344 13.980 10.106 1.00 0.74 N 24 | ATOM 2 CA MET A 1 -5.240 13.029 10.013 1.00 0.76 C 25 | ATOM 3 C MET A 1 -5.717 11.610 10.305 1.00 0.76 C 26 | ATOM 4 CB MET A 1 -4.116 13.411 10.978 1.00 0.67 C 27 | ATOM 5 O MET A 1 -6.468 11.387 11.256 1.00 0.68 O 28 | ATOM 6 CG MET A 1 -2.725 13.295 10.377 1.00 0.61 C 29 | ATOM 7 SD MET A 1 -1.399 13.571 11.615 1.00 0.58 S 30 | ATOM 8 CE MET A 1 -0.218 14.517 10.614 1.00 0.57 C 31 | ATOM 9 N LEU A 2 -5.708 10.653 9.301 1.00 0.73 N 32 | ATOM 10 CA LEU A 2 -6.196 9.295 9.516 1.00 0.73 C 33 | ATOM 11 C LEU A 2 -5.285 8.536 10.476 1.00 0.73 C 34 | ATOM 12 CB LEU A 2 -6.295 8.543 8.186 1.00 0.69 C 35 | ATOM 13 O LEU A 2 -4.060 8.582 10.342 1.00 0.69 O 36 | ATOM 14 CG LEU A 2 -7.287 9.102 7.164 1.00 0.66 C 37 | ATOM 15 CD1 LEU A 2 -7.122 8.393 5.824 1.00 0.63 C 38 | ATOM 16 CD2 LEU A 2 -8.717 8.964 7.676 1.00 0.64 C 39 | ATOM 17 N LYS A 3 -5.543 8.350 11.697 1.00 0.77 N 40 | ATOM 18 CA LYS A 3 -4.857 7.533 12.694 1.00 0.78 C 41 | ATOM 19 C LYS A 3 -5.440 6.124 12.746 1.00 0.77 C 42 | ATOM 20 CB LYS A 3 -4.941 8.187 14.074 1.00 0.72 C 43 | ATOM 21 O LYS A 3 -6.625 5.927 12.467 1.00 0.72 O 44 | ATOM 22 CG LYS A 3 -3.976 9.347 14.269 1.00 0.66 C 45 | ATOM 23 CD LYS A 3 -3.993 9.853 15.706 1.00 0.64 C 46 | ATOM 24 CE LYS A 3 -3.045 11.029 15.898 1.00 0.55 C 47 | ATOM 25 NZ LYS A 3 -3.034 11.507 17.313 1.00 0.47 N 48 | ATOM 26 N ASN A 4 -4.550 5.162 12.738 1.00 0.78 N 49 | ATOM 27 CA ASN A 4 -4.841 3.750 12.960 1.00 0.79 C 50 | ATOM 28 C ASN A 4 -5.689 3.167 11.833 1.00 0.79 C 51 | ATOM 29 CB ASN A 4 -5.538 3.550 14.307 1.00 0.75 C 52 | ATOM 30 O ASN A 4 -6.680 2.481 12.088 1.00 0.75 O 53 | ATOM 31 CG ASN A 4 -4.602 3.743 15.484 1.00 0.69 C 54 | ATOM 32 ND2 ASN A 4 -5.131 4.267 16.583 1.00 0.69 N 55 | ATOM 33 OD1 ASN A 4 -3.413 3.425 15.405 1.00 0.68 O 56 | ATOM 34 N VAL A 5 -5.408 3.591 10.585 1.00 0.80 N 57 | ATOM 35 CA VAL A 5 -6.150 3.103 9.428 1.00 0.79 C 58 | ATOM 36 C VAL A 5 -5.511 1.818 8.907 1.00 0.79 C 
59 | ATOM 37 CB VAL A 5 -6.211 4.162 8.304 1.00 0.76 C 60 | ATOM 38 O VAL A 5 -4.285 1.682 8.911 1.00 0.74 O 61 | ATOM 39 CG1 VAL A 5 -7.342 3.848 7.326 1.00 0.65 C 62 | ATOM 40 CG2 VAL A 5 -6.385 5.559 8.898 1.00 0.64 C 63 | ATOM 41 N HIS A 6 -6.163 0.782 8.894 1.00 0.80 N 64 | ATOM 42 CA HIS A 6 -5.767 -0.453 8.228 1.00 0.80 C 65 | ATOM 43 C HIS A 6 -6.071 -0.396 6.735 1.00 0.80 C 66 | ATOM 44 CB HIS A 6 -6.472 -1.654 8.862 1.00 0.77 C 67 | ATOM 45 O HIS A 6 -7.210 -0.136 6.339 1.00 0.76 O 68 | ATOM 46 CG HIS A 6 -6.046 -1.925 10.270 1.00 0.73 C 69 | ATOM 47 CD2 HIS A 6 -6.538 -1.478 11.449 1.00 0.70 C 70 | ATOM 48 ND1 HIS A 6 -4.985 -2.748 10.582 1.00 0.70 N 71 | ATOM 49 CE1 HIS A 6 -4.844 -2.796 11.896 1.00 0.67 C 72 | ATOM 50 NE2 HIS A 6 -5.774 -2.033 12.446 1.00 0.66 N 73 | ATOM 51 N VAL A 7 -4.983 -0.405 5.963 1.00 0.77 N 74 | ATOM 52 CA VAL A 7 -5.141 -0.306 4.515 1.00 0.76 C 75 | ATOM 53 C VAL A 7 -5.020 -1.692 3.886 1.00 0.77 C 76 | ATOM 54 CB VAL A 7 -4.100 0.654 3.895 1.00 0.74 C 77 | ATOM 55 O VAL A 7 -4.086 -2.438 4.188 1.00 0.75 O 78 | ATOM 56 CG1 VAL A 7 -4.248 0.702 2.376 1.00 0.70 C 79 | ATOM 57 CG2 VAL A 7 -4.241 2.052 4.494 1.00 0.70 C 80 | ATOM 58 N LEU A 8 -6.072 -2.280 3.332 1.00 0.78 N 81 | ATOM 59 CA LEU A 8 -6.003 -3.528 2.580 1.00 0.77 C 82 | ATOM 60 C LEU A 8 -5.699 -3.261 1.110 1.00 0.78 C 83 | ATOM 61 CB LEU A 8 -7.316 -4.304 2.709 1.00 0.75 C 84 | ATOM 62 O LEU A 8 -6.402 -2.486 0.457 1.00 0.75 O 85 | ATOM 63 CG LEU A 8 -7.391 -5.638 1.963 1.00 0.72 C 86 | ATOM 64 CD1 LEU A 8 -6.436 -6.649 2.589 1.00 0.68 C 87 | ATOM 65 CD2 LEU A 8 -8.819 -6.172 1.964 1.00 0.69 C 88 | ATOM 66 N VAL A 9 -4.616 -3.580 0.630 1.00 0.74 N 89 | ATOM 67 CA VAL A 9 -4.288 -3.473 -0.788 1.00 0.74 C 90 | ATOM 68 C VAL A 9 -4.711 -4.748 -1.514 1.00 0.75 C 91 | ATOM 69 CB VAL A 9 -2.780 -3.212 -1.004 1.00 0.72 C 92 | ATOM 70 O VAL A 9 -4.239 -5.840 -1.190 1.00 0.73 O 93 | ATOM 71 CG1 VAL A 9 -2.466 -3.060 -2.491 1.00 0.69 C 94 | ATOM 72 CG2 VAL A 9 -2.338 -1.971 
-0.231 1.00 0.69 C 95 | ATOM 73 N LEU A 10 -5.865 -4.754 -2.225 1.00 0.75 N 96 | ATOM 74 CA LEU A 10 -6.380 -5.808 -3.092 1.00 0.73 C 97 | ATOM 75 C LEU A 10 -5.663 -5.803 -4.438 1.00 0.73 C 98 | ATOM 76 CB LEU A 10 -7.887 -5.639 -3.304 1.00 0.70 C 99 | ATOM 77 O LEU A 10 -5.697 -4.805 -5.161 1.00 0.68 O 100 | ATOM 78 CG LEU A 10 -8.773 -5.860 -2.077 1.00 0.67 C 101 | ATOM 79 CD1 LEU A 10 -10.181 -5.336 -2.338 1.00 0.64 C 102 | ATOM 80 CD2 LEU A 10 -8.808 -7.338 -1.701 1.00 0.65 C 103 | ATOM 81 N GLY A 11 -4.786 -6.738 -4.567 1.00 0.71 N 104 | ATOM 82 CA GLY A 11 -4.025 -6.970 -5.784 1.00 0.69 C 105 | ATOM 83 C GLY A 11 -2.639 -6.354 -5.749 1.00 0.69 C 106 | ATOM 84 O GLY A 11 -2.493 -5.155 -5.502 1.00 0.64 O 107 | ATOM 85 N ALA A 12 -1.688 -7.170 -5.236 1.00 0.64 N 108 | ATOM 86 CA ALA A 12 -0.257 -6.901 -5.129 1.00 0.63 C 109 | ATOM 87 C ALA A 12 0.432 -7.053 -6.482 1.00 0.65 C 110 | ATOM 88 CB ALA A 12 0.385 -7.830 -4.101 1.00 0.59 C 111 | ATOM 89 O ALA A 12 1.382 -7.828 -6.619 1.00 0.63 O 112 | ATOM 90 N GLY A 13 -0.229 -6.758 -7.545 1.00 0.72 N 113 | ATOM 91 CA GLY A 13 0.451 -6.664 -8.827 1.00 0.71 C 114 | ATOM 92 C GLY A 13 1.361 -5.454 -8.933 1.00 0.70 C 115 | ATOM 93 O GLY A 13 1.861 -4.957 -7.923 1.00 0.64 O 116 | ATOM 94 N ASP A 14 1.883 -5.236 -10.085 1.00 0.67 N 117 | ATOM 95 CA ASP A 14 2.905 -4.213 -10.286 1.00 0.66 C 118 | ATOM 96 C ASP A 14 2.562 -2.937 -9.520 1.00 0.67 C 119 | ATOM 97 CB ASP A 14 3.070 -3.904 -11.775 1.00 0.61 C 120 | ATOM 98 O ASP A 14 3.445 -2.293 -8.949 1.00 0.65 O 121 | ATOM 99 CG ASP A 14 3.806 -4.998 -12.529 1.00 0.58 C 122 | ATOM 100 OD1 ASP A 14 4.578 -5.756 -11.904 1.00 0.58 O 123 | ATOM 101 OD2 ASP A 14 3.614 -5.100 -13.761 1.00 0.60 O 124 | ATOM 102 N VAL A 15 1.399 -2.629 -9.519 1.00 0.72 N 125 | ATOM 103 CA VAL A 15 0.984 -1.369 -8.913 1.00 0.71 C 126 | ATOM 104 C VAL A 15 0.792 -1.556 -7.410 1.00 0.71 C 127 | ATOM 105 CB VAL A 15 -0.316 -0.832 -9.553 1.00 0.68 C 128 | ATOM 106 O VAL A 15 1.247 -0.732 -6.612 1.00 0.68 
O 129 | ATOM 107 CG1 VAL A 15 -0.775 0.446 -8.853 1.00 0.62 C 130 | ATOM 108 CG2 VAL A 15 -0.112 -0.583 -11.046 1.00 0.63 C 131 | ATOM 109 N GLY A 16 0.138 -2.722 -7.046 1.00 0.73 N 132 | ATOM 110 CA GLY A 16 -0.096 -2.967 -5.632 1.00 0.73 C 133 | ATOM 111 C GLY A 16 1.182 -3.031 -4.818 1.00 0.73 C 134 | ATOM 112 O GLY A 16 1.229 -2.545 -3.686 1.00 0.72 O 135 | ATOM 113 N SER A 17 2.239 -3.570 -5.344 1.00 0.73 N 136 | ATOM 114 CA SER A 17 3.507 -3.731 -4.640 1.00 0.73 C 137 | ATOM 115 C SER A 17 4.164 -2.382 -4.368 1.00 0.73 C 138 | ATOM 116 CB SER A 17 4.459 -4.617 -5.446 1.00 0.69 C 139 | ATOM 117 O SER A 17 4.824 -2.202 -3.343 1.00 0.72 O 140 | ATOM 118 OG SER A 17 4.699 -4.062 -6.727 1.00 0.62 O 141 | ATOM 119 N VAL A 18 3.984 -1.511 -5.393 1.00 0.73 N 142 | ATOM 120 CA VAL A 18 4.565 -0.192 -5.166 1.00 0.73 C 143 | ATOM 121 C VAL A 18 3.882 0.475 -3.974 1.00 0.73 C 144 | ATOM 122 CB VAL A 18 4.447 0.704 -6.419 1.00 0.69 C 145 | ATOM 123 O VAL A 18 4.547 1.069 -3.122 1.00 0.72 O 146 | ATOM 124 CG1 VAL A 18 4.897 2.130 -6.107 1.00 0.59 C 147 | ATOM 125 CG2 VAL A 18 5.266 0.122 -7.570 1.00 0.59 C 148 | ATOM 126 N VAL A 19 2.569 0.345 -3.865 1.00 0.75 N 149 | ATOM 127 CA VAL A 19 1.787 1.016 -2.832 1.00 0.74 C 150 | ATOM 128 C VAL A 19 2.144 0.445 -1.461 1.00 0.75 C 151 | ATOM 129 CB VAL A 19 0.270 0.876 -3.088 1.00 0.71 C 152 | ATOM 130 O VAL A 19 2.334 1.194 -0.500 1.00 0.73 O 153 | ATOM 131 CG1 VAL A 19 -0.530 1.478 -1.934 1.00 0.64 C 154 | ATOM 132 CG2 VAL A 19 -0.111 1.541 -4.410 1.00 0.64 C 155 | ATOM 133 N VAL A 20 2.344 -0.856 -1.357 1.00 0.74 N 156 | ATOM 134 CA VAL A 20 2.742 -1.493 -0.106 1.00 0.74 C 157 | ATOM 135 C VAL A 20 4.111 -0.972 0.327 1.00 0.75 C 158 | ATOM 136 CB VAL A 20 2.774 -3.032 -0.237 1.00 0.71 C 159 | ATOM 137 O VAL A 20 4.320 -0.658 1.501 1.00 0.75 O 160 | ATOM 138 CG1 VAL A 20 3.370 -3.669 1.017 1.00 0.63 C 161 | ATOM 139 CG2 VAL A 20 1.370 -3.574 -0.500 1.00 0.64 C 162 | ATOM 140 N ARG A 21 5.028 -0.887 -0.623 1.00 0.75 N 163 | 
ATOM 141 CA ARG A 21 6.379 -0.435 -0.308 1.00 0.76 C 164 | ATOM 142 C ARG A 21 6.366 0.988 0.240 1.00 0.76 C 165 | ATOM 143 CB ARG A 21 7.274 -0.511 -1.547 1.00 0.72 C 166 | ATOM 144 O ARG A 21 7.128 1.314 1.153 1.00 0.76 O 167 | ATOM 145 CG ARG A 21 7.873 -1.887 -1.791 1.00 0.67 C 168 | ATOM 146 CD ARG A 21 8.870 -1.872 -2.941 1.00 0.64 C 169 | ATOM 147 NE ARG A 21 9.399 -3.206 -3.213 1.00 0.57 N 170 | ATOM 148 NH1 ARG A 21 11.174 -2.467 -4.493 1.00 0.46 N 171 | ATOM 149 NH2 ARG A 21 10.874 -4.710 -4.127 1.00 0.42 N 172 | ATOM 150 CZ ARG A 21 10.481 -3.458 -3.944 1.00 0.58 C 173 | ATOM 151 N LEU A 22 5.481 1.861 -0.360 1.00 0.76 N 174 | ATOM 152 CA LEU A 22 5.398 3.260 0.046 1.00 0.76 C 175 | ATOM 153 C LEU A 22 4.780 3.387 1.435 1.00 0.76 C 176 | ATOM 154 CB LEU A 22 4.578 4.065 -0.966 1.00 0.73 C 177 | ATOM 155 O LEU A 22 5.161 4.266 2.211 1.00 0.74 O 178 | ATOM 156 CG LEU A 22 5.253 4.354 -2.308 1.00 0.68 C 179 | ATOM 157 CD1 LEU A 22 4.241 4.920 -3.298 1.00 0.63 C 180 | ATOM 158 CD2 LEU A 22 6.423 5.314 -2.123 1.00 0.64 C 181 | ATOM 159 N LEU A 23 3.901 2.462 1.770 1.00 0.78 N 182 | ATOM 160 CA LEU A 23 3.204 2.524 3.050 1.00 0.77 C 183 | ATOM 161 C LEU A 23 4.095 2.018 4.179 1.00 0.77 C 184 | ATOM 162 CB LEU A 23 1.911 1.705 2.997 1.00 0.75 C 185 | ATOM 163 O LEU A 23 3.958 2.451 5.325 1.00 0.75 O 186 | ATOM 164 CG LEU A 23 0.790 2.264 2.119 1.00 0.70 C 187 | ATOM 165 CD1 LEU A 23 -0.358 1.265 2.027 1.00 0.66 C 188 | ATOM 166 CD2 LEU A 23 0.300 3.601 2.663 1.00 0.67 C 189 | ATOM 167 N GLU A 24 4.981 1.016 3.857 1.00 0.74 N 190 | ATOM 168 CA GLU A 24 5.952 0.504 4.819 1.00 0.74 C 191 | ATOM 169 C GLU A 24 6.979 1.571 5.187 1.00 0.74 C 192 | ATOM 170 CB GLU A 24 6.657 -0.736 4.264 1.00 0.71 C 193 | ATOM 171 O GLU A 24 7.549 1.541 6.280 1.00 0.74 O 194 | ATOM 172 CG GLU A 24 5.816 -2.003 4.332 1.00 0.65 C 195 | ATOM 173 CD GLU A 24 6.574 -3.249 3.903 1.00 0.63 C 196 | ATOM 174 OE1 GLU A 24 7.478 -3.144 3.043 1.00 0.63 O 197 | ATOM 175 OE2 GLU A 24 6.261 -4.340 
@pytest.fixture
def esmfold_model(config: dict | None = None) -> Generator[ESMFold, None, None]:
    """Yield an ESMFold model wrapper inside an ephemeral Modal app.

    Note: pytest ignores fixture parameters that carry a default value, so
    ``config`` is never injected — it simply provides the model config.
    BUG FIX: the previous default was a mutable ``{}`` shared across calls;
    use ``None`` as the sentinel instead.
    """
    with enable_output(), app.run():
        yield ESMFold(config=config if config is not None else {})


def test_esmfold_basic(test_sequences: dict[str, str], esmfold_model: ESMFold, run_backend):
    """Test basic ESMFold functionality."""
    result = run_backend(esmfold_model.fold)(test_sequences["short"])

    assert isinstance(result, ESMFoldOutput), "Result should be an ESMFoldOutput"

    seq_len = len(test_sequences["short"])
    positions_shape = result.positions.shape

    # BUG FIX: both messages below were plain strings missing the f-prefix,
    # so "{positions_shape[-1]}" etc. were printed literally on failure.
    assert positions_shape[-1] == 3, f"Coordinate dimension mismatch. Expected: 3, Got: {positions_shape[-1]}"
    assert (
        positions_shape[-3] == seq_len
    ), f"Number of residues mismatch. Expected: {seq_len}, Got: {positions_shape[-3]}"
    assert np.all(result.plddt >= 0), "pLDDT scores should be non-negative"
    assert np.all(result.plddt <= 100), "pLDDT scores should be less than or equal to 100"
def test_esmfold_multimer(test_sequences, run_backend):
    """Test ESMFold multimer functionality."""
    with enable_output(), app.run():  # TODO: make this better with a fixture, re-using the logic
        model = ESMFold(config={"output_pdb": True})
        result = run_backend(model.fold)(test_sequences["multimer"])

    # Total residue count once the ":" chain separators are stripped out.
    total_len = len(test_sequences["multimer"].replace(":", ""))

    assert result.pdb is not None, "PDB output should be generated"
    assert result.positions.shape[2] == total_len, "Number of residues mismatch"
    assert np.array_equal(result.residue_index[0][:54], np.arange(0, 54)), "First chain residue index mismatch"
    assert np.array_equal(result.residue_index[0][54:], np.arange(0, 54)), "Second chain residue index mismatch"
    assert np.all(result.chain_index[0][:54] == 0), "First chain index mismatch"
    assert np.all(result.chain_index[0][54:] == 1), "Second chain index mismatch"

    structure = pdb_string_to_atomarray(result.pdb[0])

    # Unique (chain, residue) pairs give the residue count of the structure.
    residue_keys = {(chain, res) for chain, res in zip(structure.chain_id, structure.res_id, strict=True)}
    n_residues = len(residue_keys)

    assert n_residues == total_len, "Number of residues mismatch"
    assert len(result.chain_index[0]) == n_residues, "Chain index length mismatch"
    assert len(result.residue_index[0]) == n_residues, "Residue index length mismatch"

    # Check chain assignments
    unique_chains = np.unique(structure.chain_id)
    assert len(unique_chains) == 2, f"Expected 2 chains, got {len(unique_chains)}"

    # Check residues per chain
    chain_a_residues = len(np.unique(structure.res_id[structure.chain_id == "A"]))
    chain_b_residues = len(np.unique(structure.res_id[structure.chain_id == "B"]))
    assert chain_a_residues == 54, f"Chain A should have 54 residues, got {chain_a_residues}"
    assert chain_b_residues == 54, f"Chain B should have 54 residues, got {chain_b_residues}"

    # Assert correct folding outputs metrics (need to do it as we slice the linker out).
    # Table-driven: (attribute, expected shape, failure message).
    shape_checks = [
        ("predicted_aligned_error", (1, n_residues, n_residues), "PAE matrix shape mismatch"),
        ("plddt", (1, n_residues, 37), "pLDDT matrix shape mismatch"),
        ("ptm_logits", (1, n_residues, n_residues, 64), "pTM matrix shape mismatch"),
        ("aligned_confidence_probs", (1, n_residues, n_residues, 64), "aligned confidence shape mismatch"),
        ("s_z", (1, n_residues, n_residues, 128), "s_z matrix shape mismatch"),
        ("s_s", (1, n_residues, 1024), "s_s matrix shape mismatch"),
        ("distogram_logits", (1, n_residues, n_residues, 64), "distogram logits matrix shape mismatch"),
        ("lm_logits", (1, n_residues, 23), "lm logits matrix shape mismatch"),
        ("lddt_head", (8, 1, n_residues, 37, 50), "lddt head matrix shape mismatch"),
        ("plddt", (1, n_residues, 37), "pLDDT matrix shape mismatch"),
    ]
    for attr, expected_shape, message in shape_checks:
        assert getattr(result, attr).shape == expected_shape, message
== "A"])) 68 | chain_b_residues = len(np.unique(structure.res_id[structure.chain_id == "B"])) 69 | assert chain_a_residues == 54, f"Chain A should have 54 residues, got {chain_a_residues}" 70 | assert chain_b_residues == 54, f"Chain B should have 54 residues, got {chain_b_residues}" 71 | 72 | # Assert correct folding outputs metrics (need to do it as we slice the linker out) 73 | assert result.predicted_aligned_error.shape == (1, n_residues, n_residues), "PAE matrix shape mismatch" 74 | assert result.plddt.shape == (1, n_residues, 37), "pLDDT matrix shape mismatch" 75 | assert result.ptm_logits.shape == (1, n_residues, n_residues, 64), "pTM matrix shape mismatch" 76 | assert result.aligned_confidence_probs.shape == (1, n_residues, n_residues, 64), "aligned confidence shape mismatch" 77 | assert result.s_z.shape == (1, n_residues, n_residues, 128), "s_z matrix shape mismatch" 78 | assert result.s_s.shape == (1, n_residues, 1024), "s_s matrix shape mismatch" 79 | assert result.distogram_logits.shape == (1, n_residues, n_residues, 64), "distogram logits matrix shape mismatch" 80 | assert result.lm_logits.shape == (1, n_residues, 23), "lm logits matrix shape mismatch" 81 | assert result.lddt_head.shape == (8, 1, n_residues, 37, 50), "lddt head matrix shape mismatch" 82 | assert result.plddt.shape == (1, n_residues, 37), "pLDDT matrix shape mismatch" 83 | 84 | 85 | def test_esmfold_linker_map(): 86 | """ 87 | Test ESMFold linker map. 88 | The linker map has 1 for residues to keep (i.e. those not part of the linker), 89 | and 0 for residues to remove (i.e. those part of the linker). 
90 | """ 91 | sequences = ["AAAAAA:BBBBBBBBB", "CCCCC:DDDDDDD:EEEEEEE", "HHHH"] 92 | GLYCINE_LINKER = "G" * 50 93 | N = len(GLYCINE_LINKER) 94 | linker_map, _, _ = store_multimer_properties([sequences[0]], GLYCINE_LINKER) 95 | gt_map = torch.tensor([1] * 6 + [0] * N + [1] * 9) 96 | assert torch.all(linker_map == gt_map), "Linker map mismatch" 97 | 98 | linker_map, _, _ = store_multimer_properties([sequences[1]], GLYCINE_LINKER) 99 | gt_map = torch.tensor([1] * 5 + [0] * N + [1] * 7 + [0] * N + [1] * 7) 100 | assert torch.all(linker_map == gt_map), "Linker map mismatch" 101 | 102 | linker_map, _, _ = store_multimer_properties([sequences[2]], GLYCINE_LINKER) 103 | gt_map = torch.tensor([1] * 4) 104 | assert torch.all(linker_map == gt_map), "Linker map mismatch" 105 | 106 | 107 | def test_esmfold_no_glycine_linker(test_sequences, run_backend): 108 | """Test ESMFold no glycine linker.""" 109 | model = ESMFold( 110 | config={ 111 | "glycine_linker": "", 112 | } 113 | ) 114 | 115 | with enable_output(), app.run(): 116 | result = run_backend(model.fold)(test_sequences["multimer"]) 117 | 118 | assert result.positions is not None, "Positions should be generated" 119 | assert result.positions.shape[2] == len(test_sequences["multimer"].replace(":", "")), "Number of residues mismatch" 120 | 121 | assert result.residue_index is not None, "Residue index should be generated" 122 | assert result.plddt is not None, "pLDDT should be generated" 123 | assert result.ptm is not None, "pTM should be generated" 124 | 125 | # assert correct chain_indices 126 | assert np.all(result.chain_index[0] == np.array([0] * 54 + [1] * 54)), "Chain indices mismatch" 127 | assert np.all( 128 | result.residue_index[0] == np.concatenate([np.arange(0, 54), np.arange(0, 54)]) 129 | ), "Residue index mismatch" 130 | 131 | 132 | def test_esmfold_chain_indices(): 133 | """ 134 | Test ESMFold chain indices. 
def test_esmfold_batch(esmfold_model: ESMFold, test_sequences: dict[str, str], run_backend):
    """Test ESMFold batch prediction."""

    # Define input sequences
    sequences = [test_sequences["short"], test_sequences["medium"]]

    # Make prediction
    result = run_backend(esmfold_model.fold)(sequences)

    batch_size = len(sequences)
    max_seq_length = max(map(len, sequences))
    expected_positions = (8, batch_size, max_seq_length, 14, 3)
    assert result.positions.shape == expected_positions, (
        f"Position shape mismatch. "
        f"Expected: (8, {batch_size}, {max_seq_length}, 14, 3), Got: {result.positions.shape}"
    )

    # Check that batch outputs have correct sequence lengths
    for attr in ("aatype", "plddt", "ptm_logits", "predicted_aligned_error"):
        assert getattr(result, attr).shape[0] == batch_size, f"Batch size mismatch in {attr}"
Expected: (8, {len(sequences)}, {max_seq_length}, 14, 3), Got: {result.positions.shape}" 167 | 168 | # Check that batch outputs have correct sequence lengths 169 | assert result.aatype.shape[0] == len(sequences), "Batch size mismatch in aatype" 170 | assert result.plddt.shape[0] == len(sequences), "Batch size mismatch in plddt" 171 | assert result.ptm_logits.shape[0] == len(sequences), "Batch size mismatch in ptm_logits" 172 | assert result.predicted_aligned_error.shape[0] == len(sequences), "Batch size mismatch in predicted_aligned_error" 173 | 174 | 175 | # TODO: This is not obvious to do, given the way we wrap things around in Modal 176 | # This shows well how fragile relying on Modal is going to be moving forward, and we should think 177 | # of ways to make it more managable through local execution as well 178 | 179 | # def test_tokenize_sequences_with_mocker(mocker): 180 | # """Test tokenization of multimer sequences using pytest-mock.""" 181 | # from boileroom.esmfold import ESMFold 182 | 183 | # # Test data 184 | # sequences = ["AAAAAA:CCCCCCCCC", "CCCCC:DDDDDDD:EEEEEEE", "HHHH"] 185 | # GLYCINE_LINKER = "" 186 | # POSITION_IDS_SKIP = 512 187 | 188 | # # Create a model instance 189 | # model = ESMFold(config={"glycine_linker": GLYCINE_LINKER, "position_ids_skip": POSITION_IDS_SKIP}) 190 | 191 | # # Mock the tokenizer 192 | # mock_tokenizer = mocker.patch.object(model, 'tokenizer') 193 | # mock_tokenizer.return_value = { 194 | # "input_ids": torch.tensor([ 195 | # [1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, -1, -1, -1, -1], 196 | # [3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5], 197 | # [8, 8, 8, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1] 198 | # ]), 199 | # "attention_mask": torch.ones(3, 19), 200 | # } 201 | 202 | # # Call the method to test 203 | # tokenized_input = model._tokenize_sequences(sequences) 204 | 205 | # # Assert the tokenizer was called with the expected arguments 206 | # expected_sequences = [seq.replace(":", 
def test_sequence_validation(esmfold_model: ESMFold, test_sequences: dict[str, str], run_backend):
    """Test sequence validation in FoldingAlgorithm."""

    # Test single sequence: a bare string is normalized to a one-element list.
    single_seq = test_sequences["short"]
    validated = esmfold_model._validate_sequences(single_seq)
    assert isinstance(validated, list), "Single sequence should be converted to list"
    assert len(validated) == 1, "Should contain one sequence"
    assert validated[0] == single_seq, "Sequence should be unchanged"

    # Test sequence list: already-listed input passes through untouched.
    seq_list = [test_sequences["short"], test_sequences["medium"]]
    validated = esmfold_model._validate_sequences(seq_list)
    assert isinstance(validated, list), "Should return a list"
    assert len(validated) == 2, "Should contain two sequences"
    assert validated == seq_list, "Sequences should be unchanged"

    # Test invalid sequence: validation raises with a descriptive message.
    with pytest.raises(ValueError) as exc_info:
        esmfold_model._validate_sequences(test_sequences["invalid"])
    assert "Invalid amino acid" in str(exc_info.value), f"Expected 'Invalid amino acid', got {str(exc_info.value)}"

    # Test that fold method uses validation (same error surfaces via fold()).
    with pytest.raises(ValueError) as exc_info:
        run_backend(esmfold_model.fold)(test_sequences["invalid"])
    assert "Invalid amino acid" in str(exc_info.value), f"Expected 'Invalid amino acid', got {str(exc_info.value)}"


def test_esmfold_output_pdb_cif(data_dir: pathlib.Path, test_sequences: dict[str, str], run_backend):
    """Test ESMFold output PDB and CIF."""

    def recover_sequence(atomarray: AtomArray) -> str:
        # Rebuild the one-letter sequence from per-residue three-letter codes.
        unique_res_ids = np.unique(atomarray.res_id)
        three_letter_codes = [atomarray.res_name[atomarray.res_id == res_id][0] for res_id in unique_res_ids]
        one_letter_codes = [restype_3to1[three_letter_code] for three_letter_code in three_letter_codes]
        return "".join(one_letter_codes)

    with enable_output(), app.run():
        model = ESMFold(config={"output_pdb": True, "output_cif": False, "output_atomarray": True})
        # Define input sequences
        sequences = [test_sequences["short"], test_sequences["medium"]]
        result = run_backend(model.fold)(sequences)

    assert result.pdb is not None, "PDB output should be generated"
    assert result.cif is None, "CIF output should be None"
    assert len(result.pdb) == len(result.atom_array) == len(sequences) == 2, "Batching output match!"
    assert isinstance(result.pdb, list), "PDB output should be a list"
    assert len(result.pdb) == len(sequences), "PDB output should have same length as input sequences"
    assert isinstance(result.atom_array, list), "Atom array should be a list"
    assert isinstance(result.atom_array[0], AtomArray), "Atom array should be an AtomArray"

    # Round-trip the returned PDB strings through biotite for comparison.
    short_pdb = PDBFile.read(StringIO(result.pdb[0])).get_structure(model=1)
    medium_pdb = PDBFile.read(StringIO(result.pdb[1])).get_structure(model=1)
    short_atomarray = result.atom_array[0]
    medium_atomarray = result.atom_array[1]

    # Short protein checks
    num_residues = len(sequences[0])
    assert np.all(
        np.unique(short_atomarray.res_id) == np.arange(0, num_residues)
    ), "AtomArray residues should be 0-indexed"
    recovered_seq = recover_sequence(short_atomarray)
    assert recovered_seq == sequences[0], "Recovered sequence should be equal to the input sequence"
    assert np.all(np.unique(short_pdb.res_id) == np.arange(0, num_residues)), "Residues should be 0-indexed"
    # Compare coordinates with tolerance
    assert np.allclose(
        short_pdb.coord, short_atomarray.coord, atol=0.1
    ), "Atom coordinates should be equal within 0.1Å tolerance"
    # Compare other attributes exactly
    assert np.array_equal(short_pdb.chain_id, short_atomarray.chain_id), "Chain IDs should match exactly"
    assert np.array_equal(short_pdb.res_id, short_atomarray.res_id), "Residue IDs should match exactly"
    assert np.array_equal(short_pdb.res_name, short_atomarray.res_name), "Residue names should match exactly"
    assert np.array_equal(short_pdb.atom_name, short_atomarray.atom_name), "Atom names should match exactly"

    # Medium protein checks
    num_residues = len(sequences[1])
    assert np.all(
        np.unique(medium_atomarray.res_id) == np.arange(0, num_residues)
    ), "AtomArray residues should be 0-indexed"
    recovered_seq = recover_sequence(medium_atomarray)
    assert recovered_seq == sequences[1], "Recovered sequence should be equal to the input sequence"
    assert np.all(np.unique(medium_pdb.res_id) == np.arange(0, num_residues)), "Residues should be 0-indexed"

    # Compare coordinates with tolerance
    assert np.allclose(
        medium_pdb.coord, medium_atomarray.coord, atol=0.1
    ), "Atom coordinates should be equal within 0.1Å tolerance"
    # Compare other attributes exactly
    assert np.array_equal(medium_pdb.chain_id, medium_atomarray.chain_id), "Chain IDs should match exactly"
    assert np.array_equal(medium_pdb.res_id, medium_atomarray.res_id), "Residue IDs should match exactly"
    assert np.array_equal(medium_pdb.res_name, medium_atomarray.res_name), "Residue names should match exactly"
    assert np.array_equal(medium_pdb.atom_name, medium_atomarray.atom_name), "Atom names should match exactly"

    # Regression check against reference structures from the ESMFold server.
    short_pdbfile = PDBFile().read(data_dir / "esmfold_server_short.pdb")
    saved_short_pdb = short_pdbfile.get_structure(model=1)
    saved_short_bfactor = short_pdbfile.get_b_factor()
    rmsd_value = rmsd(short_pdb, saved_short_pdb)
    assert (
        rmsd_value < 1.5
    ), "PDB file should be almost equal to the saved ESMFold Server PDB file. Difference comes from HF vs. Meta implementation differences."

    medium_pdbfile = PDBFile().read(data_dir / "esmfold_server_medium.pdb")
    saved_medium_pdb = medium_pdbfile.get_structure(model=1)
    saved_medium_bfactor = medium_pdbfile.get_b_factor()
    rmsd_value = rmsd(medium_pdb, saved_medium_pdb)
    assert (
        rmsd_value < 1.5
    ), "PDB file should be almost equal to the saved ESMFold Server PDB file. Difference comes from HF vs. Meta implementation differences."

    # compare b-factor
    short_bfactor = short_atomarray.get_annotation("b_factor")
    medium_bfactor = medium_atomarray.get_annotation("b_factor")
    assert np.allclose(
        short_bfactor, saved_short_bfactor, atol=0.05
    ), "B-factor should match within a tolerance (HF vs. Meta)"
    assert np.allclose(
        medium_bfactor, saved_medium_bfactor, atol=0.05
    ), "B-factor should match within a tolerance (HF vs. Meta)"
| saved_short_bfactor = short_pdbfile.get_b_factor() 315 | rmsd_value = rmsd(short_pdb, saved_short_pdb) 316 | assert ( 317 | rmsd_value < 1.5 318 | ), "PDB file should be almost equal to the saved ESMFold Server PDB file. Difference comes from HF vs. Meta implementation differences." 319 | 320 | medium_pdbfile = PDBFile().read(data_dir / "esmfold_server_medium.pdb") 321 | saved_medium_pdb = medium_pdbfile.get_structure(model=1) 322 | saved_medium_bfactor = medium_pdbfile.get_b_factor() 323 | rmsd_value = rmsd(medium_pdb, saved_medium_pdb) 324 | assert ( 325 | rmsd_value < 1.5 326 | ), "PDB file should be almost equal to the saved ESMFold Server PDB file. Difference comes from HF vs. Meta implementation differences." 327 | 328 | # compare b-factor 329 | short_bfactor = short_atomarray.get_annotation("b_factor") 330 | medium_bfactor = medium_atomarray.get_annotation("b_factor") 331 | assert np.allclose( 332 | short_bfactor, saved_short_bfactor, atol=0.05 333 | ), "B-factor should match within a tolerance (HF vs. Meta)" 334 | assert np.allclose( 335 | medium_bfactor, saved_medium_bfactor, atol=0.05 336 | ), "B-factor should match within a tolerance (HF vs. Meta)" 337 | -------------------------------------------------------------------------------- /boileroom/models/esm/esmfold.py: -------------------------------------------------------------------------------- 1 | """ESMFold implementation for protein structure prediction using Meta AI's ESM-2 model.""" 2 | 3 | import os 4 | import logging 5 | from dataclasses import dataclass 6 | from typing import Optional, List, Union 7 | 8 | import modal 9 | import numpy as np 10 | from biotite.structure import AtomArray 11 | 12 | from ... 
# ESMFold-Specific: A list of atoms (excluding hydrogen) for each AA type. PDB naming convention.
# This might be re-used elsewhere, if OpenFold/AlphaFold or newer models use the same ordering convention.
# NOTE(review): atom names appear alphabetically ordered within each residue, not in atom14 order —
# confirm downstream consumers do not depend on the ordering before re-using this table.
RESIDUE_ATOMS: dict[str, list[str]] = {
    "ALA": ["C", "CA", "CB", "N", "O"],
    "ARG": ["C", "CA", "CB", "CG", "CD", "CZ", "N", "NE", "O", "NH1", "NH2"],
    "ASP": ["C", "CA", "CB", "CG", "N", "O", "OD1", "OD2"],
    "ASN": ["C", "CA", "CB", "CG", "N", "ND2", "O", "OD1"],
    "CYS": ["C", "CA", "CB", "N", "O", "SG"],
    "GLU": ["C", "CA", "CB", "CG", "CD", "N", "O", "OE1", "OE2"],
    "GLN": ["C", "CA", "CB", "CG", "CD", "N", "NE2", "O", "OE1"],
    "GLY": ["C", "CA", "N", "O"],
    "HIS": ["C", "CA", "CB", "CG", "CD2", "CE1", "N", "ND1", "NE2", "O"],
    "ILE": ["C", "CA", "CB", "CG1", "CG2", "CD1", "N", "O"],
    "LEU": ["C", "CA", "CB", "CG", "CD1", "CD2", "N", "O"],
    "LYS": ["C", "CA", "CB", "CG", "CD", "CE", "N", "NZ", "O"],
    "MET": ["C", "CA", "CB", "CG", "CE", "N", "O", "SD"],
    "PHE": ["C", "CA", "CB", "CG", "CD1", "CD2", "CE1", "CE2", "CZ", "N", "O"],
    "PRO": ["C", "CA", "CB", "CG", "CD", "N", "O"],
    "SER": ["C", "CA", "CB", "N", "O", "OG"],
    "THR": ["C", "CA", "CB", "CG2", "N", "O", "OG1"],
    "TRP": ["C", "CA", "CB", "CG", "CD1", "CD2", "CE2", "CE3", "CZ2", "CZ3", "CH2", "N", "NE1", "O"],
    "TYR": ["C", "CA", "CB", "CG", "CD1", "CD2", "CE1", "CE2", "CZ", "N", "O", "OH"],
    "VAL": ["C", "CA", "CB", "CG1", "CG2", "N", "O"],
}
    def always_no_grad_forward(self, seq_feats, pair_feats, true_aa, residx, mask, no_recycles):
        """Replacement for ``EsmFoldingTrunk.forward`` that runs every recycle under ``torch.no_grad()``.

        Inputs:
            seq_feats: B x L x C tensor of sequence features
            pair_feats: B x L x L x C tensor of pair features
            true_aa: B x L long tensor of amino-acid identities fed to the structure module
            residx: B x L long tensor giving the position in the sequence
            mask: B x L boolean tensor indicating valid residues
            no_recycles: number of recycling passes; ``None`` falls back to ``self.config.max_recycles``

        Output:
            structure dict from the structure module, augmented with the final "s_s" and "s_z" trunk states.
        """

        device = seq_feats.device
        s_s_0 = seq_feats
        s_z_0 = pair_feats

        if no_recycles is None:
            no_recycles = self.config.max_recycles
        else:
            if no_recycles < 0:
                raise ValueError("Number of recycles must not be negative.")
            no_recycles += 1  # First 'recycle' is just the standard forward pass through the model.

        def trunk_iter(s, z, residx, mask):
            # One pass through all trunk blocks, with pairwise positional embedding added to z.
            z = z + self.pairwise_positional_embedding(residx, mask=mask)

            for block in self.blocks:
                s, z = block(s, z, mask=mask, residue_index=residx, chunk_size=self.chunk_size)
            return s, z

        s_s = s_s_0
        s_z = s_z_0
        recycle_s = torch.zeros_like(s_s)
        recycle_z = torch.zeros_like(s_z)
        recycle_bins = torch.zeros(*s_z.shape[:-1], device=device, dtype=torch.int64)

        # NOTE(review): unlike the upstream trunk (which disables grad only for the
        # non-final recycles), this variant wraps the whole loop body in no_grad —
        # inference-only, as the function name suggests. Confirm against upstream.
        for recycle_idx in range(no_recycles):
            with torch.no_grad():
                # === Recycling ===
                recycle_s = self.recycle_s_norm(recycle_s.detach()).to(device)
                recycle_z = self.recycle_z_norm(recycle_z.detach()).to(device)
                recycle_z += self.recycle_disto(recycle_bins.detach()).to(device)

                s_s, s_z = trunk_iter(s_s_0 + recycle_s, s_z_0 + recycle_z, residx, mask)

                # === Structure module ===
                structure = self.structure_module(
                    {"single": self.trunk2sm_s(s_s), "pair": self.trunk2sm_z(s_z)},
                    true_aa,
                    mask.float(),
                )

                recycle_s = s_s
                recycle_z = s_z
                # Distogram needs the N, CA, C coordinates, and bin constants same as alphafold.
                recycle_bins = EsmFoldingTrunk.distogram(
                    structure["positions"][-1][:, :, :3],
                    3.375,
                    21.375,
                    self.recycle_bins,
                )

        structure["s_s"] = s_s
        structure["s_z"] = s_z

        return structure
    # Install the no-grad forward pass in place of the stock trunk forward.
    EsmFoldingTrunk.forward = always_no_grad_forward

# Set up basic logging configuration
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# TODO: turn this into a Pydantic model instead
@dataclass
class ESMFoldOutput(StructurePrediction):
    """Output from ESMFold prediction including all model outputs.

    Shapes below use: model_layer = structure-module layer, batch_size = number of input
    sequences, residue = (padded) residue count. Fields marked "???" were not verified
    against the model — treat those shape comments as assumptions to confirm.
    """

    # TODO: we should figure out what should be the verbosity of the output,
    # as a usual user does not need all of this information

    # Required by StructurePrediction protocol
    positions: np.ndarray  # (model_layer, batch_size, residue, atom=14, xyz=3)
    metadata: PredictionMetadata

    # Additional ESMFold-specific outputs
    frames: np.ndarray  # (model_layer, batch_size, residue, qxyz=7)
    sidechain_frames: np.ndarray  # (model_layer, batch_size, residue, 8, 4, 4) [rot matrix per sidechain]
    unnormalized_angles: np.ndarray  # (model_layer, batch_size, residue, 7, 2) [torsion angles]
    angles: np.ndarray  # (model_layer, batch_size, residue, 7, 2) [torsion angles]
    states: np.ndarray  # (model_layer, batch_size, residue, ???)
    s_s: np.ndarray  # (batch_size, residue, 1024)
    s_z: np.ndarray  # (batch_size, residue, residue, 128)
    distogram_logits: np.ndarray  # (batch_size, residue, residue, 64) ???
    lm_logits: np.ndarray  # (batch_size, residue, 23) ???
    aatype: np.ndarray  # (batch_size, residue) amino acid identity
    atom14_atom_exists: np.ndarray  # (batch_size, residue, atom=14)
    residx_atom14_to_atom37: np.ndarray  # (batch_size, residue, atom=14)
    residx_atom37_to_atom14: np.ndarray  # (batch_size, residue, atom=37)
    atom37_atom_exists: np.ndarray  # (batch_size, residue, atom=37)
    residue_index: np.ndarray  # (batch_size, residue)
    lddt_head: np.ndarray  # (model_layer, batch_size, residue, atom=37, 50) ??
    plddt: np.ndarray  # (batch_size, residue, atom=37)
    ptm_logits: np.ndarray  # (batch_size, residue, residue, 64) ???
    ptm: np.ndarray  # float # TODO: make it into a float when sending to the client
    aligned_confidence_probs: np.ndarray  # (batch_size, residue, residue, 64)
    predicted_aligned_error: np.ndarray  # (batch_size, residue, residue)
    max_predicted_aligned_error: np.ndarray  # float # TODO: make it into a float when sending to the client
    chain_index: np.ndarray  # (batch_size, residue)
    # TODO: maybe add this to the output to clearly indicate padded residues
    atom_array: Optional[AtomArray] = None  # 0-indexed
    pdb: Optional[list[str]] = None  # 0-indexed
    cif: Optional[list[str]] = None  # 0-indexed

    # TODO: can add a save method here (to a pickle and a pdb file) that can be run locally
    # TODO: add verification of the outputs, and primarily the shape of all the arrays
    # (see test_esmfold_batch_multimer_linkers for the exact batched shapes)
GPU_TO_USE = os.environ.get("BOILEROOM_GPU", "T4")

if GPU_TO_USE not in GPUS_AVAIL_ON_MODAL:
    raise ValueError(
        f"GPU specified in BOILEROOM_GPU environment variable ('{GPU_TO_USE}') not available on "
        f"Modal. Please choose from: {GPUS_AVAIL_ON_MODAL}"
    )


@app.cls(
    image=esm_image,
    gpu=GPU_TO_USE,
    timeout=20 * MINUTES,
    container_idle_timeout=10 * MINUTES,
    volumes={MODEL_DIR: model_weights},
)
class ESMFold(FoldingAlgorithm):
    """ESMFold protein structure prediction model."""

    # TODO: maybe this config should be input to the fold function, so that it can
    # changed programmatically on a single ephermal app, rather than re-creating the app?
    DEFAULT_CONFIG = {
        # ESMFold model config
        "output_pdb": False,
        "output_cif": False,
        "output_atomarray": False,
        # Chain linking and positioning config
        "glycine_linker": "",
        "position_ids_skip": 512,
    }

    # We need to properly assess whether using this or the original ESMFold is better
    # based on speed, accuracy, bugs, etc.; as well as customizability
    # For instance, if we want to also allow differently sized structure modules, than this would be good
    # TODO: we should add a settings dictionary or something, that would make it easier to add new options
    # TODO: maybe use OmegaConf instead to make it easier instead of config
    def __init__(self, config: Optional[dict] = None) -> None:
        """Initialize ESMFold.

        Args:
            config: Optional configuration overrides; presumably merged over
                ``DEFAULT_CONFIG`` by the base class — TODO confirm in FoldingAlgorithm.
        """
        # BUG FIX: the previous signature used a mutable default (`config: dict = {}`),
        # which is a single dict object shared by every instantiation — any mutation by
        # the base class would leak across instances. Use None as the sentinel instead;
        # passing an explicit dict (including {}) behaves exactly as before.
        super().__init__({} if config is None else config)
        self.metadata = self._initialize_metadata(
            model_name="ESMFold",
            model_version="v4.49.0",  # HuggingFace transformers version
        )
        self.model_dir: Optional[str] = os.environ.get("MODEL_DIR", MODEL_DIR)
        self.tokenizer: Optional[AutoTokenizer] = None
        self.model: Optional[EsmForProteinFolding] = None

    @modal.enter()
    def _initialize(self) -> None:
        """Initialize the model during container startup. This helps us determine whether we run locally or remotely."""
        self._load()
    def _load(self) -> None:
        """Load the ESMFold model and tokenizer.

        Idempotent: tokenizer and model are only fetched when still ``None``,
        so a warm container skips the (slow) weight download/deserialization.
        Weights are cached under ``self.model_dir`` (the mounted Modal volume).
        """
        if self.tokenizer is None:
            self.tokenizer = AutoTokenizer.from_pretrained("facebook/esmfold_v1", cache_dir=self.model_dir)
        if self.model is None:
            self.model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1", cache_dir=self.model_dir)
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
            self.model = self.model.to(self.device)
            self.model.eval()
            # Chunked trunk attention trades speed for a lower peak-memory footprint.
            self.model.trunk.set_chunk_size(64)
        self.ready = True

    @modal.method()
    def fold(self, sequences: Union[str, List[str]]) -> ESMFoldOutput:
        """Predict protein structure(s) using ESMFold.

        Args:
            sequences: One sequence or a list of sequences; chains within a
                multimer are separated by ":".

        Returns:
            ESMFoldOutput with positions, confidence metrics and optional
            PDB/CIF/AtomArray renderings (per the instance config).

        Raises:
            ValueError: if a sequence contains an invalid amino acid
                (via ``_validate_sequences``).
        """
        # Defensive: tolerate being called before @modal.enter ran (e.g. locally).
        if self.tokenizer is None or self.model is None:
            logger.warning("Model not loaded. Forcing the model to load... Next time call _load() first.")
            self._load()
            assert self.tokenizer is not None and self.model is not None, "Model not loaded"

        if isinstance(sequences, str):
            sequences = [sequences]

        sequences = self._validate_sequences(sequences)
        self.metadata.sequence_lengths = self._compute_sequence_lengths(sequences)

        tokenized_input, multimer_properties = self._tokenize_sequences(sequences)

        with Timer("Model Inference") as timer:
            with torch.inference_mode():
                outputs = self.model(**tokenized_input)

        outputs = self._convert_outputs(outputs, multimer_properties, timer.duration)
        return outputs

    def _tokenize_sequences(self, sequences: List[str]) -> tuple[dict, dict[str, torch.Tensor] | None]:
        """Tokenize sequences; multimers (":" present) get linker handling, monomers plain tokenization.

        Returns the tokenized batch (tensors moved to ``self.device``) and, for
        multimers, the linker/residue/chain bookkeeping dict (``None`` for monomers).
        """
        assert self.tokenizer is not None, "Tokenizer not loaded"
        if ":" in "".join(sequences):  # MULTIMER setting
            tokenized, multimer_properties = self._tokenize_multimer(sequences)
        else:  # MONOMER setting
            tokenized = self.tokenizer(
                sequences, return_tensors="pt", add_special_tokens=False, padding=True, truncation=True, max_length=1024
            )
            multimer_properties = None
        tokenized = {k: v.to(self.device) for k, v in tokenized.items()}

        return tokenized, multimer_properties
return_tensors="pt", add_special_tokens=False, padding=True, truncation=True, max_length=1024 261 | ) 262 | multimer_properties = None 263 | tokenized = {k: v.to(self.device) for k, v in tokenized.items()} 264 | 265 | return tokenized, multimer_properties 266 | 267 | def _tokenize_multimer(self, sequences: List[str]) -> torch.Tensor: 268 | assert self.tokenizer is not None, "Tokenizer not loaded" 269 | # Store multimer properties first 270 | linker_map, residue_index, chain_index = store_multimer_properties(sequences, self.config["glycine_linker"]) 271 | 272 | # Create tokenized input using list comprehension directly 273 | glycine_linker = self.config["glycine_linker"] 274 | tokenized = self.tokenizer( 275 | [seq.replace(":", glycine_linker) for seq in sequences], 276 | padding=True, 277 | truncation=True, 278 | return_tensors="pt", 279 | add_special_tokens=False, 280 | ) 281 | 282 | # Add position IDs 283 | tokenized["position_ids"] = compute_position_ids(sequences, glycine_linker, self.config["position_ids_skip"]) 284 | 285 | # Create attention mask (1 means keep, 0 means mask) 286 | # This also masks padding tokens, which are -1 287 | tokenized["attention_mask"] = (linker_map == 1).to(torch.int32) 288 | 289 | return tokenized, {"linker_map": linker_map, "residue_index": residue_index, "chain_index": chain_index} 290 | 291 | def _mask_linker_region( 292 | self, 293 | outputs: dict, 294 | linker_map: torch.Tensor, 295 | residue_index: torch.Tensor, 296 | chain_index: torch.Tensor, 297 | ) -> dict: 298 | """Mask the linker region in the outputs and track padding information. 299 | This includes all the metrics. 
300 | 301 | Args: 302 | outputs: Dictionary containing model outputs 303 | 304 | Returns: 305 | dict: Updated outputs with linker regions masked and padding information 306 | """ 307 | assert isinstance(linker_map, torch.Tensor), "linker_map must be a tensor" 308 | 309 | positions = [] 310 | frames = [] 311 | sidechain_frames = [] 312 | unnormalized_angles = [] 313 | angles = [] 314 | states = [] 315 | lddt_head = [] 316 | 317 | s_s = [] 318 | lm_logits = [] 319 | aatype = [] 320 | atom14_atom_exists = [] 321 | residx_atom14_to_atom37 = [] 322 | residx_atom37_to_atom14 = [] 323 | atom37_atom_exists = [] 324 | plddt = [] 325 | 326 | s_z = [] 327 | distogram_logits = [] 328 | ptm_logits = [] 329 | aligned_confidence_probs = [] 330 | predicted_aligned_error = [] 331 | 332 | _residue_index = [] 333 | _chain_index = [] 334 | 335 | for batch_idx, multimer in enumerate(linker_map): 336 | # Drop the -1 values, meaning 1s refer to residues we want to keep 337 | multimer = multimer.masked_fill(multimer == -1, 0).cpu().numpy() 338 | # Chain indices are the ones that were not masked, hence they were kept and are thus 1 339 | chain_indices = np.where(multimer == 1)[0] 340 | 341 | # 3rd dim is residue index 342 | positions.append(outputs["positions"][:, batch_idx, chain_indices]) 343 | frames.append(outputs["frames"][:, batch_idx, chain_indices]) 344 | sidechain_frames.append(outputs["sidechain_frames"][:, batch_idx, chain_indices]) 345 | unnormalized_angles.append(outputs["unnormalized_angles"][:, batch_idx, chain_indices]) 346 | angles.append(outputs["angles"][:, batch_idx, chain_indices]) 347 | states.append(outputs["states"][:, batch_idx, chain_indices]) 348 | lddt_head.append(outputs["lddt_head"][:, batch_idx, chain_indices]) 349 | 350 | # 2nd dim is residue index 351 | s_s.append(outputs["s_s"][batch_idx, chain_indices]) 352 | lm_logits.append(outputs["lm_logits"][batch_idx, chain_indices]) 353 | aatype.append(outputs["aatype"][batch_idx, chain_indices]) 354 | 
atom14_atom_exists.append(outputs["atom14_atom_exists"][batch_idx, chain_indices]) 355 | residx_atom14_to_atom37.append(outputs["residx_atom14_to_atom37"][batch_idx, chain_indices]) 356 | residx_atom37_to_atom14.append(outputs["residx_atom37_to_atom14"][batch_idx, chain_indices]) 357 | atom37_atom_exists.append(outputs["atom37_atom_exists"][batch_idx, chain_indices]) 358 | plddt.append(outputs["plddt"][batch_idx, chain_indices]) 359 | 360 | # 2D properties that are per residue pair; thus residues is both the 2nd and 3rd dim 361 | s_z.append(outputs["s_z"][batch_idx, chain_indices][:, chain_indices]) 362 | distogram_logits.append(outputs["distogram_logits"][batch_idx, chain_indices][:, chain_indices]) 363 | ptm_logits.append(outputs["ptm_logits"][batch_idx, chain_indices][:, chain_indices]) 364 | aligned_confidence_probs.append( 365 | outputs["aligned_confidence_probs"][batch_idx, chain_indices][:, chain_indices] 366 | ) 367 | predicted_aligned_error.append( 368 | outputs["predicted_aligned_error"][batch_idx, chain_indices][:, chain_indices] 369 | ) 370 | 371 | # Custom outputs, that also have 2nd dimension as residue index 372 | _residue_index.append(residue_index[batch_idx, chain_indices].cpu().numpy()) 373 | _chain_index.append(chain_index[batch_idx, chain_indices].cpu().numpy()) 374 | 375 | def pad_and_stack( 376 | arrays: list[np.ndarray], residue_dim: Union[int, List[int]], batch_dim: int, intermediate_dim: bool = False 377 | ) -> np.ndarray: 378 | """Pad arrays to match the largest size in the residue dimension and stack them in the batch dimension. 
379 | 380 | Args: 381 | arrays: List of NumPy arrays to pad and stack 382 | residue_dim: Dimension(s) to pad to match sizes 383 | batch_dim: Dimension to stack the arrays along 384 | intermediate_dim: Whether the array has an intermediate dimension to preserve 385 | 386 | Returns: 387 | Stacked and padded NumPy array 388 | """ 389 | if isinstance(residue_dim, int): 390 | max_size = max(arr.shape[residue_dim] for arr in arrays) 391 | padded_arrays = [] 392 | for arr in arrays: 393 | padding = [(0, 0)] * arr.ndim 394 | padding[residue_dim] = (0, max_size - arr.shape[residue_dim]) 395 | padded_arrays.append(np.pad(arr, padding, mode="constant", constant_values=-1)) 396 | elif isinstance(residue_dim, list): 397 | # Multi-dimension padding (e.g., for 2D matrices) 398 | max_sizes = [] 399 | for dim in residue_dim: 400 | max_sizes.append(max(arr.shape[dim] for arr in arrays)) 401 | 402 | padded_arrays = [] 403 | for arr in arrays: 404 | padding = [(0, 0)] * arr.ndim 405 | for dim, max_size in zip(residue_dim, max_sizes): 406 | padding[dim] = (0, max_size - arr.shape[dim]) 407 | padded_arrays.append(np.pad(arr, padding, mode="constant", constant_values=-1)) 408 | 409 | # Handle intermediate dimensions differently 410 | if intermediate_dim: 411 | # Stack along axis=1 to preserve intermediate dim as first dimension 412 | return np.stack(padded_arrays, axis=1) 413 | else: 414 | return np.stack(padded_arrays, axis=batch_dim) 415 | 416 | # 2nd dimension is the batch size, 3rd dimension was the residue index (without batch it's the 2nd dim) 417 | # These are not done same as below is because of getting the 8 intermediate outputs from StructureModule 418 | outputs["positions"] = pad_and_stack(positions, residue_dim=1, batch_dim=0, intermediate_dim=True) 419 | outputs["frames"] = pad_and_stack(frames, residue_dim=1, batch_dim=0, intermediate_dim=True) 420 | outputs["sidechain_frames"] = pad_and_stack(sidechain_frames, residue_dim=1, batch_dim=0, intermediate_dim=True) 421 | 
outputs["unnormalized_angles"] = pad_and_stack( 422 | unnormalized_angles, residue_dim=1, batch_dim=0, intermediate_dim=True 423 | ) 424 | outputs["angles"] = pad_and_stack(angles, residue_dim=1, batch_dim=0, intermediate_dim=True) 425 | outputs["states"] = pad_and_stack(states, residue_dim=1, batch_dim=0, intermediate_dim=True) 426 | outputs["lddt_head"] = pad_and_stack(lddt_head, residue_dim=1, batch_dim=0, intermediate_dim=True) 427 | 428 | # 1st dimension is the batch size, 2nd dimension was the residue index (without batch it's the 1st dim) 429 | outputs["s_s"] = pad_and_stack(s_s, residue_dim=0, batch_dim=0) 430 | outputs["lm_logits"] = pad_and_stack(lm_logits, residue_dim=0, batch_dim=0) 431 | outputs["aatype"] = pad_and_stack(aatype, residue_dim=0, batch_dim=0) 432 | outputs["atom14_atom_exists"] = pad_and_stack(atom14_atom_exists, residue_dim=0, batch_dim=0) 433 | outputs["residx_atom14_to_atom37"] = pad_and_stack(residx_atom14_to_atom37, residue_dim=0, batch_dim=0) 434 | outputs["residx_atom37_to_atom14"] = pad_and_stack(residx_atom37_to_atom14, residue_dim=0, batch_dim=0) 435 | outputs["atom37_atom_exists"] = pad_and_stack(atom37_atom_exists, residue_dim=0, batch_dim=0) 436 | outputs["plddt"] = pad_and_stack(plddt, residue_dim=0, batch_dim=0) 437 | 438 | # 2D properties, otherwise same as above 439 | outputs["s_z"] = pad_and_stack(s_z, residue_dim=[0, 1], batch_dim=0) 440 | outputs["distogram_logits"] = pad_and_stack(distogram_logits, residue_dim=[0, 1], batch_dim=0) 441 | outputs["ptm_logits"] = pad_and_stack(ptm_logits, residue_dim=[0, 1], batch_dim=0) 442 | outputs["aligned_confidence_probs"] = pad_and_stack(aligned_confidence_probs, residue_dim=[0, 1], batch_dim=0) 443 | outputs["predicted_aligned_error"] = pad_and_stack(predicted_aligned_error, residue_dim=[0, 1], batch_dim=0) 444 | 445 | # Custom 446 | outputs["chain_index"] = pad_and_stack(_chain_index, residue_dim=0, batch_dim=0) 447 | outputs["residue_index"] = pad_and_stack(_residue_index, 
residue_dim=0, batch_dim=0) 448 | 449 | return outputs 450 | 451 | def _convert_outputs( 452 | self, 453 | outputs: dict, 454 | multimer_properties: dict[str, torch.Tensor] | None, 455 | prediction_time: float, 456 | ) -> ESMFoldOutput: 457 | """Convert model outputs to ESMFoldOutput format.""" 458 | 459 | outputs = {k: v.cpu().numpy() for k, v in outputs.items()} 460 | if multimer_properties is not None: 461 | # TODO: maybe add a proper MULTIMER flag? 462 | outputs = self._mask_linker_region(outputs, **multimer_properties) 463 | else: # only MONOMERs 464 | outputs["chain_index"] = np.zeros(outputs["residue_index"].shape, dtype=np.int32) 465 | 466 | self.metadata.prediction_time = prediction_time 467 | 468 | atom_array = self._convert_outputs_to_atomarray(outputs) 469 | if self.config["output_pdb"]: 470 | outputs["pdb"] = self._convert_outputs_to_pdb(atom_array) 471 | if self.config["output_cif"]: 472 | outputs["cif"] = self._convert_outputs_to_cif(atom_array) 473 | if self.config["output_atomarray"]: 474 | outputs["atom_array"] = atom_array 475 | 476 | return ESMFoldOutput(metadata=self.metadata, **outputs) 477 | 478 | def _convert_outputs_to_atomarray(self, outputs: dict) -> AtomArray: 479 | """Convert ESMFold outputs to a Biotite AtomArray. 
480 | 481 | Args: 482 | outputs: Dictionary containing ESMFold model outputs 483 | 484 | Returns: 485 | AtomArray: Biotite structure representation 486 | """ 487 | from biotite.structure import Atom, array 488 | from transformers.models.esm.openfold_utils.feats import atom14_to_atom37 489 | from transformers.models.esm.openfold_utils.residue_constants import atom_types, restypes, restype_1to3 490 | 491 | # Convert atom14 to atom37 format 492 | atom_positions = atom14_to_atom37( 493 | outputs["positions"][-1], outputs 494 | ) # (model_layer, batch, residue, atom37, xyz) 495 | atom_mask = outputs["atom37_atom_exists"] # (batch, residue, atom37) 496 | 497 | assert len(atom_types) == atom_positions.shape[2] == 37, "Atom types must be 37" 498 | 499 | # Get batch and residue dimensions 500 | batch_size, n_residues, n_atoms = atom_mask.shape 501 | 502 | # Create list to store atoms 503 | arrays = [] 504 | 505 | # Process each protein in the batch 506 | for b in range(batch_size): 507 | atoms = [] # clear out the atoms list 508 | # Process each residue 509 | for res_idx in range(n_residues): 510 | # Get chain ID (convert numeric index to letter A-Z) 511 | chain_id = chr(65 + outputs["chain_index"][b, res_idx]) # A=65 in ASCII 512 | 513 | # Get residue name (3-letter code) 514 | aa_type = outputs["aatype"][b, res_idx] # id representing residue identity 515 | res_name = restypes[aa_type] # 1-letter residue identity 516 | res_name = restype_1to3[res_name] # 3-letter residue identity 517 | 518 | # Process each atom in the residue 519 | for atom_idx in range(n_atoms): # loops through all 37 atom types 520 | # Skip if atom doesn't exist 521 | if not atom_mask[b, res_idx, atom_idx]: 522 | continue 523 | 524 | # Get atom coordinates 525 | coord = atom_positions[b, res_idx, atom_idx] 526 | 527 | # Create Atom object 528 | atom = Atom( 529 | coord=coord, 530 | chain_id=chain_id, 531 | atom_name=atom_types[atom_idx], 532 | res_name=res_name, 533 | res_id=outputs["residue_index"][b, 
res_idx], # 0-indexed 534 | element=atom_types[atom_idx][0], 535 | # we only support C, N, O, S, [according to OpenFold Protein class] 536 | # element is thus the first character of any atom name (according to PDB nomenclature) 537 | b_factor=outputs["plddt"][b, res_idx, atom_idx], 538 | ) 539 | atoms.append(atom) 540 | arrays.append(array(atoms)) 541 | return arrays 542 | 543 | def _convert_outputs_to_pdb(self, atom_array: AtomArray) -> list[str]: 544 | # TODO: this might make more sense to do locally, instead of doing it on the Modal instance 545 | from biotite.structure.io.pdb import PDBFile, set_structure 546 | from io import StringIO 547 | 548 | pdbs = [] 549 | for a in atom_array: 550 | structure_file = PDBFile() 551 | set_structure(structure_file, a) 552 | string = StringIO() 553 | structure_file.write(string) 554 | pdbs.append(string.getvalue()) 555 | return pdbs 556 | 557 | def _convert_outputs_to_cif(self, atom_array: AtomArray) -> list[str]: 558 | # TODO: this might make more sense to do locally, instead of doing it on the Modal instance 559 | from biotite.structure.io.pdbx import CIFFile, set_structure 560 | from io import StringIO 561 | 562 | cifs = [] 563 | for a in atom_array: 564 | structure_file = CIFFile() 565 | set_structure(structure_file, a) 566 | string = StringIO() 567 | structure_file.write(string) 568 | cifs.append(string.getvalue()) 569 | return cifs 570 | 571 | 572 | def get_esmfold(gpu_type="T4", config: dict = {}): 573 | """ 574 | Note that the app will still show that's using T4, but the actual method / function call will use the correct GPU, 575 | and display accordingly in the Modal dashboard. 
576 | """ 577 | Model = ESMFold.with_options(gpu=gpu_type) # type: ignore 578 | return Model(config=config) 579 | -------------------------------------------------------------------------------- /tests/data/esmfold_server_medium.pdb: -------------------------------------------------------------------------------- 1 | HEADER 18-OCT-22 2 | TITLE ESMFOLD V1 PREDICTION FOR INPUT 3 | REMARK 1 4 | REMARK 1 REFERENCE 1 5 | REMARK 1 AUTH ZEMING LIN, HALIL AKIN, ROSHAN RAO, BRIAN HIE, ZHONGKAI ZHU, 6 | REMARK 1 AUTH 2 WENTING LU, NIKITA SMETANIN, ROBERT VERKUIL, ORI KABELI, 7 | REMARK 1 AUTH 3 YANIV SHMUELI, ALLAN DOS SANTOS COSTA, 8 | REMARK 1 AUTH 4 MARYAM FAZEL-ZARANDI, TOM SERCU, SALVATORE CANDIDO, 9 | REMARK 1 AUTH 5 ALEXANDER RIVES 10 | REMARK 1 TITL EVOLUTIONARY-SCALE PREDICTION OF ATOMIC LEVEL PROTEIN 11 | REMARK 1 TITL 2 STRUCTURE WITH A LANGUAGE MODEL 12 | REMARK 1 REF 13 | REMARK 1 REFN 14 | REMARK 1 PMID 15 | REMARK 1 DOI 10.1101/2022.07.20.500902 16 | REMARK 1 17 | REMARK 1 LICENSE AND DISCLAIMERS 18 | REMARK 1 ESM METAGENOMIC ATLAS DATA IS AVAILABLE UNDER 19 | REMARK 1 A CC-BY-4.0 LICENSE FOR ACADEMIC AND COMMERCIAL USE. 20 | REMARK 1 COPYRIGHT (C) META PLATFORMS, INC. ALL RIGHTS RESERVED. 21 | REMARK 1 USE OF THE ESM METAGENOMIC ATLAS DATA IS SUBJECT 22 | REMARK 1 TO THE META OPEN SOURCE TERMS OF USE AND PRIVACY POLICY. 
23 | ATOM 1 N MET A 1 26.052 18.992 -15.018 1.00 0.65 N 24 | ATOM 2 CA MET A 1 24.789 19.636 -14.668 1.00 0.65 C 25 | ATOM 3 C MET A 1 23.762 19.460 -15.781 1.00 0.66 C 26 | ATOM 4 CB MET A 1 25.004 21.124 -14.386 1.00 0.58 C 27 | ATOM 5 O MET A 1 22.557 19.439 -15.523 1.00 0.64 O 28 | ATOM 6 CG MET A 1 25.590 21.409 -13.012 1.00 0.55 C 29 | ATOM 7 SD MET A 1 25.304 23.139 -12.470 1.00 0.55 S 30 | ATOM 8 CE MET A 1 25.949 23.054 -10.777 1.00 0.50 C 31 | ATOM 9 N ALA A 2 24.292 19.398 -16.973 1.00 0.72 N 32 | ATOM 10 CA ALA A 2 23.414 19.330 -18.139 1.00 0.72 C 33 | ATOM 11 C ALA A 2 22.697 17.985 -18.208 1.00 0.72 C 34 | ATOM 12 CB ALA A 2 24.210 19.572 -19.419 1.00 0.66 C 35 | ATOM 13 O ALA A 2 21.561 17.903 -18.679 1.00 0.69 O 36 | ATOM 14 N LEU A 3 23.370 17.004 -17.718 1.00 0.80 N 37 | ATOM 15 CA LEU A 3 22.810 15.664 -17.858 1.00 0.79 C 38 | ATOM 16 C LEU A 3 21.563 15.503 -16.995 1.00 0.79 C 39 | ATOM 17 CB LEU A 3 23.849 14.607 -17.476 1.00 0.75 C 40 | ATOM 18 O LEU A 3 20.616 14.818 -17.388 1.00 0.76 O 41 | ATOM 19 CG LEU A 3 23.687 13.232 -18.126 1.00 0.68 C 42 | ATOM 20 CD1 LEU A 3 24.421 13.188 -19.462 1.00 0.62 C 43 | ATOM 21 CD2 LEU A 3 24.196 12.137 -17.194 1.00 0.62 C 44 | ATOM 22 N TRP A 4 21.538 16.162 -15.722 1.00 0.77 N 45 | ATOM 23 CA TRP A 4 20.392 16.073 -14.823 1.00 0.79 C 46 | ATOM 24 C TRP A 4 19.141 16.652 -15.476 1.00 0.76 C 47 | ATOM 25 CB TRP A 4 20.681 16.803 -13.509 1.00 0.72 C 48 | ATOM 26 O TRP A 4 18.042 16.119 -15.306 1.00 0.72 O 49 | ATOM 27 CG TRP A 4 21.608 16.063 -12.592 1.00 0.63 C 50 | ATOM 28 CD1 TRP A 4 22.958 16.241 -12.466 1.00 0.56 C 51 | ATOM 29 CD2 TRP A 4 21.253 15.023 -11.676 1.00 0.57 C 52 | ATOM 30 CE2 TRP A 4 22.439 14.615 -11.026 1.00 0.51 C 53 | ATOM 31 CE3 TRP A 4 20.046 14.395 -11.341 1.00 0.62 C 54 | ATOM 32 NE1 TRP A 4 23.464 15.373 -11.525 1.00 0.65 N 55 | ATOM 33 CH2 TRP A 4 21.258 13.010 -9.750 1.00 0.61 C 56 | ATOM 34 CZ2 TRP A 4 22.452 13.607 -10.059 1.00 0.66 C 57 | ATOM 35 CZ3 TRP A 4 20.061 13.392 
-10.378 1.00 0.60 C 58 | ATOM 36 N MET A 5 19.363 17.705 -16.222 1.00 0.81 N 59 | ATOM 37 CA MET A 5 18.210 18.391 -16.799 1.00 0.81 C 60 | ATOM 38 C MET A 5 17.573 17.550 -17.900 1.00 0.80 C 61 | ATOM 39 CB MET A 5 18.618 19.756 -17.354 1.00 0.77 C 62 | ATOM 40 O MET A 5 16.382 17.693 -18.185 1.00 0.77 O 63 | ATOM 41 CG MET A 5 18.812 20.820 -16.285 1.00 0.70 C 64 | ATOM 42 SD MET A 5 19.375 22.422 -16.980 1.00 0.67 S 65 | ATOM 43 CE MET A 5 18.971 23.529 -15.601 1.00 0.64 C 66 | ATOM 44 N ARG A 6 18.403 16.787 -18.427 1.00 0.79 N 67 | ATOM 45 CA ARG A 6 17.871 15.935 -19.486 1.00 0.79 C 68 | ATOM 46 C ARG A 6 17.163 14.716 -18.905 1.00 0.79 C 69 | ATOM 47 CB ARG A 6 18.988 15.491 -20.432 1.00 0.76 C 70 | ATOM 48 O ARG A 6 16.239 14.177 -19.518 1.00 0.77 O 71 | ATOM 49 CG ARG A 6 19.569 16.618 -21.271 1.00 0.73 C 72 | ATOM 50 CD ARG A 6 20.611 16.108 -22.257 1.00 0.71 C 73 | ATOM 51 NE ARG A 6 21.169 17.192 -23.060 1.00 0.65 N 74 | ATOM 52 NH1 ARG A 6 22.611 15.841 -24.257 1.00 0.60 N 75 | ATOM 53 NH2 ARG A 6 22.543 18.096 -24.663 1.00 0.58 N 76 | ATOM 54 CZ ARG A 6 22.107 17.041 -23.991 1.00 0.65 C 77 | ATOM 55 N LEU A 7 17.636 14.248 -17.747 1.00 0.81 N 78 | ATOM 56 CA LEU A 7 17.089 13.018 -17.184 1.00 0.81 C 79 | ATOM 57 C LEU A 7 15.776 13.291 -16.457 1.00 0.81 C 80 | ATOM 58 CB LEU A 7 18.093 12.375 -16.224 1.00 0.78 C 81 | ATOM 59 O LEU A 7 14.936 12.398 -16.329 1.00 0.78 O 82 | ATOM 60 CG LEU A 7 19.228 11.575 -16.866 1.00 0.72 C 83 | ATOM 61 CD1 LEU A 7 20.409 11.468 -15.907 1.00 0.66 C 84 | ATOM 62 CD2 LEU A 7 18.740 10.191 -17.279 1.00 0.67 C 85 | ATOM 63 N LEU A 8 15.576 14.517 -16.010 1.00 0.80 N 86 | ATOM 64 CA LEU A 8 14.412 14.837 -15.190 1.00 0.79 C 87 | ATOM 65 C LEU A 8 13.120 14.603 -15.965 1.00 0.80 C 88 | ATOM 66 CB LEU A 8 14.477 16.289 -14.710 1.00 0.76 C 89 | ATOM 67 O LEU A 8 12.179 13.999 -15.444 1.00 0.78 O 90 | ATOM 68 CG LEU A 8 15.119 16.523 -13.342 1.00 0.71 C 91 | ATOM 69 CD1 LEU A 8 15.693 17.933 -13.259 1.00 0.64 C 92 | ATOM 70 
CD2 LEU A 8 14.105 16.286 -12.228 1.00 0.65 C 93 | ATOM 71 N PRO A 9 13.063 15.189 -17.227 1.00 0.80 N 94 | ATOM 72 CA PRO A 9 11.797 14.925 -17.915 1.00 0.80 C 95 | ATOM 73 C PRO A 9 11.551 13.436 -18.147 1.00 0.80 C 96 | ATOM 74 CB PRO A 9 11.958 15.665 -19.245 1.00 0.78 C 97 | ATOM 75 O PRO A 9 10.399 12.999 -18.208 1.00 0.79 O 98 | ATOM 76 CG PRO A 9 13.354 16.199 -19.219 1.00 0.77 C 99 | ATOM 77 CD PRO A 9 13.973 15.852 -17.895 1.00 0.78 C 100 | ATOM 78 N LEU A 10 12.604 12.701 -18.345 1.00 0.79 N 101 | ATOM 79 CA LEU A 10 12.424 11.271 -18.567 1.00 0.78 C 102 | ATOM 80 C LEU A 10 11.899 10.586 -17.310 1.00 0.79 C 103 | ATOM 81 CB LEU A 10 13.744 10.626 -19.000 1.00 0.76 C 104 | ATOM 82 O LEU A 10 11.049 9.696 -17.392 1.00 0.77 O 105 | ATOM 83 CG LEU A 10 14.060 10.667 -20.496 1.00 0.72 C 106 | ATOM 84 CD1 LEU A 10 15.566 10.758 -20.717 1.00 0.67 C 107 | ATOM 85 CD2 LEU A 10 13.484 9.442 -21.198 1.00 0.67 C 108 | ATOM 86 N LEU A 11 12.426 10.939 -16.134 1.00 0.78 N 109 | ATOM 87 CA LEU A 11 11.946 10.378 -14.876 1.00 0.78 C 110 | ATOM 88 C LEU A 11 10.493 10.769 -14.626 1.00 0.78 C 111 | ATOM 89 CB LEU A 11 12.820 10.849 -13.711 1.00 0.76 C 112 | ATOM 90 O LEU A 11 9.724 9.987 -14.062 1.00 0.77 O 113 | ATOM 91 CG LEU A 11 14.179 10.161 -13.562 1.00 0.72 C 114 | ATOM 92 CD1 LEU A 11 15.106 11.003 -12.692 1.00 0.68 C 115 | ATOM 93 CD2 LEU A 11 14.009 8.764 -12.975 1.00 0.68 C 116 | ATOM 94 N ALA A 12 10.168 12.021 -14.929 1.00 0.80 N 117 | ATOM 95 CA ALA A 12 8.783 12.457 -14.771 1.00 0.79 C 118 | ATOM 96 C ALA A 12 7.841 11.614 -15.625 1.00 0.80 C 119 | ATOM 97 CB ALA A 12 8.646 13.934 -15.133 1.00 0.78 C 120 | ATOM 98 O ALA A 12 6.720 11.310 -15.209 1.00 0.78 O 121 | ATOM 99 N LEU A 13 8.367 11.285 -16.860 1.00 0.80 N 122 | ATOM 100 CA LEU A 13 7.550 10.464 -17.747 1.00 0.79 C 123 | ATOM 101 C LEU A 13 7.355 9.066 -17.168 1.00 0.79 C 124 | ATOM 102 CB LEU A 13 8.193 10.369 -19.133 1.00 0.77 C 125 | ATOM 103 O LEU A 13 6.274 8.485 -17.288 1.00 0.77 O 126 | ATOM 
104 CG LEU A 13 7.907 11.524 -20.094 1.00 0.73 C 127 | ATOM 105 CD1 LEU A 13 8.997 11.611 -21.157 1.00 0.67 C 128 | ATOM 106 CD2 LEU A 13 6.535 11.356 -20.739 1.00 0.68 C 129 | ATOM 107 N LEU A 14 8.371 8.519 -16.560 1.00 0.78 N 130 | ATOM 108 CA LEU A 14 8.260 7.183 -15.984 1.00 0.76 C 131 | ATOM 109 C LEU A 14 7.288 7.176 -14.809 1.00 0.76 C 132 | ATOM 110 CB LEU A 14 9.633 6.680 -15.529 1.00 0.74 C 133 | ATOM 111 O LEU A 14 6.598 6.181 -14.575 1.00 0.74 O 134 | ATOM 112 CG LEU A 14 10.542 6.110 -16.618 1.00 0.70 C 135 | ATOM 113 CD1 LEU A 14 12.004 6.214 -16.197 1.00 0.66 C 136 | ATOM 114 CD2 LEU A 14 10.170 4.663 -16.924 1.00 0.66 C 137 | ATOM 115 N ALA A 15 7.321 8.268 -14.050 1.00 0.76 N 138 | ATOM 116 CA ALA A 15 6.430 8.354 -12.896 1.00 0.75 C 139 | ATOM 117 C ALA A 15 4.970 8.414 -13.333 1.00 0.75 C 140 | ATOM 118 CB ALA A 15 6.778 9.572 -12.044 1.00 0.73 C 141 | ATOM 119 O ALA A 15 4.078 7.968 -12.606 1.00 0.74 O 142 | ATOM 120 N LEU A 16 4.789 9.020 -14.538 1.00 0.77 N 143 | ATOM 121 CA LEU A 16 3.418 9.204 -14.999 1.00 0.75 C 144 | ATOM 122 C LEU A 16 2.878 7.923 -15.626 1.00 0.75 C 145 | ATOM 123 CB LEU A 16 3.342 10.351 -16.010 1.00 0.73 C 146 | ATOM 124 O LEU A 16 1.667 7.693 -15.634 1.00 0.72 O 147 | ATOM 125 CG LEU A 16 3.569 11.759 -15.456 1.00 0.70 C 148 | ATOM 126 CD1 LEU A 16 3.738 12.756 -16.597 1.00 0.66 C 149 | ATOM 127 CD2 LEU A 16 2.415 12.169 -14.548 1.00 0.66 C 150 | ATOM 128 N TRP A 17 3.722 7.027 -16.161 1.00 0.74 N 151 | ATOM 129 CA TRP A 17 3.296 5.799 -16.824 1.00 0.75 C 152 | ATOM 130 C TRP A 17 3.432 4.601 -15.890 1.00 0.73 C 153 | ATOM 131 CB TRP A 17 4.113 5.564 -18.098 1.00 0.70 C 154 | ATOM 132 O TRP A 17 3.072 3.478 -16.254 1.00 0.68 O 155 | ATOM 133 CG TRP A 17 3.781 6.507 -19.216 1.00 0.63 C 156 | ATOM 134 CD1 TRP A 17 4.047 7.847 -19.271 1.00 0.57 C 157 | ATOM 135 CD2 TRP A 17 3.116 6.179 -20.440 1.00 0.59 C 158 | ATOM 136 CE2 TRP A 17 3.013 7.371 -21.191 1.00 0.55 C 159 | ATOM 137 CE3 TRP A 17 2.598 4.992 -20.975 1.00 
0.65 C 160 | ATOM 138 NE1 TRP A 17 3.588 8.372 -20.456 1.00 0.65 N 161 | ATOM 139 CH2 TRP A 17 1.912 6.234 -22.951 1.00 0.61 C 162 | ATOM 140 CZ2 TRP A 17 2.411 7.409 -22.451 1.00 0.65 C 163 | ATOM 141 CZ3 TRP A 17 1.999 5.033 -22.229 1.00 0.60 C 164 | ATOM 142 N GLY A 18 4.037 4.844 -14.696 1.00 0.69 N 165 | ATOM 143 CA GLY A 18 4.207 3.725 -13.783 1.00 0.67 C 166 | ATOM 144 C GLY A 18 2.892 3.168 -13.271 1.00 0.68 C 167 | ATOM 145 O GLY A 18 1.835 3.767 -13.478 1.00 0.64 O 168 | ATOM 146 N PRO A 19 2.745 1.860 -13.147 1.00 0.63 N 169 | ATOM 147 CA PRO A 19 1.523 1.249 -12.618 1.00 0.62 C 170 | ATOM 148 C PRO A 19 0.988 1.973 -11.384 1.00 0.63 C 171 | ATOM 149 CB PRO A 19 1.963 -0.174 -12.268 1.00 0.59 C 172 | ATOM 150 O PRO A 19 1.767 2.503 -10.588 1.00 0.61 O 173 | ATOM 151 CG PRO A 19 3.456 -0.115 -12.235 1.00 0.58 C 174 | ATOM 152 CD PRO A 19 3.902 1.107 -12.985 1.00 0.57 C 175 | ATOM 153 N ASP A 20 -0.137 2.514 -11.540 1.00 0.61 N 176 | ATOM 154 CA ASP A 20 -0.769 3.111 -10.368 1.00 0.60 C 177 | ATOM 155 C ASP A 20 -0.518 2.268 -9.120 1.00 0.61 C 178 | ATOM 156 CB ASP A 20 -2.273 3.279 -10.595 1.00 0.57 C 179 | ATOM 157 O ASP A 20 -0.774 1.062 -9.117 1.00 0.60 O 180 | ATOM 158 CG ASP A 20 -2.753 4.700 -10.359 1.00 0.55 C 181 | ATOM 159 OD1 ASP A 20 -2.071 5.463 -9.642 1.00 0.55 O 182 | ATOM 160 OD2 ASP A 20 -3.825 5.058 -10.892 1.00 0.56 O 183 | ATOM 161 N PRO A 21 0.428 2.770 -8.153 1.00 0.58 N 184 | ATOM 162 CA PRO A 21 0.552 1.983 -6.923 1.00 0.58 C 185 | ATOM 163 C PRO A 21 -0.798 1.527 -6.376 1.00 0.58 C 186 | ATOM 164 CB PRO A 21 1.236 2.950 -5.953 1.00 0.55 C 187 | ATOM 165 O PRO A 21 -0.882 0.489 -5.715 1.00 0.58 O 188 | ATOM 166 CG PRO A 21 1.236 4.266 -6.662 1.00 0.53 C 189 | ATOM 167 CD PRO A 21 0.822 4.040 -8.088 1.00 0.54 C 190 | ATOM 168 N ALA A 22 -1.842 2.398 -6.585 1.00 0.58 N 191 | ATOM 169 CA ALA A 22 -3.142 2.105 -5.987 1.00 0.58 C 192 | ATOM 170 C ALA A 22 -3.796 0.901 -6.659 1.00 0.58 C 193 | ATOM 171 CB ALA A 22 -4.057 3.324 -6.080 1.00 
0.55 C 194 | ATOM 172 O ALA A 22 -4.564 0.171 -6.028 1.00 0.56 O 195 | ATOM 173 N ALA A 23 -3.407 0.679 -7.903 1.00 0.57 N 196 | ATOM 174 CA ALA A 23 -4.023 -0.432 -8.624 1.00 0.57 C 197 | ATOM 175 C ALA A 23 -3.591 -1.773 -8.038 1.00 0.56 C 198 | ATOM 176 CB ALA A 23 -3.670 -0.364 -10.108 1.00 0.53 C 199 | ATOM 177 O ALA A 23 -4.345 -2.748 -8.081 1.00 0.55 O 200 | ATOM 178 N ALA A 24 -2.296 -1.821 -7.577 1.00 0.56 N 201 | ATOM 179 CA ALA A 24 -1.852 -3.088 -7.002 1.00 0.55 C 202 | ATOM 180 C ALA A 24 -2.667 -3.443 -5.762 1.00 0.56 C 203 | ATOM 181 CB ALA A 24 -0.366 -3.024 -6.657 1.00 0.53 C 204 | ATOM 182 O ALA A 24 -2.923 -4.620 -5.496 1.00 0.55 O 205 | ATOM 183 N PHE A 25 -3.261 -2.394 -5.054 1.00 0.58 N 206 | ATOM 184 CA PHE A 25 -3.992 -2.699 -3.830 1.00 0.58 C 207 | ATOM 185 C PHE A 25 -5.456 -2.994 -4.133 1.00 0.58 C 208 | ATOM 186 CB PHE A 25 -3.885 -1.538 -2.836 1.00 0.55 C 209 | ATOM 187 O PHE A 25 -6.139 -3.648 -3.343 1.00 0.57 O 210 | ATOM 188 CG PHE A 25 -2.652 -1.588 -1.975 1.00 0.53 C 211 | ATOM 189 CD1 PHE A 25 -2.611 -2.389 -0.841 1.00 0.50 C 212 | ATOM 190 CD2 PHE A 25 -1.533 -0.832 -2.300 1.00 0.51 C 213 | ATOM 191 CE1 PHE A 25 -1.471 -2.437 -0.042 1.00 0.50 C 214 | ATOM 192 CE2 PHE A 25 -0.390 -0.875 -1.507 1.00 0.51 C 215 | ATOM 193 CZ PHE A 25 -0.361 -1.677 -0.378 1.00 0.49 C 216 | ATOM 194 N VAL A 26 -5.891 -2.531 -5.255 1.00 0.58 N 217 | ATOM 195 CA VAL A 26 -7.337 -2.503 -5.449 1.00 0.57 C 218 | ATOM 196 C VAL A 26 -7.829 -3.886 -5.869 1.00 0.58 C 219 | ATOM 197 CB VAL A 26 -7.749 -1.449 -6.501 1.00 0.54 C 220 | ATOM 198 O VAL A 26 -8.984 -4.242 -5.623 1.00 0.57 O 221 | ATOM 199 CG1 VAL A 26 -9.269 -1.376 -6.627 1.00 0.49 C 222 | ATOM 200 CG2 VAL A 26 -7.174 -0.081 -6.137 1.00 0.50 C 223 | ATOM 201 N ASN A 27 -6.835 -4.868 -6.257 1.00 0.56 N 224 | ATOM 202 CA ASN A 27 -7.415 -6.146 -6.657 1.00 0.57 C 225 | ATOM 203 C ASN A 27 -7.242 -7.204 -5.572 1.00 0.56 C 226 | ATOM 204 CB ASN A 27 -6.798 -6.627 -7.972 1.00 0.52 C 227 | ATOM 205 O ASN A 
27 -7.445 -8.394 -5.821 1.00 0.54 O 228 | ATOM 206 CG ASN A 27 -7.627 -6.242 -9.182 1.00 0.50 C 229 | ATOM 207 ND2 ASN A 27 -7.010 -6.272 -10.357 1.00 0.45 N 230 | ATOM 208 OD1 ASN A 27 -8.811 -5.918 -9.061 1.00 0.50 O 231 | ATOM 209 N GLN A 28 -6.793 -6.747 -4.428 1.00 0.65 N 232 | ATOM 210 CA GLN A 28 -6.747 -7.783 -3.401 1.00 0.65 C 233 | ATOM 211 C GLN A 28 -8.020 -7.780 -2.560 1.00 0.65 C 234 | ATOM 212 CB GLN A 28 -5.524 -7.595 -2.503 1.00 0.62 C 235 | ATOM 213 O GLN A 28 -8.355 -6.771 -1.935 1.00 0.62 O 236 | ATOM 214 CG GLN A 28 -4.344 -8.482 -2.877 1.00 0.59 C 237 | ATOM 215 CD GLN A 28 -3.184 -8.356 -1.907 1.00 0.58 C 238 | ATOM 216 NE2 GLN A 28 -2.141 -9.150 -2.124 1.00 0.49 N 239 | ATOM 217 OE1 GLN A 28 -3.225 -7.552 -0.971 1.00 0.58 O 240 | ATOM 218 N HIS A 29 -9.039 -8.415 -2.963 1.00 0.64 N 241 | ATOM 219 CA HIS A 29 -10.237 -8.685 -2.175 1.00 0.64 C 242 | ATOM 220 C HIS A 29 -9.905 -9.498 -0.928 1.00 0.64 C 243 | ATOM 221 CB HIS A 29 -11.279 -9.420 -3.019 1.00 0.61 C 244 | ATOM 222 O HIS A 29 -9.259 -10.545 -1.019 1.00 0.62 O 245 | ATOM 223 CG HIS A 29 -11.923 -8.560 -4.060 1.00 0.59 C 246 | ATOM 224 CD2 HIS A 29 -11.791 -8.545 -5.407 1.00 0.55 C 247 | ATOM 225 ND1 HIS A 29 -12.825 -7.566 -3.749 1.00 0.53 N 248 | ATOM 226 CE1 HIS A 29 -13.222 -6.976 -4.865 1.00 0.53 C 249 | ATOM 227 NE2 HIS A 29 -12.610 -7.552 -5.885 1.00 0.50 N 250 | ATOM 228 N LEU A 30 -9.925 -8.826 0.194 1.00 0.71 N 251 | ATOM 229 CA LEU A 30 -9.866 -9.579 1.443 1.00 0.70 C 252 | ATOM 230 C LEU A 30 -11.227 -10.180 1.780 1.00 0.71 C 253 | ATOM 231 CB LEU A 30 -9.394 -8.680 2.589 1.00 0.68 C 254 | ATOM 232 O LEU A 30 -12.221 -9.458 1.885 1.00 0.68 O 255 | ATOM 233 CG LEU A 30 -7.962 -8.151 2.490 1.00 0.64 C 256 | ATOM 234 CD1 LEU A 30 -7.706 -7.111 3.575 1.00 0.61 C 257 | ATOM 235 CD2 LEU A 30 -6.961 -9.297 2.592 1.00 0.63 C 258 | ATOM 236 N CYS A 31 -11.548 -11.380 1.500 1.00 0.72 N 259 | ATOM 237 CA CYS A 31 -12.756 -12.157 1.757 1.00 0.72 C 260 | ATOM 238 C CYS A 31 -12.468 
-13.314 2.706 1.00 0.73 C 261 | ATOM 239 CB CYS A 31 -13.338 -12.691 0.449 1.00 0.69 C 262 | ATOM 240 O CYS A 31 -11.339 -13.805 2.769 1.00 0.70 O 263 | ATOM 241 SG CYS A 31 -13.889 -11.397 -0.684 1.00 0.70 S 264 | ATOM 242 N GLY A 32 -13.698 -13.594 3.630 1.00 0.74 N 265 | ATOM 243 CA GLY A 32 -13.641 -14.740 4.523 1.00 0.74 C 266 | ATOM 244 C GLY A 32 -12.600 -14.593 5.616 1.00 0.75 C 267 | ATOM 245 O GLY A 32 -12.557 -13.573 6.307 1.00 0.72 O 268 | ATOM 246 N SER A 33 -11.803 -15.731 5.786 1.00 0.76 N 269 | ATOM 247 CA SER A 33 -10.773 -15.896 6.807 1.00 0.76 C 270 | ATOM 248 C SER A 33 -9.647 -14.884 6.626 1.00 0.76 C 271 | ATOM 249 CB SER A 33 -10.204 -17.315 6.770 1.00 0.74 C 272 | ATOM 250 O SER A 33 -8.989 -14.500 7.595 1.00 0.74 O 273 | ATOM 251 OG SER A 33 -9.593 -17.580 5.519 1.00 0.67 O 274 | ATOM 252 N HIS A 34 -9.453 -14.504 5.377 1.00 0.76 N 275 | ATOM 253 CA HIS A 34 -8.368 -13.554 5.160 1.00 0.76 C 276 | ATOM 254 C HIS A 34 -8.733 -12.170 5.687 1.00 0.76 C 277 | ATOM 255 CB HIS A 34 -8.016 -13.473 3.673 1.00 0.73 C 278 | ATOM 256 O HIS A 34 -7.874 -11.449 6.198 1.00 0.73 O 279 | ATOM 257 CG HIS A 34 -7.359 -14.708 3.144 1.00 0.69 C 280 | ATOM 258 CD2 HIS A 34 -7.793 -15.637 2.260 1.00 0.66 C 281 | ATOM 259 ND1 HIS A 34 -6.097 -15.104 3.531 1.00 0.64 N 282 | ATOM 260 CE1 HIS A 34 -5.782 -16.225 2.905 1.00 0.63 C 283 | ATOM 261 NE2 HIS A 34 -6.795 -16.570 2.128 1.00 0.61 N 284 | ATOM 262 N LEU A 35 -10.027 -11.761 5.551 1.00 0.77 N 285 | ATOM 263 CA LEU A 35 -10.529 -10.540 6.171 1.00 0.76 C 286 | ATOM 264 C LEU A 35 -10.294 -10.559 7.678 1.00 0.77 C 287 | ATOM 265 CB LEU A 35 -12.022 -10.364 5.878 1.00 0.74 C 288 | ATOM 266 O LEU A 35 -9.861 -9.560 8.256 1.00 0.76 O 289 | ATOM 267 CG LEU A 35 -12.700 -9.153 6.521 1.00 0.70 C 290 | ATOM 268 CD1 LEU A 35 -12.071 -7.860 6.012 1.00 0.65 C 291 | ATOM 269 CD2 LEU A 35 -14.200 -9.170 6.242 1.00 0.65 C 292 | ATOM 270 N VAL A 36 -10.599 -11.795 8.230 1.00 0.78 N 293 | ATOM 271 CA VAL A 36 -10.488 -11.902 
9.681 1.00 0.77 C 294 | ATOM 272 C VAL A 36 -9.032 -11.719 10.103 1.00 0.78 C 295 | ATOM 273 CB VAL A 36 -11.025 -13.257 10.193 1.00 0.76 C 296 | ATOM 274 O VAL A 36 -8.745 -11.025 11.082 1.00 0.77 O 297 | ATOM 275 CG1 VAL A 36 -10.724 -13.429 11.681 1.00 0.70 C 298 | ATOM 276 CG2 VAL A 36 -12.526 -13.369 9.932 1.00 0.71 C 299 | ATOM 277 N GLU A 37 -8.181 -12.295 9.344 1.00 0.79 N 300 | ATOM 278 CA GLU A 37 -6.761 -12.183 9.666 1.00 0.79 C 301 | ATOM 279 C GLU A 37 -6.286 -10.737 9.570 1.00 0.79 C 302 | ATOM 280 CB GLU A 37 -5.928 -13.074 8.741 1.00 0.77 C 303 | ATOM 281 O GLU A 37 -5.507 -10.276 10.407 1.00 0.77 O 304 | ATOM 282 CG GLU A 37 -5.576 -14.427 9.342 1.00 0.71 C 305 | ATOM 283 CD GLU A 37 -4.825 -15.334 8.381 1.00 0.68 C 306 | ATOM 284 OE1 GLU A 37 -4.452 -14.873 7.278 1.00 0.65 O 307 | ATOM 285 OE2 GLU A 37 -4.607 -16.514 8.733 1.00 0.63 O 308 | ATOM 286 N ALA A 38 -6.713 -10.044 8.430 1.00 0.80 N 309 | ATOM 287 CA ALA A 38 -6.340 -8.642 8.261 1.00 0.80 C 310 | ATOM 288 C ALA A 38 -6.851 -7.794 9.422 1.00 0.80 C 311 | ATOM 289 CB ALA A 38 -6.878 -8.104 6.937 1.00 0.78 C 312 | ATOM 290 O ALA A 38 -6.160 -6.884 9.885 1.00 0.79 O 313 | ATOM 291 N LEU A 39 -8.106 -8.108 9.884 1.00 0.79 N 314 | ATOM 292 CA LEU A 39 -8.691 -7.357 10.990 1.00 0.79 C 315 | ATOM 293 C LEU A 39 -7.861 -7.524 12.259 1.00 0.80 C 316 | ATOM 294 CB LEU A 39 -10.130 -7.811 11.245 1.00 0.78 C 317 | ATOM 295 O LEU A 39 -7.699 -6.574 13.029 1.00 0.79 O 318 | ATOM 296 CG LEU A 39 -11.181 -7.328 10.244 1.00 0.75 C 319 | ATOM 297 CD1 LEU A 39 -12.509 -8.036 10.486 1.00 0.70 C 320 | ATOM 298 CD2 LEU A 39 -11.350 -5.815 10.335 1.00 0.71 C 321 | ATOM 299 N TYR A 40 -7.386 -8.776 12.413 1.00 0.79 N 322 | ATOM 300 CA TYR A 40 -6.564 -8.991 13.599 1.00 0.79 C 323 | ATOM 301 C TYR A 40 -5.293 -8.152 13.542 1.00 0.80 C 324 | ATOM 302 CB TYR A 40 -6.205 -10.473 13.742 1.00 0.77 C 325 | ATOM 303 O TYR A 40 -4.825 -7.653 14.567 1.00 0.78 O 326 | ATOM 304 CG TYR A 40 -7.159 -11.247 14.618 1.00 0.72 C 
327 | ATOM 305 CD1 TYR A 40 -7.198 -11.038 15.995 1.00 0.66 C 328 | ATOM 306 CD2 TYR A 40 -8.023 -12.190 14.072 1.00 0.66 C 329 | ATOM 307 CE1 TYR A 40 -8.075 -11.750 16.806 1.00 0.66 C 330 | ATOM 308 CE2 TYR A 40 -8.904 -12.908 14.874 1.00 0.65 C 331 | ATOM 309 OH TYR A 40 -9.793 -13.389 17.036 1.00 0.64 O 332 | ATOM 310 CZ TYR A 40 -8.923 -12.681 16.237 1.00 0.64 C 333 | ATOM 311 N LEU A 41 -4.722 -8.098 12.285 1.00 0.81 N 334 | ATOM 312 CA LEU A 41 -3.487 -7.340 12.116 1.00 0.81 C 335 | ATOM 313 C LEU A 41 -3.717 -5.858 12.388 1.00 0.81 C 336 | ATOM 314 CB LEU A 41 -2.929 -7.532 10.704 1.00 0.79 C 337 | ATOM 315 O LEU A 41 -2.871 -5.194 12.991 1.00 0.79 O 338 | ATOM 316 CG LEU A 41 -1.552 -8.191 10.602 1.00 0.72 C 339 | ATOM 317 CD1 LEU A 41 -1.571 -9.300 9.555 1.00 0.65 C 340 | ATOM 318 CD2 LEU A 41 -0.486 -7.153 10.269 1.00 0.66 C 341 | ATOM 319 N VAL A 42 -4.952 -5.275 11.875 1.00 0.80 N 342 | ATOM 320 CA VAL A 42 -5.263 -3.855 11.996 1.00 0.80 C 343 | ATOM 321 C VAL A 42 -5.748 -3.551 13.412 1.00 0.80 C 344 | ATOM 322 CB VAL A 42 -6.325 -3.416 10.963 1.00 0.78 C 345 | ATOM 323 O VAL A 42 -5.401 -2.516 13.985 1.00 0.79 O 346 | ATOM 324 CG1 VAL A 42 -6.747 -1.968 11.204 1.00 0.69 C 347 | ATOM 325 CG2 VAL A 42 -5.791 -3.591 9.543 1.00 0.68 C 348 | ATOM 326 N CYS A 43 -6.665 -4.513 13.786 1.00 0.79 N 349 | ATOM 327 CA CYS A 43 -7.303 -4.224 15.065 1.00 0.80 C 350 | ATOM 328 C CYS A 43 -6.416 -4.655 16.227 1.00 0.79 C 351 | ATOM 329 CB CYS A 43 -8.658 -4.924 15.158 1.00 0.77 C 352 | ATOM 330 O CYS A 43 -6.525 -4.116 17.329 1.00 0.75 O 353 | ATOM 331 SG CYS A 43 -9.921 -4.213 14.081 1.00 0.73 S 354 | ATOM 332 N GLY A 44 -5.288 -5.501 15.916 1.00 0.74 N 355 | ATOM 333 CA GLY A 44 -4.387 -5.949 16.966 1.00 0.73 C 356 | ATOM 334 C GLY A 44 -5.111 -6.438 18.206 1.00 0.73 C 357 | ATOM 335 O GLY A 44 -5.996 -7.291 18.119 1.00 0.67 O 358 | ATOM 336 N GLU A 45 -4.713 -5.736 19.408 1.00 0.69 N 359 | ATOM 337 CA GLU A 45 -5.200 -5.984 20.762 1.00 0.68 C 360 | ATOM 338 C GLU 
A 45 -6.615 -5.444 20.946 1.00 0.69 C 361 | ATOM 339 CB GLU A 45 -4.260 -5.360 21.795 1.00 0.63 C 362 | ATOM 340 O GLU A 45 -7.292 -5.781 21.920 1.00 0.66 O 363 | ATOM 341 CG GLU A 45 -2.895 -6.030 21.868 1.00 0.58 C 364 | ATOM 342 CD GLU A 45 -2.127 -5.690 23.136 1.00 0.57 C 365 | ATOM 343 OE1 GLU A 45 -2.600 -4.837 23.922 1.00 0.58 O 366 | ATOM 344 OE2 GLU A 45 -1.044 -6.280 23.345 1.00 0.52 O 367 | ATOM 345 N ARG A 46 -7.092 -4.661 20.125 1.00 0.76 N 368 | ATOM 346 CA ARG A 46 -8.378 -4.018 20.377 1.00 0.75 C 369 | ATOM 347 C ARG A 46 -9.533 -4.972 20.092 1.00 0.73 C 370 | ATOM 348 CB ARG A 46 -8.522 -2.753 19.528 1.00 0.70 C 371 | ATOM 349 O ARG A 46 -10.645 -4.775 20.587 1.00 0.68 O 372 | ATOM 350 CG ARG A 46 -7.580 -1.630 19.932 1.00 0.65 C 373 | ATOM 351 CD ARG A 46 -7.850 -0.356 19.142 1.00 0.64 C 374 | ATOM 352 NE ARG A 46 -6.809 0.644 19.361 1.00 0.51 N 375 | ATOM 353 NH1 ARG A 46 -7.757 2.233 17.976 1.00 0.39 N 376 | ATOM 354 NH2 ARG A 46 -5.793 2.683 19.071 1.00 0.34 N 377 | ATOM 355 CZ ARG A 46 -6.789 1.851 18.802 1.00 0.59 C 378 | ATOM 356 N GLY A 47 -9.129 -6.353 19.616 1.00 0.74 N 379 | ATOM 357 CA GLY A 47 -10.195 -7.295 19.313 1.00 0.73 C 380 | ATOM 358 C GLY A 47 -11.246 -6.726 18.379 1.00 0.73 C 381 | ATOM 359 O GLY A 47 -11.345 -5.508 18.215 1.00 0.69 O 382 | ATOM 360 N PHE A 48 -11.830 -7.174 17.396 1.00 0.74 N 383 | ATOM 361 CA PHE A 48 -12.941 -6.773 16.541 1.00 0.75 C 384 | ATOM 362 C PHE A 48 -14.019 -7.849 16.513 1.00 0.74 C 385 | ATOM 363 CB PHE A 48 -12.450 -6.485 15.119 1.00 0.71 C 386 | ATOM 364 O PHE A 48 -13.757 -9.007 16.846 1.00 0.69 O 387 | ATOM 365 CG PHE A 48 -11.813 -7.670 14.445 1.00 0.67 C 388 | ATOM 366 CD1 PHE A 48 -10.431 -7.814 14.421 1.00 0.63 C 389 | ATOM 367 CD2 PHE A 48 -12.597 -8.641 13.834 1.00 0.64 C 390 | ATOM 368 CE1 PHE A 48 -9.839 -8.910 13.798 1.00 0.60 C 391 | ATOM 369 CE2 PHE A 48 -12.012 -9.739 13.210 1.00 0.61 C 392 | ATOM 370 CZ PHE A 48 -10.633 -9.871 13.192 1.00 0.60 C 393 | ATOM 371 N PHE A 49 -15.284 
-7.565 16.604 1.00 0.72 N 394 | ATOM 372 CA PHE A 49 -16.459 -8.424 16.517 1.00 0.72 C 395 | ATOM 373 C PHE A 49 -16.824 -8.697 15.063 1.00 0.72 C 396 | ATOM 374 CB PHE A 49 -17.647 -7.788 17.246 1.00 0.68 C 397 | ATOM 375 O PHE A 49 -16.892 -7.773 14.250 1.00 0.68 O 398 | ATOM 376 CG PHE A 49 -18.878 -8.653 17.270 1.00 0.64 C 399 | ATOM 377 CD1 PHE A 49 -19.900 -8.458 16.348 1.00 0.60 C 400 | ATOM 378 CD2 PHE A 49 -19.014 -9.662 18.214 1.00 0.61 C 401 | ATOM 379 CE1 PHE A 49 -21.041 -9.258 16.368 1.00 0.57 C 402 | ATOM 380 CE2 PHE A 49 -20.151 -10.465 18.241 1.00 0.59 C 403 | ATOM 381 CZ PHE A 49 -21.163 -10.260 17.317 1.00 0.54 C 404 | ATOM 382 N TYR A 50 -16.792 -10.010 14.666 1.00 0.70 N 405 | ATOM 383 CA TYR A 50 -17.259 -10.411 13.344 1.00 0.70 C 406 | ATOM 384 C TYR A 50 -18.531 -11.246 13.445 1.00 0.71 C 407 | ATOM 385 CB TYR A 50 -16.174 -11.201 12.606 1.00 0.67 C 408 | ATOM 386 O TYR A 50 -18.576 -12.233 14.182 1.00 0.68 O 409 | ATOM 387 CG TYR A 50 -16.607 -11.711 11.254 1.00 0.63 C 410 | ATOM 388 CD1 TYR A 50 -16.923 -13.054 11.062 1.00 0.59 C 411 | ATOM 389 CD2 TYR A 50 -16.701 -10.851 10.164 1.00 0.60 C 412 | ATOM 390 CE1 TYR A 50 -17.322 -13.529 9.817 1.00 0.56 C 413 | ATOM 391 CE2 TYR A 50 -17.099 -11.315 8.914 1.00 0.58 C 414 | ATOM 392 OH TYR A 50 -17.800 -13.118 7.516 1.00 0.51 O 415 | ATOM 393 CZ TYR A 50 -17.407 -12.653 8.751 1.00 0.57 C 416 | ATOM 394 N THR A 51 -19.828 -10.760 12.997 1.00 0.71 N 417 | ATOM 395 CA THR A 51 -21.072 -11.507 12.841 1.00 0.71 C 418 | ATOM 396 C THR A 51 -21.209 -12.038 11.417 1.00 0.71 C 419 | ATOM 397 CB THR A 51 -22.294 -10.637 13.187 1.00 0.68 C 420 | ATOM 398 O THR A 51 -21.321 -11.261 10.467 1.00 0.68 O 421 | ATOM 399 CG2 THR A 51 -22.894 -11.040 14.530 1.00 0.62 C 422 | ATOM 400 OG1 THR A 51 -21.892 -9.263 13.249 1.00 0.65 O 423 | ATOM 401 N PRO A 52 -20.836 -13.373 11.219 1.00 0.69 N 424 | ATOM 402 CA PRO A 52 -21.056 -13.925 9.880 1.00 0.68 C 425 | ATOM 403 C PRO A 52 -22.496 -13.757 9.402 1.00 0.70 C 426 
| ATOM 404 CB PRO A 52 -20.705 -15.405 10.046 1.00 0.67 C 427 | ATOM 405 O PRO A 52 -23.428 -13.807 10.209 1.00 0.70 O 428 | ATOM 406 CG PRO A 52 -20.298 -15.543 11.478 1.00 0.65 C 429 | ATOM 407 CD PRO A 52 -20.517 -14.226 12.165 1.00 0.64 C 430 | ATOM 408 N LYS A 53 -22.707 -13.101 8.244 1.00 0.65 N 431 | ATOM 409 CA LYS A 53 -24.025 -13.035 7.619 1.00 0.65 C 432 | ATOM 410 C LYS A 53 -24.600 -14.432 7.403 1.00 0.66 C 433 | ATOM 411 CB LYS A 53 -23.950 -12.288 6.287 1.00 0.62 C 434 | ATOM 412 O LYS A 53 -23.877 -15.355 7.023 1.00 0.65 O 435 | ATOM 413 CG LYS A 53 -23.951 -10.772 6.427 1.00 0.60 C 436 | ATOM 414 CD LYS A 53 -24.045 -10.086 5.070 1.00 0.60 C 437 | ATOM 415 CE LYS A 53 -23.998 -8.570 5.205 1.00 0.52 C 438 | ATOM 416 NZ LYS A 53 -24.054 -7.893 3.875 1.00 0.45 N 439 | ATOM 417 N THR A 54 -25.606 -14.981 8.222 1.00 0.63 N 440 | ATOM 418 CA THR A 54 -26.393 -16.152 7.853 1.00 0.65 C 441 | ATOM 419 C THR A 54 -26.967 -15.997 6.447 1.00 0.62 C 442 | ATOM 420 CB THR A 54 -27.538 -16.394 8.854 1.00 0.59 C 443 | ATOM 421 O THR A 54 -27.291 -14.886 6.022 1.00 0.61 O 444 | ATOM 422 CG2 THR A 54 -27.025 -17.060 10.126 1.00 0.49 C 445 | ATOM 423 OG1 THR A 54 -28.140 -15.139 9.194 1.00 0.52 O --------------------------------------------------------------------------------