├── boileroom ├── images │ ├── __init__.py │ ├── volumes.py │ └── esm.py ├── models │ ├── esm │ │ ├── __init__.py │ │ ├── linker.py │ │ ├── esm2.py │ │ └── esmfold.py │ └── __init__.py ├── convert.py ├── constants.py ├── __init__.py ├── utils.py └── base.py ├── .gitignore ├── tests ├── data │ ├── multimer-check.txt │ ├── esmfold_server_short.pdb │ └── esmfold_server_medium.pdb ├── test_utils.py ├── conftest.py └── esm │ ├── test_esm2.py │ └── test_esmfold.py ├── .github └── workflows │ ├── pypi-publish.yaml │ ├── version-bump.yaml │ └── python-checks.yaml ├── pyproject.toml ├── LICENSE ├── .pre-commit-config.yaml └── README.md /boileroom/images/__init__.py: -------------------------------------------------------------------------------- 1 | from .esm import esm_image 2 | 3 | __all__ = ["esm_image"] 4 | -------------------------------------------------------------------------------- /boileroom/images/volumes.py: -------------------------------------------------------------------------------- 1 | from modal import Volume 2 | 3 | model_weights = Volume.from_name("model-weights", create_if_missing=True) 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | boileroom.egg-info/ 3 | __pycache__/ 4 | *.pyc 5 | .pytest_cache/ 6 | .ruff_cache/ 7 | .mypy_cache/ 8 | .model_cache/ 9 | .venv/ 10 | dist/ 11 | -------------------------------------------------------------------------------- /boileroom/models/esm/__init__.py: -------------------------------------------------------------------------------- 1 | from .esmfold import ESMFold, get_esmfold 2 | from .esm2 import ESM2, get_esm2 3 | 4 | __all__ = ["ESMFold", "get_esmfold", "ESM2", "get_esm2"] 5 | -------------------------------------------------------------------------------- /boileroom/models/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .esm.esmfold import ESMFold, get_esmfold 2 | from .esm.esm2 import ESM2, get_esm2 3 | 4 | __all__ = [ 5 | "ESMFold", 6 | "ESM2", 7 | "get_esmfold", 8 | "get_esm2", 9 | ] 10 | -------------------------------------------------------------------------------- /tests/data/multimer-check.txt: -------------------------------------------------------------------------------- 1 | DRKIAGMDKGNGGTGAGMGI:AESHWCYEVQAESSNYPCLVPVKWGGNCQKDRQSPINIVTTKAKVDKKLGRFFFSGYDKKQTWTVQNNGHSVMMLLENKASISGGGLPAPYQAKQLHLHWSDLPYKGSEHSLDGEHFAMEMHIVHEKEKGTSRNVKEAQDPEDEIAVLAFLVEAGTQVNEGFQPLVEALSNIPKPEMSTTMAESSLLDLLPKEEKLRHYFRYLGSLTTPTCDEKVVWTVFREPIQLHREQILAFSQKLYYDKEQTVSMKDNVRPLQQLGQRTVIKS 2 | -------------------------------------------------------------------------------- /boileroom/images/esm.py: -------------------------------------------------------------------------------- 1 | """Modal image definition for ESM family of models.""" 2 | 3 | from modal import Image 4 | 5 | # Define the base image with all dependencies 6 | esm_image = ( 7 | Image.debian_slim(python_version="3.12") 8 | .apt_install("wget", "git") 9 | .pip_install("torch>=2.5.1,<2.7.0", "torch-tensorrt", "biotite>=1.0.1") 10 | .run_commands( 11 | "git clone https://github.com/jakublala/my_transformers.git", 12 | "cd my_transformers && pip install .", 13 | ) 14 | .env({"PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"}) 15 | ) 16 | -------------------------------------------------------------------------------- /.github/workflows/pypi-publish.yaml: -------------------------------------------------------------------------------- 1 | name: Publish Python package 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | jobs: 8 | build-and-publish: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v4 12 | 13 | - name: Set up Python 14 | uses: actions/setup-python@v5 15 | with: 16 | python-version: '3.12' 17 | 18 | - name: Install 
uv 19 | run: | 20 | curl -LsSf https://astral.sh/uv/install.sh | sh 21 | echo "$HOME/.cargo/bin" >> $GITHUB_PATH 22 | 23 | - name: Build and publish to PyPI 24 | run: | 25 | uv build --no-sources 26 | uv publish --token ${{ secrets.PYPI_API_TOKEN }} 27 | -------------------------------------------------------------------------------- /boileroom/convert.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Union 3 | from io import StringIO 4 | from biotite.structure import AtomArray 5 | from biotite.structure.io.pdb import PDBFile 6 | 7 | 8 | # TODO: for now, we can keep it in desprot, but long-term it makes sense to 9 | # have it as a default part of boileroom (as a function that can be run locally) 10 | def pdb_file_to_atomarray(pdb_path: Union[str, StringIO]) -> AtomArray: 11 | assert isinstance(pdb_path, (str, StringIO)), "pdb_path must be a string or StringIO" 12 | if isinstance(pdb_path, str): 13 | assert os.path.exists(pdb_path), "pdb_path must be a valid path" 14 | return PDBFile.read(pdb_path).get_structure(model=1) 15 | 16 | 17 | def pdb_string_to_atomarray(pdb_string: str) -> AtomArray: 18 | assert isinstance(pdb_string, str), "pdb_string must be a string" 19 | return pdb_file_to_atomarray(StringIO(pdb_string)) 20 | -------------------------------------------------------------------------------- /boileroom/constants.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | restype_1to3: Dict[str, str] = { 4 | "A": "ALA", 5 | "R": "ARG", 6 | "N": "ASN", 7 | "D": "ASP", 8 | "C": "CYS", 9 | "Q": "GLN", 10 | "E": "GLU", 11 | "G": "GLY", 12 | "H": "HIS", 13 | "I": "ILE", 14 | "L": "LEU", 15 | "K": "LYS", 16 | "M": "MET", 17 | "F": "PHE", 18 | "P": "PRO", 19 | "S": "SER", 20 | "T": "THR", 21 | "W": "TRP", 22 | "Y": "TYR", 23 | "V": "VAL", 24 | } 25 | 26 | 27 | # NB: restype_3to1 differs from Bio.PDB.protein_letters_3to1 by being a 
simple 28 | # 1-to-1 mapping of 3 letter names to one letter names. The latter contains 29 | # many more, and less common, three letter names as keys and maps many of these 30 | # to the same one letter name (including 'X' and 'U' which we don't use here). 31 | restype_3to1: Dict[str, str] = {v: k for k, v in restype_1to3.items()} 32 | -------------------------------------------------------------------------------- /boileroom/__init__.py: -------------------------------------------------------------------------------- 1 | import modal 2 | 3 | app = modal.App("boileroom") 4 | 5 | 6 | # Lazy import to avoid circular import 7 | def _import_models(): 8 | from .models import ESMFold, ESM2, get_esmfold, get_esm2 9 | 10 | return ESMFold, ESM2, get_esmfold, get_esm2 11 | 12 | 13 | # Make these available at module level 14 | def __getattr__(name): 15 | if name in ["ESMFold", "ESM2", "get_esmfold", "get_esm2"]: 16 | ESMFold, ESM2, get_esmfold, get_esm2 = _import_models() 17 | globals().update({"ESMFold": ESMFold, "ESM2": ESM2, "get_esmfold": get_esmfold, "get_esm2": get_esm2}) 18 | return globals()[name] 19 | raise AttributeError(f"module {__name__!r} has no attribute {name!r}") 20 | 21 | 22 | def __dir__(): 23 | return sorted(list(globals().keys()) + ["ESMFold", "ESM2", "get_esmfold", "get_esm2"]) 24 | 25 | 26 | __all__ = [ 27 | "ESMFold", 28 | "ESM2", 29 | "get_esmfold", 30 | "get_esm2", 31 | ] 32 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from boileroom.utils import validate_sequence, format_time 4 | 5 | 6 | def test_validate_sequence(test_sequences: dict[str, str]): 7 | """Test sequence validation.""" 8 | # Valid sequences 9 | assert validate_sequence(test_sequences["short"]) is True 10 | assert validate_sequence(test_sequences["medium"]) is True 11 | 12 | # Invalid sequences 13 | with 
pytest.raises(ValueError): 14 | validate_sequence(test_sequences["invalid"]) 15 | with pytest.raises(ValueError): 16 | validate_sequence("NOT A SEQUENCE") 17 | 18 | 19 | def test_format_time(): 20 | """Test time formatting.""" 21 | assert format_time(30) == "30s", f"Expected '30s', got {format_time(30)}" 22 | assert format_time(90) == "1m 30s", f"Expected '1m 30s', got {format_time(90)}" 23 | assert format_time(3600) == "1h", f"Expected '1h', got {format_time(3600)}" 24 | assert format_time(3661) == "1h 1m 1s", f"Expected '1h 1m 1s', got {format_time(3661)}" 25 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "boileroom" 7 | version = "0.2.1" 8 | authors = [ 9 | { name="Jakub Lála", email="jakublala@gmail.com" }, 10 | ] 11 | description = "Protein prediction models with Modal" 12 | readme = "README.md" 13 | requires-python = ">=3.11" 14 | classifiers = [ 15 | "Programming Language :: Python :: 3", 16 | "License :: OSI Approved :: MIT License", 17 | "Operating System :: OS Independent", 18 | ] 19 | dependencies = [ 20 | "modal>=0.73.12", 21 | "numpy>=2.2.2", 22 | "biotite>=1.0.1", 23 | "torch>=2.5.1,<2.7.0", 24 | "deprecated>=1.2.14", 25 | ] 26 | [project.optional-dependencies] 27 | dev = [ 28 | "pre-commit>=4.1.0", 29 | "pytest>=8.3.4", 30 | "pytest-xdist>=3.6.1", 31 | "pytest-mock>=3.14.0", 32 | ] 33 | local = [ 34 | "transformers>=4.49.0", 35 | ] 36 | 37 | [project.urls] 38 | Homepage = "https://github.com/jakublala/boileroom" 39 | 40 | [tool.hatch.metadata] 41 | allow-direct-references = true 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Jakub Lála 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/mirrors-mypy 3 | rev: v1.5.1 4 | hooks: 5 | - id: mypy 6 | exclude: &exclude_patterns | 7 | (?x)^.*\.pdb$ 8 | |^scripts/.*$ 9 | 10 | - repo: https://github.com/pre-commit/pre-commit-hooks 11 | rev: v4.4.0 12 | hooks: 13 | - id: trailing-whitespace 14 | exclude: *exclude_patterns 15 | - id: end-of-file-fixer 16 | exclude: *exclude_patterns 17 | - id: check-yaml 18 | exclude: *exclude_patterns 19 | - id: check-json 20 | exclude: *exclude_patterns 21 | - id: check-added-large-files 22 | exclude: *exclude_patterns 23 | - id: detect-aws-credentials 24 | args: ["--allow-missing-credentials"] 25 | exclude: *exclude_patterns 26 | 27 | - repo: https://github.com/charliermarsh/ruff-pre-commit 28 | rev: v0.8.6 29 | hooks: 30 | - id: ruff 31 | args: ["--fix", "--line-length", "120"] 32 | exclude: *exclude_patterns 33 | - id: ruff-format 34 | args: ["--line-length", "120"] 35 | exclude: *exclude_patterns 36 | -------------------------------------------------------------------------------- /.github/workflows/version-bump.yaml: -------------------------------------------------------------------------------- 1 | name: Require Version Bump 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | check-version: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout PR 13 | uses: actions/checkout@v4 14 | 15 | - name: Get version from PR branch 16 | id: pr_version 17 | run: | 18 | VERSION=$(grep -Po '(?<=version = ")[^"]+' pyproject.toml) 19 | echo "version=$VERSION" >> $GITHUB_OUTPUT 20 | 21 | - name: Checkout main 22 | uses: actions/checkout@v4 23 | with: 24 | ref: main 25 | 26 | - name: Get version from main 27 | id: main_version 28 | run: | 29 | VERSION=$(grep -Po '(?<=version = ")[^"]+' pyproject.toml) 30 | echo 
"version=$VERSION" >> $GITHUB_OUTPUT 31 | 32 | - name: Compare versions 33 | run: | 34 | if [ "${{ steps.pr_version.outputs.version }}" == "${{ steps.main_version.outputs.version }}" ]; then 35 | echo "❌ Version has not been bumped. Please update the version in pyproject.toml." 36 | exit 1 37 | else 38 | echo "✅ Version bump detected." 39 | fi 40 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """Pytest configuration for the boileroom package.""" 2 | 3 | import os 4 | import pathlib 5 | import pytest 6 | 7 | 8 | def pytest_addoption(parser): 9 | parser.addoption( 10 | "--backend", 11 | action="store", 12 | default="modal", 13 | choices=("modal", "local"), 14 | help="Execution backend for models in tests: modal (default) or local", 15 | ) 16 | 17 | 18 | @pytest.fixture(autouse=True, scope="session") 19 | def model_dir(): 20 | os.environ["MODEL_DIR"] = str(pathlib.Path(__file__).parent.parent / ".model_cache") 21 | 22 | 23 | @pytest.fixture 24 | def run_backend(request): 25 | mode = request.config.getoption("--backend") 26 | 27 | def select(method): 28 | # method is e.g. 
model.fold or model.embed 29 | return getattr(method, "local" if mode == "local" else "remote") 30 | 31 | return select 32 | 33 | 34 | @pytest.fixture 35 | def test_sequences() -> dict[str, str]: 36 | return { 37 | "short": "MLKNVHVLVLGAGDVGSVVVRLLEK", 38 | "medium": "MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKT", 39 | "invalid": "MALWMRLLPX123LLALWGPD", 40 | "multimer": ( 41 | "MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKT:" 42 | "MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKT" 43 | ), 44 | } 45 | 46 | 47 | @pytest.fixture 48 | def data_dir() -> pathlib.Path: 49 | return pathlib.Path(__file__).parent / "data" 50 | 51 | 52 | @pytest.fixture(params=[10, 25, 50]) 53 | def glycine_linker(request) -> str: 54 | return "G" * request.param 55 | -------------------------------------------------------------------------------- /.github/workflows/python-checks.yaml: -------------------------------------------------------------------------------- 1 | name: Python Checks 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | lint-checks: 11 | runs-on: "ubuntu-latest" 12 | steps: 13 | - uses: actions/checkout@v4 14 | 15 | - name: Set up Python 16 | uses: actions/setup-python@v5 17 | with: 18 | python-version: '3.12' 19 | 20 | - name: Install uv 21 | run: | 22 | curl -LsSf https://astral.sh/uv/install.sh | sh 23 | echo "$HOME/.cargo/bin" >> $GITHUB_PATH 24 | 25 | - name: Install dependencies 26 | run: uv sync --extra dev 27 | 28 | - name: Run pre-commit 29 | run: uv run pre-commit run --all-files 30 | 31 | unit-tests: 32 | runs-on: "ubuntu-latest" 33 | steps: 34 | - uses: actions/checkout@v4 35 | 36 | - name: Set up Python 37 | uses: actions/setup-python@v5 38 | with: 39 | python-version: '3.12' 40 | 41 | - name: Install uv 42 | run: | 43 | curl -LsSf https://astral.sh/uv/install.sh | sh 44 | echo "$HOME/.cargo/bin" >> $GITHUB_PATH 45 | 46 | - name: Install dependencies 47 | run: uv sync --extra dev 48 | 
49 | - name: Install the package 50 | run: uv pip install -e . 51 | 52 | - name: Authenticate Modal 53 | shell: bash -l {0} 54 | run: uv run modal token set --token-id ${{ secrets.MODAL_API_TOKEN_ID }} --token-secret ${{ secrets.MODAL_API_TOKEN_SECRET }} 55 | 56 | - name: Run tests 57 | shell: bash -l {0} 58 | run: uv run pytest -n auto 59 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # boileroom: serverless protein prediction models 2 | 3 | [![Python 3.12](https://img.shields.io/badge/python-3.12-blue.svg)](https://www.python.org/downloads/) 4 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 5 | [![PyPI version](https://img.shields.io/pypi/v/boileroom.svg)](https://pypi.org/project/boileroom/) 6 | [![GitHub last commit](https://img.shields.io/github/last-commit/jakublala/boileroom.svg)](https://github.com/jakublala/boileroom/commits/main) 7 | [![GitHub issues](https://img.shields.io/github/issues/jakublala/boileroom.svg)](https://github.com/jakublala/boileroom/issues) 8 | 9 | `boileroom` is a Python package that provides a unified interface to various protein prediction models, running them efficiently on Modal's serverless infrastructure. 10 | 11 | ## Features 12 | 13 | - 🚀 Serverless execution of protein models 14 | - 🔄 Unified API across different models 15 | - 🎯 Production-ready with GPU acceleration 16 | - 📦 Easy installation and deployment 17 | 18 | ## Installation 19 | 20 | 1. Install the package using pip: 21 | 22 | ```bash 23 | pip install boileroom 24 | ``` 25 | 26 | 2. 
Set up Modal credentials (if you haven't already): 27 | 28 | ```bash 29 | modal token new 30 | ``` 31 | 32 | ## Quick Start 33 | 34 | ```python 35 | from boileroom import app, ESMFold 36 | 37 | # Initialize the model 38 | model = ESMFold() 39 | 40 | # Predict structure for a protein sequence 41 | sequence = "MLKNVHVLVLGAGDVGSVVVRLLEK" 42 | with app.run(): 43 | result = model.fold.remote([sequence]) 44 | 45 | # Access prediction results 46 | coordinates = result.positions 47 | confidence = result.plddt 48 | ``` 49 | 50 | ## Available Models 51 | 52 | | Model | Status | Description | Reference | 53 | |------------|--------|------------------------------------------------|--------------------------------------------------------| 54 | | ESMFold | ✅ | Fast protein structure prediction | [Facebook (now Meta)](https://github.com/facebookresearch/esm) | 55 | | ESM-2 | ✅ | MSA-free embedding model | [Facebook (now Meta)](https://github.com/facebookresearch/esm) | 56 | 57 | ## Development 58 | 59 | 1. Clone the repository: 60 | 61 | ```bash 62 | git clone https://github.com/jakublala/boileroom 63 | cd boileroom 64 | ``` 65 | 66 | 2. Install development dependencies using `uv`: 67 | 68 | ```bash 69 | curl -LsSf https://astral.sh/uv/install.sh | sh 70 | uv python install 3.12 71 | uv sync 72 | ``` 73 | 74 | 3. Run tests: 75 | 76 | ```bash 77 | uv run pytest 78 | ``` 79 | 80 | or only one test that's more verbose and shows print statements: 81 | 82 | ```bash 83 | uv run python -m pytest tests/test_basic.py::test_esmfold_batch -v -s 84 | ``` 85 | 86 | ## License 87 | 88 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 
89 | 90 | ## Citation 91 | 92 | If you use `boileroom` in your research, please cite: 93 | 94 | ```bibtex 95 | @software{boileroom2025, 96 | author = {Lála, Jakub}, 97 | title = {boileroom: serverless protein prediction models}, 98 | year = {2025}, 99 | publisher = {GitHub}, 100 | url = {https://github.com/jakublala/boileroom} 101 | } 102 | ``` 103 | 104 | ## Acknowledgments 105 | 106 | - [Modal Labs](https://modal.com/) for the serverless infrastructure 107 | - The teams behind ESMFold, AlphaFold, and other protein prediction models 108 | -------------------------------------------------------------------------------- /boileroom/utils.py: -------------------------------------------------------------------------------- 1 | """Utility functions and constants for the BoilerRoom package.""" 2 | 3 | import os 4 | import time 5 | import logging 6 | from pathlib import Path 7 | from typing import Dict, Optional 8 | 9 | 10 | # Time constants 11 | SECONDS = 1 12 | MINUTES = 60 13 | HOURS = 60 * MINUTES 14 | 15 | # Directory constants 16 | MODEL_DIR = "/mnt/models" 17 | CACHE_DIR = os.path.expanduser("~/.cache/boileroom") 18 | 19 | # Amino acid constants 20 | AMINO_ACIDS = "ACDEFGHIKLMNPQRSTVWY" 21 | AMINO_ACID_DICT = {aa: i for i, aa in enumerate(AMINO_ACIDS)} 22 | VALID_AMINO_ACIDS = set(AMINO_ACIDS) # For faster lookups 23 | 24 | GPUS_AVAIL_ON_MODAL = ["T4", "L4", "A10G", "A100-40GB", "A100-80GB", "L40S", "H100"] 25 | 26 | 27 | def validate_sequence(sequence: str) -> bool: 28 | """Validate that a sequence contains only valid amino acids. 
29 | 30 | Args: 31 | sequence: A string of amino acids in single-letter code 32 | 33 | Returns: 34 | bool: True if sequence is valid 35 | 36 | Raises: 37 | ValueError: If sequence contains invalid characters 38 | """ 39 | sequence = sequence.replace(":", "") # remove any linkers first ":" 40 | invalid_chars = set(sequence) - VALID_AMINO_ACIDS 41 | # TODO: we should think whether there's not a cleaner way to throw an error on Modal 42 | # the traceback is otherwise quite messy and hard to debug 43 | if invalid_chars: 44 | raise ValueError(f"Invalid amino acid(s) in sequence: {sorted(invalid_chars)}") 45 | return True 46 | 47 | 48 | def ensure_cache_dir() -> Path: 49 | """Ensure the cache directory exists. 50 | 51 | Returns: 52 | Path: Path to cache directory 53 | """ 54 | cache_path = Path(CACHE_DIR) 55 | cache_path.mkdir(parents=True, exist_ok=True) 56 | return cache_path 57 | 58 | 59 | def format_time(seconds: float) -> str: 60 | """Format time in seconds to human readable string. 61 | 62 | Args: 63 | seconds: Time in seconds 64 | 65 | Returns: 66 | str: Formatted time string (e.g. "2h 30m 15s") 67 | """ 68 | hours = int(seconds // HOURS) 69 | minutes = int((seconds % HOURS) // MINUTES) 70 | secs = int(seconds % MINUTES) 71 | 72 | parts = [] 73 | if hours > 0: 74 | parts.append(f"{hours}h") 75 | if minutes > 0: 76 | parts.append(f"{minutes}m") 77 | if secs > 0 or not parts: 78 | parts.append(f"{secs}s") 79 | 80 | return " ".join(parts) 81 | 82 | 83 | def get_gpu_memory_info() -> Optional[Dict[str, int]]: 84 | """Get GPU memory information if available. 
85 | 86 | Returns: 87 | Optional[Dict[str, int]]: Dictionary with 'total' and 'free' memory in MB, 88 | or None if no GPU is available 89 | """ 90 | try: 91 | import torch 92 | 93 | if not torch.cuda.is_available(): 94 | return None 95 | 96 | device = torch.cuda.current_device() 97 | total = torch.cuda.get_device_properties(device).total_memory // (1024 * 1024) 98 | free = torch.cuda.memory_reserved(device) // (1024 * 1024) 99 | 100 | return {"total": total, "free": free, "used": total - free} 101 | except Exception as e: 102 | print(f"Error getting GPU memory info: {e}") 103 | return None 104 | 105 | 106 | class Timer: 107 | """Context manager for timing operations.""" 108 | 109 | def __init__(self, description: str): 110 | self.description = description 111 | self.duration = None 112 | 113 | def __enter__(self): 114 | self.start_time = time.perf_counter() 115 | return self 116 | 117 | def __exit__(self, *args): 118 | self.duration = time.perf_counter() - self.start_time 119 | logging.info(f"{self.description} completed in {self.duration:.2f} seconds") 120 | -------------------------------------------------------------------------------- /boileroom/models/esm/linker.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from typing import List 3 | 4 | from ...images import esm_image 5 | 6 | with esm_image.imports(): 7 | import torch 8 | import torch.nn.functional as F 9 | 10 | 11 | # --- Glycine linker and positional skip utilities --- 12 | def compute_position_ids(sequences: List[str], glycine_linker: str, position_ids_skip: int) -> torch.Tensor: 13 | """ 14 | Compute the position ids for the sequences. 15 | Parameters 16 | ---------- 17 | sequences: List of sequences, each containing chains separated by ":". 18 | glycine_linker: The glycine linker string used between chains represented as a string (e.g. "GGGG"). 19 | position_ids_skip: The number of positions to skip between chains. 
20 | Returns 21 | ------- 22 | torch.Tensor: The position ids for the sequences 23 | """ 24 | position_ids = [] 25 | for multimer_seq in sequences: 26 | multimer_position_ids = [] 27 | previous_chain_end = 0 28 | for chain_id, chain_seq in enumerate(multimer_seq.split(":")): 29 | intrachain_position_ids = np.arange(len(chain_seq)) 30 | if chain_id != 0: 31 | intrachain_position_ids = (intrachain_position_ids + (previous_chain_end + 1)) + position_ids_skip 32 | # add linker if not last chain 33 | if chain_id != len(multimer_seq.split(":")) - 1: 34 | linker_position_ids = np.arange(len(glycine_linker)) + intrachain_position_ids[-1] + 1 35 | intrachain_position_ids = np.concatenate([intrachain_position_ids, linker_position_ids]) 36 | previous_chain_end = intrachain_position_ids[-1] 37 | multimer_position_ids += intrachain_position_ids.tolist() 38 | position_ids.append(torch.tensor(multimer_position_ids)) 39 | # add padding to the position ids 40 | max_length = max(len(ids) for ids in position_ids) 41 | for i, pos_ids in enumerate(position_ids): 42 | position_ids[i] = torch.cat([pos_ids, torch.zeros(max_length - len(pos_ids), dtype=torch.long)]) 43 | return torch.stack(position_ids) 44 | 45 | 46 | def store_multimer_properties(_sequences: List[str], glycine_linker: str): 47 | """Store properties needed for multimer processing. 48 | Args: 49 | _sequences: List of sequences, each containing chains separated by ":" 50 | glycine_linker: The glycine linker string used between chains 51 | Returns: 52 | tuple[torch.Tensor, torch.Tensor, torch.Tensor]: 53 | - linker_map: tensor of shape (batch_size, sequence_length) where 0 indicates 54 | linker positions and 1 indicates chain positions 55 | - residue_index: tensor of shape (batch_size, sequence_length) containing 56 | residue indices that restart at 1 for each chain 57 | - chain_index: tensor of shape (batch_size, sequence_length) containing 58 | chain indices (0, 1, 2, etc.) 
59 | """ 60 | linker_map = [] 61 | residue_index = [] 62 | chain_index = [] 63 | assert len(_sequences) > 0, "Sequences must not be empty" 64 | for seq in _sequences: 65 | full_seq_len = len(seq.replace(":", glycine_linker)) 66 | seq_mask = torch.ones(full_seq_len, dtype=torch.long) 67 | res_index = torch.zeros(full_seq_len, dtype=torch.long) 68 | ch_index = torch.zeros(full_seq_len, dtype=torch.long) 69 | current_pos = 0 70 | chains = seq.split(":") 71 | for i, chain in enumerate(chains): 72 | ch_index[current_pos : current_pos + len(chain)] = i 73 | res_index[current_pos : current_pos + len(chain)] = torch.arange(0, len(chain)) 74 | current_pos += len(chain) 75 | if i < len(chains) - 1: 76 | seq_mask[current_pos : current_pos + len(glycine_linker)] = 0 77 | ch_index[current_pos : current_pos + len(glycine_linker)] = i 78 | res_index[current_pos : current_pos + len(glycine_linker)] = torch.arange( 79 | len(chain) + 1, len(chain) + len(glycine_linker) + 1 80 | ) 81 | current_pos += len(glycine_linker) 82 | linker_map.append(seq_mask) 83 | residue_index.append(res_index) 84 | chain_index.append(ch_index) 85 | linker_max_size = max(tensor.size(0) for tensor in linker_map) 86 | residue_index_max_size = max(tensor.size(0) for tensor in residue_index) 87 | chain_index_max_size = max(tensor.size(0) for tensor in chain_index) 88 | max_size = max(linker_max_size, residue_index_max_size, chain_index_max_size) 89 | padded_linker_map = [F.pad(tensor, (0, max_size - tensor.size(0)), value=-1) for tensor in linker_map] 90 | padded_residue_index = [F.pad(tensor, (0, max_size - tensor.size(0)), value=-1) for tensor in residue_index] 91 | padded_chain_index = [F.pad(tensor, (0, max_size - tensor.size(0)), value=-1) for tensor in chain_index] 92 | return ( 93 | torch.stack(padded_linker_map), 94 | torch.stack(padded_residue_index), 95 | torch.stack(padded_chain_index), 96 | ) 97 | 98 | 99 | def replace_glycine_linkers(sequences: List[str], glycine_linker: str) -> List[str]: 100 | 
return [multimer_seq.replace(":", glycine_linker) for multimer_seq in sequences] 101 | -------------------------------------------------------------------------------- /boileroom/base.py: -------------------------------------------------------------------------------- 1 | """Base classes and interfaces for BoilerRoom protein structure prediction models.""" 2 | 3 | import logging 4 | 5 | from abc import ABC, abstractmethod 6 | from dataclasses import dataclass 7 | from typing import Union, Sequence, Optional, Protocol, List 8 | 9 | import numpy as np 10 | 11 | from .utils import validate_sequence 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | @dataclass 17 | class PredictionMetadata: 18 | """Metadata about a protein structure prediction.""" 19 | 20 | model_name: str 21 | model_version: str 22 | prediction_time: Optional[float] # in seconds 23 | sequence_lengths: Optional[List[int]] 24 | 25 | 26 | class StructurePrediction(Protocol): 27 | """Protocol defining the minimum interface for structure prediction outputs.""" 28 | 29 | metadata: PredictionMetadata 30 | positions: np.ndarray # Atom positions 31 | pdb: Optional[list[str]] = None 32 | cif: Optional[list[str]] = None 33 | 34 | 35 | class EmbeddingPrediction(Protocol): 36 | """Protocol defining the minimum interface for embedding outputs.""" 37 | 38 | metadata: PredictionMetadata 39 | embeddings: np.ndarray # Atom positions 40 | 41 | 42 | class Algorithm(ABC): 43 | """Abstract base class for algorithms.""" 44 | 45 | DEFAULT_CONFIG: dict = {} 46 | 47 | def __init__(self, config: dict = {}) -> None: 48 | """Initialize the algorithm.""" 49 | self.config = {**self.DEFAULT_CONFIG, **config} 50 | self.name: str = self.__class__.__name__ 51 | self.version: str = "" # Should be overridden by implementations 52 | self.ready: bool = False 53 | 54 | @abstractmethod 55 | def _load(self) -> None: 56 | """Load the model and prepare it for prediction. 
57 | 58 | This method should handle: 59 | - Loading model weights 60 | - Moving model to appropriate device 61 | - Setting up any necessary preprocessing 62 | 63 | Raises: 64 | RuntimeError: If model loading fails 65 | """ 66 | raise NotImplementedError 67 | 68 | def update_config(self, config: dict) -> None: 69 | """ 70 | Update the config with the default values. 71 | 72 | This does not work with Modal and remote execution. Create a new instance instead. 73 | """ 74 | logger.warning("This does not work with Modal and remote execution. Create a new instance instead.") 75 | # TODO: Make this work smartly with remote Modal, calling _load() again, etc. and thus programmatically 76 | # updating the model if anything has changed 77 | self.config = {**self.config, **config} 78 | 79 | @staticmethod 80 | def _initialize_metadata(model_name: str, model_version: str) -> PredictionMetadata: 81 | """Initialize metadata for the prediction. 82 | 83 | Parameters 84 | ---------- 85 | model_name : str 86 | Name of the model 87 | model_version : str 88 | Version of the model 89 | 90 | Returns 91 | ------- 92 | PredictionMetadata 93 | Metadata for the prediction 94 | """ 95 | return PredictionMetadata( 96 | model_name=model_name, model_version=model_version, prediction_time=None, sequence_lengths=None 97 | ) 98 | 99 | 100 | class FoldingAlgorithm(Algorithm): 101 | """Abstract base class for protein structure prediction algorithms. 102 | 103 | This class defines the interface that all protein structure prediction models must implement. 104 | Each implementation should handle model loading, prediction, and cleanup appropriately. 
105 | 106 | Attributes: 107 | name (str): Name of the folding algorithm 108 | version (str): Version of the model being used 109 | ready (bool): Whether the model is loaded and ready for prediction 110 | """ 111 | 112 | @abstractmethod 113 | def fold(self, sequences: Union[str, Sequence[str]]) -> StructurePrediction: 114 | """Predict the structure for one or more protein sequences. 115 | 116 | Parameters 117 | ---------- 118 | sequences : Union[str, Sequence[str]] 119 | A single sequence string or list of sequence strings 120 | containing valid amino acid characters 121 | 122 | Returns 123 | ------- 124 | StructurePrediction 125 | Structure prediction output implementing the StructurePrediction protocol 126 | 127 | Raises: 128 | ValueError: If sequences are invalid 129 | RuntimeError: If prediction fails 130 | """ 131 | raise NotImplementedError 132 | 133 | def _validate_sequences(self, sequences: Union[str, Sequence[str]]) -> list[str]: 134 | """Validate input sequences and convert to list format. 135 | 136 | Parameters 137 | ---------- 138 | sequences : Union[str, Sequence[str]] 139 | Single sequence or list of sequences 140 | 141 | Returns 142 | ------- 143 | list[str] 144 | List of validated sequences 145 | 146 | Raises: 147 | ValueError: If any sequence contains invalid amino acids 148 | """ 149 | # Convert single sequence to list 150 | if isinstance(sequences, str): 151 | sequences = [sequences] 152 | 153 | # Validate each sequence and return as explicit list 154 | return [seq for seq in sequences if validate_sequence(seq)] 155 | 156 | def _compute_sequence_lengths(self, sequences: List[str]) -> List[int]: 157 | """ 158 | Compute the sequence lengths for multimer sequences. 159 | """ 160 | return [len(seq) - seq.count(":") for seq in sequences] 161 | 162 | def _prepare_multimer_sequences(self, sequences: List[str]) -> List[str]: 163 | """ 164 | Prepare multimer sequences for prediction. 165 | This method is model-specific and how they handle multimers. 
166 | 167 | Parameters 168 | ---------- 169 | sequences : List[str] 170 | List of protein sequences 171 | 172 | Returns 173 | ------- 174 | List[str] 175 | List of prepared sequences" 176 | """ 177 | raise NotImplementedError 178 | 179 | 180 | class EmbeddingAlgorithm(Algorithm): 181 | """Abstract base class for embedding algorithms.""" 182 | 183 | @abstractmethod 184 | def embed(self, sequences: Union[str, Sequence[str]]) -> EmbeddingPrediction: 185 | """Generate embeddings for one or more protein sequences. 186 | 187 | Parameters 188 | ---------- 189 | sequences : Union[str, Sequence[str]] 190 | A single sequence string or list of sequence strings 191 | containing valid amino acid characters 192 | 193 | Returns 194 | ------- 195 | EmbeddingPrediction 196 | Embedding output implementing the EmbeddingPrediction protocol 197 | 198 | Raises: 199 | ValueError: If sequences are invalid 200 | RuntimeError: If embedding generation fails 201 | """ 202 | raise NotImplementedError 203 | -------------------------------------------------------------------------------- /tests/esm/test_esm2.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | 4 | from boileroom import app, get_esm2 5 | 6 | 7 | @pytest.fixture 8 | def esm2_model_factory(): 9 | def _make_model(**kwargs): 10 | config = {**kwargs} 11 | 12 | if "15B" in config["model_name"]: 13 | model = get_esm2(gpu_type="A100-80GB", config=config) 14 | elif "3B" in config["model_name"]: 15 | model = get_esm2(gpu_type="A100-40GB", config=config) 16 | else: 17 | model = get_esm2(gpu_type="T4", config=config) 18 | 19 | return model 20 | 21 | return _make_model 22 | 23 | 24 | @pytest.mark.parametrize( 25 | "model_config", 26 | [ 27 | {"model_name": "esm2_t6_8M_UR50D", "latent_dim": 320, "num_layers": 6}, 28 | {"model_name": "esm2_t12_35M_UR50D", "latent_dim": 480, "num_layers": 12}, 29 | {"model_name": "esm2_t30_150M_UR50D", "latent_dim": 640, "num_layers": 
30}, 30 | {"model_name": "esm2_t33_650M_UR50D", "latent_dim": 1280, "num_layers": 33}, 31 | {"model_name": "esm2_t36_3B_UR50D", "latent_dim": 2560, "num_layers": 36}, 32 | # {"model_name": "esm2_t48_15B_UR50D", "latent_dim": 5120, "num_layers": 48}, 33 | ], 34 | ) 35 | def test_esm2_embed_basic(esm2_model_factory, model_config, run_backend): 36 | """Test ESM2 embedding.""" 37 | sequence = "MALWMRLLPLLALLALWGPDPAAA" 38 | 39 | with app.run(): 40 | model = esm2_model_factory(model_name=model_config["model_name"]) 41 | result = run_backend(model.embed)([sequence]) 42 | # +2 for the two extra tokens (start of sequence and end of sequence) 43 | assert result.embeddings.shape == (1, len(sequence), model_config["latent_dim"]) 44 | assert result.hidden_states is not None 45 | # +1 for the extra layer of the transformer ??? UNCLEAR WHY THIS IS THE CASE 46 | assert result.hidden_states.shape == ( 47 | 1, 48 | model_config["num_layers"] + 1, 49 | len(sequence), 50 | model_config["latent_dim"], 51 | ) 52 | del model 53 | 54 | 55 | def test_esm2_embed_hidden_states(esm2_model_factory, run_backend): 56 | """Test ESM2 embedding hidden states.""" 57 | with app.run(): 58 | sequence = "MALWMRLLPLLALLALWGPDPAAA" 59 | model = esm2_model_factory(model_name="esm2_t33_650M_UR50D", output_hidden_states=False) 60 | result = run_backend(model.embed)([sequence]) 61 | assert result.hidden_states is None 62 | del model 63 | 64 | 65 | def test_esm2_embed_multimer(esm2_model_factory, test_sequences, run_backend): 66 | """Test ESM2 embedding multimer functionality. 
67 | 68 | Tests various aspects of multimer handling: 69 | - Basic multimer embedding 70 | - Chain indices and residue indices 71 | - Padding mask 72 | - Hidden states (when enabled) 73 | - Different glycine linker lengths 74 | """ 75 | with app.run(): 76 | # Test with different glycine linker lengths 77 | for linker_length in [0, 10, 50]: 78 | model = esm2_model_factory( 79 | model_name="esm2_t33_650M_UR50D", 80 | output_hidden_states=True, 81 | glycine_linker="G" * linker_length, 82 | position_ids_skip=512, 83 | ) 84 | 85 | # Test with a simple multimer sequence 86 | sequence = test_sequences["multimer"] 87 | result = run_backend(model.embed)([sequence]) 88 | 89 | # Check basic shape 90 | expected_length = len(sequence.replace(":", "")) 91 | assert result.embeddings.shape == (1, expected_length, 1280), "Embedding shape mismatch" 92 | 93 | # Check chain indices 94 | assert result.chain_index is not None, "Chain index should be present" 95 | assert result.chain_index.shape == (1, expected_length), "Chain index shape mismatch" 96 | 97 | # First chain should be 0, second chain should be 1 98 | first_chain_length = len(sequence.split(":")[0]) 99 | assert np.all(result.chain_index[0, :first_chain_length] == 0), "First chain indices should be 0" 100 | assert np.all(result.chain_index[0, first_chain_length:] == 1), "Second chain indices should be 1" 101 | 102 | # Check residue indices 103 | assert result.residue_index is not None, "Residue index should be present" 104 | assert result.residue_index.shape == (1, expected_length), "Residue index shape mismatch" 105 | 106 | # Check hidden states 107 | assert result.hidden_states is not None, "Hidden states should be present" 108 | assert result.hidden_states.shape == (1, 34, expected_length, 1280), "Hidden states shape mismatch" 109 | 110 | # Test with a more complex multimer sequence 111 | complex_sequence = "MALWMRLLPLLALLALLAADASDASLLALWGPDPAAA:MADLLALWGPDPAAA:MALWMRLLPLLAADLLALWGPDPWGPDPAAA" 112 | result = 
run_backend(model.embed)([complex_sequence]) 113 | 114 | # Check basic shape for complex sequence 115 | expected_length = len(complex_sequence.replace(":", "")) 116 | assert result.embeddings.shape == (1, expected_length, 1280), "Complex sequence embedding shape mismatch" 117 | 118 | # Check chain indices for complex sequence 119 | assert result.chain_index.shape == (1, expected_length), "Complex sequence chain index shape mismatch" 120 | 121 | # First chain should be 0, second chain should be 1, third chain should be 2 122 | first_chain_length = len(complex_sequence.split(":")[0]) 123 | second_chain_length = len(complex_sequence.split(":")[1]) 124 | third_chain_length = len(complex_sequence.split(":")[2]) 125 | assert np.all(result.chain_index[0, :first_chain_length] == 0), "First chain indices should be 0" 126 | assert np.all( 127 | result.chain_index[0, first_chain_length : first_chain_length + second_chain_length] == 1 128 | ), "Second chain indices should be 1" 129 | assert np.all( 130 | result.chain_index[0, first_chain_length + second_chain_length :] == 2 131 | ), "Third chain indices should be 2" 132 | assert np.all( 133 | result.chain_index[0, first_chain_length + second_chain_length + third_chain_length :] == 3 134 | ), "Fourth chain indices should be 3" 135 | 136 | # Last test for a batched multimer, each sequence has different number of chains and length 137 | sequences = [ 138 | "AAA:CCC", # Very short 2-chain multimer 139 | test_sequences["short"], # Monomer (25 residues) 140 | "A" * 50 + ":" + "C" * 100 + ":" + "D" * 75, # Long 3-chain multimer with different chain lengths 141 | "M" * 10 + ":" + "K" * 10, # Small symmetric 2-chain multimer 142 | "M" * 1 + ":" + "Y" * 1, # Edge case: minimal 2-chain multimer (1 residue each) 143 | ] 144 | result = run_backend(model.embed)(sequences) 145 | assert result.embeddings.shape == ( 146 | len(sequences), 147 | max(len(seq.replace(":", "")) for seq in sequences), 148 | 1280, 149 | ), "Embedding shape mismatch" 
150 | assert result.chain_index.shape == ( 151 | len(sequences), 152 | max(len(seq.replace(":", "")) for seq in sequences), 153 | ), "Chain index shape mismatch" 154 | assert result.residue_index.shape == ( 155 | len(sequences), 156 | max(len(seq.replace(":", "")) for seq in sequences), 157 | ), "Residue index shape mismatch" 158 | assert result.hidden_states.shape == ( 159 | len(sequences), 160 | 34, 161 | max(len(seq.replace(":", "")) for seq in sequences), 162 | 1280, 163 | ), "Hidden states shape mismatch" 164 | 165 | for i, seq in enumerate(sequences): 166 | expected_length = len(seq.replace(":", "")) 167 | assert np.all(result.embeddings[i, :expected_length] != 0), "No padding should be 0" 168 | assert np.all(result.embeddings[i, expected_length:] == 0), "Padding should be 0" 169 | assert np.all(result.chain_index[i, :expected_length] != -1), "No padding should be -1" 170 | assert np.all(result.chain_index[i, expected_length:] == -1), "Padding should be -1" 171 | assert np.all(result.residue_index[i, :expected_length] != -1), "No padding should be -1" 172 | assert np.all(result.residue_index[i, expected_length:] == -1), "Padding should be -1" 173 | # Count the number of zeros in the non-padding region; allow up to 16 zeros due to possible sparsity 174 | num_zeros = np.sum(result.hidden_states[i, :, :expected_length] == 0) 175 | assert num_zeros < 16, f"Too many zeros ({num_zeros}) in non-padding hidden states" 176 | assert np.all(result.hidden_states[i, :, expected_length:] == 0), "Padding should be 0" 177 | del model 178 | -------------------------------------------------------------------------------- /boileroom/models/esm/esm2.py: -------------------------------------------------------------------------------- 1 | import modal 2 | import numpy as np 3 | import os 4 | from dataclasses import dataclass 5 | from typing import List, Union, Optional, TYPE_CHECKING 6 | 7 | import logging 8 | 9 | from ... 
import app 10 | from ...base import EmbeddingAlgorithm, EmbeddingPrediction, PredictionMetadata 11 | from ...images import esm_image 12 | from ...utils import MINUTES, MODEL_DIR, Timer 13 | from ...images.volumes import model_weights 14 | from .linker import compute_position_ids, store_multimer_properties, replace_glycine_linkers 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | if TYPE_CHECKING: 19 | from transformers.modeling_outputs import BaseModelOutputWithPoolingAndCrossAttentions 20 | 21 | 22 | # TODO: turn this into a Pydantic model instead 23 | @dataclass 24 | class ESM2Output(EmbeddingPrediction): 25 | """Output from ESM2 prediction including all model outputs.""" 26 | 27 | embeddings: np.ndarray # (batch_size, seq_len, embedding_dim) 28 | metadata: PredictionMetadata 29 | chain_index: np.ndarray # (batch_size, seq_len) 30 | residue_index: np.ndarray # (batch_size, seq_len) 31 | hidden_states: Optional[np.ndarray] = None # (batch_size, hidden_state_iter, seq_len, embedding_dim) 32 | 33 | 34 | with esm_image.imports(): 35 | import torch 36 | from transformers import EsmModel, AutoTokenizer 37 | 38 | 39 | @app.cls( 40 | image=esm_image, 41 | gpu="T4", 42 | timeout=20 * MINUTES, 43 | container_idle_timeout=10 * MINUTES, 44 | volumes={MODEL_DIR: model_weights}, 45 | ) 46 | class ESM2(EmbeddingAlgorithm): 47 | """ESM2 protein language model.""" 48 | 49 | DEFAULT_CONFIG = { 50 | "model_name": "esm2_t33_650M_UR50D", 51 | "output_hidden_states": True, 52 | # Chain linking and positioning config 53 | "glycine_linker": "", 54 | "position_ids_skip": 512, 55 | } 56 | 57 | def __init__(self, config: dict = {}) -> None: 58 | super().__init__(config) 59 | self.metadata = self._initialize_metadata( 60 | model_name="ESM-2", 61 | model_version="v4.49.0", # HuggingFace transformers version 62 | ) 63 | self.model_dir: Optional[str] = os.environ.get("MODEL_DIR", MODEL_DIR) 64 | self.tokenizer: Optional[AutoTokenizer] = None 65 | self.model: Optional[EsmModel] = None 66 
| self.assert_valid_model(config) 67 | 68 | @staticmethod 69 | def assert_valid_model(config: dict) -> None: 70 | """ 71 | Validate that the model name is supported. 72 | 73 | Available ESM-2 models: 74 | - esm2_t48_15B_UR50D: 48 layers, 5120 hidden size, 40 attention heads 75 | - esm2_t36_3B_UR50D: 36 layers, 2560 hidden size, 40 attention heads 76 | - esm2_t33_650M_UR50D: 33 layers, 1280 hidden size, 20 attention heads 77 | - esm2_t30_150M_UR50D: 30 layers, 640 hidden size, 12 attention heads 78 | - esm2_t12_35M_UR50D: 12 layers, 480 hidden size, 20 attention heads 79 | - esm2_t6_8M_UR50D: 6 layers, 320 hidden size, 20 attention heads 80 | """ 81 | models_name = [ 82 | "esm2_t48_15B_UR50D", 83 | "esm2_t36_3B_UR50D", 84 | "esm2_t33_650M_UR50D", 85 | "esm2_t30_150M_UR50D", 86 | "esm2_t12_35M_UR50D", 87 | "esm2_t6_8M_UR50D", 88 | ] 89 | assert config["model_name"] in models_name, f"Model {config['model_name']} not supported" 90 | 91 | @modal.enter() 92 | def _initialize(self) -> None: 93 | self._load() 94 | 95 | def _load(self) -> None: 96 | if self.tokenizer is None: 97 | self.tokenizer = AutoTokenizer.from_pretrained( 98 | f"facebook/{self.config['model_name']}", cache_dir=self.model_dir 99 | ) 100 | if self.model is None: 101 | self.model = EsmModel.from_pretrained(f"facebook/{self.config['model_name']}", cache_dir=self.model_dir) 102 | self.device = "cuda" if torch.cuda.is_available() else "cpu" 103 | self.model = self.model.to(self.device) 104 | self.model.eval() 105 | self.ready = True 106 | 107 | @modal.method() 108 | def embed(self, sequences: Union[str, List[str]]) -> ESM2Output: 109 | if self.tokenizer is None or self.model is None: 110 | logger.warning("Model not loaded. Forcing the model to load... 
Next time call _load() first.") 111 | self._load() 112 | assert self.tokenizer is not None and self.model is not None, "Model not loaded" 113 | 114 | logger.debug(f'Embedding {len(sequences)} sequences using {self.config["model_name"]}') 115 | 116 | # Support for glycine linker and positional skip logic (multimer) 117 | if isinstance(sequences, str): 118 | sequences = [sequences] 119 | 120 | if any(":" in seq for seq in sequences): 121 | # Multimer logic 122 | glycine_linker = self.config["glycine_linker"] 123 | multimer_properties = self._store_multimer_properties(sequences, glycine_linker) 124 | tokenized = self.tokenizer( 125 | replace_glycine_linkers(sequences, glycine_linker), 126 | return_tensors="pt", 127 | padding=True, 128 | truncation=True, 129 | ) 130 | # Add position_ids and attention_mask 131 | tokenized["position_ids"] = compute_position_ids( 132 | sequences, glycine_linker, self.config["position_ids_skip"] 133 | ) 134 | tokenized["attention_mask"] = (multimer_properties["linker_map"] == 1).to(torch.int32) 135 | else: 136 | # Monomer logic 137 | tokenized = self.tokenizer( 138 | sequences, 139 | return_tensors="pt", 140 | padding=True, 141 | truncation=True, 142 | ) 143 | multimer_properties = None 144 | tokenized = tokenized.to(self.device) 145 | tokenized["output_hidden_states"] = self.config["output_hidden_states"] 146 | 147 | with Timer("Model Inference") as timer: 148 | with torch.inference_mode(): 149 | outputs = self.model(**tokenized) 150 | 151 | outputs = self._convert_outputs(outputs, multimer_properties, timer.duration) 152 | 153 | return outputs 154 | 155 | @staticmethod 156 | def _store_multimer_properties(sequences: List[str], glycine_linker: str) -> dict[str, torch.Tensor]: 157 | linker_map, residue_index, chain_index = store_multimer_properties(sequences, glycine_linker) 158 | # Add and as effective padding 159 | batch_size = linker_map.shape[0] 160 | linker_map = torch.cat([-torch.ones(batch_size, 1), linker_map, 
-torch.ones(batch_size, 1)], dim=1) 161 | residue_index = torch.cat([-torch.ones(batch_size, 1), residue_index, -torch.ones(batch_size, 1)], dim=1) 162 | chain_index = torch.cat([-torch.ones(batch_size, 1), chain_index, -torch.ones(batch_size, 1)], dim=1) 163 | return {"linker_map": linker_map, "residue_index": residue_index, "chain_index": chain_index} 164 | 165 | def _convert_outputs( 166 | self, 167 | outputs: "BaseModelOutputWithPoolingAndCrossAttentions", 168 | multimer_properties: dict[str, torch.Tensor] | None, 169 | prediction_time: float, 170 | ) -> ESM2Output: 171 | """Convert model outputs to ESM2Output format.""" 172 | 173 | embeddings = outputs.last_hidden_state.cpu().numpy() 174 | 175 | if self.config["output_hidden_states"]: 176 | assert torch.all( 177 | outputs.hidden_states[-1] == outputs.last_hidden_state 178 | ), "Last hidden state should be the same as the output of the model" 179 | hidden_states = np.stack([h.cpu().numpy() for h in outputs.hidden_states], axis=1) 180 | else: 181 | hidden_states = None 182 | 183 | if multimer_properties is not None: 184 | # TODO: maybe add a proper MULTIMER flag? 
185 | result = self._mask_linker_region(embeddings, hidden_states, **multimer_properties) 186 | embeddings, hidden_states, chain_index_output, residue_index_output = result 187 | else: # only MONOMERs 188 | chain_index_output = np.zeros((embeddings.shape[0], embeddings.shape[1]), dtype=np.int32) 189 | residue_index_output = None # HACK: for now, but given it's only monomers, it is clear what the res ids are 190 | if hidden_states is not None: 191 | hidden_states = hidden_states[:, :, 1:-1, :] # remove the first and last token 192 | embeddings = embeddings[:, 1:-1, :] # remove the first and last token 193 | 194 | self.metadata.prediction_time = prediction_time 195 | 196 | return ESM2Output( 197 | metadata=self.metadata, 198 | embeddings=embeddings, 199 | hidden_states=hidden_states, 200 | chain_index=chain_index_output, 201 | residue_index=residue_index_output, 202 | ) 203 | 204 | def _mask_linker_region( 205 | self, 206 | embeddings: np.ndarray, 207 | hidden_states: np.ndarray, 208 | linker_map: torch.Tensor, 209 | residue_index: torch.Tensor, 210 | chain_index: torch.Tensor, 211 | ) -> tuple[np.ndarray, np.ndarray, torch.Tensor, torch.Tensor]: 212 | """ 213 | Mask the linker region in the outputs and track padding information. 
Args:
            embeddings: Per-residue embeddings, shape (batch, seq_len, hidden_dim)
            hidden_states: Per-layer hidden states, or None if not requested
            linker_map: Tensor marking residues to keep (1) vs. linker/padding (-1)
            residue_index: Per-residue index within its chain
            chain_index: Chain id for each residue
249 | 250 | Args: 251 | arrays: List of NumPy arrays to pad and stack 252 | residue_dim: Dimension to pad to match sizes 253 | batch_dim: Dimension to stack the arrays along 254 | constant_value: Value to use for padding (default: 0) 255 | 256 | Returns: 257 | Stacked and padded NumPy array 258 | """ 259 | max_size = max(arr.shape[residue_dim] for arr in arrays) 260 | padded_arrays = [] 261 | for arr in arrays: 262 | padding = [(0, 0)] * arr.ndim 263 | padding[residue_dim] = (0, max_size - arr.shape[residue_dim]) 264 | padded_arrays.append(np.pad(arr, padding, mode="constant", constant_values=constant_value)) 265 | return np.stack(padded_arrays, axis=batch_dim) 266 | 267 | # Stack embeddings along batch dimension (0) 268 | embeddings = pad_and_stack(embeddings_list, residue_dim=0, batch_dim=0) 269 | if hidden_states is not None: 270 | hidden_states = pad_and_stack(hidden_states_list, residue_dim=0, batch_dim=0) 271 | # Transpose to get correct dimension order (batch, layers, seq_len, hidden_dim) 272 | hidden_states = np.transpose(hidden_states, (0, 2, 1, 3)) 273 | chain_index = pad_and_stack(chain_index_list, residue_dim=0, batch_dim=0, constant_value=-1) 274 | residue_index = pad_and_stack(residue_index_list, residue_dim=0, batch_dim=0, constant_value=-1) 275 | 276 | return embeddings, hidden_states, chain_index, residue_index 277 | 278 | 279 | def get_esm2(gpu_type="T4", config: dict = {}): 280 | """ 281 | Note that the app will still show that's using T4, but the actual method / function call will use the correct GPU, 282 | and display accordingly in the Modal dashboard. 
283 | """ 284 | Model = ESM2.with_options(gpu=gpu_type) # type: ignore 285 | return Model(config=config) 286 | -------------------------------------------------------------------------------- /tests/data/esmfold_server_short.pdb: -------------------------------------------------------------------------------- 1 | HEADER 18-OCT-22 2 | TITLE ESMFOLD V1 PREDICTION FOR INPUT 3 | REMARK 1 4 | REMARK 1 REFERENCE 1 5 | REMARK 1 AUTH ZEMING LIN, HALIL AKIN, ROSHAN RAO, BRIAN HIE, ZHONGKAI ZHU, 6 | REMARK 1 AUTH 2 WENTING LU, NIKITA SMETANIN, ROBERT VERKUIL, ORI KABELI, 7 | REMARK 1 AUTH 3 YANIV SHMUELI, ALLAN DOS SANTOS COSTA, 8 | REMARK 1 AUTH 4 MARYAM FAZEL-ZARANDI, TOM SERCU, SALVATORE CANDIDO, 9 | REMARK 1 AUTH 5 ALEXANDER RIVES 10 | REMARK 1 TITL EVOLUTIONARY-SCALE PREDICTION OF ATOMIC LEVEL PROTEIN 11 | REMARK 1 TITL 2 STRUCTURE WITH A LANGUAGE MODEL 12 | REMARK 1 REF 13 | REMARK 1 REFN 14 | REMARK 1 PMID 15 | REMARK 1 DOI 10.1101/2022.07.20.500902 16 | REMARK 1 17 | REMARK 1 LICENSE AND DISCLAIMERS 18 | REMARK 1 ESM METAGENOMIC ATLAS DATA IS AVAILABLE UNDER 19 | REMARK 1 A CC-BY-4.0 LICENSE FOR ACADEMIC AND COMMERCIAL USE. 20 | REMARK 1 COPYRIGHT (C) META PLATFORMS, INC. ALL RIGHTS RESERVED. 21 | REMARK 1 USE OF THE ESM METAGENOMIC ATLAS DATA IS SUBJECT 22 | REMARK 1 TO THE META OPEN SOURCE TERMS OF USE AND PRIVACY POLICY. 
23 | ATOM 1 N MET A 1 -6.344 13.980 10.106 1.00 0.74 N 24 | ATOM 2 CA MET A 1 -5.240 13.029 10.013 1.00 0.76 C 25 | ATOM 3 C MET A 1 -5.717 11.610 10.305 1.00 0.76 C 26 | ATOM 4 CB MET A 1 -4.116 13.411 10.978 1.00 0.67 C 27 | ATOM 5 O MET A 1 -6.468 11.387 11.256 1.00 0.68 O 28 | ATOM 6 CG MET A 1 -2.725 13.295 10.377 1.00 0.61 C 29 | ATOM 7 SD MET A 1 -1.399 13.571 11.615 1.00 0.58 S 30 | ATOM 8 CE MET A 1 -0.218 14.517 10.614 1.00 0.57 C 31 | ATOM 9 N LEU A 2 -5.708 10.653 9.301 1.00 0.73 N 32 | ATOM 10 CA LEU A 2 -6.196 9.295 9.516 1.00 0.73 C 33 | ATOM 11 C LEU A 2 -5.285 8.536 10.476 1.00 0.73 C 34 | ATOM 12 CB LEU A 2 -6.295 8.543 8.186 1.00 0.69 C 35 | ATOM 13 O LEU A 2 -4.060 8.582 10.342 1.00 0.69 O 36 | ATOM 14 CG LEU A 2 -7.287 9.102 7.164 1.00 0.66 C 37 | ATOM 15 CD1 LEU A 2 -7.122 8.393 5.824 1.00 0.63 C 38 | ATOM 16 CD2 LEU A 2 -8.717 8.964 7.676 1.00 0.64 C 39 | ATOM 17 N LYS A 3 -5.543 8.350 11.697 1.00 0.77 N 40 | ATOM 18 CA LYS A 3 -4.857 7.533 12.694 1.00 0.78 C 41 | ATOM 19 C LYS A 3 -5.440 6.124 12.746 1.00 0.77 C 42 | ATOM 20 CB LYS A 3 -4.941 8.187 14.074 1.00 0.72 C 43 | ATOM 21 O LYS A 3 -6.625 5.927 12.467 1.00 0.72 O 44 | ATOM 22 CG LYS A 3 -3.976 9.347 14.269 1.00 0.66 C 45 | ATOM 23 CD LYS A 3 -3.993 9.853 15.706 1.00 0.64 C 46 | ATOM 24 CE LYS A 3 -3.045 11.029 15.898 1.00 0.55 C 47 | ATOM 25 NZ LYS A 3 -3.034 11.507 17.313 1.00 0.47 N 48 | ATOM 26 N ASN A 4 -4.550 5.162 12.738 1.00 0.78 N 49 | ATOM 27 CA ASN A 4 -4.841 3.750 12.960 1.00 0.79 C 50 | ATOM 28 C ASN A 4 -5.689 3.167 11.833 1.00 0.79 C 51 | ATOM 29 CB ASN A 4 -5.538 3.550 14.307 1.00 0.75 C 52 | ATOM 30 O ASN A 4 -6.680 2.481 12.088 1.00 0.75 O 53 | ATOM 31 CG ASN A 4 -4.602 3.743 15.484 1.00 0.69 C 54 | ATOM 32 ND2 ASN A 4 -5.131 4.267 16.583 1.00 0.69 N 55 | ATOM 33 OD1 ASN A 4 -3.413 3.425 15.405 1.00 0.68 O 56 | ATOM 34 N VAL A 5 -5.408 3.591 10.585 1.00 0.80 N 57 | ATOM 35 CA VAL A 5 -6.150 3.103 9.428 1.00 0.79 C 58 | ATOM 36 C VAL A 5 -5.511 1.818 8.907 1.00 0.79 C 
59 | ATOM 37 CB VAL A 5 -6.211 4.162 8.304 1.00 0.76 C 60 | ATOM 38 O VAL A 5 -4.285 1.682 8.911 1.00 0.74 O 61 | ATOM 39 CG1 VAL A 5 -7.342 3.848 7.326 1.00 0.65 C 62 | ATOM 40 CG2 VAL A 5 -6.385 5.559 8.898 1.00 0.64 C 63 | ATOM 41 N HIS A 6 -6.163 0.782 8.894 1.00 0.80 N 64 | ATOM 42 CA HIS A 6 -5.767 -0.453 8.228 1.00 0.80 C 65 | ATOM 43 C HIS A 6 -6.071 -0.396 6.735 1.00 0.80 C 66 | ATOM 44 CB HIS A 6 -6.472 -1.654 8.862 1.00 0.77 C 67 | ATOM 45 O HIS A 6 -7.210 -0.136 6.339 1.00 0.76 O 68 | ATOM 46 CG HIS A 6 -6.046 -1.925 10.270 1.00 0.73 C 69 | ATOM 47 CD2 HIS A 6 -6.538 -1.478 11.449 1.00 0.70 C 70 | ATOM 48 ND1 HIS A 6 -4.985 -2.748 10.582 1.00 0.70 N 71 | ATOM 49 CE1 HIS A 6 -4.844 -2.796 11.896 1.00 0.67 C 72 | ATOM 50 NE2 HIS A 6 -5.774 -2.033 12.446 1.00 0.66 N 73 | ATOM 51 N VAL A 7 -4.983 -0.405 5.963 1.00 0.77 N 74 | ATOM 52 CA VAL A 7 -5.141 -0.306 4.515 1.00 0.76 C 75 | ATOM 53 C VAL A 7 -5.020 -1.692 3.886 1.00 0.77 C 76 | ATOM 54 CB VAL A 7 -4.100 0.654 3.895 1.00 0.74 C 77 | ATOM 55 O VAL A 7 -4.086 -2.438 4.188 1.00 0.75 O 78 | ATOM 56 CG1 VAL A 7 -4.248 0.702 2.376 1.00 0.70 C 79 | ATOM 57 CG2 VAL A 7 -4.241 2.052 4.494 1.00 0.70 C 80 | ATOM 58 N LEU A 8 -6.072 -2.280 3.332 1.00 0.78 N 81 | ATOM 59 CA LEU A 8 -6.003 -3.528 2.580 1.00 0.77 C 82 | ATOM 60 C LEU A 8 -5.699 -3.261 1.110 1.00 0.78 C 83 | ATOM 61 CB LEU A 8 -7.316 -4.304 2.709 1.00 0.75 C 84 | ATOM 62 O LEU A 8 -6.402 -2.486 0.457 1.00 0.75 O 85 | ATOM 63 CG LEU A 8 -7.391 -5.638 1.963 1.00 0.72 C 86 | ATOM 64 CD1 LEU A 8 -6.436 -6.649 2.589 1.00 0.68 C 87 | ATOM 65 CD2 LEU A 8 -8.819 -6.172 1.964 1.00 0.69 C 88 | ATOM 66 N VAL A 9 -4.616 -3.580 0.630 1.00 0.74 N 89 | ATOM 67 CA VAL A 9 -4.288 -3.473 -0.788 1.00 0.74 C 90 | ATOM 68 C VAL A 9 -4.711 -4.748 -1.514 1.00 0.75 C 91 | ATOM 69 CB VAL A 9 -2.780 -3.212 -1.004 1.00 0.72 C 92 | ATOM 70 O VAL A 9 -4.239 -5.840 -1.190 1.00 0.73 O 93 | ATOM 71 CG1 VAL A 9 -2.466 -3.060 -2.491 1.00 0.69 C 94 | ATOM 72 CG2 VAL A 9 -2.338 -1.971 
-0.231 1.00 0.69 C 95 | ATOM 73 N LEU A 10 -5.865 -4.754 -2.225 1.00 0.75 N 96 | ATOM 74 CA LEU A 10 -6.380 -5.808 -3.092 1.00 0.73 C 97 | ATOM 75 C LEU A 10 -5.663 -5.803 -4.438 1.00 0.73 C 98 | ATOM 76 CB LEU A 10 -7.887 -5.639 -3.304 1.00 0.70 C 99 | ATOM 77 O LEU A 10 -5.697 -4.805 -5.161 1.00 0.68 O 100 | ATOM 78 CG LEU A 10 -8.773 -5.860 -2.077 1.00 0.67 C 101 | ATOM 79 CD1 LEU A 10 -10.181 -5.336 -2.338 1.00 0.64 C 102 | ATOM 80 CD2 LEU A 10 -8.808 -7.338 -1.701 1.00 0.65 C 103 | ATOM 81 N GLY A 11 -4.786 -6.738 -4.567 1.00 0.71 N 104 | ATOM 82 CA GLY A 11 -4.025 -6.970 -5.784 1.00 0.69 C 105 | ATOM 83 C GLY A 11 -2.639 -6.354 -5.749 1.00 0.69 C 106 | ATOM 84 O GLY A 11 -2.493 -5.155 -5.502 1.00 0.64 O 107 | ATOM 85 N ALA A 12 -1.688 -7.170 -5.236 1.00 0.64 N 108 | ATOM 86 CA ALA A 12 -0.257 -6.901 -5.129 1.00 0.63 C 109 | ATOM 87 C ALA A 12 0.432 -7.053 -6.482 1.00 0.65 C 110 | ATOM 88 CB ALA A 12 0.385 -7.830 -4.101 1.00 0.59 C 111 | ATOM 89 O ALA A 12 1.382 -7.828 -6.619 1.00 0.63 O 112 | ATOM 90 N GLY A 13 -0.229 -6.758 -7.545 1.00 0.72 N 113 | ATOM 91 CA GLY A 13 0.451 -6.664 -8.827 1.00 0.71 C 114 | ATOM 92 C GLY A 13 1.361 -5.454 -8.933 1.00 0.70 C 115 | ATOM 93 O GLY A 13 1.861 -4.957 -7.923 1.00 0.64 O 116 | ATOM 94 N ASP A 14 1.883 -5.236 -10.085 1.00 0.67 N 117 | ATOM 95 CA ASP A 14 2.905 -4.213 -10.286 1.00 0.66 C 118 | ATOM 96 C ASP A 14 2.562 -2.937 -9.520 1.00 0.67 C 119 | ATOM 97 CB ASP A 14 3.070 -3.904 -11.775 1.00 0.61 C 120 | ATOM 98 O ASP A 14 3.445 -2.293 -8.949 1.00 0.65 O 121 | ATOM 99 CG ASP A 14 3.806 -4.998 -12.529 1.00 0.58 C 122 | ATOM 100 OD1 ASP A 14 4.578 -5.756 -11.904 1.00 0.58 O 123 | ATOM 101 OD2 ASP A 14 3.614 -5.100 -13.761 1.00 0.60 O 124 | ATOM 102 N VAL A 15 1.399 -2.629 -9.519 1.00 0.72 N 125 | ATOM 103 CA VAL A 15 0.984 -1.369 -8.913 1.00 0.71 C 126 | ATOM 104 C VAL A 15 0.792 -1.556 -7.410 1.00 0.71 C 127 | ATOM 105 CB VAL A 15 -0.316 -0.832 -9.553 1.00 0.68 C 128 | ATOM 106 O VAL A 15 1.247 -0.732 -6.612 1.00 0.68 
O 129 | ATOM 107 CG1 VAL A 15 -0.775 0.446 -8.853 1.00 0.62 C 130 | ATOM 108 CG2 VAL A 15 -0.112 -0.583 -11.046 1.00 0.63 C 131 | ATOM 109 N GLY A 16 0.138 -2.722 -7.046 1.00 0.73 N 132 | ATOM 110 CA GLY A 16 -0.096 -2.967 -5.632 1.00 0.73 C 133 | ATOM 111 C GLY A 16 1.182 -3.031 -4.818 1.00 0.73 C 134 | ATOM 112 O GLY A 16 1.229 -2.545 -3.686 1.00 0.72 O 135 | ATOM 113 N SER A 17 2.239 -3.570 -5.344 1.00 0.73 N 136 | ATOM 114 CA SER A 17 3.507 -3.731 -4.640 1.00 0.73 C 137 | ATOM 115 C SER A 17 4.164 -2.382 -4.368 1.00 0.73 C 138 | ATOM 116 CB SER A 17 4.459 -4.617 -5.446 1.00 0.69 C 139 | ATOM 117 O SER A 17 4.824 -2.202 -3.343 1.00 0.72 O 140 | ATOM 118 OG SER A 17 4.699 -4.062 -6.727 1.00 0.62 O 141 | ATOM 119 N VAL A 18 3.984 -1.511 -5.393 1.00 0.73 N 142 | ATOM 120 CA VAL A 18 4.565 -0.192 -5.166 1.00 0.73 C 143 | ATOM 121 C VAL A 18 3.882 0.475 -3.974 1.00 0.73 C 144 | ATOM 122 CB VAL A 18 4.447 0.704 -6.419 1.00 0.69 C 145 | ATOM 123 O VAL A 18 4.547 1.069 -3.122 1.00 0.72 O 146 | ATOM 124 CG1 VAL A 18 4.897 2.130 -6.107 1.00 0.59 C 147 | ATOM 125 CG2 VAL A 18 5.266 0.122 -7.570 1.00 0.59 C 148 | ATOM 126 N VAL A 19 2.569 0.345 -3.865 1.00 0.75 N 149 | ATOM 127 CA VAL A 19 1.787 1.016 -2.832 1.00 0.74 C 150 | ATOM 128 C VAL A 19 2.144 0.445 -1.461 1.00 0.75 C 151 | ATOM 129 CB VAL A 19 0.270 0.876 -3.088 1.00 0.71 C 152 | ATOM 130 O VAL A 19 2.334 1.194 -0.500 1.00 0.73 O 153 | ATOM 131 CG1 VAL A 19 -0.530 1.478 -1.934 1.00 0.64 C 154 | ATOM 132 CG2 VAL A 19 -0.111 1.541 -4.410 1.00 0.64 C 155 | ATOM 133 N VAL A 20 2.344 -0.856 -1.357 1.00 0.74 N 156 | ATOM 134 CA VAL A 20 2.742 -1.493 -0.106 1.00 0.74 C 157 | ATOM 135 C VAL A 20 4.111 -0.972 0.327 1.00 0.75 C 158 | ATOM 136 CB VAL A 20 2.774 -3.032 -0.237 1.00 0.71 C 159 | ATOM 137 O VAL A 20 4.320 -0.658 1.501 1.00 0.75 O 160 | ATOM 138 CG1 VAL A 20 3.370 -3.669 1.017 1.00 0.63 C 161 | ATOM 139 CG2 VAL A 20 1.370 -3.574 -0.500 1.00 0.64 C 162 | ATOM 140 N ARG A 21 5.028 -0.887 -0.623 1.00 0.75 N 163 | 
ATOM 141 CA ARG A 21 6.379 -0.435 -0.308 1.00 0.76 C 164 | ATOM 142 C ARG A 21 6.366 0.988 0.240 1.00 0.76 C 165 | ATOM 143 CB ARG A 21 7.274 -0.511 -1.547 1.00 0.72 C 166 | ATOM 144 O ARG A 21 7.128 1.314 1.153 1.00 0.76 O 167 | ATOM 145 CG ARG A 21 7.873 -1.887 -1.791 1.00 0.67 C 168 | ATOM 146 CD ARG A 21 8.870 -1.872 -2.941 1.00 0.64 C 169 | ATOM 147 NE ARG A 21 9.399 -3.206 -3.213 1.00 0.57 N 170 | ATOM 148 NH1 ARG A 21 11.174 -2.467 -4.493 1.00 0.46 N 171 | ATOM 149 NH2 ARG A 21 10.874 -4.710 -4.127 1.00 0.42 N 172 | ATOM 150 CZ ARG A 21 10.481 -3.458 -3.944 1.00 0.58 C 173 | ATOM 151 N LEU A 22 5.481 1.861 -0.360 1.00 0.76 N 174 | ATOM 152 CA LEU A 22 5.398 3.260 0.046 1.00 0.76 C 175 | ATOM 153 C LEU A 22 4.780 3.387 1.435 1.00 0.76 C 176 | ATOM 154 CB LEU A 22 4.578 4.065 -0.966 1.00 0.73 C 177 | ATOM 155 O LEU A 22 5.161 4.266 2.211 1.00 0.74 O 178 | ATOM 156 CG LEU A 22 5.253 4.354 -2.308 1.00 0.68 C 179 | ATOM 157 CD1 LEU A 22 4.241 4.920 -3.298 1.00 0.63 C 180 | ATOM 158 CD2 LEU A 22 6.423 5.314 -2.123 1.00 0.64 C 181 | ATOM 159 N LEU A 23 3.901 2.462 1.770 1.00 0.78 N 182 | ATOM 160 CA LEU A 23 3.204 2.524 3.050 1.00 0.77 C 183 | ATOM 161 C LEU A 23 4.095 2.018 4.179 1.00 0.77 C 184 | ATOM 162 CB LEU A 23 1.911 1.705 2.997 1.00 0.75 C 185 | ATOM 163 O LEU A 23 3.958 2.451 5.325 1.00 0.75 O 186 | ATOM 164 CG LEU A 23 0.790 2.264 2.119 1.00 0.70 C 187 | ATOM 165 CD1 LEU A 23 -0.358 1.265 2.027 1.00 0.66 C 188 | ATOM 166 CD2 LEU A 23 0.300 3.601 2.663 1.00 0.67 C 189 | ATOM 167 N GLU A 24 4.981 1.016 3.857 1.00 0.74 N 190 | ATOM 168 CA GLU A 24 5.952 0.504 4.819 1.00 0.74 C 191 | ATOM 169 C GLU A 24 6.979 1.571 5.187 1.00 0.74 C 192 | ATOM 170 CB GLU A 24 6.657 -0.736 4.264 1.00 0.71 C 193 | ATOM 171 O GLU A 24 7.549 1.541 6.280 1.00 0.74 O 194 | ATOM 172 CG GLU A 24 5.816 -2.003 4.332 1.00 0.65 C 195 | ATOM 173 CD GLU A 24 6.574 -3.249 3.903 1.00 0.63 C 196 | ATOM 174 OE1 GLU A 24 7.478 -3.144 3.043 1.00 0.63 O 197 | ATOM 175 OE2 GLU A 24 6.261 -4.340 
@pytest.fixture
def esmfold_model(config: dict | None = None) -> Generator[ESMFold, None, None]:
    """Yield an ESMFold model wrapper inside an ephemeral Modal app.

    Note: pytest ignores fixture parameters that carry a default value, so
    ``config`` is never injected — it simply provides the model config.
    BUG FIX: the previous default was a mutable ``{}`` shared across calls;
    use ``None`` as the sentinel instead.
    """
    with enable_output(), app.run():
        yield ESMFold(config=config if config is not None else {})


def test_esmfold_basic(test_sequences: dict[str, str], esmfold_model: ESMFold, run_backend):
    """Test basic ESMFold functionality."""
    result = run_backend(esmfold_model.fold)(test_sequences["short"])

    assert isinstance(result, ESMFoldOutput), "Result should be an ESMFoldOutput"

    seq_len = len(test_sequences["short"])
    positions_shape = result.positions.shape

    # BUG FIX: both messages below were plain strings missing the f-prefix,
    # so "{positions_shape[-1]}" etc. were printed literally on failure.
    assert positions_shape[-1] == 3, f"Coordinate dimension mismatch. Expected: 3, Got: {positions_shape[-1]}"
    assert (
        positions_shape[-3] == seq_len
    ), f"Number of residues mismatch. Expected: {seq_len}, Got: {positions_shape[-3]}"
    assert np.all(result.plddt >= 0), "pLDDT scores should be non-negative"
    assert np.all(result.plddt <= 100), "pLDDT scores should be less than or equal to 100"
def test_esmfold_multimer(test_sequences, run_backend):
    """Test ESMFold multimer functionality."""
    with enable_output(), app.run():  # TODO: make this better with a fixture, re-using the logic
        model = ESMFold(config={"output_pdb": True})
        result = run_backend(model.fold)(test_sequences["multimer"])

    # Total residue count once the ":" chain separators are stripped out.
    total_len = len(test_sequences["multimer"].replace(":", ""))

    assert result.pdb is not None, "PDB output should be generated"
    assert result.positions.shape[2] == total_len, "Number of residues mismatch"
    assert np.array_equal(result.residue_index[0][:54], np.arange(0, 54)), "First chain residue index mismatch"
    assert np.array_equal(result.residue_index[0][54:], np.arange(0, 54)), "Second chain residue index mismatch"
    assert np.all(result.chain_index[0][:54] == 0), "First chain index mismatch"
    assert np.all(result.chain_index[0][54:] == 1), "Second chain index mismatch"

    structure = pdb_string_to_atomarray(result.pdb[0])

    # Unique (chain, residue) pairs give the residue count of the structure.
    residue_keys = {(chain, res) for chain, res in zip(structure.chain_id, structure.res_id, strict=True)}
    n_residues = len(residue_keys)

    assert n_residues == total_len, "Number of residues mismatch"
    assert len(result.chain_index[0]) == n_residues, "Chain index length mismatch"
    assert len(result.residue_index[0]) == n_residues, "Residue index length mismatch"

    # Check chain assignments
    unique_chains = np.unique(structure.chain_id)
    assert len(unique_chains) == 2, f"Expected 2 chains, got {len(unique_chains)}"

    # Check residues per chain
    chain_a_residues = len(np.unique(structure.res_id[structure.chain_id == "A"]))
    chain_b_residues = len(np.unique(structure.res_id[structure.chain_id == "B"]))
    assert chain_a_residues == 54, f"Chain A should have 54 residues, got {chain_a_residues}"
    assert chain_b_residues == 54, f"Chain B should have 54 residues, got {chain_b_residues}"

    # Assert correct folding outputs metrics (need to do it as we slice the linker out).
    # Table-driven: (attribute, expected shape, failure message).
    shape_checks = [
        ("predicted_aligned_error", (1, n_residues, n_residues), "PAE matrix shape mismatch"),
        ("plddt", (1, n_residues, 37), "pLDDT matrix shape mismatch"),
        ("ptm_logits", (1, n_residues, n_residues, 64), "pTM matrix shape mismatch"),
        ("aligned_confidence_probs", (1, n_residues, n_residues, 64), "aligned confidence shape mismatch"),
        ("s_z", (1, n_residues, n_residues, 128), "s_z matrix shape mismatch"),
        ("s_s", (1, n_residues, 1024), "s_s matrix shape mismatch"),
        ("distogram_logits", (1, n_residues, n_residues, 64), "distogram logits matrix shape mismatch"),
        ("lm_logits", (1, n_residues, 23), "lm logits matrix shape mismatch"),
        ("lddt_head", (8, 1, n_residues, 37, 50), "lddt head matrix shape mismatch"),
        ("plddt", (1, n_residues, 37), "pLDDT matrix shape mismatch"),
    ]
    for attr, expected_shape, message in shape_checks:
        assert getattr(result, attr).shape == expected_shape, message
== "A"])) 68 | chain_b_residues = len(np.unique(structure.res_id[structure.chain_id == "B"])) 69 | assert chain_a_residues == 54, f"Chain A should have 54 residues, got {chain_a_residues}" 70 | assert chain_b_residues == 54, f"Chain B should have 54 residues, got {chain_b_residues}" 71 | 72 | # Assert correct folding outputs metrics (need to do it as we slice the linker out) 73 | assert result.predicted_aligned_error.shape == (1, n_residues, n_residues), "PAE matrix shape mismatch" 74 | assert result.plddt.shape == (1, n_residues, 37), "pLDDT matrix shape mismatch" 75 | assert result.ptm_logits.shape == (1, n_residues, n_residues, 64), "pTM matrix shape mismatch" 76 | assert result.aligned_confidence_probs.shape == (1, n_residues, n_residues, 64), "aligned confidence shape mismatch" 77 | assert result.s_z.shape == (1, n_residues, n_residues, 128), "s_z matrix shape mismatch" 78 | assert result.s_s.shape == (1, n_residues, 1024), "s_s matrix shape mismatch" 79 | assert result.distogram_logits.shape == (1, n_residues, n_residues, 64), "distogram logits matrix shape mismatch" 80 | assert result.lm_logits.shape == (1, n_residues, 23), "lm logits matrix shape mismatch" 81 | assert result.lddt_head.shape == (8, 1, n_residues, 37, 50), "lddt head matrix shape mismatch" 82 | assert result.plddt.shape == (1, n_residues, 37), "pLDDT matrix shape mismatch" 83 | 84 | 85 | def test_esmfold_linker_map(): 86 | """ 87 | Test ESMFold linker map. 88 | The linker map has 1 for residues to keep (i.e. those not part of the linker), 89 | and 0 for residues to remove (i.e. those part of the linker). 
90 | """ 91 | sequences = ["AAAAAA:BBBBBBBBB", "CCCCC:DDDDDDD:EEEEEEE", "HHHH"] 92 | GLYCINE_LINKER = "G" * 50 93 | N = len(GLYCINE_LINKER) 94 | linker_map, _, _ = store_multimer_properties([sequences[0]], GLYCINE_LINKER) 95 | gt_map = torch.tensor([1] * 6 + [0] * N + [1] * 9) 96 | assert torch.all(linker_map == gt_map), "Linker map mismatch" 97 | 98 | linker_map, _, _ = store_multimer_properties([sequences[1]], GLYCINE_LINKER) 99 | gt_map = torch.tensor([1] * 5 + [0] * N + [1] * 7 + [0] * N + [1] * 7) 100 | assert torch.all(linker_map == gt_map), "Linker map mismatch" 101 | 102 | linker_map, _, _ = store_multimer_properties([sequences[2]], GLYCINE_LINKER) 103 | gt_map = torch.tensor([1] * 4) 104 | assert torch.all(linker_map == gt_map), "Linker map mismatch" 105 | 106 | 107 | def test_esmfold_no_glycine_linker(test_sequences, run_backend): 108 | """Test ESMFold no glycine linker.""" 109 | model = ESMFold( 110 | config={ 111 | "glycine_linker": "", 112 | } 113 | ) 114 | 115 | with enable_output(), app.run(): 116 | result = run_backend(model.fold)(test_sequences["multimer"]) 117 | 118 | assert result.positions is not None, "Positions should be generated" 119 | assert result.positions.shape[2] == len(test_sequences["multimer"].replace(":", "")), "Number of residues mismatch" 120 | 121 | assert result.residue_index is not None, "Residue index should be generated" 122 | assert result.plddt is not None, "pLDDT should be generated" 123 | assert result.ptm is not None, "pTM should be generated" 124 | 125 | # assert correct chain_indices 126 | assert np.all(result.chain_index[0] == np.array([0] * 54 + [1] * 54)), "Chain indices mismatch" 127 | assert np.all( 128 | result.residue_index[0] == np.concatenate([np.arange(0, 54), np.arange(0, 54)]) 129 | ), "Residue index mismatch" 130 | 131 | 132 | def test_esmfold_chain_indices(): 133 | """ 134 | Test ESMFold chain indices. 
def test_esmfold_batch(esmfold_model: ESMFold, test_sequences: dict[str, str], run_backend):
    """Test ESMFold batch prediction."""

    # Define input sequences
    sequences = [test_sequences["short"], test_sequences["medium"]]

    # Make prediction
    result = run_backend(esmfold_model.fold)(sequences)

    batch_size = len(sequences)
    max_seq_length = max(map(len, sequences))
    expected_positions = (8, batch_size, max_seq_length, 14, 3)
    assert result.positions.shape == expected_positions, (
        f"Position shape mismatch. "
        f"Expected: (8, {batch_size}, {max_seq_length}, 14, 3), Got: {result.positions.shape}"
    )

    # Check that batch outputs have correct sequence lengths
    for attr in ("aatype", "plddt", "ptm_logits", "predicted_aligned_error"):
        assert getattr(result, attr).shape[0] == batch_size, f"Batch size mismatch in {attr}"
Expected: (8, {len(sequences)}, {max_seq_length}, 14, 3), Got: {result.positions.shape}" 167 | 168 | # Check that batch outputs have correct sequence lengths 169 | assert result.aatype.shape[0] == len(sequences), "Batch size mismatch in aatype" 170 | assert result.plddt.shape[0] == len(sequences), "Batch size mismatch in plddt" 171 | assert result.ptm_logits.shape[0] == len(sequences), "Batch size mismatch in ptm_logits" 172 | assert result.predicted_aligned_error.shape[0] == len(sequences), "Batch size mismatch in predicted_aligned_error" 173 | 174 | 175 | # TODO: This is not obvious to do, given the way we wrap things around in Modal 176 | # This shows well how fragile relying on Modal is going to be moving forward, and we should think 177 | # of ways to make it more managable through local execution as well 178 | 179 | # def test_tokenize_sequences_with_mocker(mocker): 180 | # """Test tokenization of multimer sequences using pytest-mock.""" 181 | # from boileroom.esmfold import ESMFold 182 | 183 | # # Test data 184 | # sequences = ["AAAAAA:CCCCCCCCC", "CCCCC:DDDDDDD:EEEEEEE", "HHHH"] 185 | # GLYCINE_LINKER = "" 186 | # POSITION_IDS_SKIP = 512 187 | 188 | # # Create a model instance 189 | # model = ESMFold(config={"glycine_linker": GLYCINE_LINKER, "position_ids_skip": POSITION_IDS_SKIP}) 190 | 191 | # # Mock the tokenizer 192 | # mock_tokenizer = mocker.patch.object(model, 'tokenizer') 193 | # mock_tokenizer.return_value = { 194 | # "input_ids": torch.tensor([ 195 | # [1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, -1, -1, -1, -1], 196 | # [3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5], 197 | # [8, 8, 8, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1] 198 | # ]), 199 | # "attention_mask": torch.ones(3, 19), 200 | # } 201 | 202 | # # Call the method to test 203 | # tokenized_input = model._tokenize_sequences(sequences) 204 | 205 | # # Assert the tokenizer was called with the expected arguments 206 | # expected_sequences = [seq.replace(":", 
def test_sequence_validation(esmfold_model: ESMFold, test_sequences: dict[str, str], run_backend):
    """Test sequence validation in FoldingAlgorithm."""

    # Test single sequence: a bare string is normalized to a one-element list.
    single_seq = test_sequences["short"]
    validated = esmfold_model._validate_sequences(single_seq)
    assert isinstance(validated, list), "Single sequence should be converted to list"
    assert len(validated) == 1, "Should contain one sequence"
    assert validated[0] == single_seq, "Sequence should be unchanged"

    # Test sequence list: already-listed input passes through untouched.
    seq_list = [test_sequences["short"], test_sequences["medium"]]
    validated = esmfold_model._validate_sequences(seq_list)
    assert isinstance(validated, list), "Should return a list"
    assert len(validated) == 2, "Should contain two sequences"
    assert validated == seq_list, "Sequences should be unchanged"

    # Test invalid sequence: validation raises with a descriptive message.
    with pytest.raises(ValueError) as exc_info:
        esmfold_model._validate_sequences(test_sequences["invalid"])
    assert "Invalid amino acid" in str(exc_info.value), f"Expected 'Invalid amino acid', got {str(exc_info.value)}"

    # Test that fold method uses validation (same error surfaces via fold()).
    with pytest.raises(ValueError) as exc_info:
        run_backend(esmfold_model.fold)(test_sequences["invalid"])
    assert "Invalid amino acid" in str(exc_info.value), f"Expected 'Invalid amino acid', got {str(exc_info.value)}"


def test_esmfold_output_pdb_cif(data_dir: pathlib.Path, test_sequences: dict[str, str], run_backend):
    """Test ESMFold output PDB and CIF."""

    def recover_sequence(atomarray: AtomArray) -> str:
        # Rebuild the one-letter sequence from per-residue three-letter codes.
        unique_res_ids = np.unique(atomarray.res_id)
        three_letter_codes = [atomarray.res_name[atomarray.res_id == res_id][0] for res_id in unique_res_ids]
        one_letter_codes = [restype_3to1[three_letter_code] for three_letter_code in three_letter_codes]
        return "".join(one_letter_codes)

    with enable_output(), app.run():
        model = ESMFold(config={"output_pdb": True, "output_cif": False, "output_atomarray": True})
        # Define input sequences
        sequences = [test_sequences["short"], test_sequences["medium"]]
        result = run_backend(model.fold)(sequences)

    assert result.pdb is not None, "PDB output should be generated"
    assert result.cif is None, "CIF output should be None"
    assert len(result.pdb) == len(result.atom_array) == len(sequences) == 2, "Batching output match!"
    assert isinstance(result.pdb, list), "PDB output should be a list"
    assert len(result.pdb) == len(sequences), "PDB output should have same length as input sequences"
    assert isinstance(result.atom_array, list), "Atom array should be a list"
    assert isinstance(result.atom_array[0], AtomArray), "Atom array should be an AtomArray"

    # Round-trip the returned PDB strings through biotite for comparison.
    short_pdb = PDBFile.read(StringIO(result.pdb[0])).get_structure(model=1)
    medium_pdb = PDBFile.read(StringIO(result.pdb[1])).get_structure(model=1)
    short_atomarray = result.atom_array[0]
    medium_atomarray = result.atom_array[1]

    # Short protein checks
    num_residues = len(sequences[0])
    assert np.all(
        np.unique(short_atomarray.res_id) == np.arange(0, num_residues)
    ), "AtomArray residues should be 0-indexed"
    recovered_seq = recover_sequence(short_atomarray)
    assert recovered_seq == sequences[0], "Recovered sequence should be equal to the input sequence"
    assert np.all(np.unique(short_pdb.res_id) == np.arange(0, num_residues)), "Residues should be 0-indexed"
    # Compare coordinates with tolerance
    assert np.allclose(
        short_pdb.coord, short_atomarray.coord, atol=0.1
    ), "Atom coordinates should be equal within 0.1Å tolerance"
    # Compare other attributes exactly
    assert np.array_equal(short_pdb.chain_id, short_atomarray.chain_id), "Chain IDs should match exactly"
    assert np.array_equal(short_pdb.res_id, short_atomarray.res_id), "Residue IDs should match exactly"
    assert np.array_equal(short_pdb.res_name, short_atomarray.res_name), "Residue names should match exactly"
    assert np.array_equal(short_pdb.atom_name, short_atomarray.atom_name), "Atom names should match exactly"

    # Medium protein checks
    num_residues = len(sequences[1])
    assert np.all(
        np.unique(medium_atomarray.res_id) == np.arange(0, num_residues)
    ), "AtomArray residues should be 0-indexed"
    recovered_seq = recover_sequence(medium_atomarray)
    assert recovered_seq == sequences[1], "Recovered sequence should be equal to the input sequence"
    assert np.all(np.unique(medium_pdb.res_id) == np.arange(0, num_residues)), "Residues should be 0-indexed"

    # Compare coordinates with tolerance
    assert np.allclose(
        medium_pdb.coord, medium_atomarray.coord, atol=0.1
    ), "Atom coordinates should be equal within 0.1Å tolerance"
    # Compare other attributes exactly
    assert np.array_equal(medium_pdb.chain_id, medium_atomarray.chain_id), "Chain IDs should match exactly"
    assert np.array_equal(medium_pdb.res_id, medium_atomarray.res_id), "Residue IDs should match exactly"
    assert np.array_equal(medium_pdb.res_name, medium_atomarray.res_name), "Residue names should match exactly"
    assert np.array_equal(medium_pdb.atom_name, medium_atomarray.atom_name), "Atom names should match exactly"

    # Regression check against reference structures from the ESMFold server.
    short_pdbfile = PDBFile().read(data_dir / "esmfold_server_short.pdb")
    saved_short_pdb = short_pdbfile.get_structure(model=1)
    saved_short_bfactor = short_pdbfile.get_b_factor()
    rmsd_value = rmsd(short_pdb, saved_short_pdb)
    assert (
        rmsd_value < 1.5
    ), "PDB file should be almost equal to the saved ESMFold Server PDB file. Difference comes from HF vs. Meta implementation differences."

    medium_pdbfile = PDBFile().read(data_dir / "esmfold_server_medium.pdb")
    saved_medium_pdb = medium_pdbfile.get_structure(model=1)
    saved_medium_bfactor = medium_pdbfile.get_b_factor()
    rmsd_value = rmsd(medium_pdb, saved_medium_pdb)
    assert (
        rmsd_value < 1.5
    ), "PDB file should be almost equal to the saved ESMFold Server PDB file. Difference comes from HF vs. Meta implementation differences."

    # compare b-factor
    short_bfactor = short_atomarray.get_annotation("b_factor")
    medium_bfactor = medium_atomarray.get_annotation("b_factor")
    assert np.allclose(
        short_bfactor, saved_short_bfactor, atol=0.05
    ), "B-factor should match within a tolerance (HF vs. Meta)"
    assert np.allclose(
        medium_bfactor, saved_medium_bfactor, atol=0.05
    ), "B-factor should match within a tolerance (HF vs. Meta)"
| saved_short_bfactor = short_pdbfile.get_b_factor() 315 | rmsd_value = rmsd(short_pdb, saved_short_pdb) 316 | assert ( 317 | rmsd_value < 1.5 318 | ), "PDB file should be almost equal to the saved ESMFold Server PDB file. Difference comes from HF vs. Meta implementation differences." 319 | 320 | medium_pdbfile = PDBFile().read(data_dir / "esmfold_server_medium.pdb") 321 | saved_medium_pdb = medium_pdbfile.get_structure(model=1) 322 | saved_medium_bfactor = medium_pdbfile.get_b_factor() 323 | rmsd_value = rmsd(medium_pdb, saved_medium_pdb) 324 | assert ( 325 | rmsd_value < 1.5 326 | ), "PDB file should be almost equal to the saved ESMFold Server PDB file. Difference comes from HF vs. Meta implementation differences." 327 | 328 | # compare b-factor 329 | short_bfactor = short_atomarray.get_annotation("b_factor") 330 | medium_bfactor = medium_atomarray.get_annotation("b_factor") 331 | assert np.allclose( 332 | short_bfactor, saved_short_bfactor, atol=0.05 333 | ), "B-factor should match within a tolerance (HF vs. Meta)" 334 | assert np.allclose( 335 | medium_bfactor, saved_medium_bfactor, atol=0.05 336 | ), "B-factor should match within a tolerance (HF vs. Meta)" 337 | -------------------------------------------------------------------------------- /boileroom/models/esm/esmfold.py: -------------------------------------------------------------------------------- 1 | """ESMFold implementation for protein structure prediction using Meta AI's ESM-2 model.""" 2 | 3 | import os 4 | import logging 5 | from dataclasses import dataclass 6 | from typing import Optional, List, Union 7 | 8 | import modal 9 | import numpy as np 10 | from biotite.structure import AtomArray 11 | 12 | from ... 
# ESMFold-Specific: A list of atoms (excluding hydrogen) for each AA type. PDB naming convention.
# This might be re-used elsewhere, if OpenFold/AlphaFold or newer models use the same ordering convention.
# NOTE(review): atom names appear alphabetically ordered within each residue, not in atom14 order —
# confirm downstream consumers do not depend on the ordering before re-using this table.
RESIDUE_ATOMS: dict[str, list[str]] = {
    "ALA": ["C", "CA", "CB", "N", "O"],
    "ARG": ["C", "CA", "CB", "CG", "CD", "CZ", "N", "NE", "O", "NH1", "NH2"],
    "ASP": ["C", "CA", "CB", "CG", "N", "O", "OD1", "OD2"],
    "ASN": ["C", "CA", "CB", "CG", "N", "ND2", "O", "OD1"],
    "CYS": ["C", "CA", "CB", "N", "O", "SG"],
    "GLU": ["C", "CA", "CB", "CG", "CD", "N", "O", "OE1", "OE2"],
    "GLN": ["C", "CA", "CB", "CG", "CD", "N", "NE2", "O", "OE1"],
    "GLY": ["C", "CA", "N", "O"],
    "HIS": ["C", "CA", "CB", "CG", "CD2", "CE1", "N", "ND1", "NE2", "O"],
    "ILE": ["C", "CA", "CB", "CG1", "CG2", "CD1", "N", "O"],
    "LEU": ["C", "CA", "CB", "CG", "CD1", "CD2", "N", "O"],
    "LYS": ["C", "CA", "CB", "CG", "CD", "CE", "N", "NZ", "O"],
    "MET": ["C", "CA", "CB", "CG", "CE", "N", "O", "SD"],
    "PHE": ["C", "CA", "CB", "CG", "CD1", "CD2", "CE1", "CE2", "CZ", "N", "O"],
    "PRO": ["C", "CA", "CB", "CG", "CD", "N", "O"],
    "SER": ["C", "CA", "CB", "N", "O", "OG"],
    "THR": ["C", "CA", "CB", "CG2", "N", "O", "OG1"],
    "TRP": ["C", "CA", "CB", "CG", "CD1", "CD2", "CE2", "CE3", "CZ2", "CZ3", "CH2", "N", "NE1", "O"],
    "TYR": ["C", "CA", "CB", "CG", "CD1", "CD2", "CE1", "CE2", "CZ", "N", "O", "OH"],
    "VAL": ["C", "CA", "CB", "CG1", "CG2", "N", "O"],
}
    def always_no_grad_forward(self, seq_feats, pair_feats, true_aa, residx, mask, no_recycles):
        """Replacement for ``EsmFoldingTrunk.forward`` that runs every recycle under ``torch.no_grad()``.

        Inputs:
            seq_feats: B x L x C tensor of sequence features
            pair_feats: B x L x L x C tensor of pair features
            true_aa: B x L long tensor of amino-acid identities fed to the structure module
            residx: B x L long tensor giving the position in the sequence
            mask: B x L boolean tensor indicating valid residues
            no_recycles: number of recycling passes; ``None`` falls back to ``self.config.max_recycles``

        Output:
            structure dict from the structure module, augmented with the final "s_s" and "s_z" trunk states.
        """

        device = seq_feats.device
        s_s_0 = seq_feats
        s_z_0 = pair_feats

        if no_recycles is None:
            no_recycles = self.config.max_recycles
        else:
            if no_recycles < 0:
                raise ValueError("Number of recycles must not be negative.")
            no_recycles += 1  # First 'recycle' is just the standard forward pass through the model.

        def trunk_iter(s, z, residx, mask):
            # One pass through all trunk blocks, with pairwise positional embedding added to z.
            z = z + self.pairwise_positional_embedding(residx, mask=mask)

            for block in self.blocks:
                s, z = block(s, z, mask=mask, residue_index=residx, chunk_size=self.chunk_size)
            return s, z

        s_s = s_s_0
        s_z = s_z_0
        recycle_s = torch.zeros_like(s_s)
        recycle_z = torch.zeros_like(s_z)
        recycle_bins = torch.zeros(*s_z.shape[:-1], device=device, dtype=torch.int64)

        # NOTE(review): unlike the upstream trunk (which disables grad only for the
        # non-final recycles), this variant wraps the whole loop body in no_grad —
        # inference-only, as the function name suggests. Confirm against upstream.
        for recycle_idx in range(no_recycles):
            with torch.no_grad():
                # === Recycling ===
                recycle_s = self.recycle_s_norm(recycle_s.detach()).to(device)
                recycle_z = self.recycle_z_norm(recycle_z.detach()).to(device)
                recycle_z += self.recycle_disto(recycle_bins.detach()).to(device)

                s_s, s_z = trunk_iter(s_s_0 + recycle_s, s_z_0 + recycle_z, residx, mask)

                # === Structure module ===
                structure = self.structure_module(
                    {"single": self.trunk2sm_s(s_s), "pair": self.trunk2sm_z(s_z)},
                    true_aa,
                    mask.float(),
                )

                recycle_s = s_s
                recycle_z = s_z
                # Distogram needs the N, CA, C coordinates, and bin constants same as alphafold.
                recycle_bins = EsmFoldingTrunk.distogram(
                    structure["positions"][-1][:, :, :3],
                    3.375,
                    21.375,
                    self.recycle_bins,
                )

        structure["s_s"] = s_s
        structure["s_z"] = s_z

        return structure
    # Install the no-grad forward pass in place of the stock trunk forward.
    EsmFoldingTrunk.forward = always_no_grad_forward

# Set up basic logging configuration
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# TODO: turn this into a Pydantic model instead
@dataclass
class ESMFoldOutput(StructurePrediction):
    """Output from ESMFold prediction including all model outputs.

    Shapes below use: model_layer = structure-module layer, batch_size = number of input
    sequences, residue = (padded) residue count. Fields marked "???" were not verified
    against the model — treat those shape comments as assumptions to confirm.
    """

    # TODO: we should figure out what should be the verbosity of the output,
    # as a usual user does not need all of this information

    # Required by StructurePrediction protocol
    positions: np.ndarray  # (model_layer, batch_size, residue, atom=14, xyz=3)
    metadata: PredictionMetadata

    # Additional ESMFold-specific outputs
    frames: np.ndarray  # (model_layer, batch_size, residue, qxyz=7)
    sidechain_frames: np.ndarray  # (model_layer, batch_size, residue, 8, 4, 4) [rot matrix per sidechain]
    unnormalized_angles: np.ndarray  # (model_layer, batch_size, residue, 7, 2) [torsion angles]
    angles: np.ndarray  # (model_layer, batch_size, residue, 7, 2) [torsion angles]
    states: np.ndarray  # (model_layer, batch_size, residue, ???)
    s_s: np.ndarray  # (batch_size, residue, 1024)
    s_z: np.ndarray  # (batch_size, residue, residue, 128)
    distogram_logits: np.ndarray  # (batch_size, residue, residue, 64) ???
    lm_logits: np.ndarray  # (batch_size, residue, 23) ???
    aatype: np.ndarray  # (batch_size, residue) amino acid identity
    atom14_atom_exists: np.ndarray  # (batch_size, residue, atom=14)
    residx_atom14_to_atom37: np.ndarray  # (batch_size, residue, atom=14)
    residx_atom37_to_atom14: np.ndarray  # (batch_size, residue, atom=37)
    atom37_atom_exists: np.ndarray  # (batch_size, residue, atom=37)
    residue_index: np.ndarray  # (batch_size, residue)
    lddt_head: np.ndarray  # (model_layer, batch_size, residue, atom=37, 50) ??
    plddt: np.ndarray  # (batch_size, residue, atom=37)
    ptm_logits: np.ndarray  # (batch_size, residue, residue, 64) ???
    ptm: np.ndarray  # float # TODO: make it into a float when sending to the client
    aligned_confidence_probs: np.ndarray  # (batch_size, residue, residue, 64)
    predicted_aligned_error: np.ndarray  # (batch_size, residue, residue)
    max_predicted_aligned_error: np.ndarray  # float # TODO: make it into a float when sending to the client
    chain_index: np.ndarray  # (batch_size, residue)
    # TODO: maybe add this to the output to clearly indicate padded residues
    atom_array: Optional[AtomArray] = None  # 0-indexed
    pdb: Optional[list[str]] = None  # 0-indexed
    cif: Optional[list[str]] = None  # 0-indexed

    # TODO: can add a save method here (to a pickle and a pdb file) that can be run locally
    # TODO: add verification of the outputs, and primarily the shape of all the arrays
    # (see test_esmfold_batch_multimer_linkers for the exact batched shapes)
GPU_TO_USE = os.environ.get("BOILEROOM_GPU", "T4")

if GPU_TO_USE not in GPUS_AVAIL_ON_MODAL:
    raise ValueError(
        f"GPU specified in BOILEROOM_GPU environment variable ('{GPU_TO_USE}') not available on "
        f"Modal. Please choose from: {GPUS_AVAIL_ON_MODAL}"
    )


@app.cls(
    image=esm_image,
    gpu=GPU_TO_USE,
    timeout=20 * MINUTES,
    container_idle_timeout=10 * MINUTES,
    volumes={MODEL_DIR: model_weights},
)
class ESMFold(FoldingAlgorithm):
    """ESMFold protein structure prediction model."""

    # TODO: maybe this config should be input to the fold function, so that it can
    # changed programmatically on a single ephermal app, rather than re-creating the app?
    DEFAULT_CONFIG = {
        # ESMFold model config
        "output_pdb": False,
        "output_cif": False,
        "output_atomarray": False,
        # Chain linking and positioning config
        "glycine_linker": "",
        "position_ids_skip": 512,
    }

    # We need to properly assess whether using this or the original ESMFold is better
    # based on speed, accuracy, bugs, etc.; as well as customizability
    # For instance, if we want to also allow differently sized structure modules, than this would be good
    # TODO: we should add a settings dictionary or something, that would make it easier to add new options
    # TODO: maybe use OmegaConf instead to make it easier instead of config
    def __init__(self, config: Optional[dict] = None) -> None:
        """Initialize ESMFold.

        Args:
            config: Optional configuration overrides; presumably merged over
                ``DEFAULT_CONFIG`` by the base class — TODO confirm in FoldingAlgorithm.
        """
        # BUG FIX: the previous signature used a mutable default (`config: dict = {}`),
        # which is a single dict object shared by every instantiation — any mutation by
        # the base class would leak across instances. Use None as the sentinel instead;
        # passing an explicit dict (including {}) behaves exactly as before.
        super().__init__({} if config is None else config)
        self.metadata = self._initialize_metadata(
            model_name="ESMFold",
            model_version="v4.49.0",  # HuggingFace transformers version
        )
        self.model_dir: Optional[str] = os.environ.get("MODEL_DIR", MODEL_DIR)
        self.tokenizer: Optional[AutoTokenizer] = None
        self.model: Optional[EsmForProteinFolding] = None

    @modal.enter()
    def _initialize(self) -> None:
        """Initialize the model during container startup. This helps us determine whether we run locally or remotely."""
        self._load()
    def _load(self) -> None:
        """Load the ESMFold model and tokenizer.

        Idempotent: tokenizer and model are only fetched when still ``None``,
        so a warm container skips the (slow) weight download/deserialization.
        Weights are cached under ``self.model_dir`` (the mounted Modal volume).
        """
        if self.tokenizer is None:
            self.tokenizer = AutoTokenizer.from_pretrained("facebook/esmfold_v1", cache_dir=self.model_dir)
        if self.model is None:
            self.model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1", cache_dir=self.model_dir)
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
            self.model = self.model.to(self.device)
            self.model.eval()
            # Chunked trunk attention trades speed for a lower peak-memory footprint.
            self.model.trunk.set_chunk_size(64)
        self.ready = True

    @modal.method()
    def fold(self, sequences: Union[str, List[str]]) -> ESMFoldOutput:
        """Predict protein structure(s) using ESMFold.

        Args:
            sequences: One sequence or a list of sequences; chains within a
                multimer are separated by ":".

        Returns:
            ESMFoldOutput with positions, confidence metrics and optional
            PDB/CIF/AtomArray renderings (per the instance config).

        Raises:
            ValueError: if a sequence contains an invalid amino acid
                (via ``_validate_sequences``).
        """
        # Defensive: tolerate being called before @modal.enter ran (e.g. locally).
        if self.tokenizer is None or self.model is None:
            logger.warning("Model not loaded. Forcing the model to load... Next time call _load() first.")
            self._load()
            assert self.tokenizer is not None and self.model is not None, "Model not loaded"

        if isinstance(sequences, str):
            sequences = [sequences]

        sequences = self._validate_sequences(sequences)
        self.metadata.sequence_lengths = self._compute_sequence_lengths(sequences)

        tokenized_input, multimer_properties = self._tokenize_sequences(sequences)

        with Timer("Model Inference") as timer:
            with torch.inference_mode():
                outputs = self.model(**tokenized_input)

        outputs = self._convert_outputs(outputs, multimer_properties, timer.duration)
        return outputs

    def _tokenize_sequences(self, sequences: List[str]) -> tuple[dict, dict[str, torch.Tensor] | None]:
        """Tokenize sequences; multimers (":" present) get linker handling, monomers plain tokenization.

        Returns the tokenized batch (tensors moved to ``self.device``) and, for
        multimers, the linker/residue/chain bookkeeping dict (``None`` for monomers).
        """
        assert self.tokenizer is not None, "Tokenizer not loaded"
        if ":" in "".join(sequences):  # MULTIMER setting
            tokenized, multimer_properties = self._tokenize_multimer(sequences)
        else:  # MONOMER setting
            tokenized = self.tokenizer(
                sequences, return_tensors="pt", add_special_tokens=False, padding=True, truncation=True, max_length=1024
            )
            multimer_properties = None
        tokenized = {k: v.to(self.device) for k, v in tokenized.items()}

        return tokenized, multimer_properties
return_tensors="pt", add_special_tokens=False, padding=True, truncation=True, max_length=1024 261 | ) 262 | multimer_properties = None 263 | tokenized = {k: v.to(self.device) for k, v in tokenized.items()} 264 | 265 | return tokenized, multimer_properties 266 | 267 | def _tokenize_multimer(self, sequences: List[str]) -> torch.Tensor: 268 | assert self.tokenizer is not None, "Tokenizer not loaded" 269 | # Store multimer properties first 270 | linker_map, residue_index, chain_index = store_multimer_properties(sequences, self.config["glycine_linker"]) 271 | 272 | # Create tokenized input using list comprehension directly 273 | glycine_linker = self.config["glycine_linker"] 274 | tokenized = self.tokenizer( 275 | [seq.replace(":", glycine_linker) for seq in sequences], 276 | padding=True, 277 | truncation=True, 278 | return_tensors="pt", 279 | add_special_tokens=False, 280 | ) 281 | 282 | # Add position IDs 283 | tokenized["position_ids"] = compute_position_ids(sequences, glycine_linker, self.config["position_ids_skip"]) 284 | 285 | # Create attention mask (1 means keep, 0 means mask) 286 | # This also masks padding tokens, which are -1 287 | tokenized["attention_mask"] = (linker_map == 1).to(torch.int32) 288 | 289 | return tokenized, {"linker_map": linker_map, "residue_index": residue_index, "chain_index": chain_index} 290 | 291 | def _mask_linker_region( 292 | self, 293 | outputs: dict, 294 | linker_map: torch.Tensor, 295 | residue_index: torch.Tensor, 296 | chain_index: torch.Tensor, 297 | ) -> dict: 298 | """Mask the linker region in the outputs and track padding information. 299 | This includes all the metrics. 
300 | 301 | Args: 302 | outputs: Dictionary containing model outputs 303 | 304 | Returns: 305 | dict: Updated outputs with linker regions masked and padding information 306 | """ 307 | assert isinstance(linker_map, torch.Tensor), "linker_map must be a tensor" 308 | 309 | positions = [] 310 | frames = [] 311 | sidechain_frames = [] 312 | unnormalized_angles = [] 313 | angles = [] 314 | states = [] 315 | lddt_head = [] 316 | 317 | s_s = [] 318 | lm_logits = [] 319 | aatype = [] 320 | atom14_atom_exists = [] 321 | residx_atom14_to_atom37 = [] 322 | residx_atom37_to_atom14 = [] 323 | atom37_atom_exists = [] 324 | plddt = [] 325 | 326 | s_z = [] 327 | distogram_logits = [] 328 | ptm_logits = [] 329 | aligned_confidence_probs = [] 330 | predicted_aligned_error = [] 331 | 332 | _residue_index = [] 333 | _chain_index = [] 334 | 335 | for batch_idx, multimer in enumerate(linker_map): 336 | # Drop the -1 values, meaning 1s refer to residues we want to keep 337 | multimer = multimer.masked_fill(multimer == -1, 0).cpu().numpy() 338 | # Chain indices are the ones that were not masked, hence they were kept and are thus 1 339 | chain_indices = np.where(multimer == 1)[0] 340 | 341 | # 3rd dim is residue index 342 | positions.append(outputs["positions"][:, batch_idx, chain_indices]) 343 | frames.append(outputs["frames"][:, batch_idx, chain_indices]) 344 | sidechain_frames.append(outputs["sidechain_frames"][:, batch_idx, chain_indices]) 345 | unnormalized_angles.append(outputs["unnormalized_angles"][:, batch_idx, chain_indices]) 346 | angles.append(outputs["angles"][:, batch_idx, chain_indices]) 347 | states.append(outputs["states"][:, batch_idx, chain_indices]) 348 | lddt_head.append(outputs["lddt_head"][:, batch_idx, chain_indices]) 349 | 350 | # 2nd dim is residue index 351 | s_s.append(outputs["s_s"][batch_idx, chain_indices]) 352 | lm_logits.append(outputs["lm_logits"][batch_idx, chain_indices]) 353 | aatype.append(outputs["aatype"][batch_idx, chain_indices]) 354 | 
atom14_atom_exists.append(outputs["atom14_atom_exists"][batch_idx, chain_indices]) 355 | residx_atom14_to_atom37.append(outputs["residx_atom14_to_atom37"][batch_idx, chain_indices]) 356 | residx_atom37_to_atom14.append(outputs["residx_atom37_to_atom14"][batch_idx, chain_indices]) 357 | atom37_atom_exists.append(outputs["atom37_atom_exists"][batch_idx, chain_indices]) 358 | plddt.append(outputs["plddt"][batch_idx, chain_indices]) 359 | 360 | # 2D properties that are per residue pair; thus residues is both the 2nd and 3rd dim 361 | s_z.append(outputs["s_z"][batch_idx, chain_indices][:, chain_indices]) 362 | distogram_logits.append(outputs["distogram_logits"][batch_idx, chain_indices][:, chain_indices]) 363 | ptm_logits.append(outputs["ptm_logits"][batch_idx, chain_indices][:, chain_indices]) 364 | aligned_confidence_probs.append( 365 | outputs["aligned_confidence_probs"][batch_idx, chain_indices][:, chain_indices] 366 | ) 367 | predicted_aligned_error.append( 368 | outputs["predicted_aligned_error"][batch_idx, chain_indices][:, chain_indices] 369 | ) 370 | 371 | # Custom outputs, that also have 2nd dimension as residue index 372 | _residue_index.append(residue_index[batch_idx, chain_indices].cpu().numpy()) 373 | _chain_index.append(chain_index[batch_idx, chain_indices].cpu().numpy()) 374 | 375 | def pad_and_stack( 376 | arrays: list[np.ndarray], residue_dim: Union[int, List[int]], batch_dim: int, intermediate_dim: bool = False 377 | ) -> np.ndarray: 378 | """Pad arrays to match the largest size in the residue dimension and stack them in the batch dimension. 
379 | 380 | Args: 381 | arrays: List of NumPy arrays to pad and stack 382 | residue_dim: Dimension(s) to pad to match sizes 383 | batch_dim: Dimension to stack the arrays along 384 | intermediate_dim: Whether the array has an intermediate dimension to preserve 385 | 386 | Returns: 387 | Stacked and padded NumPy array 388 | """ 389 | if isinstance(residue_dim, int): 390 | max_size = max(arr.shape[residue_dim] for arr in arrays) 391 | padded_arrays = [] 392 | for arr in arrays: 393 | padding = [(0, 0)] * arr.ndim 394 | padding[residue_dim] = (0, max_size - arr.shape[residue_dim]) 395 | padded_arrays.append(np.pad(arr, padding, mode="constant", constant_values=-1)) 396 | elif isinstance(residue_dim, list): 397 | # Multi-dimension padding (e.g., for 2D matrices) 398 | max_sizes = [] 399 | for dim in residue_dim: 400 | max_sizes.append(max(arr.shape[dim] for arr in arrays)) 401 | 402 | padded_arrays = [] 403 | for arr in arrays: 404 | padding = [(0, 0)] * arr.ndim 405 | for dim, max_size in zip(residue_dim, max_sizes): 406 | padding[dim] = (0, max_size - arr.shape[dim]) 407 | padded_arrays.append(np.pad(arr, padding, mode="constant", constant_values=-1)) 408 | 409 | # Handle intermediate dimensions differently 410 | if intermediate_dim: 411 | # Stack along axis=1 to preserve intermediate dim as first dimension 412 | return np.stack(padded_arrays, axis=1) 413 | else: 414 | return np.stack(padded_arrays, axis=batch_dim) 415 | 416 | # 2nd dimension is the batch size, 3rd dimension was the residue index (without batch it's the 2nd dim) 417 | # These are not done same as below is because of getting the 8 intermediate outputs from StructureModule 418 | outputs["positions"] = pad_and_stack(positions, residue_dim=1, batch_dim=0, intermediate_dim=True) 419 | outputs["frames"] = pad_and_stack(frames, residue_dim=1, batch_dim=0, intermediate_dim=True) 420 | outputs["sidechain_frames"] = pad_and_stack(sidechain_frames, residue_dim=1, batch_dim=0, intermediate_dim=True) 421 | 
outputs["unnormalized_angles"] = pad_and_stack( 422 | unnormalized_angles, residue_dim=1, batch_dim=0, intermediate_dim=True 423 | ) 424 | outputs["angles"] = pad_and_stack(angles, residue_dim=1, batch_dim=0, intermediate_dim=True) 425 | outputs["states"] = pad_and_stack(states, residue_dim=1, batch_dim=0, intermediate_dim=True) 426 | outputs["lddt_head"] = pad_and_stack(lddt_head, residue_dim=1, batch_dim=0, intermediate_dim=True) 427 | 428 | # 1st dimension is the batch size, 2nd dimension was the residue index (without batch it's the 1st dim) 429 | outputs["s_s"] = pad_and_stack(s_s, residue_dim=0, batch_dim=0) 430 | outputs["lm_logits"] = pad_and_stack(lm_logits, residue_dim=0, batch_dim=0) 431 | outputs["aatype"] = pad_and_stack(aatype, residue_dim=0, batch_dim=0) 432 | outputs["atom14_atom_exists"] = pad_and_stack(atom14_atom_exists, residue_dim=0, batch_dim=0) 433 | outputs["residx_atom14_to_atom37"] = pad_and_stack(residx_atom14_to_atom37, residue_dim=0, batch_dim=0) 434 | outputs["residx_atom37_to_atom14"] = pad_and_stack(residx_atom37_to_atom14, residue_dim=0, batch_dim=0) 435 | outputs["atom37_atom_exists"] = pad_and_stack(atom37_atom_exists, residue_dim=0, batch_dim=0) 436 | outputs["plddt"] = pad_and_stack(plddt, residue_dim=0, batch_dim=0) 437 | 438 | # 2D properties, otherwise same as above 439 | outputs["s_z"] = pad_and_stack(s_z, residue_dim=[0, 1], batch_dim=0) 440 | outputs["distogram_logits"] = pad_and_stack(distogram_logits, residue_dim=[0, 1], batch_dim=0) 441 | outputs["ptm_logits"] = pad_and_stack(ptm_logits, residue_dim=[0, 1], batch_dim=0) 442 | outputs["aligned_confidence_probs"] = pad_and_stack(aligned_confidence_probs, residue_dim=[0, 1], batch_dim=0) 443 | outputs["predicted_aligned_error"] = pad_and_stack(predicted_aligned_error, residue_dim=[0, 1], batch_dim=0) 444 | 445 | # Custom 446 | outputs["chain_index"] = pad_and_stack(_chain_index, residue_dim=0, batch_dim=0) 447 | outputs["residue_index"] = pad_and_stack(_residue_index, 
residue_dim=0, batch_dim=0) 448 | 449 | return outputs 450 | 451 | def _convert_outputs( 452 | self, 453 | outputs: dict, 454 | multimer_properties: dict[str, torch.Tensor] | None, 455 | prediction_time: float, 456 | ) -> ESMFoldOutput: 457 | """Convert model outputs to ESMFoldOutput format.""" 458 | 459 | outputs = {k: v.cpu().numpy() for k, v in outputs.items()} 460 | if multimer_properties is not None: 461 | # TODO: maybe add a proper MULTIMER flag? 462 | outputs = self._mask_linker_region(outputs, **multimer_properties) 463 | else: # only MONOMERs 464 | outputs["chain_index"] = np.zeros(outputs["residue_index"].shape, dtype=np.int32) 465 | 466 | self.metadata.prediction_time = prediction_time 467 | 468 | atom_array = self._convert_outputs_to_atomarray(outputs) 469 | if self.config["output_pdb"]: 470 | outputs["pdb"] = self._convert_outputs_to_pdb(atom_array) 471 | if self.config["output_cif"]: 472 | outputs["cif"] = self._convert_outputs_to_cif(atom_array) 473 | if self.config["output_atomarray"]: 474 | outputs["atom_array"] = atom_array 475 | 476 | return ESMFoldOutput(metadata=self.metadata, **outputs) 477 | 478 | def _convert_outputs_to_atomarray(self, outputs: dict) -> AtomArray: 479 | """Convert ESMFold outputs to a Biotite AtomArray. 
480 | 481 | Args: 482 | outputs: Dictionary containing ESMFold model outputs 483 | 484 | Returns: 485 | AtomArray: Biotite structure representation 486 | """ 487 | from biotite.structure import Atom, array 488 | from transformers.models.esm.openfold_utils.feats import atom14_to_atom37 489 | from transformers.models.esm.openfold_utils.residue_constants import atom_types, restypes, restype_1to3 490 | 491 | # Convert atom14 to atom37 format 492 | atom_positions = atom14_to_atom37( 493 | outputs["positions"][-1], outputs 494 | ) # (model_layer, batch, residue, atom37, xyz) 495 | atom_mask = outputs["atom37_atom_exists"] # (batch, residue, atom37) 496 | 497 | assert len(atom_types) == atom_positions.shape[2] == 37, "Atom types must be 37" 498 | 499 | # Get batch and residue dimensions 500 | batch_size, n_residues, n_atoms = atom_mask.shape 501 | 502 | # Create list to store atoms 503 | arrays = [] 504 | 505 | # Process each protein in the batch 506 | for b in range(batch_size): 507 | atoms = [] # clear out the atoms list 508 | # Process each residue 509 | for res_idx in range(n_residues): 510 | # Get chain ID (convert numeric index to letter A-Z) 511 | chain_id = chr(65 + outputs["chain_index"][b, res_idx]) # A=65 in ASCII 512 | 513 | # Get residue name (3-letter code) 514 | aa_type = outputs["aatype"][b, res_idx] # id representing residue identity 515 | res_name = restypes[aa_type] # 1-letter residue identity 516 | res_name = restype_1to3[res_name] # 3-letter residue identity 517 | 518 | # Process each atom in the residue 519 | for atom_idx in range(n_atoms): # loops through all 37 atom types 520 | # Skip if atom doesn't exist 521 | if not atom_mask[b, res_idx, atom_idx]: 522 | continue 523 | 524 | # Get atom coordinates 525 | coord = atom_positions[b, res_idx, atom_idx] 526 | 527 | # Create Atom object 528 | atom = Atom( 529 | coord=coord, 530 | chain_id=chain_id, 531 | atom_name=atom_types[atom_idx], 532 | res_name=res_name, 533 | res_id=outputs["residue_index"][b, 
res_idx], # 0-indexed 534 | element=atom_types[atom_idx][0], 535 | # we only support C, N, O, S, [according to OpenFold Protein class] 536 | # element is thus the first character of any atom name (according to PDB nomenclature) 537 | b_factor=outputs["plddt"][b, res_idx, atom_idx], 538 | ) 539 | atoms.append(atom) 540 | arrays.append(array(atoms)) 541 | return arrays 542 | 543 | def _convert_outputs_to_pdb(self, atom_array: AtomArray) -> list[str]: 544 | # TODO: this might make more sense to do locally, instead of doing it on the Modal instance 545 | from biotite.structure.io.pdb import PDBFile, set_structure 546 | from io import StringIO 547 | 548 | pdbs = [] 549 | for a in atom_array: 550 | structure_file = PDBFile() 551 | set_structure(structure_file, a) 552 | string = StringIO() 553 | structure_file.write(string) 554 | pdbs.append(string.getvalue()) 555 | return pdbs 556 | 557 | def _convert_outputs_to_cif(self, atom_array: AtomArray) -> list[str]: 558 | # TODO: this might make more sense to do locally, instead of doing it on the Modal instance 559 | from biotite.structure.io.pdbx import CIFFile, set_structure 560 | from io import StringIO 561 | 562 | cifs = [] 563 | for a in atom_array: 564 | structure_file = CIFFile() 565 | set_structure(structure_file, a) 566 | string = StringIO() 567 | structure_file.write(string) 568 | cifs.append(string.getvalue()) 569 | return cifs 570 | 571 | 572 | def get_esmfold(gpu_type="T4", config: dict = {}): 573 | """ 574 | Note that the app will still show that's using T4, but the actual method / function call will use the correct GPU, 575 | and display accordingly in the Modal dashboard. 
576 | """ 577 | Model = ESMFold.with_options(gpu=gpu_type) # type: ignore 578 | return Model(config=config) 579 | -------------------------------------------------------------------------------- /tests/data/esmfold_server_medium.pdb: -------------------------------------------------------------------------------- 1 | HEADER 18-OCT-22 2 | TITLE ESMFOLD V1 PREDICTION FOR INPUT 3 | REMARK 1 4 | REMARK 1 REFERENCE 1 5 | REMARK 1 AUTH ZEMING LIN, HALIL AKIN, ROSHAN RAO, BRIAN HIE, ZHONGKAI ZHU, 6 | REMARK 1 AUTH 2 WENTING LU, NIKITA SMETANIN, ROBERT VERKUIL, ORI KABELI, 7 | REMARK 1 AUTH 3 YANIV SHMUELI, ALLAN DOS SANTOS COSTA, 8 | REMARK 1 AUTH 4 MARYAM FAZEL-ZARANDI, TOM SERCU, SALVATORE CANDIDO, 9 | REMARK 1 AUTH 5 ALEXANDER RIVES 10 | REMARK 1 TITL EVOLUTIONARY-SCALE PREDICTION OF ATOMIC LEVEL PROTEIN 11 | REMARK 1 TITL 2 STRUCTURE WITH A LANGUAGE MODEL 12 | REMARK 1 REF 13 | REMARK 1 REFN 14 | REMARK 1 PMID 15 | REMARK 1 DOI 10.1101/2022.07.20.500902 16 | REMARK 1 17 | REMARK 1 LICENSE AND DISCLAIMERS 18 | REMARK 1 ESM METAGENOMIC ATLAS DATA IS AVAILABLE UNDER 19 | REMARK 1 A CC-BY-4.0 LICENSE FOR ACADEMIC AND COMMERCIAL USE. 20 | REMARK 1 COPYRIGHT (C) META PLATFORMS, INC. ALL RIGHTS RESERVED. 21 | REMARK 1 USE OF THE ESM METAGENOMIC ATLAS DATA IS SUBJECT 22 | REMARK 1 TO THE META OPEN SOURCE TERMS OF USE AND PRIVACY POLICY. 
23 | ATOM 1 N MET A 1 26.052 18.992 -15.018 1.00 0.65 N 24 | ATOM 2 CA MET A 1 24.789 19.636 -14.668 1.00 0.65 C 25 | ATOM 3 C MET A 1 23.762 19.460 -15.781 1.00 0.66 C 26 | ATOM 4 CB MET A 1 25.004 21.124 -14.386 1.00 0.58 C 27 | ATOM 5 O MET A 1 22.557 19.439 -15.523 1.00 0.64 O 28 | ATOM 6 CG MET A 1 25.590 21.409 -13.012 1.00 0.55 C 29 | ATOM 7 SD MET A 1 25.304 23.139 -12.470 1.00 0.55 S 30 | ATOM 8 CE MET A 1 25.949 23.054 -10.777 1.00 0.50 C 31 | ATOM 9 N ALA A 2 24.292 19.398 -16.973 1.00 0.72 N 32 | ATOM 10 CA ALA A 2 23.414 19.330 -18.139 1.00 0.72 C 33 | ATOM 11 C ALA A 2 22.697 17.985 -18.208 1.00 0.72 C 34 | ATOM 12 CB ALA A 2 24.210 19.572 -19.419 1.00 0.66 C 35 | ATOM 13 O ALA A 2 21.561 17.903 -18.679 1.00 0.69 O 36 | ATOM 14 N LEU A 3 23.370 17.004 -17.718 1.00 0.80 N 37 | ATOM 15 CA LEU A 3 22.810 15.664 -17.858 1.00 0.79 C 38 | ATOM 16 C LEU A 3 21.563 15.503 -16.995 1.00 0.79 C 39 | ATOM 17 CB LEU A 3 23.849 14.607 -17.476 1.00 0.75 C 40 | ATOM 18 O LEU A 3 20.616 14.818 -17.388 1.00 0.76 O 41 | ATOM 19 CG LEU A 3 23.687 13.232 -18.126 1.00 0.68 C 42 | ATOM 20 CD1 LEU A 3 24.421 13.188 -19.462 1.00 0.62 C 43 | ATOM 21 CD2 LEU A 3 24.196 12.137 -17.194 1.00 0.62 C 44 | ATOM 22 N TRP A 4 21.538 16.162 -15.722 1.00 0.77 N 45 | ATOM 23 CA TRP A 4 20.392 16.073 -14.823 1.00 0.79 C 46 | ATOM 24 C TRP A 4 19.141 16.652 -15.476 1.00 0.76 C 47 | ATOM 25 CB TRP A 4 20.681 16.803 -13.509 1.00 0.72 C 48 | ATOM 26 O TRP A 4 18.042 16.119 -15.306 1.00 0.72 O 49 | ATOM 27 CG TRP A 4 21.608 16.063 -12.592 1.00 0.63 C 50 | ATOM 28 CD1 TRP A 4 22.958 16.241 -12.466 1.00 0.56 C 51 | ATOM 29 CD2 TRP A 4 21.253 15.023 -11.676 1.00 0.57 C 52 | ATOM 30 CE2 TRP A 4 22.439 14.615 -11.026 1.00 0.51 C 53 | ATOM 31 CE3 TRP A 4 20.046 14.395 -11.341 1.00 0.62 C 54 | ATOM 32 NE1 TRP A 4 23.464 15.373 -11.525 1.00 0.65 N 55 | ATOM 33 CH2 TRP A 4 21.258 13.010 -9.750 1.00 0.61 C 56 | ATOM 34 CZ2 TRP A 4 22.452 13.607 -10.059 1.00 0.66 C 57 | ATOM 35 CZ3 TRP A 4 20.061 13.392 
-10.378 1.00 0.60 C 58 | ATOM 36 N MET A 5 19.363 17.705 -16.222 1.00 0.81 N 59 | ATOM 37 CA MET A 5 18.210 18.391 -16.799 1.00 0.81 C 60 | ATOM 38 C MET A 5 17.573 17.550 -17.900 1.00 0.80 C 61 | ATOM 39 CB MET A 5 18.618 19.756 -17.354 1.00 0.77 C 62 | ATOM 40 O MET A 5 16.382 17.693 -18.185 1.00 0.77 O 63 | ATOM 41 CG MET A 5 18.812 20.820 -16.285 1.00 0.70 C 64 | ATOM 42 SD MET A 5 19.375 22.422 -16.980 1.00 0.67 S 65 | ATOM 43 CE MET A 5 18.971 23.529 -15.601 1.00 0.64 C 66 | ATOM 44 N ARG A 6 18.403 16.787 -18.427 1.00 0.79 N 67 | ATOM 45 CA ARG A 6 17.871 15.935 -19.486 1.00 0.79 C 68 | ATOM 46 C ARG A 6 17.163 14.716 -18.905 1.00 0.79 C 69 | ATOM 47 CB ARG A 6 18.988 15.491 -20.432 1.00 0.76 C 70 | ATOM 48 O ARG A 6 16.239 14.177 -19.518 1.00 0.77 O 71 | ATOM 49 CG ARG A 6 19.569 16.618 -21.271 1.00 0.73 C 72 | ATOM 50 CD ARG A 6 20.611 16.108 -22.257 1.00 0.71 C 73 | ATOM 51 NE ARG A 6 21.169 17.192 -23.060 1.00 0.65 N 74 | ATOM 52 NH1 ARG A 6 22.611 15.841 -24.257 1.00 0.60 N 75 | ATOM 53 NH2 ARG A 6 22.543 18.096 -24.663 1.00 0.58 N 76 | ATOM 54 CZ ARG A 6 22.107 17.041 -23.991 1.00 0.65 C 77 | ATOM 55 N LEU A 7 17.636 14.248 -17.747 1.00 0.81 N 78 | ATOM 56 CA LEU A 7 17.089 13.018 -17.184 1.00 0.81 C 79 | ATOM 57 C LEU A 7 15.776 13.291 -16.457 1.00 0.81 C 80 | ATOM 58 CB LEU A 7 18.093 12.375 -16.224 1.00 0.78 C 81 | ATOM 59 O LEU A 7 14.936 12.398 -16.329 1.00 0.78 O 82 | ATOM 60 CG LEU A 7 19.228 11.575 -16.866 1.00 0.72 C 83 | ATOM 61 CD1 LEU A 7 20.409 11.468 -15.907 1.00 0.66 C 84 | ATOM 62 CD2 LEU A 7 18.740 10.191 -17.279 1.00 0.67 C 85 | ATOM 63 N LEU A 8 15.576 14.517 -16.010 1.00 0.80 N 86 | ATOM 64 CA LEU A 8 14.412 14.837 -15.190 1.00 0.79 C 87 | ATOM 65 C LEU A 8 13.120 14.603 -15.965 1.00 0.80 C 88 | ATOM 66 CB LEU A 8 14.477 16.289 -14.710 1.00 0.76 C 89 | ATOM 67 O LEU A 8 12.179 13.999 -15.444 1.00 0.78 O 90 | ATOM 68 CG LEU A 8 15.119 16.523 -13.342 1.00 0.71 C 91 | ATOM 69 CD1 LEU A 8 15.693 17.933 -13.259 1.00 0.64 C 92 | ATOM 70 
CD2 LEU A 8 14.105 16.286 -12.228 1.00 0.65 C 93 | ATOM 71 N PRO A 9 13.063 15.189 -17.227 1.00 0.80 N 94 | ATOM 72 CA PRO A 9 11.797 14.925 -17.915 1.00 0.80 C 95 | ATOM 73 C PRO A 9 11.551 13.436 -18.147 1.00 0.80 C 96 | ATOM 74 CB PRO A 9 11.958 15.665 -19.245 1.00 0.78 C 97 | ATOM 75 O PRO A 9 10.399 12.999 -18.208 1.00 0.79 O 98 | ATOM 76 CG PRO A 9 13.354 16.199 -19.219 1.00 0.77 C 99 | ATOM 77 CD PRO A 9 13.973 15.852 -17.895 1.00 0.78 C 100 | ATOM 78 N LEU A 10 12.604 12.701 -18.345 1.00 0.79 N 101 | ATOM 79 CA LEU A 10 12.424 11.271 -18.567 1.00 0.78 C 102 | ATOM 80 C LEU A 10 11.899 10.586 -17.310 1.00 0.79 C 103 | ATOM 81 CB LEU A 10 13.744 10.626 -19.000 1.00 0.76 C 104 | ATOM 82 O LEU A 10 11.049 9.696 -17.392 1.00 0.77 O 105 | ATOM 83 CG LEU A 10 14.060 10.667 -20.496 1.00 0.72 C 106 | ATOM 84 CD1 LEU A 10 15.566 10.758 -20.717 1.00 0.67 C 107 | ATOM 85 CD2 LEU A 10 13.484 9.442 -21.198 1.00 0.67 C 108 | ATOM 86 N LEU A 11 12.426 10.939 -16.134 1.00 0.78 N 109 | ATOM 87 CA LEU A 11 11.946 10.378 -14.876 1.00 0.78 C 110 | ATOM 88 C LEU A 11 10.493 10.769 -14.626 1.00 0.78 C 111 | ATOM 89 CB LEU A 11 12.820 10.849 -13.711 1.00 0.76 C 112 | ATOM 90 O LEU A 11 9.724 9.987 -14.062 1.00 0.77 O 113 | ATOM 91 CG LEU A 11 14.179 10.161 -13.562 1.00 0.72 C 114 | ATOM 92 CD1 LEU A 11 15.106 11.003 -12.692 1.00 0.68 C 115 | ATOM 93 CD2 LEU A 11 14.009 8.764 -12.975 1.00 0.68 C 116 | ATOM 94 N ALA A 12 10.168 12.021 -14.929 1.00 0.80 N 117 | ATOM 95 CA ALA A 12 8.783 12.457 -14.771 1.00 0.79 C 118 | ATOM 96 C ALA A 12 7.841 11.614 -15.625 1.00 0.80 C 119 | ATOM 97 CB ALA A 12 8.646 13.934 -15.133 1.00 0.78 C 120 | ATOM 98 O ALA A 12 6.720 11.310 -15.209 1.00 0.78 O 121 | ATOM 99 N LEU A 13 8.367 11.285 -16.860 1.00 0.80 N 122 | ATOM 100 CA LEU A 13 7.550 10.464 -17.747 1.00 0.79 C 123 | ATOM 101 C LEU A 13 7.355 9.066 -17.168 1.00 0.79 C 124 | ATOM 102 CB LEU A 13 8.193 10.369 -19.133 1.00 0.77 C 125 | ATOM 103 O LEU A 13 6.274 8.485 -17.288 1.00 0.77 O 126 | ATOM 
104 CG LEU A 13 7.907 11.524 -20.094 1.00 0.73 C 127 | ATOM 105 CD1 LEU A 13 8.997 11.611 -21.157 1.00 0.67 C 128 | ATOM 106 CD2 LEU A 13 6.535 11.356 -20.739 1.00 0.68 C 129 | ATOM 107 N LEU A 14 8.371 8.519 -16.560 1.00 0.78 N 130 | ATOM 108 CA LEU A 14 8.260 7.183 -15.984 1.00 0.76 C 131 | ATOM 109 C LEU A 14 7.288 7.176 -14.809 1.00 0.76 C 132 | ATOM 110 CB LEU A 14 9.633 6.680 -15.529 1.00 0.74 C 133 | ATOM 111 O LEU A 14 6.598 6.181 -14.575 1.00 0.74 O 134 | ATOM 112 CG LEU A 14 10.542 6.110 -16.618 1.00 0.70 C 135 | ATOM 113 CD1 LEU A 14 12.004 6.214 -16.197 1.00 0.66 C 136 | ATOM 114 CD2 LEU A 14 10.170 4.663 -16.924 1.00 0.66 C 137 | ATOM 115 N ALA A 15 7.321 8.268 -14.050 1.00 0.76 N 138 | ATOM 116 CA ALA A 15 6.430 8.354 -12.896 1.00 0.75 C 139 | ATOM 117 C ALA A 15 4.970 8.414 -13.333 1.00 0.75 C 140 | ATOM 118 CB ALA A 15 6.778 9.572 -12.044 1.00 0.73 C 141 | ATOM 119 O ALA A 15 4.078 7.968 -12.606 1.00 0.74 O 142 | ATOM 120 N LEU A 16 4.789 9.020 -14.538 1.00 0.77 N 143 | ATOM 121 CA LEU A 16 3.418 9.204 -14.999 1.00 0.75 C 144 | ATOM 122 C LEU A 16 2.878 7.923 -15.626 1.00 0.75 C 145 | ATOM 123 CB LEU A 16 3.342 10.351 -16.010 1.00 0.73 C 146 | ATOM 124 O LEU A 16 1.667 7.693 -15.634 1.00 0.72 O 147 | ATOM 125 CG LEU A 16 3.569 11.759 -15.456 1.00 0.70 C 148 | ATOM 126 CD1 LEU A 16 3.738 12.756 -16.597 1.00 0.66 C 149 | ATOM 127 CD2 LEU A 16 2.415 12.169 -14.548 1.00 0.66 C 150 | ATOM 128 N TRP A 17 3.722 7.027 -16.161 1.00 0.74 N 151 | ATOM 129 CA TRP A 17 3.296 5.799 -16.824 1.00 0.75 C 152 | ATOM 130 C TRP A 17 3.432 4.601 -15.890 1.00 0.73 C 153 | ATOM 131 CB TRP A 17 4.113 5.564 -18.098 1.00 0.70 C 154 | ATOM 132 O TRP A 17 3.072 3.478 -16.254 1.00 0.68 O 155 | ATOM 133 CG TRP A 17 3.781 6.507 -19.216 1.00 0.63 C 156 | ATOM 134 CD1 TRP A 17 4.047 7.847 -19.271 1.00 0.57 C 157 | ATOM 135 CD2 TRP A 17 3.116 6.179 -20.440 1.00 0.59 C 158 | ATOM 136 CE2 TRP A 17 3.013 7.371 -21.191 1.00 0.55 C 159 | ATOM 137 CE3 TRP A 17 2.598 4.992 -20.975 1.00 
0.65 C 160 | ATOM 138 NE1 TRP A 17 3.588 8.372 -20.456 1.00 0.65 N 161 | ATOM 139 CH2 TRP A 17 1.912 6.234 -22.951 1.00 0.61 C 162 | ATOM 140 CZ2 TRP A 17 2.411 7.409 -22.451 1.00 0.65 C 163 | ATOM 141 CZ3 TRP A 17 1.999 5.033 -22.229 1.00 0.60 C 164 | ATOM 142 N GLY A 18 4.037 4.844 -14.696 1.00 0.69 N 165 | ATOM 143 CA GLY A 18 4.207 3.725 -13.783 1.00 0.67 C 166 | ATOM 144 C GLY A 18 2.892 3.168 -13.271 1.00 0.68 C 167 | ATOM 145 O GLY A 18 1.835 3.767 -13.478 1.00 0.64 O 168 | ATOM 146 N PRO A 19 2.745 1.860 -13.147 1.00 0.63 N 169 | ATOM 147 CA PRO A 19 1.523 1.249 -12.618 1.00 0.62 C 170 | ATOM 148 C PRO A 19 0.988 1.973 -11.384 1.00 0.63 C 171 | ATOM 149 CB PRO A 19 1.963 -0.174 -12.268 1.00 0.59 C 172 | ATOM 150 O PRO A 19 1.767 2.503 -10.588 1.00 0.61 O 173 | ATOM 151 CG PRO A 19 3.456 -0.115 -12.235 1.00 0.58 C 174 | ATOM 152 CD PRO A 19 3.902 1.107 -12.985 1.00 0.57 C 175 | ATOM 153 N ASP A 20 -0.137 2.514 -11.540 1.00 0.61 N 176 | ATOM 154 CA ASP A 20 -0.769 3.111 -10.368 1.00 0.60 C 177 | ATOM 155 C ASP A 20 -0.518 2.268 -9.120 1.00 0.61 C 178 | ATOM 156 CB ASP A 20 -2.273 3.279 -10.595 1.00 0.57 C 179 | ATOM 157 O ASP A 20 -0.774 1.062 -9.117 1.00 0.60 O 180 | ATOM 158 CG ASP A 20 -2.753 4.700 -10.359 1.00 0.55 C 181 | ATOM 159 OD1 ASP A 20 -2.071 5.463 -9.642 1.00 0.55 O 182 | ATOM 160 OD2 ASP A 20 -3.825 5.058 -10.892 1.00 0.56 O 183 | ATOM 161 N PRO A 21 0.428 2.770 -8.153 1.00 0.58 N 184 | ATOM 162 CA PRO A 21 0.552 1.983 -6.923 1.00 0.58 C 185 | ATOM 163 C PRO A 21 -0.798 1.527 -6.376 1.00 0.58 C 186 | ATOM 164 CB PRO A 21 1.236 2.950 -5.953 1.00 0.55 C 187 | ATOM 165 O PRO A 21 -0.882 0.489 -5.715 1.00 0.58 O 188 | ATOM 166 CG PRO A 21 1.236 4.266 -6.662 1.00 0.53 C 189 | ATOM 167 CD PRO A 21 0.822 4.040 -8.088 1.00 0.54 C 190 | ATOM 168 N ALA A 22 -1.842 2.398 -6.585 1.00 0.58 N 191 | ATOM 169 CA ALA A 22 -3.142 2.105 -5.987 1.00 0.58 C 192 | ATOM 170 C ALA A 22 -3.796 0.901 -6.659 1.00 0.58 C 193 | ATOM 171 CB ALA A 22 -4.057 3.324 -6.080 1.00 
0.55 C 194 | ATOM 172 O ALA A 22 -4.564 0.171 -6.028 1.00 0.56 O 195 | ATOM 173 N ALA A 23 -3.407 0.679 -7.903 1.00 0.57 N 196 | ATOM 174 CA ALA A 23 -4.023 -0.432 -8.624 1.00 0.57 C 197 | ATOM 175 C ALA A 23 -3.591 -1.773 -8.038 1.00 0.56 C 198 | ATOM 176 CB ALA A 23 -3.670 -0.364 -10.108 1.00 0.53 C 199 | ATOM 177 O ALA A 23 -4.345 -2.748 -8.081 1.00 0.55 O 200 | ATOM 178 N ALA A 24 -2.296 -1.821 -7.577 1.00 0.56 N 201 | ATOM 179 CA ALA A 24 -1.852 -3.088 -7.002 1.00 0.55 C 202 | ATOM 180 C ALA A 24 -2.667 -3.443 -5.762 1.00 0.56 C 203 | ATOM 181 CB ALA A 24 -0.366 -3.024 -6.657 1.00 0.53 C 204 | ATOM 182 O ALA A 24 -2.923 -4.620 -5.496 1.00 0.55 O 205 | ATOM 183 N PHE A 25 -3.261 -2.394 -5.054 1.00 0.58 N 206 | ATOM 184 CA PHE A 25 -3.992 -2.699 -3.830 1.00 0.58 C 207 | ATOM 185 C PHE A 25 -5.456 -2.994 -4.133 1.00 0.58 C 208 | ATOM 186 CB PHE A 25 -3.885 -1.538 -2.836 1.00 0.55 C 209 | ATOM 187 O PHE A 25 -6.139 -3.648 -3.343 1.00 0.57 O 210 | ATOM 188 CG PHE A 25 -2.652 -1.588 -1.975 1.00 0.53 C 211 | ATOM 189 CD1 PHE A 25 -2.611 -2.389 -0.841 1.00 0.50 C 212 | ATOM 190 CD2 PHE A 25 -1.533 -0.832 -2.300 1.00 0.51 C 213 | ATOM 191 CE1 PHE A 25 -1.471 -2.437 -0.042 1.00 0.50 C 214 | ATOM 192 CE2 PHE A 25 -0.390 -0.875 -1.507 1.00 0.51 C 215 | ATOM 193 CZ PHE A 25 -0.361 -1.677 -0.378 1.00 0.49 C 216 | ATOM 194 N VAL A 26 -5.891 -2.531 -5.255 1.00 0.58 N 217 | ATOM 195 CA VAL A 26 -7.337 -2.503 -5.449 1.00 0.57 C 218 | ATOM 196 C VAL A 26 -7.829 -3.886 -5.869 1.00 0.58 C 219 | ATOM 197 CB VAL A 26 -7.749 -1.449 -6.501 1.00 0.54 C 220 | ATOM 198 O VAL A 26 -8.984 -4.242 -5.623 1.00 0.57 O 221 | ATOM 199 CG1 VAL A 26 -9.269 -1.376 -6.627 1.00 0.49 C 222 | ATOM 200 CG2 VAL A 26 -7.174 -0.081 -6.137 1.00 0.50 C 223 | ATOM 201 N ASN A 27 -6.835 -4.868 -6.257 1.00 0.56 N 224 | ATOM 202 CA ASN A 27 -7.415 -6.146 -6.657 1.00 0.57 C 225 | ATOM 203 C ASN A 27 -7.242 -7.204 -5.572 1.00 0.56 C 226 | ATOM 204 CB ASN A 27 -6.798 -6.627 -7.972 1.00 0.52 C 227 | ATOM 205 O ASN A 
27 -7.445 -8.394 -5.821 1.00 0.54 O 228 | ATOM 206 CG ASN A 27 -7.627 -6.242 -9.182 1.00 0.50 C 229 | ATOM 207 ND2 ASN A 27 -7.010 -6.272 -10.357 1.00 0.45 N 230 | ATOM 208 OD1 ASN A 27 -8.811 -5.918 -9.061 1.00 0.50 O 231 | ATOM 209 N GLN A 28 -6.793 -6.747 -4.428 1.00 0.65 N 232 | ATOM 210 CA GLN A 28 -6.747 -7.783 -3.401 1.00 0.65 C 233 | ATOM 211 C GLN A 28 -8.020 -7.780 -2.560 1.00 0.65 C 234 | ATOM 212 CB GLN A 28 -5.524 -7.595 -2.503 1.00 0.62 C 235 | ATOM 213 O GLN A 28 -8.355 -6.771 -1.935 1.00 0.62 O 236 | ATOM 214 CG GLN A 28 -4.344 -8.482 -2.877 1.00 0.59 C 237 | ATOM 215 CD GLN A 28 -3.184 -8.356 -1.907 1.00 0.58 C 238 | ATOM 216 NE2 GLN A 28 -2.141 -9.150 -2.124 1.00 0.49 N 239 | ATOM 217 OE1 GLN A 28 -3.225 -7.552 -0.971 1.00 0.58 O 240 | ATOM 218 N HIS A 29 -9.039 -8.415 -2.963 1.00 0.64 N 241 | ATOM 219 CA HIS A 29 -10.237 -8.685 -2.175 1.00 0.64 C 242 | ATOM 220 C HIS A 29 -9.905 -9.498 -0.928 1.00 0.64 C 243 | ATOM 221 CB HIS A 29 -11.279 -9.420 -3.019 1.00 0.61 C 244 | ATOM 222 O HIS A 29 -9.259 -10.545 -1.019 1.00 0.62 O 245 | ATOM 223 CG HIS A 29 -11.923 -8.560 -4.060 1.00 0.59 C 246 | ATOM 224 CD2 HIS A 29 -11.791 -8.545 -5.407 1.00 0.55 C 247 | ATOM 225 ND1 HIS A 29 -12.825 -7.566 -3.749 1.00 0.53 N 248 | ATOM 226 CE1 HIS A 29 -13.222 -6.976 -4.865 1.00 0.53 C 249 | ATOM 227 NE2 HIS A 29 -12.610 -7.552 -5.885 1.00 0.50 N 250 | ATOM 228 N LEU A 30 -9.925 -8.826 0.194 1.00 0.71 N 251 | ATOM 229 CA LEU A 30 -9.866 -9.579 1.443 1.00 0.70 C 252 | ATOM 230 C LEU A 30 -11.227 -10.180 1.780 1.00 0.71 C 253 | ATOM 231 CB LEU A 30 -9.394 -8.680 2.589 1.00 0.68 C 254 | ATOM 232 O LEU A 30 -12.221 -9.458 1.885 1.00 0.68 O 255 | ATOM 233 CG LEU A 30 -7.962 -8.151 2.490 1.00 0.64 C 256 | ATOM 234 CD1 LEU A 30 -7.706 -7.111 3.575 1.00 0.61 C 257 | ATOM 235 CD2 LEU A 30 -6.961 -9.297 2.592 1.00 0.63 C 258 | ATOM 236 N CYS A 31 -11.548 -11.380 1.500 1.00 0.72 N 259 | ATOM 237 CA CYS A 31 -12.756 -12.157 1.757 1.00 0.72 C 260 | ATOM 238 C CYS A 31 -12.468 
-13.314 2.706 1.00 0.73 C 261 | ATOM 239 CB CYS A 31 -13.338 -12.691 0.449 1.00 0.69 C 262 | ATOM 240 O CYS A 31 -11.339 -13.805 2.769 1.00 0.70 O 263 | ATOM 241 SG CYS A 31 -13.889 -11.397 -0.684 1.00 0.70 S 264 | ATOM 242 N GLY A 32 -13.698 -13.594 3.630 1.00 0.74 N 265 | ATOM 243 CA GLY A 32 -13.641 -14.740 4.523 1.00 0.74 C 266 | ATOM 244 C GLY A 32 -12.600 -14.593 5.616 1.00 0.75 C 267 | ATOM 245 O GLY A 32 -12.557 -13.573 6.307 1.00 0.72 O 268 | ATOM 246 N SER A 33 -11.803 -15.731 5.786 1.00 0.76 N 269 | ATOM 247 CA SER A 33 -10.773 -15.896 6.807 1.00 0.76 C 270 | ATOM 248 C SER A 33 -9.647 -14.884 6.626 1.00 0.76 C 271 | ATOM 249 CB SER A 33 -10.204 -17.315 6.770 1.00 0.74 C 272 | ATOM 250 O SER A 33 -8.989 -14.500 7.595 1.00 0.74 O 273 | ATOM 251 OG SER A 33 -9.593 -17.580 5.519 1.00 0.67 O 274 | ATOM 252 N HIS A 34 -9.453 -14.504 5.377 1.00 0.76 N 275 | ATOM 253 CA HIS A 34 -8.368 -13.554 5.160 1.00 0.76 C 276 | ATOM 254 C HIS A 34 -8.733 -12.170 5.687 1.00 0.76 C 277 | ATOM 255 CB HIS A 34 -8.016 -13.473 3.673 1.00 0.73 C 278 | ATOM 256 O HIS A 34 -7.874 -11.449 6.198 1.00 0.73 O 279 | ATOM 257 CG HIS A 34 -7.359 -14.708 3.144 1.00 0.69 C 280 | ATOM 258 CD2 HIS A 34 -7.793 -15.637 2.260 1.00 0.66 C 281 | ATOM 259 ND1 HIS A 34 -6.097 -15.104 3.531 1.00 0.64 N 282 | ATOM 260 CE1 HIS A 34 -5.782 -16.225 2.905 1.00 0.63 C 283 | ATOM 261 NE2 HIS A 34 -6.795 -16.570 2.128 1.00 0.61 N 284 | ATOM 262 N LEU A 35 -10.027 -11.761 5.551 1.00 0.77 N 285 | ATOM 263 CA LEU A 35 -10.529 -10.540 6.171 1.00 0.76 C 286 | ATOM 264 C LEU A 35 -10.294 -10.559 7.678 1.00 0.77 C 287 | ATOM 265 CB LEU A 35 -12.022 -10.364 5.878 1.00 0.74 C 288 | ATOM 266 O LEU A 35 -9.861 -9.560 8.256 1.00 0.76 O 289 | ATOM 267 CG LEU A 35 -12.700 -9.153 6.521 1.00 0.70 C 290 | ATOM 268 CD1 LEU A 35 -12.071 -7.860 6.012 1.00 0.65 C 291 | ATOM 269 CD2 LEU A 35 -14.200 -9.170 6.242 1.00 0.65 C 292 | ATOM 270 N VAL A 36 -10.599 -11.795 8.230 1.00 0.78 N 293 | ATOM 271 CA VAL A 36 -10.488 -11.902 
9.681 1.00 0.77 C 294 | ATOM 272 C VAL A 36 -9.032 -11.719 10.103 1.00 0.78 C 295 | ATOM 273 CB VAL A 36 -11.025 -13.257 10.193 1.00 0.76 C 296 | ATOM 274 O VAL A 36 -8.745 -11.025 11.082 1.00 0.77 O 297 | ATOM 275 CG1 VAL A 36 -10.724 -13.429 11.681 1.00 0.70 C 298 | ATOM 276 CG2 VAL A 36 -12.526 -13.369 9.932 1.00 0.71 C 299 | ATOM 277 N GLU A 37 -8.181 -12.295 9.344 1.00 0.79 N 300 | ATOM 278 CA GLU A 37 -6.761 -12.183 9.666 1.00 0.79 C 301 | ATOM 279 C GLU A 37 -6.286 -10.737 9.570 1.00 0.79 C 302 | ATOM 280 CB GLU A 37 -5.928 -13.074 8.741 1.00 0.77 C 303 | ATOM 281 O GLU A 37 -5.507 -10.276 10.407 1.00 0.77 O 304 | ATOM 282 CG GLU A 37 -5.576 -14.427 9.342 1.00 0.71 C 305 | ATOM 283 CD GLU A 37 -4.825 -15.334 8.381 1.00 0.68 C 306 | ATOM 284 OE1 GLU A 37 -4.452 -14.873 7.278 1.00 0.65 O 307 | ATOM 285 OE2 GLU A 37 -4.607 -16.514 8.733 1.00 0.63 O 308 | ATOM 286 N ALA A 38 -6.713 -10.044 8.430 1.00 0.80 N 309 | ATOM 287 CA ALA A 38 -6.340 -8.642 8.261 1.00 0.80 C 310 | ATOM 288 C ALA A 38 -6.851 -7.794 9.422 1.00 0.80 C 311 | ATOM 289 CB ALA A 38 -6.878 -8.104 6.937 1.00 0.78 C 312 | ATOM 290 O ALA A 38 -6.160 -6.884 9.885 1.00 0.79 O 313 | ATOM 291 N LEU A 39 -8.106 -8.108 9.884 1.00 0.79 N 314 | ATOM 292 CA LEU A 39 -8.691 -7.357 10.990 1.00 0.79 C 315 | ATOM 293 C LEU A 39 -7.861 -7.524 12.259 1.00 0.80 C 316 | ATOM 294 CB LEU A 39 -10.130 -7.811 11.245 1.00 0.78 C 317 | ATOM 295 O LEU A 39 -7.699 -6.574 13.029 1.00 0.79 O 318 | ATOM 296 CG LEU A 39 -11.181 -7.328 10.244 1.00 0.75 C 319 | ATOM 297 CD1 LEU A 39 -12.509 -8.036 10.486 1.00 0.70 C 320 | ATOM 298 CD2 LEU A 39 -11.350 -5.815 10.335 1.00 0.71 C 321 | ATOM 299 N TYR A 40 -7.386 -8.776 12.413 1.00 0.79 N 322 | ATOM 300 CA TYR A 40 -6.564 -8.991 13.599 1.00 0.79 C 323 | ATOM 301 C TYR A 40 -5.293 -8.152 13.542 1.00 0.80 C 324 | ATOM 302 CB TYR A 40 -6.205 -10.473 13.742 1.00 0.77 C 325 | ATOM 303 O TYR A 40 -4.825 -7.653 14.567 1.00 0.78 O 326 | ATOM 304 CG TYR A 40 -7.159 -11.247 14.618 1.00 0.72 C 
327 | ATOM 305 CD1 TYR A 40 -7.198 -11.038 15.995 1.00 0.66 C 328 | ATOM 306 CD2 TYR A 40 -8.023 -12.190 14.072 1.00 0.66 C 329 | ATOM 307 CE1 TYR A 40 -8.075 -11.750 16.806 1.00 0.66 C 330 | ATOM 308 CE2 TYR A 40 -8.904 -12.908 14.874 1.00 0.65 C 331 | ATOM 309 OH TYR A 40 -9.793 -13.389 17.036 1.00 0.64 O 332 | ATOM 310 CZ TYR A 40 -8.923 -12.681 16.237 1.00 0.64 C 333 | ATOM 311 N LEU A 41 -4.722 -8.098 12.285 1.00 0.81 N 334 | ATOM 312 CA LEU A 41 -3.487 -7.340 12.116 1.00 0.81 C 335 | ATOM 313 C LEU A 41 -3.717 -5.858 12.388 1.00 0.81 C 336 | ATOM 314 CB LEU A 41 -2.929 -7.532 10.704 1.00 0.79 C 337 | ATOM 315 O LEU A 41 -2.871 -5.194 12.991 1.00 0.79 O 338 | ATOM 316 CG LEU A 41 -1.552 -8.191 10.602 1.00 0.72 C 339 | ATOM 317 CD1 LEU A 41 -1.571 -9.300 9.555 1.00 0.65 C 340 | ATOM 318 CD2 LEU A 41 -0.486 -7.153 10.269 1.00 0.66 C 341 | ATOM 319 N VAL A 42 -4.952 -5.275 11.875 1.00 0.80 N 342 | ATOM 320 CA VAL A 42 -5.263 -3.855 11.996 1.00 0.80 C 343 | ATOM 321 C VAL A 42 -5.748 -3.551 13.412 1.00 0.80 C 344 | ATOM 322 CB VAL A 42 -6.325 -3.416 10.963 1.00 0.78 C 345 | ATOM 323 O VAL A 42 -5.401 -2.516 13.985 1.00 0.79 O 346 | ATOM 324 CG1 VAL A 42 -6.747 -1.968 11.204 1.00 0.69 C 347 | ATOM 325 CG2 VAL A 42 -5.791 -3.591 9.543 1.00 0.68 C 348 | ATOM 326 N CYS A 43 -6.665 -4.513 13.786 1.00 0.79 N 349 | ATOM 327 CA CYS A 43 -7.303 -4.224 15.065 1.00 0.80 C 350 | ATOM 328 C CYS A 43 -6.416 -4.655 16.227 1.00 0.79 C 351 | ATOM 329 CB CYS A 43 -8.658 -4.924 15.158 1.00 0.77 C 352 | ATOM 330 O CYS A 43 -6.525 -4.116 17.329 1.00 0.75 O 353 | ATOM 331 SG CYS A 43 -9.921 -4.213 14.081 1.00 0.73 S 354 | ATOM 332 N GLY A 44 -5.288 -5.501 15.916 1.00 0.74 N 355 | ATOM 333 CA GLY A 44 -4.387 -5.949 16.966 1.00 0.73 C 356 | ATOM 334 C GLY A 44 -5.111 -6.438 18.206 1.00 0.73 C 357 | ATOM 335 O GLY A 44 -5.996 -7.291 18.119 1.00 0.67 O 358 | ATOM 336 N GLU A 45 -4.713 -5.736 19.408 1.00 0.69 N 359 | ATOM 337 CA GLU A 45 -5.200 -5.984 20.762 1.00 0.68 C 360 | ATOM 338 C GLU 
A 45 -6.615 -5.444 20.946 1.00 0.69 C 361 | ATOM 339 CB GLU A 45 -4.260 -5.360 21.795 1.00 0.63 C 362 | ATOM 340 O GLU A 45 -7.292 -5.781 21.920 1.00 0.66 O 363 | ATOM 341 CG GLU A 45 -2.895 -6.030 21.868 1.00 0.58 C 364 | ATOM 342 CD GLU A 45 -2.127 -5.690 23.136 1.00 0.57 C 365 | ATOM 343 OE1 GLU A 45 -2.600 -4.837 23.922 1.00 0.58 O 366 | ATOM 344 OE2 GLU A 45 -1.044 -6.280 23.345 1.00 0.52 O 367 | ATOM 345 N ARG A 46 -7.092 -4.661 20.125 1.00 0.76 N 368 | ATOM 346 CA ARG A 46 -8.378 -4.018 20.377 1.00 0.75 C 369 | ATOM 347 C ARG A 46 -9.533 -4.972 20.092 1.00 0.73 C 370 | ATOM 348 CB ARG A 46 -8.522 -2.753 19.528 1.00 0.70 C 371 | ATOM 349 O ARG A 46 -10.645 -4.775 20.587 1.00 0.68 O 372 | ATOM 350 CG ARG A 46 -7.580 -1.630 19.932 1.00 0.65 C 373 | ATOM 351 CD ARG A 46 -7.850 -0.356 19.142 1.00 0.64 C 374 | ATOM 352 NE ARG A 46 -6.809 0.644 19.361 1.00 0.51 N 375 | ATOM 353 NH1 ARG A 46 -7.757 2.233 17.976 1.00 0.39 N 376 | ATOM 354 NH2 ARG A 46 -5.793 2.683 19.071 1.00 0.34 N 377 | ATOM 355 CZ ARG A 46 -6.789 1.851 18.802 1.00 0.59 C 378 | ATOM 356 N GLY A 47 -9.129 -6.353 19.616 1.00 0.74 N 379 | ATOM 357 CA GLY A 47 -10.195 -7.295 19.313 1.00 0.73 C 380 | ATOM 358 C GLY A 47 -11.246 -6.726 18.379 1.00 0.73 C 381 | ATOM 359 O GLY A 47 -11.345 -5.508 18.215 1.00 0.69 O 382 | ATOM 360 N PHE A 48 -11.830 -7.174 17.396 1.00 0.74 N 383 | ATOM 361 CA PHE A 48 -12.941 -6.773 16.541 1.00 0.75 C 384 | ATOM 362 C PHE A 48 -14.019 -7.849 16.513 1.00 0.74 C 385 | ATOM 363 CB PHE A 48 -12.450 -6.485 15.119 1.00 0.71 C 386 | ATOM 364 O PHE A 48 -13.757 -9.007 16.846 1.00 0.69 O 387 | ATOM 365 CG PHE A 48 -11.813 -7.670 14.445 1.00 0.67 C 388 | ATOM 366 CD1 PHE A 48 -10.431 -7.814 14.421 1.00 0.63 C 389 | ATOM 367 CD2 PHE A 48 -12.597 -8.641 13.834 1.00 0.64 C 390 | ATOM 368 CE1 PHE A 48 -9.839 -8.910 13.798 1.00 0.60 C 391 | ATOM 369 CE2 PHE A 48 -12.012 -9.739 13.210 1.00 0.61 C 392 | ATOM 370 CZ PHE A 48 -10.633 -9.871 13.192 1.00 0.60 C 393 | ATOM 371 N PHE A 49 -15.284 
-7.565 16.604 1.00 0.72 N 394 | ATOM 372 CA PHE A 49 -16.459 -8.424 16.517 1.00 0.72 C 395 | ATOM 373 C PHE A 49 -16.824 -8.697 15.063 1.00 0.72 C 396 | ATOM 374 CB PHE A 49 -17.647 -7.788 17.246 1.00 0.68 C 397 | ATOM 375 O PHE A 49 -16.892 -7.773 14.250 1.00 0.68 O 398 | ATOM 376 CG PHE A 49 -18.878 -8.653 17.270 1.00 0.64 C 399 | ATOM 377 CD1 PHE A 49 -19.900 -8.458 16.348 1.00 0.60 C 400 | ATOM 378 CD2 PHE A 49 -19.014 -9.662 18.214 1.00 0.61 C 401 | ATOM 379 CE1 PHE A 49 -21.041 -9.258 16.368 1.00 0.57 C 402 | ATOM 380 CE2 PHE A 49 -20.151 -10.465 18.241 1.00 0.59 C 403 | ATOM 381 CZ PHE A 49 -21.163 -10.260 17.317 1.00 0.54 C 404 | ATOM 382 N TYR A 50 -16.792 -10.010 14.666 1.00 0.70 N 405 | ATOM 383 CA TYR A 50 -17.259 -10.411 13.344 1.00 0.70 C 406 | ATOM 384 C TYR A 50 -18.531 -11.246 13.445 1.00 0.71 C 407 | ATOM 385 CB TYR A 50 -16.174 -11.201 12.606 1.00 0.67 C 408 | ATOM 386 O TYR A 50 -18.576 -12.233 14.182 1.00 0.68 O 409 | ATOM 387 CG TYR A 50 -16.607 -11.711 11.254 1.00 0.63 C 410 | ATOM 388 CD1 TYR A 50 -16.923 -13.054 11.062 1.00 0.59 C 411 | ATOM 389 CD2 TYR A 50 -16.701 -10.851 10.164 1.00 0.60 C 412 | ATOM 390 CE1 TYR A 50 -17.322 -13.529 9.817 1.00 0.56 C 413 | ATOM 391 CE2 TYR A 50 -17.099 -11.315 8.914 1.00 0.58 C 414 | ATOM 392 OH TYR A 50 -17.800 -13.118 7.516 1.00 0.51 O 415 | ATOM 393 CZ TYR A 50 -17.407 -12.653 8.751 1.00 0.57 C 416 | ATOM 394 N THR A 51 -19.828 -10.760 12.997 1.00 0.71 N 417 | ATOM 395 CA THR A 51 -21.072 -11.507 12.841 1.00 0.71 C 418 | ATOM 396 C THR A 51 -21.209 -12.038 11.417 1.00 0.71 C 419 | ATOM 397 CB THR A 51 -22.294 -10.637 13.187 1.00 0.68 C 420 | ATOM 398 O THR A 51 -21.321 -11.261 10.467 1.00 0.68 O 421 | ATOM 399 CG2 THR A 51 -22.894 -11.040 14.530 1.00 0.62 C 422 | ATOM 400 OG1 THR A 51 -21.892 -9.263 13.249 1.00 0.65 O 423 | ATOM 401 N PRO A 52 -20.836 -13.373 11.219 1.00 0.69 N 424 | ATOM 402 CA PRO A 52 -21.056 -13.925 9.880 1.00 0.68 C 425 | ATOM 403 C PRO A 52 -22.496 -13.757 9.402 1.00 0.70 C 426 
| ATOM 404 CB PRO A 52 -20.705 -15.405 10.046 1.00 0.67 C 427 | ATOM 405 O PRO A 52 -23.428 -13.807 10.209 1.00 0.70 O 428 | ATOM 406 CG PRO A 52 -20.298 -15.543 11.478 1.00 0.65 C 429 | ATOM 407 CD PRO A 52 -20.517 -14.226 12.165 1.00 0.64 C 430 | ATOM 408 N LYS A 53 -22.707 -13.101 8.244 1.00 0.65 N 431 | ATOM 409 CA LYS A 53 -24.025 -13.035 7.619 1.00 0.65 C 432 | ATOM 410 C LYS A 53 -24.600 -14.432 7.403 1.00 0.66 C 433 | ATOM 411 CB LYS A 53 -23.950 -12.288 6.287 1.00 0.62 C 434 | ATOM 412 O LYS A 53 -23.877 -15.355 7.023 1.00 0.65 O 435 | ATOM 413 CG LYS A 53 -23.951 -10.772 6.427 1.00 0.60 C 436 | ATOM 414 CD LYS A 53 -24.045 -10.086 5.070 1.00 0.60 C 437 | ATOM 415 CE LYS A 53 -23.998 -8.570 5.205 1.00 0.52 C 438 | ATOM 416 NZ LYS A 53 -24.054 -7.893 3.875 1.00 0.45 N 439 | ATOM 417 N THR A 54 -25.606 -14.981 8.222 1.00 0.63 N 440 | ATOM 418 CA THR A 54 -26.393 -16.152 7.853 1.00 0.65 C 441 | ATOM 419 C THR A 54 -26.967 -15.997 6.447 1.00 0.62 C 442 | ATOM 420 CB THR A 54 -27.538 -16.394 8.854 1.00 0.59 C 443 | ATOM 421 O THR A 54 -27.291 -14.886 6.022 1.00 0.61 O 444 | ATOM 422 CG2 THR A 54 -27.025 -17.060 10.126 1.00 0.49 C 445 | ATOM 423 OG1 THR A 54 -28.140 -15.139 9.194 1.00 0.52 O --------------------------------------------------------------------------------