├── .gitignore ├── LICENSE ├── README.md ├── data_utils.py ├── get_model_params.sh ├── inputs ├── 1BC8.pdb ├── 2GFB.pdb ├── 4GYT.pdb ├── bias_AA_per_residue.json ├── bias_AA_per_residue_multi.json ├── fix_residues_multi.json ├── omit_AA_per_residue.json ├── omit_AA_per_residue_multi.json ├── pdb_ids.json └── redesigned_residues_multi.json ├── model_utils.py ├── openfold ├── __init__.py ├── config.py ├── data │ ├── __init__.py │ ├── data_modules.py │ ├── data_pipeline.py │ ├── data_transforms.py │ ├── errors.py │ ├── feature_pipeline.py │ ├── input_pipeline.py │ ├── mmcif_parsing.py │ ├── parsers.py │ ├── templates.py │ └── tools │ │ ├── __init__.py │ │ ├── hhblits.py │ │ ├── hhsearch.py │ │ ├── jackhmmer.py │ │ ├── kalign.py │ │ └── utils.py ├── np │ ├── __init__.py │ ├── protein.py │ ├── relax │ │ ├── __init__.py │ │ ├── amber_minimize.py │ │ ├── cleanup.py │ │ ├── relax.py │ │ └── utils.py │ └── residue_constants.py ├── resources │ └── __init__.py └── utils │ ├── feats.py │ ├── loss.py │ ├── rigid_utils.py │ └── tensor_utils.py ├── outputs ├── autoregressive_score_w_seq │ └── 1BC8_1.pt ├── autoregressive_score_wo_seq │ └── 1BC8_1.pt ├── batch_size │ ├── backbones │ │ ├── 1BC8_1.pdb │ │ ├── 1BC8_10.pdb │ │ ├── 1BC8_11.pdb │ │ ├── 1BC8_12.pdb │ │ ├── 1BC8_13.pdb │ │ ├── 1BC8_14.pdb │ │ ├── 1BC8_15.pdb │ │ ├── 1BC8_2.pdb │ │ ├── 1BC8_3.pdb │ │ ├── 1BC8_4.pdb │ │ ├── 1BC8_5.pdb │ │ ├── 1BC8_6.pdb │ │ ├── 1BC8_7.pdb │ │ ├── 1BC8_8.pdb │ │ └── 1BC8_9.pdb │ └── seqs │ │ └── 1BC8.fa ├── bias_AA_per_residue_multi │ ├── backbones │ │ ├── 1BC8_1.pdb │ │ └── 4GYT_1.pdb │ └── seqs │ │ ├── 1BC8.fa │ │ └── 4GYT.fa ├── chains_to_design │ ├── backbones │ │ └── 4GYT_1.pdb │ └── seqs │ │ └── 4GYT.fa ├── default │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── fasta_seq_separation │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── file_ending │ ├── backbones │ │ └── 1BC8_1_xyz.pdb │ └── seqs │ │ └── 1BC8_xyz.fa ├── fix_residues │ ├── backbones │ │ └── 
1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── fixed_residues_multi │ ├── backbones │ │ ├── 1BC8_1.pdb │ │ └── 4GYT_1.pdb │ └── seqs │ │ ├── 1BC8.fa │ │ └── 4GYT.fa ├── global_bias │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── global_label_membrane_mpnn_0 │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── global_omit │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── homooligomer │ ├── backbones │ │ ├── 4GYT_1.pdb │ │ └── 4GYT_2.pdb │ └── seqs │ │ └── 4GYT.fa ├── insertion_code │ ├── backbones │ │ └── 2GFB_1.pdb │ └── seqs │ │ └── 2GFB.fa ├── ligand_mpnn_cutoff_for_score │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── ligandmpnn_default │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── ligandmpnn_no_context │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── ligandmpnn_use_side_chain_atoms │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── ligandmpnn_v_32_005_25 │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── omit_AA_per_residue_multi │ ├── backbones │ │ ├── 1BC8_1.pdb │ │ └── 4GYT_1.pdb │ └── seqs │ │ ├── 1BC8.fa │ │ └── 4GYT.fa ├── parse_atoms_with_zero_occupancy │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── parse_these_chains_only │ ├── backbones │ │ └── 4GYT_1.pdb │ └── seqs │ │ └── 4GYT.fa ├── pdb_path_multi │ ├── backbones │ │ ├── 1BC8_1.pdb │ │ └── 4GYT_1.pdb │ └── seqs │ │ ├── 1BC8.fa │ │ └── 4GYT.fa ├── per_residue_bias │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── per_residue_label_membrane_mpnn_default │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── per_residue_omit │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── random_seed │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── redesign_residues │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── redesigned_residues_multi │ ├── backbones │ │ ├── 1BC8_1.pdb │ │ └── 4GYT_1.pdb │ └── 
seqs │ │ ├── 1BC8.fa │ │ └── 4GYT.fa ├── save_stats │ ├── backbones │ │ └── 1BC8_1.pdb │ ├── seqs │ │ └── 1BC8.fa │ └── stats │ │ └── 1BC8.pt ├── sc_default │ ├── backbones │ │ └── 1BC8_1.pdb │ ├── packed │ │ ├── 1BC8_packed_1_1.pdb │ │ ├── 1BC8_packed_1_2.pdb │ │ ├── 1BC8_packed_1_3.pdb │ │ └── 1BC8_packed_1_4.pdb │ └── seqs │ │ └── 1BC8.fa ├── sc_default_fast │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── sc_fixed_residues │ ├── backbones │ │ └── 1BC8_1.pdb │ ├── packed │ │ ├── 1BC8_packed_1_1.pdb │ │ ├── 1BC8_packed_1_2.pdb │ │ ├── 1BC8_packed_1_3.pdb │ │ └── 1BC8_packed_1_4.pdb │ └── seqs │ │ └── 1BC8.fa ├── sc_fixed_residues_full_repack │ ├── backbones │ │ └── 1BC8_1.pdb │ ├── packed │ │ ├── 1BC8_packed_1_1.pdb │ │ ├── 1BC8_packed_1_2.pdb │ │ ├── 1BC8_packed_1_3.pdb │ │ └── 1BC8_packed_1_4.pdb │ └── seqs │ │ └── 1BC8.fa ├── sc_no_context │ ├── backbones │ │ └── 1BC8_1.pdb │ ├── packed │ │ ├── 1BC8_packed_1_1.pdb │ │ ├── 1BC8_packed_1_2.pdb │ │ ├── 1BC8_packed_1_3.pdb │ │ └── 1BC8_packed_1_4.pdb │ └── seqs │ │ └── 1BC8.fa ├── single_aa_score_w_seq │ └── 1BC8_1.pt ├── single_aa_score_wo_seq │ └── 1BC8_1.pt ├── soluble_mpnn_default │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── symmetry │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── temperature │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── verbose │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa └── zero_indexed │ ├── backbones │ ├── 1BC8_0.pdb │ └── 1BC8_1.pdb │ └── seqs │ └── 1BC8.fa ├── requirements.txt ├── run.py ├── run_examples.sh ├── sc_examples.sh ├── sc_utils.py ├── score.py └── training ├── README.md ├── test_metal.json ├── test_nucleotide.json ├── test_small_molecule.json ├── train.json └── valid.json /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 
8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Justas Dauparas 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /get_model_params.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #make new directory for model parameters 4 | #e.g. bash get_model_params.sh "./model_params" 5 | 6 | mkdir -p $1 7 | 8 | #Original ProteinMPNN weights 9 | wget -q https://files.ipd.uw.edu/pub/ligandmpnn/proteinmpnn_v_48_002.pt -O $1"/proteinmpnn_v_48_002.pt" 10 | wget -q https://files.ipd.uw.edu/pub/ligandmpnn/proteinmpnn_v_48_010.pt -O $1"/proteinmpnn_v_48_010.pt" 11 | wget -q https://files.ipd.uw.edu/pub/ligandmpnn/proteinmpnn_v_48_020.pt -O $1"/proteinmpnn_v_48_020.pt" 12 | wget -q https://files.ipd.uw.edu/pub/ligandmpnn/proteinmpnn_v_48_030.pt -O $1"/proteinmpnn_v_48_030.pt" 13 | 14 | #ProteinMPNN with num_edges=32 15 | # wget -q https://files.ipd.uw.edu/pub/ligandmpnn/proteinmpnn_v_32_002.pt -O $1"/proteinmpnn_v_32_002.pt" 16 | # wget -q https://files.ipd.uw.edu/pub/ligandmpnn/proteinmpnn_v_32_010.pt -O $1"/proteinmpnn_v_32_010.pt" 17 | # wget -q https://files.ipd.uw.edu/pub/ligandmpnn/proteinmpnn_v_32_020.pt -O $1"/proteinmpnn_v_32_020.pt" 18 | # wget -q https://files.ipd.uw.edu/pub/ligandmpnn/proteinmpnn_v_32_030.pt -O $1"/proteinmpnn_v_32_030.pt" 19 | 20 | #LigandMPNN with num_edges=32; atom_context_num=25 21 | wget -q https://files.ipd.uw.edu/pub/ligandmpnn/ligandmpnn_v_32_005_25.pt -O $1"/ligandmpnn_v_32_005_25.pt" 22 | wget -q https://files.ipd.uw.edu/pub/ligandmpnn/ligandmpnn_v_32_010_25.pt -O $1"/ligandmpnn_v_32_010_25.pt" 23 | wget -q https://files.ipd.uw.edu/pub/ligandmpnn/ligandmpnn_v_32_020_25.pt -O $1"/ligandmpnn_v_32_020_25.pt" 24 | wget -q https://files.ipd.uw.edu/pub/ligandmpnn/ligandmpnn_v_32_030_25.pt -O $1"/ligandmpnn_v_32_030_25.pt" 25 | 26 | #LigandMPNN with num_edges=32; atom_context_num=16 27 | # wget -q https://files.ipd.uw.edu/pub/ligandmpnn/ligandmpnn_v_32_005_16.pt -O $1"/ligandmpnn_v_32_005_16.pt" 28 | # wget -q 
https://files.ipd.uw.edu/pub/ligandmpnn/ligandmpnn_v_32_010_16.pt -O $1"/ligandmpnn_v_32_010_16.pt" 29 | # wget -q https://files.ipd.uw.edu/pub/ligandmpnn/ligandmpnn_v_32_020_16.pt -O $1"/ligandmpnn_v_32_020_16.pt" 30 | # wget -q https://files.ipd.uw.edu/pub/ligandmpnn/ligandmpnn_v_32_030_16.pt -O $1"/ligandmpnn_v_32_030_16.pt" 31 | 32 | # wget -q https://files.ipd.uw.edu/pub/ligandmpnn/publication_version_ligandmpnn_v_32_010_25.pt -O $1"/publication_version_ligandmpnn_v_32_010_25.pt" 33 | 34 | #Per residue label membrane ProteinMPNN 35 | wget -q https://files.ipd.uw.edu/pub/ligandmpnn/per_residue_label_membrane_mpnn_v_48_020.pt -O $1"/per_residue_label_membrane_mpnn_v_48_020.pt" 36 | 37 | #Global label membrane ProteinMPNN 38 | wget -q https://files.ipd.uw.edu/pub/ligandmpnn/global_label_membrane_mpnn_v_48_020.pt -O $1"/global_label_membrane_mpnn_v_48_020.pt" 39 | 40 | #SolubleMPNN 41 | wget -q https://files.ipd.uw.edu/pub/ligandmpnn/solublempnn_v_48_002.pt -O $1"/solublempnn_v_48_002.pt" 42 | wget -q https://files.ipd.uw.edu/pub/ligandmpnn/solublempnn_v_48_010.pt -O $1"/solublempnn_v_48_010.pt" 43 | wget -q https://files.ipd.uw.edu/pub/ligandmpnn/solublempnn_v_48_020.pt -O $1"/solublempnn_v_48_020.pt" 44 | wget -q https://files.ipd.uw.edu/pub/ligandmpnn/solublempnn_v_48_030.pt -O $1"/solublempnn_v_48_030.pt" 45 | 46 | #LigandMPNN for side-chain packing (multi-step denoising model) 47 | wget -q https://files.ipd.uw.edu/pub/ligandmpnn/ligandmpnn_sc_v_32_002_16.pt -O $1"/ligandmpnn_sc_v_32_002_16.pt" 48 | -------------------------------------------------------------------------------- /inputs/bias_AA_per_residue.json: -------------------------------------------------------------------------------- 1 | { 2 | "C1": {"G": -0.3, "C": -2.0, "P": 10.8}, 3 | "C3": {"P": 10.0}, 4 | "C5": {"G": -1.3, "P": 10.0}, 5 | "C7": {"G": -1.3, "P": 10.0} 6 | } 7 | -------------------------------------------------------------------------------- /inputs/bias_AA_per_residue_multi.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "./inputs/1BC8.pdb": {"C1":{"A":3.0, "P":-2.0}, "C2":{"W":10.0, "G":-0.43}}, 3 | "./inputs/4GYT.pdb": {"A7":{"Y":5.0, "S":-2.0}, "A8":{"M":3.9, "G":-0.43}} 4 | } 5 | -------------------------------------------------------------------------------- /inputs/fix_residues_multi.json: -------------------------------------------------------------------------------- 1 | { 2 | "./inputs/1BC8.pdb": "C1 C2 C3 C4 C5 C10 C22", 3 | "./inputs/4GYT.pdb": "A7 A8 A9 A10 A11 A12 A13 B38" 4 | } 5 | -------------------------------------------------------------------------------- /inputs/omit_AA_per_residue.json: -------------------------------------------------------------------------------- 1 | { 2 | "C1": "ACDEFGHIKLMNPQRSTVW", 3 | "C3": "ACDEFGHIKLMNPQRSTVW", 4 | "C5": "ACDEFGHIKLMNPQRSTVW", 5 | "C7": "ACDEFGHIKLMNPQRSTVW" 6 | } 7 | -------------------------------------------------------------------------------- /inputs/omit_AA_per_residue_multi.json: -------------------------------------------------------------------------------- 1 | { 2 | "./inputs/1BC8.pdb": {"C1":"ACDEFGHILMNPQRSTVWY", "C2":"ACDEFGHILMNPQRSTVWY", "C3":"ACDEFGHILMNPQRSTVWY"}, 3 | "./inputs/4GYT.pdb": {"A7":"ACDEFGHILMNPQRSTVWY", "A8":"ACDEFGHILMNPQRSTVWY"} 4 | } 5 | -------------------------------------------------------------------------------- /inputs/pdb_ids.json: -------------------------------------------------------------------------------- 1 | { 2 | "./inputs/1BC8.pdb": "", 3 | "./inputs/4GYT.pdb": "" 4 | } 5 | -------------------------------------------------------------------------------- /inputs/redesigned_residues_multi.json: -------------------------------------------------------------------------------- 1 | { 2 | "./inputs/1BC8.pdb": "C1 C2 C3 C4 C5 C10", 3 | "./inputs/4GYT.pdb": "A7 A8 A9 A10 A12 A13 B38" 4 | } 5 | -------------------------------------------------------------------------------- 
# ======================================================================
# NOTE(review): this region of the dump interleaves two files. It is
# reconstructed below as properly formatted Python; the semantics are a
# verbatim transcription of the dumped content.
# ======================================================================

# ---- /openfold/data/__init__.py (next file header follows this block) ----

# ---- /openfold/__init__.py ----
# All package re-exports are deliberately disabled in this trimmed copy
# of openfold (only the subset needed by this project is vendored):
# from . import model
# from . import utils
# from . import np
# from . import resources
# __all__ = ["model", "utils", "np", "data", "resources"]

# ---- /openfold/config.py ----
import copy
import ml_collections as mlc


def set_inf(c, inf):
    """Recursively overwrite every key literally named "inf" in *c*.

    Used to lower the "infinity" constant across the whole config tree
    (e.g. for reduced-precision runs where 1e9 would overflow fp16).
    """
    for k, v in c.items():
        if isinstance(v, mlc.ConfigDict):
            set_inf(v, inf)
        elif k == "inf":
            c[k] = inf


def enforce_config_constraints(config):
    """Raise ValueError if mutually exclusive config flags are both set."""

    def string_to_setting(s):
        # Resolve a dotted path like "model.template.average_templates".
        path = s.split('.')
        setting = config
        for p in path:
            setting = setting[p]

        return setting

    mutually_exclusive_bools = [
        (
            "model.template.average_templates",
            "model.template.offload_templates"
        )
    ]

    for s1, s2 in mutually_exclusive_bools:
        s1_setting = string_to_setting(s1)
        s2_setting = string_to_setting(s2)
        if(s1_setting and s2_setting):
            raise ValueError(f"Only one of {s1} and {s2} may be set at a time")


def model_config(name, train=False, low_prec=False):
    """Return a deep copy of the base config specialized for preset *name*.

    Args:
        name: one of the AF2 presets ("initial_training", "finetuning",
            "finetuning_ptm", "model_1".."model_5", "model_1_ptm".."model_5_ptm").
        train: apply training-time settings (checkpointing, no chunking, ...).
        low_prec: loosen eps and cap "inf" values for low-precision runs.

    Raises:
        ValueError: for an unknown preset name, or (via
            enforce_config_constraints) for inconsistent flag combinations.
    """
    c = copy.deepcopy(config)
    if name == "initial_training":
        # AF2 Suppl. Table 4, "initial training" setting
        pass
    elif name == "finetuning":
        # AF2 Suppl. Table 4, "finetuning" setting
        c.data.train.max_extra_msa = 5120
        c.data.train.crop_size = 384
        c.data.train.max_msa_clusters = 512
        c.loss.violation.weight = 1.
        c.loss.experimentally_resolved.weight = 0.01
    elif name == "finetuning_ptm":
        c.data.train.max_extra_msa = 5120
        c.data.train.crop_size = 384
        c.data.train.max_msa_clusters = 512
        c.loss.violation.weight = 1.
        c.loss.experimentally_resolved.weight = 0.01
        c.model.heads.tm.enabled = True
        c.loss.tm.weight = 0.1
    elif name == "model_1":
        # AF2 Suppl. Table 5, Model 1.1.1
        c.data.train.max_extra_msa = 5120
        c.data.predict.max_extra_msa = 5120
        # NOTE(review): the dict below declares
        # "reduce_msa_clusters_by_max_templates"; this attribute name differs
        # ("max" vs "msa"), so this assignment adds a NEW key rather than
        # overriding the declared one — confirm against upstream openfold.
        c.data.common.reduce_max_clusters_by_max_templates = True
        c.data.common.use_templates = True
        c.data.common.use_template_torsion_angles = True
        c.model.template.enabled = True
    elif name == "model_2":
        # AF2 Suppl. Table 5, Model 1.1.2
        c.data.common.reduce_max_clusters_by_max_templates = True
        c.data.common.use_templates = True
        c.data.common.use_template_torsion_angles = True
        c.model.template.enabled = True
    elif name == "model_3":
        # AF2 Suppl. Table 5, Model 1.2.1
        c.data.train.max_extra_msa = 5120
        c.data.predict.max_extra_msa = 5120
        c.model.template.enabled = False
    elif name == "model_4":
        # AF2 Suppl. Table 5, Model 1.2.2
        c.data.train.max_extra_msa = 5120
        c.data.predict.max_extra_msa = 5120
        c.model.template.enabled = False
    elif name == "model_5":
        # AF2 Suppl. Table 5, Model 1.2.3
        c.model.template.enabled = False
    elif name == "model_1_ptm":
        c.data.train.max_extra_msa = 5120
        c.data.predict.max_extra_msa = 5120
        c.data.common.reduce_max_clusters_by_max_templates = True
        c.data.common.use_templates = True
        c.data.common.use_template_torsion_angles = True
        c.model.template.enabled = True
        c.model.heads.tm.enabled = True
        c.loss.tm.weight = 0.1
    elif name == "model_2_ptm":
        c.data.common.reduce_max_clusters_by_max_templates = True
        c.data.common.use_templates = True
        c.data.common.use_template_torsion_angles = True
        c.model.template.enabled = True
        c.model.heads.tm.enabled = True
        c.loss.tm.weight = 0.1
    elif name == "model_3_ptm":
        c.data.train.max_extra_msa = 5120
        c.data.predict.max_extra_msa = 5120
        c.model.template.enabled = False
        c.model.heads.tm.enabled = True
        c.loss.tm.weight = 0.1
    elif name == "model_4_ptm":
        c.data.train.max_extra_msa = 5120
        c.data.predict.max_extra_msa = 5120
        c.model.template.enabled = False
        c.model.heads.tm.enabled = True
        c.loss.tm.weight = 0.1
    elif name == "model_5_ptm":
        c.model.template.enabled = False
        c.model.heads.tm.enabled = True
        c.loss.tm.weight = 0.1
    else:
        raise ValueError("Invalid model name")

    if train:
        c.globals.blocks_per_ckpt = 1
        c.globals.chunk_size = None
        c.globals.use_lma = False
        c.globals.offload_inference = False
        c.model.template.average_templates = False
        c.model.template.offload_templates = False
    if low_prec:
        c.globals.eps = 1e-4
        # If we want exact numerical parity with the original, inf can't be
        # a global constant
        set_inf(c, 1e4)

    enforce_config_constraints(c)

    return c


# Recurring FieldReferences: changing one of these updates every config
# entry that references it.
c_z = mlc.FieldReference(128, field_type=int)
c_m = mlc.FieldReference(256, field_type=int)
c_t = mlc.FieldReference(64, field_type=int)
c_e = mlc.FieldReference(64, field_type=int)
c_s = mlc.FieldReference(384, field_type=int)
blocks_per_ckpt = mlc.FieldReference(None, field_type=int)
chunk_size = mlc.FieldReference(4, field_type=int)
aux_distogram_bins = mlc.FieldReference(64, field_type=int)
tm_enabled = mlc.FieldReference(False, field_type=bool)
eps = mlc.FieldReference(1e-8, field_type=float)
templates_enabled = mlc.FieldReference(True, field_type=bool)
embed_template_torsion_angles = mlc.FieldReference(True, field_type=bool)
tune_chunk_size = mlc.FieldReference(True, field_type=bool)

# Placeholder strings that stand in for the dynamic dimensions of each
# feature's shape (resolved at data-pipeline time).
NUM_RES = "num residues placeholder"
NUM_MSA_SEQ = "msa placeholder"
NUM_EXTRA_SEQ = "extra msa placeholder"
NUM_TEMPLATES = "num templates placeholder"

config = mlc.ConfigDict(
    {
        "data": {
            "common": {
                "feat": {
                    "aatype": [NUM_RES],
                    "all_atom_mask": [NUM_RES, None],
                    "all_atom_positions": [NUM_RES, None, None],
                    "alt_chi_angles": [NUM_RES, None],
                    "atom14_alt_gt_exists": [NUM_RES, None],
                    "atom14_alt_gt_positions": [NUM_RES, None, None],
                    "atom14_atom_exists": [NUM_RES, None],
                    "atom14_atom_is_ambiguous": [NUM_RES, None],
                    "atom14_gt_exists": [NUM_RES, None],
                    "atom14_gt_positions": [NUM_RES, None, None],
                    "atom37_atom_exists": [NUM_RES, None],
                    "backbone_rigid_mask": [NUM_RES],
                    "backbone_rigid_tensor": [NUM_RES, None, None],
                    "bert_mask": [NUM_MSA_SEQ, NUM_RES],
                    "chi_angles_sin_cos": [NUM_RES, None, None],
                    "chi_mask": [NUM_RES, None],
                    "extra_deletion_value": [NUM_EXTRA_SEQ, NUM_RES],
                    "extra_has_deletion": [NUM_EXTRA_SEQ, NUM_RES],
                    "extra_msa": [NUM_EXTRA_SEQ, NUM_RES],
                    "extra_msa_mask": [NUM_EXTRA_SEQ, NUM_RES],
                    "extra_msa_row_mask": [NUM_EXTRA_SEQ],
                    "is_distillation": [],
                    "msa_feat": [NUM_MSA_SEQ, NUM_RES, None],
                    "msa_mask": [NUM_MSA_SEQ, NUM_RES],
                    "msa_row_mask": [NUM_MSA_SEQ],
                    "no_recycling_iters": [],
                    "pseudo_beta": [NUM_RES, None],
                    "pseudo_beta_mask": [NUM_RES],
                    "residue_index": [NUM_RES],
                    "residx_atom14_to_atom37": [NUM_RES, None],
                    "residx_atom37_to_atom14": [NUM_RES, None],
                    "resolution": [],
                    "rigidgroups_alt_gt_frames": [NUM_RES, None, None, None],
                    "rigidgroups_group_exists": [NUM_RES, None],
                    "rigidgroups_group_is_ambiguous": [NUM_RES, None],
                    "rigidgroups_gt_exists": [NUM_RES, None],
                    "rigidgroups_gt_frames": [NUM_RES, None, None, None],
                    "seq_length": [],
                    "seq_mask": [NUM_RES],
                    "target_feat": [NUM_RES, None],
                    "template_aatype": [NUM_TEMPLATES, NUM_RES],
                    "template_all_atom_mask": [NUM_TEMPLATES, NUM_RES, None],
                    "template_all_atom_positions": [
                        NUM_TEMPLATES, NUM_RES, None, None,
                    ],
                    "template_alt_torsion_angles_sin_cos": [
                        NUM_TEMPLATES, NUM_RES, None, None,
                    ],
                    "template_backbone_rigid_mask": [NUM_TEMPLATES, NUM_RES],
                    "template_backbone_rigid_tensor": [
                        NUM_TEMPLATES, NUM_RES, None, None,
                    ],
                    "template_mask": [NUM_TEMPLATES],
                    "template_pseudo_beta": [NUM_TEMPLATES, NUM_RES, None],
                    "template_pseudo_beta_mask": [NUM_TEMPLATES, NUM_RES],
                    "template_sum_probs": [NUM_TEMPLATES, None],
                    "template_torsion_angles_mask": [
                        NUM_TEMPLATES, NUM_RES, None,
                    ],
                    "template_torsion_angles_sin_cos": [
                        NUM_TEMPLATES, NUM_RES, None, None,
                    ],
                    "true_msa": [NUM_MSA_SEQ, NUM_RES],
                    "use_clamped_fape": [],
                },
                "masked_msa": {
                    "profile_prob": 0.1,
                    "same_prob": 0.1,
                    "uniform_prob": 0.1,
                },
                "max_recycling_iters": 3,
                "msa_cluster_features": True,
                "reduce_msa_clusters_by_max_templates": False,
                "resample_msa_in_recycling": True,
                "template_features": [
                    "template_all_atom_positions",
                    "template_sum_probs",
                    "template_aatype",
                    "template_all_atom_mask",
                ],
                "unsupervised_features": [
                    "aatype",
                    "residue_index",
                    "msa",
                    "num_alignments",
                    "seq_length",
                    "between_segment_residues",
                    "deletion_matrix",
                    "no_recycling_iters",
                ],
                "use_templates": templates_enabled,
                "use_template_torsion_angles": embed_template_torsion_angles,
            },
            "supervised": {
                "clamp_prob": 0.9,
                "supervised_features": [
                    "all_atom_mask",
                    "all_atom_positions",
                    "resolution",
                    "use_clamped_fape",
                    "is_distillation",
                ],
            },
            "predict": {
                "fixed_size": True,
                "subsample_templates": False,  # We want top templates.
                "masked_msa_replace_fraction": 0.15,
                "max_msa_clusters": 512,
                "max_extra_msa": 1024,
                "max_template_hits": 4,
                "max_templates": 4,
                "crop": False,
                "crop_size": None,
                "supervised": False,
                "uniform_recycling": False,
            },
            "eval": {
                "fixed_size": True,
                "subsample_templates": False,  # We want top templates.
                "masked_msa_replace_fraction": 0.15,
                "max_msa_clusters": 128,
                "max_extra_msa": 1024,
                "max_template_hits": 4,
                "max_templates": 4,
                "crop": False,
                "crop_size": None,
                "supervised": True,
                "uniform_recycling": False,
            },
            "train": {
                "fixed_size": True,
                "subsample_templates": True,
                "masked_msa_replace_fraction": 0.15,
                "max_msa_clusters": 128,
                "max_extra_msa": 1024,
                "max_template_hits": 4,
                "max_templates": 4,
                "shuffle_top_k_prefiltered": 20,
                "crop": True,
                "crop_size": 256,
                "supervised": True,
                "clamp_prob": 0.9,
                "max_distillation_msa_clusters": 1000,
                "uniform_recycling": True,
                "distillation_prob": 0.75,
            },
            "data_module": {
                "use_small_bfd": False,
                "data_loaders": {
                    "batch_size": 1,
                    "num_workers": 16,
                },
            },
        },
        # Recurring FieldReferences that can be changed globally here
        "globals": {
            "blocks_per_ckpt": blocks_per_ckpt,
            "chunk_size": chunk_size,
            "use_lma": False,
            "offload_inference": False,
            "c_z": c_z,
            "c_m": c_m,
            "c_t": c_t,
            "c_e": c_e,
            "c_s": c_s,
            "eps": eps,
        },
        "model": {
            "_mask_trans": False,
            "input_embedder": {
                "tf_dim": 22,
                "msa_dim": 49,
                "c_z": c_z,
                "c_m": c_m,
                "relpos_k": 32,
            },
            "recycling_embedder": {
                "c_z": c_z,
                "c_m": c_m,
                "min_bin": 3.25,
                "max_bin": 20.75,
                "no_bins": 15,
                "inf": 1e8,
            },
            "template": {
                "distogram": {
                    "min_bin": 3.25,
                    "max_bin": 50.75,
                    "no_bins": 39,
                },
                "template_angle_embedder": {
                    # DISCREPANCY: c_in is supposed to be 51.
                    "c_in": 57,
                    "c_out": c_m,
                },
                "template_pair_embedder": {
                    "c_in": 88,
                    "c_out": c_t,
                },
                "template_pair_stack": {
                    "c_t": c_t,
                    # DISCREPANCY: c_hidden_tri_att here is given in the supplement
                    # as 64. In the code, it's 16.
                    "c_hidden_tri_att": 16,
                    "c_hidden_tri_mul": 64,
                    "no_blocks": 2,
                    "no_heads": 4,
                    "pair_transition_n": 2,
                    "dropout_rate": 0.25,
                    "blocks_per_ckpt": blocks_per_ckpt,
                    "tune_chunk_size": tune_chunk_size,
                    "inf": 1e9,
                },
                "template_pointwise_attention": {
                    "c_t": c_t,
                    "c_z": c_z,
                    # DISCREPANCY: c_hidden here is given in the supplement as 64.
                    # It's actually 16.
                    "c_hidden": 16,
                    "no_heads": 4,
                    "inf": 1e5,  # 1e9,
                },
                "inf": 1e5,  # 1e9,
                "eps": eps,  # 1e-6,
                "enabled": templates_enabled,
                "embed_angles": embed_template_torsion_angles,
                "use_unit_vector": False,
                # Approximate template computation, saving memory.
                # In our experiments, results are equivalent to or better than
                # the stock implementation. Should be enabled for all new
                # training runs.
                "average_templates": False,
                # Offload template embeddings to CPU memory. Vastly reduced
                # memory consumption at the cost of a modest increase in
                # runtime. Useful for inference on very long sequences.
                # Mutually exclusive with average_templates.
                "offload_templates": False,
            },
            "extra_msa": {
                "extra_msa_embedder": {
                    "c_in": 25,
                    "c_out": c_e,
                },
                "extra_msa_stack": {
                    "c_m": c_e,
                    "c_z": c_z,
                    "c_hidden_msa_att": 8,
                    "c_hidden_opm": 32,
                    "c_hidden_mul": 128,
                    "c_hidden_pair_att": 32,
                    "no_heads_msa": 8,
                    "no_heads_pair": 4,
                    "no_blocks": 4,
                    "transition_n": 4,
                    "msa_dropout": 0.15,
                    "pair_dropout": 0.25,
                    "clear_cache_between_blocks": False,
                    "tune_chunk_size": tune_chunk_size,
                    "inf": 1e9,
                    "eps": eps,  # 1e-10,
                    "ckpt": blocks_per_ckpt is not None,
                },
                "enabled": True,
            },
            "evoformer_stack": {
                "c_m": c_m,
                "c_z": c_z,
                "c_hidden_msa_att": 32,
                "c_hidden_opm": 32,
                "c_hidden_mul": 128,
                "c_hidden_pair_att": 32,
                "c_s": c_s,
                "no_heads_msa": 8,
                "no_heads_pair": 4,
                "no_blocks": 48,
                "transition_n": 4,
                "msa_dropout": 0.15,
                "pair_dropout": 0.25,
                "blocks_per_ckpt": blocks_per_ckpt,
                "clear_cache_between_blocks": False,
                "tune_chunk_size": tune_chunk_size,
                "inf": 1e9,
                "eps": eps,  # 1e-10,
            },
            "structure_module": {
                "c_s": c_s,
                "c_z": c_z,
                "c_ipa": 16,
                "c_resnet": 128,
                "no_heads_ipa": 12,
                "no_qk_points": 4,
                "no_v_points": 8,
                "dropout_rate": 0.1,
                "no_blocks": 8,
                "no_transition_layers": 1,
                "no_resnet_blocks": 2,
                "no_angles": 7,
                "trans_scale_factor": 10,
                "epsilon": eps,  # 1e-12,
                "inf": 1e5,
            },
            "heads": {
                "lddt": {
                    "no_bins": 50,
                    "c_in": c_s,
                    "c_hidden": 128,
                },
                "distogram": {
                    "c_z": c_z,
                    "no_bins": aux_distogram_bins,
                },
                "tm": {
                    "c_z": c_z,
                    "no_bins": aux_distogram_bins,
                    "enabled": tm_enabled,
                },
                "masked_msa": {
                    "c_m": c_m,
                    "c_out": 23,
                },
                "experimentally_resolved": {
                    "c_s": c_s,
                    "c_out": 37,
                },
            },
        },
        "relax": {
            "max_iterations": 0,  # no max
            "tolerance": 2.39,
            "stiffness": 10.0,
            "max_outer_iterations": 20,
            "exclude_residues": [],
        },
        "loss": {
            "distogram": {
                "min_bin": 2.3125,
                "max_bin": 21.6875,
                "no_bins": 64,
                "eps": eps,  # 1e-6,
                "weight": 0.3,
            },
            "experimentally_resolved": {
                "eps": eps,  # 1e-8,
                "min_resolution": 0.1,
                "max_resolution": 3.0,
                "weight": 0.0,
            },
            "fape": {
                "backbone": {
                    "clamp_distance": 10.0,
                    "loss_unit_distance": 10.0,
                    "weight": 0.5,
                },
                "sidechain": {
                    "clamp_distance": 10.0,
                    "length_scale": 10.0,
                    "weight": 0.5,
                },
                "eps": 1e-4,
                "weight": 1.0,
            },
            "lddt": {
                "min_resolution": 0.1,
                "max_resolution": 3.0,
                "cutoff": 15.0,
                "no_bins": 50,
                "eps": eps,  # 1e-10,
                "weight": 0.01,
            },
            "masked_msa": {
                "eps": eps,  # 1e-8,
                "weight": 2.0,
            },
            "supervised_chi": {
                "chi_weight": 0.5,
                "angle_norm_weight": 0.01,
                "eps": eps,  # 1e-6,
                "weight": 1.0,
            },
            "violation": {
                "violation_tolerance_factor": 12.0,
                "clash_overlap_tolerance": 1.5,
                "eps": eps,  # 1e-6,
                "weight": 0.0,
            },
            "tm": {
                "max_bin": 31,
                "no_bins": 64,
                "min_resolution": 0.1,
                "max_resolution": 3.0,
                "eps": eps,  # 1e-8,
                "weight": 0.,
                "enabled": tm_enabled,
            },
            "eps": eps,
        },
        "ema": {"decay": 0.999},
    }
)
# ---- openfold/data/errors.py ----
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
# Licensed under the Apache License, Version 2.0;
# see http://www.apache.org/licenses/LICENSE-2.0

"""General-purpose errors used throughout the data pipeline."""


class Error(Exception):
    """Base class for all data-pipeline exceptions."""


class MultipleChainsError(Error):
    """Raised when more than one chain matches a single chain ID."""
# ---- openfold/data/feature_pipeline.py ----
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
# Licensed under the Apache License, Version 2.0;
# see http://www.apache.org/licenses/LICENSE-2.0

import copy
from typing import Mapping, Tuple, List, Optional, Dict, Sequence

import ml_collections
import numpy as np
import torch

from openfold.data import input_pipeline


# Raw (NumPy) features keyed by name.
FeatureDict = Mapping[str, np.ndarray]
# Torch-tensor features keyed by name.
TensorDict = Dict[str, torch.Tensor]


def np_to_tensor_dict(
    np_example: Mapping[str, np.ndarray],
    features: Sequence[str],
) -> TensorDict:
    """Convert a dict of NumPy arrays into a dict of torch tensors.

    Args:
        np_example: A dict of NumPy feature arrays.
        features: Names of the features to keep; everything else is dropped.

    Returns:
        A dict mapping each retained feature name to a torch tensor.
    """
    wanted = set(features)
    return {
        name: torch.tensor(array)
        for name, array in np_example.items()
        if name in wanted
    }


def make_data_config(
    config: ml_collections.ConfigDict,
    mode: str,
    num_res: int,
) -> Tuple[ml_collections.ConfigDict, List[str]]:
    """Resolve the data config for ``mode`` and list the features to load.

    Args:
        config: The data section of the model config.
        mode: Which sub-config to use (e.g. "train", "eval", "predict").
        num_res: Number of residues; used as the crop size when none is set.

    Returns:
        A (deep-copied, resolved) config and the list of feature names.
    """
    cfg = copy.deepcopy(config)
    mode_cfg = cfg[mode]
    # Default the crop size to the full sequence length.
    with cfg.unlocked():
        if mode_cfg.crop_size is None:
            mode_cfg.crop_size = num_res

    names = cfg.common.unsupervised_features
    if cfg.common.use_templates:
        names += cfg.common.template_features
    if cfg[mode].supervised:
        names += cfg.supervised.supervised_features

    return cfg, names


def np_example_to_features(
    np_example: FeatureDict,
    config: ml_collections.ConfigDict,
    mode: str,
):
    """Turn a raw NumPy example into processed model input features.

    Args:
        np_example: Raw feature dict (must contain "seq_length").
        config: Data config (see ``make_data_config``).
        mode: Processing mode, e.g. "train".

    Returns:
        A dict of processed torch tensors.
    """
    np_example = dict(np_example)
    num_res = int(np_example["seq_length"][0])
    cfg, feature_names = make_data_config(config, mode=mode, num_res=num_res)

    # Legacy inputs store the deletion matrix as ints; promote to float32.
    if "deletion_matrix_int" in np_example:
        np_example["deletion_matrix"] = np_example.pop(
            "deletion_matrix_int"
        ).astype(np.float32)

    tensor_dict = np_to_tensor_dict(
        np_example=np_example, features=feature_names
    )
    # Feature processing is pure data transformation; no gradients needed.
    with torch.no_grad():
        features = input_pipeline.process_tensors_from_config(
            tensor_dict,
            cfg.common,
            cfg[mode],
        )

    return dict(features)


class FeaturePipeline:
    """Thin wrapper binding a data config to ``np_example_to_features``."""

    def __init__(
        self,
        config: ml_collections.ConfigDict,
    ):
        # Data config forwarded to every process_features call.
        self.config = config

    def process_features(
        self,
        raw_features: FeatureDict,
        mode: str = "train",
    ) -> FeatureDict:
        """Process ``raw_features`` according to the stored config."""
        return np_example_to_features(
            np_example=raw_features,
            config=self.config,
            mode=mode,
        )
# ---- openfold/data/input_pipeline.py ----
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
# Licensed under the Apache License, Version 2.0;
# see http://www.apache.org/licenses/LICENSE-2.0

from functools import partial

import torch

from openfold.data import data_transforms


def nonensembled_transform_fns(common_cfg, mode_cfg):
    """Input pipeline data transformers that are not ensembled."""
    fns = [
        data_transforms.cast_to_64bit_ints,
        data_transforms.correct_msa_restypes,
        data_transforms.squeeze_features,
        data_transforms.randomly_replace_msa_with_unknown(0.0),
        data_transforms.make_seq_mask,
        data_transforms.make_msa_mask,
        data_transforms.make_hhblits_profile,
    ]
    if common_cfg.use_templates:
        fns.extend(
            [
                data_transforms.fix_templates_aatype,
                data_transforms.make_template_mask,
                data_transforms.make_pseudo_beta("template_"),
            ]
        )
        if common_cfg.use_template_torsion_angles:
            fns.append(data_transforms.atom37_to_torsion_angles("template_"))

    fns.append(data_transforms.make_atom14_masks)

    if mode_cfg.supervised:
        fns.extend(
            [
                data_transforms.make_atom14_positions,
                data_transforms.atom37_to_frames,
                data_transforms.atom37_to_torsion_angles(""),
                data_transforms.make_pseudo_beta(""),
                data_transforms.get_backbone_frames,
                data_transforms.get_chi_angles,
            ]
        )

    return fns


def ensembled_transform_fns(common_cfg, mode_cfg, ensemble_seed):
    """Input pipeline data transformers that can be ensembled and averaged."""
    fns = []

    if "max_distillation_msa_clusters" in mode_cfg:
        fns.append(
            data_transforms.sample_msa_distillation(
                mode_cfg.max_distillation_msa_clusters
            )
        )

    if common_cfg.reduce_msa_clusters_by_max_templates:
        pad_msa_clusters = mode_cfg.max_msa_clusters - mode_cfg.max_templates
    else:
        pad_msa_clusters = mode_cfg.max_msa_clusters

    max_msa_clusters = pad_msa_clusters
    max_extra_msa = mode_cfg.max_extra_msa

    # When the MSA is NOT re-sampled each recycling iteration, fix the seed so
    # every replica draws the same subsample.
    msa_seed = None
    if not common_cfg.resample_msa_in_recycling:
        msa_seed = ensemble_seed

    fns.append(
        data_transforms.sample_msa(
            max_msa_clusters,
            keep_extra=True,
            seed=msa_seed,
        )
    )

    if "masked_msa" in common_cfg:
        # Masked MSA should come *before* MSA clustering so that
        # the clustering and full MSA profile do not leak information about
        # the masked locations and secret corrupted locations.
        fns.append(
            data_transforms.make_masked_msa(
                common_cfg.masked_msa, mode_cfg.masked_msa_replace_fraction
            )
        )

    if common_cfg.msa_cluster_features:
        fns.append(data_transforms.nearest_neighbor_clusters())
        fns.append(data_transforms.summarize_clusters())

    # Crop after creating the cluster profiles.
    if max_extra_msa:
        fns.append(data_transforms.crop_extra_msa(max_extra_msa))
    else:
        fns.append(data_transforms.delete_extra_msa)

    fns.append(data_transforms.make_msa_feat())

    crop_feats = dict(common_cfg.feat)

    if mode_cfg.fixed_size:
        fns.append(data_transforms.select_feat(list(crop_feats)))
        fns.append(
            data_transforms.random_crop_to_size(
                mode_cfg.crop_size,
                mode_cfg.max_templates,
                crop_feats,
                mode_cfg.subsample_templates,
                seed=ensemble_seed + 1,
            )
        )
        fns.append(
            data_transforms.make_fixed_size(
                crop_feats,
                pad_msa_clusters,
                mode_cfg.max_extra_msa,
                mode_cfg.crop_size,
                mode_cfg.max_templates,
            )
        )
    else:
        fns.append(data_transforms.crop_templates(mode_cfg.max_templates))

    return fns


def process_tensors_from_config(tensors, common_cfg, mode_cfg):
    """Based on the config, apply filters and transformations to the data."""

    ensemble_seed = torch.Generator().seed()

    def wrap_ensemble_fn(data, i):
        """Run the ensembled transforms on one replica of the data."""
        d = data.copy()
        fns = ensembled_transform_fns(
            common_cfg,
            mode_cfg,
            ensemble_seed,
        )
        d["ensemble_index"] = i
        return compose(fns)(d)

    # NOTE(review): computed but unused downstream — presumably kept for
    # parity with upstream; confirm before removing.
    no_templates = True
    if "template_aatype" in tensors:
        no_templates = tensors["template_aatype"].shape[0] == 0

    tensors = compose(
        nonensembled_transform_fns(
            common_cfg,
            mode_cfg,
        )
    )(tensors)

    if "no_recycling_iters" in tensors:
        num_recycling = int(tensors["no_recycling_iters"])
    else:
        num_recycling = common_cfg.max_recycling_iters

    # One replica per recycling iteration (plus the initial pass).
    return map_fn(
        lambda x: wrap_ensemble_fn(tensors, x),
        torch.arange(num_recycling + 1),
    )


@data_transforms.curry1
def compose(x, fs):
    """Apply each transform in ``fs`` to ``x``, left to right."""
    for f in fs:
        x = f(x)
    return x


def map_fn(fun, x):
    """Map ``fun`` over ``x`` and stack each feature along a new last dim."""
    ensembles = [fun(elem) for elem in x]
    ensembled_dict = {}
    for feat in ensembles[0].keys():
        ensembled_dict[feat] = torch.stack(
            [replica[feat] for replica in ensembles], dim=-1
        )
    return ensembled_dict
# ---- openfold/data/mmcif_parsing.py ----
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
# Licensed under the Apache License, Version 2.0;
# see http://www.apache.org/licenses/LICENSE-2.0

"""Parses the mmCIF file format."""
import collections
import dataclasses
import io
import json
import logging
import os
from typing import Any, Mapping, Optional, Sequence, Tuple

from Bio import PDB
from Bio.Data import SCOPData
import numpy as np

from openfold.data.errors import MultipleChainsError
import openfold.np.residue_constants as residue_constants


# Type aliases:
ChainId = str
PdbHeader = Mapping[str, Any]
PdbStructure = PDB.Structure.Structure
SeqRes = str
MmCIFDict = Mapping[str, Sequence[str]]


@dataclasses.dataclass(frozen=True)
class Monomer:
    """One SEQRES monomer: its residue name and position number."""
    id: str
    num: int


# Note - mmCIF format provides no guarantees on the type of author-assigned
# sequence numbers. They need not be integers.
@dataclasses.dataclass(frozen=True)
class AtomSite:
    """One row of the mmCIF _atom_site loop (field order matters: it is
    constructed positionally in _get_atom_site_list)."""
    residue_name: str
    author_chain_id: str
    mmcif_chain_id: str
    author_seq_num: str
    mmcif_seq_num: int
    insertion_code: str
    hetatm_atom: str
    model_num: int


# Used to map SEQRES index to a residue in the structure.
@dataclasses.dataclass(frozen=True)
class ResiduePosition:
    chain_id: str
    residue_number: int
    insertion_code: str


@dataclasses.dataclass(frozen=True)
class ResidueAtPosition:
    """A SEQRES residue, resolved to a structure position when present."""
    position: Optional[ResiduePosition]
    name: str
    is_missing: bool
    hetflag: str


@dataclasses.dataclass(frozen=True)
class MmcifObject:
    """Representation of a parsed mmCIF file.

    Contains:
      file_id: A meaningful name, e.g. a pdb_id. Should be unique amongst all
        files being processed.
      header: Biopython header.
      structure: Biopython structure.
      chain_to_seqres: Dict mapping chain_id to 1 letter amino acid sequence,
        e.g. {'A': 'ABCDEFG'}.
      seqres_to_structure: Dict; for each chain_id contains a mapping between
        SEQRES index and a ResidueAtPosition,
        e.g. {'A': {0: ResidueAtPosition, 1: ResidueAtPosition, ...}}.
      raw_string: The raw string used to construct the MmcifObject.
    """

    file_id: str
    header: PdbHeader
    structure: PdbStructure
    chain_to_seqres: Mapping[ChainId, SeqRes]
    seqres_to_structure: Mapping[ChainId, Mapping[int, ResidueAtPosition]]
    raw_string: Any


@dataclasses.dataclass(frozen=True)
class ParsingResult:
    """Returned by the parse function.

    Contains:
      mmcif_object: A MmcifObject; None if no chain could be parsed.
      errors: A dict mapping (file_id, chain_id) to any exception generated.
    """

    mmcif_object: Optional[MmcifObject]
    errors: Mapping[Tuple[str, str], Any]


class ParseError(Exception):
    """An error indicating that an mmCIF file could not be parsed."""


def mmcif_loop_to_list(
    prefix: str, parsed_info: MmCIFDict
) -> Sequence[Mapping[str, str]]:
    """Extracts loop associated with a prefix from mmCIF data as a list.

    Reference for loop_ in mmCIF:
    http://mmcif.wwpdb.org/docs/tutorials/mechanics/pdbx-mmcif-syntax.html

    Args:
      prefix: Prefix shared by each of the data items in the loop, including
        the trailing period (e.g. '_entity_poly_seq.').
      parsed_info: A dict of parsed mmCIF data, e.g. _mmcif_dict from a
        Biopython parser.

    Returns:
      A list of dicts; each dict represents 1 entry from an mmCIF loop.
    """
    cols = []
    data = []
    for key, value in parsed_info.items():
        if key.startswith(prefix):
            cols.append(key)
            data.append(value)

    # Every column of the loop must have the same number of rows.
    assert all(len(xs) == len(data[0]) for xs in data), (
        "mmCIF error: Not all loops are the same length: %s" % cols
    )

    return [dict(zip(cols, xs)) for xs in zip(*data)]


def mmcif_loop_to_dict(
    prefix: str,
    index: str,
    parsed_info: MmCIFDict,
) -> Mapping[str, Mapping[str, str]]:
    """Extracts loop associated with a prefix from mmCIF data as a dictionary.

    Args:
      prefix: Prefix shared by each of the data items in the loop, including
        the trailing period (e.g. '_entity_poly_seq.').
      index: Which item of loop data should serve as the key.
      parsed_info: A dict of parsed mmCIF data, e.g. _mmcif_dict from a
        Biopython parser.

    Returns:
      A dict of dicts; each inner dict represents 1 entry from an mmCIF loop,
      indexed by the index column.
    """
    entries = mmcif_loop_to_list(prefix, parsed_info)
    return {entry[index]: entry for entry in entries}


def parse(
    *, file_id: str, mmcif_string: str, catch_all_errors: bool = True
) -> ParsingResult:
    """Entry point, parses an mmcif_string.

    Args:
      file_id: A string identifier for this file; should be unique within the
        collection of files being processed.
      mmcif_string: Contents of an mmCIF file.
      catch_all_errors: If True, all exceptions are caught and returned in the
        ParsingResult. If False, exceptions propagate.

    Returns:
      A ParsingResult.
    """
    errors = {}
    try:
        parser = PDB.MMCIFParser(QUIET=True)
        handle = io.StringIO(mmcif_string)
        full_structure = parser.get_structure("", handle)
        first_model_structure = _get_first_model(full_structure)
        # Extract the _mmcif_dict from the parser, which contains useful
        # fields not reflected in the Biopython structure.
        parsed_info = parser._mmcif_dict  # pylint:disable=protected-access

        # Ensure all values are lists, even if singletons.
        for key, value in parsed_info.items():
            if not isinstance(value, list):
                parsed_info[key] = [value]

        header = _get_header(parsed_info)

        # Determine the protein chains, and their start numbers according to
        # the internal mmCIF numbering scheme (likely but not guaranteed 1).
        valid_chains = _get_protein_chains(parsed_info=parsed_info)
        if not valid_chains:
            return ParsingResult(
                None, {(file_id, ""): "No protein chains found in this file."}
            )
        seq_start_num = {
            chain_id: min(monomer.num for monomer in seq)
            for chain_id, seq in valid_chains.items()
        }

        # Loop over the atoms for which we have coordinates. Populate:
        # - mmcif_to_author_chain_id: internal mmCIF chain id -> author chain
        #   id (as used by Biopython).
        # - seq_to_structure_mappings: sequence index -> ResidueAtPosition.
        mmcif_to_author_chain_id = {}
        seq_to_structure_mappings = {}
        for atom in _get_atom_site_list(parsed_info):
            if atom.model_num != "1":
                # We only process the first model at the moment.
                continue

            mmcif_to_author_chain_id[atom.mmcif_chain_id] = atom.author_chain_id

            if atom.mmcif_chain_id in valid_chains:
                hetflag = " "
                if atom.hetatm_atom == "HETATM":
                    # Water atoms get Biopython's special hetflag "W" so the
                    # residue can later be fetched from the structure by id.
                    if atom.residue_name in ("HOH", "WAT"):
                        hetflag = "W"
                    else:
                        hetflag = "H_" + atom.residue_name
                insertion_code = atom.insertion_code
                if not _is_set(atom.insertion_code):
                    insertion_code = " "
                position = ResiduePosition(
                    chain_id=atom.author_chain_id,
                    residue_number=int(atom.author_seq_num),
                    insertion_code=insertion_code,
                )
                seq_idx = (
                    int(atom.mmcif_seq_num) - seq_start_num[atom.mmcif_chain_id]
                )
                chain_mapping = seq_to_structure_mappings.setdefault(
                    atom.author_chain_id, {}
                )
                chain_mapping[seq_idx] = ResidueAtPosition(
                    position=position,
                    name=atom.residue_name,
                    is_missing=False,
                    hetflag=hetflag,
                )

        # Add missing residue information to seq_to_structure_mappings.
        for chain_id, seq_info in valid_chains.items():
            author_chain = mmcif_to_author_chain_id[chain_id]
            current_mapping = seq_to_structure_mappings[author_chain]
            for idx, monomer in enumerate(seq_info):
                if idx not in current_mapping:
                    current_mapping[idx] = ResidueAtPosition(
                        position=None,
                        name=monomer.id,
                        is_missing=True,
                        hetflag=" ",
                    )

        # Build the 1-letter SEQRES string per author chain; unknown residue
        # codes (or multi-letter codes) become "X".
        author_chain_to_sequence = {}
        for chain_id, seq_info in valid_chains.items():
            author_chain = mmcif_to_author_chain_id[chain_id]
            one_letter = []
            for monomer in seq_info:
                code = SCOPData.protein_letters_3to1.get(monomer.id, "X")
                one_letter.append(code if len(code) == 1 else "X")
            author_chain_to_sequence[author_chain] = "".join(one_letter)

        mmcif_object = MmcifObject(
            file_id=file_id,
            header=header,
            structure=first_model_structure,
            chain_to_seqres=author_chain_to_sequence,
            seqres_to_structure=seq_to_structure_mappings,
            raw_string=parsed_info,
        )

        return ParsingResult(mmcif_object=mmcif_object, errors=errors)
    except Exception as e:  # pylint:disable=broad-except
        errors[(file_id, "")] = e
        if not catch_all_errors:
            raise
        return ParsingResult(mmcif_object=None, errors=errors)


def _get_first_model(structure: PdbStructure) -> PdbStructure:
    """Returns the first model in a Biopython structure."""
    return next(structure.get_models())


_MIN_LENGTH_OF_CHAIN_TO_BE_COUNTED_AS_PEPTIDE = 21


def get_release_date(parsed_info: MmCIFDict) -> str:
    """Returns the oldest revision date."""
    revision_dates = parsed_info["_pdbx_audit_revision_history.revision_date"]
    return min(revision_dates)


def _get_header(parsed_info: MmCIFDict) -> PdbHeader:
    """Returns a basic header containing method, release date and resolution."""
    header = {}

    experiments = mmcif_loop_to_list("_exptl.", parsed_info)
    header["structure_method"] = ",".join(
        experiment["_exptl.method"].lower() for experiment in experiments
    )

    # Note: The release_date here corresponds to the oldest revision. We
    # prefer to use this for dataset filtering over the deposition_date.
    if "_pdbx_audit_revision_history.revision_date" in parsed_info:
        header["release_date"] = get_release_date(parsed_info)
    else:
        logging.warning(
            "Could not determine release_date: %s", parsed_info["_entry.id"]
        )

    # Fall through the possible resolution keys; the last valid one wins.
    header["resolution"] = 0.00
    for res_key in (
        "_refine.ls_d_res_high",
        "_em_3d_reconstruction.resolution",
        "_reflns.d_resolution_high",
    ):
        if res_key in parsed_info:
            try:
                header["resolution"] = float(parsed_info[res_key][0])
            except ValueError:
                logging.info(
                    "Invalid resolution format: %s", parsed_info[res_key]
                )

    return header


def _get_atom_site_list(parsed_info: MmCIFDict) -> Sequence[AtomSite]:
    """Returns list of atom sites; contains data not present in the structure."""
    return [
        AtomSite(*site)
        for site in zip(  # pylint:disable=g-complex-comprehension
            parsed_info["_atom_site.label_comp_id"],
            parsed_info["_atom_site.auth_asym_id"],
            parsed_info["_atom_site.label_asym_id"],
            parsed_info["_atom_site.auth_seq_id"],
            parsed_info["_atom_site.label_seq_id"],
            parsed_info["_atom_site.pdbx_PDB_ins_code"],
            parsed_info["_atom_site.group_PDB"],
            parsed_info["_atom_site.pdbx_PDB_model_num"],
        )
    ]


def _get_protein_chains(
    *, parsed_info: Mapping[str, Any]
) -> Mapping[ChainId, Sequence[Monomer]]:
    """Extracts polymer information for protein chains only.

    Args:
      parsed_info: _mmcif_dict produced by the Biopython parser.

    Returns:
      A dict mapping mmcif chain id to a list of Monomers.
    """
    # Get polymer information for each entity in the structure.
    entity_poly_seqs = mmcif_loop_to_list("_entity_poly_seq.", parsed_info)

    polymers = collections.defaultdict(list)
    for entry in entity_poly_seqs:
        polymers[entry["_entity_poly_seq.entity_id"]].append(
            Monomer(
                id=entry["_entity_poly_seq.mon_id"],
                num=int(entry["_entity_poly_seq.num"]),
            )
        )

    # Chemical compositions let us identify which polymers are proteins.
    chem_comps = mmcif_loop_to_dict("_chem_comp.", "_chem_comp.id", parsed_info)

    # Chain info for each entity, so we can key the result on chain id
    # rather than entity id.
    struct_asyms = mmcif_loop_to_list("_struct_asym.", parsed_info)

    entity_to_mmcif_chains = collections.defaultdict(list)
    for struct_asym in struct_asyms:
        entity_to_mmcif_chains[struct_asym["_struct_asym.entity_id"]].append(
            struct_asym["_struct_asym.id"]
        )

    # Identify and return the valid protein chains: reject polymers without
    # any peptide-like components, such as DNA/RNA.
    valid_chains = {}
    for entity_id, seq_info in polymers.items():
        chain_ids = entity_to_mmcif_chains[entity_id]
        if any(
            "peptide" in chem_comps[monomer.id]["_chem_comp.type"]
            for monomer in seq_info
        ):
            for chain_id in chain_ids:
                valid_chains[chain_id] = seq_info
    return valid_chains


def _is_set(data: str) -> bool:
    """Returns False if data is a special mmCIF character indicating 'unset'."""
    return data not in (".", "?")


def get_atom_coords(
    mmcif_object: MmcifObject,
    chain_id: str,
    _zero_center_positions: bool = False
) -> Tuple[np.ndarray, np.ndarray]:
    """Extract atom37 positions and mask for one chain of an MmcifObject.

    Args:
      mmcif_object: A parsed mmCIF structure.
      chain_id: Author chain id to extract.
      _zero_center_positions: If True, subtract the mean of observed atom
        positions so the chain is centered at the origin.

    Returns:
      (all_atom_positions, all_atom_mask) float32 arrays of shapes
      [num_res, atom_type_num, 3] and [num_res, atom_type_num].

    Raises:
      MultipleChainsError: If the chain id does not match exactly one chain.
    """
    # Locate the right chain.
    chains = list(mmcif_object.structure.get_chains())
    relevant_chains = [c for c in chains if c.id == chain_id]
    if len(relevant_chains) != 1:
        raise MultipleChainsError(
            f"Expected exactly one chain in structure with id {chain_id}."
        )
    chain = relevant_chains[0]

    # Extract the coordinates.
    num_res = len(mmcif_object.chain_to_seqres[chain_id])
    n_atom_types = residue_constants.atom_type_num
    all_atom_positions = np.zeros(
        [num_res, n_atom_types, 3], dtype=np.float32
    )
    all_atom_mask = np.zeros(
        [num_res, n_atom_types], dtype=np.float32
    )
    for res_index in range(num_res):
        pos = np.zeros([n_atom_types, 3], dtype=np.float32)
        mask = np.zeros([n_atom_types], dtype=np.float32)
        res_at_position = mmcif_object.seqres_to_structure[chain_id][res_index]
        if not res_at_position.is_missing:
            res = chain[
                (
                    res_at_position.hetflag,
                    res_at_position.position.residue_number,
                    res_at_position.position.insertion_code,
                )
            ]
            for atom in res.get_atoms():
                atom_name = atom.get_name()
                x, y, z = atom.get_coord()
                if atom_name in residue_constants.atom_order:
                    atom_idx = residue_constants.atom_order[atom_name]
                    pos[atom_idx] = [x, y, z]
                    mask[atom_idx] = 1.0
                elif atom_name.upper() == "SE" and res.get_resname() == "MSE":
                    # Put the coords of the selenium atom in the sulphur column.
                    pos[residue_constants.atom_order["SD"]] = [x, y, z]
                    mask[residue_constants.atom_order["SD"]] = 1.0

        all_atom_positions[res_index] = pos
        all_atom_mask[res_index] = mask

    if _zero_center_positions:
        binary_mask = all_atom_mask.astype(bool)
        translation_vec = all_atom_positions[binary_mask].mean(axis=0)
        all_atom_positions[binary_mask] -= translation_vec

    return all_atom_positions, all_atom_mask
# ---- openfold/data/parsers.py ----
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
# Licensed under the Apache License, Version 2.0;
# see http://www.apache.org/licenses/LICENSE-2.0

"""Functions for parsing various file formats."""
import collections
import dataclasses
import re
import string
from typing import Dict, Iterable, List, Optional, Sequence, Tuple


# deletion_matrix[i][j] = number of residues deleted from aligned sequence i
# at residue position j.
DeletionMatrix = Sequence[Sequence[int]]


@dataclasses.dataclass(frozen=True)
class TemplateHit:
    """Class representing a template hit."""

    index: int
    name: str
    aligned_cols: int
    sum_probs: float
    query: str
    hit_sequence: str
    indices_query: List[int]
    indices_hit: List[int]


def parse_fasta(fasta_string: str) -> Tuple[Sequence[str], Sequence[str]]:
    """Parses FASTA string and returns sequences with their descriptions.

    Arguments:
      fasta_string: The string contents of a FASTA file.

    Returns:
      A tuple of two lists:
      * A list of amino-acid sequences.
      * A list of sequence descriptions taken from the comment lines, in the
        same order as the sequences.
    """
    sequences: List[str] = []
    descriptions: List[str] = []
    current = -1
    for raw_line in fasta_string.splitlines():
        stripped = raw_line.strip()
        if stripped.startswith(">"):
            # New record: the rest of the line (without '>') is its description.
            current += 1
            descriptions.append(stripped[1:])
            sequences.append("")
        elif stripped:
            # Continuation of the current record; blank lines are skipped.
            sequences[current] += stripped

    return sequences, descriptions


def parse_stockholm(
    stockholm_string: str,
) -> Tuple[Sequence[str], DeletionMatrix, Sequence[str]]:
    """Parses sequences and deletion matrix from stockholm format alignment.

    Args:
      stockholm_string: The string contents of a stockholm file. The first
        sequence in the file should be the query sequence.

    Returns:
      A tuple of:
      * A list of sequences that have been aligned to the query. These
        might contain duplicates.
      * The deletion matrix for the alignment as a list of lists. The element
        at `deletion_matrix[i][j]` is the number of residues deleted from
        the aligned sequence i at residue position j.
      * The names of the targets matched, including the jackhmmer subsequence
        suffix.
    """
    name_to_sequence = collections.OrderedDict()
    for raw_line in stockholm_string.splitlines():
        stripped = raw_line.strip()
        # Skip blanks, markup ('#...') and the end-of-alignment marker ('//').
        if not stripped or stripped.startswith(("#", "//")):
            continue
        name, chunk = stripped.split()
        name_to_sequence[name] = name_to_sequence.get(name, "") + chunk

    msa = []
    deletion_matrix = []

    query = ""
    keep_columns: List[int] = []
    for seq_index, sequence in enumerate(name_to_sequence.values()):
        if seq_index == 0:
            # The first sequence is the query; keep only its non-gap columns.
            query = sequence
            keep_columns = [i for i, res in enumerate(query) if res != "-"]

        # Remove the columns with gaps in the query from all sequences.
        msa.append("".join(sequence[c] for c in keep_columns))

        # Count the number of deletions w.r.t. the query.
        deletion_vec = []
        deletions = 0
        for seq_res, query_res in zip(sequence, query):
            if seq_res != "-" or query_res != "-":
                if query_res == "-":
                    deletions += 1
                else:
                    deletion_vec.append(deletions)
                    deletions = 0
        deletion_matrix.append(deletion_vec)

    return msa, deletion_matrix, list(name_to_sequence.keys())


def parse_a3m(a3m_string: str) -> Tuple[Sequence[str], DeletionMatrix]:
    """Parses sequences and deletion matrix from a3m format alignment.

    Args:
      a3m_string: The string contents of a a3m file. The first sequence in the
        file should be the query sequence.

    Returns:
      A tuple of:
      * A list of sequences that have been aligned to the query. These
        might contain duplicates.
      * The deletion matrix for the alignment as a list of lists. The element
        at `deletion_matrix[i][j]` is the number of residues deleted from
        the aligned sequence i at residue position j.
    """
    sequences, _ = parse_fasta(a3m_string)
    deletion_matrix = []
    for msa_sequence in sequences:
        # In a3m, lowercase letters mark insertions relative to the query,
        # i.e. deletions from the aligned sequence's point of view.
        deletion_vec = []
        deletions = 0
        for ch in msa_sequence:
            if ch.islower():
                deletions += 1
            else:
                deletion_vec.append(deletions)
                deletions = 0
        deletion_matrix.append(deletion_vec)

    # Make the MSA matrix out of aligned (deletion-free) sequences.
    strip_lowercase = str.maketrans("", "", string.ascii_lowercase)
    aligned_sequences = [s.translate(strip_lowercase) for s in sequences]
    return aligned_sequences, deletion_matrix


def _convert_sto_seq_to_a3m(
    query_non_gaps: Sequence[bool], sto_seq: str
) -> Iterable[str]:
    """Yield one a3m character per stockholm column: residues aligned to a
    query gap are lowercased (insertions); gaps there are dropped."""
    for keep_column, residue in zip(query_non_gaps, sto_seq):
        if keep_column:
            yield residue
        elif residue != "-":
            yield residue.lower()


def convert_stockholm_to_a3m(
    stockholm_format: str, max_sequences: Optional[int] = None
) -> str:
    """Converts MSA in Stockholm format to the A3M format."""
    descriptions: Dict[str, str] = {}
    sequences: Dict[str, str] = {}
    reached_max_sequences = False

    for line in stockholm_format.splitlines():
        reached_max_sequences = (
            max_sequences and len(sequences) >= max_sequences
        )
        # Ignore blank lines, markup and end symbols - the remainder are
        # alignment sequence parts.
        if line.strip() and not line.startswith(("#", "//")):
            seqname, aligned_seq = line.split(maxsplit=1)
            if seqname not in sequences:
                if reached_max_sequences:
                    continue
                sequences[seqname] = ""
            sequences[seqname] += aligned_seq

    for line in stockholm_format.splitlines():
        if line[:4] == "#=GS":
            # Description row - example format is:
            # #=GS UniRef90_Q9H5Z4/4-78 DE [subseq from] cDNA: FLJ22755 ...
            columns = line.split(maxsplit=3)
            seqname, feature = columns[1:3]
            value = columns[3] if len(columns) == 4 else ""
            if feature != "DE":
                continue
            if reached_max_sequences and seqname not in sequences:
                continue
            descriptions[seqname] = value
            if len(descriptions) == len(sequences):
                break

    # Convert sto format to a3m line by line; the query is assumed to be the
    # first sequence.
    a3m_sequences = {}
    query_sequence = next(iter(sequences.values()))
    query_non_gaps = [res != "-" for res in query_sequence]
    for seqname, sto_sequence in sequences.items():
        a3m_sequences[seqname] = "".join(
            _convert_sto_seq_to_a3m(query_non_gaps, sto_sequence)
        )

    fasta_chunks = (
        f">{k} {descriptions.get(k, '')}\n{a3m_sequences[k]}"
        for k in a3m_sequences
    )
    return "\n".join(fasta_chunks) + "\n"  # Include terminating newline.
226 | 227 | 228 | def _get_hhr_line_regex_groups( 229 | regex_pattern: str, line: str 230 | ) -> Sequence[Optional[str]]: 231 | match = re.match(regex_pattern, line) 232 | if match is None: 233 | raise RuntimeError(f"Could not parse query line {line}") 234 | return match.groups() 235 | 236 | 237 | def _update_hhr_residue_indices_list( 238 | sequence: str, start_index: int, indices_list: List[int] 239 | ): 240 | """Computes the relative indices for each residue with respect to the original sequence.""" 241 | counter = start_index 242 | for symbol in sequence: 243 | if symbol == "-": 244 | indices_list.append(-1) 245 | else: 246 | indices_list.append(counter) 247 | counter += 1 248 | 249 | 250 | def _parse_hhr_hit(detailed_lines: Sequence[str]) -> TemplateHit: 251 | """Parses the detailed HMM HMM comparison section for a single Hit. 252 | 253 | This works on .hhr files generated from both HHBlits and HHSearch. 254 | 255 | Args: 256 | detailed_lines: A list of lines from a single comparison section between 2 257 | sequences (which each have their own HMM's) 258 | 259 | Returns: 260 | A dictionary with the information from that detailed comparison section 261 | 262 | Raises: 263 | RuntimeError: If a certain line cannot be processed 264 | """ 265 | # Parse first 2 lines. 266 | number_of_hit = int(detailed_lines[0].split()[-1]) 267 | name_hit = detailed_lines[1][1:] 268 | 269 | # Parse the summary line. 270 | pattern = ( 271 | "Probab=(.*)[\t ]*E-value=(.*)[\t ]*Score=(.*)[\t ]*Aligned_cols=(.*)[\t" 272 | " ]*Identities=(.*)%[\t ]*Similarity=(.*)[\t ]*Sum_probs=(.*)[\t " 273 | "]*Template_Neff=(.*)" 274 | ) 275 | match = re.match(pattern, detailed_lines[2]) 276 | if match is None: 277 | raise RuntimeError( 278 | "Could not parse section: %s. Expected this: \n%s to contain summary." 
279 | % (detailed_lines, detailed_lines[2]) 280 | ) 281 | (prob_true, e_value, _, aligned_cols, _, _, sum_probs, neff) = [ 282 | float(x) for x in match.groups() 283 | ] 284 | 285 | # The next section reads the detailed comparisons. These are in a 'human 286 | # readable' format which has a fixed length. The strategy employed is to 287 | # assume that each block starts with the query sequence line, and to parse 288 | # that with a regexp in order to deduce the fixed length used for that block. 289 | query = "" 290 | hit_sequence = "" 291 | indices_query = [] 292 | indices_hit = [] 293 | length_block = None 294 | 295 | for line in detailed_lines[3:]: 296 | # Parse the query sequence line 297 | if ( 298 | line.startswith("Q ") 299 | and not line.startswith("Q ss_dssp") 300 | and not line.startswith("Q ss_pred") 301 | and not line.startswith("Q Consensus") 302 | ): 303 | # Thus the first 17 characters must be 'Q ', and we can parse 304 | # everything after that. 305 | # start sequence end total_sequence_length 306 | patt = r"[\t ]*([0-9]*) ([A-Z-]*)[\t ]*([0-9]*) \([0-9]*\)" 307 | groups = _get_hhr_line_regex_groups(patt, line[17:]) 308 | 309 | # Get the length of the parsed block using the start and finish indices, 310 | # and ensure it is the same as the actual block length. 311 | start = int(groups[0]) - 1 # Make index zero based. 312 | delta_query = groups[1] 313 | end = int(groups[2]) 314 | num_insertions = len([x for x in delta_query if x == "-"]) 315 | length_block = end - start + num_insertions 316 | assert length_block == len(delta_query) 317 | 318 | # Update the query sequence and indices list. 319 | query += delta_query 320 | _update_hhr_residue_indices_list(delta_query, start, indices_query) 321 | 322 | elif line.startswith("T "): 323 | # Parse the hit sequence. 
324 | if ( 325 | not line.startswith("T ss_dssp") 326 | and not line.startswith("T ss_pred") 327 | and not line.startswith("T Consensus") 328 | ): 329 | # Thus the first 17 characters must be 'T ', and we can 330 | # parse everything after that. 331 | # start sequence end total_sequence_length 332 | patt = r"[\t ]*([0-9]*) ([A-Z-]*)[\t ]*[0-9]* \([0-9]*\)" 333 | groups = _get_hhr_line_regex_groups(patt, line[17:]) 334 | start = int(groups[0]) - 1 # Make index zero based. 335 | delta_hit_sequence = groups[1] 336 | assert length_block == len(delta_hit_sequence) 337 | 338 | # Update the hit sequence and indices list. 339 | hit_sequence += delta_hit_sequence 340 | _update_hhr_residue_indices_list( 341 | delta_hit_sequence, start, indices_hit 342 | ) 343 | 344 | return TemplateHit( 345 | index=number_of_hit, 346 | name=name_hit, 347 | aligned_cols=int(aligned_cols), 348 | sum_probs=sum_probs, 349 | query=query, 350 | hit_sequence=hit_sequence, 351 | indices_query=indices_query, 352 | indices_hit=indices_hit, 353 | ) 354 | 355 | 356 | def parse_hhr(hhr_string: str) -> Sequence[TemplateHit]: 357 | """Parses the content of an entire HHR file.""" 358 | lines = hhr_string.splitlines() 359 | 360 | # Each .hhr file starts with a results table, then has a sequence of hit 361 | # "paragraphs", each paragraph starting with a line 'No '. We 362 | # iterate through each paragraph to parse each hit. 363 | 364 | block_starts = [i for i, line in enumerate(lines) if line.startswith("No ")] 365 | 366 | hits = [] 367 | if block_starts: 368 | block_starts.append(len(lines)) # Add the end of the final block. 
369 | for i in range(len(block_starts) - 1): 370 | hits.append( 371 | _parse_hhr_hit(lines[block_starts[i] : block_starts[i + 1]]) 372 | ) 373 | return hits 374 | 375 | 376 | def parse_e_values_from_tblout(tblout: str) -> Dict[str, float]: 377 | """Parse target to e-value mapping parsed from Jackhmmer tblout string.""" 378 | e_values = {"query": 0} 379 | lines = [line for line in tblout.splitlines() if line[0] != "#"] 380 | # As per http://eddylab.org/software/hmmer/Userguide.pdf fields are 381 | # space-delimited. Relevant fields are (1) target name: and 382 | # (5) E-value (full sequence) (numbering from 1). 383 | for line in lines: 384 | fields = line.split() 385 | e_value = fields[4] 386 | target_name = fields[0] 387 | e_values[target_name] = float(e_value) 388 | return e_values 389 | -------------------------------------------------------------------------------- /openfold/data/tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dauparas/LigandMPNN/26ec57ac976ade5379920dbd43c7f97a91cf82de/openfold/data/tools/__init__.py -------------------------------------------------------------------------------- /openfold/data/tools/hhblits.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 AlQuraishi Laboratory 2 | # Copyright 2021 DeepMind Technologies Limited 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and
# limitations under the License.

"""Library to run HHblits from Python."""
import glob
import logging
import os
import subprocess
from typing import Any, Mapping, Optional, Sequence

from openfold.data.tools import utils


# Default values of the HHblits -p and -Z flags; matching values are not
# passed on the command line (see query()).
_HHBLITS_DEFAULT_P = 20
_HHBLITS_DEFAULT_Z = 500


class HHBlits:
    """Python wrapper of the HHblits binary."""

    def __init__(
        self,
        *,
        binary_path: str,
        databases: Sequence[str],
        n_cpu: int = 4,
        n_iter: int = 3,
        e_value: float = 0.001,
        maxseq: int = 1_000_000,
        realign_max: int = 100_000,
        maxfilt: int = 100_000,
        min_prefilter_hits: int = 1000,
        all_seqs: bool = False,
        alt: Optional[int] = None,
        p: int = _HHBLITS_DEFAULT_P,
        z: int = _HHBLITS_DEFAULT_Z,
    ):
        """Initializes the Python HHblits wrapper.

        Args:
          binary_path: The path to the HHblits executable.
          databases: A sequence of HHblits database paths. This should be the
            common prefix for the database files (i.e. up to but not including
            _hhm.ffindex etc.)
          n_cpu: The number of CPUs to give HHblits.
          n_iter: The number of HHblits iterations.
          e_value: The E-value, see HHblits docs for more details.
          maxseq: The maximum number of rows in an input alignment. Note that this
            parameter is only supported in HHBlits version 3.1 and higher.
          realign_max: Max number of HMM-HMM hits to realign. HHblits default: 500.
          maxfilt: Max number of hits allowed to pass the 2nd prefilter.
            HHblits default: 20000.
          min_prefilter_hits: Min number of hits to pass prefilter.
            HHblits default: 100.
          all_seqs: Return all sequences in the MSA / Do not filter the result MSA.
            HHblits default: False.
          alt: Show up to this many alternative alignments.
          p: Minimum Prob for a hit to be included in the output hhr file.
            HHblits default: 20.
          z: Hard cap on number of hits reported in the hhr file.
            HHblits default: 500. NB: The relevant HHblits flag is -Z not -z.

        Raises:
          RuntimeError: If HHblits binary not found within the path.
        """
        self.binary_path = binary_path
        self.databases = databases

        # Fail fast: each database prefix must resolve to at least one
        # on-disk file (e.g. <prefix>_hhm.ffindex).
        for database_path in self.databases:
            if not glob.glob(database_path + "_*"):
                logging.error(
                    "Could not find HHBlits database %s", database_path
                )
                raise ValueError(
                    f"Could not find HHBlits database {database_path}"
                )

        self.n_cpu = n_cpu
        self.n_iter = n_iter
        self.e_value = e_value
        self.maxseq = maxseq
        self.realign_max = realign_max
        self.maxfilt = maxfilt
        self.min_prefilter_hits = min_prefilter_hits
        self.all_seqs = all_seqs
        self.alt = alt
        self.p = p
        self.z = z

    def query(self, input_fasta_path: str) -> Mapping[str, Any]:
        """Queries the database using HHblits.

        Args:
          input_fasta_path: Path to the query FASTA file.

        Returns:
          A dict with keys `a3m` (the alignment text), `output` and `stderr`
          (raw subprocess streams), and the `n_iter`/`e_value` settings used.

        Raises:
          RuntimeError: If the HHblits subprocess exits with a non-zero code.
        """
        with utils.tmpdir_manager(base_dir="/tmp") as query_tmp_dir:
            a3m_path = os.path.join(query_tmp_dir, "output.a3m")

            db_cmd = []
            for db_path in self.databases:
                db_cmd.append("-d")
                db_cmd.append(db_path)
            cmd = [
                self.binary_path,
                "-i",
                input_fasta_path,
                "-cpu",
                str(self.n_cpu),
                "-oa3m",
                a3m_path,
                "-o",
                "/dev/null",
                "-n",
                str(self.n_iter),
                "-e",
                str(self.e_value),
                "-maxseq",
                str(self.maxseq),
                "-realign_max",
                str(self.realign_max),
                "-maxfilt",
                str(self.maxfilt),
                "-min_prefilter_hits",
                str(self.min_prefilter_hits),
            ]
            if self.all_seqs:
                cmd += ["-all"]
            # NOTE(review): `if self.alt:` treats alt=0 the same as alt=None
            # (flag omitted) — presumably intentional; confirm if alt=0 is a
            # meaningful HHblits value.
            if self.alt:
                cmd += ["-alt", str(self.alt)]
            # Only pass -p/-Z when they differ from the HHblits defaults.
            if self.p != _HHBLITS_DEFAULT_P:
                cmd += ["-p", str(self.p)]
            if self.z != _HHBLITS_DEFAULT_Z:
                cmd += ["-Z", str(self.z)]
            cmd += db_cmd

            logging.info('Launching subprocess "%s"', " ".join(cmd))
            process = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
            )

            with utils.timing("HHblits query"):
                stdout, stderr = process.communicate()
                retcode = process.wait()

            if retcode:
                # Logs have a 15k character limit, so log HHblits error line by line.
                logging.error("HHblits failed. HHblits stderr begin:")
                for error_line in stderr.decode("utf-8").splitlines():
                    if error_line.strip():
                        logging.error(error_line.strip())
                logging.error("HHblits stderr end")
                raise RuntimeError(
                    "HHblits failed\nstdout:\n%s\n\nstderr:\n%s\n"
                    % (stdout.decode("utf-8"), stderr[:500_000].decode("utf-8"))
                )

            # Read the result before the tmpdir context deletes it.
            with open(a3m_path) as f:
                a3m = f.read()

            raw_output = dict(
                a3m=a3m,
                output=stdout,
                stderr=stderr,
                n_iter=self.n_iter,
                e_value=self.e_value,
            )
        return raw_output
--------------------------------------------------------------------------------
/openfold/data/tools/hhsearch.py:
--------------------------------------------------------------------------------
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
15 | 16 | """Library to run HHsearch from Python.""" 17 | import glob 18 | import logging 19 | import os 20 | import subprocess 21 | from typing import Sequence 22 | 23 | from openfold.data.tools import utils 24 | 25 | 26 | class HHSearch: 27 | """Python wrapper of the HHsearch binary.""" 28 | 29 | def __init__( 30 | self, 31 | *, 32 | binary_path: str, 33 | databases: Sequence[str], 34 | n_cpu: int = 2, 35 | maxseq: int = 1_000_000, 36 | ): 37 | """Initializes the Python HHsearch wrapper. 38 | 39 | Args: 40 | binary_path: The path to the HHsearch executable. 41 | databases: A sequence of HHsearch database paths. This should be the 42 | common prefix for the database files (i.e. up to but not including 43 | _hhm.ffindex etc.) 44 | n_cpu: The number of CPUs to use 45 | maxseq: The maximum number of rows in an input alignment. Note that this 46 | parameter is only supported in HHBlits version 3.1 and higher. 47 | 48 | Raises: 49 | RuntimeError: If HHsearch binary not found within the path. 
50 | """ 51 | self.binary_path = binary_path 52 | self.databases = databases 53 | self.n_cpu = n_cpu 54 | self.maxseq = maxseq 55 | 56 | for database_path in self.databases: 57 | if not glob.glob(database_path + "_*"): 58 | logging.error( 59 | "Could not find HHsearch database %s", database_path 60 | ) 61 | raise ValueError( 62 | f"Could not find HHsearch database {database_path}" 63 | ) 64 | 65 | def query(self, a3m: str) -> str: 66 | """Queries the database using HHsearch using a given a3m.""" 67 | with utils.tmpdir_manager(base_dir="/tmp") as query_tmp_dir: 68 | input_path = os.path.join(query_tmp_dir, "query.a3m") 69 | hhr_path = os.path.join(query_tmp_dir, "output.hhr") 70 | with open(input_path, "w") as f: 71 | f.write(a3m) 72 | 73 | db_cmd = [] 74 | for db_path in self.databases: 75 | db_cmd.append("-d") 76 | db_cmd.append(db_path) 77 | cmd = [ 78 | self.binary_path, 79 | "-i", 80 | input_path, 81 | "-o", 82 | hhr_path, 83 | "-maxseq", 84 | str(self.maxseq), 85 | "-cpu", 86 | str(self.n_cpu), 87 | ] + db_cmd 88 | 89 | logging.info('Launching subprocess "%s"', " ".join(cmd)) 90 | process = subprocess.Popen( 91 | cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE 92 | ) 93 | with utils.timing("HHsearch query"): 94 | stdout, stderr = process.communicate() 95 | retcode = process.wait() 96 | 97 | if retcode: 98 | # Stderr is truncated to prevent proto size errors in Beam. 
99 | raise RuntimeError( 100 | "HHSearch failed:\nstdout:\n%s\n\nstderr:\n%s\n" 101 | % (stdout.decode("utf-8"), stderr[:100_000].decode("utf-8")) 102 | ) 103 | 104 | with open(hhr_path) as f: 105 | hhr = f.read() 106 | return hhr 107 | -------------------------------------------------------------------------------- /openfold/data/tools/jackhmmer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 AlQuraishi Laboratory 2 | # Copyright 2021 DeepMind Technologies Limited 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """Library to run Jackhmmer from Python.""" 17 | 18 | from concurrent import futures 19 | import glob 20 | import logging 21 | import os 22 | import subprocess 23 | from typing import Any, Callable, Mapping, Optional, Sequence 24 | from urllib import request 25 | 26 | from openfold.data.tools import utils 27 | 28 | 29 | class Jackhmmer: 30 | """Python wrapper of the Jackhmmer binary.""" 31 | 32 | def __init__( 33 | self, 34 | *, 35 | binary_path: str, 36 | database_path: str, 37 | n_cpu: int = 8, 38 | n_iter: int = 1, 39 | e_value: float = 0.0001, 40 | z_value: Optional[int] = None, 41 | get_tblout: bool = False, 42 | filter_f1: float = 0.0005, 43 | filter_f2: float = 0.00005, 44 | filter_f3: float = 0.0000005, 45 | incdom_e: Optional[float] = None, 46 | dom_e: Optional[float] = None, 47 | num_streamed_chunks: Optional[int] = None, 48 | streaming_callback: Optional[Callable[[int], None]] = None, 49 | ): 50 | """Initializes the Python Jackhmmer wrapper. 51 | 52 | Args: 53 | binary_path: The path to the jackhmmer executable. 54 | database_path: The path to the jackhmmer database (FASTA format). 55 | n_cpu: The number of CPUs to give Jackhmmer. 56 | n_iter: The number of Jackhmmer iterations. 57 | e_value: The E-value, see Jackhmmer docs for more details. 58 | z_value: The Z-value, see Jackhmmer docs for more details. 59 | get_tblout: Whether to save tblout string. 60 | filter_f1: MSV and biased composition pre-filter, set to >1.0 to turn off. 61 | filter_f2: Viterbi pre-filter, set to >1.0 to turn off. 62 | filter_f3: Forward pre-filter, set to >1.0 to turn off. 63 | incdom_e: Domain e-value criteria for inclusion of domains in MSA/next 64 | round. 65 | dom_e: Domain e-value criteria for inclusion in tblout. 66 | num_streamed_chunks: Number of database chunks to stream over. 67 | streaming_callback: Callback function run after each chunk iteration with 68 | the iteration number as argument. 
69 | """ 70 | self.binary_path = binary_path 71 | self.database_path = database_path 72 | self.num_streamed_chunks = num_streamed_chunks 73 | 74 | if ( 75 | not os.path.exists(self.database_path) 76 | and num_streamed_chunks is None 77 | ): 78 | logging.error("Could not find Jackhmmer database %s", database_path) 79 | raise ValueError( 80 | f"Could not find Jackhmmer database {database_path}" 81 | ) 82 | 83 | self.n_cpu = n_cpu 84 | self.n_iter = n_iter 85 | self.e_value = e_value 86 | self.z_value = z_value 87 | self.filter_f1 = filter_f1 88 | self.filter_f2 = filter_f2 89 | self.filter_f3 = filter_f3 90 | self.incdom_e = incdom_e 91 | self.dom_e = dom_e 92 | self.get_tblout = get_tblout 93 | self.streaming_callback = streaming_callback 94 | 95 | def _query_chunk( 96 | self, input_fasta_path: str, database_path: str 97 | ) -> Mapping[str, Any]: 98 | """Queries the database chunk using Jackhmmer.""" 99 | with utils.tmpdir_manager(base_dir="/tmp") as query_tmp_dir: 100 | sto_path = os.path.join(query_tmp_dir, "output.sto") 101 | 102 | # The F1/F2/F3 are the expected proportion to pass each of the filtering 103 | # stages (which get progressively more expensive), reducing these 104 | # speeds up the pipeline at the expensive of sensitivity. They are 105 | # currently set very low to make querying Mgnify run in a reasonable 106 | # amount of time. 107 | cmd_flags = [ 108 | # Don't pollute stdout with Jackhmmer output. 109 | "-o", 110 | "/dev/null", 111 | "-A", 112 | sto_path, 113 | "--noali", 114 | "--F1", 115 | str(self.filter_f1), 116 | "--F2", 117 | str(self.filter_f2), 118 | "--F3", 119 | str(self.filter_f3), 120 | "--incE", 121 | str(self.e_value), 122 | # Report only sequences with E-values <= x in per-sequence output. 
123 | "-E", 124 | str(self.e_value), 125 | "--cpu", 126 | str(self.n_cpu), 127 | "-N", 128 | str(self.n_iter), 129 | ] 130 | if self.get_tblout: 131 | tblout_path = os.path.join(query_tmp_dir, "tblout.txt") 132 | cmd_flags.extend(["--tblout", tblout_path]) 133 | 134 | if self.z_value: 135 | cmd_flags.extend(["-Z", str(self.z_value)]) 136 | 137 | if self.dom_e is not None: 138 | cmd_flags.extend(["--domE", str(self.dom_e)]) 139 | 140 | if self.incdom_e is not None: 141 | cmd_flags.extend(["--incdomE", str(self.incdom_e)]) 142 | 143 | cmd = ( 144 | [self.binary_path] 145 | + cmd_flags 146 | + [input_fasta_path, database_path] 147 | ) 148 | 149 | logging.info('Launching subprocess "%s"', " ".join(cmd)) 150 | process = subprocess.Popen( 151 | cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE 152 | ) 153 | with utils.timing( 154 | f"Jackhmmer ({os.path.basename(database_path)}) query" 155 | ): 156 | _, stderr = process.communicate() 157 | retcode = process.wait() 158 | 159 | if retcode: 160 | raise RuntimeError( 161 | "Jackhmmer failed\nstderr:\n%s\n" % stderr.decode("utf-8") 162 | ) 163 | 164 | # Get e-values for each target name 165 | tbl = "" 166 | if self.get_tblout: 167 | with open(tblout_path) as f: 168 | tbl = f.read() 169 | 170 | with open(sto_path) as f: 171 | sto = f.read() 172 | 173 | raw_output = dict( 174 | sto=sto, 175 | tbl=tbl, 176 | stderr=stderr, 177 | n_iter=self.n_iter, 178 | e_value=self.e_value, 179 | ) 180 | 181 | return raw_output 182 | 183 | def query(self, input_fasta_path: str) -> Sequence[Mapping[str, Any]]: 184 | """Queries the database using Jackhmmer.""" 185 | if self.num_streamed_chunks is None: 186 | return [self._query_chunk(input_fasta_path, self.database_path)] 187 | 188 | db_basename = os.path.basename(self.database_path) 189 | db_remote_chunk = lambda db_idx: f"{self.database_path}.{db_idx}" 190 | db_local_chunk = lambda db_idx: f"/tmp/ramdisk/{db_basename}.{db_idx}" 191 | 192 | # Remove existing files to prevent OOM 193 | for f 
in glob.glob(db_local_chunk("[0-9]*")): 194 | try: 195 | os.remove(f) 196 | except OSError: 197 | print(f"OSError while deleting {f}") 198 | 199 | # Download the (i+1)-th chunk while Jackhmmer is running on the i-th chunk 200 | with futures.ThreadPoolExecutor(max_workers=2) as executor: 201 | chunked_output = [] 202 | for i in range(1, self.num_streamed_chunks + 1): 203 | # Copy the chunk locally 204 | if i == 1: 205 | future = executor.submit( 206 | request.urlretrieve, 207 | db_remote_chunk(i), 208 | db_local_chunk(i), 209 | ) 210 | if i < self.num_streamed_chunks: 211 | next_future = executor.submit( 212 | request.urlretrieve, 213 | db_remote_chunk(i + 1), 214 | db_local_chunk(i + 1), 215 | ) 216 | 217 | # Run Jackhmmer with the chunk 218 | future.result() 219 | chunked_output.append( 220 | self._query_chunk(input_fasta_path, db_local_chunk(i)) 221 | ) 222 | 223 | # Remove the local copy of the chunk 224 | os.remove(db_local_chunk(i)) 225 | future = next_future 226 | if self.streaming_callback: 227 | self.streaming_callback(i) 228 | return chunked_output 229 | -------------------------------------------------------------------------------- /openfold/data/tools/kalign.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 AlQuraishi Laboratory 2 | # Copyright 2021 DeepMind Technologies Limited 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """A Python wrapper for Kalign.""" 17 | import os 18 | import subprocess 19 | from typing import Sequence 20 | 21 | from absl import logging 22 | 23 | from openfold.data.tools import utils 24 | 25 | 26 | def _to_a3m(sequences: Sequence[str]) -> str: 27 | """Converts sequences to an a3m file.""" 28 | names = ["sequence %d" % i for i in range(1, len(sequences) + 1)] 29 | a3m = [] 30 | for sequence, name in zip(sequences, names): 31 | a3m.append(u">" + name + u"\n") 32 | a3m.append(sequence + u"\n") 33 | return "".join(a3m) 34 | 35 | 36 | class Kalign: 37 | """Python wrapper of the Kalign binary.""" 38 | 39 | def __init__(self, *, binary_path: str): 40 | """Initializes the Python Kalign wrapper. 41 | 42 | Args: 43 | binary_path: The path to the Kalign binary. 44 | 45 | Raises: 46 | RuntimeError: If Kalign binary not found within the path. 47 | """ 48 | self.binary_path = binary_path 49 | 50 | def align(self, sequences: Sequence[str]) -> str: 51 | """Aligns the sequences and returns the alignment in A3M string. 52 | 53 | Args: 54 | sequences: A list of query sequence strings. The sequences have to be at 55 | least 6 residues long (Kalign requires this). Note that the order in 56 | which you give the sequences might alter the output slightly as 57 | different alignment tree might get constructed. 58 | 59 | Returns: 60 | A string with the alignment in a3m format. 61 | 62 | Raises: 63 | RuntimeError: If Kalign fails. 64 | ValueError: If any of the sequences is less than 6 residues long. 65 | """ 66 | logging.info("Aligning %d sequences", len(sequences)) 67 | 68 | for s in sequences: 69 | if len(s) < 6: 70 | raise ValueError( 71 | "Kalign requires all sequences to be at least 6 " 72 | "residues long. Got %s (%d residues)." 
% (s, len(s)) 73 | ) 74 | 75 | with utils.tmpdir_manager(base_dir="/tmp") as query_tmp_dir: 76 | input_fasta_path = os.path.join(query_tmp_dir, "input.fasta") 77 | output_a3m_path = os.path.join(query_tmp_dir, "output.a3m") 78 | 79 | with open(input_fasta_path, "w") as f: 80 | f.write(_to_a3m(sequences)) 81 | 82 | cmd = [ 83 | self.binary_path, 84 | "-i", 85 | input_fasta_path, 86 | "-o", 87 | output_a3m_path, 88 | "-format", 89 | "fasta", 90 | ] 91 | 92 | logging.info('Launching subprocess "%s"', " ".join(cmd)) 93 | process = subprocess.Popen( 94 | cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE 95 | ) 96 | 97 | with utils.timing("Kalign query"): 98 | stdout, stderr = process.communicate() 99 | retcode = process.wait() 100 | logging.info( 101 | "Kalign stdout:\n%s\n\nstderr:\n%s\n", 102 | stdout.decode("utf-8"), 103 | stderr.decode("utf-8"), 104 | ) 105 | 106 | if retcode: 107 | raise RuntimeError( 108 | "Kalign failed\nstdout:\n%s\n\nstderr:\n%s\n" 109 | % (stdout.decode("utf-8"), stderr.decode("utf-8")) 110 | ) 111 | 112 | with open(output_a3m_path) as f: 113 | a3m = f.read() 114 | 115 | return a3m 116 | -------------------------------------------------------------------------------- /openfold/data/tools/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 AlQuraishi Laboratory 2 | # Copyright 2021 DeepMind Technologies Limited 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Common utilities for data pipeline tools.""" 17 | import contextlib 18 | import datetime 19 | import logging 20 | import shutil 21 | import tempfile 22 | import time 23 | from typing import Optional 24 | 25 | 26 | @contextlib.contextmanager 27 | def tmpdir_manager(base_dir: Optional[str] = None): 28 | """Context manager that deletes a temporary directory on exit.""" 29 | tmpdir = tempfile.mkdtemp(dir=base_dir) 30 | try: 31 | yield tmpdir 32 | finally: 33 | shutil.rmtree(tmpdir, ignore_errors=True) 34 | 35 | 36 | @contextlib.contextmanager 37 | def timing(msg: str): 38 | logging.info("Started %s", msg) 39 | tic = time.perf_counter() 40 | yield 41 | toc = time.perf_counter() 42 | logging.info("Finished %s in %.3f seconds", msg, toc - tic) 43 | 44 | 45 | def to_date(s: str): 46 | return datetime.datetime( 47 | year=int(s[:4]), month=int(s[5:7]), day=int(s[8:10]) 48 | ) 49 | -------------------------------------------------------------------------------- /openfold/np/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import importlib as importlib 4 | 5 | _files = glob.glob(os.path.join(os.path.dirname(__file__), "*.py")) 6 | __all__ = [ 7 | os.path.basename(f)[:-3] 8 | for f in _files 9 | if os.path.isfile(f) and not f.endswith("__init__.py") 10 | ] 11 | _modules = [(m, importlib.import_module("." 
import os
import glob
import importlib as importlib

# Auto-import every sibling .py module so both `from <pkg> import <mod>`
# and attribute access on the package work without a hand-maintained list.
_files = glob.glob(os.path.join(os.path.dirname(__file__), "*.py"))
__all__ = [
    os.path.basename(f)[:-3]
    for f in _files
    if os.path.isfile(f) and not f.endswith("__init__.py")
]
for _m in __all__:
    globals()[_m] = importlib.import_module("." + _m, __name__)

# Avoid needlessly cluttering the global namespace. The original ran
# `del _m` unconditionally, which raises NameError if the package holds
# no submodules (the loop never binds _m); guard that edge case.
del _files
if __all__:
    del _m
def from_pdb_string(pdb_str: str, chain_id: Optional[str] = None) -> Protein:
    """Takes a PDB string and constructs a Protein object.

    WARNING: All non-standard residue types will be converted into UNK. All
    non-standard atoms will be ignored.

    Args:
      pdb_str: The contents of the pdb file
      chain_id: If None, then the pdb file must contain a single chain (which
        will be parsed). If chain_id is specified (e.g. A), then only that chain
        is parsed.

    Returns:
      A new `Protein` parsed from the pdb contents.

    Raises:
      ValueError: If the PDB contains more than one model or any insertion
        codes.
    """
    # PDBParser.get_structure() treats a bare string as a filesystem *path*,
    # not file contents; wrap the PDB text in a file-like handle so it is
    # parsed directly (the original passed pdb_str through unchanged).
    pdb_fh = io.StringIO(pdb_str)
    parser = PDB.PDBParser(QUIET=True)
    structure = parser.get_structure("none", pdb_fh)
    models = list(structure.get_models())
    if len(models) != 1:
        raise ValueError(
            f"Only single model PDBs are supported. Found {len(models)} models."
        )
    model = models[0]

    atom_positions = []
    aatype = []
    atom_mask = []
    residue_index = []
    chain_ids = []
    b_factors = []

    for chain in model:
        if(chain_id is not None and chain.id != chain_id):
            continue
        for res in chain:
            if res.id[2] != " ":
                raise ValueError(
                    f"PDB contains an insertion code at chain {chain.id} and residue "
                    f"index {res.id[1]}. These are not supported."
                )
            res_shortname = residue_constants.restype_3to1.get(res.resname, "X")
            restype_idx = residue_constants.restype_order.get(
                res_shortname, residue_constants.restype_num
            )
            pos = np.zeros((residue_constants.atom_type_num, 3))
            mask = np.zeros((residue_constants.atom_type_num,))
            res_b_factors = np.zeros((residue_constants.atom_type_num,))
            for atom in res:
                # Atoms outside the canonical atom set (e.g. hydrogens) are
                # silently dropped.
                if atom.name not in residue_constants.atom_types:
                    continue
                pos[residue_constants.atom_order[atom.name]] = atom.coord
                mask[residue_constants.atom_order[atom.name]] = 1.0
                res_b_factors[
                    residue_constants.atom_order[atom.name]
                ] = atom.bfactor
            if np.sum(mask) < 0.5:
                # If no known atom positions are reported for the residue then skip it.
                continue
            aatype.append(restype_idx)
            atom_positions.append(pos)
            atom_mask.append(mask)
            residue_index.append(res.id[1])
            chain_ids.append(chain.id)
            b_factors.append(res_b_factors)

    # Optionally recover template ("PARENT") records written by the
    # prediction pipeline. Uses a dedicated counter instead of re-using the
    # chain_id parameter (the original shadowed it).
    parents = None
    parents_chain_index = None
    if("PARENT" in pdb_str):
        parents = []
        parents_chain_index = []
        parent_chain_idx = 0
        for l in pdb_str.split("\n"):
            if("PARENT" in l):
                if(not "N/A" in l):
                    parent_names = l.split()[1:]
                    parents.extend(parent_names)
                    parents_chain_index.extend([
                        parent_chain_idx for _ in parent_names
                    ])
                parent_chain_idx += 1

    # Map chain letters to their alphabet position (A -> 0, B -> 1, ...);
    # a chain ID outside A-Z raises KeyError here. (The original also built
    # an unused `unique_chain_ids` array, removed.)
    chain_id_mapping = {cid: n for n, cid in enumerate(string.ascii_uppercase)}
    chain_index = np.array([chain_id_mapping[cid] for cid in chain_ids])

    return Protein(
        atom_positions=np.array(atom_positions),
        atom_mask=np.array(atom_mask),
        aatype=np.array(aatype),
        residue_index=np.array(residue_index),
        chain_index=chain_index,
        b_factors=np.array(b_factors),
        parents=parents,
        parents_chain_index=parents_chain_index,
    )
def from_proteinnet_string(proteinnet_str: str) -> Protein:
    """Parses a ProteinNet-format record into a `Protein`.

    Only the [PRIMARY] (sequence), [TERTIARY] (N/CA/C coordinates, in
    picometers) and [MASK] sections are consumed; other sections are ignored.
    """
    tag_re = r'(\[[A-Z]+\]\n)'
    tags = [
        tag.strip() for tag in re.split(tag_re, proteinnet_str) if len(tag) > 0
    ]
    groups = zip(tags[0::2], [l.split('\n') for l in tags[1::2]])

    atoms = ['N', 'CA', 'C']
    aatype = None
    atom_positions = None
    atom_mask = None
    for g in groups:
        if("[PRIMARY]" == g[0]):
            seq = g[1][0].strip()
            # str does not support item assignment, so the original in-place
            # replacement (seq[i] = 'X') raised TypeError at runtime. Rebuild
            # the sequence, substituting unknown residue symbols with 'X'.
            seq = [
                c if c in residue_constants.restypes else 'X' for c in seq
            ]
            aatype = np.array([
                residue_constants.restype_order.get(
                    res_symbol, residue_constants.restype_num
                ) for res_symbol in seq
            ])
        elif("[TERTIARY]" == g[0]):
            # Three whitespace-separated rows: x, y and z coordinates, each
            # holding 3 values (N/CA/C) per residue.
            tertiary = []
            for axis in range(3):
                tertiary.append(list(map(float, g[1][axis].split())))
            tertiary_np = np.array(tertiary)
            atom_positions = np.zeros(
                (len(tertiary[0])//3, residue_constants.atom_type_num, 3)
            ).astype(np.float32)
            for i, atom in enumerate(atoms):
                atom_positions[:, residue_constants.atom_order[atom], :] = (
                    np.transpose(tertiary_np[:, i::3])
                )
            # ProteinNet stores coordinates in picometers.
            atom_positions *= PICO_TO_ANGSTROM
        elif("[MASK]" == g[0]):
            mask = np.array(list(map({'-': 0, '+': 1}.get, g[1][0].strip())))
            atom_mask = np.zeros(
                (len(mask), residue_constants.atom_type_num,)
            ).astype(np.float32)
            for i, atom in enumerate(atoms):
                atom_mask[:, residue_constants.atom_order[atom]] = 1
            atom_mask *= mask[..., None]

    # NOTE(review): assumes a [PRIMARY] section is present — aatype stays
    # None otherwise and len(aatype) below would raise. Confirm with callers.
    return Protein(
        atom_positions=atom_positions,
        atom_mask=atom_mask,
        aatype=aatype,
        residue_index=np.arange(len(aatype)),
        b_factors=None,
    )
def add_pdb_headers(prot: Protein, pdb_str: str) -> str:
    """ Add pdb headers to an existing PDB string. Useful during multi-chain
        recycling

        Args:
          prot: Protein carrying the REMARK / PARENT metadata.
          pdb_str: PDB text to annotate. Its existing REMARK/PARENT lines
            are dropped and regenerated from `prot`.

        Returns:
          The PDB string with one PARENT line per chain (and an optional
          leading REMARK line).
    """
    out_pdb_lines = []
    lines = pdb_str.split('\n')

    remark = prot.remark
    if(remark is not None):
        out_pdb_lines.append(f"REMARK {remark}")

    # Group parents by the chain they belong to; chains with no parents get
    # the placeholder ["N/A"]. (Removed the original's unused `cur_chain`.)
    parents_per_chain = None
    if(prot.parents is not None and len(prot.parents) > 0):
        parents_per_chain = []
        if(prot.parents_chain_index is not None):
            parent_dict = {}
            for p, i in zip(prot.parents, prot.parents_chain_index):
                parent_dict.setdefault(str(i), [])
                parent_dict[str(i)].append(p)

            max_idx = max([int(chain_idx) for chain_idx in parent_dict])
            for i in range(max_idx + 1):
                chain_parents = parent_dict.get(str(i), ["N/A"])
                parents_per_chain.append(chain_parents)
        else:
            parents_per_chain.append(prot.parents)
    else:
        parents_per_chain = [["N/A"]]

    make_parent_line = lambda p: f"PARENT {' '.join(p)}"

    out_pdb_lines.append(make_parent_line(parents_per_chain[0]))

    chain_counter = 0
    for i, l in enumerate(lines):
        if("PARENT" not in l and "REMARK" not in l):
            out_pdb_lines.append(l)
        # A TER record starts a new chain; emit that chain's PARENT line.
        # Guard the lookahead so a TER on the final line cannot raise
        # IndexError (the original indexed lines[i + 1] unconditionally).
        if("TER" in l and i + 1 < len(lines) and not "END" in lines[i + 1]):
            chain_counter += 1
            if(not chain_counter >= len(parents_per_chain)):
                chain_parents = parents_per_chain[chain_counter]
            else:
                chain_parents = ["N/A"]

            out_pdb_lines.append(make_parent_line(chain_parents))

    return '\n'.join(out_pdb_lines)
def to_pdb(prot: Protein) -> str:
    """Converts a `Protein` instance to a PDB string.

    Args:
      prot: The protein to convert to PDB.

    Returns:
      PDB string.

    Raises:
      ValueError: If any aatype index exceeds the known residue types.
    """
    restypes = residue_constants.restypes + ["X"]
    res_1to3 = lambda r: residue_constants.restype_1to3.get(restypes[r], "UNK")
    atom_types = residue_constants.atom_types

    pdb_lines = []

    atom_mask = prot.atom_mask
    aatype = prot.aatype
    atom_positions = prot.atom_positions
    residue_index = prot.residue_index.astype(np.int32)
    b_factors = prot.b_factors
    chain_index = prot.chain_index

    if np.any(aatype > residue_constants.restype_num):
        raise ValueError("Invalid aatypes.")

    headers = get_pdb_headers(prot)
    if(len(headers) > 0):
        pdb_lines.extend(headers)

    n = aatype.shape[0]
    atom_index = 1
    prev_chain_index = 0
    chain_tags = string.ascii_uppercase
    # Add all atom sites.
    for i in range(n):
        res_name_3 = res_1to3(aatype[i])
        # Resolve the chain tag once per residue. The original recomputed it
        # for every atom — including inside the masked-atom skip branch,
        # solely so the TER record below always saw chain_tag bound.
        chain_tag = "A"
        if(chain_index is not None):
            chain_tag = chain_tags[chain_index[i]]

        for atom_name, pos, mask, b_factor in zip(
            atom_types, atom_positions[i], atom_mask[i], b_factors[i]
        ):
            # Skip atoms absent from this residue.
            if mask < 0.5:
                continue

            record_type = "ATOM"
            name = atom_name if len(atom_name) == 4 else f" {atom_name}"
            alt_loc = ""
            insertion_code = ""
            occupancy = 1.00
            element = atom_name[
                0
            ]  # Protein supports only C, N, O, S, this works.
            charge = ""

            # PDB is a columnar format, every space matters here!
            atom_line = (
                f"{record_type:<6}{atom_index:>5} {name:<4}{alt_loc:>1}"
                f"{res_name_3:>3} {chain_tag:>1}"
                f"{residue_index[i]:>4}{insertion_code:>1}   "
                f"{pos[0]:>8.3f}{pos[1]:>8.3f}{pos[2]:>8.3f}"
                f"{occupancy:>6.2f}{b_factor:>6.2f}          "
                f"{element:>2}{charge:>2}"
            )
            pdb_lines.append(atom_line)
            atom_index += 1

        should_terminate = (i == n - 1)
        if(chain_index is not None):
            if(i != n - 1 and chain_index[i + 1] != prev_chain_index):
                should_terminate = True
                prev_chain_index = chain_index[i + 1]

        if(should_terminate):
            # Close the chain.
            chain_end = "TER"
            chain_termination_line = (
                f"{chain_end:<6}{atom_index:>5}      "
                f"{res_1to3(aatype[i]):>3} "
                f"{chain_tag:>1}{residue_index[i]:>4}"
            )
            pdb_lines.append(chain_termination_line)
            atom_index += 1

            if(i != n - 1):
                # "prev" is a misnomer here. This happens at the beginning of
                # each new chain.
                pdb_lines.extend(get_pdb_headers(prot, prev_chain_index))

    pdb_lines.append("END")
    pdb_lines.append("")
    return "\n".join(pdb_lines)
def from_prediction(
    features: FeatureDict,
    result: ModelOutput,
    b_factors: Optional[np.ndarray] = None,
    chain_index: Optional[np.ndarray] = None,
    remark: Optional[str] = None,
    parents: Optional[Sequence[str]] = None,
    parents_chain_index: Optional[Sequence[int]] = None
) -> Protein:
    """Assembles a protein from a prediction.

    Args:
      features: Dictionary holding model inputs.
      result: Dictionary holding model outputs.
      b_factors: (Optional) B-factors to use for the protein; defaults to
        zeros shaped like the predicted atom mask.
      chain_index: (Optional) Chain indices for multi-chain predictions
      remark: (Optional) Remark about the prediction
      parents: (Optional) List of template names
      parents_chain_index: (Optional) Chain index of each entry in `parents`
    Returns:
      A protein instance.
    """
    final_atom_mask = result["final_atom_mask"]
    resolved_b_factors = (
        np.zeros_like(final_atom_mask) if b_factors is None else b_factors
    )

    # PDB residue numbering is 1-based, model features are 0-based.
    return Protein(
        aatype=features["aatype"],
        atom_positions=result["final_atom_positions"],
        atom_mask=final_atom_mask,
        residue_index=features["residue_index"] + 1,
        b_factors=resolved_b_factors,
        chain_index=chain_index,
        remark=remark,
        parents=parents,
        parents_chain_index=parents_chain_index,
    )
import os
import glob
import importlib as importlib

# Auto-import every sibling .py module so both `from <pkg> import <mod>`
# and attribute access on the package work without a hand-maintained list.
_files = glob.glob(os.path.join(os.path.dirname(__file__), "*.py"))
__all__ = [
    os.path.basename(f)[:-3]
    for f in _files
    if os.path.isfile(f) and not f.endswith("__init__.py")
]
for _m in __all__:
    globals()[_m] = importlib.import_module("." + _m, __name__)

# Avoid needlessly cluttering the global namespace. The original ran
# `del _m` unconditionally, which raises NameError if the package holds
# no submodules (the loop never binds _m); guard that edge case.
del _files
if __all__:
    del _m
def fix_pdb(pdbfile, alterations_info):
    """Apply pdbfixer to the contents of a PDB file; return a PDB string result.

    1) Replaces nonstandard residues.
    2) Removes heterogens (non protein residues) including water.
    3) Adds missing residues and missing atoms within existing residues.
    4) Adds hydrogens assuming pH=7.0.
    5) KeepIds is currently true, so the fixer must keep the existing chain and
       residue identifiers. This will fail for some files in wider PDB that have
       invalid IDs.

    Args:
      pdbfile: Input PDB file handle.
      alterations_info: A dict that will store details of changes made.

    Returns:
      A PDB string representing the fixed structure.
    """
    fixer = pdbfixer.PDBFixer(pdbfile=pdbfile)
    # Record what will be replaced before mutating the structure.
    fixer.findNonstandardResidues()
    alterations_info["nonstandard_residues"] = fixer.nonstandardResidues
    fixer.replaceNonstandardResidues()
    _remove_heterogens(fixer, alterations_info, keep_water=False)
    # find* calls must precede addMissingAtoms; their results are also
    # stashed for debugging.
    fixer.findMissingResidues()
    alterations_info["missing_residues"] = fixer.missingResidues
    fixer.findMissingAtoms()
    alterations_info["missing_heavy_atoms"] = fixer.missingAtoms
    alterations_info["missing_terminals"] = fixer.missingTerminals
    # Fixed seed keeps added-atom placement deterministic.
    fixer.addMissingAtoms(seed=0)
    fixer.addMissingHydrogens()
    out_handle = io.StringIO()
    app.PDBFile.writeFile(
        fixer.topology, fixer.positions, out_handle, keepIds=True
    )
    return out_handle.getvalue()
def _replace_met_se(pdb_structure, alterations_info):
    """Replace the Se in any MET residues that were not marked as modified.

    Args:
      pdb_structure: An OpenMM structure whose MET residues are fixed in
        place.
      alterations_info: A dict that will store details of changes made.
    """
    modified_met_residues = []
    for res in pdb_structure.iter_residues():
        # Only methionine residues are candidates for the Se -> S swap.
        if res.get_name_with_spaces().strip() != "MET":
            continue
        s_atom = res.get_atom("SD")
        if s_atom.element_symbol != "Se":
            continue
        s_atom.element_symbol = "S"
        s_atom.element = element.get_by_symbol("S")
        modified_met_residues.append(s_atom.residue_number)
    alterations_info["Se_in_MET"] = modified_met_residues
def _remove_chains_of_length_one(pdb_structure, alterations_info):
    """Removes chains that correspond to a single amino acid.

    A single amino acid in a chain is both N and C terminus. There is no force
    template for this case.

    Args:
      pdb_structure: An OpenMM pdb_structure to modify and fix.
      alterations_info: A dict that will store details of changes made.
    """
    removed_chains = {}
    for model in pdb_structure.iter_models():
        # Partition chains in a single pass instead of filtering twice.
        kept_chains = []
        dropped_ids = []
        for chain in model.iter_chains():
            if len(chain) > 1:
                kept_chains.append(chain)
            else:
                dropped_ids.append(chain.chain_id)
        model.chains = kept_chains
        for cid in dropped_ids:
            model.chains_by_id.pop(cid)
        removed_chains[model.number] = dropped_ids
    alterations_info["removed_chains"] = removed_chains
    def process(
        self, *, prot: protein.Protein
    ) -> Tuple[str, Dict[str, Any], np.ndarray]:
        """Runs Amber relax on a prediction, adds hydrogens, returns PDB string.

        Args:
            prot: Protein to minimize.

        Returns:
            A tuple of (minimized PDB string with headers, debug-info dict
            with energies/attempts/rmsd, per-residue violation mask).
        """
        # Restrained minimization using the settings captured in __init__.
        out = amber_minimize.run_pipeline(
            prot=prot,
            max_iterations=self._max_iterations,
            tolerance=self._tolerance,
            stiffness=self._stiffness,
            exclude_residues=self._exclude_residues,
            max_outer_iterations=self._max_outer_iterations,
            use_gpu=self._use_gpu,
        )
        min_pos = out["pos"]
        start_pos = out["posinit"]
        # Displacement between initial and minimized coordinates, normalized
        # by the leading axis of the position array.
        rmsd = np.sqrt(np.sum((start_pos - min_pos) ** 2) / start_pos.shape[0])
        debug_data = {
            "initial_energy": out["einit"],
            "final_energy": out["efinal"],
            "attempts": out["min_attempts"],
            "rmsd": rmsd,
        }
        # Re-clean the input protein, then overwrite its coordinates and
        # B-factors with the minimized values.
        pdb_str = amber_minimize.clean_protein(prot)
        min_pdb = utils.overwrite_pdb_coordinates(pdb_str, min_pos)
        min_pdb = utils.overwrite_b_factors(min_pdb, prot.b_factors)
        # Sanity check: minimization must not add/remove non-terminal atoms.
        utils.assert_equal_nonterminal_atom_types(
            protein.from_pdb_string(min_pdb).atom_mask, prot.atom_mask
        )
        violations = out["structural_violations"][
            "total_per_residue_violations_mask"
        ]

        min_pdb = protein.add_pdb_headers(prot, min_pdb)

        return min_pdb, debug_data, violations
def overwrite_b_factors(pdb_str: str, bfactors: np.ndarray) -> str:
    """Overwrites the B-factors in pdb_str with contents of bfactors array.

    Args:
      pdb_str: An input PDB string.
      bfactors: A numpy array of per-residue B-factors. We assume that the
        nonzero entries of a row are identical. NOTE(review): the indexing
        below treats the first axis as residues and the last as atom types
        (i.e. [num_res, 37]); the original docstring's [1, n_residues, 37]
        shape would not match it — confirm against callers.

    Returns:
      A new PDB string with the B-factors replaced.

    Raises:
      ValueError: If the last dimension of `bfactors` is not the atom-type
        count, or if the PDB contains more residues than `bfactors` rows.
    """
    if bfactors.shape[-1] != residue_constants.atom_type_num:
        raise ValueError(
            f"Invalid final dimension size for bfactors: {bfactors.shape[-1]}."
        )

    parser = PDB.PDBParser(QUIET=True)
    handle = io.StringIO(pdb_str)
    structure = parser.get_structure("", handle)

    curr_resid = ("", "", "")
    idx = -1
    for atom in structure.get_atoms():
        atom_resid = atom.parent.get_id()
        if atom_resid != curr_resid:
            idx += 1
            if idx >= bfactors.shape[0]:
                # f-prefix added: the original was a plain string, so the
                # {shape}/{idx} placeholders were emitted literally (and
                # `shape` was not even a defined name).
                raise ValueError(
                    "Index into bfactors exceeds number of residues. "
                    f"B-factors shape: {bfactors.shape}, idx: {idx}."
                )
            curr_resid = atom_resid
        # Every atom of a residue receives that residue's CA-slot value.
        atom.bfactor = bfactors[idx, residue_constants.atom_order["CA"]]

    new_pdb = io.StringIO()
    pdb_io = PDB.PDBIO()
    pdb_io.set_structure(structure)
    pdb_io.save(new_pdb)
    return new_pdb.getvalue()
def assert_equal_nonterminal_atom_types(
    atom_mask: np.ndarray, ref_atom_mask: np.ndarray
):
    """Checks that pre- and post-minimized proteins have same atom set.

    Args:
      atom_mask: Atom-presence mask of the minimized protein; last axis
        indexes atom types.
      ref_atom_mask: Reference mask of the same shape.

    Raises:
      AssertionError: If the masks differ anywhere outside the OXT slot.
    """
    # Ignore any terminal OXT atoms which may have been added by minimization.
    oxt = residue_constants.atom_order["OXT"]
    # np.bool was removed in NumPy 1.24; the builtin `bool` is the
    # documented replacement and behaves identically here.
    no_oxt_mask = np.ones(shape=atom_mask.shape, dtype=bool)
    no_oxt_mask[..., oxt] = False
    np.testing.assert_almost_equal(
        ref_atom_mask[no_oxt_mask], atom_mask[no_oxt_mask]
    )
def pseudo_beta_fn(aatype, all_atom_positions, all_atom_masks):
    """Select pseudo-beta coordinates: CB for every residue, CA for glycine
    (which has no CB). When `all_atom_masks` is given, the matching
    per-residue mask is returned as well.
    """
    gly_mask = aatype == rc.restype_order["G"]
    ca_idx = rc.atom_order["CA"]
    cb_idx = rc.atom_order["CB"]

    # Broadcast the glycine mask over the trailing xyz dimension.
    gly_mask_xyz = gly_mask[..., None].expand(
        *((-1,) * len(gly_mask.shape)), 3
    )
    pseudo_beta = torch.where(
        gly_mask_xyz,
        all_atom_positions[..., ca_idx, :],
        all_atom_positions[..., cb_idx, :],
    )

    if all_atom_masks is None:
        return pseudo_beta

    pseudo_beta_mask = torch.where(
        gly_mask,
        all_atom_masks[..., ca_idx],
        all_atom_masks[..., cb_idx],
    )
    return pseudo_beta, pseudo_beta_mask
def build_template_pair_feat(
    batch,
    min_bin, max_bin, no_bins,
    use_unit_vector=False,
    eps=1e-20, inf=1e8
):
    """Builds pairwise template features: a squared-distance distogram,
    per-pair masks, row/column residue-type one-hots and (optionally)
    inter-residue unit vectors, concatenated along the channel axis.

    Args:
        batch: template feature dict (pseudo-beta positions/mask, aatype,
            all-atom positions/mask).
        min_bin, max_bin, no_bins: distogram bin edges (in distance units;
            squared internally).
        use_unit_vector: if False the unit-vector channels are zeroed out.
        eps: numerical fuzz for rsqrt/frame construction.
        inf: upper bound of the last distogram bin.
    """
    template_mask = batch["template_pseudo_beta_mask"]
    template_mask_2d = template_mask[..., None] * template_mask[..., None, :]

    # Compute distogram (this seems to differ slightly from Alg. 5)
    tpb = batch["template_pseudo_beta"]
    dgram = torch.sum(
        (tpb[..., None, :] - tpb[..., None, :, :]) ** 2, dim=-1, keepdim=True
    )
    # Bin edges are squared so dgram can stay in squared-distance space.
    lower = torch.linspace(min_bin, max_bin, no_bins, device=tpb.device) ** 2
    upper = torch.cat([lower[1:], lower.new_tensor([inf])], dim=-1)
    dgram = ((dgram > lower) * (dgram < upper)).type(dgram.dtype)

    to_concat = [dgram, template_mask_2d[..., None]]

    aatype_one_hot = nn.functional.one_hot(
        batch["template_aatype"],
        rc.restype_num + 2,
    )

    # Tile the residue-type one-hot along rows and along columns.
    n_res = batch["template_aatype"].shape[-1]
    to_concat.append(
        aatype_one_hot[..., None, :, :].expand(
            *aatype_one_hot.shape[:-2], n_res, -1, -1
        )
    )
    to_concat.append(
        aatype_one_hot[..., None, :].expand(
            *aatype_one_hot.shape[:-2], -1, n_res, -1
        )
    )

    # Backbone frames from N/CA/C atom positions.
    n, ca, c = [rc.atom_order[a] for a in ["N", "CA", "C"]]
    rigids = Rigid.make_transform_from_reference(
        n_xyz=batch["template_all_atom_positions"][..., n, :],
        ca_xyz=batch["template_all_atom_positions"][..., ca, :],
        c_xyz=batch["template_all_atom_positions"][..., c, :],
        eps=eps,
    )
    # Vector from each residue's frame to every other residue's CA, expressed
    # in the local frame.
    points = rigids.get_trans()[..., None, :, :]
    rigid_vec = rigids[..., None].invert_apply(points)

    inv_distance_scalar = torch.rsqrt(eps + torch.sum(rigid_vec ** 2, dim=-1))

    # Backbone-completeness mask (all three frame atoms present); note this
    # intentionally rebinds template_mask/template_mask_2d from here on.
    t_aa_masks = batch["template_all_atom_mask"]
    template_mask = (
        t_aa_masks[..., n] * t_aa_masks[..., ca] * t_aa_masks[..., c]
    )
    template_mask_2d = template_mask[..., None] * template_mask[..., None, :]

    inv_distance_scalar = inv_distance_scalar * template_mask_2d
    unit_vector = rigid_vec * inv_distance_scalar[..., None]

    if(not use_unit_vector):
        unit_vector = unit_vector * 0.

    to_concat.extend(torch.unbind(unit_vector[..., None, :], dim=-1))
    to_concat.append(template_mask_2d[..., None])

    act = torch.cat(to_concat, dim=-1)
    act = act * template_mask_2d[..., None]

    return act
template_mask_2d = template_mask[..., None] * template_mask[..., None, :] 146 | 147 | inv_distance_scalar = inv_distance_scalar * template_mask_2d 148 | unit_vector = rigid_vec * inv_distance_scalar[..., None] 149 | 150 | if(not use_unit_vector): 151 | unit_vector = unit_vector * 0. 152 | 153 | to_concat.extend(torch.unbind(unit_vector[..., None, :], dim=-1)) 154 | to_concat.append(template_mask_2d[..., None]) 155 | 156 | act = torch.cat(to_concat, dim=-1) 157 | act = act * template_mask_2d[..., None] 158 | 159 | return act 160 | 161 | 162 | def build_extra_msa_feat(batch): 163 | msa_1hot = nn.functional.one_hot(batch["extra_msa"], 23) 164 | msa_feat = [ 165 | msa_1hot, 166 | batch["extra_has_deletion"].unsqueeze(-1), 167 | batch["extra_deletion_value"].unsqueeze(-1), 168 | ] 169 | return torch.cat(msa_feat, dim=-1) 170 | 171 | 172 | def torsion_angles_to_frames( 173 | r: Rigid, 174 | alpha: torch.Tensor, 175 | aatype: torch.Tensor, 176 | rrgdf: torch.Tensor, 177 | ): 178 | # [*, N, 8, 4, 4] 179 | default_4x4 = rrgdf[aatype, ...] 180 | 181 | # [*, N, 8] transformations, i.e. 182 | # One [*, N, 8, 3, 3] rotation matrix and 183 | # One [*, N, 8, 3] translation matrix 184 | default_r = r.from_tensor_4x4(default_4x4) 185 | 186 | bb_rot = alpha.new_zeros((*((1,) * len(alpha.shape[:-1])), 2)) 187 | bb_rot[..., 1] = 1 188 | 189 | # [*, N, 8, 2] 190 | alpha = torch.cat( 191 | [bb_rot.expand(*alpha.shape[:-2], -1, -1), alpha], dim=-2 192 | ) 193 | 194 | # [*, N, 8, 3, 3] 195 | # Produces rotation matrices of the form: 196 | # [ 197 | # [1, 0 , 0 ], 198 | # [0, a_2,-a_1], 199 | # [0, a_1, a_2] 200 | # ] 201 | # This follows the original code rather than the supplement, which uses 202 | # different indices. 
203 | 204 | all_rots = alpha.new_zeros(default_r.get_rots().get_rot_mats().shape) 205 | all_rots[..., 0, 0] = 1 206 | all_rots[..., 1, 1] = alpha[..., 1] 207 | all_rots[..., 1, 2] = -alpha[..., 0] 208 | all_rots[..., 2, 1:] = alpha 209 | 210 | all_rots = Rigid(Rotation(rot_mats=all_rots), None) 211 | 212 | all_frames = default_r.compose(all_rots) 213 | 214 | chi2_frame_to_frame = all_frames[..., 5] 215 | chi3_frame_to_frame = all_frames[..., 6] 216 | chi4_frame_to_frame = all_frames[..., 7] 217 | 218 | chi1_frame_to_bb = all_frames[..., 4] 219 | chi2_frame_to_bb = chi1_frame_to_bb.compose(chi2_frame_to_frame) 220 | chi3_frame_to_bb = chi2_frame_to_bb.compose(chi3_frame_to_frame) 221 | chi4_frame_to_bb = chi3_frame_to_bb.compose(chi4_frame_to_frame) 222 | 223 | all_frames_to_bb = Rigid.cat( 224 | [ 225 | all_frames[..., :5], 226 | chi2_frame_to_bb.unsqueeze(-1), 227 | chi3_frame_to_bb.unsqueeze(-1), 228 | chi4_frame_to_bb.unsqueeze(-1), 229 | ], 230 | dim=-1, 231 | ) 232 | 233 | all_frames_to_global = r[..., None].compose(all_frames_to_bb) 234 | 235 | return all_frames_to_global 236 | 237 | 238 | def frames_and_literature_positions_to_atom14_pos( 239 | r: Rigid, 240 | aatype: torch.Tensor, 241 | default_frames, 242 | group_idx, 243 | atom_mask, 244 | lit_positions, 245 | ): 246 | # [*, N, 14, 4, 4] 247 | default_4x4 = default_frames[aatype, ...] 248 | 249 | # [*, N, 14] 250 | group_mask = group_idx[aatype, ...] 251 | 252 | # [*, N, 14, 8] 253 | group_mask = nn.functional.one_hot( 254 | group_mask, 255 | num_classes=default_frames.shape[-3], 256 | ) 257 | 258 | # [*, N, 14, 8] 259 | t_atoms_to_global = r[..., None, :] * group_mask 260 | 261 | # [*, N, 14] 262 | t_atoms_to_global = t_atoms_to_global.map_tensor_fn( 263 | lambda x: torch.sum(x, dim=-1) 264 | ) 265 | 266 | # [*, N, 14, 1] 267 | atom_mask = atom_mask[aatype, ...].unsqueeze(-1) 268 | 269 | # [*, N, 14, 3] 270 | lit_positions = lit_positions[aatype, ...] 
    pred_positions = t_atoms_to_global.apply(lit_positions)
    pred_positions = pred_positions * atom_mask

    return pred_positions
--------------------------------------------------------------------------------
/openfold/utils/tensor_utils.py:
--------------------------------------------------------------------------------
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from functools import partial
import logging
from typing import Tuple, List, Callable, Any, Dict, Sequence, Optional

import torch
import torch.nn as nn


def add(m1, m2, inplace):
    """Return m1 + m2, adding in place into m1 when ``inplace`` is True."""
    # The first operation in a checkpoint can't be in-place, but it's
    # nice to have in-place addition during inference. Thus...
    if(not inplace):
        m1 = m1 + m2
    else:
        m1 += m2

    return m1


def permute_final_dims(tensor: torch.Tensor, inds: List[int]):
    """Permute the last ``len(inds)`` dims of ``tensor`` according to
    ``inds`` (indices given relative to those final dims); leading batch
    dims are untouched."""
    zero_index = -1 * len(inds)
    first_inds = list(range(len(tensor.shape[:zero_index])))
    return tensor.permute(first_inds + [zero_index + i for i in inds])


def flatten_final_dims(t: torch.Tensor, no_dims: int):
    """Flatten the last ``no_dims`` dims of ``t`` into a single dim."""
    return t.reshape(t.shape[:-no_dims] + (-1,))


def masked_mean(mask, value, dim, eps=1e-4):
    """Mean of ``value`` over ``dim`` weighted by ``mask``; ``eps`` guards
    division when the mask sums to zero."""
    mask = mask.expand(*value.shape)
    return torch.sum(mask * value, dim=dim) / (eps + torch.sum(mask, dim=dim))


def pts_to_distogram(pts, min_bin=2.3125, max_bin=21.6875, no_bins=64):
    """Bucketize pairwise Euclidean distances of ``pts`` into ``no_bins``
    bins (``no_bins - 1`` boundaries linspaced on [min_bin, max_bin])."""
    boundaries = torch.linspace(
        min_bin, max_bin, no_bins - 1, device=pts.device
    )
    dists = torch.sqrt(
        torch.sum((pts.unsqueeze(-2) - pts.unsqueeze(-3)) ** 2, dim=-1)
    )
    return torch.bucketize(dists, boundaries)


def dict_multimap(fn, dicts):
    """Apply ``fn`` to the list of corresponding leaf values across
    ``dicts``, recursing into nested dicts; structure is taken from
    ``dicts[0]``."""
    first = dicts[0]
    new_dict = {}
    for k, v in first.items():
        all_v = [d[k] for d in dicts]
        if type(v) is dict:
            new_dict[k] = dict_multimap(fn, all_v)
        else:
            new_dict[k] = fn(all_v)

    return new_dict


def one_hot(x, v_bins):
    """One-hot of the nearest bin in ``v_bins`` for each element of ``x``
    (nearest by absolute difference); returns float."""
    reshaped_bins = v_bins.view(((1,) * len(x.shape)) + (len(v_bins),))
    diffs = x[..., None] - reshaped_bins
    am = torch.argmin(torch.abs(diffs), dim=-1)
    return nn.functional.one_hot(am, num_classes=len(v_bins)).float()


def batched_gather(data, inds, dim=0, no_batch_dims=0):
    """Gather along ``dim`` of ``data`` with per-batch indices ``inds``,
    treating the first ``no_batch_dims`` dims as batch dims."""
    ranges = []
    for i, s in enumerate(data.shape[:no_batch_dims]):
        r = torch.arange(s)
        # Shape the arange so it broadcasts against inds over batch dim i.
        r = r.view(*(*((1,) * i), -1, *((1,) * (len(inds.shape) - i - 1))))
        ranges.append(r)

    remaining_dims = [
        slice(None) for _ in range(len(data.shape) - no_batch_dims)
    ]
    remaining_dims[dim - no_batch_dims if dim >= 0 else dim] = inds
    ranges.extend(remaining_dims)

    # NOTE(review): indexing with a *list* of index objects relies on
    # legacy advanced-indexing behavior; newer PyTorch/NumPy expect a
    # tuple here — confirm against the torch version in use.
    return data[ranges]


# With tree_map, a poor man's JAX tree_map
def dict_map(fn, dic, leaf_type):
    """Recursively apply ``fn`` to every ``leaf_type`` leaf of dict ``dic``."""
    new_dict = {}
    for k, v in dic.items():
        if type(v) is dict:
            new_dict[k] = dict_map(fn, v, leaf_type)
        else:
            new_dict[k] = tree_map(fn, v, leaf_type)

    return new_dict


def tree_map(fn, tree, leaf_type):
    """Apply ``fn`` to every ``leaf_type`` leaf of a nested dict/list/tuple
    ``tree``, preserving structure; raises ValueError on other node types."""
    if isinstance(tree, dict):
        return dict_map(fn, tree, leaf_type)
    elif isinstance(tree, list):
        return [tree_map(fn, x, leaf_type) for x in tree]
    elif isinstance(tree, tuple):
        return tuple([tree_map(fn, x, leaf_type) for x in tree])
    elif isinstance(tree, leaf_type):
        return fn(tree)
    else:
        print(type(tree))
        raise ValueError("Not supported")


tensor_tree_map = partial(tree_map, leaf_type=torch.Tensor)
--------------------------------------------------------------------------------
/outputs/autoregressive_score_w_seq/1BC8_1.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dauparas/LigandMPNN/26ec57ac976ade5379920dbd43c7f97a91cf82de/outputs/autoregressive_score_w_seq/1BC8_1.pt
--------------------------------------------------------------------------------
/outputs/autoregressive_score_wo_seq/1BC8_1.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dauparas/LigandMPNN/26ec57ac976ade5379920dbd43c7f97a91cf82de/outputs/autoregressive_score_wo_seq/1BC8_1.pt
--------------------------------------------------------------------------------
/outputs/batch_size/seqs/1BC8.fa:
--------------------------------------------------------------------------------
>1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=3, number_of_batches=5, model_path=./model_params/proteinmpnn_v_48_020.pt
MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.4090, ligand_confidence=0.4090, seq_rec=0.5376 4 | GTSNISLYEFLLKLLSKPEYKDIIEWTSDNGEFKLKKPEAVAKLWGEEKGEPDMNYKKMEKELKKYEKKKIIEKVKGKKNHYKFVNYPEILNK 5 | >1BC8, id=2, T=0.1, seed=111, overall_confidence=0.3984, ligand_confidence=0.3984, seq_rec=0.4946 6 | GKSSMSLPEFLLKLLSDPKYKDIIEWTSDNGTFKLKDPEAVAKLWGKEKGRPDMNYEKMYELLKKYEEKGIIKEVKGEKNTYKFVNYPEYLYP 7 | >1BC8, id=3, T=0.1, seed=111, overall_confidence=0.4034, ligand_confidence=0.4034, seq_rec=0.5054 8 | GTSNISLYEFLLELLSDPKYKDIIEWISDNGEFKLKDPEAVAKLWGKVKGKPDMNYEEFEKLLKEYEKKKIIEKVEGKPYTYKFVNYPEILNK 9 | >1BC8, id=4, T=0.1, seed=111, overall_confidence=0.3842, ligand_confidence=0.3842, seq_rec=0.4839 10 | GVSSMSLWEFLLELLSKPEYDDYIRWVSDNGEFELKDPEKVAKLWGEKKGEPDMNYEKLNKLLEKYEKKKIIEKVEGEPNVYRFVNYPEYLYP 11 | >1BC8, id=5, T=0.1, seed=111, overall_confidence=0.3910, ligand_confidence=0.3910, seq_rec=0.4731 12 | GKSKISLHEFLDKLLSDPKYDDIISWTSDDGEFELKDPEKVAKLWGKVKGKPDMNYEELEKLLDKYEKKGIIEKVKGKPNTYKFVNYPEYKFP 13 | >1BC8, id=6, T=0.1, seed=111, overall_confidence=0.4045, ligand_confidence=0.4045, seq_rec=0.5054 14 | GKSSISLHEFLLKLLSKPEYADIIRWVSDNGEFELVKPEEVAKLWGKVKGKPDMNYEELKKELKKYEKKGIIKEVKGKPNVYQFVNYPEILYP 15 | >1BC8, id=7, T=0.1, seed=111, overall_confidence=0.3844, ligand_confidence=0.3844, seq_rec=0.4946 16 | GTSSMSLWEFILKLLSDPKYKDIISWTSDNGEFELKDPEKLAKLYGKLKGKPNMNKKELFKELDKYKEKKIIEKVEGKKNTYKFVNYPEILNP 17 | >1BC8, id=8, T=0.1, seed=111, overall_confidence=0.3852, ligand_confidence=0.3852, seq_rec=0.4946 18 | GMSSMSLWEFLLKLLSKPEYKDIIEWVSDDGEFRLKKPEEVAKLWGKEKGEPDMNATKLFKELDKYEEKKIIERVEGEPNTYKFVNYPEYLYP 19 | >1BC8, id=9, T=0.1, seed=111, overall_confidence=0.3826, ligand_confidence=0.3826, seq_rec=0.5161 20 | GTSSISLPEFLLELLSKPEYKDIIEWTSDNGTFKLVDPEKVAKLWGKVKGKPNMNAKEMFKELKKYEKKKIIEEVPGEPNTYKFVKYPEILNP 21 | >1BC8, id=10, T=0.1, seed=111, overall_confidence=0.4005, ligand_confidence=0.4005, 
seq_rec=0.4624 22 | HMSHMSLHEFLLELLSKPEYADLIRWTSDDGTFELVKPEEVAKLWGERVGRPDMNAEKMFEELKKLEEKGIIEEVPGKPNTYRFVNYPEILLP 23 | >1BC8, id=11, T=0.1, seed=111, overall_confidence=0.3887, ligand_confidence=0.3887, seq_rec=0.4839 24 | GVSSISLYEFLYELLSDPKYADIIEWVSDNGEFRLKKPEAVAKLWGEKKGIPNMNYKKLYKELKKYEKKKIIEKVKGKKNTYKFVNYPEYLYP 25 | >1BC8, id=12, T=0.1, seed=111, overall_confidence=0.4031, ligand_confidence=0.4031, seq_rec=0.5054 26 | GKSKISLWEFLLKLLSDEKYKDYIEWTSDNGEFELKKPEAVAKLWGKEKGEPDMNYKKLYKELKKYEKKKIIEEVKGKKNTYKFVNYPEYLNP 27 | >1BC8, id=13, T=0.1, seed=111, overall_confidence=0.3932, ligand_confidence=0.3932, seq_rec=0.5161 28 | GTSSMSLPDFLLELLSDPKYKDYIEWVSDNGEFRLKKPEEVAKLWGKVKGKPDMNYKKLDEELKKYEAKGIIKRVEGKPNTYKFVNYPEILNP 29 | >1BC8, id=14, T=0.1, seed=111, overall_confidence=0.4077, ligand_confidence=0.4077, seq_rec=0.5054 30 | GTSSISLHEFLLELLSDPKYKDIIEWTSDNGEFVLKDPEAVAKLWGKVKGEPDMNYEKLYKELKKYEKKKIIKEVEGKENHYKFVNYPEILYP 31 | >1BC8, id=15, T=0.1, seed=111, overall_confidence=0.3999, ligand_confidence=0.3999, seq_rec=0.5161 32 | GTSSISLPEFLLKLLSDKKYEDIITWTSDDGTFKLKKPEEVAKLWGEVKGKPDMNYEKMYKELDKYKEKKIIEKVEGEPNTYKFVNYPEYLNP -------------------------------------------------------------------------------- /outputs/bias_AA_per_residue_multi/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.3843, ligand_confidence=0.3843, seq_rec=0.5161 4 | AWSSISLHEFLLKLLSDPAYKDIIEWTSDDGEFKLKKPEAVAKLWGEEKGEPDMNYKKMEKELKKYEKKKIIEKVKGKPNHYKFVNYPEILFP -------------------------------------------------------------------------------- /outputs/bias_AA_per_residue_multi/seqs/4GYT.fa: 
-------------------------------------------------------------------------------- 1 | >4GYT, T=0.1, seed=111, num_res=354, num_ligand_res=354, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | SLHLPKYDDFVQSISVLALTMSGSELHGIMCGYLCAGADSQGEAYIRALLNNKKDEQSRNALLSMFSVFSISQQQMNNFDFEFEMLLPDDDESLVTRAQAFSEWCEGFTQGLTIAGVGMEQFYEEESQDALQHLMEFAELDCESLEVGEEDERALMEVSEYTRMAVLRLHSDLVLHE:SLHLPKYDDFVQSISVLALTMSGSELHGIMCGYLCAGADSQGEAYIRALLNNKKDEQSRNALLSMFSVFSISQQQMNNFDFEFEMLLPDDDESLVTRAQAFSEWCEGFTQGLTIAGVGMEQFYEEESQDALQHLMEFAELDCESLEVGEEDERALMEVSEYTRMAVLRLHSDLVLHE 3 | >4GYT, id=1, T=0.1, seed=111, overall_confidence=0.4325, ligand_confidence=0.4325, seq_rec=0.3955 4 | YMTLPPYAEFAAAIAPLELPVSPSELAGLMLGFLAAGKTELGRAWIRALARGRTDAATQAALAALLEVFDILERQLNDPALELELLLPPADAPLATRAAALAAFARGFVRGLELAGVGPESFATEASRAALERARALAALDPSTLRAGPADEARLEADEAWLRESILAIRRDIAENG:SLTLPPYDEFAAAIAPLELPISPSALAGLMLGYLVAGKTELGRRWIRSLLRGRTDPASQAALAALLAVFDILEAQLTDPSLELELLLPPEDASLRERARALAEFAAGFALGLELAGVDRESFAREESRRDYERILELARLDVSTLKEGEEDRARLAALEAWLRDSIVRLARDLREHG -------------------------------------------------------------------------------- /outputs/chains_to_design/seqs/4GYT.fa: -------------------------------------------------------------------------------- 1 | >4GYT, T=0.1, seed=111, num_res=177, num_ligand_res=11, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/ligandmpnn_v_32_010_25.pt 2 | SLHLPKYDDFVQSISVLALTMSGSELHGIMCGYLCAGADSQGEAYIRALLNNKKDEQSRNALLSMFSVFSISQQQMNNFDFEFEMLLPDDDESLVTRAQAFSEWCEGFTQGLTIAGVGMEQFYEEESQDALQHLMEFAELDCESLEVGEEDERALMEVSEYTRMAVLRLHSDLVLHE:SLHLPKYDDFVQSISVLALTMSGSELHGIMCGYLCAGADSQGEAYIRALLNNKKDEQSRNALLSMFSVFSISQQQMNNFDFEFEMLLPDDDESLVTRAQAFSEWCEGFTQGLTIAGVGMEQFYEEESQDALQHLMEFAELDCESLEVGEEDERALMEVSEYTRMAVLRLHSDLVLHE 3 | >4GYT, id=1, T=0.1, seed=111, overall_confidence=0.4349, ligand_confidence=0.4305, seq_rec=0.4576 4 | 
SLHLPKYDDFVQSISVLALTMSGSELHGIMCGYLCAGADSQGEAYIRALLNNKKDEQSRNALLSMFSVFSISQQQMNNFDFEFEMLLPDDDESLVTRAQAFSEWCEGFTQGLTIAGVGMEQFYEEESQDALQHLMEFAELDCESLEVGEEDERALMEVSEYTRMAVLRLHSDLVLHE:ALSLPPYDEFAASIAVLKLTISASELHGIMLGFLTAGAVEQGRRFIESLAKGRTDPATQAALAALMEVFDISERQLNDPSLELEMLLPPEEASLRERCRAFAEFCRGFVLGLTLAGVGEEEFAREESRRAYRRFVELADWDCSRLREGPEDRARLEALREEARRAIVALRRDLRETK -------------------------------------------------------------------------------- /outputs/default/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.3987, ligand_confidence=0.3987, seq_rec=0.5161 4 | GTSSISLHEFLLKLLSDPAYKDIIEWTSDDGEFKLKKPEAVAKLWGEEKGEPDMNYKKMEKELKKYEKKKIIEKVKGKPNHYKFVNYPEILFP -------------------------------------------------------------------------------- /outputs/fasta_seq_separation/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=73564, num_res=93, num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=73564, overall_confidence=0.3789, ligand_confidence=0.3789, seq_rec=0.5054 4 | GTSSISLWEFLLKLLSDKKYDDIITWTSNNGEFKLKDPEKVAKLWGKEKGKPDMNYEELYKLLKEYEKKKIIERVKGKPNTYKFVNYPEYLNP -------------------------------------------------------------------------------- /outputs/file_ending/seqs/1BC8_xyz.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, 
num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.3987, ligand_confidence=0.3987, seq_rec=0.5161 4 | GTSSISLHEFLLKLLSDPAYKDIIEWTSDDGEFKLKKPEAVAKLWGEEKGEPDMNYKKMEKELKKYEKKKIIEKVKGKPNHYKFVNYPEILFP -------------------------------------------------------------------------------- /outputs/fix_residues/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=83, num_ligand_res=83, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.1000, ligand_confidence=0.1000, seq_rec=0.0361 4 | MDSAITLWQFAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA -------------------------------------------------------------------------------- /outputs/fixed_residues_multi/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=86, num_ligand_res=86, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.4057, ligand_confidence=0.4057, seq_rec=0.5116 4 | MDSAISLHEFLLKLLSKPEYKHIIEWTSDNGEFKLKDPEAVAKLWGEEKGEPDMNWKKMEKELKKYEKKKIIEKVKGKPNHYKFVNYPEILFP -------------------------------------------------------------------------------- /outputs/fixed_residues_multi/seqs/4GYT.fa: 
-------------------------------------------------------------------------------- 1 | >4GYT, T=0.1, seed=111, num_res=346, num_ligand_res=346, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | SLHLPKYDDFVQSISVLALTMSGSELHGIMCGYLCAGADSQGEAYIRALLNNKKDEQSRNALLSMFSVFSISQQQMNNFDFEFEMLLPDDDESLVTRAQAFSEWCEGFTQGLTIAGVGMEQFYEEESQDALQHLMEFAELDCESLEVGEEDERALMEVSEYTRMAVLRLHSDLVLHE:SLHLPKYDDFVQSISVLALTMSGSELHGIMCGYLCAGADSQGEAYIRALLNNKKDEQSRNALLSMFSVFSISQQQMNNFDFEFEMLLPDDDESLVTRAQAFSEWCEGFTQGLTIAGVGMEQFYEEESQDALQHLMEFAELDCESLEVGEEDERALMEVSEYTRMAVLRLHSDLVLHE 3 | >4GYT, id=1, T=0.1, seed=111, overall_confidence=0.4273, ligand_confidence=0.4273, seq_rec=0.3931 4 | SLHLPKYAEFEAAIAPLNLPVSASELAGLMLGFLAAGKTELGRAWIRALSNGRTDAATQAALAALLEVFDILEKQLNNPEYPLELLLPPADAPLATRAAALAAFARGFVRGLELAGVGRESFKTEASKAALDRIRALAALDPSTLRAGPADEARLDADRAWLIESIRAIHKDISENG:ALSLPPYDEFAAAIAPLELPVSASELAGLMLGYLVAGKTELGRRWIRALARGRTDPATQAALAALLAVFDTLEAQLTDPSLELELLLPPAGASLRARARALAEFARGFVLGLELAGVEKESFAEEESREAYERILELARLDVSTLREGPEDEARLAALEAWLRDSIVRLHRDLREHG -------------------------------------------------------------------------------- /outputs/global_bias/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.1321, ligand_confidence=0.1321, seq_rec=0.2903 4 | PPSPICLWEWLWCLLCCPKWCPWCCWCCCCGCFCLCKPEWCCKCWGWCKCEPDMNWKKMCKCLCKCCPLKIICPCCCCPCCWRFCCWPECCWP -------------------------------------------------------------------------------- /outputs/global_label_membrane_mpnn_0/seqs/1BC8.fa: 
-------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/global_label_membrane_mpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.4217, ligand_confidence=0.4217, seq_rec=0.5591 4 | GMSKKTLYEFLLELLKDPKYDDIIKWTSNDGEFQLLKPEEVAKLWGKEKGKPNMNYEKLYKELKKLEEKKIIERVEGKPNVYKFVNYPEILNP -------------------------------------------------------------------------------- /outputs/global_omit/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.1011, ligand_confidence=0.1011, seq_rec=0.1505 4 | KKKKKKKEEAEEEEEKKKKAKKKKKKKKKEEKEKKKKAEAEAKKEKEEKKKKKEKAKKEKKKAKKKKKKKKEEKEKKKKKEKKKKKKKEEEKK -------------------------------------------------------------------------------- /outputs/homooligomer/seqs/4GYT.fa: -------------------------------------------------------------------------------- 1 | >4GYT, T=0.1, seed=111, num_res=354, num_ligand_res=22, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=2, model_path=./model_params/ligandmpnn_v_32_010_25.pt 2 | 
SLHLPKYDDFVQSISVLALTMSGSELHGIMCGYLCAGADSQGEAYIRALLNNKKDEQSRNALLSMFSVFSISQQQMNNFDFEFEMLLPDDDESLVTRAQAFSEWCEGFTQGLTIAGVGMEQFYEEESQDALQHLMEFAELDCESLEVGEEDERALMEVSEYTRMAVLRLHSDLVLHE:SLHLPKYDDFVQSISVLALTMSGSELHGIMCGYLCAGADSQGEAYIRALLNNKKDEQSRNALLSMFSVFSISQQQMNNFDFEFEMLLPDDDESLVTRAQAFSEWCEGFTQGLTIAGVGMEQFYEEESQDALQHLMEFAELDCESLEVGEEDERALMEVSEYTRMAVLRLHSDLVLHE 3 | >4GYT, id=1, T=0.1, seed=111, overall_confidence=0.4180, ligand_confidence=0.3824, seq_rec=0.4689 4 | SLSLPPYEEFEKSIAVLKLPISASELAGIMLGFLTAGAEEQGRAFIKSLANGRTDEETQAALKALMQVFDILKKQLTDPSLELEMLLPPEDASLEERCRAFADFCRGFVKGLTLAGVGEDDFKSEESKAALRRLKELADLDCSTLREGPADRARLEALREETRRDILRLAEDLRNSG:SLSLPPYEEFEKSIAVLKLPISASELAGIMLGFLTAGAEEQGRAFIKSLANGRTDEETQAALKALMQVFDILKKQLTDPSLELEMLLPPEDASLEERCRAFADFCRGFVKGLTLAGVGEDDFKSEESKAALRRLKELADLDCSTLREGPADRARLEALREETRRDILRLAEDLRNSG 5 | >4GYT, id=2, T=0.1, seed=111, overall_confidence=0.4237, ligand_confidence=0.4409, seq_rec=0.4576 6 | SLKLPAYDDFAAAIAVLELPISASELAGIMLGFLTAGAVEAGRAFIRALANGRTDAATQAALAAMMEVFDILEKQLNDPSLELEILLPPADRSLEERCRALSEFAKGFVRGLTLAGVGEKDFKSEECREALEKMKKLAEMDCSTLREGPEDRARLEALTEELREDILRMRDDLANSG:SLKLPAYDDFAAAIAVLELPISASELAGIMLGFLTAGAVEAGRAFIRALANGRTDAATQAALAAMMEVFDILEKQLNDPSLELEILLPPADRSLEERCRALSEFAKGFVRGLTLAGVGEKDFKSEECREALEKMKKLAEMDCSTLREGPEDRARLEALTEELREDILRMRDDLANSG -------------------------------------------------------------------------------- /outputs/insertion_code/seqs/2GFB.fa: -------------------------------------------------------------------------------- 1 | >2GFB, T=0.1, seed=111, num_res=4, num_ligand_res=4, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | DVKLVESGGGLVQPGGSRKLSCAASGFTFSSFGMHWVRQAPEKGLEWVAYISSGSSTIYYADTVKGRFTISRDNPKNTLFLQMTSLRSEDTAMYYCARGDYYGSRGAYWGQGTLVTVSAKTTAPSVYPLAPVCGDTTGSSVTLGCLVKGYFPEPVTLTWNSGSLSSGVHTFPAVLQSDLYTLSSSVTVTSSTWPSQSITCNVAHPASSTKVDKKIEPRG 3 | >2GFB, id=1, T=0.1, seed=111, overall_confidence=0.5097, ligand_confidence=0.5097, 
seq_rec=0.5000 4 | DVKLVESGGGLVQPGGSRKLSCAASGFTFSSFGMHWVRQAPEKGLEWVAYISSGSSTIYYADTVKGRFTISRDNPKNTLFLQMSNLRSEDTAMYYCARGDYYGSRGAYWGQGTLVTVSAKTTAPSVYPLAPVCGDTTGSSVTLGCLVKGYFPEPVTLTWNSGSLSSGVHTFPAVLQSDLYTLSSSVTVTSSTWPSQSITCNVAHPASSTKVDKKIEPRG -------------------------------------------------------------------------------- /outputs/ligand_mpnn_cutoff_for_score/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=21, use_ligand_context=True, ligand_cutoff_distance=6.0, batch_size=1, number_of_batches=1, model_path=./model_params/ligandmpnn_v_32_010_25.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.4784, ligand_confidence=0.5507, seq_rec=0.4839 4 | MKSPISLHEFLLELLSDPKYADIIEWVSDNGEFRLVDPERVAKLWGEVKGKPKMNWKNLHRALRGYKKKKIIETVKGKPYQYRFVNYPELLHP -------------------------------------------------------------------------------- /outputs/ligandmpnn_default/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=41, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/ligandmpnn_v_32_005_25.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.4794, ligand_confidence=0.5111, seq_rec=0.5269 4 | GMSSISLHEFILELLSDPKYADMIKWTGDDGEFQFTKPEEVAKLWGETTGKPNMNYKTLLRAIRYYKKKGIISSVKGKKYTFKFVNYPEILNP -------------------------------------------------------------------------------- /outputs/ligandmpnn_no_context/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=41, use_ligand_context=False, ligand_cutoff_distance=8.0, batch_size=1, 
number_of_batches=1, model_path=./model_params/ligandmpnn_v_32_010_25.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.3959, ligand_confidence=0.3657, seq_rec=0.4086 4 | GGSPISLHEFLLRLLSDPRYAGIIEWVSDNGEFRLVDPEAVAKLWGEEIGEPDMNWTKLQELLDEMVEKKIISRVEGKPNQWRFVNYPELLHP -------------------------------------------------------------------------------- /outputs/ligandmpnn_use_side_chain_atoms/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=83, num_ligand_res=35, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/ligandmpnn_v_32_010_25.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.5022, ligand_confidence=0.5533, seq_rec=0.4578 4 | MDSAITLWQFLDRLLSDPAYAGLIEWVSDNGEFRLVDPEGVAKLWGEEKGKPKMNWKNMHRALRGYKKKKIIETVKGKPYQYRFVNYPEYLHP -------------------------------------------------------------------------------- /outputs/ligandmpnn_v_32_005_25/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=41, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/ligandmpnn_v_32_005_25.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.4794, ligand_confidence=0.5111, seq_rec=0.5269 4 | GMSSISLHEFILELLSDPKYADMIKWTGDDGEFQFTKPEEVAKLWGETTGKPNMNYKTLLRAIRYYKKKGIISSVKGKKYTFKFVNYPEILNP -------------------------------------------------------------------------------- /outputs/omit_AA_per_residue_multi/seqs/1BC8.fa: 
-------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.3736, ligand_confidence=0.3736, seq_rec=0.5054 4 | KKKSISLHEFLLKLLSDPAYKDIIEWTSDDGEFKLKKPEAVAKLWGEEKGEPDMNYKKMEKELKKYEKKKIIEKVKGKPNHYKFVNYPEILFP -------------------------------------------------------------------------------- /outputs/omit_AA_per_residue_multi/seqs/4GYT.fa: -------------------------------------------------------------------------------- 1 | >4GYT, T=0.1, seed=111, num_res=354, num_ligand_res=354, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | SLHLPKYDDFVQSISVLALTMSGSELHGIMCGYLCAGADSQGEAYIRALLNNKKDEQSRNALLSMFSVFSISQQQMNNFDFEFEMLLPDDDESLVTRAQAFSEWCEGFTQGLTIAGVGMEQFYEEESQDALQHLMEFAELDCESLEVGEEDERALMEVSEYTRMAVLRLHSDLVLHE:SLHLPKYDDFVQSISVLALTMSGSELHGIMCGYLCAGADSQGEAYIRALLNNKKDEQSRNALLSMFSVFSISQQQMNNFDFEFEMLLPDDDESLVTRAQAFSEWCEGFTQGLTIAGVGMEQFYEEESQDALQHLMEFAELDCESLEVGEEDERALMEVSEYTRMAVLRLHSDLVLHE 3 | >4GYT, id=1, T=0.1, seed=111, overall_confidence=0.4332, ligand_confidence=0.4332, seq_rec=0.3955 4 | KKTLPPYAEFAAAIAPLELPVSPSELAGLMLGFLAAGKTELGRAWIRALARGRTDAATQAALAALLEVFDILERQLNDPELELELLLPPADAPLATRAAALAAFARGFVRGLELAGVGPESFATEASRAALERARALAALDPSTLRAGPADEARLEADEAWLRESILAIRRDIAENG:SLTLPPYDEFAAAIAPLELPISPSALAGLMLGYLVAGKTELGRRWIRSLLRGRTDPASQAALAALLAVFDILEAQLTDPSLELELLLPPEDASLRERARALAEFAAGFALGLELAGVDRESFAREESRRDYERILELARLDVSTLKEGEEDRARLAALEAWLRDSIVRLARDLREHG -------------------------------------------------------------------------------- /outputs/parse_atoms_with_zero_occupancy/seqs/1BC8.fa: 
-------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=41, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/ligandmpnn_v_32_010_25.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.4784, ligand_confidence=0.5487, seq_rec=0.4839 4 | MKSPISLHEFLLELLSDPKYADIIEWVSDNGEFRLVDPERVAKLWGEVKGKPKMNWKNLHRALRGYKKKKIIETVKGKPYQYRFVNYPELLHP -------------------------------------------------------------------------------- /outputs/parse_these_chains_only/seqs/4GYT.fa: -------------------------------------------------------------------------------- 1 | >4GYT, T=0.1, seed=111, num_res=177, num_ligand_res=11, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/ligandmpnn_v_32_010_25.pt 2 | SLHLPKYDDFVQSISVLALTMSGSELHGIMCGYLCAGADSQGEAYIRALLNNKKDEQSRNALLSMFSVFSISQQQMNNFDFEFEMLLPDDDESLVTRAQAFSEWCEGFTQGLTIAGVGMEQFYEEESQDALQHLMEFAELDCESLEVGEEDERALMEVSEYTRMAVLRLHSDLVLHE 3 | >4GYT, id=1, T=0.1, seed=111, overall_confidence=0.4269, ligand_confidence=0.4174, seq_rec=0.4802 4 | SLSLPEYDDFEASIAVLELPISASELHGIMLGYLTAGAYEEGKAFIESLLKGRTDAASQAALTALLRVFEISKKQLSDPSLEFEILLPPESKSLKERCKAFSDFAKGFVQGLEEAGVGEDDFASEESREMLRKFKEYANMDCSKFKEGEEDKKKLKEKTEELREGILRLARDLRHHH -------------------------------------------------------------------------------- /outputs/pdb_path_multi/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, 
overall_confidence=0.3987, ligand_confidence=0.3987, seq_rec=0.5161 4 | GTSSISLHEFLLKLLSDPAYKDIIEWTSDDGEFKLKKPEAVAKLWGEEKGEPDMNYKKMEKELKKYEKKKIIEKVKGKPNHYKFVNYPEILFP -------------------------------------------------------------------------------- /outputs/pdb_path_multi/seqs/4GYT.fa: -------------------------------------------------------------------------------- 1 | >4GYT, T=0.1, seed=111, num_res=354, num_ligand_res=354, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | SLHLPKYDDFVQSISVLALTMSGSELHGIMCGYLCAGADSQGEAYIRALLNNKKDEQSRNALLSMFSVFSISQQQMNNFDFEFEMLLPDDDESLVTRAQAFSEWCEGFTQGLTIAGVGMEQFYEEESQDALQHLMEFAELDCESLEVGEEDERALMEVSEYTRMAVLRLHSDLVLHE:SLHLPKYDDFVQSISVLALTMSGSELHGIMCGYLCAGADSQGEAYIRALLNNKKDEQSRNALLSMFSVFSISQQQMNNFDFEFEMLLPDDDESLVTRAQAFSEWCEGFTQGLTIAGVGMEQFYEEESQDALQHLMEFAELDCESLEVGEEDERALMEVSEYTRMAVLRLHSDLVLHE 3 | >4GYT, id=1, T=0.1, seed=111, overall_confidence=0.4409, ligand_confidence=0.4409, seq_rec=0.4011 4 | SLTLPPYAEFAAAIAPLELPVSPSELAGLMLGFLAAGKTELGRAWIRALARGRTDAATQAALAALLEVFDILERQLNDPALELELLLPPADAPLATRAAALAAFARGFVRGLELAGVGPESFATEASRAALERARALAALDPSTLRAGPADEARLEADEAWLRESILAIRRDIAENG:SLTLPPYDEFAAAIAPLELPISPSALAGLMLGYLVAGKTELGRRWIRSLLRGRTDPASQAALAALLAVFDILEAQLTDPSLELELLLPPEDASLRERARALAEFAAGFALGLELAGVDRESFAREESRRDYERILELARLDVSTLKEGEEDRARLAALEAWLRDSIVRLARDLREHG -------------------------------------------------------------------------------- /outputs/per_residue_bias/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.3271, ligand_confidence=0.3271, seq_rec=0.4839 4 | 
PVPTPKPWEFLLSLLSDPAYKDIIEWTSDDGEFKLKKPEAVAKLWGEAKGEPDMNYKKFEKELKKLEKKKIIEKVKGKPNHYKFVNYPEILFP -------------------------------------------------------------------------------- /outputs/per_residue_label_membrane_mpnn_default/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/per_residue_label_membrane_mpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.3806, ligand_confidence=0.3806, seq_rec=0.5054 4 | MTSNISLVEFILKLLSNPKYKKYIEWVSDNGEFRLVKPEEVAKLWGKVKGKPNMNYEELEKELEKEVEKKLIEKVEGEKNVYRFVDYPGILNP -------------------------------------------------------------------------------- /outputs/per_residue_omit/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.3284, ligand_confidence=0.3284, seq_rec=0.4731 4 | YTYSYSYHEFLLKLLSDPAYKDIIEWTSDDGEFKLKKPEAVAKLWGEAKGEPDMNYKKFEKELKKLEKKKIIEKVKGKPNHYKFVNYPEILYP -------------------------------------------------------------------------------- /outputs/random_seed/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=96723, num_res=93, num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | 
MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=96723, overall_confidence=0.4171, ligand_confidence=0.4171, seq_rec=0.5376 4 | GTSKISLHEFLLELLSKPEYKDIIEWTSDDGTFKLKDPEKVAKLWGEKKGIPDMNYEKLYELLKEYEEKGIIEKVEGEPNTYKFVNYPEILYP -------------------------------------------------------------------------------- /outputs/redesign_residues/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=10, num_ligand_res=10, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.0236, ligand_confidence=0.0236, seq_rec=0.1000 4 | AAAAAAAAAALLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM -------------------------------------------------------------------------------- /outputs/redesigned_residues_multi/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=6, num_ligand_res=6, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.3353, ligand_confidence=0.3353, seq_rec=0.5000 4 | GTSSITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM -------------------------------------------------------------------------------- /outputs/redesigned_residues_multi/seqs/4GYT.fa: -------------------------------------------------------------------------------- 1 | >4GYT, T=0.1, seed=111, num_res=7, num_ligand_res=7, use_ligand_context=True, 
ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | SLHLPKYDDFVQSISVLALTMSGSELHGIMCGYLCAGADSQGEAYIRALLNNKKDEQSRNALLSMFSVFSISQQQMNNFDFEFEMLLPDDDESLVTRAQAFSEWCEGFTQGLTIAGVGMEQFYEEESQDALQHLMEFAELDCESLEVGEEDERALMEVSEYTRMAVLRLHSDLVLHE:SLHLPKYDDFVQSISVLALTMSGSELHGIMCGYLCAGADSQGEAYIRALLNNKKDEQSRNALLSMFSVFSISQQQMNNFDFEFEMLLPDDDESLVTRAQAFSEWCEGFTQGLTIAGVGMEQFYEEESQDALQHLMEFAELDCESLEVGEEDERALMEVSEYTRMAVLRLHSDLVLHE 3 | >4GYT, id=1, T=0.1, seed=111, overall_confidence=0.4333, ligand_confidence=0.4333, seq_rec=0.5714 4 | GLSLPPYDDFVQSISVLALTMSGSELHGIMCGYLCAGADSQGEAYIRALLNNKKDEQSRNALLSMFSVFSISQQQMNNFDFEFEMLLPDDDESLVTRAQAFSEWCEGFTQGLTIAGVGMEQFYEEESQDALQHLMEFAELDCESLEVGEEDERALMEVSEYTRMAVLRLHSDLVLHE:SLHLPKYDDFVQSISVLALTMSGSELHGIMCGYLCAGADSQGEAYIRALLNNKKDEQSRNALLSMFSVFSISQQQMNNFDFEFEMLLPDDDESLVTRAQAFSEWCEGFTQGLTIAGVGMEQFYEEESQDALQHLMEFAELDCESLEVGEEDERALMEVSEYTRMAVLRLHSDLVLHE -------------------------------------------------------------------------------- /outputs/save_stats/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.3987, ligand_confidence=0.3987, seq_rec=0.5161 4 | GTSSISLHEFLLKLLSDPAYKDIIEWTSDDGEFKLKKPEAVAKLWGEEKGEPDMNYKKMEKELKKYEKKKIIEKVKGKPNHYKFVNYPEILFP -------------------------------------------------------------------------------- /outputs/save_stats/stats/1BC8.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dauparas/LigandMPNN/26ec57ac976ade5379920dbd43c7f97a91cf82de/outputs/save_stats/stats/1BC8.pt 
-------------------------------------------------------------------------------- /outputs/sc_default/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=41, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/ligandmpnn_v_32_010_25.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.4764, ligand_confidence=0.5422, seq_rec=0.4731 4 | SRSPISLHEFIDELLSDPKYAHIIRWTSDDGRFRLVKPEEVAKLWGEEKGKPKMNWKNMHKALRGYKKKKIIETVKGKPYEYKFVNYPEHHHH -------------------------------------------------------------------------------- /outputs/sc_default_fast/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=41, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/ligandmpnn_v_32_010_25.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.4764, ligand_confidence=0.5422, seq_rec=0.4731 4 | SRSPISLHEFIDELLSDPKYAHIIRWTSDDGRFRLVKPEEVAKLWGEEKGKPKMNWKNMHKALRGYKKKKIIETVKGKPYEYKFVNYPEHHHH -------------------------------------------------------------------------------- /outputs/sc_fixed_residues/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=83, num_ligand_res=36, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/ligandmpnn_v_32_010_25.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.4659, ligand_confidence=0.5278, seq_rec=0.5181 4 | 
PRSPITLWQFLLQLLSDPAYAHIIRWTSDDGRFQLVQPEEVARLWGEEKGKPKMNWKNMHRALRGYKKKGIIETVKGKPYQYRFVNYPEHLHH -------------------------------------------------------------------------------- /outputs/sc_fixed_residues_full_repack/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=83, num_ligand_res=36, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/ligandmpnn_v_32_010_25.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.4659, ligand_confidence=0.5278, seq_rec=0.5181 4 | PRSPITLWQFLLQLLSDPAYAHIIRWTSDDGRFQLVQPEEVARLWGEEKGKPKMNWKNMHRALRGYKKKGIIETVKGKPYQYRFVNYPEHLHH -------------------------------------------------------------------------------- /outputs/sc_no_context/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=41, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/ligandmpnn_v_32_010_25.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.4764, ligand_confidence=0.5422, seq_rec=0.4731 4 | SRSPISLHEFIDELLSDPKYAHIIRWTSDDGRFRLVKPEEVAKLWGEEKGKPKMNWKNMHKALRGYKKKKIIETVKGKPYEYKFVNYPEHHHH -------------------------------------------------------------------------------- /outputs/single_aa_score_w_seq/1BC8_1.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dauparas/LigandMPNN/26ec57ac976ade5379920dbd43c7f97a91cf82de/outputs/single_aa_score_w_seq/1BC8_1.pt -------------------------------------------------------------------------------- /outputs/single_aa_score_wo_seq/1BC8_1.pt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dauparas/LigandMPNN/26ec57ac976ade5379920dbd43c7f97a91cf82de/outputs/single_aa_score_wo_seq/1BC8_1.pt -------------------------------------------------------------------------------- /outputs/soluble_mpnn_default/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/solublempnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.4129, ligand_confidence=0.4129, seq_rec=0.4946 4 | SMSKISLPEFLLSLLSDPKYKDKIEWTGDDGTFRLVDPEAVAKLWGEVKGEPDMNYEKLEEELKKYEEKGIIEKVEGKPNTYRFVNYPEILYP -------------------------------------------------------------------------------- /outputs/symmetry/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.3738, ligand_confidence=0.3738, seq_rec=0.5054 4 | SSSTTLLHEFLLKLLSDPAYKDIIEWTSDDGEFKLKDPEAVAKLWGEEKGEPDMNYEKMEKLLKKYEKKGIIEKVEGKPNHYKFVNYPEILFP -------------------------------------------------------------------------------- /outputs/temperature/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.05, seed=111, num_res=93, num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, 
model_path=./model_params/proteinmpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.05, seed=111, overall_confidence=0.4012, ligand_confidence=0.4012, seq_rec=0.5054 4 | GTSSISLHEFLLKLLSKPEYKDIIEWTSDNGEFKLKKPEAVAKLWGEEKGEPDMNYKKMYKELKKYEKKKIIEEVKGKPNHYKFVNYPEILYP -------------------------------------------------------------------------------- /outputs/verbose/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.3987, ligand_confidence=0.3987, seq_rec=0.5161 4 | GTSSISLHEFLLKLLSDPAYKDIIEWTSDDGEFKLKKPEAVAKLWGEEKGEPDMNYKKMEKELKKYEKKKIIEKVKGKPNHYKFVNYPEILFP -------------------------------------------------------------------------------- /outputs/zero_indexed/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=2, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=0, T=0.1, seed=111, overall_confidence=0.3987, ligand_confidence=0.3987, seq_rec=0.5161 4 | GTSSISLHEFLLKLLSDPAYKDIIEWTSDDGEFKLKKPEAVAKLWGEEKGEPDMNYKKMEKELKKYEKKKIIEKVKGKPNHYKFVNYPEILFP 5 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.3987, ligand_confidence=0.3987, seq_rec=0.4839 6 | GMSSISLYEFLLELLSDPKYEDKIEWISDNGEFRLKDPEAVAKLWGKKKGDPNMNWEKFNKLLEKYEEKGIIEKVEGKKNTYKIVNYPEILNP 
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | biopython==1.79 2 | filelock==3.13.1 3 | fsspec==2024.3.1 4 | Jinja2==3.1.3 5 | MarkupSafe==2.1.5 6 | mpmath==1.3.0 7 | networkx==3.2.1 8 | numpy==1.23.5 9 | nvidia-cublas-cu12==12.1.3.1 10 | nvidia-cuda-cupti-cu12==12.1.105 11 | nvidia-cuda-nvrtc-cu12==12.1.105 12 | nvidia-cuda-runtime-cu12==12.1.105 13 | nvidia-cudnn-cu12==8.9.2.26 14 | nvidia-cufft-cu12==11.0.2.54 15 | nvidia-curand-cu12==10.3.2.106 16 | nvidia-cusolver-cu12==11.4.5.107 17 | nvidia-cusparse-cu12==12.1.0.106 18 | nvidia-nccl-cu12==2.19.3 19 | nvidia-nvjitlink-cu12==12.4.99 20 | nvidia-nvtx-cu12==12.1.105 21 | ProDy==2.4.1 22 | pyparsing==3.1.1 23 | scipy==1.12.0 24 | sympy==1.12 25 | torch==2.2.1 26 | triton==2.2.0 27 | typing_extensions==4.10.0 28 | ml-collections==0.1.1 29 | dm-tree==0.1.8 30 | -------------------------------------------------------------------------------- /run_examples.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #1 4 | python run.py \ 5 | --seed 111 \ 6 | --pdb_path "./inputs/1BC8.pdb" \ 7 | --out_folder "./outputs/default" 8 | #2 9 | python run.py \ 10 | --seed 111 \ 11 | --pdb_path "./inputs/1BC8.pdb" \ 12 | --temperature 0.05 \ 13 | --out_folder "./outputs/temperature" 14 | 15 | #3 16 | python run.py \ 17 | --pdb_path "./inputs/1BC8.pdb" \ 18 | --out_folder "./outputs/random_seed" 19 | 20 | #4 21 | python run.py \ 22 | --seed 111 \ 23 | --verbose 0 \ 24 | --pdb_path "./inputs/1BC8.pdb" \ 25 | --out_folder "./outputs/verbose" 26 | 27 | #5 28 | python run.py \ 29 | --seed 111 \ 30 | --pdb_path "./inputs/1BC8.pdb" \ 31 | --out_folder "./outputs/save_stats" \ 32 | --save_stats 1 33 | 34 | #6 35 | python run.py \ 36 | --seed 111 \ 37 | --pdb_path "./inputs/1BC8.pdb" \ 38 | --out_folder "./outputs/fix_residues" \ 39 | 
--fixed_residues "C1 C2 C3 C4 C5 C6 C7 C8 C9 C10" \ 40 | --bias_AA "A:10.0" 41 | 42 | #7 43 | python run.py \ 44 | --seed 111 \ 45 | --pdb_path "./inputs/1BC8.pdb" \ 46 | --out_folder "./outputs/redesign_residues" \ 47 | --redesigned_residues "C1 C2 C3 C4 C5 C6 C7 C8 C9 C10" \ 48 | --bias_AA "A:10.0" 49 | 50 | #8 51 | python run.py \ 52 | --seed 111 \ 53 | --pdb_path "./inputs/1BC8.pdb" \ 54 | --out_folder "./outputs/batch_size" \ 55 | --batch_size 3 \ 56 | --number_of_batches 5 57 | 58 | #9 59 | python run.py \ 60 | --seed 111 \ 61 | --pdb_path "./inputs/1BC8.pdb" \ 62 | --bias_AA "W:3.0,P:3.0,C:3.0,A:-3.0" \ 63 | --out_folder "./outputs/global_bias" 64 | 65 | #10 66 | python run.py \ 67 | --seed 111 \ 68 | --pdb_path "./inputs/1BC8.pdb" \ 69 | --bias_AA_per_residue "./inputs/bias_AA_per_residue.json" \ 70 | --out_folder "./outputs/per_residue_bias" 71 | 72 | #11 73 | python run.py \ 74 | --seed 111 \ 75 | --pdb_path "./inputs/1BC8.pdb" \ 76 | --omit_AA "CDFGHILMNPQRSTVWY" \ 77 | --out_folder "./outputs/global_omit" 78 | 79 | #12 80 | python run.py \ 81 | --seed 111 \ 82 | --pdb_path "./inputs/1BC8.pdb" \ 83 | --omit_AA_per_residue "./inputs/omit_AA_per_residue.json" \ 84 | --out_folder "./outputs/per_residue_omit" 85 | 86 | #13 87 | python run.py \ 88 | --seed 111 \ 89 | --pdb_path "./inputs/1BC8.pdb" \ 90 | --out_folder "./outputs/symmetry" \ 91 | --symmetry_residues "C1,C2,C3|C4,C5|C6,C7" \ 92 | --symmetry_weights "0.33,0.33,0.33|0.5,0.5|0.5,0.5" 93 | 94 | #14 95 | python run.py \ 96 | --model_type "ligand_mpnn" \ 97 | --seed 111 \ 98 | --pdb_path "./inputs/4GYT.pdb" \ 99 | --out_folder "./outputs/homooligomer" \ 100 | --homo_oligomer 1 \ 101 | --number_of_batches 2 102 | 103 | #15 104 | python run.py \ 105 | --seed 111 \ 106 | --pdb_path "./inputs/1BC8.pdb" \ 107 | --out_folder "./outputs/file_ending" \ 108 | --file_ending "_xyz" 109 | 110 | #16 111 | python run.py \ 112 | --seed 111 \ 113 | --pdb_path "./inputs/1BC8.pdb" \ 114 | --out_folder 
"./outputs/zero_indexed" \ 115 | --zero_indexed 1 \ 116 | --number_of_batches 2 117 | 118 | #17 119 | python run.py \ 120 | --model_type "ligand_mpnn" \ 121 | --seed 111 \ 122 | --pdb_path "./inputs/4GYT.pdb" \ 123 | --out_folder "./outputs/chains_to_design" \ 124 | --chains_to_design "A,B" 125 | 126 | #18 127 | python run.py \ 128 | --model_type "ligand_mpnn" \ 129 | --seed 111 \ 130 | --pdb_path "./inputs/4GYT.pdb" \ 131 | --out_folder "./outputs/parse_these_chains_only" \ 132 | --parse_these_chains_only "A,B" 133 | 134 | #19 135 | python run.py \ 136 | --model_type "ligand_mpnn" \ 137 | --seed 111 \ 138 | --pdb_path "./inputs/1BC8.pdb" \ 139 | --out_folder "./outputs/ligandmpnn_default" 140 | 141 | #20 142 | python run.py \ 143 | --checkpoint_ligand_mpnn "./model_params/ligandmpnn_v_32_005_25.pt" \ 144 | --model_type "ligand_mpnn" \ 145 | --seed 111 \ 146 | --pdb_path "./inputs/1BC8.pdb" \ 147 | --out_folder "./outputs/ligandmpnn_v_32_005_25" 148 | 149 | #21 150 | python run.py \ 151 | --model_type "ligand_mpnn" \ 152 | --seed 111 \ 153 | --pdb_path "./inputs/1BC8.pdb" \ 154 | --out_folder "./outputs/ligandmpnn_no_context" \ 155 | --ligand_mpnn_use_atom_context 0 156 | 157 | #22 158 | python run.py \ 159 | --model_type "ligand_mpnn" \ 160 | --seed 111 \ 161 | --pdb_path "./inputs/1BC8.pdb" \ 162 | --out_folder "./outputs/ligandmpnn_use_side_chain_atoms" \ 163 | --ligand_mpnn_use_side_chain_context 1 \ 164 | --fixed_residues "C1 C2 C3 C4 C5 C6 C7 C8 C9 C10" 165 | 166 | #23 167 | python run.py \ 168 | --model_type "soluble_mpnn" \ 169 | --seed 111 \ 170 | --pdb_path "./inputs/1BC8.pdb" \ 171 | --out_folder "./outputs/soluble_mpnn_default" 172 | 173 | #24 174 | python run.py \ 175 | --model_type "global_label_membrane_mpnn" \ 176 | --seed 111 \ 177 | --pdb_path "./inputs/1BC8.pdb" \ 178 | --out_folder "./outputs/global_label_membrane_mpnn_0" \ 179 | --global_transmembrane_label 0 180 | 181 | #25 182 | python run.py \ 183 | --model_type 
"per_residue_label_membrane_mpnn" \ 184 | --seed 111 \ 185 | --pdb_path "./inputs/1BC8.pdb" \ 186 | --out_folder "./outputs/per_residue_label_membrane_mpnn_default" \ 187 | --transmembrane_buried "C1 C2 C3 C11" \ 188 | --transmembrane_interface "C4 C5 C6 C22" 189 | 190 | #26 191 | python run.py \ 192 | --pdb_path "./inputs/1BC8.pdb" \ 193 | --out_folder "./outputs/fasta_seq_separation" \ 194 | --fasta_seq_separation ":" 195 | 196 | #27 197 | python run.py \ 198 | --pdb_path_multi "./inputs/pdb_ids.json" \ 199 | --out_folder "./outputs/pdb_path_multi" \ 200 | --seed 111 201 | 202 | #28 203 | python run.py \ 204 | --pdb_path_multi "./inputs/pdb_ids.json" \ 205 | --fixed_residues_multi "./inputs/fix_residues_multi.json" \ 206 | --out_folder "./outputs/fixed_residues_multi" \ 207 | --seed 111 208 | 209 | #29 210 | python run.py \ 211 | --pdb_path_multi "./inputs/pdb_ids.json" \ 212 | --redesigned_residues_multi "./inputs/redesigned_residues_multi.json" \ 213 | --out_folder "./outputs/redesigned_residues_multi" \ 214 | --seed 111 215 | 216 | #30 217 | python run.py \ 218 | --pdb_path_multi "./inputs/pdb_ids.json" \ 219 | --omit_AA_per_residue_multi "./inputs/omit_AA_per_residue_multi.json" \ 220 | --out_folder "./outputs/omit_AA_per_residue_multi" \ 221 | --seed 111 222 | 223 | #31 224 | python run.py \ 225 | --pdb_path_multi "./inputs/pdb_ids.json" \ 226 | --bias_AA_per_residue_multi "./inputs/bias_AA_per_residue_multi.json" \ 227 | --out_folder "./outputs/bias_AA_per_residue_multi" \ 228 | --seed 111 229 | 230 | #32 231 | python run.py \ 232 | --model_type "ligand_mpnn" \ 233 | --seed 111 \ 234 | --pdb_path "./inputs/1BC8.pdb" \ 235 | --ligand_mpnn_cutoff_for_score "6.0" \ 236 | --out_folder "./outputs/ligand_mpnn_cutoff_for_score" 237 | 238 | #33 239 | python run.py \ 240 | --seed 111 \ 241 | --pdb_path "./inputs/2GFB.pdb" \ 242 | --out_folder "./outputs/insertion_code" \ 243 | --redesigned_residues "B82 B82A B82B B82C" \ 244 | --parse_these_chains_only "B" 245 | 
-------------------------------------------------------------------------------- /sc_examples.sh: -------------------------------------------------------------------------------- 1 | #1 design a new sequence and pack side chains (return 1 side chain packing sample - fast) 2 | python run.py \ 3 | --model_type "ligand_mpnn" \ 4 | --seed 111 \ 5 | --pdb_path "./inputs/1BC8.pdb" \ 6 | --out_folder "./outputs/sc_default_fast" \ 7 | --pack_side_chains 1 \ 8 | --number_of_packs_per_design 0 \ 9 | --pack_with_ligand_context 1 10 | 11 | #2 design a new sequence and pack side chains (return 4 side chain packing samples) 12 | python run.py \ 13 | --model_type "ligand_mpnn" \ 14 | --seed 111 \ 15 | --pdb_path "./inputs/1BC8.pdb" \ 16 | --out_folder "./outputs/sc_default" \ 17 | --pack_side_chains 1 \ 18 | --number_of_packs_per_design 4 \ 19 | --pack_with_ligand_context 1 20 | 21 | 22 | #3 fix specific residues for design and packing 23 | python run.py \ 24 | --model_type "ligand_mpnn" \ 25 | --seed 111 \ 26 | --pdb_path "./inputs/1BC8.pdb" \ 27 | --out_folder "./outputs/sc_fixed_residues" \ 28 | --pack_side_chains 1 \ 29 | --number_of_packs_per_design 4 \ 30 | --pack_with_ligand_context 1 \ 31 | --fixed_residues "C6 C7 C8 C9 C10 C11 C12 C13 C14 C15" \ 32 | --repack_everything 0 33 | 34 | #4 fix specific residues for sequence design but repack everything 35 | python run.py \ 36 | --model_type "ligand_mpnn" \ 37 | --seed 111 \ 38 | --pdb_path "./inputs/1BC8.pdb" \ 39 | --out_folder "./outputs/sc_fixed_residues_full_repack" \ 40 | --pack_side_chains 1 \ 41 | --number_of_packs_per_design 4 \ 42 | --pack_with_ligand_context 1 \ 43 | --fixed_residues "C6 C7 C8 C9 C10 C11 C12 C13 C14 C15" \ 44 | --repack_everything 1 45 | 46 | 47 | #5 design a new sequence using LigandMPNN but pack side chains without considering ligand/DNA etc atoms 48 | python run.py \ 49 | --model_type "ligand_mpnn" \ 50 | --seed 111 \ 51 | --pdb_path "./inputs/1BC8.pdb" \ 52 | --out_folder 
"./outputs/sc_no_context" \ 53 | --pack_side_chains 1 \ 54 | --number_of_packs_per_design 4 \ 55 | --pack_with_ligand_context 0 56 | -------------------------------------------------------------------------------- /score.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os.path 4 | import random 5 | import sys 6 | 7 | import numpy as np 8 | import torch 9 | 10 | from data_utils import ( 11 | element_dict_rev, 12 | alphabet, 13 | restype_int_to_str, 14 | featurize, 15 | parse_PDB, 16 | ) 17 | from model_utils import ProteinMPNN 18 | 19 | 20 | def main(args) -> None: 21 | """ 22 | Inference function 23 | """ 24 | if args.seed: 25 | seed = args.seed 26 | else: 27 | seed = int(np.random.randint(0, high=99999, size=1, dtype=int)[0]) 28 | torch.manual_seed(seed) 29 | random.seed(seed) 30 | np.random.seed(seed) 31 | device = torch.device("cuda" if (torch.cuda.is_available()) else "cpu") 32 | folder_for_outputs = args.out_folder 33 | base_folder = folder_for_outputs 34 | if base_folder[-1] != "/": 35 | base_folder = base_folder + "/" 36 | if not os.path.exists(base_folder): 37 | os.makedirs(base_folder, exist_ok=True) 38 | if args.model_type == "protein_mpnn": 39 | checkpoint_path = args.checkpoint_protein_mpnn 40 | elif args.model_type == "ligand_mpnn": 41 | checkpoint_path = args.checkpoint_ligand_mpnn 42 | elif args.model_type == "per_residue_label_membrane_mpnn": 43 | checkpoint_path = args.checkpoint_per_residue_label_membrane_mpnn 44 | elif args.model_type == "global_label_membrane_mpnn": 45 | checkpoint_path = args.checkpoint_global_label_membrane_mpnn 46 | elif args.model_type == "soluble_mpnn": 47 | checkpoint_path = args.checkpoint_soluble_mpnn 48 | else: 49 | print("Choose one of the available models") 50 | sys.exit() 51 | checkpoint = torch.load(checkpoint_path, map_location=device) 52 | if args.model_type == "ligand_mpnn": 53 | atom_context_num = checkpoint["atom_context_num"] 54 
| ligand_mpnn_use_side_chain_context = args.ligand_mpnn_use_side_chain_context 55 | k_neighbors = checkpoint["num_edges"] 56 | else: 57 | atom_context_num = 1 58 | ligand_mpnn_use_side_chain_context = 0 59 | k_neighbors = checkpoint["num_edges"] 60 | 61 | model = ProteinMPNN( 62 | node_features=128, 63 | edge_features=128, 64 | hidden_dim=128, 65 | num_encoder_layers=3, 66 | num_decoder_layers=3, 67 | k_neighbors=k_neighbors, 68 | device=device, 69 | atom_context_num=atom_context_num, 70 | model_type=args.model_type, 71 | ligand_mpnn_use_side_chain_context=ligand_mpnn_use_side_chain_context, 72 | ) 73 | 74 | model.load_state_dict(checkpoint["model_state_dict"]) 75 | model.to(device) 76 | model.eval() 77 | 78 | if args.pdb_path_multi: 79 | with open(args.pdb_path_multi, "r") as fh: 80 | pdb_paths = list(json.load(fh)) 81 | else: 82 | pdb_paths = [args.pdb_path] 83 | 84 | if args.fixed_residues_multi: 85 | with open(args.fixed_residues_multi, "r") as fh: 86 | fixed_residues_multi = json.load(fh) 87 | else: 88 | fixed_residues = [item for item in args.fixed_residues.split()] 89 | fixed_residues_multi = {} 90 | for pdb in pdb_paths: 91 | fixed_residues_multi[pdb] = fixed_residues 92 | 93 | if args.redesigned_residues_multi: 94 | with open(args.redesigned_residues_multi, "r") as fh: 95 | redesigned_residues_multi = json.load(fh) 96 | else: 97 | redesigned_residues = [item for item in args.redesigned_residues.split()] 98 | redesigned_residues_multi = {} 99 | for pdb in pdb_paths: 100 | redesigned_residues_multi[pdb] = redesigned_residues 101 | 102 | # loop over PDB paths 103 | for pdb in pdb_paths: 104 | if args.verbose: 105 | print("Designing protein from this path:", pdb) 106 | fixed_residues = fixed_residues_multi[pdb] 107 | redesigned_residues = redesigned_residues_multi[pdb] 108 | protein_dict, backbone, other_atoms, icodes, _ = parse_PDB( 109 | pdb, 110 | device=device, 111 | chains=args.parse_these_chains_only, 112 | 
parse_all_atoms=args.ligand_mpnn_use_side_chain_context, 113 | parse_atoms_with_zero_occupancy=args.parse_atoms_with_zero_occupancy 114 | ) 115 | # make chain_letter + residue_idx + insertion_code mapping to integers 116 | R_idx_list = list(protein_dict["R_idx"].cpu().numpy()) # residue indices 117 | chain_letters_list = list(protein_dict["chain_letters"]) # chain letters 118 | encoded_residues = [] 119 | for i, R_idx_item in enumerate(R_idx_list): 120 | tmp = str(chain_letters_list[i]) + str(R_idx_item) + icodes[i] 121 | encoded_residues.append(tmp) 122 | encoded_residue_dict = dict(zip(encoded_residues, range(len(encoded_residues)))) 123 | encoded_residue_dict_rev = dict( 124 | zip(list(range(len(encoded_residues))), encoded_residues) 125 | ) 126 | 127 | fixed_positions = torch.tensor( 128 | [int(item not in fixed_residues) for item in encoded_residues], 129 | device=device, 130 | ) 131 | redesigned_positions = torch.tensor( 132 | [int(item not in redesigned_residues) for item in encoded_residues], 133 | device=device, 134 | ) 135 | 136 | # specify which residues are buried for checkpoint_per_residue_label_membrane_mpnn model 137 | if args.transmembrane_buried: 138 | buried_residues = [item for item in args.transmembrane_buried.split()] 139 | buried_positions = torch.tensor( 140 | [int(item in buried_residues) for item in encoded_residues], 141 | device=device, 142 | ) 143 | else: 144 | buried_positions = torch.zeros_like(fixed_positions) 145 | 146 | if args.transmembrane_interface: 147 | interface_residues = [item for item in args.transmembrane_interface.split()] 148 | interface_positions = torch.tensor( 149 | [int(item in interface_residues) for item in encoded_residues], 150 | device=device, 151 | ) 152 | else: 153 | interface_positions = torch.zeros_like(fixed_positions) 154 | protein_dict["membrane_per_residue_labels"] = 2 * buried_positions * ( 155 | 1 - interface_positions 156 | ) + 1 * interface_positions * (1 - buried_positions) 157 | 158 | if 
args.model_type == "global_label_membrane_mpnn": 159 | protein_dict["membrane_per_residue_labels"] = ( 160 | args.global_transmembrane_label + 0 * fixed_positions 161 | ) 162 | if type(args.chains_to_design) == str: 163 | chains_to_design_list = args.chains_to_design.split(",") 164 | else: 165 | chains_to_design_list = protein_dict["chain_letters"] 166 | chain_mask = torch.tensor( 167 | np.array( 168 | [ 169 | item in chains_to_design_list 170 | for item in protein_dict["chain_letters"] 171 | ], 172 | dtype=np.int32, 173 | ), 174 | device=device, 175 | ) 176 | 177 | # create chain_mask to notify which residues are fixed (0) and which need to be designed (1) 178 | if redesigned_residues: 179 | protein_dict["chain_mask"] = chain_mask * (1 - redesigned_positions) 180 | elif fixed_residues: 181 | protein_dict["chain_mask"] = chain_mask * fixed_positions 182 | else: 183 | protein_dict["chain_mask"] = chain_mask 184 | 185 | if args.verbose: 186 | PDB_residues_to_be_redesigned = [ 187 | encoded_residue_dict_rev[item] 188 | for item in range(protein_dict["chain_mask"].shape[0]) 189 | if protein_dict["chain_mask"][item] == 1 190 | ] 191 | PDB_residues_to_be_fixed = [ 192 | encoded_residue_dict_rev[item] 193 | for item in range(protein_dict["chain_mask"].shape[0]) 194 | if protein_dict["chain_mask"][item] == 0 195 | ] 196 | print("These residues will be redesigned: ", PDB_residues_to_be_redesigned) 197 | print("These residues will be fixed: ", PDB_residues_to_be_fixed) 198 | 199 | # specify which residues are linked 200 | if args.symmetry_residues: 201 | symmetry_residues_list_of_lists = [ 202 | x.split(",") for x in args.symmetry_residues.split("|") 203 | ] 204 | remapped_symmetry_residues = [] 205 | for t_list in symmetry_residues_list_of_lists: 206 | tmp_list = [] 207 | for t in t_list: 208 | tmp_list.append(encoded_residue_dict[t]) 209 | remapped_symmetry_residues.append(tmp_list) 210 | else: 211 | remapped_symmetry_residues = [[]] 212 | 213 | if args.homo_oligomer: 214 
| if args.verbose: 215 | print("Designing HOMO-OLIGOMER") 216 | chain_letters_set = list(set(chain_letters_list)) 217 | reference_chain = chain_letters_set[0] 218 | lc = len(reference_chain) 219 | residue_indices = [ 220 | item[lc:] for item in encoded_residues if item[:lc] == reference_chain 221 | ] 222 | remapped_symmetry_residues = [] 223 | for res in residue_indices: 224 | tmp_list = [] 225 | tmp_w_list = [] 226 | for chain in chain_letters_set: 227 | name = chain + res 228 | tmp_list.append(encoded_residue_dict[name]) 229 | tmp_w_list.append(1 / len(chain_letters_set)) 230 | remapped_symmetry_residues.append(tmp_list) 231 | 232 | # set other atom bfactors to 0.0 233 | if other_atoms: 234 | other_bfactors = other_atoms.getBetas() 235 | other_atoms.setBetas(other_bfactors * 0.0) 236 | 237 | # adjust input PDB name by dropping .pdb if it does exist 238 | name = pdb[pdb.rfind("/") + 1 :] 239 | if name[-4:] == ".pdb": 240 | name = name[:-4] 241 | 242 | with torch.no_grad(): 243 | # run featurize to remap R_idx and add batch dimension 244 | if args.verbose: 245 | if "Y" in list(protein_dict): 246 | atom_coords = protein_dict["Y"].cpu().numpy() 247 | atom_types = list(protein_dict["Y_t"].cpu().numpy()) 248 | atom_mask = list(protein_dict["Y_m"].cpu().numpy()) 249 | number_of_atoms_parsed = np.sum(atom_mask) 250 | else: 251 | print("No ligand atoms parsed") 252 | number_of_atoms_parsed = 0 253 | atom_types = "" 254 | atom_coords = [] 255 | if number_of_atoms_parsed == 0: 256 | print("No ligand atoms parsed") 257 | elif args.model_type == "ligand_mpnn": 258 | print( 259 | f"The number of ligand atoms parsed is equal to: {number_of_atoms_parsed}" 260 | ) 261 | for i, atom_type in enumerate(atom_types): 262 | print( 263 | f"Type: {element_dict_rev[atom_type]}, Coords {atom_coords[i]}, Mask {atom_mask[i]}" 264 | ) 265 | feature_dict = featurize( 266 | protein_dict, 267 | cutoff_for_score=args.ligand_mpnn_cutoff_for_score, 268 | 
use_atom_context=args.ligand_mpnn_use_atom_context, 269 | number_of_ligand_atoms=atom_context_num, 270 | model_type=args.model_type, 271 | ) 272 | feature_dict["batch_size"] = args.batch_size 273 | B, L, _, _ = feature_dict["X"].shape # batch size should be 1 for now. 274 | # add additional keys to the feature dictionary 275 | feature_dict["symmetry_residues"] = remapped_symmetry_residues 276 | 277 | logits_list = [] 278 | probs_list = [] 279 | log_probs_list = [] 280 | decoding_order_list = [] 281 | for _ in range(args.number_of_batches): 282 | feature_dict["randn"] = torch.randn( 283 | [feature_dict["batch_size"], feature_dict["mask"].shape[1]], 284 | device=device, 285 | ) 286 | if args.autoregressive_score: 287 | score_dict = model.score(feature_dict, use_sequence=args.use_sequence) 288 | elif args.single_aa_score: 289 | score_dict = model.single_aa_score(feature_dict, use_sequence=args.use_sequence) 290 | else: 291 | print("Set either autoregressive_score or single_aa_score to True") 292 | sys.exit() 293 | logits_list.append(score_dict["logits"]) 294 | log_probs_list.append(score_dict["log_probs"]) 295 | probs_list.append(torch.exp(score_dict["log_probs"])) 296 | decoding_order_list.append(score_dict["decoding_order"]) 297 | log_probs_stack = torch.cat(log_probs_list, 0) 298 | logits_stack = torch.cat(logits_list, 0) 299 | probs_stack = torch.cat(probs_list, 0) 300 | decoding_order_stack = torch.cat(decoding_order_list, 0) 301 | 302 | output_stats_path = base_folder + name + args.file_ending + ".pt" 303 | out_dict = {} 304 | out_dict["logits"] = logits_stack.cpu().numpy() 305 | out_dict["probs"] = probs_stack.cpu().numpy() 306 | out_dict["log_probs"] = log_probs_stack.cpu().numpy() 307 | out_dict["decoding_order"] = decoding_order_stack.cpu().numpy() 308 | out_dict["native_sequence"] = feature_dict["S"][0].cpu().numpy() 309 | out_dict["mask"] = feature_dict["mask"][0].cpu().numpy() 310 | out_dict["chain_mask"] = feature_dict["chain_mask"][0].cpu().numpy() 
#this affects decoding order 311 | out_dict["seed"] = seed 312 | out_dict["alphabet"] = alphabet 313 | out_dict["residue_names"] = encoded_residue_dict_rev 314 | 315 | mean_probs = np.mean(out_dict["probs"], 0) 316 | std_probs = np.std(out_dict["probs"], 0) 317 | sequence = [restype_int_to_str[AA] for AA in out_dict["native_sequence"]] 318 | mean_dict = {} 319 | std_dict = {} 320 | for residue in range(L): 321 | mean_dict_ = dict(zip(alphabet, mean_probs[residue])) 322 | mean_dict[encoded_residue_dict_rev[residue]] = mean_dict_ 323 | std_dict_ = dict(zip(alphabet, std_probs[residue])) 324 | std_dict[encoded_residue_dict_rev[residue]] = std_dict_ 325 | 326 | out_dict["sequence"] = sequence 327 | out_dict["mean_of_probs"] = mean_dict 328 | out_dict["std_of_probs"] = std_dict 329 | torch.save(out_dict, output_stats_path) 330 | 331 | 332 | 333 | if __name__ == "__main__": 334 | argparser = argparse.ArgumentParser( 335 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 336 | ) 337 | 338 | argparser.add_argument( 339 | "--model_type", 340 | type=str, 341 | default="protein_mpnn", 342 | help="Choose your model: protein_mpnn, ligand_mpnn, per_residue_label_membrane_mpnn, global_label_membrane_mpnn, soluble_mpnn", 343 | ) 344 | # protein_mpnn - original ProteinMPNN trained on the whole PDB exluding non-protein atoms 345 | # ligand_mpnn - atomic context aware model trained with small molecules, nucleotides, metals etc on the whole PDB 346 | # per_residue_label_membrane_mpnn - ProteinMPNN model trained with addition label per residue specifying if that residue is buried or exposed 347 | # global_label_membrane_mpnn - ProteinMPNN model trained with global label per PDB id to specify if protein is transmembrane 348 | # soluble_mpnn - ProteinMPNN trained only on soluble PDB ids 349 | argparser.add_argument( 350 | "--checkpoint_protein_mpnn", 351 | type=str, 352 | default="./model_params/proteinmpnn_v_48_020.pt", 353 | help="Path to model weights.", 354 | ) 355 | 
# --- model checkpoints -------------------------------------------------------
argparser.add_argument(
    "--checkpoint_ligand_mpnn",
    type=str,
    default="./model_params/ligandmpnn_v_32_010_25.pt",
    help="Path to model weights.",
)
argparser.add_argument(
    "--checkpoint_per_residue_label_membrane_mpnn",
    type=str,
    default="./model_params/per_residue_label_membrane_mpnn_v_48_020.pt",
    help="Path to model weights.",
)
argparser.add_argument(
    "--checkpoint_global_label_membrane_mpnn",
    type=str,
    default="./model_params/global_label_membrane_mpnn_v_48_020.pt",
    help="Path to model weights.",
)
argparser.add_argument(
    "--checkpoint_soluble_mpnn",
    type=str,
    default="./model_params/solublempnn_v_48_020.pt",
    help="Path to model weights.",
)

argparser.add_argument("--verbose", type=int, default=1, help="Print stuff")

# --- input structures --------------------------------------------------------
argparser.add_argument(
    "--pdb_path", type=str, default="", help="Path to the input PDB."
)
argparser.add_argument(
    "--pdb_path_multi",
    type=str,
    default="",
    help="Path to json listing PDB paths. {'/path/to/pdb': ''} - only keys will be used.",
)

# --- residue selection (single-PDB and per-PDB json variants) ----------------
argparser.add_argument(
    "--fixed_residues",
    type=str,
    default="",
    help="Provide fixed residues, A12 A13 A14 B2 B25",
)
argparser.add_argument(
    "--fixed_residues_multi",
    type=str,
    default="",
    help="Path to json mapping of fixed residues for each pdb i.e., {'/path/to/pdb': 'A12 A13 A14 B2 B25'}",
)

argparser.add_argument(
    "--redesigned_residues",
    type=str,
    default="",
    help="Provide to be redesigned residues, everything else will be fixed, A12 A13 A14 B2 B25",
)
argparser.add_argument(
    "--redesigned_residues_multi",
    type=str,
    default="",
    help="Path to json mapping of redesigned residues for each pdb i.e., {'/path/to/pdb': 'A12 A13 A14 B2 B25'}",
)

# --- symmetry / oligomer options ---------------------------------------------
argparser.add_argument(
    "--symmetry_residues",
    type=str,
    default="",
    help="Add list of lists for which residues need to be symmetric, e.g. 'A12,A13,A14|C2,C3|A5,B6'",
)

argparser.add_argument(
    "--homo_oligomer",
    type=int,
    default=0,
    help="Setting this to 1 will automatically set --symmetry_residues and --symmetry_weights to do homooligomer design with equal weighting.",
)

# --- output options ----------------------------------------------------------
argparser.add_argument(
    "--out_folder",
    type=str,
    help="Path to a folder to output scores, e.g. /home/out/",
)
argparser.add_argument(
    "--file_ending", type=str, default="", help="adding_string_to_the_end"
)
argparser.add_argument(
    "--zero_indexed",
    # NOTE(fix): was type=str with an integer default; this is a 0/1 flag like
    # --verbose/--homo_oligomer, and a string "0" would be truthy downstream.
    type=int,
    default=0,
    help="1 - to start output PDB numbering with 0",
)

# --- sampling / batching -----------------------------------------------------
argparser.add_argument(
    "--seed",
    type=int,
    default=0,
    help="Set seed for torch, numpy, and python random.",
)
argparser.add_argument(
    "--batch_size",
    type=int,
    default=1,
    help="Number of sequence to generate per one pass.",
)
argparser.add_argument(
    "--number_of_batches",
    type=int,
    default=1,
    help="Number of times to design sequence using a chosen batch size.",
)

# --- ligand_mpnn-specific options --------------------------------------------
argparser.add_argument(
    "--ligand_mpnn_use_atom_context",
    type=int,
    default=1,
    help="1 - use atom context, 0 - do not use atom context.",
)

argparser.add_argument(
    "--ligand_mpnn_use_side_chain_context",
    type=int,
    default=0,
    help="Flag to use side chain atoms as ligand context for the fixed residues",
)

argparser.add_argument(
    "--ligand_mpnn_cutoff_for_score",
    type=float,
    default=8.0,
    help="Cutoff in angstroms between protein and context atoms to select residues for reporting score.",
)

# --- chain selection ---------------------------------------------------------
argparser.add_argument(
    "--chains_to_design",
    type=str,
    default=None,
    help="Specify which chains to redesign, all others will be kept fixed.",
)

argparser.add_argument(
    "--parse_these_chains_only",
    type=str,
    default="",
    help="Provide chains letters for parsing backbones, 'ABCF'",
)

# --- membrane-model labels ---------------------------------------------------
argparser.add_argument(
    "--transmembrane_buried",
    type=str,
    default="",
    help="Provide buried residues when using checkpoint_per_residue_label_membrane_mpnn model, A12 A13 A14 B2 B25",
)

argparser.add_argument( 507 | "--transmembrane_interface", 508 | type=str, 509 | default="", 510 | help="Provide interface residues when using checkpoint_per_residue_label_membrane_mpnn model, A12 A13 A14 B2 B25", 511 | ) 512 | 513 | argparser.add_argument( 514 | "--global_transmembrane_label", 515 | type=int, 516 | default=0, 517 | help="Provide global label for global_label_membrane_mpnn model. 1 - transmembrane, 0 - soluble", 518 | ) 519 | 520 | argparser.add_argument( 521 | "--parse_atoms_with_zero_occupancy", 522 | type=int, 523 | default=0, 524 | help="To parse atoms with zero occupancy in the PDB input files. 0 - do not parse, 1 - parse atoms with zero occupancy", 525 | ) 526 | 527 | argparser.add_argument( 528 | "--use_sequence", 529 | type=int, 530 | default=1, 531 | help="1 - get scores using amino acid sequence info; 0 - get scores using backbone info only", 532 | ) 533 | 534 | argparser.add_argument( 535 | "--autoregressive_score", 536 | type=int, 537 | default=0, 538 | help="1 - run autoregressive scoring function; p(AA_1|backbone); p(AA_2|backbone, AA_1) etc, 0 - False", 539 | ) 540 | 541 | argparser.add_argument( 542 | "--single_aa_score", 543 | type=int, 544 | default=1, 545 | help="1 - run single amino acid scoring function; p(AA_i|backbone, AA_{all except ith one}), 0 - False", 546 | ) 547 | 548 | args = argparser.parse_args() 549 | main(args) 550 | -------------------------------------------------------------------------------- /training/README.md: -------------------------------------------------------------------------------- 1 | ## Retraining LigandMPNN 2 | 3 | Training PDB ids: `train.json` 4 | 5 | Validation PDB ids: `valid.json` 6 | 7 | Test PDB ids: `test_small_molecule.json, test_nucleotide.json, test_metal.json` 8 | -------------------------------------------------------------------------------- /training/test_metal.json: -------------------------------------------------------------------------------- 1 | ["1dwh", "1e4m", "1e6s", "1e72", 
"1f35", "1fee", "1job", "1lqk", "1m5e", "1m5f", "1moj", "1mxy", "1mxz", "1my1", "1nki", "1qum", "1sgf", "1t31", "1u3e", "2bdh", "2bx2", "2cfv", "2e6c", "2nq9", "2nqj", "2nz6", "2ou7", "2vxx", "2zwn", "3bvx", "3cv5", "3f4v", "3f5l", "3fgg", "3hg9", "3hkn", "3hkt", "3i9z", "3k7r", "3l24", "3l7t", "3m7p", "3mi9", "3o1u", "3u92", "3u93", "3u94", "3won", "4aoj", "4dy1", "4hzt", "4i0f", "4i0j", "4i0z", "4i11", "4i12", "4jd1", "4naz", "4wd8", "4x68", "5f55", "5f56", "5fgs", "5hez", "5i4j", "5l70", "5vde", "6a4x", "6buu", "6cyt", "6iv2", "6lkp", "6lrd", "6wdz", "6x75", "7dnr", "7e34", "7kii", "7n7g", "7s7l", "7s7m", "7w5e", "7wb2"] -------------------------------------------------------------------------------- /training/test_nucleotide.json: -------------------------------------------------------------------------------- 1 | ["1a0a", "1am9", "1an4", "1b01", "1bc7", "1bc8", "1di2", "1ec6", "1hlo", "1hlv", "1i3j", "1pvi", "1qum", "1sfu", "1u3e", "1xpx", "1yo5", "1zx4", "2c5r", "2c62", "2nq9", "2o4a", "2p5l", "2xdb", "2ypb", "2zhg", "2zio", "3adl", "3bsu", "3fc3", "3g73", "3gna", "3gx4", "3lsr", "3mj0", "3mva", "3n7q", "3olt", "3vok", "3vwb", "3zp5", "4ato", "4bhm", "4bqa", "4e0p", "4nid", "4wal", "5cm3", "5haw", "5mht", "5vc9", "5w9s", "5ybd", "6bjv", "6dnw", "6fqr", "6gdr", "6kbs", "6lff", "6lmj", "6od4", "6wdz", "6x70", "6y93", "7bca", "7c0g", "7el3", "7jsa", "7ju3", "7kii", "7kij", "7mtl", "7z0u", "8dwm"] -------------------------------------------------------------------------------- /training/test_small_molecule.json: -------------------------------------------------------------------------------- 1 | ["1a28", "1bzc", "1drv", "1e3g", "1elb", "1elc", "1epo", "1f0r", "1g7f", "1g7g", "1gvw", "1gx8", "1i37", "1kav", "1kdk", "1kv1", "1l8g", "1lhu", "1lpg", "1nc1", "1nfx", "1nhz", "1nl9", "1nny", "1nwl", "1ony", "1pyn", "1qb1", "1qkt", "1qxk", "1r0p", "1sj0", "1sqn", "1v2n", "1xjd", "1xws", "1yc1", "1yqj", "1z95", "1zp8", "2ayr", "2b07", "2b4l", "2baj", "2bak", "2bal", 
"2bsm", "2cet", "2e2r", "2f6t", "2fdp", "2g94", "2hah", "2ihq", "2iwx", "2j2u", "2j34", "2j4i", "2j94", "2j95", "2o0u", "2oax", "2ojg", "2ojj", "2p4j", "2p7g", "2p7z", "2pog", "2qbp", "2qbq", "2qbs", "2qe4", "2qmg", "2uwl", "2uwo", "2uwp", "2v7a", "2vh0", "2vh6", "2vkm", "2vrj", "2vw5", "2vwc", "2w8y", "2wc3", "2web", "2wec", "2weq", "2wgj", "2wuf", "2wyg", "2wyj", "2xab", "2xb8", "2xda", "2xht", "2xj1", "2xj2", "2xjg", "2xjx", "2y7x", "2y7z", "2y80", "2y81", "2y82", "2ydw", "2yek", "2yel", "2yfe", "2yfx", "2yge", "2ygf", "2yi0", "2yi7", "2yix", "2zmm", "3acw", "3acx", "3b5r", "3b65", "3bgq", "3bgz", "3ckp", "3cow", "3coy", "3coz", "3d7z", "3d83", "3eax", "3ekr", "3fv1", "3fv2", "3fvk", "3gba", "3gbb", "3gcs", "3gcu", "3gy3", "3hek", "3i25", "3ioc", "3iph", "3iw6", "3k97", "3lpi", "3lpk", "3lxk", "3m35", "3myg", "3n76", "3nq3", "3nyx", "3o5x", "3o8p", "3pww", "3roc", "3tfn", "3u81", "3ueu", "3uev", "3uew", "3uex", "3vha", "3vhc", "3vhd", "3vje", "3vvy", "3vw1", "3vw2", "3wha", "3wz6", "3wz8", "3zc5", "3zm9", "3zze", "4a4v", "4a4w", "4a7i", "4ag8", "4ap7", "4b6o", "4b9k", "4cd0", "4cga", "4cmo", "4da5", "4e5w", "4e6d", "4e9u", "4ea2", "4egk", "4er1", "4fcq", "4ffs", "4flp", "4g8n", "4gny", "4gu6", "4hge", "4igt", "4k0y", "4k9y", "4kao", "4kcx", "4lyw", "4m0r", "4m12", "4m13", "4muf", "4nh8", "4nwc", "4o04", "4o05", "4o07", "4o09", "4o0b", "4p5z", "4pmm", "4pop", "4qev", "4qew", "4qyy", "4rfm", "4rwj", "4twp", "4uyf", "4v01", "4w9f", "4w9l", "4wa9", "4wkn", "4x6p", "4xip", "4xir", "4y79", "4ybk", "4ymb", "4yml", "4ynb", "4yth", "4z0k", "4zae", "5aa9", "5acy", "5d26", "5d3h", "5d3j", "5d3l", "5d3t", "5dlx", "5dqc", "5dwr", "5e74", "5egm", "5eng", "5eqp", "5eqy", "5er1", "5exm", "5exn", "5f9b", "5fto", "5fut", "5hcv", "5i3v", "5i3y", "5i9x", "5i9z", "5ie1", "5ih9", "5jq5", "5kz0", "5l2s", "5lli", "5lny", "5lsg", "5neb", "5nw1", "5nyh", "5op5", "5oq8", "5qqp", "5t19", "5tpx", "5v82", "5yfs", "5yft", "6c2r", "6cjr", "6cpw", "6dgq", "6dgr", "6dyu", "6dyv", "6el5", "6elo", 
"6elp", "6ey9", "6eyb", "6f1n", "6ge7", "6gf9", "6gfs", "6ghh", "6i61", "6i64", "6i67", "6md0", "6mh1", "6mh7", "6n7a", "6n8x", "6no9", "6nv7", "6nv9", "6olx", "6qi7"] --------------------------------------------------------------------------------