├── .gitignore ├── LICENSE ├── README.md ├── data_utils.py ├── get_model_params.sh ├── inputs ├── 1BC8.pdb ├── 2GFB.pdb ├── 4GYT.pdb ├── bias_AA_per_residue.json ├── bias_AA_per_residue_multi.json ├── fix_residues_multi.json ├── omit_AA_per_residue.json ├── omit_AA_per_residue_multi.json ├── pdb_ids.json └── redesigned_residues_multi.json ├── model_utils.py ├── openfold ├── __init__.py ├── config.py ├── data │ ├── __init__.py │ ├── data_modules.py │ ├── data_pipeline.py │ ├── data_transforms.py │ ├── errors.py │ ├── feature_pipeline.py │ ├── input_pipeline.py │ ├── mmcif_parsing.py │ ├── parsers.py │ ├── templates.py │ └── tools │ │ ├── __init__.py │ │ ├── hhblits.py │ │ ├── hhsearch.py │ │ ├── jackhmmer.py │ │ ├── kalign.py │ │ └── utils.py ├── np │ ├── __init__.py │ ├── protein.py │ ├── relax │ │ ├── __init__.py │ │ ├── amber_minimize.py │ │ ├── cleanup.py │ │ ├── relax.py │ │ └── utils.py │ └── residue_constants.py ├── resources │ └── __init__.py └── utils │ ├── feats.py │ ├── loss.py │ ├── rigid_utils.py │ └── tensor_utils.py ├── outputs ├── autoregressive_score_w_seq │ └── 1BC8_1.pt ├── autoregressive_score_wo_seq │ └── 1BC8_1.pt ├── batch_size │ ├── backbones │ │ ├── 1BC8_1.pdb │ │ ├── 1BC8_10.pdb │ │ ├── 1BC8_11.pdb │ │ ├── 1BC8_12.pdb │ │ ├── 1BC8_13.pdb │ │ ├── 1BC8_14.pdb │ │ ├── 1BC8_15.pdb │ │ ├── 1BC8_2.pdb │ │ ├── 1BC8_3.pdb │ │ ├── 1BC8_4.pdb │ │ ├── 1BC8_5.pdb │ │ ├── 1BC8_6.pdb │ │ ├── 1BC8_7.pdb │ │ ├── 1BC8_8.pdb │ │ └── 1BC8_9.pdb │ └── seqs │ │ └── 1BC8.fa ├── bias_AA_per_residue_multi │ ├── backbones │ │ ├── 1BC8_1.pdb │ │ └── 4GYT_1.pdb │ └── seqs │ │ ├── 1BC8.fa │ │ └── 4GYT.fa ├── chains_to_design │ ├── backbones │ │ └── 4GYT_1.pdb │ └── seqs │ │ └── 4GYT.fa ├── default │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── fasta_seq_separation │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── file_ending │ ├── backbones │ │ └── 1BC8_1_xyz.pdb │ └── seqs │ │ └── 1BC8_xyz.fa ├── fix_residues │ ├── backbones │ │ └── 
1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── fixed_residues_multi │ ├── backbones │ │ ├── 1BC8_1.pdb │ │ └── 4GYT_1.pdb │ └── seqs │ │ ├── 1BC8.fa │ │ └── 4GYT.fa ├── global_bias │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── global_label_membrane_mpnn_0 │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── global_omit │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── homooligomer │ ├── backbones │ │ ├── 4GYT_1.pdb │ │ └── 4GYT_2.pdb │ └── seqs │ │ └── 4GYT.fa ├── insertion_code │ ├── backbones │ │ └── 2GFB_1.pdb │ └── seqs │ │ └── 2GFB.fa ├── ligand_mpnn_cutoff_for_score │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── ligandmpnn_default │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── ligandmpnn_no_context │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── ligandmpnn_use_side_chain_atoms │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── ligandmpnn_v_32_005_25 │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── omit_AA_per_residue_multi │ ├── backbones │ │ ├── 1BC8_1.pdb │ │ └── 4GYT_1.pdb │ └── seqs │ │ ├── 1BC8.fa │ │ └── 4GYT.fa ├── parse_atoms_with_zero_occupancy │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── parse_these_chains_only │ ├── backbones │ │ └── 4GYT_1.pdb │ └── seqs │ │ └── 4GYT.fa ├── pdb_path_multi │ ├── backbones │ │ ├── 1BC8_1.pdb │ │ └── 4GYT_1.pdb │ └── seqs │ │ ├── 1BC8.fa │ │ └── 4GYT.fa ├── per_residue_bias │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── per_residue_label_membrane_mpnn_default │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── per_residue_omit │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── random_seed │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── redesign_residues │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── redesigned_residues_multi │ ├── backbones │ │ ├── 1BC8_1.pdb │ │ └── 4GYT_1.pdb │ └── 
seqs │ │ ├── 1BC8.fa │ │ └── 4GYT.fa ├── save_stats │ ├── backbones │ │ └── 1BC8_1.pdb │ ├── seqs │ │ └── 1BC8.fa │ └── stats │ │ └── 1BC8.pt ├── sc_default │ ├── backbones │ │ └── 1BC8_1.pdb │ ├── packed │ │ ├── 1BC8_packed_1_1.pdb │ │ ├── 1BC8_packed_1_2.pdb │ │ ├── 1BC8_packed_1_3.pdb │ │ └── 1BC8_packed_1_4.pdb │ └── seqs │ │ └── 1BC8.fa ├── sc_default_fast │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── sc_fixed_residues │ ├── backbones │ │ └── 1BC8_1.pdb │ ├── packed │ │ ├── 1BC8_packed_1_1.pdb │ │ ├── 1BC8_packed_1_2.pdb │ │ ├── 1BC8_packed_1_3.pdb │ │ └── 1BC8_packed_1_4.pdb │ └── seqs │ │ └── 1BC8.fa ├── sc_fixed_residues_full_repack │ ├── backbones │ │ └── 1BC8_1.pdb │ ├── packed │ │ ├── 1BC8_packed_1_1.pdb │ │ ├── 1BC8_packed_1_2.pdb │ │ ├── 1BC8_packed_1_3.pdb │ │ └── 1BC8_packed_1_4.pdb │ └── seqs │ │ └── 1BC8.fa ├── sc_no_context │ ├── backbones │ │ └── 1BC8_1.pdb │ ├── packed │ │ ├── 1BC8_packed_1_1.pdb │ │ ├── 1BC8_packed_1_2.pdb │ │ ├── 1BC8_packed_1_3.pdb │ │ └── 1BC8_packed_1_4.pdb │ └── seqs │ │ └── 1BC8.fa ├── single_aa_score_w_seq │ └── 1BC8_1.pt ├── single_aa_score_wo_seq │ └── 1BC8_1.pt ├── soluble_mpnn_default │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── symmetry │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── temperature │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa ├── verbose │ ├── backbones │ │ └── 1BC8_1.pdb │ └── seqs │ │ └── 1BC8.fa └── zero_indexed │ ├── backbones │ ├── 1BC8_0.pdb │ └── 1BC8_1.pdb │ └── seqs │ └── 1BC8.fa ├── requirements.txt ├── run.py ├── run_examples.sh ├── sc_examples.sh ├── sc_utils.py ├── score.py └── training ├── README.md ├── test_metal.json ├── test_nucleotide.json ├── test_small_molecule.json ├── train.json └── valid.json /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 
8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Justas Dauparas 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /get_model_params.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #make new directory for model parameters 4 | #e.g. bash get_model_params.sh "./model_params" 5 | 6 | mkdir -p $1 7 | 8 | #Original ProteinMPNN weights 9 | wget -q https://files.ipd.uw.edu/pub/ligandmpnn/proteinmpnn_v_48_002.pt -O $1"/proteinmpnn_v_48_002.pt" 10 | wget -q https://files.ipd.uw.edu/pub/ligandmpnn/proteinmpnn_v_48_010.pt -O $1"/proteinmpnn_v_48_010.pt" 11 | wget -q https://files.ipd.uw.edu/pub/ligandmpnn/proteinmpnn_v_48_020.pt -O $1"/proteinmpnn_v_48_020.pt" 12 | wget -q https://files.ipd.uw.edu/pub/ligandmpnn/proteinmpnn_v_48_030.pt -O $1"/proteinmpnn_v_48_030.pt" 13 | 14 | #ProteinMPNN with num_edges=32 15 | # wget -q https://files.ipd.uw.edu/pub/ligandmpnn/proteinmpnn_v_32_002.pt -O $1"/proteinmpnn_v_32_002.pt" 16 | # wget -q https://files.ipd.uw.edu/pub/ligandmpnn/proteinmpnn_v_32_010.pt -O $1"/proteinmpnn_v_32_010.pt" 17 | # wget -q https://files.ipd.uw.edu/pub/ligandmpnn/proteinmpnn_v_32_020.pt -O $1"/proteinmpnn_v_32_020.pt" 18 | # wget -q https://files.ipd.uw.edu/pub/ligandmpnn/proteinmpnn_v_32_030.pt -O $1"/proteinmpnn_v_32_030.pt" 19 | 20 | #LigandMPNN with num_edges=32; atom_context_num=25 21 | wget -q https://files.ipd.uw.edu/pub/ligandmpnn/ligandmpnn_v_32_005_25.pt -O $1"/ligandmpnn_v_32_005_25.pt" 22 | wget -q https://files.ipd.uw.edu/pub/ligandmpnn/ligandmpnn_v_32_010_25.pt -O $1"/ligandmpnn_v_32_010_25.pt" 23 | wget -q https://files.ipd.uw.edu/pub/ligandmpnn/ligandmpnn_v_32_020_25.pt -O $1"/ligandmpnn_v_32_020_25.pt" 24 | wget -q https://files.ipd.uw.edu/pub/ligandmpnn/ligandmpnn_v_32_030_25.pt -O $1"/ligandmpnn_v_32_030_25.pt" 25 | 26 | #LigandMPNN with num_edges=32; atom_context_num=16 27 | # wget -q https://files.ipd.uw.edu/pub/ligandmpnn/ligandmpnn_v_32_005_16.pt -O $1"/ligandmpnn_v_32_005_16.pt" 28 | # wget -q 
https://files.ipd.uw.edu/pub/ligandmpnn/ligandmpnn_v_32_010_16.pt -O $1"/ligandmpnn_v_32_010_16.pt" 29 | # wget -q https://files.ipd.uw.edu/pub/ligandmpnn/ligandmpnn_v_32_020_16.pt -O $1"/ligandmpnn_v_32_020_16.pt" 30 | # wget -q https://files.ipd.uw.edu/pub/ligandmpnn/ligandmpnn_v_32_030_16.pt -O $1"/ligandmpnn_v_32_030_16.pt" 31 | 32 | # wget -q https://files.ipd.uw.edu/pub/ligandmpnn/publication_version_ligandmpnn_v_32_010_25.pt -O $1"/publication_version_ligandmpnn_v_32_010_25.pt" 33 | 34 | #Per residue label membrane ProteinMPNN 35 | wget -q https://files.ipd.uw.edu/pub/ligandmpnn/per_residue_label_membrane_mpnn_v_48_020.pt -O $1"/per_residue_label_membrane_mpnn_v_48_020.pt" 36 | 37 | #Global label membrane ProteinMPNN 38 | wget -q https://files.ipd.uw.edu/pub/ligandmpnn/global_label_membrane_mpnn_v_48_020.pt -O $1"/global_label_membrane_mpnn_v_48_020.pt" 39 | 40 | #SolubleMPNN 41 | wget -q https://files.ipd.uw.edu/pub/ligandmpnn/solublempnn_v_48_002.pt -O $1"/solublempnn_v_48_002.pt" 42 | wget -q https://files.ipd.uw.edu/pub/ligandmpnn/solublempnn_v_48_010.pt -O $1"/solublempnn_v_48_010.pt" 43 | wget -q https://files.ipd.uw.edu/pub/ligandmpnn/solublempnn_v_48_020.pt -O $1"/solublempnn_v_48_020.pt" 44 | wget -q https://files.ipd.uw.edu/pub/ligandmpnn/solublempnn_v_48_030.pt -O $1"/solublempnn_v_48_030.pt" 45 | 46 | #LigandMPNN for side-chain packing (multi-step denoising model) 47 | wget -q https://files.ipd.uw.edu/pub/ligandmpnn/ligandmpnn_sc_v_32_002_16.pt -O $1"/ligandmpnn_sc_v_32_002_16.pt" 48 | -------------------------------------------------------------------------------- /inputs/bias_AA_per_residue.json: -------------------------------------------------------------------------------- 1 | { 2 | "C1": {"G": -0.3, "C": -2.0, "P": 10.8}, 3 | "C3": {"P": 10.0}, 4 | "C5": {"G": -1.3, "P": 10.0}, 5 | "C7": {"G": -1.3, "P": 10.0} 6 | } 7 | -------------------------------------------------------------------------------- /inputs/bias_AA_per_residue_multi.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "./inputs/1BC8.pdb": {"C1":{"A":3.0, "P":-2.0}, "C2":{"W":10.0, "G":-0.43}}, 3 | "./inputs/4GYT.pdb": {"A7":{"Y":5.0, "S":-2.0}, "A8":{"M":3.9, "G":-0.43}} 4 | } 5 | -------------------------------------------------------------------------------- /inputs/fix_residues_multi.json: -------------------------------------------------------------------------------- 1 | { 2 | "./inputs/1BC8.pdb": "C1 C2 C3 C4 C5 C10 C22", 3 | "./inputs/4GYT.pdb": "A7 A8 A9 A10 A11 A12 A13 B38" 4 | } 5 | -------------------------------------------------------------------------------- /inputs/omit_AA_per_residue.json: -------------------------------------------------------------------------------- 1 | { 2 | "C1": "ACDEFGHIKLMNPQRSTVW", 3 | "C3": "ACDEFGHIKLMNPQRSTVW", 4 | "C5": "ACDEFGHIKLMNPQRSTVW", 5 | "C7": "ACDEFGHIKLMNPQRSTVW" 6 | } 7 | -------------------------------------------------------------------------------- /inputs/omit_AA_per_residue_multi.json: -------------------------------------------------------------------------------- 1 | { 2 | "./inputs/1BC8.pdb": {"C1":"ACDEFGHILMNPQRSTVWY", "C2":"ACDEFGHILMNPQRSTVWY", "C3":"ACDEFGHILMNPQRSTVWY"}, 3 | "./inputs/4GYT.pdb": {"A7":"ACDEFGHILMNPQRSTVWY", "A8":"ACDEFGHILMNPQRSTVWY"} 4 | } 5 | -------------------------------------------------------------------------------- /inputs/pdb_ids.json: -------------------------------------------------------------------------------- 1 | { 2 | "./inputs/1BC8.pdb": "", 3 | "./inputs/4GYT.pdb": "" 4 | } 5 | -------------------------------------------------------------------------------- /inputs/redesigned_residues_multi.json: -------------------------------------------------------------------------------- 1 | { 2 | "./inputs/1BC8.pdb": "C1 C2 C3 C4 C5 C10", 3 | "./inputs/4GYT.pdb": "A7 A8 A9 A10 A12 A13 B38" 4 | } 5 | -------------------------------------------------------------------------------- 
# ======================================================================
# NOTE(review): this region of the dump interleaves two files. It is
# reconstructed below as properly formatted Python; the semantics are a
# verbatim transcription of the dumped content.
# ======================================================================

# ---- /openfold/data/__init__.py (next file header follows this block) ----

# ---- /openfold/__init__.py ----
# All package re-exports are deliberately disabled in this trimmed copy
# of openfold (only the subset needed by this project is vendored):
# from . import model
# from . import utils
# from . import np
# from . import resources
# __all__ = ["model", "utils", "np", "data", "resources"]

# ---- /openfold/config.py ----
import copy
import ml_collections as mlc


def set_inf(c, inf):
    """Recursively overwrite every key literally named "inf" in *c*.

    Used to lower the "infinity" constant across the whole config tree
    (e.g. for reduced-precision runs where 1e9 would overflow fp16).
    """
    for k, v in c.items():
        if isinstance(v, mlc.ConfigDict):
            set_inf(v, inf)
        elif k == "inf":
            c[k] = inf


def enforce_config_constraints(config):
    """Raise ValueError if mutually exclusive config flags are both set."""

    def string_to_setting(s):
        # Resolve a dotted path like "model.template.average_templates".
        path = s.split('.')
        setting = config
        for p in path:
            setting = setting[p]

        return setting

    mutually_exclusive_bools = [
        (
            "model.template.average_templates",
            "model.template.offload_templates"
        )
    ]

    for s1, s2 in mutually_exclusive_bools:
        s1_setting = string_to_setting(s1)
        s2_setting = string_to_setting(s2)
        if(s1_setting and s2_setting):
            raise ValueError(f"Only one of {s1} and {s2} may be set at a time")


def model_config(name, train=False, low_prec=False):
    """Return a deep copy of the base config specialized for preset *name*.

    Args:
        name: one of the AF2 presets ("initial_training", "finetuning",
            "finetuning_ptm", "model_1".."model_5", "model_1_ptm".."model_5_ptm").
        train: apply training-time settings (checkpointing, no chunking, ...).
        low_prec: loosen eps and cap "inf" values for low-precision runs.

    Raises:
        ValueError: for an unknown preset name, or (via
            enforce_config_constraints) for inconsistent flag combinations.
    """
    c = copy.deepcopy(config)
    if name == "initial_training":
        # AF2 Suppl. Table 4, "initial training" setting
        pass
    elif name == "finetuning":
        # AF2 Suppl. Table 4, "finetuning" setting
        c.data.train.max_extra_msa = 5120
        c.data.train.crop_size = 384
        c.data.train.max_msa_clusters = 512
        c.loss.violation.weight = 1.
        c.loss.experimentally_resolved.weight = 0.01
    elif name == "finetuning_ptm":
        c.data.train.max_extra_msa = 5120
        c.data.train.crop_size = 384
        c.data.train.max_msa_clusters = 512
        c.loss.violation.weight = 1.
        c.loss.experimentally_resolved.weight = 0.01
        c.model.heads.tm.enabled = True
        c.loss.tm.weight = 0.1
    elif name == "model_1":
        # AF2 Suppl. Table 5, Model 1.1.1
        c.data.train.max_extra_msa = 5120
        c.data.predict.max_extra_msa = 5120
        # NOTE(review): the dict below declares
        # "reduce_msa_clusters_by_max_templates"; this attribute name differs
        # ("max" vs "msa"), so this assignment adds a NEW key rather than
        # overriding the declared one — confirm against upstream openfold.
        c.data.common.reduce_max_clusters_by_max_templates = True
        c.data.common.use_templates = True
        c.data.common.use_template_torsion_angles = True
        c.model.template.enabled = True
    elif name == "model_2":
        # AF2 Suppl. Table 5, Model 1.1.2
        c.data.common.reduce_max_clusters_by_max_templates = True
        c.data.common.use_templates = True
        c.data.common.use_template_torsion_angles = True
        c.model.template.enabled = True
    elif name == "model_3":
        # AF2 Suppl. Table 5, Model 1.2.1
        c.data.train.max_extra_msa = 5120
        c.data.predict.max_extra_msa = 5120
        c.model.template.enabled = False
    elif name == "model_4":
        # AF2 Suppl. Table 5, Model 1.2.2
        c.data.train.max_extra_msa = 5120
        c.data.predict.max_extra_msa = 5120
        c.model.template.enabled = False
    elif name == "model_5":
        # AF2 Suppl. Table 5, Model 1.2.3
        c.model.template.enabled = False
    elif name == "model_1_ptm":
        c.data.train.max_extra_msa = 5120
        c.data.predict.max_extra_msa = 5120
        c.data.common.reduce_max_clusters_by_max_templates = True
        c.data.common.use_templates = True
        c.data.common.use_template_torsion_angles = True
        c.model.template.enabled = True
        c.model.heads.tm.enabled = True
        c.loss.tm.weight = 0.1
    elif name == "model_2_ptm":
        c.data.common.reduce_max_clusters_by_max_templates = True
        c.data.common.use_templates = True
        c.data.common.use_template_torsion_angles = True
        c.model.template.enabled = True
        c.model.heads.tm.enabled = True
        c.loss.tm.weight = 0.1
    elif name == "model_3_ptm":
        c.data.train.max_extra_msa = 5120
        c.data.predict.max_extra_msa = 5120
        c.model.template.enabled = False
        c.model.heads.tm.enabled = True
        c.loss.tm.weight = 0.1
    elif name == "model_4_ptm":
        c.data.train.max_extra_msa = 5120
        c.data.predict.max_extra_msa = 5120
        c.model.template.enabled = False
        c.model.heads.tm.enabled = True
        c.loss.tm.weight = 0.1
    elif name == "model_5_ptm":
        c.model.template.enabled = False
        c.model.heads.tm.enabled = True
        c.loss.tm.weight = 0.1
    else:
        raise ValueError("Invalid model name")

    if train:
        c.globals.blocks_per_ckpt = 1
        c.globals.chunk_size = None
        c.globals.use_lma = False
        c.globals.offload_inference = False
        c.model.template.average_templates = False
        c.model.template.offload_templates = False
    if low_prec:
        c.globals.eps = 1e-4
        # If we want exact numerical parity with the original, inf can't be
        # a global constant
        set_inf(c, 1e4)

    enforce_config_constraints(c)

    return c


# Recurring FieldReferences: changing one of these updates every config
# entry that references it.
c_z = mlc.FieldReference(128, field_type=int)
c_m = mlc.FieldReference(256, field_type=int)
c_t = mlc.FieldReference(64, field_type=int)
c_e = mlc.FieldReference(64, field_type=int)
c_s = mlc.FieldReference(384, field_type=int)
blocks_per_ckpt = mlc.FieldReference(None, field_type=int)
chunk_size = mlc.FieldReference(4, field_type=int)
aux_distogram_bins = mlc.FieldReference(64, field_type=int)
tm_enabled = mlc.FieldReference(False, field_type=bool)
eps = mlc.FieldReference(1e-8, field_type=float)
templates_enabled = mlc.FieldReference(True, field_type=bool)
embed_template_torsion_angles = mlc.FieldReference(True, field_type=bool)
tune_chunk_size = mlc.FieldReference(True, field_type=bool)

# Placeholder strings that stand in for the dynamic dimensions of each
# feature's shape (resolved at data-pipeline time).
NUM_RES = "num residues placeholder"
NUM_MSA_SEQ = "msa placeholder"
NUM_EXTRA_SEQ = "extra msa placeholder"
NUM_TEMPLATES = "num templates placeholder"

config = mlc.ConfigDict(
    {
        "data": {
            "common": {
                "feat": {
                    "aatype": [NUM_RES],
                    "all_atom_mask": [NUM_RES, None],
                    "all_atom_positions": [NUM_RES, None, None],
                    "alt_chi_angles": [NUM_RES, None],
                    "atom14_alt_gt_exists": [NUM_RES, None],
                    "atom14_alt_gt_positions": [NUM_RES, None, None],
                    "atom14_atom_exists": [NUM_RES, None],
                    "atom14_atom_is_ambiguous": [NUM_RES, None],
                    "atom14_gt_exists": [NUM_RES, None],
                    "atom14_gt_positions": [NUM_RES, None, None],
                    "atom37_atom_exists": [NUM_RES, None],
                    "backbone_rigid_mask": [NUM_RES],
                    "backbone_rigid_tensor": [NUM_RES, None, None],
                    "bert_mask": [NUM_MSA_SEQ, NUM_RES],
                    "chi_angles_sin_cos": [NUM_RES, None, None],
                    "chi_mask": [NUM_RES, None],
                    "extra_deletion_value": [NUM_EXTRA_SEQ, NUM_RES],
                    "extra_has_deletion": [NUM_EXTRA_SEQ, NUM_RES],
                    "extra_msa": [NUM_EXTRA_SEQ, NUM_RES],
                    "extra_msa_mask": [NUM_EXTRA_SEQ, NUM_RES],
                    "extra_msa_row_mask": [NUM_EXTRA_SEQ],
                    "is_distillation": [],
                    "msa_feat": [NUM_MSA_SEQ, NUM_RES, None],
                    "msa_mask": [NUM_MSA_SEQ, NUM_RES],
                    "msa_row_mask": [NUM_MSA_SEQ],
                    "no_recycling_iters": [],
                    "pseudo_beta": [NUM_RES, None],
                    "pseudo_beta_mask": [NUM_RES],
                    "residue_index": [NUM_RES],
                    "residx_atom14_to_atom37": [NUM_RES, None],
                    "residx_atom37_to_atom14": [NUM_RES, None],
                    "resolution": [],
                    "rigidgroups_alt_gt_frames": [NUM_RES, None, None, None],
                    "rigidgroups_group_exists": [NUM_RES, None],
                    "rigidgroups_group_is_ambiguous": [NUM_RES, None],
                    "rigidgroups_gt_exists": [NUM_RES, None],
                    "rigidgroups_gt_frames": [NUM_RES, None, None, None],
                    "seq_length": [],
                    "seq_mask": [NUM_RES],
                    "target_feat": [NUM_RES, None],
                    "template_aatype": [NUM_TEMPLATES, NUM_RES],
                    "template_all_atom_mask": [NUM_TEMPLATES, NUM_RES, None],
                    "template_all_atom_positions": [
                        NUM_TEMPLATES, NUM_RES, None, None,
                    ],
                    "template_alt_torsion_angles_sin_cos": [
                        NUM_TEMPLATES, NUM_RES, None, None,
                    ],
                    "template_backbone_rigid_mask": [NUM_TEMPLATES, NUM_RES],
                    "template_backbone_rigid_tensor": [
                        NUM_TEMPLATES, NUM_RES, None, None,
                    ],
                    "template_mask": [NUM_TEMPLATES],
                    "template_pseudo_beta": [NUM_TEMPLATES, NUM_RES, None],
                    "template_pseudo_beta_mask": [NUM_TEMPLATES, NUM_RES],
                    "template_sum_probs": [NUM_TEMPLATES, None],
                    "template_torsion_angles_mask": [
                        NUM_TEMPLATES, NUM_RES, None,
                    ],
                    "template_torsion_angles_sin_cos": [
                        NUM_TEMPLATES, NUM_RES, None, None,
                    ],
                    "true_msa": [NUM_MSA_SEQ, NUM_RES],
                    "use_clamped_fape": [],
                },
                "masked_msa": {
                    "profile_prob": 0.1,
                    "same_prob": 0.1,
                    "uniform_prob": 0.1,
                },
                "max_recycling_iters": 3,
                "msa_cluster_features": True,
                "reduce_msa_clusters_by_max_templates": False,
                "resample_msa_in_recycling": True,
                "template_features": [
                    "template_all_atom_positions",
                    "template_sum_probs",
                    "template_aatype",
                    "template_all_atom_mask",
                ],
                "unsupervised_features": [
                    "aatype",
                    "residue_index",
                    "msa",
                    "num_alignments",
                    "seq_length",
                    "between_segment_residues",
                    "deletion_matrix",
                    "no_recycling_iters",
                ],
                "use_templates": templates_enabled,
                "use_template_torsion_angles": embed_template_torsion_angles,
            },
            "supervised": {
                "clamp_prob": 0.9,
                "supervised_features": [
                    "all_atom_mask",
                    "all_atom_positions",
                    "resolution",
                    "use_clamped_fape",
                    "is_distillation",
                ],
            },
            "predict": {
                "fixed_size": True,
                "subsample_templates": False,  # We want top templates.
                "masked_msa_replace_fraction": 0.15,
                "max_msa_clusters": 512,
                "max_extra_msa": 1024,
                "max_template_hits": 4,
                "max_templates": 4,
                "crop": False,
                "crop_size": None,
                "supervised": False,
                "uniform_recycling": False,
            },
            "eval": {
                "fixed_size": True,
                "subsample_templates": False,  # We want top templates.
                "masked_msa_replace_fraction": 0.15,
                "max_msa_clusters": 128,
                "max_extra_msa": 1024,
                "max_template_hits": 4,
                "max_templates": 4,
                "crop": False,
                "crop_size": None,
                "supervised": True,
                "uniform_recycling": False,
            },
            "train": {
                "fixed_size": True,
                "subsample_templates": True,
                "masked_msa_replace_fraction": 0.15,
                "max_msa_clusters": 128,
                "max_extra_msa": 1024,
                "max_template_hits": 4,
                "max_templates": 4,
                "shuffle_top_k_prefiltered": 20,
                "crop": True,
                "crop_size": 256,
                "supervised": True,
                "clamp_prob": 0.9,
                "max_distillation_msa_clusters": 1000,
                "uniform_recycling": True,
                "distillation_prob": 0.75,
            },
            "data_module": {
                "use_small_bfd": False,
                "data_loaders": {
                    "batch_size": 1,
                    "num_workers": 16,
                },
            },
        },
        # Recurring FieldReferences that can be changed globally here
        "globals": {
            "blocks_per_ckpt": blocks_per_ckpt,
            "chunk_size": chunk_size,
            "use_lma": False,
            "offload_inference": False,
            "c_z": c_z,
            "c_m": c_m,
            "c_t": c_t,
            "c_e": c_e,
            "c_s": c_s,
            "eps": eps,
        },
        "model": {
            "_mask_trans": False,
            "input_embedder": {
                "tf_dim": 22,
                "msa_dim": 49,
                "c_z": c_z,
                "c_m": c_m,
                "relpos_k": 32,
            },
            "recycling_embedder": {
                "c_z": c_z,
                "c_m": c_m,
                "min_bin": 3.25,
                "max_bin": 20.75,
                "no_bins": 15,
                "inf": 1e8,
            },
            "template": {
                "distogram": {
                    "min_bin": 3.25,
                    "max_bin": 50.75,
                    "no_bins": 39,
                },
                "template_angle_embedder": {
                    # DISCREPANCY: c_in is supposed to be 51.
                    "c_in": 57,
                    "c_out": c_m,
                },
                "template_pair_embedder": {
                    "c_in": 88,
                    "c_out": c_t,
                },
                "template_pair_stack": {
                    "c_t": c_t,
                    # DISCREPANCY: c_hidden_tri_att here is given in the supplement
                    # as 64. In the code, it's 16.
                    "c_hidden_tri_att": 16,
                    "c_hidden_tri_mul": 64,
                    "no_blocks": 2,
                    "no_heads": 4,
                    "pair_transition_n": 2,
                    "dropout_rate": 0.25,
                    "blocks_per_ckpt": blocks_per_ckpt,
                    "tune_chunk_size": tune_chunk_size,
                    "inf": 1e9,
                },
                "template_pointwise_attention": {
                    "c_t": c_t,
                    "c_z": c_z,
                    # DISCREPANCY: c_hidden here is given in the supplement as 64.
                    # It's actually 16.
                    "c_hidden": 16,
                    "no_heads": 4,
                    "inf": 1e5,  # 1e9,
                },
                "inf": 1e5,  # 1e9,
                "eps": eps,  # 1e-6,
                "enabled": templates_enabled,
                "embed_angles": embed_template_torsion_angles,
                "use_unit_vector": False,
                # Approximate template computation, saving memory.
                # In our experiments, results are equivalent to or better than
                # the stock implementation. Should be enabled for all new
                # training runs.
                "average_templates": False,
                # Offload template embeddings to CPU memory. Vastly reduced
                # memory consumption at the cost of a modest increase in
                # runtime. Useful for inference on very long sequences.
                # Mutually exclusive with average_templates.
                "offload_templates": False,
            },
            "extra_msa": {
                "extra_msa_embedder": {
                    "c_in": 25,
                    "c_out": c_e,
                },
                "extra_msa_stack": {
                    "c_m": c_e,
                    "c_z": c_z,
                    "c_hidden_msa_att": 8,
                    "c_hidden_opm": 32,
                    "c_hidden_mul": 128,
                    "c_hidden_pair_att": 32,
                    "no_heads_msa": 8,
                    "no_heads_pair": 4,
                    "no_blocks": 4,
                    "transition_n": 4,
                    "msa_dropout": 0.15,
                    "pair_dropout": 0.25,
                    "clear_cache_between_blocks": False,
                    "tune_chunk_size": tune_chunk_size,
                    "inf": 1e9,
                    "eps": eps,  # 1e-10,
                    "ckpt": blocks_per_ckpt is not None,
                },
                "enabled": True,
            },
            "evoformer_stack": {
                "c_m": c_m,
                "c_z": c_z,
                "c_hidden_msa_att": 32,
                "c_hidden_opm": 32,
                "c_hidden_mul": 128,
                "c_hidden_pair_att": 32,
                "c_s": c_s,
                "no_heads_msa": 8,
                "no_heads_pair": 4,
                "no_blocks": 48,
                "transition_n": 4,
                "msa_dropout": 0.15,
                "pair_dropout": 0.25,
                "blocks_per_ckpt": blocks_per_ckpt,
                "clear_cache_between_blocks": False,
                "tune_chunk_size": tune_chunk_size,
                "inf": 1e9,
                "eps": eps,  # 1e-10,
            },
            "structure_module": {
                "c_s": c_s,
                "c_z": c_z,
                "c_ipa": 16,
                "c_resnet": 128,
                "no_heads_ipa": 12,
                "no_qk_points": 4,
                "no_v_points": 8,
                "dropout_rate": 0.1,
                "no_blocks": 8,
                "no_transition_layers": 1,
                "no_resnet_blocks": 2,
                "no_angles": 7,
                "trans_scale_factor": 10,
                "epsilon": eps,  # 1e-12,
                "inf": 1e5,
            },
            "heads": {
                "lddt": {
                    "no_bins": 50,
                    "c_in": c_s,
                    "c_hidden": 128,
                },
                "distogram": {
                    "c_z": c_z,
                    "no_bins": aux_distogram_bins,
                },
                "tm": {
                    "c_z": c_z,
                    "no_bins": aux_distogram_bins,
                    "enabled": tm_enabled,
                },
                "masked_msa": {
                    "c_m": c_m,
                    "c_out": 23,
                },
                "experimentally_resolved": {
                    "c_s": c_s,
                    "c_out": 37,
                },
            },
        },
        "relax": {
            "max_iterations": 0,  # no max
            "tolerance": 2.39,
            "stiffness": 10.0,
            "max_outer_iterations": 20,
            "exclude_residues": [],
        },
        "loss": {
            "distogram": {
                "min_bin": 2.3125,
                "max_bin": 21.6875,
                "no_bins": 64,
                "eps": eps,  # 1e-6,
                "weight": 0.3,
            },
            "experimentally_resolved": {
                "eps": eps,  # 1e-8,
                "min_resolution": 0.1,
                "max_resolution": 3.0,
                "weight": 0.0,
            },
            "fape": {
                "backbone": {
                    "clamp_distance": 10.0,
                    "loss_unit_distance": 10.0,
                    "weight": 0.5,
                },
                "sidechain": {
                    "clamp_distance": 10.0,
                    "length_scale": 10.0,
                    "weight": 0.5,
                },
                "eps": 1e-4,
                "weight": 1.0,
            },
            "lddt": {
                "min_resolution": 0.1,
                "max_resolution": 3.0,
                "cutoff": 15.0,
                "no_bins": 50,
                "eps": eps,  # 1e-10,
                "weight": 0.01,
            },
            "masked_msa": {
                "eps": eps,  # 1e-8,
                "weight": 2.0,
            },
            "supervised_chi": {
                "chi_weight": 0.5,
                "angle_norm_weight": 0.01,
                "eps": eps,  # 1e-6,
                "weight": 1.0,
            },
            "violation": {
                "violation_tolerance_factor": 12.0,
                "clash_overlap_tolerance": 1.5,
                "eps": eps,  # 1e-6,
                "weight": 0.0,
            },
            "tm": {
                "max_bin": 31,
                "no_bins": 64,
                "min_resolution": 0.1,
                "max_resolution": 3.0,
                "eps": eps,  # 1e-8,
                "weight": 0.,
                "enabled": tm_enabled,
            },
            "eps": eps,
        },
        "ema": {"decay": 0.999},
    }
)
# ---- openfold/data/errors.py ----
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
# Licensed under the Apache License, Version 2.0;
# see http://www.apache.org/licenses/LICENSE-2.0

"""General-purpose errors used throughout the data pipeline."""


class Error(Exception):
    """Base class for all data-pipeline exceptions."""


class MultipleChainsError(Error):
    """Raised when more than one chain matches a single chain ID."""
# ---- openfold/data/feature_pipeline.py ----
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
# Licensed under the Apache License, Version 2.0;
# see http://www.apache.org/licenses/LICENSE-2.0

import copy
from typing import Mapping, Tuple, List, Optional, Dict, Sequence

import ml_collections
import numpy as np
import torch

from openfold.data import input_pipeline


# Raw (NumPy) features keyed by name.
FeatureDict = Mapping[str, np.ndarray]
# Torch-tensor features keyed by name.
TensorDict = Dict[str, torch.Tensor]


def np_to_tensor_dict(
    np_example: Mapping[str, np.ndarray],
    features: Sequence[str],
) -> TensorDict:
    """Convert a dict of NumPy arrays into a dict of torch tensors.

    Args:
        np_example: A dict of NumPy feature arrays.
        features: Names of the features to keep; everything else is dropped.

    Returns:
        A dict mapping each retained feature name to a torch tensor.
    """
    wanted = set(features)
    return {
        name: torch.tensor(array)
        for name, array in np_example.items()
        if name in wanted
    }


def make_data_config(
    config: ml_collections.ConfigDict,
    mode: str,
    num_res: int,
) -> Tuple[ml_collections.ConfigDict, List[str]]:
    """Resolve the data config for ``mode`` and list the features to load.

    Args:
        config: The data section of the model config.
        mode: Which sub-config to use (e.g. "train", "eval", "predict").
        num_res: Number of residues; used as the crop size when none is set.

    Returns:
        A (deep-copied, resolved) config and the list of feature names.
    """
    cfg = copy.deepcopy(config)
    mode_cfg = cfg[mode]
    # Default the crop size to the full sequence length.
    with cfg.unlocked():
        if mode_cfg.crop_size is None:
            mode_cfg.crop_size = num_res

    names = cfg.common.unsupervised_features
    if cfg.common.use_templates:
        names += cfg.common.template_features
    if cfg[mode].supervised:
        names += cfg.supervised.supervised_features

    return cfg, names


def np_example_to_features(
    np_example: FeatureDict,
    config: ml_collections.ConfigDict,
    mode: str,
):
    """Turn a raw NumPy example into processed model input features.

    Args:
        np_example: Raw feature dict (must contain "seq_length").
        config: Data config (see ``make_data_config``).
        mode: Processing mode, e.g. "train".

    Returns:
        A dict of processed torch tensors.
    """
    np_example = dict(np_example)
    num_res = int(np_example["seq_length"][0])
    cfg, feature_names = make_data_config(config, mode=mode, num_res=num_res)

    # Legacy inputs store the deletion matrix as ints; promote to float32.
    if "deletion_matrix_int" in np_example:
        np_example["deletion_matrix"] = np_example.pop(
            "deletion_matrix_int"
        ).astype(np.float32)

    tensor_dict = np_to_tensor_dict(
        np_example=np_example, features=feature_names
    )
    # Feature processing is pure data transformation; no gradients needed.
    with torch.no_grad():
        features = input_pipeline.process_tensors_from_config(
            tensor_dict,
            cfg.common,
            cfg[mode],
        )

    return dict(features)


class FeaturePipeline:
    """Thin wrapper binding a data config to ``np_example_to_features``."""

    def __init__(
        self,
        config: ml_collections.ConfigDict,
    ):
        # Data config forwarded to every process_features call.
        self.config = config

    def process_features(
        self,
        raw_features: FeatureDict,
        mode: str = "train",
    ) -> FeatureDict:
        """Process ``raw_features`` according to the stored config."""
        return np_example_to_features(
            np_example=raw_features,
            config=self.config,
            mode=mode,
        )
# ---- openfold/data/input_pipeline.py ----
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
# Licensed under the Apache License, Version 2.0;
# see http://www.apache.org/licenses/LICENSE-2.0

from functools import partial

import torch

from openfold.data import data_transforms


def nonensembled_transform_fns(common_cfg, mode_cfg):
    """Input pipeline data transformers that are not ensembled."""
    fns = [
        data_transforms.cast_to_64bit_ints,
        data_transforms.correct_msa_restypes,
        data_transforms.squeeze_features,
        data_transforms.randomly_replace_msa_with_unknown(0.0),
        data_transforms.make_seq_mask,
        data_transforms.make_msa_mask,
        data_transforms.make_hhblits_profile,
    ]
    if common_cfg.use_templates:
        fns.extend(
            [
                data_transforms.fix_templates_aatype,
                data_transforms.make_template_mask,
                data_transforms.make_pseudo_beta("template_"),
            ]
        )
        if common_cfg.use_template_torsion_angles:
            fns.append(data_transforms.atom37_to_torsion_angles("template_"))

    fns.append(data_transforms.make_atom14_masks)

    if mode_cfg.supervised:
        fns.extend(
            [
                data_transforms.make_atom14_positions,
                data_transforms.atom37_to_frames,
                data_transforms.atom37_to_torsion_angles(""),
                data_transforms.make_pseudo_beta(""),
                data_transforms.get_backbone_frames,
                data_transforms.get_chi_angles,
            ]
        )

    return fns


def ensembled_transform_fns(common_cfg, mode_cfg, ensemble_seed):
    """Input pipeline data transformers that can be ensembled and averaged."""
    fns = []

    if "max_distillation_msa_clusters" in mode_cfg:
        fns.append(
            data_transforms.sample_msa_distillation(
                mode_cfg.max_distillation_msa_clusters
            )
        )

    if common_cfg.reduce_msa_clusters_by_max_templates:
        pad_msa_clusters = mode_cfg.max_msa_clusters - mode_cfg.max_templates
    else:
        pad_msa_clusters = mode_cfg.max_msa_clusters

    max_msa_clusters = pad_msa_clusters
    max_extra_msa = mode_cfg.max_extra_msa

    # When the MSA is NOT re-sampled each recycling iteration, fix the seed so
    # every replica draws the same subsample.
    msa_seed = None
    if not common_cfg.resample_msa_in_recycling:
        msa_seed = ensemble_seed

    fns.append(
        data_transforms.sample_msa(
            max_msa_clusters,
            keep_extra=True,
            seed=msa_seed,
        )
    )

    if "masked_msa" in common_cfg:
        # Masked MSA should come *before* MSA clustering so that
        # the clustering and full MSA profile do not leak information about
        # the masked locations and secret corrupted locations.
        fns.append(
            data_transforms.make_masked_msa(
                common_cfg.masked_msa, mode_cfg.masked_msa_replace_fraction
            )
        )

    if common_cfg.msa_cluster_features:
        fns.append(data_transforms.nearest_neighbor_clusters())
        fns.append(data_transforms.summarize_clusters())

    # Crop after creating the cluster profiles.
    if max_extra_msa:
        fns.append(data_transforms.crop_extra_msa(max_extra_msa))
    else:
        fns.append(data_transforms.delete_extra_msa)

    fns.append(data_transforms.make_msa_feat())

    crop_feats = dict(common_cfg.feat)

    if mode_cfg.fixed_size:
        fns.append(data_transforms.select_feat(list(crop_feats)))
        fns.append(
            data_transforms.random_crop_to_size(
                mode_cfg.crop_size,
                mode_cfg.max_templates,
                crop_feats,
                mode_cfg.subsample_templates,
                seed=ensemble_seed + 1,
            )
        )
        fns.append(
            data_transforms.make_fixed_size(
                crop_feats,
                pad_msa_clusters,
                mode_cfg.max_extra_msa,
                mode_cfg.crop_size,
                mode_cfg.max_templates,
            )
        )
    else:
        fns.append(data_transforms.crop_templates(mode_cfg.max_templates))

    return fns


def process_tensors_from_config(tensors, common_cfg, mode_cfg):
    """Based on the config, apply filters and transformations to the data."""

    ensemble_seed = torch.Generator().seed()

    def wrap_ensemble_fn(data, i):
        """Run the ensembled transforms on one replica of the data."""
        d = data.copy()
        fns = ensembled_transform_fns(
            common_cfg,
            mode_cfg,
            ensemble_seed,
        )
        d["ensemble_index"] = i
        return compose(fns)(d)

    # NOTE(review): computed but unused downstream — presumably kept for
    # parity with upstream; confirm before removing.
    no_templates = True
    if "template_aatype" in tensors:
        no_templates = tensors["template_aatype"].shape[0] == 0

    tensors = compose(
        nonensembled_transform_fns(
            common_cfg,
            mode_cfg,
        )
    )(tensors)

    if "no_recycling_iters" in tensors:
        num_recycling = int(tensors["no_recycling_iters"])
    else:
        num_recycling = common_cfg.max_recycling_iters

    # One replica per recycling iteration (plus the initial pass).
    return map_fn(
        lambda x: wrap_ensemble_fn(tensors, x),
        torch.arange(num_recycling + 1),
    )


@data_transforms.curry1
def compose(x, fs):
    """Apply each transform in ``fs`` to ``x``, left to right."""
    for f in fs:
        x = f(x)
    return x


def map_fn(fun, x):
    """Map ``fun`` over ``x`` and stack each feature along a new last dim."""
    ensembles = [fun(elem) for elem in x]
    ensembled_dict = {}
    for feat in ensembles[0].keys():
        ensembled_dict[feat] = torch.stack(
            [replica[feat] for replica in ensembles], dim=-1
        )
    return ensembled_dict
# ---- openfold/data/mmcif_parsing.py ----
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
# Licensed under the Apache License, Version 2.0;
# see http://www.apache.org/licenses/LICENSE-2.0

"""Parses the mmCIF file format."""
import collections
import dataclasses
import io
import json
import logging
import os
from typing import Any, Mapping, Optional, Sequence, Tuple

from Bio import PDB
from Bio.Data import SCOPData
import numpy as np

from openfold.data.errors import MultipleChainsError
import openfold.np.residue_constants as residue_constants


# Type aliases:
ChainId = str
PdbHeader = Mapping[str, Any]
PdbStructure = PDB.Structure.Structure
SeqRes = str
MmCIFDict = Mapping[str, Sequence[str]]


@dataclasses.dataclass(frozen=True)
class Monomer:
    """One SEQRES monomer: its residue name and position number."""
    id: str
    num: int


# Note - mmCIF format provides no guarantees on the type of author-assigned
# sequence numbers. They need not be integers.
@dataclasses.dataclass(frozen=True)
class AtomSite:
    """One row of the mmCIF _atom_site loop (field order matters: it is
    constructed positionally in _get_atom_site_list)."""
    residue_name: str
    author_chain_id: str
    mmcif_chain_id: str
    author_seq_num: str
    mmcif_seq_num: int
    insertion_code: str
    hetatm_atom: str
    model_num: int


# Used to map SEQRES index to a residue in the structure.
@dataclasses.dataclass(frozen=True)
class ResiduePosition:
    chain_id: str
    residue_number: int
    insertion_code: str


@dataclasses.dataclass(frozen=True)
class ResidueAtPosition:
    """A SEQRES residue, resolved to a structure position when present."""
    position: Optional[ResiduePosition]
    name: str
    is_missing: bool
    hetflag: str


@dataclasses.dataclass(frozen=True)
class MmcifObject:
    """Representation of a parsed mmCIF file.

    Contains:
      file_id: A meaningful name, e.g. a pdb_id. Should be unique amongst all
        files being processed.
      header: Biopython header.
      structure: Biopython structure.
      chain_to_seqres: Dict mapping chain_id to 1 letter amino acid sequence,
        e.g. {'A': 'ABCDEFG'}.
      seqres_to_structure: Dict; for each chain_id contains a mapping between
        SEQRES index and a ResidueAtPosition,
        e.g. {'A': {0: ResidueAtPosition, 1: ResidueAtPosition, ...}}.
      raw_string: The raw string used to construct the MmcifObject.
    """

    file_id: str
    header: PdbHeader
    structure: PdbStructure
    chain_to_seqres: Mapping[ChainId, SeqRes]
    seqres_to_structure: Mapping[ChainId, Mapping[int, ResidueAtPosition]]
    raw_string: Any


@dataclasses.dataclass(frozen=True)
class ParsingResult:
    """Returned by the parse function.

    Contains:
      mmcif_object: A MmcifObject; None if no chain could be parsed.
      errors: A dict mapping (file_id, chain_id) to any exception generated.
    """

    mmcif_object: Optional[MmcifObject]
    errors: Mapping[Tuple[str, str], Any]


class ParseError(Exception):
    """An error indicating that an mmCIF file could not be parsed."""


def mmcif_loop_to_list(
    prefix: str, parsed_info: MmCIFDict
) -> Sequence[Mapping[str, str]]:
    """Extracts loop associated with a prefix from mmCIF data as a list.

    Reference for loop_ in mmCIF:
    http://mmcif.wwpdb.org/docs/tutorials/mechanics/pdbx-mmcif-syntax.html

    Args:
      prefix: Prefix shared by each of the data items in the loop, including
        the trailing period (e.g. '_entity_poly_seq.').
      parsed_info: A dict of parsed mmCIF data, e.g. _mmcif_dict from a
        Biopython parser.

    Returns:
      A list of dicts; each dict represents 1 entry from an mmCIF loop.
    """
    cols = []
    data = []
    for key, value in parsed_info.items():
        if key.startswith(prefix):
            cols.append(key)
            data.append(value)

    # Every column of the loop must have the same number of rows.
    assert all(len(xs) == len(data[0]) for xs in data), (
        "mmCIF error: Not all loops are the same length: %s" % cols
    )

    return [dict(zip(cols, xs)) for xs in zip(*data)]


def mmcif_loop_to_dict(
    prefix: str,
    index: str,
    parsed_info: MmCIFDict,
) -> Mapping[str, Mapping[str, str]]:
    """Extracts loop associated with a prefix from mmCIF data as a dictionary.

    Args:
      prefix: Prefix shared by each of the data items in the loop, including
        the trailing period (e.g. '_entity_poly_seq.').
      index: Which item of loop data should serve as the key.
      parsed_info: A dict of parsed mmCIF data, e.g. _mmcif_dict from a
        Biopython parser.

    Returns:
      A dict of dicts; each inner dict represents 1 entry from an mmCIF loop,
      indexed by the index column.
    """
    entries = mmcif_loop_to_list(prefix, parsed_info)
    return {entry[index]: entry for entry in entries}


def parse(
    *, file_id: str, mmcif_string: str, catch_all_errors: bool = True
) -> ParsingResult:
    """Entry point, parses an mmcif_string.

    Args:
      file_id: A string identifier for this file; should be unique within the
        collection of files being processed.
      mmcif_string: Contents of an mmCIF file.
      catch_all_errors: If True, all exceptions are caught and returned in the
        ParsingResult. If False, exceptions propagate.

    Returns:
      A ParsingResult.
    """
    errors = {}
    try:
        parser = PDB.MMCIFParser(QUIET=True)
        handle = io.StringIO(mmcif_string)
        full_structure = parser.get_structure("", handle)
        first_model_structure = _get_first_model(full_structure)
        # Extract the _mmcif_dict from the parser, which contains useful
        # fields not reflected in the Biopython structure.
        parsed_info = parser._mmcif_dict  # pylint:disable=protected-access

        # Ensure all values are lists, even if singletons.
        for key, value in parsed_info.items():
            if not isinstance(value, list):
                parsed_info[key] = [value]

        header = _get_header(parsed_info)

        # Determine the protein chains, and their start numbers according to
        # the internal mmCIF numbering scheme (likely but not guaranteed 1).
        valid_chains = _get_protein_chains(parsed_info=parsed_info)
        if not valid_chains:
            return ParsingResult(
                None, {(file_id, ""): "No protein chains found in this file."}
            )
        seq_start_num = {
            chain_id: min(monomer.num for monomer in seq)
            for chain_id, seq in valid_chains.items()
        }

        # Loop over the atoms for which we have coordinates. Populate:
        # - mmcif_to_author_chain_id: internal mmCIF chain id -> author chain
        #   id (as used by Biopython).
        # - seq_to_structure_mappings: sequence index -> ResidueAtPosition.
        mmcif_to_author_chain_id = {}
        seq_to_structure_mappings = {}
        for atom in _get_atom_site_list(parsed_info):
            if atom.model_num != "1":
                # We only process the first model at the moment.
                continue

            mmcif_to_author_chain_id[atom.mmcif_chain_id] = atom.author_chain_id

            if atom.mmcif_chain_id in valid_chains:
                hetflag = " "
                if atom.hetatm_atom == "HETATM":
                    # Water atoms get Biopython's special hetflag "W" so the
                    # residue can later be fetched from the structure by id.
                    if atom.residue_name in ("HOH", "WAT"):
                        hetflag = "W"
                    else:
                        hetflag = "H_" + atom.residue_name
                insertion_code = atom.insertion_code
                if not _is_set(atom.insertion_code):
                    insertion_code = " "
                position = ResiduePosition(
                    chain_id=atom.author_chain_id,
                    residue_number=int(atom.author_seq_num),
                    insertion_code=insertion_code,
                )
                seq_idx = (
                    int(atom.mmcif_seq_num) - seq_start_num[atom.mmcif_chain_id]
                )
                chain_mapping = seq_to_structure_mappings.setdefault(
                    atom.author_chain_id, {}
                )
                chain_mapping[seq_idx] = ResidueAtPosition(
                    position=position,
                    name=atom.residue_name,
                    is_missing=False,
                    hetflag=hetflag,
                )

        # Add missing residue information to seq_to_structure_mappings.
        for chain_id, seq_info in valid_chains.items():
            author_chain = mmcif_to_author_chain_id[chain_id]
            current_mapping = seq_to_structure_mappings[author_chain]
            for idx, monomer in enumerate(seq_info):
                if idx not in current_mapping:
                    current_mapping[idx] = ResidueAtPosition(
                        position=None,
                        name=monomer.id,
                        is_missing=True,
                        hetflag=" ",
                    )

        # Build the 1-letter SEQRES string per author chain; unknown residue
        # codes (or multi-letter codes) become "X".
        author_chain_to_sequence = {}
        for chain_id, seq_info in valid_chains.items():
            author_chain = mmcif_to_author_chain_id[chain_id]
            one_letter = []
            for monomer in seq_info:
                code = SCOPData.protein_letters_3to1.get(monomer.id, "X")
                one_letter.append(code if len(code) == 1 else "X")
            author_chain_to_sequence[author_chain] = "".join(one_letter)

        mmcif_object = MmcifObject(
            file_id=file_id,
            header=header,
            structure=first_model_structure,
            chain_to_seqres=author_chain_to_sequence,
            seqres_to_structure=seq_to_structure_mappings,
            raw_string=parsed_info,
        )

        return ParsingResult(mmcif_object=mmcif_object, errors=errors)
    except Exception as e:  # pylint:disable=broad-except
        errors[(file_id, "")] = e
        if not catch_all_errors:
            raise
        return ParsingResult(mmcif_object=None, errors=errors)


def _get_first_model(structure: PdbStructure) -> PdbStructure:
    """Returns the first model in a Biopython structure."""
    return next(structure.get_models())


_MIN_LENGTH_OF_CHAIN_TO_BE_COUNTED_AS_PEPTIDE = 21


def get_release_date(parsed_info: MmCIFDict) -> str:
    """Returns the oldest revision date."""
    revision_dates = parsed_info["_pdbx_audit_revision_history.revision_date"]
    return min(revision_dates)


def _get_header(parsed_info: MmCIFDict) -> PdbHeader:
    """Returns a basic header containing method, release date and resolution."""
    header = {}

    experiments = mmcif_loop_to_list("_exptl.", parsed_info)
    header["structure_method"] = ",".join(
        experiment["_exptl.method"].lower() for experiment in experiments
    )

    # Note: The release_date here corresponds to the oldest revision. We
    # prefer to use this for dataset filtering over the deposition_date.
    if "_pdbx_audit_revision_history.revision_date" in parsed_info:
        header["release_date"] = get_release_date(parsed_info)
    else:
        logging.warning(
            "Could not determine release_date: %s", parsed_info["_entry.id"]
        )

    # Fall through the possible resolution keys; the last valid one wins.
    header["resolution"] = 0.00
    for res_key in (
        "_refine.ls_d_res_high",
        "_em_3d_reconstruction.resolution",
        "_reflns.d_resolution_high",
    ):
        if res_key in parsed_info:
            try:
                header["resolution"] = float(parsed_info[res_key][0])
            except ValueError:
                logging.info(
                    "Invalid resolution format: %s", parsed_info[res_key]
                )

    return header


def _get_atom_site_list(parsed_info: MmCIFDict) -> Sequence[AtomSite]:
    """Returns list of atom sites; contains data not present in the structure."""
    return [
        AtomSite(*site)
        for site in zip(  # pylint:disable=g-complex-comprehension
            parsed_info["_atom_site.label_comp_id"],
            parsed_info["_atom_site.auth_asym_id"],
            parsed_info["_atom_site.label_asym_id"],
            parsed_info["_atom_site.auth_seq_id"],
            parsed_info["_atom_site.label_seq_id"],
            parsed_info["_atom_site.pdbx_PDB_ins_code"],
            parsed_info["_atom_site.group_PDB"],
            parsed_info["_atom_site.pdbx_PDB_model_num"],
        )
    ]


def _get_protein_chains(
    *, parsed_info: Mapping[str, Any]
) -> Mapping[ChainId, Sequence[Monomer]]:
    """Extracts polymer information for protein chains only.

    Args:
      parsed_info: _mmcif_dict produced by the Biopython parser.

    Returns:
      A dict mapping mmcif chain id to a list of Monomers.
    """
    # Get polymer information for each entity in the structure.
    entity_poly_seqs = mmcif_loop_to_list("_entity_poly_seq.", parsed_info)

    polymers = collections.defaultdict(list)
    for entry in entity_poly_seqs:
        polymers[entry["_entity_poly_seq.entity_id"]].append(
            Monomer(
                id=entry["_entity_poly_seq.mon_id"],
                num=int(entry["_entity_poly_seq.num"]),
            )
        )

    # Chemical compositions let us identify which polymers are proteins.
    chem_comps = mmcif_loop_to_dict("_chem_comp.", "_chem_comp.id", parsed_info)

    # Chain info for each entity, so we can key the result on chain id
    # rather than entity id.
    struct_asyms = mmcif_loop_to_list("_struct_asym.", parsed_info)

    entity_to_mmcif_chains = collections.defaultdict(list)
    for struct_asym in struct_asyms:
        entity_to_mmcif_chains[struct_asym["_struct_asym.entity_id"]].append(
            struct_asym["_struct_asym.id"]
        )

    # Identify and return the valid protein chains: reject polymers without
    # any peptide-like components, such as DNA/RNA.
    valid_chains = {}
    for entity_id, seq_info in polymers.items():
        chain_ids = entity_to_mmcif_chains[entity_id]
        if any(
            "peptide" in chem_comps[monomer.id]["_chem_comp.type"]
            for monomer in seq_info
        ):
            for chain_id in chain_ids:
                valid_chains[chain_id] = seq_info
    return valid_chains


def _is_set(data: str) -> bool:
    """Returns False if data is a special mmCIF character indicating 'unset'."""
    return data not in (".", "?")


def get_atom_coords(
    mmcif_object: MmcifObject,
    chain_id: str,
    _zero_center_positions: bool = False
) -> Tuple[np.ndarray, np.ndarray]:
    """Extract atom37 positions and mask for one chain of an MmcifObject.

    Args:
      mmcif_object: A parsed mmCIF structure.
      chain_id: Author chain id to extract.
      _zero_center_positions: If True, subtract the mean of observed atom
        positions so the chain is centered at the origin.

    Returns:
      (all_atom_positions, all_atom_mask) float32 arrays of shapes
      [num_res, atom_type_num, 3] and [num_res, atom_type_num].

    Raises:
      MultipleChainsError: If the chain id does not match exactly one chain.
    """
    # Locate the right chain.
    chains = list(mmcif_object.structure.get_chains())
    relevant_chains = [c for c in chains if c.id == chain_id]
    if len(relevant_chains) != 1:
        raise MultipleChainsError(
            f"Expected exactly one chain in structure with id {chain_id}."
        )
    chain = relevant_chains[0]

    # Extract the coordinates.
    num_res = len(mmcif_object.chain_to_seqres[chain_id])
    n_atom_types = residue_constants.atom_type_num
    all_atom_positions = np.zeros(
        [num_res, n_atom_types, 3], dtype=np.float32
    )
    all_atom_mask = np.zeros(
        [num_res, n_atom_types], dtype=np.float32
    )
    for res_index in range(num_res):
        pos = np.zeros([n_atom_types, 3], dtype=np.float32)
        mask = np.zeros([n_atom_types], dtype=np.float32)
        res_at_position = mmcif_object.seqres_to_structure[chain_id][res_index]
        if not res_at_position.is_missing:
            res = chain[
                (
                    res_at_position.hetflag,
                    res_at_position.position.residue_number,
                    res_at_position.position.insertion_code,
                )
            ]
            for atom in res.get_atoms():
                atom_name = atom.get_name()
                x, y, z = atom.get_coord()
                if atom_name in residue_constants.atom_order:
                    atom_idx = residue_constants.atom_order[atom_name]
                    pos[atom_idx] = [x, y, z]
                    mask[atom_idx] = 1.0
                elif atom_name.upper() == "SE" and res.get_resname() == "MSE":
                    # Put the coords of the selenium atom in the sulphur column.
                    pos[residue_constants.atom_order["SD"]] = [x, y, z]
                    mask[residue_constants.atom_order["SD"]] = 1.0

        all_atom_positions[res_index] = pos
        all_atom_mask[res_index] = mask

    if _zero_center_positions:
        binary_mask = all_atom_mask.astype(bool)
        translation_vec = all_atom_positions[binary_mask].mean(axis=0)
        all_atom_positions[binary_mask] -= translation_vec

    return all_atom_positions, all_atom_mask
# ---- openfold/data/parsers.py ----
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
# Licensed under the Apache License, Version 2.0;
# see http://www.apache.org/licenses/LICENSE-2.0

"""Functions for parsing various file formats."""
import collections
import dataclasses
import re
import string
from typing import Dict, Iterable, List, Optional, Sequence, Tuple


# deletion_matrix[i][j] = number of residues deleted from aligned sequence i
# at residue position j.
DeletionMatrix = Sequence[Sequence[int]]


@dataclasses.dataclass(frozen=True)
class TemplateHit:
    """Class representing a template hit."""

    index: int
    name: str
    aligned_cols: int
    sum_probs: float
    query: str
    hit_sequence: str
    indices_query: List[int]
    indices_hit: List[int]


def parse_fasta(fasta_string: str) -> Tuple[Sequence[str], Sequence[str]]:
    """Parses FASTA string and returns sequences with their descriptions.

    Arguments:
      fasta_string: The string contents of a FASTA file.

    Returns:
      A tuple of two lists:
      * A list of amino-acid sequences.
      * A list of sequence descriptions taken from the comment lines, in the
        same order as the sequences.
    """
    sequences: List[str] = []
    descriptions: List[str] = []
    current = -1
    for raw_line in fasta_string.splitlines():
        stripped = raw_line.strip()
        if stripped.startswith(">"):
            # New record: the rest of the line (without '>') is its description.
            current += 1
            descriptions.append(stripped[1:])
            sequences.append("")
        elif stripped:
            # Continuation of the current record; blank lines are skipped.
            sequences[current] += stripped

    return sequences, descriptions


def parse_stockholm(
    stockholm_string: str,
) -> Tuple[Sequence[str], DeletionMatrix, Sequence[str]]:
    """Parses sequences and deletion matrix from stockholm format alignment.

    Args:
      stockholm_string: The string contents of a stockholm file. The first
        sequence in the file should be the query sequence.

    Returns:
      A tuple of:
      * A list of sequences that have been aligned to the query. These
        might contain duplicates.
      * The deletion matrix for the alignment as a list of lists. The element
        at `deletion_matrix[i][j]` is the number of residues deleted from
        the aligned sequence i at residue position j.
      * The names of the targets matched, including the jackhmmer subsequence
        suffix.
    """
    name_to_sequence = collections.OrderedDict()
    for raw_line in stockholm_string.splitlines():
        stripped = raw_line.strip()
        # Skip blanks, markup ('#...') and the end-of-alignment marker ('//').
        if not stripped or stripped.startswith(("#", "//")):
            continue
        name, chunk = stripped.split()
        name_to_sequence[name] = name_to_sequence.get(name, "") + chunk

    msa = []
    deletion_matrix = []

    query = ""
    keep_columns: List[int] = []
    for seq_index, sequence in enumerate(name_to_sequence.values()):
        if seq_index == 0:
            # The first sequence is the query; keep only its non-gap columns.
            query = sequence
            keep_columns = [i for i, res in enumerate(query) if res != "-"]

        # Remove the columns with gaps in the query from all sequences.
        msa.append("".join(sequence[c] for c in keep_columns))

        # Count the number of deletions w.r.t. the query.
        deletion_vec = []
        deletions = 0
        for seq_res, query_res in zip(sequence, query):
            if seq_res != "-" or query_res != "-":
                if query_res == "-":
                    deletions += 1
                else:
                    deletion_vec.append(deletions)
                    deletions = 0
        deletion_matrix.append(deletion_vec)

    return msa, deletion_matrix, list(name_to_sequence.keys())


def parse_a3m(a3m_string: str) -> Tuple[Sequence[str], DeletionMatrix]:
    """Parses sequences and deletion matrix from a3m format alignment.

    Args:
      a3m_string: The string contents of a a3m file. The first sequence in the
        file should be the query sequence.

    Returns:
      A tuple of:
      * A list of sequences that have been aligned to the query. These
        might contain duplicates.
      * The deletion matrix for the alignment as a list of lists. The element
        at `deletion_matrix[i][j]` is the number of residues deleted from
        the aligned sequence i at residue position j.
    """
    sequences, _ = parse_fasta(a3m_string)
    deletion_matrix = []
    for msa_sequence in sequences:
        # In a3m, lowercase letters mark insertions relative to the query,
        # i.e. deletions from the aligned sequence's point of view.
        deletion_vec = []
        deletions = 0
        for ch in msa_sequence:
            if ch.islower():
                deletions += 1
            else:
                deletion_vec.append(deletions)
                deletions = 0
        deletion_matrix.append(deletion_vec)

    # Make the MSA matrix out of aligned (deletion-free) sequences.
    strip_lowercase = str.maketrans("", "", string.ascii_lowercase)
    aligned_sequences = [s.translate(strip_lowercase) for s in sequences]
    return aligned_sequences, deletion_matrix


def _convert_sto_seq_to_a3m(
    query_non_gaps: Sequence[bool], sto_seq: str
) -> Iterable[str]:
    """Yield one a3m character per stockholm column: residues aligned to a
    query gap are lowercased (insertions); gaps there are dropped."""
    for keep_column, residue in zip(query_non_gaps, sto_seq):
        if keep_column:
            yield residue
        elif residue != "-":
            yield residue.lower()


def convert_stockholm_to_a3m(
    stockholm_format: str, max_sequences: Optional[int] = None
) -> str:
    """Converts MSA in Stockholm format to the A3M format."""
    descriptions: Dict[str, str] = {}
    sequences: Dict[str, str] = {}
    reached_max_sequences = False

    for line in stockholm_format.splitlines():
        reached_max_sequences = (
            max_sequences and len(sequences) >= max_sequences
        )
        # Ignore blank lines, markup and end symbols - the remainder are
        # alignment sequence parts.
        if line.strip() and not line.startswith(("#", "//")):
            seqname, aligned_seq = line.split(maxsplit=1)
            if seqname not in sequences:
                if reached_max_sequences:
                    continue
                sequences[seqname] = ""
            sequences[seqname] += aligned_seq

    for line in stockholm_format.splitlines():
        if line[:4] == "#=GS":
            # Description row - example format is:
            # #=GS UniRef90_Q9H5Z4/4-78 DE [subseq from] cDNA: FLJ22755 ...
            columns = line.split(maxsplit=3)
            seqname, feature = columns[1:3]
            value = columns[3] if len(columns) == 4 else ""
            if feature != "DE":
                continue
            if reached_max_sequences and seqname not in sequences:
                continue
            descriptions[seqname] = value
            if len(descriptions) == len(sequences):
                break

    # Convert sto format to a3m line by line; the query is assumed to be the
    # first sequence.
    a3m_sequences = {}
    query_sequence = next(iter(sequences.values()))
    query_non_gaps = [res != "-" for res in query_sequence]
    for seqname, sto_sequence in sequences.items():
        a3m_sequences[seqname] = "".join(
            _convert_sto_seq_to_a3m(query_non_gaps, sto_sequence)
        )

    fasta_chunks = (
        f">{k} {descriptions.get(k, '')}\n{a3m_sequences[k]}"
        for k in a3m_sequences
    )
    return "\n".join(fasta_chunks) + "\n"  # Include terminating newline.
226 | 227 | 228 | def _get_hhr_line_regex_groups( 229 | regex_pattern: str, line: str 230 | ) -> Sequence[Optional[str]]: 231 | match = re.match(regex_pattern, line) 232 | if match is None: 233 | raise RuntimeError(f"Could not parse query line {line}") 234 | return match.groups() 235 | 236 | 237 | def _update_hhr_residue_indices_list( 238 | sequence: str, start_index: int, indices_list: List[int] 239 | ): 240 | """Computes the relative indices for each residue with respect to the original sequence.""" 241 | counter = start_index 242 | for symbol in sequence: 243 | if symbol == "-": 244 | indices_list.append(-1) 245 | else: 246 | indices_list.append(counter) 247 | counter += 1 248 | 249 | 250 | def _parse_hhr_hit(detailed_lines: Sequence[str]) -> TemplateHit: 251 | """Parses the detailed HMM HMM comparison section for a single Hit. 252 | 253 | This works on .hhr files generated from both HHBlits and HHSearch. 254 | 255 | Args: 256 | detailed_lines: A list of lines from a single comparison section between 2 257 | sequences (which each have their own HMM's) 258 | 259 | Returns: 260 | A dictionary with the information from that detailed comparison section 261 | 262 | Raises: 263 | RuntimeError: If a certain line cannot be processed 264 | """ 265 | # Parse first 2 lines. 266 | number_of_hit = int(detailed_lines[0].split()[-1]) 267 | name_hit = detailed_lines[1][1:] 268 | 269 | # Parse the summary line. 270 | pattern = ( 271 | "Probab=(.*)[\t ]*E-value=(.*)[\t ]*Score=(.*)[\t ]*Aligned_cols=(.*)[\t" 272 | " ]*Identities=(.*)%[\t ]*Similarity=(.*)[\t ]*Sum_probs=(.*)[\t " 273 | "]*Template_Neff=(.*)" 274 | ) 275 | match = re.match(pattern, detailed_lines[2]) 276 | if match is None: 277 | raise RuntimeError( 278 | "Could not parse section: %s. Expected this: \n%s to contain summary." 
279 | % (detailed_lines, detailed_lines[2]) 280 | ) 281 | (prob_true, e_value, _, aligned_cols, _, _, sum_probs, neff) = [ 282 | float(x) for x in match.groups() 283 | ] 284 | 285 | # The next section reads the detailed comparisons. These are in a 'human 286 | # readable' format which has a fixed length. The strategy employed is to 287 | # assume that each block starts with the query sequence line, and to parse 288 | # that with a regexp in order to deduce the fixed length used for that block. 289 | query = "" 290 | hit_sequence = "" 291 | indices_query = [] 292 | indices_hit = [] 293 | length_block = None 294 | 295 | for line in detailed_lines[3:]: 296 | # Parse the query sequence line 297 | if ( 298 | line.startswith("Q ") 299 | and not line.startswith("Q ss_dssp") 300 | and not line.startswith("Q ss_pred") 301 | and not line.startswith("Q Consensus") 302 | ): 303 | # Thus the first 17 characters must be 'Q ', and we can parse 304 | # everything after that. 305 | # start sequence end total_sequence_length 306 | patt = r"[\t ]*([0-9]*) ([A-Z-]*)[\t ]*([0-9]*) \([0-9]*\)" 307 | groups = _get_hhr_line_regex_groups(patt, line[17:]) 308 | 309 | # Get the length of the parsed block using the start and finish indices, 310 | # and ensure it is the same as the actual block length. 311 | start = int(groups[0]) - 1 # Make index zero based. 312 | delta_query = groups[1] 313 | end = int(groups[2]) 314 | num_insertions = len([x for x in delta_query if x == "-"]) 315 | length_block = end - start + num_insertions 316 | assert length_block == len(delta_query) 317 | 318 | # Update the query sequence and indices list. 319 | query += delta_query 320 | _update_hhr_residue_indices_list(delta_query, start, indices_query) 321 | 322 | elif line.startswith("T "): 323 | # Parse the hit sequence. 
324 | if ( 325 | not line.startswith("T ss_dssp") 326 | and not line.startswith("T ss_pred") 327 | and not line.startswith("T Consensus") 328 | ): 329 | # Thus the first 17 characters must be 'T ', and we can 330 | # parse everything after that. 331 | # start sequence end total_sequence_length 332 | patt = r"[\t ]*([0-9]*) ([A-Z-]*)[\t ]*[0-9]* \([0-9]*\)" 333 | groups = _get_hhr_line_regex_groups(patt, line[17:]) 334 | start = int(groups[0]) - 1 # Make index zero based. 335 | delta_hit_sequence = groups[1] 336 | assert length_block == len(delta_hit_sequence) 337 | 338 | # Update the hit sequence and indices list. 339 | hit_sequence += delta_hit_sequence 340 | _update_hhr_residue_indices_list( 341 | delta_hit_sequence, start, indices_hit 342 | ) 343 | 344 | return TemplateHit( 345 | index=number_of_hit, 346 | name=name_hit, 347 | aligned_cols=int(aligned_cols), 348 | sum_probs=sum_probs, 349 | query=query, 350 | hit_sequence=hit_sequence, 351 | indices_query=indices_query, 352 | indices_hit=indices_hit, 353 | ) 354 | 355 | 356 | def parse_hhr(hhr_string: str) -> Sequence[TemplateHit]: 357 | """Parses the content of an entire HHR file.""" 358 | lines = hhr_string.splitlines() 359 | 360 | # Each .hhr file starts with a results table, then has a sequence of hit 361 | # "paragraphs", each paragraph starting with a line 'No '. We 362 | # iterate through each paragraph to parse each hit. 363 | 364 | block_starts = [i for i, line in enumerate(lines) if line.startswith("No ")] 365 | 366 | hits = [] 367 | if block_starts: 368 | block_starts.append(len(lines)) # Add the end of the final block. 
369 | for i in range(len(block_starts) - 1): 370 | hits.append( 371 | _parse_hhr_hit(lines[block_starts[i] : block_starts[i + 1]]) 372 | ) 373 | return hits 374 | 375 | 376 | def parse_e_values_from_tblout(tblout: str) -> Dict[str, float]: 377 | """Parse target to e-value mapping parsed from Jackhmmer tblout string.""" 378 | e_values = {"query": 0} 379 | lines = [line for line in tblout.splitlines() if line[0] != "#"] 380 | # As per http://eddylab.org/software/hmmer/Userguide.pdf fields are 381 | # space-delimited. Relevant fields are (1) target name: and 382 | # (5) E-value (full sequence) (numbering from 1). 383 | for line in lines: 384 | fields = line.split() 385 | e_value = fields[4] 386 | target_name = fields[0] 387 | e_values[target_name] = float(e_value) 388 | return e_values 389 | -------------------------------------------------------------------------------- /openfold/data/tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dauparas/LigandMPNN/26ec57ac976ade5379920dbd43c7f97a91cf82de/openfold/data/tools/__init__.py -------------------------------------------------------------------------------- /openfold/data/tools/hhblits.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 AlQuraishi Laboratory 2 | # Copyright 2021 DeepMind Technologies Limited 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and
# limitations under the License.

"""Library to run HHblits from Python."""
import glob
import logging
import os
import subprocess
from typing import Any, Mapping, Optional, Sequence

from openfold.data.tools import utils


# Default values of the HHblits -p and -Z flags; matching values are not
# passed on the command line (see query()).
_HHBLITS_DEFAULT_P = 20
_HHBLITS_DEFAULT_Z = 500


class HHBlits:
    """Python wrapper of the HHblits binary."""

    def __init__(
        self,
        *,
        binary_path: str,
        databases: Sequence[str],
        n_cpu: int = 4,
        n_iter: int = 3,
        e_value: float = 0.001,
        maxseq: int = 1_000_000,
        realign_max: int = 100_000,
        maxfilt: int = 100_000,
        min_prefilter_hits: int = 1000,
        all_seqs: bool = False,
        alt: Optional[int] = None,
        p: int = _HHBLITS_DEFAULT_P,
        z: int = _HHBLITS_DEFAULT_Z,
    ):
        """Initializes the Python HHblits wrapper.

        Args:
          binary_path: The path to the HHblits executable.
          databases: A sequence of HHblits database paths. This should be the
            common prefix for the database files (i.e. up to but not including
            _hhm.ffindex etc.)
          n_cpu: The number of CPUs to give HHblits.
          n_iter: The number of HHblits iterations.
          e_value: The E-value, see HHblits docs for more details.
          maxseq: The maximum number of rows in an input alignment. Note that this
            parameter is only supported in HHBlits version 3.1 and higher.
          realign_max: Max number of HMM-HMM hits to realign. HHblits default: 500.
          maxfilt: Max number of hits allowed to pass the 2nd prefilter.
            HHblits default: 20000.
          min_prefilter_hits: Min number of hits to pass prefilter.
            HHblits default: 100.
          all_seqs: Return all sequences in the MSA / Do not filter the result MSA.
            HHblits default: False.
          alt: Show up to this many alternative alignments.
          p: Minimum Prob for a hit to be included in the output hhr file.
            HHblits default: 20.
          z: Hard cap on number of hits reported in the hhr file.
            HHblits default: 500. NB: The relevant HHblits flag is -Z not -z.

        Raises:
          RuntimeError: If HHblits binary not found within the path.
        """
        self.binary_path = binary_path
        self.databases = databases

        # Fail fast: each database prefix must resolve to at least one
        # on-disk file (e.g. <prefix>_hhm.ffindex).
        for database_path in self.databases:
            if not glob.glob(database_path + "_*"):
                logging.error(
                    "Could not find HHBlits database %s", database_path
                )
                raise ValueError(
                    f"Could not find HHBlits database {database_path}"
                )

        self.n_cpu = n_cpu
        self.n_iter = n_iter
        self.e_value = e_value
        self.maxseq = maxseq
        self.realign_max = realign_max
        self.maxfilt = maxfilt
        self.min_prefilter_hits = min_prefilter_hits
        self.all_seqs = all_seqs
        self.alt = alt
        self.p = p
        self.z = z

    def query(self, input_fasta_path: str) -> Mapping[str, Any]:
        """Queries the database using HHblits.

        Args:
          input_fasta_path: Path to the query FASTA file.

        Returns:
          A dict with keys `a3m` (the alignment text), `output` and `stderr`
          (raw subprocess streams), and the `n_iter`/`e_value` settings used.

        Raises:
          RuntimeError: If the HHblits subprocess exits with a non-zero code.
        """
        with utils.tmpdir_manager(base_dir="/tmp") as query_tmp_dir:
            a3m_path = os.path.join(query_tmp_dir, "output.a3m")

            db_cmd = []
            for db_path in self.databases:
                db_cmd.append("-d")
                db_cmd.append(db_path)
            cmd = [
                self.binary_path,
                "-i",
                input_fasta_path,
                "-cpu",
                str(self.n_cpu),
                "-oa3m",
                a3m_path,
                "-o",
                "/dev/null",
                "-n",
                str(self.n_iter),
                "-e",
                str(self.e_value),
                "-maxseq",
                str(self.maxseq),
                "-realign_max",
                str(self.realign_max),
                "-maxfilt",
                str(self.maxfilt),
                "-min_prefilter_hits",
                str(self.min_prefilter_hits),
            ]
            if self.all_seqs:
                cmd += ["-all"]
            # NOTE(review): `if self.alt:` treats alt=0 the same as alt=None
            # (flag omitted) — presumably intentional; confirm if alt=0 is a
            # meaningful HHblits value.
            if self.alt:
                cmd += ["-alt", str(self.alt)]
            # Only pass -p/-Z when they differ from the HHblits defaults.
            if self.p != _HHBLITS_DEFAULT_P:
                cmd += ["-p", str(self.p)]
            if self.z != _HHBLITS_DEFAULT_Z:
                cmd += ["-Z", str(self.z)]
            cmd += db_cmd

            logging.info('Launching subprocess "%s"', " ".join(cmd))
            process = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
            )

            with utils.timing("HHblits query"):
                stdout, stderr = process.communicate()
                retcode = process.wait()

            if retcode:
                # Logs have a 15k character limit, so log HHblits error line by line.
                logging.error("HHblits failed. HHblits stderr begin:")
                for error_line in stderr.decode("utf-8").splitlines():
                    if error_line.strip():
                        logging.error(error_line.strip())
                logging.error("HHblits stderr end")
                raise RuntimeError(
                    "HHblits failed\nstdout:\n%s\n\nstderr:\n%s\n"
                    % (stdout.decode("utf-8"), stderr[:500_000].decode("utf-8"))
                )

            # Read the result before the tmpdir context deletes it.
            with open(a3m_path) as f:
                a3m = f.read()

            raw_output = dict(
                a3m=a3m,
                output=stdout,
                stderr=stderr,
                n_iter=self.n_iter,
                e_value=self.e_value,
            )
        return raw_output
--------------------------------------------------------------------------------
/openfold/data/tools/hhsearch.py:
--------------------------------------------------------------------------------
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
15 | 16 | """Library to run HHsearch from Python.""" 17 | import glob 18 | import logging 19 | import os 20 | import subprocess 21 | from typing import Sequence 22 | 23 | from openfold.data.tools import utils 24 | 25 | 26 | class HHSearch: 27 | """Python wrapper of the HHsearch binary.""" 28 | 29 | def __init__( 30 | self, 31 | *, 32 | binary_path: str, 33 | databases: Sequence[str], 34 | n_cpu: int = 2, 35 | maxseq: int = 1_000_000, 36 | ): 37 | """Initializes the Python HHsearch wrapper. 38 | 39 | Args: 40 | binary_path: The path to the HHsearch executable. 41 | databases: A sequence of HHsearch database paths. This should be the 42 | common prefix for the database files (i.e. up to but not including 43 | _hhm.ffindex etc.) 44 | n_cpu: The number of CPUs to use 45 | maxseq: The maximum number of rows in an input alignment. Note that this 46 | parameter is only supported in HHBlits version 3.1 and higher. 47 | 48 | Raises: 49 | RuntimeError: If HHsearch binary not found within the path. 
50 | """ 51 | self.binary_path = binary_path 52 | self.databases = databases 53 | self.n_cpu = n_cpu 54 | self.maxseq = maxseq 55 | 56 | for database_path in self.databases: 57 | if not glob.glob(database_path + "_*"): 58 | logging.error( 59 | "Could not find HHsearch database %s", database_path 60 | ) 61 | raise ValueError( 62 | f"Could not find HHsearch database {database_path}" 63 | ) 64 | 65 | def query(self, a3m: str) -> str: 66 | """Queries the database using HHsearch using a given a3m.""" 67 | with utils.tmpdir_manager(base_dir="/tmp") as query_tmp_dir: 68 | input_path = os.path.join(query_tmp_dir, "query.a3m") 69 | hhr_path = os.path.join(query_tmp_dir, "output.hhr") 70 | with open(input_path, "w") as f: 71 | f.write(a3m) 72 | 73 | db_cmd = [] 74 | for db_path in self.databases: 75 | db_cmd.append("-d") 76 | db_cmd.append(db_path) 77 | cmd = [ 78 | self.binary_path, 79 | "-i", 80 | input_path, 81 | "-o", 82 | hhr_path, 83 | "-maxseq", 84 | str(self.maxseq), 85 | "-cpu", 86 | str(self.n_cpu), 87 | ] + db_cmd 88 | 89 | logging.info('Launching subprocess "%s"', " ".join(cmd)) 90 | process = subprocess.Popen( 91 | cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE 92 | ) 93 | with utils.timing("HHsearch query"): 94 | stdout, stderr = process.communicate() 95 | retcode = process.wait() 96 | 97 | if retcode: 98 | # Stderr is truncated to prevent proto size errors in Beam. 
99 | raise RuntimeError( 100 | "HHSearch failed:\nstdout:\n%s\n\nstderr:\n%s\n" 101 | % (stdout.decode("utf-8"), stderr[:100_000].decode("utf-8")) 102 | ) 103 | 104 | with open(hhr_path) as f: 105 | hhr = f.read() 106 | return hhr 107 | -------------------------------------------------------------------------------- /openfold/data/tools/jackhmmer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 AlQuraishi Laboratory 2 | # Copyright 2021 DeepMind Technologies Limited 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """Library to run Jackhmmer from Python.""" 17 | 18 | from concurrent import futures 19 | import glob 20 | import logging 21 | import os 22 | import subprocess 23 | from typing import Any, Callable, Mapping, Optional, Sequence 24 | from urllib import request 25 | 26 | from openfold.data.tools import utils 27 | 28 | 29 | class Jackhmmer: 30 | """Python wrapper of the Jackhmmer binary.""" 31 | 32 | def __init__( 33 | self, 34 | *, 35 | binary_path: str, 36 | database_path: str, 37 | n_cpu: int = 8, 38 | n_iter: int = 1, 39 | e_value: float = 0.0001, 40 | z_value: Optional[int] = None, 41 | get_tblout: bool = False, 42 | filter_f1: float = 0.0005, 43 | filter_f2: float = 0.00005, 44 | filter_f3: float = 0.0000005, 45 | incdom_e: Optional[float] = None, 46 | dom_e: Optional[float] = None, 47 | num_streamed_chunks: Optional[int] = None, 48 | streaming_callback: Optional[Callable[[int], None]] = None, 49 | ): 50 | """Initializes the Python Jackhmmer wrapper. 51 | 52 | Args: 53 | binary_path: The path to the jackhmmer executable. 54 | database_path: The path to the jackhmmer database (FASTA format). 55 | n_cpu: The number of CPUs to give Jackhmmer. 56 | n_iter: The number of Jackhmmer iterations. 57 | e_value: The E-value, see Jackhmmer docs for more details. 58 | z_value: The Z-value, see Jackhmmer docs for more details. 59 | get_tblout: Whether to save tblout string. 60 | filter_f1: MSV and biased composition pre-filter, set to >1.0 to turn off. 61 | filter_f2: Viterbi pre-filter, set to >1.0 to turn off. 62 | filter_f3: Forward pre-filter, set to >1.0 to turn off. 63 | incdom_e: Domain e-value criteria for inclusion of domains in MSA/next 64 | round. 65 | dom_e: Domain e-value criteria for inclusion in tblout. 66 | num_streamed_chunks: Number of database chunks to stream over. 67 | streaming_callback: Callback function run after each chunk iteration with 68 | the iteration number as argument. 
69 | """ 70 | self.binary_path = binary_path 71 | self.database_path = database_path 72 | self.num_streamed_chunks = num_streamed_chunks 73 | 74 | if ( 75 | not os.path.exists(self.database_path) 76 | and num_streamed_chunks is None 77 | ): 78 | logging.error("Could not find Jackhmmer database %s", database_path) 79 | raise ValueError( 80 | f"Could not find Jackhmmer database {database_path}" 81 | ) 82 | 83 | self.n_cpu = n_cpu 84 | self.n_iter = n_iter 85 | self.e_value = e_value 86 | self.z_value = z_value 87 | self.filter_f1 = filter_f1 88 | self.filter_f2 = filter_f2 89 | self.filter_f3 = filter_f3 90 | self.incdom_e = incdom_e 91 | self.dom_e = dom_e 92 | self.get_tblout = get_tblout 93 | self.streaming_callback = streaming_callback 94 | 95 | def _query_chunk( 96 | self, input_fasta_path: str, database_path: str 97 | ) -> Mapping[str, Any]: 98 | """Queries the database chunk using Jackhmmer.""" 99 | with utils.tmpdir_manager(base_dir="/tmp") as query_tmp_dir: 100 | sto_path = os.path.join(query_tmp_dir, "output.sto") 101 | 102 | # The F1/F2/F3 are the expected proportion to pass each of the filtering 103 | # stages (which get progressively more expensive), reducing these 104 | # speeds up the pipeline at the expensive of sensitivity. They are 105 | # currently set very low to make querying Mgnify run in a reasonable 106 | # amount of time. 107 | cmd_flags = [ 108 | # Don't pollute stdout with Jackhmmer output. 109 | "-o", 110 | "/dev/null", 111 | "-A", 112 | sto_path, 113 | "--noali", 114 | "--F1", 115 | str(self.filter_f1), 116 | "--F2", 117 | str(self.filter_f2), 118 | "--F3", 119 | str(self.filter_f3), 120 | "--incE", 121 | str(self.e_value), 122 | # Report only sequences with E-values <= x in per-sequence output. 
123 | "-E", 124 | str(self.e_value), 125 | "--cpu", 126 | str(self.n_cpu), 127 | "-N", 128 | str(self.n_iter), 129 | ] 130 | if self.get_tblout: 131 | tblout_path = os.path.join(query_tmp_dir, "tblout.txt") 132 | cmd_flags.extend(["--tblout", tblout_path]) 133 | 134 | if self.z_value: 135 | cmd_flags.extend(["-Z", str(self.z_value)]) 136 | 137 | if self.dom_e is not None: 138 | cmd_flags.extend(["--domE", str(self.dom_e)]) 139 | 140 | if self.incdom_e is not None: 141 | cmd_flags.extend(["--incdomE", str(self.incdom_e)]) 142 | 143 | cmd = ( 144 | [self.binary_path] 145 | + cmd_flags 146 | + [input_fasta_path, database_path] 147 | ) 148 | 149 | logging.info('Launching subprocess "%s"', " ".join(cmd)) 150 | process = subprocess.Popen( 151 | cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE 152 | ) 153 | with utils.timing( 154 | f"Jackhmmer ({os.path.basename(database_path)}) query" 155 | ): 156 | _, stderr = process.communicate() 157 | retcode = process.wait() 158 | 159 | if retcode: 160 | raise RuntimeError( 161 | "Jackhmmer failed\nstderr:\n%s\n" % stderr.decode("utf-8") 162 | ) 163 | 164 | # Get e-values for each target name 165 | tbl = "" 166 | if self.get_tblout: 167 | with open(tblout_path) as f: 168 | tbl = f.read() 169 | 170 | with open(sto_path) as f: 171 | sto = f.read() 172 | 173 | raw_output = dict( 174 | sto=sto, 175 | tbl=tbl, 176 | stderr=stderr, 177 | n_iter=self.n_iter, 178 | e_value=self.e_value, 179 | ) 180 | 181 | return raw_output 182 | 183 | def query(self, input_fasta_path: str) -> Sequence[Mapping[str, Any]]: 184 | """Queries the database using Jackhmmer.""" 185 | if self.num_streamed_chunks is None: 186 | return [self._query_chunk(input_fasta_path, self.database_path)] 187 | 188 | db_basename = os.path.basename(self.database_path) 189 | db_remote_chunk = lambda db_idx: f"{self.database_path}.{db_idx}" 190 | db_local_chunk = lambda db_idx: f"/tmp/ramdisk/{db_basename}.{db_idx}" 191 | 192 | # Remove existing files to prevent OOM 193 | for f 
in glob.glob(db_local_chunk("[0-9]*")): 194 | try: 195 | os.remove(f) 196 | except OSError: 197 | print(f"OSError while deleting {f}") 198 | 199 | # Download the (i+1)-th chunk while Jackhmmer is running on the i-th chunk 200 | with futures.ThreadPoolExecutor(max_workers=2) as executor: 201 | chunked_output = [] 202 | for i in range(1, self.num_streamed_chunks + 1): 203 | # Copy the chunk locally 204 | if i == 1: 205 | future = executor.submit( 206 | request.urlretrieve, 207 | db_remote_chunk(i), 208 | db_local_chunk(i), 209 | ) 210 | if i < self.num_streamed_chunks: 211 | next_future = executor.submit( 212 | request.urlretrieve, 213 | db_remote_chunk(i + 1), 214 | db_local_chunk(i + 1), 215 | ) 216 | 217 | # Run Jackhmmer with the chunk 218 | future.result() 219 | chunked_output.append( 220 | self._query_chunk(input_fasta_path, db_local_chunk(i)) 221 | ) 222 | 223 | # Remove the local copy of the chunk 224 | os.remove(db_local_chunk(i)) 225 | future = next_future 226 | if self.streaming_callback: 227 | self.streaming_callback(i) 228 | return chunked_output 229 | -------------------------------------------------------------------------------- /openfold/data/tools/kalign.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 AlQuraishi Laboratory 2 | # Copyright 2021 DeepMind Technologies Limited 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """A Python wrapper for Kalign.""" 17 | import os 18 | import subprocess 19 | from typing import Sequence 20 | 21 | from absl import logging 22 | 23 | from openfold.data.tools import utils 24 | 25 | 26 | def _to_a3m(sequences: Sequence[str]) -> str: 27 | """Converts sequences to an a3m file.""" 28 | names = ["sequence %d" % i for i in range(1, len(sequences) + 1)] 29 | a3m = [] 30 | for sequence, name in zip(sequences, names): 31 | a3m.append(u">" + name + u"\n") 32 | a3m.append(sequence + u"\n") 33 | return "".join(a3m) 34 | 35 | 36 | class Kalign: 37 | """Python wrapper of the Kalign binary.""" 38 | 39 | def __init__(self, *, binary_path: str): 40 | """Initializes the Python Kalign wrapper. 41 | 42 | Args: 43 | binary_path: The path to the Kalign binary. 44 | 45 | Raises: 46 | RuntimeError: If Kalign binary not found within the path. 47 | """ 48 | self.binary_path = binary_path 49 | 50 | def align(self, sequences: Sequence[str]) -> str: 51 | """Aligns the sequences and returns the alignment in A3M string. 52 | 53 | Args: 54 | sequences: A list of query sequence strings. The sequences have to be at 55 | least 6 residues long (Kalign requires this). Note that the order in 56 | which you give the sequences might alter the output slightly as 57 | different alignment tree might get constructed. 58 | 59 | Returns: 60 | A string with the alignment in a3m format. 61 | 62 | Raises: 63 | RuntimeError: If Kalign fails. 64 | ValueError: If any of the sequences is less than 6 residues long. 65 | """ 66 | logging.info("Aligning %d sequences", len(sequences)) 67 | 68 | for s in sequences: 69 | if len(s) < 6: 70 | raise ValueError( 71 | "Kalign requires all sequences to be at least 6 " 72 | "residues long. Got %s (%d residues)." 
% (s, len(s)) 73 | ) 74 | 75 | with utils.tmpdir_manager(base_dir="/tmp") as query_tmp_dir: 76 | input_fasta_path = os.path.join(query_tmp_dir, "input.fasta") 77 | output_a3m_path = os.path.join(query_tmp_dir, "output.a3m") 78 | 79 | with open(input_fasta_path, "w") as f: 80 | f.write(_to_a3m(sequences)) 81 | 82 | cmd = [ 83 | self.binary_path, 84 | "-i", 85 | input_fasta_path, 86 | "-o", 87 | output_a3m_path, 88 | "-format", 89 | "fasta", 90 | ] 91 | 92 | logging.info('Launching subprocess "%s"', " ".join(cmd)) 93 | process = subprocess.Popen( 94 | cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE 95 | ) 96 | 97 | with utils.timing("Kalign query"): 98 | stdout, stderr = process.communicate() 99 | retcode = process.wait() 100 | logging.info( 101 | "Kalign stdout:\n%s\n\nstderr:\n%s\n", 102 | stdout.decode("utf-8"), 103 | stderr.decode("utf-8"), 104 | ) 105 | 106 | if retcode: 107 | raise RuntimeError( 108 | "Kalign failed\nstdout:\n%s\n\nstderr:\n%s\n" 109 | % (stdout.decode("utf-8"), stderr.decode("utf-8")) 110 | ) 111 | 112 | with open(output_a3m_path) as f: 113 | a3m = f.read() 114 | 115 | return a3m 116 | -------------------------------------------------------------------------------- /openfold/data/tools/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 AlQuraishi Laboratory 2 | # Copyright 2021 DeepMind Technologies Limited 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Common utilities for data pipeline tools.""" 17 | import contextlib 18 | import datetime 19 | import logging 20 | import shutil 21 | import tempfile 22 | import time 23 | from typing import Optional 24 | 25 | 26 | @contextlib.contextmanager 27 | def tmpdir_manager(base_dir: Optional[str] = None): 28 | """Context manager that deletes a temporary directory on exit.""" 29 | tmpdir = tempfile.mkdtemp(dir=base_dir) 30 | try: 31 | yield tmpdir 32 | finally: 33 | shutil.rmtree(tmpdir, ignore_errors=True) 34 | 35 | 36 | @contextlib.contextmanager 37 | def timing(msg: str): 38 | logging.info("Started %s", msg) 39 | tic = time.perf_counter() 40 | yield 41 | toc = time.perf_counter() 42 | logging.info("Finished %s in %.3f seconds", msg, toc - tic) 43 | 44 | 45 | def to_date(s: str): 46 | return datetime.datetime( 47 | year=int(s[:4]), month=int(s[5:7]), day=int(s[8:10]) 48 | ) 49 | -------------------------------------------------------------------------------- /openfold/np/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import importlib as importlib 4 | 5 | _files = glob.glob(os.path.join(os.path.dirname(__file__), "*.py")) 6 | __all__ = [ 7 | os.path.basename(f)[:-3] 8 | for f in _files 9 | if os.path.isfile(f) and not f.endswith("__init__.py") 10 | ] 11 | _modules = [(m, importlib.import_module("." 
import os
import glob
import importlib as importlib

# Auto-import every sibling .py module so both `from <pkg> import <mod>`
# and attribute access on the package work without a hand-maintained list.
_files = glob.glob(os.path.join(os.path.dirname(__file__), "*.py"))
__all__ = [
    os.path.basename(f)[:-3]
    for f in _files
    if os.path.isfile(f) and not f.endswith("__init__.py")
]
for _m in __all__:
    globals()[_m] = importlib.import_module("." + _m, __name__)

# Avoid needlessly cluttering the global namespace. The original ran
# `del _m` unconditionally, which raises NameError if the package holds
# no submodules (the loop never binds _m); guard that edge case.
del _files
if __all__:
    del _m
def from_pdb_string(pdb_str: str, chain_id: Optional[str] = None) -> Protein:
    """Takes a PDB string and constructs a Protein object.

    WARNING: All non-standard residue types will be converted into UNK. All
    non-standard atoms will be ignored.

    Args:
      pdb_str: The contents of the pdb file
      chain_id: If None, then the pdb file must contain a single chain (which
        will be parsed). If chain_id is specified (e.g. A), then only that chain
        is parsed.

    Returns:
      A new `Protein` parsed from the pdb contents.

    Raises:
      ValueError: If the PDB contains more than one model or any insertion
        codes.
    """
    # PDBParser.get_structure() treats a bare string as a filesystem *path*,
    # not file contents; wrap the PDB text in a file-like handle so it is
    # parsed directly (the original passed pdb_str through unchanged).
    pdb_fh = io.StringIO(pdb_str)
    parser = PDB.PDBParser(QUIET=True)
    structure = parser.get_structure("none", pdb_fh)
    models = list(structure.get_models())
    if len(models) != 1:
        raise ValueError(
            f"Only single model PDBs are supported. Found {len(models)} models."
        )
    model = models[0]

    atom_positions = []
    aatype = []
    atom_mask = []
    residue_index = []
    chain_ids = []
    b_factors = []

    for chain in model:
        if(chain_id is not None and chain.id != chain_id):
            continue
        for res in chain:
            if res.id[2] != " ":
                raise ValueError(
                    f"PDB contains an insertion code at chain {chain.id} and residue "
                    f"index {res.id[1]}. These are not supported."
                )
            res_shortname = residue_constants.restype_3to1.get(res.resname, "X")
            restype_idx = residue_constants.restype_order.get(
                res_shortname, residue_constants.restype_num
            )
            pos = np.zeros((residue_constants.atom_type_num, 3))
            mask = np.zeros((residue_constants.atom_type_num,))
            res_b_factors = np.zeros((residue_constants.atom_type_num,))
            for atom in res:
                # Atoms outside the canonical atom set (e.g. hydrogens) are
                # silently dropped.
                if atom.name not in residue_constants.atom_types:
                    continue
                pos[residue_constants.atom_order[atom.name]] = atom.coord
                mask[residue_constants.atom_order[atom.name]] = 1.0
                res_b_factors[
                    residue_constants.atom_order[atom.name]
                ] = atom.bfactor
            if np.sum(mask) < 0.5:
                # If no known atom positions are reported for the residue then skip it.
                continue
            aatype.append(restype_idx)
            atom_positions.append(pos)
            atom_mask.append(mask)
            residue_index.append(res.id[1])
            chain_ids.append(chain.id)
            b_factors.append(res_b_factors)

    # Optionally recover template ("PARENT") records written by the
    # prediction pipeline. Uses a dedicated counter instead of re-using the
    # chain_id parameter (the original shadowed it).
    parents = None
    parents_chain_index = None
    if("PARENT" in pdb_str):
        parents = []
        parents_chain_index = []
        parent_chain_idx = 0
        for l in pdb_str.split("\n"):
            if("PARENT" in l):
                if(not "N/A" in l):
                    parent_names = l.split()[1:]
                    parents.extend(parent_names)
                    parents_chain_index.extend([
                        parent_chain_idx for _ in parent_names
                    ])
                parent_chain_idx += 1

    # Map chain letters to their alphabet position (A -> 0, B -> 1, ...);
    # a chain ID outside A-Z raises KeyError here. (The original also built
    # an unused `unique_chain_ids` array, removed.)
    chain_id_mapping = {cid: n for n, cid in enumerate(string.ascii_uppercase)}
    chain_index = np.array([chain_id_mapping[cid] for cid in chain_ids])

    return Protein(
        atom_positions=np.array(atom_positions),
        atom_mask=np.array(atom_mask),
        aatype=np.array(aatype),
        residue_index=np.array(residue_index),
        chain_index=chain_index,
        b_factors=np.array(b_factors),
        parents=parents,
        parents_chain_index=parents_chain_index,
    )
def from_proteinnet_string(proteinnet_str: str) -> Protein:
    """Parses a ProteinNet-format record into a `Protein`.

    Only the [PRIMARY] (sequence), [TERTIARY] (N/CA/C coordinates, in
    picometers) and [MASK] sections are consumed; other sections are ignored.
    """
    tag_re = r'(\[[A-Z]+\]\n)'
    tags = [
        tag.strip() for tag in re.split(tag_re, proteinnet_str) if len(tag) > 0
    ]
    groups = zip(tags[0::2], [l.split('\n') for l in tags[1::2]])

    atoms = ['N', 'CA', 'C']
    aatype = None
    atom_positions = None
    atom_mask = None
    for g in groups:
        if("[PRIMARY]" == g[0]):
            seq = g[1][0].strip()
            # str does not support item assignment, so the original in-place
            # replacement (seq[i] = 'X') raised TypeError at runtime. Rebuild
            # the sequence, substituting unknown residue symbols with 'X'.
            seq = [
                c if c in residue_constants.restypes else 'X' for c in seq
            ]
            aatype = np.array([
                residue_constants.restype_order.get(
                    res_symbol, residue_constants.restype_num
                ) for res_symbol in seq
            ])
        elif("[TERTIARY]" == g[0]):
            # Three whitespace-separated rows: x, y and z coordinates, each
            # holding 3 values (N/CA/C) per residue.
            tertiary = []
            for axis in range(3):
                tertiary.append(list(map(float, g[1][axis].split())))
            tertiary_np = np.array(tertiary)
            atom_positions = np.zeros(
                (len(tertiary[0])//3, residue_constants.atom_type_num, 3)
            ).astype(np.float32)
            for i, atom in enumerate(atoms):
                atom_positions[:, residue_constants.atom_order[atom], :] = (
                    np.transpose(tertiary_np[:, i::3])
                )
            # ProteinNet stores coordinates in picometers.
            atom_positions *= PICO_TO_ANGSTROM
        elif("[MASK]" == g[0]):
            mask = np.array(list(map({'-': 0, '+': 1}.get, g[1][0].strip())))
            atom_mask = np.zeros(
                (len(mask), residue_constants.atom_type_num,)
            ).astype(np.float32)
            for i, atom in enumerate(atoms):
                atom_mask[:, residue_constants.atom_order[atom]] = 1
            atom_mask *= mask[..., None]

    # NOTE(review): assumes a [PRIMARY] section is present — aatype stays
    # None otherwise and len(aatype) below would raise. Confirm with callers.
    return Protein(
        atom_positions=atom_positions,
        atom_mask=atom_mask,
        aatype=aatype,
        residue_index=np.arange(len(aatype)),
        b_factors=None,
    )
def add_pdb_headers(prot: Protein, pdb_str: str) -> str:
    """ Add pdb headers to an existing PDB string. Useful during multi-chain
        recycling

        Args:
          prot: Protein carrying the REMARK / PARENT metadata.
          pdb_str: PDB text to annotate. Its existing REMARK/PARENT lines
            are dropped and regenerated from `prot`.

        Returns:
          The PDB string with one PARENT line per chain (and an optional
          leading REMARK line).
    """
    out_pdb_lines = []
    lines = pdb_str.split('\n')

    remark = prot.remark
    if(remark is not None):
        out_pdb_lines.append(f"REMARK {remark}")

    # Group parents by the chain they belong to; chains with no parents get
    # the placeholder ["N/A"]. (Removed the original's unused `cur_chain`.)
    parents_per_chain = None
    if(prot.parents is not None and len(prot.parents) > 0):
        parents_per_chain = []
        if(prot.parents_chain_index is not None):
            parent_dict = {}
            for p, i in zip(prot.parents, prot.parents_chain_index):
                parent_dict.setdefault(str(i), [])
                parent_dict[str(i)].append(p)

            max_idx = max([int(chain_idx) for chain_idx in parent_dict])
            for i in range(max_idx + 1):
                chain_parents = parent_dict.get(str(i), ["N/A"])
                parents_per_chain.append(chain_parents)
        else:
            parents_per_chain.append(prot.parents)
    else:
        parents_per_chain = [["N/A"]]

    make_parent_line = lambda p: f"PARENT {' '.join(p)}"

    out_pdb_lines.append(make_parent_line(parents_per_chain[0]))

    chain_counter = 0
    for i, l in enumerate(lines):
        if("PARENT" not in l and "REMARK" not in l):
            out_pdb_lines.append(l)
        # A TER record starts a new chain; emit that chain's PARENT line.
        # Guard the lookahead so a TER on the final line cannot raise
        # IndexError (the original indexed lines[i + 1] unconditionally).
        if("TER" in l and i + 1 < len(lines) and not "END" in lines[i + 1]):
            chain_counter += 1
            if(not chain_counter >= len(parents_per_chain)):
                chain_parents = parents_per_chain[chain_counter]
            else:
                chain_parents = ["N/A"]

            out_pdb_lines.append(make_parent_line(chain_parents))

    return '\n'.join(out_pdb_lines)
def to_pdb(prot: Protein) -> str:
    """Converts a `Protein` instance to a PDB string.

    Args:
      prot: The protein to convert to PDB.

    Returns:
      PDB string.

    Raises:
      ValueError: If any aatype index exceeds the known residue types.
    """
    restypes = residue_constants.restypes + ["X"]
    res_1to3 = lambda r: residue_constants.restype_1to3.get(restypes[r], "UNK")
    atom_types = residue_constants.atom_types

    pdb_lines = []

    atom_mask = prot.atom_mask
    aatype = prot.aatype
    atom_positions = prot.atom_positions
    residue_index = prot.residue_index.astype(np.int32)
    b_factors = prot.b_factors
    chain_index = prot.chain_index

    if np.any(aatype > residue_constants.restype_num):
        raise ValueError("Invalid aatypes.")

    headers = get_pdb_headers(prot)
    if(len(headers) > 0):
        pdb_lines.extend(headers)

    n = aatype.shape[0]
    atom_index = 1
    prev_chain_index = 0
    chain_tags = string.ascii_uppercase
    # Add all atom sites.
    for i in range(n):
        res_name_3 = res_1to3(aatype[i])
        # Resolve the chain tag once per residue. The original recomputed it
        # for every atom — including inside the masked-atom skip branch,
        # solely so the TER record below always saw chain_tag bound.
        chain_tag = "A"
        if(chain_index is not None):
            chain_tag = chain_tags[chain_index[i]]

        for atom_name, pos, mask, b_factor in zip(
            atom_types, atom_positions[i], atom_mask[i], b_factors[i]
        ):
            # Skip atoms absent from this residue.
            if mask < 0.5:
                continue

            record_type = "ATOM"
            name = atom_name if len(atom_name) == 4 else f" {atom_name}"
            alt_loc = ""
            insertion_code = ""
            occupancy = 1.00
            element = atom_name[
                0
            ]  # Protein supports only C, N, O, S, this works.
            charge = ""

            # PDB is a columnar format, every space matters here!
            atom_line = (
                f"{record_type:<6}{atom_index:>5} {name:<4}{alt_loc:>1}"
                f"{res_name_3:>3} {chain_tag:>1}"
                f"{residue_index[i]:>4}{insertion_code:>1}   "
                f"{pos[0]:>8.3f}{pos[1]:>8.3f}{pos[2]:>8.3f}"
                f"{occupancy:>6.2f}{b_factor:>6.2f}          "
                f"{element:>2}{charge:>2}"
            )
            pdb_lines.append(atom_line)
            atom_index += 1

        should_terminate = (i == n - 1)
        if(chain_index is not None):
            if(i != n - 1 and chain_index[i + 1] != prev_chain_index):
                should_terminate = True
                prev_chain_index = chain_index[i + 1]

        if(should_terminate):
            # Close the chain.
            chain_end = "TER"
            chain_termination_line = (
                f"{chain_end:<6}{atom_index:>5}      "
                f"{res_1to3(aatype[i]):>3} "
                f"{chain_tag:>1}{residue_index[i]:>4}"
            )
            pdb_lines.append(chain_termination_line)
            atom_index += 1

            if(i != n - 1):
                # "prev" is a misnomer here. This happens at the beginning of
                # each new chain.
                pdb_lines.extend(get_pdb_headers(prot, prev_chain_index))

    pdb_lines.append("END")
    pdb_lines.append("")
    return "\n".join(pdb_lines)
def from_prediction(
    features: FeatureDict,
    result: ModelOutput,
    b_factors: Optional[np.ndarray] = None,
    chain_index: Optional[np.ndarray] = None,
    remark: Optional[str] = None,
    parents: Optional[Sequence[str]] = None,
    parents_chain_index: Optional[Sequence[int]] = None
) -> Protein:
    """Assembles a protein from a prediction.

    Args:
      features: Dictionary holding model inputs.
      result: Dictionary holding model outputs.
      b_factors: (Optional) B-factors to use for the protein; defaults to
        zeros shaped like the predicted atom mask.
      chain_index: (Optional) Chain indices for multi-chain predictions
      remark: (Optional) Remark about the prediction
      parents: (Optional) List of template names
      parents_chain_index: (Optional) Chain index of each entry in `parents`
    Returns:
      A protein instance.
    """
    final_atom_mask = result["final_atom_mask"]
    resolved_b_factors = (
        np.zeros_like(final_atom_mask) if b_factors is None else b_factors
    )

    # PDB residue numbering is 1-based, model features are 0-based.
    return Protein(
        aatype=features["aatype"],
        atom_positions=result["final_atom_positions"],
        atom_mask=final_atom_mask,
        residue_index=features["residue_index"] + 1,
        b_factors=resolved_b_factors,
        chain_index=chain_index,
        remark=remark,
        parents=parents,
        parents_chain_index=parents_chain_index,
    )
import os
import glob
import importlib as importlib

# Auto-import every sibling .py module so both `from <pkg> import <mod>`
# and attribute access on the package work without a hand-maintained list.
_files = glob.glob(os.path.join(os.path.dirname(__file__), "*.py"))
__all__ = [
    os.path.basename(f)[:-3]
    for f in _files
    if os.path.isfile(f) and not f.endswith("__init__.py")
]
for _m in __all__:
    globals()[_m] = importlib.import_module("." + _m, __name__)

# Avoid needlessly cluttering the global namespace. The original ran
# `del _m` unconditionally, which raises NameError if the package holds
# no submodules (the loop never binds _m); guard that edge case.
del _files
if __all__:
    del _m
def fix_pdb(pdbfile, alterations_info):
    """Apply pdbfixer to the contents of a PDB file; return a PDB string result.

    1) Replaces nonstandard residues.
    2) Removes heterogens (non protein residues) including water.
    3) Adds missing residues and missing atoms within existing residues.
    4) Adds hydrogens assuming pH=7.0.
    5) KeepIds is currently true, so the fixer must keep the existing chain and
       residue identifiers. This will fail for some files in wider PDB that have
       invalid IDs.

    Args:
      pdbfile: Input PDB file handle.
      alterations_info: A dict that will store details of changes made.

    Returns:
      A PDB string representing the fixed structure.
    """
    fixer = pdbfixer.PDBFixer(pdbfile=pdbfile)
    # Record what will be replaced before mutating the structure.
    fixer.findNonstandardResidues()
    alterations_info["nonstandard_residues"] = fixer.nonstandardResidues
    fixer.replaceNonstandardResidues()
    _remove_heterogens(fixer, alterations_info, keep_water=False)
    # find* calls must precede addMissingAtoms; their results are also
    # stashed for debugging.
    fixer.findMissingResidues()
    alterations_info["missing_residues"] = fixer.missingResidues
    fixer.findMissingAtoms()
    alterations_info["missing_heavy_atoms"] = fixer.missingAtoms
    alterations_info["missing_terminals"] = fixer.missingTerminals
    # Fixed seed keeps added-atom placement deterministic.
    fixer.addMissingAtoms(seed=0)
    fixer.addMissingHydrogens()
    out_handle = io.StringIO()
    app.PDBFile.writeFile(
        fixer.topology, fixer.positions, out_handle, keepIds=True
    )
    return out_handle.getvalue()
def _replace_met_se(pdb_structure, alterations_info):
    """Replace the Se in any MET residues that were not marked as modified.

    Args:
      pdb_structure: An OpenMM structure whose MET residues are fixed in
        place.
      alterations_info: A dict that will store details of changes made.
    """
    modified_met_residues = []
    for res in pdb_structure.iter_residues():
        # Only methionine residues are candidates for the Se -> S swap.
        if res.get_name_with_spaces().strip() != "MET":
            continue
        s_atom = res.get_atom("SD")
        if s_atom.element_symbol != "Se":
            continue
        s_atom.element_symbol = "S"
        s_atom.element = element.get_by_symbol("S")
        modified_met_residues.append(s_atom.residue_number)
    alterations_info["Se_in_MET"] = modified_met_residues
def _remove_chains_of_length_one(pdb_structure, alterations_info):
    """Removes chains that correspond to a single amino acid.

    A single amino acid in a chain is both N and C terminus. There is no force
    template for this case.

    Args:
      pdb_structure: An OpenMM pdb_structure to modify and fix.
      alterations_info: A dict that will store details of changes made.
    """
    removed_chains = {}
    for model in pdb_structure.iter_models():
        # Partition chains in a single pass instead of filtering twice.
        kept_chains = []
        dropped_ids = []
        for chain in model.iter_chains():
            if len(chain) > 1:
                kept_chains.append(chain)
            else:
                dropped_ids.append(chain.chain_id)
        model.chains = kept_chains
        for cid in dropped_ids:
            model.chains_by_id.pop(cid)
        removed_chains[model.number] = dropped_ids
    alterations_info["removed_chains"] = removed_chains
    def process(
        self, *, prot: protein.Protein
    ) -> Tuple[str, Dict[str, Any], np.ndarray]:
        """Runs Amber relax on a prediction, adds hydrogens, returns PDB string.

        Args:
            prot: Protein to minimize.

        Returns:
            A tuple of (minimized PDB string with headers, debug-info dict
            with energies/attempts/rmsd, per-residue violation mask).
        """
        # Restrained minimization using the settings captured in __init__.
        out = amber_minimize.run_pipeline(
            prot=prot,
            max_iterations=self._max_iterations,
            tolerance=self._tolerance,
            stiffness=self._stiffness,
            exclude_residues=self._exclude_residues,
            max_outer_iterations=self._max_outer_iterations,
            use_gpu=self._use_gpu,
        )
        min_pos = out["pos"]
        start_pos = out["posinit"]
        # Displacement between initial and minimized coordinates, normalized
        # by the leading axis of the position array.
        rmsd = np.sqrt(np.sum((start_pos - min_pos) ** 2) / start_pos.shape[0])
        debug_data = {
            "initial_energy": out["einit"],
            "final_energy": out["efinal"],
            "attempts": out["min_attempts"],
            "rmsd": rmsd,
        }
        # Re-clean the input protein, then overwrite its coordinates and
        # B-factors with the minimized values.
        pdb_str = amber_minimize.clean_protein(prot)
        min_pdb = utils.overwrite_pdb_coordinates(pdb_str, min_pos)
        min_pdb = utils.overwrite_b_factors(min_pdb, prot.b_factors)
        # Sanity check: minimization must not add/remove non-terminal atoms.
        utils.assert_equal_nonterminal_atom_types(
            protein.from_pdb_string(min_pdb).atom_mask, prot.atom_mask
        )
        violations = out["structural_violations"][
            "total_per_residue_violations_mask"
        ]

        min_pdb = protein.add_pdb_headers(prot, min_pdb)

        return min_pdb, debug_data, violations
def overwrite_b_factors(pdb_str: str, bfactors: np.ndarray) -> str:
    """Overwrites the B-factors in pdb_str with contents of bfactors array.

    Args:
      pdb_str: An input PDB string.
      bfactors: A numpy array of per-residue B-factors. We assume that the
        nonzero entries of a row are identical. NOTE(review): the indexing
        below treats the first axis as residues and the last as atom types
        (i.e. [num_res, 37]); the original docstring's [1, n_residues, 37]
        shape would not match it — confirm against callers.

    Returns:
      A new PDB string with the B-factors replaced.

    Raises:
      ValueError: If the last dimension of `bfactors` is not the atom-type
        count, or if the PDB contains more residues than `bfactors` rows.
    """
    if bfactors.shape[-1] != residue_constants.atom_type_num:
        raise ValueError(
            f"Invalid final dimension size for bfactors: {bfactors.shape[-1]}."
        )

    parser = PDB.PDBParser(QUIET=True)
    handle = io.StringIO(pdb_str)
    structure = parser.get_structure("", handle)

    curr_resid = ("", "", "")
    idx = -1
    for atom in structure.get_atoms():
        atom_resid = atom.parent.get_id()
        if atom_resid != curr_resid:
            idx += 1
            if idx >= bfactors.shape[0]:
                # f-prefix added: the original was a plain string, so the
                # {shape}/{idx} placeholders were emitted literally (and
                # `shape` was not even a defined name).
                raise ValueError(
                    "Index into bfactors exceeds number of residues. "
                    f"B-factors shape: {bfactors.shape}, idx: {idx}."
                )
            curr_resid = atom_resid
        # Every atom of a residue receives that residue's CA-slot value.
        atom.bfactor = bfactors[idx, residue_constants.atom_order["CA"]]

    new_pdb = io.StringIO()
    pdb_io = PDB.PDBIO()
    pdb_io.set_structure(structure)
    pdb_io.save(new_pdb)
    return new_pdb.getvalue()
def assert_equal_nonterminal_atom_types(
    atom_mask: np.ndarray, ref_atom_mask: np.ndarray
):
    """Checks that pre- and post-minimized proteins have same atom set.

    Args:
      atom_mask: Atom-presence mask of the minimized protein; last axis
        indexes atom types.
      ref_atom_mask: Reference mask of the same shape.

    Raises:
      AssertionError: If the masks differ anywhere outside the OXT slot.
    """
    # Ignore any terminal OXT atoms which may have been added by minimization.
    oxt = residue_constants.atom_order["OXT"]
    # np.bool was removed in NumPy 1.24; the builtin `bool` is the
    # documented replacement and behaves identically here.
    no_oxt_mask = np.ones(shape=atom_mask.shape, dtype=bool)
    no_oxt_mask[..., oxt] = False
    np.testing.assert_almost_equal(
        ref_atom_mask[no_oxt_mask], atom_mask[no_oxt_mask]
    )
def pseudo_beta_fn(aatype, all_atom_positions, all_atom_masks):
    """Select pseudo-beta coordinates: CB for every residue, CA for glycine
    (which has no CB). When `all_atom_masks` is given, the matching
    per-residue mask is returned as well.
    """
    gly_mask = aatype == rc.restype_order["G"]
    ca_idx = rc.atom_order["CA"]
    cb_idx = rc.atom_order["CB"]

    # Broadcast the glycine mask over the trailing xyz dimension.
    gly_mask_xyz = gly_mask[..., None].expand(
        *((-1,) * len(gly_mask.shape)), 3
    )
    pseudo_beta = torch.where(
        gly_mask_xyz,
        all_atom_positions[..., ca_idx, :],
        all_atom_positions[..., cb_idx, :],
    )

    if all_atom_masks is None:
        return pseudo_beta

    pseudo_beta_mask = torch.where(
        gly_mask,
        all_atom_masks[..., ca_idx],
        all_atom_masks[..., cb_idx],
    )
    return pseudo_beta, pseudo_beta_mask
def build_template_pair_feat(
    batch,
    min_bin, max_bin, no_bins,
    use_unit_vector=False,
    eps=1e-20, inf=1e8
):
    """Builds pairwise template features: a squared-distance distogram,
    per-pair masks, row/column residue-type one-hots and (optionally)
    inter-residue unit vectors, concatenated along the channel axis.

    Args:
        batch: template feature dict (pseudo-beta positions/mask, aatype,
            all-atom positions/mask).
        min_bin, max_bin, no_bins: distogram bin edges (in distance units;
            squared internally).
        use_unit_vector: if False the unit-vector channels are zeroed out.
        eps: numerical fuzz for rsqrt/frame construction.
        inf: upper bound of the last distogram bin.
    """
    template_mask = batch["template_pseudo_beta_mask"]
    template_mask_2d = template_mask[..., None] * template_mask[..., None, :]

    # Compute distogram (this seems to differ slightly from Alg. 5)
    tpb = batch["template_pseudo_beta"]
    dgram = torch.sum(
        (tpb[..., None, :] - tpb[..., None, :, :]) ** 2, dim=-1, keepdim=True
    )
    # Bin edges are squared so dgram can stay in squared-distance space.
    lower = torch.linspace(min_bin, max_bin, no_bins, device=tpb.device) ** 2
    upper = torch.cat([lower[1:], lower.new_tensor([inf])], dim=-1)
    dgram = ((dgram > lower) * (dgram < upper)).type(dgram.dtype)

    to_concat = [dgram, template_mask_2d[..., None]]

    aatype_one_hot = nn.functional.one_hot(
        batch["template_aatype"],
        rc.restype_num + 2,
    )

    # Tile the residue-type one-hot along rows and along columns.
    n_res = batch["template_aatype"].shape[-1]
    to_concat.append(
        aatype_one_hot[..., None, :, :].expand(
            *aatype_one_hot.shape[:-2], n_res, -1, -1
        )
    )
    to_concat.append(
        aatype_one_hot[..., None, :].expand(
            *aatype_one_hot.shape[:-2], -1, n_res, -1
        )
    )

    # Backbone frames from N/CA/C atom positions.
    n, ca, c = [rc.atom_order[a] for a in ["N", "CA", "C"]]
    rigids = Rigid.make_transform_from_reference(
        n_xyz=batch["template_all_atom_positions"][..., n, :],
        ca_xyz=batch["template_all_atom_positions"][..., ca, :],
        c_xyz=batch["template_all_atom_positions"][..., c, :],
        eps=eps,
    )
    # Vector from each residue's frame to every other residue's CA, expressed
    # in the local frame.
    points = rigids.get_trans()[..., None, :, :]
    rigid_vec = rigids[..., None].invert_apply(points)

    inv_distance_scalar = torch.rsqrt(eps + torch.sum(rigid_vec ** 2, dim=-1))

    # Backbone-completeness mask (all three frame atoms present); note this
    # intentionally rebinds template_mask/template_mask_2d from here on.
    t_aa_masks = batch["template_all_atom_mask"]
    template_mask = (
        t_aa_masks[..., n] * t_aa_masks[..., ca] * t_aa_masks[..., c]
    )
    template_mask_2d = template_mask[..., None] * template_mask[..., None, :]

    inv_distance_scalar = inv_distance_scalar * template_mask_2d
    unit_vector = rigid_vec * inv_distance_scalar[..., None]

    if(not use_unit_vector):
        unit_vector = unit_vector * 0.

    to_concat.extend(torch.unbind(unit_vector[..., None, :], dim=-1))
    to_concat.append(template_mask_2d[..., None])

    act = torch.cat(to_concat, dim=-1)
    act = act * template_mask_2d[..., None]

    return act
template_mask_2d = template_mask[..., None] * template_mask[..., None, :] 146 | 147 | inv_distance_scalar = inv_distance_scalar * template_mask_2d 148 | unit_vector = rigid_vec * inv_distance_scalar[..., None] 149 | 150 | if(not use_unit_vector): 151 | unit_vector = unit_vector * 0. 152 | 153 | to_concat.extend(torch.unbind(unit_vector[..., None, :], dim=-1)) 154 | to_concat.append(template_mask_2d[..., None]) 155 | 156 | act = torch.cat(to_concat, dim=-1) 157 | act = act * template_mask_2d[..., None] 158 | 159 | return act 160 | 161 | 162 | def build_extra_msa_feat(batch): 163 | msa_1hot = nn.functional.one_hot(batch["extra_msa"], 23) 164 | msa_feat = [ 165 | msa_1hot, 166 | batch["extra_has_deletion"].unsqueeze(-1), 167 | batch["extra_deletion_value"].unsqueeze(-1), 168 | ] 169 | return torch.cat(msa_feat, dim=-1) 170 | 171 | 172 | def torsion_angles_to_frames( 173 | r: Rigid, 174 | alpha: torch.Tensor, 175 | aatype: torch.Tensor, 176 | rrgdf: torch.Tensor, 177 | ): 178 | # [*, N, 8, 4, 4] 179 | default_4x4 = rrgdf[aatype, ...] 180 | 181 | # [*, N, 8] transformations, i.e. 182 | # One [*, N, 8, 3, 3] rotation matrix and 183 | # One [*, N, 8, 3] translation matrix 184 | default_r = r.from_tensor_4x4(default_4x4) 185 | 186 | bb_rot = alpha.new_zeros((*((1,) * len(alpha.shape[:-1])), 2)) 187 | bb_rot[..., 1] = 1 188 | 189 | # [*, N, 8, 2] 190 | alpha = torch.cat( 191 | [bb_rot.expand(*alpha.shape[:-2], -1, -1), alpha], dim=-2 192 | ) 193 | 194 | # [*, N, 8, 3, 3] 195 | # Produces rotation matrices of the form: 196 | # [ 197 | # [1, 0 , 0 ], 198 | # [0, a_2,-a_1], 199 | # [0, a_1, a_2] 200 | # ] 201 | # This follows the original code rather than the supplement, which uses 202 | # different indices. 
203 | 204 | all_rots = alpha.new_zeros(default_r.get_rots().get_rot_mats().shape) 205 | all_rots[..., 0, 0] = 1 206 | all_rots[..., 1, 1] = alpha[..., 1] 207 | all_rots[..., 1, 2] = -alpha[..., 0] 208 | all_rots[..., 2, 1:] = alpha 209 | 210 | all_rots = Rigid(Rotation(rot_mats=all_rots), None) 211 | 212 | all_frames = default_r.compose(all_rots) 213 | 214 | chi2_frame_to_frame = all_frames[..., 5] 215 | chi3_frame_to_frame = all_frames[..., 6] 216 | chi4_frame_to_frame = all_frames[..., 7] 217 | 218 | chi1_frame_to_bb = all_frames[..., 4] 219 | chi2_frame_to_bb = chi1_frame_to_bb.compose(chi2_frame_to_frame) 220 | chi3_frame_to_bb = chi2_frame_to_bb.compose(chi3_frame_to_frame) 221 | chi4_frame_to_bb = chi3_frame_to_bb.compose(chi4_frame_to_frame) 222 | 223 | all_frames_to_bb = Rigid.cat( 224 | [ 225 | all_frames[..., :5], 226 | chi2_frame_to_bb.unsqueeze(-1), 227 | chi3_frame_to_bb.unsqueeze(-1), 228 | chi4_frame_to_bb.unsqueeze(-1), 229 | ], 230 | dim=-1, 231 | ) 232 | 233 | all_frames_to_global = r[..., None].compose(all_frames_to_bb) 234 | 235 | return all_frames_to_global 236 | 237 | 238 | def frames_and_literature_positions_to_atom14_pos( 239 | r: Rigid, 240 | aatype: torch.Tensor, 241 | default_frames, 242 | group_idx, 243 | atom_mask, 244 | lit_positions, 245 | ): 246 | # [*, N, 14, 4, 4] 247 | default_4x4 = default_frames[aatype, ...] 248 | 249 | # [*, N, 14] 250 | group_mask = group_idx[aatype, ...] 251 | 252 | # [*, N, 14, 8] 253 | group_mask = nn.functional.one_hot( 254 | group_mask, 255 | num_classes=default_frames.shape[-3], 256 | ) 257 | 258 | # [*, N, 14, 8] 259 | t_atoms_to_global = r[..., None, :] * group_mask 260 | 261 | # [*, N, 14] 262 | t_atoms_to_global = t_atoms_to_global.map_tensor_fn( 263 | lambda x: torch.sum(x, dim=-1) 264 | ) 265 | 266 | # [*, N, 14, 1] 267 | atom_mask = atom_mask[aatype, ...].unsqueeze(-1) 268 | 269 | # [*, N, 14, 3] 270 | lit_positions = lit_positions[aatype, ...] 
    pred_positions = t_atoms_to_global.apply(lit_positions)
    pred_positions = pred_positions * atom_mask

    return pred_positions
--------------------------------------------------------------------------------
/openfold/utils/tensor_utils.py:
--------------------------------------------------------------------------------
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from functools import partial
import logging
from typing import Tuple, List, Callable, Any, Dict, Sequence, Optional

import torch
import torch.nn as nn


def add(m1, m2, inplace):
    """Return m1 + m2, adding in place into m1 when ``inplace`` is True."""
    # The first operation in a checkpoint can't be in-place, but it's
    # nice to have in-place addition during inference. Thus...
    if(not inplace):
        m1 = m1 + m2
    else:
        m1 += m2

    return m1


def permute_final_dims(tensor: torch.Tensor, inds: List[int]):
    """Permute the last ``len(inds)`` dims of ``tensor`` according to
    ``inds`` (indices given relative to those final dims); leading batch
    dims are untouched."""
    zero_index = -1 * len(inds)
    first_inds = list(range(len(tensor.shape[:zero_index])))
    return tensor.permute(first_inds + [zero_index + i for i in inds])


def flatten_final_dims(t: torch.Tensor, no_dims: int):
    """Flatten the last ``no_dims`` dims of ``t`` into a single dim."""
    return t.reshape(t.shape[:-no_dims] + (-1,))


def masked_mean(mask, value, dim, eps=1e-4):
    """Mean of ``value`` over ``dim`` weighted by ``mask``; ``eps`` guards
    division when the mask sums to zero."""
    mask = mask.expand(*value.shape)
    return torch.sum(mask * value, dim=dim) / (eps + torch.sum(mask, dim=dim))


def pts_to_distogram(pts, min_bin=2.3125, max_bin=21.6875, no_bins=64):
    """Bucketize pairwise Euclidean distances of ``pts`` into ``no_bins``
    bins (``no_bins - 1`` boundaries linspaced on [min_bin, max_bin])."""
    boundaries = torch.linspace(
        min_bin, max_bin, no_bins - 1, device=pts.device
    )
    dists = torch.sqrt(
        torch.sum((pts.unsqueeze(-2) - pts.unsqueeze(-3)) ** 2, dim=-1)
    )
    return torch.bucketize(dists, boundaries)


def dict_multimap(fn, dicts):
    """Apply ``fn`` to the list of corresponding leaf values across
    ``dicts``, recursing into nested dicts; structure is taken from
    ``dicts[0]``."""
    first = dicts[0]
    new_dict = {}
    for k, v in first.items():
        all_v = [d[k] for d in dicts]
        if type(v) is dict:
            new_dict[k] = dict_multimap(fn, all_v)
        else:
            new_dict[k] = fn(all_v)

    return new_dict


def one_hot(x, v_bins):
    """One-hot of the nearest bin in ``v_bins`` for each element of ``x``
    (nearest by absolute difference); returns float."""
    reshaped_bins = v_bins.view(((1,) * len(x.shape)) + (len(v_bins),))
    diffs = x[..., None] - reshaped_bins
    am = torch.argmin(torch.abs(diffs), dim=-1)
    return nn.functional.one_hot(am, num_classes=len(v_bins)).float()


def batched_gather(data, inds, dim=0, no_batch_dims=0):
    """Gather along ``dim`` of ``data`` with per-batch indices ``inds``,
    treating the first ``no_batch_dims`` dims as batch dims."""
    ranges = []
    for i, s in enumerate(data.shape[:no_batch_dims]):
        r = torch.arange(s)
        # Shape the arange so it broadcasts against inds over batch dim i.
        r = r.view(*(*((1,) * i), -1, *((1,) * (len(inds.shape) - i - 1))))
        ranges.append(r)

    remaining_dims = [
        slice(None) for _ in range(len(data.shape) - no_batch_dims)
    ]
    remaining_dims[dim - no_batch_dims if dim >= 0 else dim] = inds
    ranges.extend(remaining_dims)

    # NOTE(review): indexing with a *list* of index objects relies on
    # legacy advanced-indexing behavior; newer PyTorch/NumPy expect a
    # tuple here — confirm against the torch version in use.
    return data[ranges]


# With tree_map, a poor man's JAX tree_map
def dict_map(fn, dic, leaf_type):
    """Recursively apply ``fn`` to every ``leaf_type`` leaf of dict ``dic``."""
    new_dict = {}
    for k, v in dic.items():
        if type(v) is dict:
            new_dict[k] = dict_map(fn, v, leaf_type)
        else:
            new_dict[k] = tree_map(fn, v, leaf_type)

    return new_dict


def tree_map(fn, tree, leaf_type):
    """Apply ``fn`` to every ``leaf_type`` leaf of a nested dict/list/tuple
    ``tree``, preserving structure; raises ValueError on other node types."""
    if isinstance(tree, dict):
        return dict_map(fn, tree, leaf_type)
    elif isinstance(tree, list):
        return [tree_map(fn, x, leaf_type) for x in tree]
    elif isinstance(tree, tuple):
        return tuple([tree_map(fn, x, leaf_type) for x in tree])
    elif isinstance(tree, leaf_type):
        return fn(tree)
    else:
        print(type(tree))
        raise ValueError("Not supported")


tensor_tree_map = partial(tree_map, leaf_type=torch.Tensor)
--------------------------------------------------------------------------------
/outputs/autoregressive_score_w_seq/1BC8_1.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dauparas/LigandMPNN/26ec57ac976ade5379920dbd43c7f97a91cf82de/outputs/autoregressive_score_w_seq/1BC8_1.pt
--------------------------------------------------------------------------------
/outputs/autoregressive_score_wo_seq/1BC8_1.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dauparas/LigandMPNN/26ec57ac976ade5379920dbd43c7f97a91cf82de/outputs/autoregressive_score_wo_seq/1BC8_1.pt
--------------------------------------------------------------------------------
/outputs/batch_size/seqs/1BC8.fa:
--------------------------------------------------------------------------------
>1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=3, number_of_batches=5, model_path=./model_params/proteinmpnn_v_48_020.pt
MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.4090, ligand_confidence=0.4090, seq_rec=0.5376 4 | GTSNISLYEFLLKLLSKPEYKDIIEWTSDNGEFKLKKPEAVAKLWGEEKGEPDMNYKKMEKELKKYEKKKIIEKVKGKKNHYKFVNYPEILNK 5 | >1BC8, id=2, T=0.1, seed=111, overall_confidence=0.3984, ligand_confidence=0.3984, seq_rec=0.4946 6 | GKSSMSLPEFLLKLLSDPKYKDIIEWTSDNGTFKLKDPEAVAKLWGKEKGRPDMNYEKMYELLKKYEEKGIIKEVKGEKNTYKFVNYPEYLYP 7 | >1BC8, id=3, T=0.1, seed=111, overall_confidence=0.4034, ligand_confidence=0.4034, seq_rec=0.5054 8 | GTSNISLYEFLLELLSDPKYKDIIEWISDNGEFKLKDPEAVAKLWGKVKGKPDMNYEEFEKLLKEYEKKKIIEKVEGKPYTYKFVNYPEILNK 9 | >1BC8, id=4, T=0.1, seed=111, overall_confidence=0.3842, ligand_confidence=0.3842, seq_rec=0.4839 10 | GVSSMSLWEFLLELLSKPEYDDYIRWVSDNGEFELKDPEKVAKLWGEKKGEPDMNYEKLNKLLEKYEKKKIIEKVEGEPNVYRFVNYPEYLYP 11 | >1BC8, id=5, T=0.1, seed=111, overall_confidence=0.3910, ligand_confidence=0.3910, seq_rec=0.4731 12 | GKSKISLHEFLDKLLSDPKYDDIISWTSDDGEFELKDPEKVAKLWGKVKGKPDMNYEELEKLLDKYEKKGIIEKVKGKPNTYKFVNYPEYKFP 13 | >1BC8, id=6, T=0.1, seed=111, overall_confidence=0.4045, ligand_confidence=0.4045, seq_rec=0.5054 14 | GKSSISLHEFLLKLLSKPEYADIIRWVSDNGEFELVKPEEVAKLWGKVKGKPDMNYEELKKELKKYEKKGIIKEVKGKPNVYQFVNYPEILYP 15 | >1BC8, id=7, T=0.1, seed=111, overall_confidence=0.3844, ligand_confidence=0.3844, seq_rec=0.4946 16 | GTSSMSLWEFILKLLSDPKYKDIISWTSDNGEFELKDPEKLAKLYGKLKGKPNMNKKELFKELDKYKEKKIIEKVEGKKNTYKFVNYPEILNP 17 | >1BC8, id=8, T=0.1, seed=111, overall_confidence=0.3852, ligand_confidence=0.3852, seq_rec=0.4946 18 | GMSSMSLWEFLLKLLSKPEYKDIIEWVSDDGEFRLKKPEEVAKLWGKEKGEPDMNATKLFKELDKYEEKKIIERVEGEPNTYKFVNYPEYLYP 19 | >1BC8, id=9, T=0.1, seed=111, overall_confidence=0.3826, ligand_confidence=0.3826, seq_rec=0.5161 20 | GTSSISLPEFLLELLSKPEYKDIIEWTSDNGTFKLVDPEKVAKLWGKVKGKPNMNAKEMFKELKKYEKKKIIEEVPGEPNTYKFVKYPEILNP 21 | >1BC8, id=10, T=0.1, seed=111, overall_confidence=0.4005, ligand_confidence=0.4005, 
seq_rec=0.4624 22 | HMSHMSLHEFLLELLSKPEYADLIRWTSDDGTFELVKPEEVAKLWGERVGRPDMNAEKMFEELKKLEEKGIIEEVPGKPNTYRFVNYPEILLP 23 | >1BC8, id=11, T=0.1, seed=111, overall_confidence=0.3887, ligand_confidence=0.3887, seq_rec=0.4839 24 | GVSSISLYEFLYELLSDPKYADIIEWVSDNGEFRLKKPEAVAKLWGEKKGIPNMNYKKLYKELKKYEKKKIIEKVKGKKNTYKFVNYPEYLYP 25 | >1BC8, id=12, T=0.1, seed=111, overall_confidence=0.4031, ligand_confidence=0.4031, seq_rec=0.5054 26 | GKSKISLWEFLLKLLSDEKYKDYIEWTSDNGEFELKKPEAVAKLWGKEKGEPDMNYKKLYKELKKYEKKKIIEEVKGKKNTYKFVNYPEYLNP 27 | >1BC8, id=13, T=0.1, seed=111, overall_confidence=0.3932, ligand_confidence=0.3932, seq_rec=0.5161 28 | GTSSMSLPDFLLELLSDPKYKDYIEWVSDNGEFRLKKPEEVAKLWGKVKGKPDMNYKKLDEELKKYEAKGIIKRVEGKPNTYKFVNYPEILNP 29 | >1BC8, id=14, T=0.1, seed=111, overall_confidence=0.4077, ligand_confidence=0.4077, seq_rec=0.5054 30 | GTSSISLHEFLLELLSDPKYKDIIEWTSDNGEFVLKDPEAVAKLWGKVKGEPDMNYEKLYKELKKYEKKKIIKEVEGKENHYKFVNYPEILYP 31 | >1BC8, id=15, T=0.1, seed=111, overall_confidence=0.3999, ligand_confidence=0.3999, seq_rec=0.5161 32 | GTSSISLPEFLLKLLSDKKYEDIITWTSDDGTFKLKKPEEVAKLWGEVKGKPDMNYEKMYKELDKYKEKKIIEKVEGEPNTYKFVNYPEYLNP -------------------------------------------------------------------------------- /outputs/bias_AA_per_residue_multi/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.3843, ligand_confidence=0.3843, seq_rec=0.5161 4 | AWSSISLHEFLLKLLSDPAYKDIIEWTSDDGEFKLKKPEAVAKLWGEEKGEPDMNYKKMEKELKKYEKKKIIEKVKGKPNHYKFVNYPEILFP -------------------------------------------------------------------------------- /outputs/bias_AA_per_residue_multi/seqs/4GYT.fa: 
-------------------------------------------------------------------------------- 1 | >4GYT, T=0.1, seed=111, num_res=354, num_ligand_res=354, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | SLHLPKYDDFVQSISVLALTMSGSELHGIMCGYLCAGADSQGEAYIRALLNNKKDEQSRNALLSMFSVFSISQQQMNNFDFEFEMLLPDDDESLVTRAQAFSEWCEGFTQGLTIAGVGMEQFYEEESQDALQHLMEFAELDCESLEVGEEDERALMEVSEYTRMAVLRLHSDLVLHE:SLHLPKYDDFVQSISVLALTMSGSELHGIMCGYLCAGADSQGEAYIRALLNNKKDEQSRNALLSMFSVFSISQQQMNNFDFEFEMLLPDDDESLVTRAQAFSEWCEGFTQGLTIAGVGMEQFYEEESQDALQHLMEFAELDCESLEVGEEDERALMEVSEYTRMAVLRLHSDLVLHE 3 | >4GYT, id=1, T=0.1, seed=111, overall_confidence=0.4325, ligand_confidence=0.4325, seq_rec=0.3955 4 | YMTLPPYAEFAAAIAPLELPVSPSELAGLMLGFLAAGKTELGRAWIRALARGRTDAATQAALAALLEVFDILERQLNDPALELELLLPPADAPLATRAAALAAFARGFVRGLELAGVGPESFATEASRAALERARALAALDPSTLRAGPADEARLEADEAWLRESILAIRRDIAENG:SLTLPPYDEFAAAIAPLELPISPSALAGLMLGYLVAGKTELGRRWIRSLLRGRTDPASQAALAALLAVFDILEAQLTDPSLELELLLPPEDASLRERARALAEFAAGFALGLELAGVDRESFAREESRRDYERILELARLDVSTLKEGEEDRARLAALEAWLRDSIVRLARDLREHG -------------------------------------------------------------------------------- /outputs/chains_to_design/seqs/4GYT.fa: -------------------------------------------------------------------------------- 1 | >4GYT, T=0.1, seed=111, num_res=177, num_ligand_res=11, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/ligandmpnn_v_32_010_25.pt 2 | SLHLPKYDDFVQSISVLALTMSGSELHGIMCGYLCAGADSQGEAYIRALLNNKKDEQSRNALLSMFSVFSISQQQMNNFDFEFEMLLPDDDESLVTRAQAFSEWCEGFTQGLTIAGVGMEQFYEEESQDALQHLMEFAELDCESLEVGEEDERALMEVSEYTRMAVLRLHSDLVLHE:SLHLPKYDDFVQSISVLALTMSGSELHGIMCGYLCAGADSQGEAYIRALLNNKKDEQSRNALLSMFSVFSISQQQMNNFDFEFEMLLPDDDESLVTRAQAFSEWCEGFTQGLTIAGVGMEQFYEEESQDALQHLMEFAELDCESLEVGEEDERALMEVSEYTRMAVLRLHSDLVLHE 3 | >4GYT, id=1, T=0.1, seed=111, overall_confidence=0.4349, ligand_confidence=0.4305, seq_rec=0.4576 4 | 
SLHLPKYDDFVQSISVLALTMSGSELHGIMCGYLCAGADSQGEAYIRALLNNKKDEQSRNALLSMFSVFSISQQQMNNFDFEFEMLLPDDDESLVTRAQAFSEWCEGFTQGLTIAGVGMEQFYEEESQDALQHLMEFAELDCESLEVGEEDERALMEVSEYTRMAVLRLHSDLVLHE:ALSLPPYDEFAASIAVLKLTISASELHGIMLGFLTAGAVEQGRRFIESLAKGRTDPATQAALAALMEVFDISERQLNDPSLELEMLLPPEEASLRERCRAFAEFCRGFVLGLTLAGVGEEEFAREESRRAYRRFVELADWDCSRLREGPEDRARLEALREEARRAIVALRRDLRETK -------------------------------------------------------------------------------- /outputs/default/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.3987, ligand_confidence=0.3987, seq_rec=0.5161 4 | GTSSISLHEFLLKLLSDPAYKDIIEWTSDDGEFKLKKPEAVAKLWGEEKGEPDMNYKKMEKELKKYEKKKIIEKVKGKPNHYKFVNYPEILFP -------------------------------------------------------------------------------- /outputs/fasta_seq_separation/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=73564, num_res=93, num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=73564, overall_confidence=0.3789, ligand_confidence=0.3789, seq_rec=0.5054 4 | GTSSISLWEFLLKLLSDKKYDDIITWTSNNGEFKLKDPEKVAKLWGKEKGKPDMNYEELYKLLKEYEKKKIIERVKGKPNTYKFVNYPEYLNP -------------------------------------------------------------------------------- /outputs/file_ending/seqs/1BC8_xyz.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, 
num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.3987, ligand_confidence=0.3987, seq_rec=0.5161 4 | GTSSISLHEFLLKLLSDPAYKDIIEWTSDDGEFKLKKPEAVAKLWGEEKGEPDMNYKKMEKELKKYEKKKIIEKVKGKPNHYKFVNYPEILFP -------------------------------------------------------------------------------- /outputs/fix_residues/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=83, num_ligand_res=83, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.1000, ligand_confidence=0.1000, seq_rec=0.0361 4 | MDSAITLWQFAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA -------------------------------------------------------------------------------- /outputs/fixed_residues_multi/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=86, num_ligand_res=86, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.4057, ligand_confidence=0.4057, seq_rec=0.5116 4 | MDSAISLHEFLLKLLSKPEYKHIIEWTSDNGEFKLKDPEAVAKLWGEEKGEPDMNWKKMEKELKKYEKKKIIEKVKGKPNHYKFVNYPEILFP -------------------------------------------------------------------------------- /outputs/fixed_residues_multi/seqs/4GYT.fa: 
-------------------------------------------------------------------------------- 1 | >4GYT, T=0.1, seed=111, num_res=346, num_ligand_res=346, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | SLHLPKYDDFVQSISVLALTMSGSELHGIMCGYLCAGADSQGEAYIRALLNNKKDEQSRNALLSMFSVFSISQQQMNNFDFEFEMLLPDDDESLVTRAQAFSEWCEGFTQGLTIAGVGMEQFYEEESQDALQHLMEFAELDCESLEVGEEDERALMEVSEYTRMAVLRLHSDLVLHE:SLHLPKYDDFVQSISVLALTMSGSELHGIMCGYLCAGADSQGEAYIRALLNNKKDEQSRNALLSMFSVFSISQQQMNNFDFEFEMLLPDDDESLVTRAQAFSEWCEGFTQGLTIAGVGMEQFYEEESQDALQHLMEFAELDCESLEVGEEDERALMEVSEYTRMAVLRLHSDLVLHE 3 | >4GYT, id=1, T=0.1, seed=111, overall_confidence=0.4273, ligand_confidence=0.4273, seq_rec=0.3931 4 | SLHLPKYAEFEAAIAPLNLPVSASELAGLMLGFLAAGKTELGRAWIRALSNGRTDAATQAALAALLEVFDILEKQLNNPEYPLELLLPPADAPLATRAAALAAFARGFVRGLELAGVGRESFKTEASKAALDRIRALAALDPSTLRAGPADEARLDADRAWLIESIRAIHKDISENG:ALSLPPYDEFAAAIAPLELPVSASELAGLMLGYLVAGKTELGRRWIRALARGRTDPATQAALAALLAVFDTLEAQLTDPSLELELLLPPAGASLRARARALAEFARGFVLGLELAGVEKESFAEEESREAYERILELARLDVSTLREGPEDEARLAALEAWLRDSIVRLHRDLREHG -------------------------------------------------------------------------------- /outputs/global_bias/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.1321, ligand_confidence=0.1321, seq_rec=0.2903 4 | PPSPICLWEWLWCLLCCPKWCPWCCWCCCCGCFCLCKPEWCCKCWGWCKCEPDMNWKKMCKCLCKCCPLKIICPCCCCPCCWRFCCWPECCWP -------------------------------------------------------------------------------- /outputs/global_label_membrane_mpnn_0/seqs/1BC8.fa: 
-------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/global_label_membrane_mpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.4217, ligand_confidence=0.4217, seq_rec=0.5591 4 | GMSKKTLYEFLLELLKDPKYDDIIKWTSNDGEFQLLKPEEVAKLWGKEKGKPNMNYEKLYKELKKLEEKKIIERVEGKPNVYKFVNYPEILNP -------------------------------------------------------------------------------- /outputs/global_omit/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.1011, ligand_confidence=0.1011, seq_rec=0.1505 4 | KKKKKKKEEAEEEEEKKKKAKKKKKKKKKEEKEKKKKAEAEAKKEKEEKKKKKEKAKKEKKKAKKKKKKKKEEKEKKKKKEKKKKKKKEEEKK -------------------------------------------------------------------------------- /outputs/homooligomer/seqs/4GYT.fa: -------------------------------------------------------------------------------- 1 | >4GYT, T=0.1, seed=111, num_res=354, num_ligand_res=22, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=2, model_path=./model_params/ligandmpnn_v_32_010_25.pt 2 | 
SLHLPKYDDFVQSISVLALTMSGSELHGIMCGYLCAGADSQGEAYIRALLNNKKDEQSRNALLSMFSVFSISQQQMNNFDFEFEMLLPDDDESLVTRAQAFSEWCEGFTQGLTIAGVGMEQFYEEESQDALQHLMEFAELDCESLEVGEEDERALMEVSEYTRMAVLRLHSDLVLHE:SLHLPKYDDFVQSISVLALTMSGSELHGIMCGYLCAGADSQGEAYIRALLNNKKDEQSRNALLSMFSVFSISQQQMNNFDFEFEMLLPDDDESLVTRAQAFSEWCEGFTQGLTIAGVGMEQFYEEESQDALQHLMEFAELDCESLEVGEEDERALMEVSEYTRMAVLRLHSDLVLHE 3 | >4GYT, id=1, T=0.1, seed=111, overall_confidence=0.4180, ligand_confidence=0.3824, seq_rec=0.4689 4 | SLSLPPYEEFEKSIAVLKLPISASELAGIMLGFLTAGAEEQGRAFIKSLANGRTDEETQAALKALMQVFDILKKQLTDPSLELEMLLPPEDASLEERCRAFADFCRGFVKGLTLAGVGEDDFKSEESKAALRRLKELADLDCSTLREGPADRARLEALREETRRDILRLAEDLRNSG:SLSLPPYEEFEKSIAVLKLPISASELAGIMLGFLTAGAEEQGRAFIKSLANGRTDEETQAALKALMQVFDILKKQLTDPSLELEMLLPPEDASLEERCRAFADFCRGFVKGLTLAGVGEDDFKSEESKAALRRLKELADLDCSTLREGPADRARLEALREETRRDILRLAEDLRNSG 5 | >4GYT, id=2, T=0.1, seed=111, overall_confidence=0.4237, ligand_confidence=0.4409, seq_rec=0.4576 6 | SLKLPAYDDFAAAIAVLELPISASELAGIMLGFLTAGAVEAGRAFIRALANGRTDAATQAALAAMMEVFDILEKQLNDPSLELEILLPPADRSLEERCRALSEFAKGFVRGLTLAGVGEKDFKSEECREALEKMKKLAEMDCSTLREGPEDRARLEALTEELREDILRMRDDLANSG:SLKLPAYDDFAAAIAVLELPISASELAGIMLGFLTAGAVEAGRAFIRALANGRTDAATQAALAAMMEVFDILEKQLNDPSLELEILLPPADRSLEERCRALSEFAKGFVRGLTLAGVGEKDFKSEECREALEKMKKLAEMDCSTLREGPEDRARLEALTEELREDILRMRDDLANSG -------------------------------------------------------------------------------- /outputs/insertion_code/seqs/2GFB.fa: -------------------------------------------------------------------------------- 1 | >2GFB, T=0.1, seed=111, num_res=4, num_ligand_res=4, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | DVKLVESGGGLVQPGGSRKLSCAASGFTFSSFGMHWVRQAPEKGLEWVAYISSGSSTIYYADTVKGRFTISRDNPKNTLFLQMTSLRSEDTAMYYCARGDYYGSRGAYWGQGTLVTVSAKTTAPSVYPLAPVCGDTTGSSVTLGCLVKGYFPEPVTLTWNSGSLSSGVHTFPAVLQSDLYTLSSSVTVTSSTWPSQSITCNVAHPASSTKVDKKIEPRG 3 | >2GFB, id=1, T=0.1, seed=111, overall_confidence=0.5097, ligand_confidence=0.5097, 
seq_rec=0.5000 4 | DVKLVESGGGLVQPGGSRKLSCAASGFTFSSFGMHWVRQAPEKGLEWVAYISSGSSTIYYADTVKGRFTISRDNPKNTLFLQMSNLRSEDTAMYYCARGDYYGSRGAYWGQGTLVTVSAKTTAPSVYPLAPVCGDTTGSSVTLGCLVKGYFPEPVTLTWNSGSLSSGVHTFPAVLQSDLYTLSSSVTVTSSTWPSQSITCNVAHPASSTKVDKKIEPRG -------------------------------------------------------------------------------- /outputs/ligand_mpnn_cutoff_for_score/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=21, use_ligand_context=True, ligand_cutoff_distance=6.0, batch_size=1, number_of_batches=1, model_path=./model_params/ligandmpnn_v_32_010_25.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.4784, ligand_confidence=0.5507, seq_rec=0.4839 4 | MKSPISLHEFLLELLSDPKYADIIEWVSDNGEFRLVDPERVAKLWGEVKGKPKMNWKNLHRALRGYKKKKIIETVKGKPYQYRFVNYPELLHP -------------------------------------------------------------------------------- /outputs/ligandmpnn_default/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=41, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/ligandmpnn_v_32_005_25.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.4794, ligand_confidence=0.5111, seq_rec=0.5269 4 | GMSSISLHEFILELLSDPKYADMIKWTGDDGEFQFTKPEEVAKLWGETTGKPNMNYKTLLRAIRYYKKKGIISSVKGKKYTFKFVNYPEILNP -------------------------------------------------------------------------------- /outputs/ligandmpnn_no_context/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=41, use_ligand_context=False, ligand_cutoff_distance=8.0, batch_size=1, 
number_of_batches=1, model_path=./model_params/ligandmpnn_v_32_010_25.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.3959, ligand_confidence=0.3657, seq_rec=0.4086 4 | GGSPISLHEFLLRLLSDPRYAGIIEWVSDNGEFRLVDPEAVAKLWGEEIGEPDMNWTKLQELLDEMVEKKIISRVEGKPNQWRFVNYPELLHP -------------------------------------------------------------------------------- /outputs/ligandmpnn_use_side_chain_atoms/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=83, num_ligand_res=35, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/ligandmpnn_v_32_010_25.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.5022, ligand_confidence=0.5533, seq_rec=0.4578 4 | MDSAITLWQFLDRLLSDPAYAGLIEWVSDNGEFRLVDPEGVAKLWGEEKGKPKMNWKNMHRALRGYKKKKIIETVKGKPYQYRFVNYPEYLHP -------------------------------------------------------------------------------- /outputs/ligandmpnn_v_32_005_25/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=41, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/ligandmpnn_v_32_005_25.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.4794, ligand_confidence=0.5111, seq_rec=0.5269 4 | GMSSISLHEFILELLSDPKYADMIKWTGDDGEFQFTKPEEVAKLWGETTGKPNMNYKTLLRAIRYYKKKGIISSVKGKKYTFKFVNYPEILNP -------------------------------------------------------------------------------- /outputs/omit_AA_per_residue_multi/seqs/1BC8.fa: 
-------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.3736, ligand_confidence=0.3736, seq_rec=0.5054 4 | KKKSISLHEFLLKLLSDPAYKDIIEWTSDDGEFKLKKPEAVAKLWGEEKGEPDMNYKKMEKELKKYEKKKIIEKVKGKPNHYKFVNYPEILFP -------------------------------------------------------------------------------- /outputs/omit_AA_per_residue_multi/seqs/4GYT.fa: -------------------------------------------------------------------------------- 1 | >4GYT, T=0.1, seed=111, num_res=354, num_ligand_res=354, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | SLHLPKYDDFVQSISVLALTMSGSELHGIMCGYLCAGADSQGEAYIRALLNNKKDEQSRNALLSMFSVFSISQQQMNNFDFEFEMLLPDDDESLVTRAQAFSEWCEGFTQGLTIAGVGMEQFYEEESQDALQHLMEFAELDCESLEVGEEDERALMEVSEYTRMAVLRLHSDLVLHE:SLHLPKYDDFVQSISVLALTMSGSELHGIMCGYLCAGADSQGEAYIRALLNNKKDEQSRNALLSMFSVFSISQQQMNNFDFEFEMLLPDDDESLVTRAQAFSEWCEGFTQGLTIAGVGMEQFYEEESQDALQHLMEFAELDCESLEVGEEDERALMEVSEYTRMAVLRLHSDLVLHE 3 | >4GYT, id=1, T=0.1, seed=111, overall_confidence=0.4332, ligand_confidence=0.4332, seq_rec=0.3955 4 | KKTLPPYAEFAAAIAPLELPVSPSELAGLMLGFLAAGKTELGRAWIRALARGRTDAATQAALAALLEVFDILERQLNDPELELELLLPPADAPLATRAAALAAFARGFVRGLELAGVGPESFATEASRAALERARALAALDPSTLRAGPADEARLEADEAWLRESILAIRRDIAENG:SLTLPPYDEFAAAIAPLELPISPSALAGLMLGYLVAGKTELGRRWIRSLLRGRTDPASQAALAALLAVFDILEAQLTDPSLELELLLPPEDASLRERARALAEFAAGFALGLELAGVDRESFAREESRRDYERILELARLDVSTLKEGEEDRARLAALEAWLRDSIVRLARDLREHG -------------------------------------------------------------------------------- /outputs/parse_atoms_with_zero_occupancy/seqs/1BC8.fa: 
-------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=41, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/ligandmpnn_v_32_010_25.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.4784, ligand_confidence=0.5487, seq_rec=0.4839 4 | MKSPISLHEFLLELLSDPKYADIIEWVSDNGEFRLVDPERVAKLWGEVKGKPKMNWKNLHRALRGYKKKKIIETVKGKPYQYRFVNYPELLHP -------------------------------------------------------------------------------- /outputs/parse_these_chains_only/seqs/4GYT.fa: -------------------------------------------------------------------------------- 1 | >4GYT, T=0.1, seed=111, num_res=177, num_ligand_res=11, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/ligandmpnn_v_32_010_25.pt 2 | SLHLPKYDDFVQSISVLALTMSGSELHGIMCGYLCAGADSQGEAYIRALLNNKKDEQSRNALLSMFSVFSISQQQMNNFDFEFEMLLPDDDESLVTRAQAFSEWCEGFTQGLTIAGVGMEQFYEEESQDALQHLMEFAELDCESLEVGEEDERALMEVSEYTRMAVLRLHSDLVLHE 3 | >4GYT, id=1, T=0.1, seed=111, overall_confidence=0.4269, ligand_confidence=0.4174, seq_rec=0.4802 4 | SLSLPEYDDFEASIAVLELPISASELHGIMLGYLTAGAYEEGKAFIESLLKGRTDAASQAALTALLRVFEISKKQLSDPSLEFEILLPPESKSLKERCKAFSDFAKGFVQGLEEAGVGEDDFASEESREMLRKFKEYANMDCSKFKEGEEDKKKLKEKTEELREGILRLARDLRHHH -------------------------------------------------------------------------------- /outputs/pdb_path_multi/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, 
overall_confidence=0.3987, ligand_confidence=0.3987, seq_rec=0.5161 4 | GTSSISLHEFLLKLLSDPAYKDIIEWTSDDGEFKLKKPEAVAKLWGEEKGEPDMNYKKMEKELKKYEKKKIIEKVKGKPNHYKFVNYPEILFP -------------------------------------------------------------------------------- /outputs/pdb_path_multi/seqs/4GYT.fa: -------------------------------------------------------------------------------- 1 | >4GYT, T=0.1, seed=111, num_res=354, num_ligand_res=354, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | SLHLPKYDDFVQSISVLALTMSGSELHGIMCGYLCAGADSQGEAYIRALLNNKKDEQSRNALLSMFSVFSISQQQMNNFDFEFEMLLPDDDESLVTRAQAFSEWCEGFTQGLTIAGVGMEQFYEEESQDALQHLMEFAELDCESLEVGEEDERALMEVSEYTRMAVLRLHSDLVLHE:SLHLPKYDDFVQSISVLALTMSGSELHGIMCGYLCAGADSQGEAYIRALLNNKKDEQSRNALLSMFSVFSISQQQMNNFDFEFEMLLPDDDESLVTRAQAFSEWCEGFTQGLTIAGVGMEQFYEEESQDALQHLMEFAELDCESLEVGEEDERALMEVSEYTRMAVLRLHSDLVLHE 3 | >4GYT, id=1, T=0.1, seed=111, overall_confidence=0.4409, ligand_confidence=0.4409, seq_rec=0.4011 4 | SLTLPPYAEFAAAIAPLELPVSPSELAGLMLGFLAAGKTELGRAWIRALARGRTDAATQAALAALLEVFDILERQLNDPALELELLLPPADAPLATRAAALAAFARGFVRGLELAGVGPESFATEASRAALERARALAALDPSTLRAGPADEARLEADEAWLRESILAIRRDIAENG:SLTLPPYDEFAAAIAPLELPISPSALAGLMLGYLVAGKTELGRRWIRSLLRGRTDPASQAALAALLAVFDILEAQLTDPSLELELLLPPEDASLRERARALAEFAAGFALGLELAGVDRESFAREESRRDYERILELARLDVSTLKEGEEDRARLAALEAWLRDSIVRLARDLREHG -------------------------------------------------------------------------------- /outputs/per_residue_bias/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.3271, ligand_confidence=0.3271, seq_rec=0.4839 4 | 
PVPTPKPWEFLLSLLSDPAYKDIIEWTSDDGEFKLKKPEAVAKLWGEAKGEPDMNYKKFEKELKKLEKKKIIEKVKGKPNHYKFVNYPEILFP -------------------------------------------------------------------------------- /outputs/per_residue_label_membrane_mpnn_default/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/per_residue_label_membrane_mpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.3806, ligand_confidence=0.3806, seq_rec=0.5054 4 | MTSNISLVEFILKLLSNPKYKKYIEWVSDNGEFRLVKPEEVAKLWGKVKGKPNMNYEELEKELEKEVEKKLIEKVEGEKNVYRFVDYPGILNP -------------------------------------------------------------------------------- /outputs/per_residue_omit/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.3284, ligand_confidence=0.3284, seq_rec=0.4731 4 | YTYSYSYHEFLLKLLSDPAYKDIIEWTSDDGEFKLKKPEAVAKLWGEAKGEPDMNYKKFEKELKKLEKKKIIEKVKGKPNHYKFVNYPEILYP -------------------------------------------------------------------------------- /outputs/random_seed/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=96723, num_res=93, num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | 
MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=96723, overall_confidence=0.4171, ligand_confidence=0.4171, seq_rec=0.5376 4 | GTSKISLHEFLLELLSKPEYKDIIEWTSDDGTFKLKDPEKVAKLWGEKKGIPDMNYEKLYELLKEYEEKGIIEKVEGEPNTYKFVNYPEILYP -------------------------------------------------------------------------------- /outputs/redesign_residues/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=10, num_ligand_res=10, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.0236, ligand_confidence=0.0236, seq_rec=0.1000 4 | AAAAAAAAAALLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM -------------------------------------------------------------------------------- /outputs/redesigned_residues_multi/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=6, num_ligand_res=6, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.3353, ligand_confidence=0.3353, seq_rec=0.5000 4 | GTSSITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM -------------------------------------------------------------------------------- /outputs/redesigned_residues_multi/seqs/4GYT.fa: -------------------------------------------------------------------------------- 1 | >4GYT, T=0.1, seed=111, num_res=7, num_ligand_res=7, use_ligand_context=True, 
ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | SLHLPKYDDFVQSISVLALTMSGSELHGIMCGYLCAGADSQGEAYIRALLNNKKDEQSRNALLSMFSVFSISQQQMNNFDFEFEMLLPDDDESLVTRAQAFSEWCEGFTQGLTIAGVGMEQFYEEESQDALQHLMEFAELDCESLEVGEEDERALMEVSEYTRMAVLRLHSDLVLHE:SLHLPKYDDFVQSISVLALTMSGSELHGIMCGYLCAGADSQGEAYIRALLNNKKDEQSRNALLSMFSVFSISQQQMNNFDFEFEMLLPDDDESLVTRAQAFSEWCEGFTQGLTIAGVGMEQFYEEESQDALQHLMEFAELDCESLEVGEEDERALMEVSEYTRMAVLRLHSDLVLHE 3 | >4GYT, id=1, T=0.1, seed=111, overall_confidence=0.4333, ligand_confidence=0.4333, seq_rec=0.5714 4 | GLSLPPYDDFVQSISVLALTMSGSELHGIMCGYLCAGADSQGEAYIRALLNNKKDEQSRNALLSMFSVFSISQQQMNNFDFEFEMLLPDDDESLVTRAQAFSEWCEGFTQGLTIAGVGMEQFYEEESQDALQHLMEFAELDCESLEVGEEDERALMEVSEYTRMAVLRLHSDLVLHE:SLHLPKYDDFVQSISVLALTMSGSELHGIMCGYLCAGADSQGEAYIRALLNNKKDEQSRNALLSMFSVFSISQQQMNNFDFEFEMLLPDDDESLVTRAQAFSEWCEGFTQGLTIAGVGMEQFYEEESQDALQHLMEFAELDCESLEVGEEDERALMEVSEYTRMAVLRLHSDLVLHE -------------------------------------------------------------------------------- /outputs/save_stats/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.3987, ligand_confidence=0.3987, seq_rec=0.5161 4 | GTSSISLHEFLLKLLSDPAYKDIIEWTSDDGEFKLKKPEAVAKLWGEEKGEPDMNYKKMEKELKKYEKKKIIEKVKGKPNHYKFVNYPEILFP -------------------------------------------------------------------------------- /outputs/save_stats/stats/1BC8.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dauparas/LigandMPNN/26ec57ac976ade5379920dbd43c7f97a91cf82de/outputs/save_stats/stats/1BC8.pt 
-------------------------------------------------------------------------------- /outputs/sc_default/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=41, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/ligandmpnn_v_32_010_25.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.4764, ligand_confidence=0.5422, seq_rec=0.4731 4 | SRSPISLHEFIDELLSDPKYAHIIRWTSDDGRFRLVKPEEVAKLWGEEKGKPKMNWKNMHKALRGYKKKKIIETVKGKPYEYKFVNYPEHHHH -------------------------------------------------------------------------------- /outputs/sc_default_fast/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=41, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/ligandmpnn_v_32_010_25.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.4764, ligand_confidence=0.5422, seq_rec=0.4731 4 | SRSPISLHEFIDELLSDPKYAHIIRWTSDDGRFRLVKPEEVAKLWGEEKGKPKMNWKNMHKALRGYKKKKIIETVKGKPYEYKFVNYPEHHHH -------------------------------------------------------------------------------- /outputs/sc_fixed_residues/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=83, num_ligand_res=36, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/ligandmpnn_v_32_010_25.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.4659, ligand_confidence=0.5278, seq_rec=0.5181 4 | 
PRSPITLWQFLLQLLSDPAYAHIIRWTSDDGRFQLVQPEEVARLWGEEKGKPKMNWKNMHRALRGYKKKGIIETVKGKPYQYRFVNYPEHLHH -------------------------------------------------------------------------------- /outputs/sc_fixed_residues_full_repack/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=83, num_ligand_res=36, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/ligandmpnn_v_32_010_25.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.4659, ligand_confidence=0.5278, seq_rec=0.5181 4 | PRSPITLWQFLLQLLSDPAYAHIIRWTSDDGRFQLVQPEEVARLWGEEKGKPKMNWKNMHRALRGYKKKGIIETVKGKPYQYRFVNYPEHLHH -------------------------------------------------------------------------------- /outputs/sc_no_context/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=41, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/ligandmpnn_v_32_010_25.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.4764, ligand_confidence=0.5422, seq_rec=0.4731 4 | SRSPISLHEFIDELLSDPKYAHIIRWTSDDGRFRLVKPEEVAKLWGEEKGKPKMNWKNMHKALRGYKKKKIIETVKGKPYEYKFVNYPEHHHH -------------------------------------------------------------------------------- /outputs/single_aa_score_w_seq/1BC8_1.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dauparas/LigandMPNN/26ec57ac976ade5379920dbd43c7f97a91cf82de/outputs/single_aa_score_w_seq/1BC8_1.pt -------------------------------------------------------------------------------- /outputs/single_aa_score_wo_seq/1BC8_1.pt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dauparas/LigandMPNN/26ec57ac976ade5379920dbd43c7f97a91cf82de/outputs/single_aa_score_wo_seq/1BC8_1.pt -------------------------------------------------------------------------------- /outputs/soluble_mpnn_default/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/solublempnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.4129, ligand_confidence=0.4129, seq_rec=0.4946 4 | SMSKISLPEFLLSLLSDPKYKDKIEWTGDDGTFRLVDPEAVAKLWGEVKGEPDMNYEKLEEELKKYEEKGIIEKVEGKPNTYRFVNYPEILYP -------------------------------------------------------------------------------- /outputs/symmetry/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.3738, ligand_confidence=0.3738, seq_rec=0.5054 4 | SSSTTLLHEFLLKLLSDPAYKDIIEWTSDDGEFKLKDPEAVAKLWGEEKGEPDMNYEKMEKLLKKYEKKGIIEKVEGKPNHYKFVNYPEILFP -------------------------------------------------------------------------------- /outputs/temperature/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.05, seed=111, num_res=93, num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, 
model_path=./model_params/proteinmpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.05, seed=111, overall_confidence=0.4012, ligand_confidence=0.4012, seq_rec=0.5054 4 | GTSSISLHEFLLKLLSKPEYKDIIEWTSDNGEFKLKKPEAVAKLWGEEKGEPDMNYKKMYKELKKYEKKKIIEEVKGKPNHYKFVNYPEILYP -------------------------------------------------------------------------------- /outputs/verbose/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.3987, ligand_confidence=0.3987, seq_rec=0.5161 4 | GTSSISLHEFLLKLLSDPAYKDIIEWTSDDGEFKLKKPEAVAKLWGEEKGEPDMNYKKMEKELKKYEKKKIIEKVKGKPNHYKFVNYPEILFP -------------------------------------------------------------------------------- /outputs/zero_indexed/seqs/1BC8.fa: -------------------------------------------------------------------------------- 1 | >1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=2, model_path=./model_params/proteinmpnn_v_48_020.pt 2 | MDSAITLWQFLLQLLQKPQNKHMICWTSNDGQFKLLQAEEVARLWGIRKNKPNMNYDKLSRALRYYYVKNIIKKVNGQKFVYKFVSYPEILNM 3 | >1BC8, id=0, T=0.1, seed=111, overall_confidence=0.3987, ligand_confidence=0.3987, seq_rec=0.5161 4 | GTSSISLHEFLLKLLSDPAYKDIIEWTSDDGEFKLKKPEAVAKLWGEEKGEPDMNYKKMEKELKKYEKKKIIEKVKGKPNHYKFVNYPEILFP 5 | >1BC8, id=1, T=0.1, seed=111, overall_confidence=0.3987, ligand_confidence=0.3987, seq_rec=0.4839 6 | GMSSISLYEFLLELLSDPKYEDKIEWISDNGEFRLKDPEAVAKLWGKKKGDPNMNWEKFNKLLEKYEEKGIIEKVEGKKNTYKIVNYPEILNP 
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | biopython==1.79 2 | filelock==3.13.1 3 | fsspec==2024.3.1 4 | Jinja2==3.1.3 5 | MarkupSafe==2.1.5 6 | mpmath==1.3.0 7 | networkx==3.2.1 8 | numpy==1.23.5 9 | nvidia-cublas-cu12==12.1.3.1 10 | nvidia-cuda-cupti-cu12==12.1.105 11 | nvidia-cuda-nvrtc-cu12==12.1.105 12 | nvidia-cuda-runtime-cu12==12.1.105 13 | nvidia-cudnn-cu12==8.9.2.26 14 | nvidia-cufft-cu12==11.0.2.54 15 | nvidia-curand-cu12==10.3.2.106 16 | nvidia-cusolver-cu12==11.4.5.107 17 | nvidia-cusparse-cu12==12.1.0.106 18 | nvidia-nccl-cu12==2.19.3 19 | nvidia-nvjitlink-cu12==12.4.99 20 | nvidia-nvtx-cu12==12.1.105 21 | ProDy==2.4.1 22 | pyparsing==3.1.1 23 | scipy==1.12.0 24 | sympy==1.12 25 | torch==2.2.1 26 | triton==2.2.0 27 | typing_extensions==4.10.0 28 | ml-collections==0.1.1 29 | dm-tree==0.1.8 30 | -------------------------------------------------------------------------------- /run_examples.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #1 4 | python run.py \ 5 | --seed 111 \ 6 | --pdb_path "./inputs/1BC8.pdb" \ 7 | --out_folder "./outputs/default" 8 | #2 9 | python run.py \ 10 | --seed 111 \ 11 | --pdb_path "./inputs/1BC8.pdb" \ 12 | --temperature 0.05 \ 13 | --out_folder "./outputs/temperature" 14 | 15 | #3 16 | python run.py \ 17 | --pdb_path "./inputs/1BC8.pdb" \ 18 | --out_folder "./outputs/random_seed" 19 | 20 | #4 21 | python run.py \ 22 | --seed 111 \ 23 | --verbose 0 \ 24 | --pdb_path "./inputs/1BC8.pdb" \ 25 | --out_folder "./outputs/verbose" 26 | 27 | #5 28 | python run.py \ 29 | --seed 111 \ 30 | --pdb_path "./inputs/1BC8.pdb" \ 31 | --out_folder "./outputs/save_stats" \ 32 | --save_stats 1 33 | 34 | #6 35 | python run.py \ 36 | --seed 111 \ 37 | --pdb_path "./inputs/1BC8.pdb" \ 38 | --out_folder "./outputs/fix_residues" \ 39 | 
--fixed_residues "C1 C2 C3 C4 C5 C6 C7 C8 C9 C10" \ 40 | --bias_AA "A:10.0" 41 | 42 | #7 43 | python run.py \ 44 | --seed 111 \ 45 | --pdb_path "./inputs/1BC8.pdb" \ 46 | --out_folder "./outputs/redesign_residues" \ 47 | --redesigned_residues "C1 C2 C3 C4 C5 C6 C7 C8 C9 C10" \ 48 | --bias_AA "A:10.0" 49 | 50 | #8 51 | python run.py \ 52 | --seed 111 \ 53 | --pdb_path "./inputs/1BC8.pdb" \ 54 | --out_folder "./outputs/batch_size" \ 55 | --batch_size 3 \ 56 | --number_of_batches 5 57 | 58 | #9 59 | python run.py \ 60 | --seed 111 \ 61 | --pdb_path "./inputs/1BC8.pdb" \ 62 | --bias_AA "W:3.0,P:3.0,C:3.0,A:-3.0" \ 63 | --out_folder "./outputs/global_bias" 64 | 65 | #10 66 | python run.py \ 67 | --seed 111 \ 68 | --pdb_path "./inputs/1BC8.pdb" \ 69 | --bias_AA_per_residue "./inputs/bias_AA_per_residue.json" \ 70 | --out_folder "./outputs/per_residue_bias" 71 | 72 | #11 73 | python run.py \ 74 | --seed 111 \ 75 | --pdb_path "./inputs/1BC8.pdb" \ 76 | --omit_AA "CDFGHILMNPQRSTVWY" \ 77 | --out_folder "./outputs/global_omit" 78 | 79 | #12 80 | python run.py \ 81 | --seed 111 \ 82 | --pdb_path "./inputs/1BC8.pdb" \ 83 | --omit_AA_per_residue "./inputs/omit_AA_per_residue.json" \ 84 | --out_folder "./outputs/per_residue_omit" 85 | 86 | #13 87 | python run.py \ 88 | --seed 111 \ 89 | --pdb_path "./inputs/1BC8.pdb" \ 90 | --out_folder "./outputs/symmetry" \ 91 | --symmetry_residues "C1,C2,C3|C4,C5|C6,C7" \ 92 | --symmetry_weights "0.33,0.33,0.33|0.5,0.5|0.5,0.5" 93 | 94 | #14 95 | python run.py \ 96 | --model_type "ligand_mpnn" \ 97 | --seed 111 \ 98 | --pdb_path "./inputs/4GYT.pdb" \ 99 | --out_folder "./outputs/homooligomer" \ 100 | --homo_oligomer 1 \ 101 | --number_of_batches 2 102 | 103 | #15 104 | python run.py \ 105 | --seed 111 \ 106 | --pdb_path "./inputs/1BC8.pdb" \ 107 | --out_folder "./outputs/file_ending" \ 108 | --file_ending "_xyz" 109 | 110 | #16 111 | python run.py \ 112 | --seed 111 \ 113 | --pdb_path "./inputs/1BC8.pdb" \ 114 | --out_folder 
"./outputs/zero_indexed" \ 115 | --zero_indexed 1 \ 116 | --number_of_batches 2 117 | 118 | #17 119 | python run.py \ 120 | --model_type "ligand_mpnn" \ 121 | --seed 111 \ 122 | --pdb_path "./inputs/4GYT.pdb" \ 123 | --out_folder "./outputs/chains_to_design" \ 124 | --chains_to_design "A,B" 125 | 126 | #18 127 | python run.py \ 128 | --model_type "ligand_mpnn" \ 129 | --seed 111 \ 130 | --pdb_path "./inputs/4GYT.pdb" \ 131 | --out_folder "./outputs/parse_these_chains_only" \ 132 | --parse_these_chains_only "A,B" 133 | 134 | #19 135 | python run.py \ 136 | --model_type "ligand_mpnn" \ 137 | --seed 111 \ 138 | --pdb_path "./inputs/1BC8.pdb" \ 139 | --out_folder "./outputs/ligandmpnn_default" 140 | 141 | #20 142 | python run.py \ 143 | --checkpoint_ligand_mpnn "./model_params/ligandmpnn_v_32_005_25.pt" \ 144 | --model_type "ligand_mpnn" \ 145 | --seed 111 \ 146 | --pdb_path "./inputs/1BC8.pdb" \ 147 | --out_folder "./outputs/ligandmpnn_v_32_005_25" 148 | 149 | #21 150 | python run.py \ 151 | --model_type "ligand_mpnn" \ 152 | --seed 111 \ 153 | --pdb_path "./inputs/1BC8.pdb" \ 154 | --out_folder "./outputs/ligandmpnn_no_context" \ 155 | --ligand_mpnn_use_atom_context 0 156 | 157 | #22 158 | python run.py \ 159 | --model_type "ligand_mpnn" \ 160 | --seed 111 \ 161 | --pdb_path "./inputs/1BC8.pdb" \ 162 | --out_folder "./outputs/ligandmpnn_use_side_chain_atoms" \ 163 | --ligand_mpnn_use_side_chain_context 1 \ 164 | --fixed_residues "C1 C2 C3 C4 C5 C6 C7 C8 C9 C10" 165 | 166 | #23 167 | python run.py \ 168 | --model_type "soluble_mpnn" \ 169 | --seed 111 \ 170 | --pdb_path "./inputs/1BC8.pdb" \ 171 | --out_folder "./outputs/soluble_mpnn_default" 172 | 173 | #24 174 | python run.py \ 175 | --model_type "global_label_membrane_mpnn" \ 176 | --seed 111 \ 177 | --pdb_path "./inputs/1BC8.pdb" \ 178 | --out_folder "./outputs/global_label_membrane_mpnn_0" \ 179 | --global_transmembrane_label 0 180 | 181 | #25 182 | python run.py \ 183 | --model_type 
"per_residue_label_membrane_mpnn" \ 184 | --seed 111 \ 185 | --pdb_path "./inputs/1BC8.pdb" \ 186 | --out_folder "./outputs/per_residue_label_membrane_mpnn_default" \ 187 | --transmembrane_buried "C1 C2 C3 C11" \ 188 | --transmembrane_interface "C4 C5 C6 C22" 189 | 190 | #26 191 | python run.py \ 192 | --pdb_path "./inputs/1BC8.pdb" \ 193 | --out_folder "./outputs/fasta_seq_separation" \ 194 | --fasta_seq_separation ":" 195 | 196 | #27 197 | python run.py \ 198 | --pdb_path_multi "./inputs/pdb_ids.json" \ 199 | --out_folder "./outputs/pdb_path_multi" \ 200 | --seed 111 201 | 202 | #28 203 | python run.py \ 204 | --pdb_path_multi "./inputs/pdb_ids.json" \ 205 | --fixed_residues_multi "./inputs/fix_residues_multi.json" \ 206 | --out_folder "./outputs/fixed_residues_multi" \ 207 | --seed 111 208 | 209 | #29 210 | python run.py \ 211 | --pdb_path_multi "./inputs/pdb_ids.json" \ 212 | --redesigned_residues_multi "./inputs/redesigned_residues_multi.json" \ 213 | --out_folder "./outputs/redesigned_residues_multi" \ 214 | --seed 111 215 | 216 | #30 217 | python run.py \ 218 | --pdb_path_multi "./inputs/pdb_ids.json" \ 219 | --omit_AA_per_residue_multi "./inputs/omit_AA_per_residue_multi.json" \ 220 | --out_folder "./outputs/omit_AA_per_residue_multi" \ 221 | --seed 111 222 | 223 | #31 224 | python run.py \ 225 | --pdb_path_multi "./inputs/pdb_ids.json" \ 226 | --bias_AA_per_residue_multi "./inputs/bias_AA_per_residue_multi.json" \ 227 | --out_folder "./outputs/bias_AA_per_residue_multi" \ 228 | --seed 111 229 | 230 | #32 231 | python run.py \ 232 | --model_type "ligand_mpnn" \ 233 | --seed 111 \ 234 | --pdb_path "./inputs/1BC8.pdb" \ 235 | --ligand_mpnn_cutoff_for_score "6.0" \ 236 | --out_folder "./outputs/ligand_mpnn_cutoff_for_score" 237 | 238 | #33 239 | python run.py \ 240 | --seed 111 \ 241 | --pdb_path "./inputs/2GFB.pdb" \ 242 | --out_folder "./outputs/insertion_code" \ 243 | --redesigned_residues "B82 B82A B82B B82C" \ 244 | --parse_these_chains_only "B" 245 | 
-------------------------------------------------------------------------------- /sc_examples.sh: -------------------------------------------------------------------------------- 1 | #1 design a new sequence and pack side chains (return 1 side chain packing sample - fast) 2 | python run.py \ 3 | --model_type "ligand_mpnn" \ 4 | --seed 111 \ 5 | --pdb_path "./inputs/1BC8.pdb" \ 6 | --out_folder "./outputs/sc_default_fast" \ 7 | --pack_side_chains 1 \ 8 | --number_of_packs_per_design 0 \ 9 | --pack_with_ligand_context 1 10 | 11 | #2 design a new sequence and pack side chains (return 4 side chain packing samples) 12 | python run.py \ 13 | --model_type "ligand_mpnn" \ 14 | --seed 111 \ 15 | --pdb_path "./inputs/1BC8.pdb" \ 16 | --out_folder "./outputs/sc_default" \ 17 | --pack_side_chains 1 \ 18 | --number_of_packs_per_design 4 \ 19 | --pack_with_ligand_context 1 20 | 21 | 22 | #3 fix specific residues for design and packing 23 | python run.py \ 24 | --model_type "ligand_mpnn" \ 25 | --seed 111 \ 26 | --pdb_path "./inputs/1BC8.pdb" \ 27 | --out_folder "./outputs/sc_fixed_residues" \ 28 | --pack_side_chains 1 \ 29 | --number_of_packs_per_design 4 \ 30 | --pack_with_ligand_context 1 \ 31 | --fixed_residues "C6 C7 C8 C9 C10 C11 C12 C13 C14 C15" \ 32 | --repack_everything 0 33 | 34 | #4 fix specific residues for sequence design but repack everything 35 | python run.py \ 36 | --model_type "ligand_mpnn" \ 37 | --seed 111 \ 38 | --pdb_path "./inputs/1BC8.pdb" \ 39 | --out_folder "./outputs/sc_fixed_residues_full_repack" \ 40 | --pack_side_chains 1 \ 41 | --number_of_packs_per_design 4 \ 42 | --pack_with_ligand_context 1 \ 43 | --fixed_residues "C6 C7 C8 C9 C10 C11 C12 C13 C14 C15" \ 44 | --repack_everything 1 45 | 46 | 47 | #5 design a new sequence using LigandMPNN but pack side chains without considering ligand/DNA etc atoms 48 | python run.py \ 49 | --model_type "ligand_mpnn" \ 50 | --seed 111 \ 51 | --pdb_path "./inputs/1BC8.pdb" \ 52 | --out_folder 
"./outputs/sc_no_context" \ 53 | --pack_side_chains 1 \ 54 | --number_of_packs_per_design 4 \ 55 | --pack_with_ligand_context 0 56 | -------------------------------------------------------------------------------- /score.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os.path 4 | import random 5 | import sys 6 | 7 | import numpy as np 8 | import torch 9 | 10 | from data_utils import ( 11 | element_dict_rev, 12 | alphabet, 13 | restype_int_to_str, 14 | featurize, 15 | parse_PDB, 16 | ) 17 | from model_utils import ProteinMPNN 18 | 19 | 20 | def main(args) -> None: 21 | """ 22 | Inference function 23 | """ 24 | if args.seed: 25 | seed = args.seed 26 | else: 27 | seed = int(np.random.randint(0, high=99999, size=1, dtype=int)[0]) 28 | torch.manual_seed(seed) 29 | random.seed(seed) 30 | np.random.seed(seed) 31 | device = torch.device("cuda" if (torch.cuda.is_available()) else "cpu") 32 | folder_for_outputs = args.out_folder 33 | base_folder = folder_for_outputs 34 | if base_folder[-1] != "/": 35 | base_folder = base_folder + "/" 36 | if not os.path.exists(base_folder): 37 | os.makedirs(base_folder, exist_ok=True) 38 | if args.model_type == "protein_mpnn": 39 | checkpoint_path = args.checkpoint_protein_mpnn 40 | elif args.model_type == "ligand_mpnn": 41 | checkpoint_path = args.checkpoint_ligand_mpnn 42 | elif args.model_type == "per_residue_label_membrane_mpnn": 43 | checkpoint_path = args.checkpoint_per_residue_label_membrane_mpnn 44 | elif args.model_type == "global_label_membrane_mpnn": 45 | checkpoint_path = args.checkpoint_global_label_membrane_mpnn 46 | elif args.model_type == "soluble_mpnn": 47 | checkpoint_path = args.checkpoint_soluble_mpnn 48 | else: 49 | print("Choose one of the available models") 50 | sys.exit() 51 | checkpoint = torch.load(checkpoint_path, map_location=device) 52 | if args.model_type == "ligand_mpnn": 53 | atom_context_num = checkpoint["atom_context_num"] 54 
| ligand_mpnn_use_side_chain_context = args.ligand_mpnn_use_side_chain_context 55 | k_neighbors = checkpoint["num_edges"] 56 | else: 57 | atom_context_num = 1 58 | ligand_mpnn_use_side_chain_context = 0 59 | k_neighbors = checkpoint["num_edges"] 60 | 61 | model = ProteinMPNN( 62 | node_features=128, 63 | edge_features=128, 64 | hidden_dim=128, 65 | num_encoder_layers=3, 66 | num_decoder_layers=3, 67 | k_neighbors=k_neighbors, 68 | device=device, 69 | atom_context_num=atom_context_num, 70 | model_type=args.model_type, 71 | ligand_mpnn_use_side_chain_context=ligand_mpnn_use_side_chain_context, 72 | ) 73 | 74 | model.load_state_dict(checkpoint["model_state_dict"]) 75 | model.to(device) 76 | model.eval() 77 | 78 | if args.pdb_path_multi: 79 | with open(args.pdb_path_multi, "r") as fh: 80 | pdb_paths = list(json.load(fh)) 81 | else: 82 | pdb_paths = [args.pdb_path] 83 | 84 | if args.fixed_residues_multi: 85 | with open(args.fixed_residues_multi, "r") as fh: 86 | fixed_residues_multi = json.load(fh) 87 | else: 88 | fixed_residues = [item for item in args.fixed_residues.split()] 89 | fixed_residues_multi = {} 90 | for pdb in pdb_paths: 91 | fixed_residues_multi[pdb] = fixed_residues 92 | 93 | if args.redesigned_residues_multi: 94 | with open(args.redesigned_residues_multi, "r") as fh: 95 | redesigned_residues_multi = json.load(fh) 96 | else: 97 | redesigned_residues = [item for item in args.redesigned_residues.split()] 98 | redesigned_residues_multi = {} 99 | for pdb in pdb_paths: 100 | redesigned_residues_multi[pdb] = redesigned_residues 101 | 102 | # loop over PDB paths 103 | for pdb in pdb_paths: 104 | if args.verbose: 105 | print("Designing protein from this path:", pdb) 106 | fixed_residues = fixed_residues_multi[pdb] 107 | redesigned_residues = redesigned_residues_multi[pdb] 108 | protein_dict, backbone, other_atoms, icodes, _ = parse_PDB( 109 | pdb, 110 | device=device, 111 | chains=args.parse_these_chains_only, 112 | 
parse_all_atoms=args.ligand_mpnn_use_side_chain_context, 113 | parse_atoms_with_zero_occupancy=args.parse_atoms_with_zero_occupancy 114 | ) 115 | # make chain_letter + residue_idx + insertion_code mapping to integers 116 | R_idx_list = list(protein_dict["R_idx"].cpu().numpy()) # residue indices 117 | chain_letters_list = list(protein_dict["chain_letters"]) # chain letters 118 | encoded_residues = [] 119 | for i, R_idx_item in enumerate(R_idx_list): 120 | tmp = str(chain_letters_list[i]) + str(R_idx_item) + icodes[i] 121 | encoded_residues.append(tmp) 122 | encoded_residue_dict = dict(zip(encoded_residues, range(len(encoded_residues)))) 123 | encoded_residue_dict_rev = dict( 124 | zip(list(range(len(encoded_residues))), encoded_residues) 125 | ) 126 | 127 | fixed_positions = torch.tensor( 128 | [int(item not in fixed_residues) for item in encoded_residues], 129 | device=device, 130 | ) 131 | redesigned_positions = torch.tensor( 132 | [int(item not in redesigned_residues) for item in encoded_residues], 133 | device=device, 134 | ) 135 | 136 | # specify which residues are buried for checkpoint_per_residue_label_membrane_mpnn model 137 | if args.transmembrane_buried: 138 | buried_residues = [item for item in args.transmembrane_buried.split()] 139 | buried_positions = torch.tensor( 140 | [int(item in buried_residues) for item in encoded_residues], 141 | device=device, 142 | ) 143 | else: 144 | buried_positions = torch.zeros_like(fixed_positions) 145 | 146 | if args.transmembrane_interface: 147 | interface_residues = [item for item in args.transmembrane_interface.split()] 148 | interface_positions = torch.tensor( 149 | [int(item in interface_residues) for item in encoded_residues], 150 | device=device, 151 | ) 152 | else: 153 | interface_positions = torch.zeros_like(fixed_positions) 154 | protein_dict["membrane_per_residue_labels"] = 2 * buried_positions * ( 155 | 1 - interface_positions 156 | ) + 1 * interface_positions * (1 - buried_positions) 157 | 158 | if 
args.model_type == "global_label_membrane_mpnn": 159 | protein_dict["membrane_per_residue_labels"] = ( 160 | args.global_transmembrane_label + 0 * fixed_positions 161 | ) 162 | if type(args.chains_to_design) == str: 163 | chains_to_design_list = args.chains_to_design.split(",") 164 | else: 165 | chains_to_design_list = protein_dict["chain_letters"] 166 | chain_mask = torch.tensor( 167 | np.array( 168 | [ 169 | item in chains_to_design_list 170 | for item in protein_dict["chain_letters"] 171 | ], 172 | dtype=np.int32, 173 | ), 174 | device=device, 175 | ) 176 | 177 | # create chain_mask to notify which residues are fixed (0) and which need to be designed (1) 178 | if redesigned_residues: 179 | protein_dict["chain_mask"] = chain_mask * (1 - redesigned_positions) 180 | elif fixed_residues: 181 | protein_dict["chain_mask"] = chain_mask * fixed_positions 182 | else: 183 | protein_dict["chain_mask"] = chain_mask 184 | 185 | if args.verbose: 186 | PDB_residues_to_be_redesigned = [ 187 | encoded_residue_dict_rev[item] 188 | for item in range(protein_dict["chain_mask"].shape[0]) 189 | if protein_dict["chain_mask"][item] == 1 190 | ] 191 | PDB_residues_to_be_fixed = [ 192 | encoded_residue_dict_rev[item] 193 | for item in range(protein_dict["chain_mask"].shape[0]) 194 | if protein_dict["chain_mask"][item] == 0 195 | ] 196 | print("These residues will be redesigned: ", PDB_residues_to_be_redesigned) 197 | print("These residues will be fixed: ", PDB_residues_to_be_fixed) 198 | 199 | # specify which residues are linked 200 | if args.symmetry_residues: 201 | symmetry_residues_list_of_lists = [ 202 | x.split(",") for x in args.symmetry_residues.split("|") 203 | ] 204 | remapped_symmetry_residues = [] 205 | for t_list in symmetry_residues_list_of_lists: 206 | tmp_list = [] 207 | for t in t_list: 208 | tmp_list.append(encoded_residue_dict[t]) 209 | remapped_symmetry_residues.append(tmp_list) 210 | else: 211 | remapped_symmetry_residues = [[]] 212 | 213 | if args.homo_oligomer: 214 
| if args.verbose: 215 | print("Designing HOMO-OLIGOMER") 216 | chain_letters_set = list(set(chain_letters_list)) 217 | reference_chain = chain_letters_set[0] 218 | lc = len(reference_chain) 219 | residue_indices = [ 220 | item[lc:] for item in encoded_residues if item[:lc] == reference_chain 221 | ] 222 | remapped_symmetry_residues = [] 223 | for res in residue_indices: 224 | tmp_list = [] 225 | tmp_w_list = [] 226 | for chain in chain_letters_set: 227 | name = chain + res 228 | tmp_list.append(encoded_residue_dict[name]) 229 | tmp_w_list.append(1 / len(chain_letters_set)) 230 | remapped_symmetry_residues.append(tmp_list) 231 | 232 | # set other atom bfactors to 0.0 233 | if other_atoms: 234 | other_bfactors = other_atoms.getBetas() 235 | other_atoms.setBetas(other_bfactors * 0.0) 236 | 237 | # adjust input PDB name by dropping .pdb if it does exist 238 | name = pdb[pdb.rfind("/") + 1 :] 239 | if name[-4:] == ".pdb": 240 | name = name[:-4] 241 | 242 | with torch.no_grad(): 243 | # run featurize to remap R_idx and add batch dimension 244 | if args.verbose: 245 | if "Y" in list(protein_dict): 246 | atom_coords = protein_dict["Y"].cpu().numpy() 247 | atom_types = list(protein_dict["Y_t"].cpu().numpy()) 248 | atom_mask = list(protein_dict["Y_m"].cpu().numpy()) 249 | number_of_atoms_parsed = np.sum(atom_mask) 250 | else: 251 | print("No ligand atoms parsed") 252 | number_of_atoms_parsed = 0 253 | atom_types = "" 254 | atom_coords = [] 255 | if number_of_atoms_parsed == 0: 256 | print("No ligand atoms parsed") 257 | elif args.model_type == "ligand_mpnn": 258 | print( 259 | f"The number of ligand atoms parsed is equal to: {number_of_atoms_parsed}" 260 | ) 261 | for i, atom_type in enumerate(atom_types): 262 | print( 263 | f"Type: {element_dict_rev[atom_type]}, Coords {atom_coords[i]}, Mask {atom_mask[i]}" 264 | ) 265 | feature_dict = featurize( 266 | protein_dict, 267 | cutoff_for_score=args.ligand_mpnn_cutoff_for_score, 268 | 
use_atom_context=args.ligand_mpnn_use_atom_context, 269 | number_of_ligand_atoms=atom_context_num, 270 | model_type=args.model_type, 271 | ) 272 | feature_dict["batch_size"] = args.batch_size 273 | B, L, _, _ = feature_dict["X"].shape # batch size should be 1 for now. 274 | # add additional keys to the feature dictionary 275 | feature_dict["symmetry_residues"] = remapped_symmetry_residues 276 | 277 | logits_list = [] 278 | probs_list = [] 279 | log_probs_list = [] 280 | decoding_order_list = [] 281 | for _ in range(args.number_of_batches): 282 | feature_dict["randn"] = torch.randn( 283 | [feature_dict["batch_size"], feature_dict["mask"].shape[1]], 284 | device=device, 285 | ) 286 | if args.autoregressive_score: 287 | score_dict = model.score(feature_dict, use_sequence=args.use_sequence) 288 | elif args.single_aa_score: 289 | score_dict = model.single_aa_score(feature_dict, use_sequence=args.use_sequence) 290 | else: 291 | print("Set either autoregressive_score or single_aa_score to True") 292 | sys.exit() 293 | logits_list.append(score_dict["logits"]) 294 | log_probs_list.append(score_dict["log_probs"]) 295 | probs_list.append(torch.exp(score_dict["log_probs"])) 296 | decoding_order_list.append(score_dict["decoding_order"]) 297 | log_probs_stack = torch.cat(log_probs_list, 0) 298 | logits_stack = torch.cat(logits_list, 0) 299 | probs_stack = torch.cat(probs_list, 0) 300 | decoding_order_stack = torch.cat(decoding_order_list, 0) 301 | 302 | output_stats_path = base_folder + name + args.file_ending + ".pt" 303 | out_dict = {} 304 | out_dict["logits"] = logits_stack.cpu().numpy() 305 | out_dict["probs"] = probs_stack.cpu().numpy() 306 | out_dict["log_probs"] = log_probs_stack.cpu().numpy() 307 | out_dict["decoding_order"] = decoding_order_stack.cpu().numpy() 308 | out_dict["native_sequence"] = feature_dict["S"][0].cpu().numpy() 309 | out_dict["mask"] = feature_dict["mask"][0].cpu().numpy() 310 | out_dict["chain_mask"] = feature_dict["chain_mask"][0].cpu().numpy() 
#this affects decoding order 311 | out_dict["seed"] = seed 312 | out_dict["alphabet"] = alphabet 313 | out_dict["residue_names"] = encoded_residue_dict_rev 314 | 315 | mean_probs = np.mean(out_dict["probs"], 0) 316 | std_probs = np.std(out_dict["probs"], 0) 317 | sequence = [restype_int_to_str[AA] for AA in out_dict["native_sequence"]] 318 | mean_dict = {} 319 | std_dict = {} 320 | for residue in range(L): 321 | mean_dict_ = dict(zip(alphabet, mean_probs[residue])) 322 | mean_dict[encoded_residue_dict_rev[residue]] = mean_dict_ 323 | std_dict_ = dict(zip(alphabet, std_probs[residue])) 324 | std_dict[encoded_residue_dict_rev[residue]] = std_dict_ 325 | 326 | out_dict["sequence"] = sequence 327 | out_dict["mean_of_probs"] = mean_dict 328 | out_dict["std_of_probs"] = std_dict 329 | torch.save(out_dict, output_stats_path) 330 | 331 | 332 | 333 | if __name__ == "__main__": 334 | argparser = argparse.ArgumentParser( 335 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 336 | ) 337 | 338 | argparser.add_argument( 339 | "--model_type", 340 | type=str, 341 | default="protein_mpnn", 342 | help="Choose your model: protein_mpnn, ligand_mpnn, per_residue_label_membrane_mpnn, global_label_membrane_mpnn, soluble_mpnn", 343 | ) 344 | # protein_mpnn - original ProteinMPNN trained on the whole PDB exluding non-protein atoms 345 | # ligand_mpnn - atomic context aware model trained with small molecules, nucleotides, metals etc on the whole PDB 346 | # per_residue_label_membrane_mpnn - ProteinMPNN model trained with addition label per residue specifying if that residue is buried or exposed 347 | # global_label_membrane_mpnn - ProteinMPNN model trained with global label per PDB id to specify if protein is transmembrane 348 | # soluble_mpnn - ProteinMPNN trained only on soluble PDB ids 349 | argparser.add_argument( 350 | "--checkpoint_protein_mpnn", 351 | type=str, 352 | default="./model_params/proteinmpnn_v_48_020.pt", 353 | help="Path to model weights.", 354 | ) 355 | 
# --- model checkpoints -------------------------------------------------------
argparser.add_argument(
    "--checkpoint_ligand_mpnn",
    type=str,
    default="./model_params/ligandmpnn_v_32_010_25.pt",
    help="Path to model weights.",
)
argparser.add_argument(
    "--checkpoint_per_residue_label_membrane_mpnn",
    type=str,
    default="./model_params/per_residue_label_membrane_mpnn_v_48_020.pt",
    help="Path to model weights.",
)
argparser.add_argument(
    "--checkpoint_global_label_membrane_mpnn",
    type=str,
    default="./model_params/global_label_membrane_mpnn_v_48_020.pt",
    help="Path to model weights.",
)
argparser.add_argument(
    "--checkpoint_soluble_mpnn",
    type=str,
    default="./model_params/solublempnn_v_48_020.pt",
    help="Path to model weights.",
)

argparser.add_argument("--verbose", type=int, default=1, help="Print stuff")

# --- input structures --------------------------------------------------------
argparser.add_argument(
    "--pdb_path", type=str, default="", help="Path to the input PDB."
)
argparser.add_argument(
    "--pdb_path_multi",
    type=str,
    default="",
    help="Path to json listing PDB paths. {'/path/to/pdb': ''} - only keys will be used.",
)

# --- residue selection (single-PDB and per-PDB json variants) ----------------
argparser.add_argument(
    "--fixed_residues",
    type=str,
    default="",
    help="Provide fixed residues, A12 A13 A14 B2 B25",
)
argparser.add_argument(
    "--fixed_residues_multi",
    type=str,
    default="",
    help="Path to json mapping of fixed residues for each pdb i.e., {'/path/to/pdb': 'A12 A13 A14 B2 B25'}",
)

argparser.add_argument(
    "--redesigned_residues",
    type=str,
    default="",
    help="Provide to be redesigned residues, everything else will be fixed, A12 A13 A14 B2 B25",
)
argparser.add_argument(
    "--redesigned_residues_multi",
    type=str,
    default="",
    help="Path to json mapping of redesigned residues for each pdb i.e., {'/path/to/pdb': 'A12 A13 A14 B2 B25'}",
)

# --- symmetry / oligomer options ---------------------------------------------
argparser.add_argument(
    "--symmetry_residues",
    type=str,
    default="",
    help="Add list of lists for which residues need to be symmetric, e.g. 'A12,A13,A14|C2,C3|A5,B6'",
)

argparser.add_argument(
    "--homo_oligomer",
    type=int,
    default=0,
    help="Setting this to 1 will automatically set --symmetry_residues and --symmetry_weights to do homooligomer design with equal weighting.",
)

# --- output options ----------------------------------------------------------
argparser.add_argument(
    "--out_folder",
    type=str,
    help="Path to a folder to output scores, e.g. /home/out/",
)
argparser.add_argument(
    "--file_ending", type=str, default="", help="adding_string_to_the_end"
)
argparser.add_argument(
    "--zero_indexed",
    # NOTE(fix): was type=str with an integer default; this is a 0/1 flag like
    # --verbose/--homo_oligomer, and a string "0" would be truthy downstream.
    type=int,
    default=0,
    help="1 - to start output PDB numbering with 0",
)

# --- sampling / batching -----------------------------------------------------
argparser.add_argument(
    "--seed",
    type=int,
    default=0,
    help="Set seed for torch, numpy, and python random.",
)
argparser.add_argument(
    "--batch_size",
    type=int,
    default=1,
    help="Number of sequence to generate per one pass.",
)
argparser.add_argument(
    "--number_of_batches",
    type=int,
    default=1,
    help="Number of times to design sequence using a chosen batch size.",
)

# --- ligand_mpnn-specific options --------------------------------------------
argparser.add_argument(
    "--ligand_mpnn_use_atom_context",
    type=int,
    default=1,
    help="1 - use atom context, 0 - do not use atom context.",
)

argparser.add_argument(
    "--ligand_mpnn_use_side_chain_context",
    type=int,
    default=0,
    help="Flag to use side chain atoms as ligand context for the fixed residues",
)

argparser.add_argument(
    "--ligand_mpnn_cutoff_for_score",
    type=float,
    default=8.0,
    help="Cutoff in angstroms between protein and context atoms to select residues for reporting score.",
)

# --- chain selection ---------------------------------------------------------
argparser.add_argument(
    "--chains_to_design",
    type=str,
    default=None,
    help="Specify which chains to redesign, all others will be kept fixed.",
)

argparser.add_argument(
    "--parse_these_chains_only",
    type=str,
    default="",
    help="Provide chains letters for parsing backbones, 'ABCF'",
)

# --- membrane-model labels ---------------------------------------------------
argparser.add_argument(
    "--transmembrane_buried",
    type=str,
    default="",
    help="Provide buried residues when using checkpoint_per_residue_label_membrane_mpnn model, A12 A13 A14 B2 B25",
)

argparser.add_argument( 507 | "--transmembrane_interface", 508 | type=str, 509 | default="", 510 | help="Provide interface residues when using checkpoint_per_residue_label_membrane_mpnn model, A12 A13 A14 B2 B25", 511 | ) 512 | 513 | argparser.add_argument( 514 | "--global_transmembrane_label", 515 | type=int, 516 | default=0, 517 | help="Provide global label for global_label_membrane_mpnn model. 1 - transmembrane, 0 - soluble", 518 | ) 519 | 520 | argparser.add_argument( 521 | "--parse_atoms_with_zero_occupancy", 522 | type=int, 523 | default=0, 524 | help="To parse atoms with zero occupancy in the PDB input files. 0 - do not parse, 1 - parse atoms with zero occupancy", 525 | ) 526 | 527 | argparser.add_argument( 528 | "--use_sequence", 529 | type=int, 530 | default=1, 531 | help="1 - get scores using amino acid sequence info; 0 - get scores using backbone info only", 532 | ) 533 | 534 | argparser.add_argument( 535 | "--autoregressive_score", 536 | type=int, 537 | default=0, 538 | help="1 - run autoregressive scoring function; p(AA_1|backbone); p(AA_2|backbone, AA_1) etc, 0 - False", 539 | ) 540 | 541 | argparser.add_argument( 542 | "--single_aa_score", 543 | type=int, 544 | default=1, 545 | help="1 - run single amino acid scoring function; p(AA_i|backbone, AA_{all except ith one}), 0 - False", 546 | ) 547 | 548 | args = argparser.parse_args() 549 | main(args) 550 | -------------------------------------------------------------------------------- /training/README.md: -------------------------------------------------------------------------------- 1 | ## Retraining LigandMPNN 2 | 3 | Training PDB ids: `train.json` 4 | 5 | Validation PDB ids: `valid.json` 6 | 7 | Test PDB ids: `test_small_molecule.json, test_nucleotide.json, test_metal.json` 8 | -------------------------------------------------------------------------------- /training/test_metal.json: -------------------------------------------------------------------------------- 1 | ["1dwh", "1e4m", "1e6s", "1e72", 
"1f35", "1fee", "1job", "1lqk", "1m5e", "1m5f", "1moj", "1mxy", "1mxz", "1my1", "1nki", "1qum", "1sgf", "1t31", "1u3e", "2bdh", "2bx2", "2cfv", "2e6c", "2nq9", "2nqj", "2nz6", "2ou7", "2vxx", "2zwn", "3bvx", "3cv5", "3f4v", "3f5l", "3fgg", "3hg9", "3hkn", "3hkt", "3i9z", "3k7r", "3l24", "3l7t", "3m7p", "3mi9", "3o1u", "3u92", "3u93", "3u94", "3won", "4aoj", "4dy1", "4hzt", "4i0f", "4i0j", "4i0z", "4i11", "4i12", "4jd1", "4naz", "4wd8", "4x68", "5f55", "5f56", "5fgs", "5hez", "5i4j", "5l70", "5vde", "6a4x", "6buu", "6cyt", "6iv2", "6lkp", "6lrd", "6wdz", "6x75", "7dnr", "7e34", "7kii", "7n7g", "7s7l", "7s7m", "7w5e", "7wb2"] -------------------------------------------------------------------------------- /training/test_nucleotide.json: -------------------------------------------------------------------------------- 1 | ["1a0a", "1am9", "1an4", "1b01", "1bc7", "1bc8", "1di2", "1ec6", "1hlo", "1hlv", "1i3j", "1pvi", "1qum", "1sfu", "1u3e", "1xpx", "1yo5", "1zx4", "2c5r", "2c62", "2nq9", "2o4a", "2p5l", "2xdb", "2ypb", "2zhg", "2zio", "3adl", "3bsu", "3fc3", "3g73", "3gna", "3gx4", "3lsr", "3mj0", "3mva", "3n7q", "3olt", "3vok", "3vwb", "3zp5", "4ato", "4bhm", "4bqa", "4e0p", "4nid", "4wal", "5cm3", "5haw", "5mht", "5vc9", "5w9s", "5ybd", "6bjv", "6dnw", "6fqr", "6gdr", "6kbs", "6lff", "6lmj", "6od4", "6wdz", "6x70", "6y93", "7bca", "7c0g", "7el3", "7jsa", "7ju3", "7kii", "7kij", "7mtl", "7z0u", "8dwm"] -------------------------------------------------------------------------------- /training/test_small_molecule.json: -------------------------------------------------------------------------------- 1 | ["1a28", "1bzc", "1drv", "1e3g", "1elb", "1elc", "1epo", "1f0r", "1g7f", "1g7g", "1gvw", "1gx8", "1i37", "1kav", "1kdk", "1kv1", "1l8g", "1lhu", "1lpg", "1nc1", "1nfx", "1nhz", "1nl9", "1nny", "1nwl", "1ony", "1pyn", "1qb1", "1qkt", "1qxk", "1r0p", "1sj0", "1sqn", "1v2n", "1xjd", "1xws", "1yc1", "1yqj", "1z95", "1zp8", "2ayr", "2b07", "2b4l", "2baj", "2bak", "2bal", 
"2bsm", "2cet", "2e2r", "2f6t", "2fdp", "2g94", "2hah", "2ihq", "2iwx", "2j2u", "2j34", "2j4i", "2j94", "2j95", "2o0u", "2oax", "2ojg", "2ojj", "2p4j", "2p7g", "2p7z", "2pog", "2qbp", "2qbq", "2qbs", "2qe4", "2qmg", "2uwl", "2uwo", "2uwp", "2v7a", "2vh0", "2vh6", "2vkm", "2vrj", "2vw5", "2vwc", "2w8y", "2wc3", "2web", "2wec", "2weq", "2wgj", "2wuf", "2wyg", "2wyj", "2xab", "2xb8", "2xda", "2xht", "2xj1", "2xj2", "2xjg", "2xjx", "2y7x", "2y7z", "2y80", "2y81", "2y82", "2ydw", "2yek", "2yel", "2yfe", "2yfx", "2yge", "2ygf", "2yi0", "2yi7", "2yix", "2zmm", "3acw", "3acx", "3b5r", "3b65", "3bgq", "3bgz", "3ckp", "3cow", "3coy", "3coz", "3d7z", "3d83", "3eax", "3ekr", "3fv1", "3fv2", "3fvk", "3gba", "3gbb", "3gcs", "3gcu", "3gy3", "3hek", "3i25", "3ioc", "3iph", "3iw6", "3k97", "3lpi", "3lpk", "3lxk", "3m35", "3myg", "3n76", "3nq3", "3nyx", "3o5x", "3o8p", "3pww", "3roc", "3tfn", "3u81", "3ueu", "3uev", "3uew", "3uex", "3vha", "3vhc", "3vhd", "3vje", "3vvy", "3vw1", "3vw2", "3wha", "3wz6", "3wz8", "3zc5", "3zm9", "3zze", "4a4v", "4a4w", "4a7i", "4ag8", "4ap7", "4b6o", "4b9k", "4cd0", "4cga", "4cmo", "4da5", "4e5w", "4e6d", "4e9u", "4ea2", "4egk", "4er1", "4fcq", "4ffs", "4flp", "4g8n", "4gny", "4gu6", "4hge", "4igt", "4k0y", "4k9y", "4kao", "4kcx", "4lyw", "4m0r", "4m12", "4m13", "4muf", "4nh8", "4nwc", "4o04", "4o05", "4o07", "4o09", "4o0b", "4p5z", "4pmm", "4pop", "4qev", "4qew", "4qyy", "4rfm", "4rwj", "4twp", "4uyf", "4v01", "4w9f", "4w9l", "4wa9", "4wkn", "4x6p", "4xip", "4xir", "4y79", "4ybk", "4ymb", "4yml", "4ynb", "4yth", "4z0k", "4zae", "5aa9", "5acy", "5d26", "5d3h", "5d3j", "5d3l", "5d3t", "5dlx", "5dqc", "5dwr", "5e74", "5egm", "5eng", "5eqp", "5eqy", "5er1", "5exm", "5exn", "5f9b", "5fto", "5fut", "5hcv", "5i3v", "5i3y", "5i9x", "5i9z", "5ie1", "5ih9", "5jq5", "5kz0", "5l2s", "5lli", "5lny", "5lsg", "5neb", "5nw1", "5nyh", "5op5", "5oq8", "5qqp", "5t19", "5tpx", "5v82", "5yfs", "5yft", "6c2r", "6cjr", "6cpw", "6dgq", "6dgr", "6dyu", "6dyv", "6el5", "6elo", 
"6elp", "6ey9", "6eyb", "6f1n", "6ge7", "6gf9", "6gfs", "6ghh", "6i61", "6i64", "6i67", "6md0", "6mh1", "6mh7", "6n7a", "6n8x", "6no9", "6nv7", "6nv9", "6olx", "6qi7"] --------------------------------------------------------------------------------