├── .gitignore
├── CHANGELOG.md
├── LICENSE
├── README.md
├── build.sh
├── clean.sh
├── pyproject.toml
├── setup.py
├── src
    └── afpdb
    │   ├── __init__.py
    │   ├── afpdb.py
    │   ├── aiparser.py
    │   ├── mol3D.py
    │   ├── myalphafold
    │       ├── __init__.py
    │       ├── common
    │       │   ├── protein.py
    │       │   ├── residue_constants.py
    │       │   └── stereo_chemical_props.txt
    │       └── model
    │       │   └── utils.py
    │   ├── mycolabdesign
    │       ├── __init__.py
    │       ├── getpdb.py
    │       ├── protein.py
    │       └── utils.py
    │   ├── mycolabfold
    │       ├── __init__.py
    │       └── utils.py
    │   ├── mypymol.py
    │   ├── thread_seq.py
    │   └── util.py
├── tests
    ├── 1a3d.pdb
    ├── 5cil.pdb
    ├── 5cil_100.pdb
    ├── fake.pdb
    └── test_all.py
└── tutorial
    ├── AI.ipynb
    ├── AI.pdf
    ├── Afpdb_Tutorial.docx
    ├── Afpdb_Tutorial.pdf
    ├── Developer.ipynb
    ├── Developer.pdf
    ├── afpdb.ipynb
    ├── afpdb.pdf
    ├── example_files
        ├── 1a3d.pdb
        ├── 5cil.pdb
        ├── 5cil_100.pdb
        ├── 5cil_AF.json
        ├── 5cil_AF.pdb
        ├── 5cil_ab_AF.pdb
        ├── 5cil_rfdiffuse_H3.pdb
        ├── esmfold.pdb
        └── fake.pdb
    ├── img
        ├── .DS_Store
        ├── afpdb.png
        ├── afpdb_numpy.png
        ├── demo.png
        ├── drSASA.png
        └── mypm.png
    └── mypm.png


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | 
 5 | # C extensions
 6 | *.so
 7 | 
 8 | # Distribution / packaging
 9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | *.egg-info/
23 | .installed.cfg
24 | *.egg
25 | 
26 | # PyInstaller
27 | #  Usually these files are written by a python script from a template
28 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
29 | *.manifest
30 | *.spec
31 | 
32 | # Installer logs
33 | pip-log.txt
34 | pip-delete-this-directory.txt
35 | 
36 | # Unit test / coverage reports
37 | htmlcov/
38 | .tox/
39 | .coverage
40 | .coverage.*
41 | .cache
42 | nosetests.xml
43 | coverage.xml
44 | *,cover
45 | 
46 | # Translations
47 | *.mo
48 | *.pot
49 | 
50 | # Django stuff:
51 | *.log
52 | 
53 | # Sphinx documentation
54 | docs/_build/
55 | 
56 | # PyBuilder
57 | target/
58 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # Change Log
 2 | 
 3 | ## v0.2.3
 4 | 
 5 | ### Feature
 6 | 
 7 | - Added aiparser.LigandMPNNParser
 8 | - Added aiparser.ProteinMPNNParser.make_structure
 9 | - Improved thread_sequence to support rl_from and rl_to, remove chain_map
10 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 Yingyao Zhou
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | Source Code Origins:
24 | 
25 | afpdb/mycolabdesign folder contains code adapted from the ColabDesign package:
26 | 	https://github.com/sokrypton/ColabDesign
27 | 
28 | afpdb/mycolabfold folder contains code adapted from the ColabFold package:
29 | 	https://github.com/sokrypton/ColabFold
30 | 
31 | afpdb/myalphafold folder contains code adapted from the AlphaFold package:
32 | 	https://github.com/google-deepmind/alphafold
33 | 
34 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Afpdb - An Efficient Protein Structure Manipulation Tool
  2 | 
  3 | <a href="https://pypi.org/project/afpdb" rel="nofollow">
  4 | <img alt="PyPI - Version" src="https://img.shields.io/pypi/v/afpdb?logo=pypi">
  5 | </a>
  6 | <a href="https://anaconda.org/bioconda/afpdb" rel="nofollow">
  7 | <img alt="Conda Version" src="https://img.shields.io/conda/vn/bioconda/afpdb">
  8 | </a>
  9 | 
 10 | The advent of AlphaFold and other protein AI models has transformed protein design, necessitating efficient handling of large-scale data and complex workflows. Traditional programming packages, developed before these AI advancements, often lead to inefficiencies in coding and slow execution. To bridge this gap, we introduce Afpdb, a high-performance Python module built on AlphaFold’s NumPy architecture. Afpdb leverages RFDiffusion's contig syntax to streamline residue and atom selection, making coding simpler and more readable. By integrating PyMOL’s visualization capabilities, Afpdb enables automatic visual quality control, enhancing productivity in structural biology. With over 180 methods commonly used in protein AI design, Afpdb supports the development of concise, high-performance code, addressing the limitations of existing tools like Biopython. Afpdb is designed to complement powerful AI models such as AlphaFold and ProteinMPNN, providing the additional utility needed to effectively manipulate protein structures and drive innovation in protein design.
 11 | 
 12 | Please read our short article published in <a href="https://doi.org/10.1093/bioinformatics/btae654">Bioinformatics</a>.
 13 | 
 14 | <img src="https://github.com/data2code/afpdb/blob/main/tutorial/img/afpdb.png?raw=true">
 15 | 
 16 | ## Tutorial
 17 | 
 18 | The tutorial book is available in <a href="tutorial/Afpdb_Tutorial.pdf">PDF</a>.
 19 | 
 20 | The best way to learn and practice Afpdb is to open [Tutorial Notebook](https://colab.research.google.com/github/data2code/afpdb/blob/main/tutorial/afpdb.ipynb) in Google Colab.
 21 | 
 22 | Table of Contents
 23 | 
 24 | 1. Demo
 25 | 2. Fundamental Concepts
 26 |    - Internal Data Structure
 27 |    - Contig 
 28 | 3. Selection
 29 |    - Atom Selection
 30 |    - Residue Selection
 31 |    - Residue List
 32 | 4. Read/Write
 33 | 5. Sequence & Chain
 34 | 6. Geometry, Measurement, & Visualization
 35 |    - Select Neighboring Residues
 36 |    - Display
 37 |    - B-factors
 38 |    - PyMOL Interface
 39 |    - RMSD
 40 |    - Solvent-Accessible Surface Area (SASA)
 41 |    - Secondary Structures - DSSP
 42 |    - Internal Coordinates
 43 | 7. Object Manipulation
 44 |    - Move Objects
 45 |    - Align
 46 |    - Split & Merge Objects
 47 | 8. Parsers for AI Models
 48 | 
 49 | ## AI Use Cases
 50 | 
 51 | Interested in applying Afpdb to AI protein design? Open [AI Use Case Notebook](https://colab.research.google.com/github/data2code/afpdb/blob/main/tutorial/AI.ipynb) in Google Colab.
 52 | 
 53 | Table of Contents
 54 | 
 55 | - Example AI Protein Design Use Cases
 56 |    - Handle Missing Residues in AlphaFold Prediction
 57 |    - Structure Prediction with ESMFold
 58 |    - Create Side Chains for de novo Designed Proteins
 59 |    - Compute Binding Scores in EvoPro
 60 | 
 61 | ## Developer's Note
 62 | 
 63 | Open [Developer Notebook](https://colab.research.google.com/github/data2code/afpdb/blob/main/tutorial/Developer.ipynb) in Google Colab.
 64 | 
 65 | ## Install
 66 | Stable version:
 67 | ```
 68 | pip install afpdb
 69 | ```
 70 | or
 71 | ```
 72 | conda install bioconda::afpdb
 73 | ```
 74 | Development version:
 75 | ```
 76 | pip install git+https://github.com/data2code/afpdb.git
 77 | ```
 78 | or
 79 | ```
 80 | git clone https://github.com/data2code/afpdb.git
 81 | cd afpdb
 82 | pip install .
 83 | ```
 84 | To import the package use:
 85 | ```
 86 | from afpdb.afpdb import Protein,RS,RL,ATS
 87 | ```
 88 | ## Demo
 89 | 
 90 | ### Structure Read & Summary
 91 | ```
 92 | # load the ab-ag complex structure 5CIL from PDB
 93 | p=Protein("5cil")
 94 | # show key statistics summary of the structure
 95 | p.summary().display()
 96 | ```
 97 | Output
 98 | ```
 99 |     Chain    Sequence                    Length    #Missing Residues    #Insertion Code    First Residue Name    Last Residue Name
100 | --  -------  ---------------------------------------------------------------------------------------------------------------------
101 |  0  H        VQLVQSGAEVKRPGSSVTVS...        220                   20                 14                     2                  227
102 |  1  L        EIVLTQSPGTQSLSPGERAT...        212                    0                  1                     1                  211
103 |  2  P        NWFDITNWLWYIK                   13                    0                  0                   671                  683
104 | ```
105 | ### Residue Relabeling
106 | 
107 | ```
108 | print("Old P chain residue numbering:", p.rs("P").name(), "\n")
109 | 
110 | Output:
111 | Old P chain residue numbering: ['671', '672', '673', '674', '675', '676', '677', '678', '679', '680', '681', '682', '683'] 
112 | 
113 | p.renumber("RESTART", inplace=True)
114 | print("New P chain residue numbering:", p.rs("P").name(), "\n")
115 | 
116 | Output:
117 | New P chain residue numbering: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13'] 
118 | 
119 | p.summary()
120 | ```
121 | Output
122 | 
123 | ```
124 |     Chain    Sequence                    Length    #Missing Residues    #Insertion Code    First Residue Name    Last Residue Name
125 | --  -------  ---------------------------------------------------------------------------------------------------------------------
126 |  0  H        VQLVQSGAEVKRPGSSVTVS...        220                   20                 14                     1                  226
127 |  1  L        EIVLTQSPGTQSLSPGERAT...        212                    0                  1                     1                  211
128 |  2  P        NWFDITNWLWYIK                   13                    0                  0                     1                   13
129 |  ```
130 | ### Replace Missing Residues for AI Prediction
131 | ```
132 | print("Sequence for AlphaFold modeling, with missing residues replaced by Glycine:")
133 | print(">5cil\n"+p.seq(gap="G")+"\n")
134 | ```
135 | Output
136 | ```
137 | Sequence for AlphaFold modeling, with missing residues replaced by Glycine:
138 | >5cil
139 | VQLVQSGAEVKRPGSSVTVSCKASGGSFSTYALSWVRQAPGRGLEWMGGVIPLLTITNYAPRFQGRITITADRSTSTAYLELNSLRPEDTAVYYCAREGTTGDGDLGKPIGAFAHWGQGTLVTVSSASTKGPSVFPLAPSGGGGGGGGGTAALGCLVKDYFPEPVTVGSWGGGGNSGALTSGGVHTFPAVLQSGSGLYSLSSVVTVPSSSLGTGGQGTYICNVNHKPSNTKVDKKGGVEP:EIVLTQSPGTQSLSPGERATLSCRASQSVGNNKLAWYQQRPGQAPRLLIYGASSRPSGVADRFSGSGSGTDFTLTISRLEPEDFAVYYCQQYGQSLSTFGQGTKVEVKRTVAAPSVFIFPPSDEQLKSGTASVVCLLNNFYPREAKVQWKVDNALQSGNSQESVTEQDSKDSTYSLSSTLTLSKADYEKHKVYACEVTHQGLSSPVTKSFNR:NWFDITNWLWYIK
140 | ```
141 | ### Interface Computing
142 | ```
143 | # identify H,L chain residues within 4A to antigen P chain
144 | rs_binder, rs_seed, df_dist=p.rs_around("P", dist=4)
145 | 
146 | # show the distance of binder residues to antigen P chain
147 | df_dist[:5].display()
148 | ```
149 | Output
150 | ```
151 |      chain_a      resn_a    resn_i_a    resi_a  res_a    chain_b    resn_b      resn_i_b    resi_b  res_b       dist  atom_a    atom_b
152 | ---  ---------  --------  ----------  --------  -------  ---------  --------  ----------  --------  -------  -------  --------  --------
153 | 408  P                 6           6       437  T        H          94                94        97  E        2.63625  OG1       OE2
154 | 640  P                 4           4       435  D        L          32                32       252  K        2.81482  OD1       NZ
155 | 807  P                 2           2       433  W        L          94                94       314  S        2.91194  N         OG
156 | 767  P                 1           1       432  N        L          91                91       311  Y        2.9295   ND2       O
157 | 526  P                 7           7       438  N        H          99E               99       107  K        3.03857  ND2       CE
158 | ```
159 | ### Residue Selection & Boolean Operations
160 | ```
161 | # create a new PDB file only containing the antigen and binder residues
162 | p=p.extract(rs_binder | "P")
163 | ```
164 | ### Structure I/O
165 | ```
166 | # save the new structure into a local PDB file
167 | p.save("binders.pdb")
168 | ```
169 | ### Structure Display within Jupyter Notebook
170 | ```
171 | # display the PDB structure; by default it is shown as a ribbon colored by chain.
172 | p.show(show_sidechains=True)
173 | ```
174 | Output (It will be 3D interactive within Jupyter Notebook)<br>
175 | <img src="https://github.com/data2code/afpdb/blob/main/tutorial/img/demo.png?raw=true">
176 | ### PyMOL Integration
177 | ```
178 | # convert the selection into a PyMOL selection command
179 | rs = (rs_binder | "P").str(format="PYMOL", rs_name="myint")
180 | cmd=f'''fetch 5cil, myobj; util.cbc;
181 | {rs}
182 | show sticks, myint; zoom myint; deselect;
183 | save binders.pse'''
184 | # generate a PyMOL session file binders.pse
185 | Protein.PyMOL().run(cmd)
186 | ```
187 | 


--------------------------------------------------------------------------------
/build.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Clean the tree, build the sdist/wheel and upload the result to PyPI.

# Stop at the first failing step so a broken build is never followed by an upload.
set -e

# clean.sh and the dist/ folder are addressed relative to this script, so run from its folder.
cd "$(dirname "$0")"

./clean.sh
python3 -m pip install --upgrade build
python3 -m build
python3 -m pip install --upgrade twine
#python3 -m twine upload --repository testpypi dist/*
python3 -m twine upload --repository pypi dist/*
8 | 


--------------------------------------------------------------------------------
/clean.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Delete build artifacts and the tx.pdb file, relative to the current directory.
for target in build dist src/afpdb.egg-info tx.pdb; do
    rm -rf "$target"
done
3 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "afpdb"
 3 | dynamic = ["version", "authors", "description", "requires-python", "license", "classifiers", "dependencies"]
 4 | readme = "README.md"
 5 | 
 6 | [build-system]
 7 | requires = ["setuptools>=61.0"]
 8 | build-backend = "setuptools.build_meta"
 9 | 
10 | [project.urls]
11 | homepage = "https://github.com/data2code/afpdb"
12 | documentation = "https://github.com/data2code/afpdb/tutorial"
13 | issues = "https://github.com/data2code/afpdb/issues"
14 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
# Packaging script for afpdb; pyproject.toml declares these fields as dynamic and reads them from here.
# NOTE(review): find_packages is imported but only referenced in the commented-out line below.
from setuptools import setup, find_packages

setup(
    name='afpdb',
    version='0.2.3',
    description='A Numpy-based PDB structure manipulation package',
    url='https://github.com/data2code/afpdb',
    author='Yingyao Zhou',
    author_email='yingyao.zhou@novartis.com',
    license= 'MIT',
    python_requires=">=3.7",
    zip_safe=False,
    # sources live under src/ (src-layout)
    package_dir={'': 'src'},
    #packages=find_packages(where='src'),
    # NOTE(review): this explicit list has no entry for the src/afpdb/myalphafold/model folder
    # (which holds utils.py), and no package_data entry for
    # myalphafold/common/stereo_chemical_props.txt -- confirm neither is needed at runtime.
    packages=["afpdb","afpdb.mycolabfold","afpdb.myalphafold","afpdb.myalphafold.common","afpdb.mycolabdesign"],
    install_requires=['pandas',
                      'numpy',
                      'scipy',
                      'biopython',
                      'dm-tree',
                      'py3Dmol',
                      'tabulate',
                      'requests'
                      ],

    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'Intended Audience :: Science/Research',
        'License :: OSI Approved :: MIT License',
        'Operating System :: POSIX :: Linux',
        'Programming Language :: Python :: 3.7',
    ],
)
34 | 


--------------------------------------------------------------------------------
/src/afpdb/__init__.py:
--------------------------------------------------------------------------------
1 | #from .afpdb import Protein,RS,RL,ATS
2 | #from . import util
3 | #from . import myalphafold
4 | #from . import mol3D
5 | #__all__ = ["Protein","RS","RL","ATS","util","myalphafold","mol3D"]
6 | 


--------------------------------------------------------------------------------
/src/afpdb/aiparser.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | from .afpdb import util
  3 | import os,re,glob,json
  4 | import numpy as np, pandas as pd,pickle
  5 | from Bio import SeqIO
  6 | from Bio.Seq import Seq
  7 | from .afpdb import Protein
  8 | 
  9 | class ESMFoldParser:
 10 |     """ESMFold output models in pdb files. To read pLDDT, use:
 11 |         p=Protein(pdb_file_name)
 12 |         p.b_factors()
 13 |     """
 14 | 
 15 |     def __init__(self, folder, relaxed=True):
 16 |         self.fd=folder
 17 |         self.data=self.parse()
 18 | 
 19 |     def parse(self):
 20 |         S=glob.glob(f"{self.fd}/*.pdb")
 21 |         out=[]
 22 |         for fn in S:
 23 |             out.append({'path':fn, 'name':re.sub(r'\.pdb$', '', os.path.basename(fn))})
 24 |         t=pd.DataFrame(out)
 25 |         #test_130a1/ptm0.374_r3_default.pae.txt
 26 |         S=glob.glob(f"{self.fd}/*.pae.txt")
 27 |         out=[]
 28 |         for fn in S:
 29 |             out.append({'pae_path':fn, 'name':re.sub(r'\.pae\.txt$', '', os.path.basename(fn))})
 30 |         if len(out):
 31 |             t2=pd.DataFrame(out)
 32 |             t=t.merge(t2, on=['name'], how='left')
 33 |         return t
 34 | 
 35 |     def get_pdb(self, idx=0):
 36 |         if len(self.data):
 37 |             return self.data.loc[idx, 'path']
 38 |         return None
 39 | 
 40 |     def get_pae(self, idx=0):
 41 |         ## the following was tested with https://colab.research.google.com/github/sokrypton/ColabFold/blob/main/ESMFold.ipynb
 42 |         if len(self.data) and 'pae_path' in self.data.header():
 43 |             fn=self.data.loc[idx, 'pae_path']
 44 |             t=pd.read_table(fn, sep=' ', header=None)
 45 |             return t.values
 46 |         return None
 47 | 
 48 |     def get_plddt(self, idx=0):
 49 |         fn=self.get_pdb(idx=idx)
 50 |         return Protein(fn).b_factors()
 51 | 
 52 | class ColabFoldParser:
 53 | 
 54 |     def __init__(self, folder):
 55 |         self.fd=folder
 56 |         self.data=self.parse()
 57 | 
 58 |     def parse(self, relaxed=True):
 59 |         S=glob.glob(f"{self.fd}/*.pdb")
 60 |         out=[]
 61 |         for fn in S:
 62 |             one={'path':fn, 'name':re.sub(r'\.pdb$', '', os.path.basename(fn))}
 63 |             one['relaxed']='_relaxed_' in fn
 64 |             if not relaxed and one['relaxed']: continue
 65 |             m = re.search(r'_model_(?P<model>\d+)', fn)
 66 |             if m is None: continue
 67 |             one['model']=int(m.group('model'))
 68 |             m = re.search(r'_rank_(?P<rank>\d+)', fn)
 69 |             one['ranking']=int(m.group('rank')) if m is not None else -1
 70 |             m = re.search(r'_seed_(?P<seed>\d+)', fn)
 71 |             one['seed']=int(m.group('seed')) if m is not None else -1
 72 |             out.append(one)
 73 |         t=pd.DataFrame(out)
 74 | 
 75 |         S=glob.glob(f"{self.fd}/*.json")
 76 |         out=[]
 77 |         for fn in S:
 78 |             one={'json_path':fn}
 79 |             m = re.search(r'_model_(?P<model>\d+)', fn)
 80 |             if m is None: continue
 81 |             one['model']=int(m.group('model'))
 82 |             m = re.search(r'_seed_(?P<seed>\d+)', fn)
 83 |             one['seed']=int(m.group('seed')) if m is not None else -1
 84 |             x=json.loads(util.read_string(fn))
 85 |             one['ptm']=x.get('ptm', None)
 86 |             one['iptm']=x.get('iptm', None)
 87 |             out.append(one)
 88 |         if len(out):
 89 |             t2=pd.DataFrame(out)
 90 |             t=t.merge(t2, on=['model','seed'], how='left')
 91 |         t.sort_values(['relaxed','ranking'], ascending=[False, True], inplace=True)
 92 |         t.index=range(len(t))
 93 |         return t
 94 | 
 95 |     def get_pdb(self, idx=0):
 96 |         if len(self.data):
 97 |             return self.data.loc[idx, 'path']
 98 |         return None
 99 | 
100 |     def get_ptm(self, idx=0):
101 |         if len(self.data):
102 |             return self.data.loc[idx, 'ptm']
103 |         return None
104 | 
105 |     def get_iptm(self, idx=0):
106 |         if len(self.data):
107 |             return self.data.loc[idx, 'iptm']
108 |         return None
109 | 
110 |     def get_pae(self, idx=0):
111 |         if len(self.data) and 'json_path' in self.data.header():
112 |             fn=self.data.loc[idx, 'json_path']
113 |             data=np.array(json.loads(util.read_string(fn)).get('pae'), None)
114 |             return data
115 |         return None
116 | 
117 |     def get_plddt(self, idx=0):
118 |         if len(self.data) and 'json_path' in self.data.header():
119 |             fn=self.data.loc[idx, 'json_path']
120 |             data=np.array(json.loads(util.read_string(fn)).get('plddt'), None)
121 |             return data
122 |         return None
123 | 
124 | #https://stackoverflow.com/questions/46857615/how-to-replace-objects-causing-import-errors-with-none-during-pickle-load
125 | #https://github.com/google-deepmind/alphafold/issues/629
class Dummy:
    """Placeholder class returned by MyUnpickler when a pickled class cannot be imported."""

    def __init__(self, *args, **kwargs):
        # Accept and ignore any constructor arguments (positional or keyword).
        pass
130 | 
class MyUnpickler(pickle._Unpickler):
    """Unpickler that substitutes ``Dummy`` for any class it fails to resolve."""

    def find_class(self, module, name):
        """Resolve ``module.name`` as the base class does; on any error return ``Dummy``."""
        try:
            resolved = super().find_class(module, name)
        except Exception:
            resolved = Dummy
        return resolved
138 | 
class AlphaFoldParser(ColabFoldParser):
    """Index the models and ``result_*.pkl`` files found in an AlphaFold output folder.

    ``self.data`` has one row per ``*.pdb`` file whose name contains ``_model_<n>``,
    with columns ``path``, ``name``, ``relaxed``, ``model``, ``seed`` plus
    ``pkl_path``, ``ptm`` and ``iptm`` from the result pickle sharing the same
    model and ``_pred_<n>`` number.  Rows are ordered relaxed-first, then by
    descending iptm and ptm.
    """

    @staticmethod
    def _scalar(value):
        """Convert a 0-d array to a Python scalar; a missing value becomes NaN."""
        if value is None:
            return np.nan
        return value.item() if hasattr(value, 'item') else value

    @staticmethod
    def _load_result(fn):
        """Load one result pickle with MyUnpickler."""
        # SECURITY: unpickling can execute arbitrary code -- only point this parser at trusted folders.
        with open(fn, 'rb') as f:
            return MyUnpickler(f).load()

    def parse(self, relaxed=True):
        """Build the model table; with ``relaxed=False`` relaxed models are left out."""
        out=[]
        for fn in sorted(glob.glob(f"{self.fd}/*.pdb")):
            #relaxed_model_5_multimer_v3_pred_1.pdb
            name=re.sub(r'\.pdb$', '', os.path.basename(fn))
            one={'path':fn, 'name':name}
            one['relaxed']=name.startswith('relaxed_')
            if not relaxed and one['relaxed']: continue
            # match on the file name only, so tags in a parent folder name cannot interfere
            m = re.search(r'_model_(?P<model>\d+)', name)
            if m is None: continue
            one['model']=int(m.group('model'))
            m = re.search(r'_pred_(?P<seed>\d+)', name)
            one['seed']=int(m.group('seed')) if m is not None else -1
            out.append(one)
        # explicit columns keep sort_values valid when the folder holds no model
        t=pd.DataFrame(out, columns=['path','name','relaxed','model','seed'])

        results=[]
        for fn in sorted(glob.glob(f"{self.fd}/*.pkl")):
            #result_model_1_multimer_v3_pred_2.pkl
            base=os.path.basename(fn)
            if not base.startswith('result_'): continue
            one={'pkl_path':fn}
            m = re.search(r'_model_(?P<model>\d+)', base)
            if m is None: continue
            one['model']=int(m.group('model'))
            m = re.search(r'_pred_(?P<seed>\d+)', base)
            one['seed']=int(m.group('seed')) if m is not None else -1
            x=self._load_result(fn)
            # either key may be absent from a result file; store NaN instead of crashing
            one['ptm']=self._scalar(x.get('ptm', None))
            one['iptm']=self._scalar(x.get('iptm', None))
            results.append(one)
        if len(t) and len(results):
            t=t.merge(pd.DataFrame(results), on=['model','seed'], how='left')
        else:
            # keep the schema stable so sorting and the getters keep working
            t['pkl_path']=None
            t['ptm']=np.nan
            t['iptm']=np.nan
        t.sort_values(['relaxed','iptm','ptm'], ascending=[False, False, False], inplace=True)
        t.index=range(len(t))
        return t

    def _result_field(self, idx, key):
        """Return entry ``key`` of the result pickle of row ``idx`` as a numpy array, or None."""
        if len(self.data) and 'pkl_path' in self.data.columns:
            fn=self.data.loc[idx, 'pkl_path']
            # rows without a matching pickle carry None/NaN here
            if isinstance(fn, str):
                value=self._load_result(fn).get(key, None)
                if value is not None: return np.array(value)
        return None

    def get_pae(self, idx=0):
        """Return ``predicted_aligned_error`` of row ``idx`` as an array, or None."""
        return self._result_field(idx, 'predicted_aligned_error')

    def get_plddt(self, idx=0):
        """Return ``plddt`` of row ``idx`` as an array, or None."""
        return self._result_field(idx, 'plddt')
196 | 
class ProteinMPNNParser:
    """Collect the designed sequences written by ProteinMPNN under ``<folder>/seqs/*.fa``.

    ``self.data`` has one row per FASTA record whose header carries a ``sample=``
    field, with columns ``path``, ``name``, ``sample``, ``score``, ``global_score``,
    ``T``, ``seq_recovery`` (all kept as the strings found in the header) and
    ``seq``.  Rows are ordered by ascending numeric ``score``.
    """

    def __init__(self, folder):
        self.fd=folder
        self.data=self.parse()

    def parse(self):
        """Parse every FASTA file and return the table of sampled sequences."""
        fields=['sample','score','global_score','T','seq_recovery']
        out=[]
        for fn in sorted(glob.glob(f"{self.fd}/seqs/*.fa")):
            #>1crn, score=1.7228, global_score=1.7573, fixed_chains=['E'], designed_chains=['A'], model_name=v_48_020, git_hash=8907e6671bfbfc92303b5f79c4b5e6ce47cdef57, seed=931
            #TTCCPSIXXRSNFNVCRLPGTPEAICATYTGCIIIPGATCPGDY
            #>T=0.1, sample=1, score=1.1982, global_score=1.2612, seq_recovery=0.5000
            name=os.path.basename(fn)
            for r in SeqIO.parse(fn, "fasta"):
                x={s.split("=")[0]:s.split("=")[1] for s in re.split(r',\s*', r.description) if '=' in s}
                # records without sample= (e.g. the first header shown above) are skipped
                if 'sample' not in x: continue
                one={'path':fn, 'name':name}
                for k in fields:
                    # a header lacking one of the fields yields None instead of a KeyError
                    one[k]=x.get(k)
                one['seq']=str(r.seq)
                out.append(one)
        # explicit columns keep the table usable when nothing was found
        t=pd.DataFrame(out, columns=['path','name']+fields+['seq'])
        # header values are strings; order by their numeric value, not lexicographically
        order=pd.to_numeric(t['score'], errors='coerce').sort_values(kind='mergesort').index
        t=t.loc[order]
        t.index=range(len(t))
        return t

    def make_structure(self, pdb, output_folder, side_chain_pdb=None, rl_from=None, rl_to=None):
        """Thread every sequence in ``self.data`` onto ``pdb`` via ``Protein.thread_sequence``.

        One file ``sample<sample>.pdb`` is written into ``output_folder`` (created if
        needed) per row.  ``side_chain_pdb``, ``rl_from`` and ``rl_to`` are passed
        through to ``thread_sequence`` unchanged.
        """
        p=Protein(pdb)
        os.makedirs(output_folder, exist_ok=True)
        pg=util.Progress(len(self.data))
        for i,r in self.data.iterrows():
            seq=r['seq']
            p.thread_sequence(seq, f"{output_folder}/sample{r['sample']}.pdb", relax=0, seq2bfactor=False, side_chain_pdb=side_chain_pdb, rl_from=rl_from, rl_to=rl_to)
            pg.check(i+1)
232 | 
class LigandMPNNParser:
    """Collect the designed sequences written by LigandMPNN under ``<folder>/seqs/*.fa``.

    ``self.data`` has one row per FASTA record whose header carries an ``id=``
    field, with columns ``path``, ``name``, ``id``, ``T``, ``seed``,
    ``overall_confidence``, ``ligand_confidence``, ``seq_rec`` (all kept as the
    strings found in the header) and ``seq``.  Rows are ordered by descending
    numeric ``overall_confidence``, then ``ligand_confidence``.
    """

    def __init__(self, folder):
        self.fd=folder
        self.data=self.parse()

    def parse(self):
        """Parse every FASTA file and return the table of designed sequences."""
        fields=['id','T','seed','overall_confidence','ligand_confidence','seq_rec']
        sort_cols=['overall_confidence','ligand_confidence']
        out=[]
        for fn in sorted(glob.glob(f"{self.fd}/seqs/*.fa")):
            #>1BC8, T=0.1, seed=111, num_res=93, num_ligand_res=93, use_ligand_context=True, ligand_cutoff_distance=8.0, batch_size=1, number_of_batches=1, model_path=./model_params/proteinmpnn_v_48_020.pt
            #>1BC8, id=1, T=0.1, seed=111, overall_confidence=0.3848, ligand_confidence=0.3848, seq_rec=0.4946
            name=os.path.basename(fn)
            for r in SeqIO.parse(fn, "fasta"):
                x={s.split("=")[0]:s.split("=")[1] for s in re.split(r',\s*', r.description) if '=' in s}
                # records without id= (e.g. the first header shown above) are skipped
                if 'id' not in x: continue
                one={'path':fn, 'name':name}
                for k in fields:
                    # a header lacking one of the fields yields None instead of a KeyError
                    one[k]=x.get(k)
                one['seq']=str(r.seq)
                out.append(one)
        # explicit columns keep the table usable when nothing was found
        t=pd.DataFrame(out, columns=['path','name']+fields+['seq'])
        # header values are strings; order by their numeric value, not lexicographically
        keys=pd.DataFrame({c: pd.to_numeric(t[c], errors='coerce') for c in sort_cols})
        t=t.loc[keys.sort_values(sort_cols, ascending=[False, False]).index]
        t.index=range(len(t))
        return t
259 | 
260 | 


--------------------------------------------------------------------------------
/src/afpdb/mol3D.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #@title Display 3D structure {run: "auto"}
  3 | import py3Dmol
  4 | import re
# Hex colors used by my_style() for color="chain": the i-th chain gets the i-th entry.
pymol_color_list = ["#33ff33","#00ffff","#ff33cc","#ffff00","#ff9999","#e5e5e5","#7f7fff","#ff7f00",
                    "#7fff7f","#199999","#ff007f","#ffdd5e","#8c3f99","#b2b2b2","#007fff","#c4b200",
                    "#8cb266","#00bfbf","#b27f7f","#fcd1a5","#ff7f7f","#ffbfdd","#7fffff","#ffff7f",
                    "#00ff7f","#337fcc","#d8337f","#bfff3f","#ff7fff","#d8d8ff","#3fffbf","#b78c4c",
                    "#339933","#66b2b2","#ba8c84","#84bf00","#b24c66","#7f7f7f","#3f3fa5","#a5512b"]

#pymol_cmap = matplotlib.colors.ListedColormap(pymol_color_list)
from string import ascii_uppercase, ascii_lowercase
# Chain letters A-Z followed by a-z (52 entries), used when chains are renamed and colored.
alphabet_list = list(ascii_uppercase+ascii_lowercase)
 14 | 
 15 | def my_style(self, color, style, chains=1, model_id=0):
 16 |   if color=="lDDT":
 17 |     self.setStyle({'model': model_id}, {style: {'colorscheme': {'prop':'b','gradient': 'roygb','min':50,'max':90}}})
 18 |   elif color=='b':
 19 |     self.setStyle({'model': model_id}, {style: {'colorscheme': {'prop':'b','gradient': 'roygb','min':0,'max':1}}})
 20 |   elif color in ("rainbow","spectrum"):
 21 |     self.setStyle({'model': model_id}, {style: {'color':'spectrum'}})
 22 |   elif color == "chain":
 23 |     #chains = len(queries[0][1]) + 1 if is_complex else 1
 24 |     for n,chain,color in zip(range(chains),alphabet_list,pymol_color_list):
 25 |        #print({'model': model_id}, {'chain':chain},{style: {'color':color}})
 26 |        # color by chain is not compatible with model_id, see
 27 |        # https://github.com/3dmol/3Dmol.js/issues/671
 28 |        #self.setStyle({'model': model_id}, {'chain':chain},{style: {'color':color}})
 29 |        self.setStyle({'model': model_id,'chain':chain},{style: {'color':color}})
 30 |   elif color == "ss":
 31 |     #https://github.com/3dmol/3Dmol.js/issues/668
 32 |     self.setStyle({'model': model_id}, {style: {'colorscheme':{'prop':'ss','map':"@REPLACE@$3Dmol.ssColors.Jmol@REPLACE@"}}})
 33 |   else:
 34 |     self.setStyle({'model': model_id}, {style: {'color':color}})
 35 | 
 36 | def my_html(self):
 37 |   self.zoomTo()
 38 |   out=self._make_html()
 39 |   out=re.sub(r'"?@REPLACE@"?', '', out)
 40 |   return out
 41 | 
# Attach the two helpers above to py3Dmol.view so they can be called as view.my_style(...) / view.my_html().
py3Dmol.view.my_style=my_style
py3Dmol.view.my_html=my_html
 44 | 
 45 | class Mol3D:
 46 | 
 47 |     URL='https://3dmol.org/build/3Dmol.js'
 48 | 
    def add_model(self, pdb_file):
        """Load `pdb_file` into self.view and record the number of chains in self.chains.

        When `pdb_file` is None or "_model_", nothing is loaded and self.chains is set to 1.
        """
        if pdb_file is None or pdb_file=="_model_":
            # multiple models, model already preloaded
            self.chains=1
        else:
            # imported here rather than at module level
            from afpdb.afpdb import Protein
            p=Protein(pdb_file)
            # rename chain to A,B,C, etc
            p.data_prt.chain_id[:]=alphabet_list[:len(p.chain_id())]
            self.chains=len(p.chain_id())
            # NOTE(review): presumably restarts residue numbering per chain -- confirm in Protein._renumber
            p._renumber('RESTART')
            data=p.to_pdb_str()
            self.view.addModel(data,'pdb')
 62 | 
 63 |     def show(self, pdb_file=None, show_sidechains=False, show_mainchains=False, color="lDDT", style="cartoon", width=480, height=480, model_id=0):
 64 |         """
 65 |         color: lDDT/b, spectrum/rainbow, chain, ss
 66 |         style: cartoon, stick, line, sphere, cross
 67 |         Not all combinations work, see https://3dmol.org/viewer.html?pdb=4KW4&select=all&style=cartoon:color~spectrum;stick:radius~0.25,colorscheme~greenCarbon&select=bonds:0&style=sphere:radius~0.5
 68 |         """
 69 |         self.view = py3Dmol.view(js=Mol3D.URL, width=width, height=height)
 70 |         self.add_model(pdb_file)
 71 |         self.view.my_style(color, style, model_id=model_id, chains=self.chains)
 72 | 
 73 |         if show_sidechains:
 74 |             BB = ['C','O','N']
 75 |             self.view.addStyle({'and':[{'resn':["GLY","PRO"],'invert':True},{'atom':BB,'invert':True}]},
 76 |                                 {'stick':{'colorscheme':f"WhiteCarbon",'radius':0.3}})
 77 |             self.view.addStyle({'and':[{'resn':"GLY"},{'atom':'CA'}]},
 78 |                                 {'sphere':{'colorscheme':f"WhiteCarbon",'radius':0.3}})
 79 |             self.view.addStyle({'and':[{'resn':"PRO"},{'atom':['C','O'],'invert':True}]},
 80 |                                 {'stick':{'colorscheme':f"WhiteCarbon",'radius':0.3}})
 81 |         if show_mainchains:
 82 |             BB = ['C','O','N','CA']
 83 |             self.view.addStyle({'atom':BB},{'stick':{'colorscheme':f"WhiteCarbon",'radius':0.3}})
 84 | 
 85 |         return self.view.my_html()
 86 | 
 87 |     def cartoon_b(self, pdb_file=None, width=480, height=480, model_id=0):
 88 |         return self.show(pdb_file, color="lDDT", style="cartoon", width=width, height=height, model_id=model_id)
 89 | 
 90 |     def cartoon_spectrum(self, pdb_file=None, width=480, height=480, model_id=0):
 91 |         return self.show(pdb_file, color="spectrum", style="cartoon", width=width, height=height, model_id=model_id)
 92 | 
 93 |     def cartoon_chain(self, pdb_file=None, width=480, height=480, model_id=0):
 94 |         return self.show(pdb_file, color="chain", style="cartoon", width=width, height=height, model_id=model_id)
 95 | 
 96 |     def cartoon_ss(self, pdb_file=None, width=480, height=480, model_id=0):
 97 |         return self.show(pdb_file, color="ss", style="cartoon", width=width, height=height, model_id=model_id)
 98 | 
 99 |     def stick_b(self, pdb_file=None, width=480, height=480, model_id=0):
100 |         return self.show(pdb_file, color="lDDT", style="stick", width=width, height=height, model_id=model_id)
101 | 
102 |     def stick_chain(self, pdb_file=None, width=480, height=480, model_id=0):
103 |         return self.show(pdb_file, color="chain", style="stick", width=width, height=height, model_id=model_id)
104 | 
105 |     #spectrum does not supported for stick
106 |     #def show_stick_spectrum(pdb_file, width=480, height=480):
107 |     #  return show_pdb(pdb_file, color="spectrum", style="stick", width=width, height=height)
108 | 
109 |     def stick_ss(self, pdb_file=None, width=480, height=480, model_id=0):
110 |         return self.show(pdb_file, color="ss", style="stick", width=width, height=height, model_id=model_id)
111 | 
112 |     # currently color by chain only works if chains are named as A, B, C ...
113 |     # otherwise, you need to rename chains in pymol first
114 |     def show_stick_chain(pdb_file=None, width=480, height=480, model_id=0):
115 |         return self.show(pdb_file, color="chain", style="stick", width=width, height=height, model_id=model_id)
116 | 
117 |     def show_many(self, S_pdb, S_style, S_color, S_chains=None, width=480, height=480):
118 |         self.view = py3Dmol.view(width=width, height=height)
119 |         if S_chains is None: S_chains=1
120 |         if type(S_pdb) is str: S_pdb=[S_pdb]
121 |         n=len(S_pdb)
122 |         if type(S_style) not in (list, tuple): S_style=[S_style]*n
123 |         if type(S_color) not in (list, tuple): S_color=[S_color]*n
124 |         if type(S_chains) not in (list, tuple): S_chains=[S_chains]*n
125 |         model_id=0
126 |         for pdb,style,color,chains in zip(S_pdb, S_style, S_color, S_chains):
127 |             self.view.addModel(open(pdb,'r').read(), 'pdb')
128 |             self.view.my_style(model_id=model_id, color=color, style=style, chains=self.chains)
129 |             model_id+=1
130 |         return self.view.my_html()
131 | 
if __name__=="__main__":
    # Demo: print a 2x4 HTML table of renderings for a few local PDB files.
    viewer=Mol3D()
    first_row=[
        viewer.cartoon_b("6bgn.pdb"),
        viewer.cartoon_spectrum("6bgn.pdb"),
        viewer.cartoon_ss("6bgn.pdb"),
        viewer.cartoon_chain("7x95.pdb"),
    ]
    second_row=[
        viewer.stick_b("6bgn.pdb"),
        viewer.show_many(["1crn.pdb","6bgn.pdb"], ["cartoon", "cartoon"], ["#bcbddc", "ss"]),
        viewer.stick_ss("6bgn.pdb"),
        viewer.stick_chain("7x95.pdb"),
    ]
    # Cells in a row are separated by a line holding the cell boundary tag.
    cell_break="\n</td><td>\n"
    pieces=[
        "<table><tr><td>",
        cell_break.join(first_row),
        "</td></tr><tr><td>",
        cell_break.join(second_row),
        "</td></tr></table>",
    ]
    print("\n".join(pieces))
152 | 


--------------------------------------------------------------------------------
/src/afpdb/myalphafold/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data2code/afpdb/e18ff796c64ab25e29faf2d091b4f826b7169dcc/src/afpdb/myalphafold/__init__.py


--------------------------------------------------------------------------------
/src/afpdb/myalphafold/common/protein.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2021 DeepMind Technologies Limited
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #      http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | """Protein data type."""
 16 | import dataclasses
 17 | import io
 18 | from typing import Any, Mapping, Optional
 19 | from afpdb.myalphafold.common import residue_constants
 20 | from Bio.PDB import PDBParser
 21 | import numpy as np,os
 22 | from string import ascii_uppercase,ascii_lowercase
 23 | from afpdb.mycolabdesign.protein import MODRES
 24 | 
# Single-letter chain IDs: A-Z followed by a-z (52 characters, no digits).
CHAIN_IDs = ascii_uppercase+ascii_lowercase

# Type aliases used in the signature of from_prediction().
FeatureDict = Mapping[str, np.ndarray]
ModelOutput = Mapping[str, Any]  # Is a nested dict.

# Complete sequence of chain IDs supported by the PDB format.
# Unlike CHAIN_IDs above, this one also includes the digits 0-9.
PDB_CHAIN_IDS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'
PDB_MAX_CHAINS = len(PDB_CHAIN_IDS)  # := 62.
 33 | 
 34 | 
 35 | #YZ: @dataclasses.dataclass(frozen=True)
 36 | @dataclasses.dataclass(frozen=False)
 37 | class Protein:
 38 |   """Protein structure representation."""
 39 | 
 40 |   # Cartesian coordinates of atoms in angstroms. The atom types correspond to
 41 |   # residue_constants.atom_types, i.e. the first three are N, CA, CB.
 42 |   atom_positions: np.ndarray  # [num_res, num_atom_type, 3]
 43 | 
 44 |   # Amino-acid type for each residue represented as an integer between 0 and
 45 |   # 20, where 20 is 'X'.
 46 |   aatype: np.ndarray  # [num_res]
 47 | 
 48 |   # Binary float mask to indicate presence of a particular atom. 1.0 if an atom
 49 |   # is present and 0.0 if not. This should be used for loss masking.
 50 |   atom_mask: np.ndarray  # [num_res, num_atom_type]
 51 | 
 52 |   # Residue index as used in PDB. It is not necessarily continuous or 0-indexed.
 53 |   residue_index: np.ndarray  # [num_res]
 54 | 
 55 |   # 0-indexed number corresponding to the chain in the protein that this residue
 56 |   # belongs to.
 57 |   chain_index: np.ndarray  # [num_res]
 58 | 
 59 |   # B-factors, or temperature factors, of each residue (in sq. angstroms units),
 60 |   # representing the displacement of the residue from its ground truth mean
 61 |   # value.
 62 |   b_factors: np.ndarray  # [num_res, num_atom_type]
 63 | 
 64 |   def __post_init__(self):
 65 |     if len(np.unique(self.chain_index)) > PDB_MAX_CHAINS:
 66 |       raise ValueError(
 67 |           f'Cannot build an instance with more than {PDB_MAX_CHAINS} chains '
 68 |           'because these cannot be written to PDB format.')
 69 | 
 70 |   def clone(self):
 71 |     p=Protein(
 72 |       atom_positions=np.copy(self.atom_positions),
 73 |       atom_mask=np.copy(self.atom_mask),
 74 |       aatype=np.copy(self.aatype),
 75 |       residue_index=np.copy(self.residue_index),
 76 |       chain_index=np.copy(self.chain_index),
 77 |       b_factors=np.copy(self.b_factors))
 78 |     p.chain_id=np.copy(self.chain_id)
 79 |     p.warning(self.warning())
 80 |     p.header=self.header.copy()
 81 |     return p
 82 | 
 83 |   def warning(self, data=None):
 84 |     if data is not None:
 85 |       self._warning=data.copy()
 86 |     elif not hasattr(self, '_warning'):
 87 |       self._warning={}
 88 |     return self._warning
 89 | 
 90 |   @staticmethod
 91 |   def from_biopython(structure, model=None, chains=None):
 92 |     """convert an BioPython structure object to Protein object"""
 93 |     # code taken from Bio/PDB/PDBIO.py
 94 |     # avoid converting to PDB to support huge strutures
 95 | 
 96 |     c_warning={'renamed_res':[], 'insertion_res':[], 'unknown_res':[]}
 97 |     models = list(structure.get_models())
 98 |     if model is not None:
 99 |         models=[x for x in models if x.id==model]
100 |     if len(models) != 1:
101 |         print(f"WARNING: Only single model PDBs are supported. Found {len(models)} models, use the first one!!!")
102 |     m = models[0] if len(models)>0 else from_pdb_string("MODEL     1\nENDMDL\nEND")
103 | 
104 |     #  atom_positions: np.ndarray  # [num_res, num_atom_type, 3]
105 |     #  aatype: np.ndarray  # [num_res]
106 |     #  atom_mask: np.ndarray  # [num_res, num_atom_type]
107 |     #  residue_index: np.ndarray  # [num_res]
108 |     #  chain_index: np.ndarray  # [num_res]
109 |     #  b_factors: np.ndarray  # [num_res, num_atom_type]
110 |     chain_ids=[]
111 |     residue_index=[]
112 |     aatype=[]
113 |     atom_positions=[]
114 |     atom_mask=[]
115 |     b_factors=[]
116 |     for chain in m.get_list():
117 |         if chains is not None and chain not in chains: continue
118 |         chain_id=chain.id
119 |         for residue in chain.get_unpacked_list():
120 |             hetfield, resseq, icode = residue.id
121 |             if hetfield!=" ": continue
122 |             resname = residue.resname
123 |             segid = residue.segid
124 |             resid = str(residue.id[1])+residue.id[2].strip()
125 |             resn=f"{chain_id}{resid}"
126 | 
127 |             rn=MODRES.get(resname, resname)
128 |             # YZ, take care of Modified Residues
129 |             if rn!=resname:
130 |                 c_warning['renamed_res'].append((resn, resname))
131 |                 print(f"Warning: modified residue converted: {resname} to {rn} at {resn}!")
132 |             res_shortname = residue_constants.restype_3to1.get(resname, 'X')
133 | 
134 |             if res_shortname=='X':
135 |                 c_warning['unknown_res'].append((f"{chain_id}{resid}", resname))
136 |                 continue
137 |             if residue.id[2].strip() != '':
138 |                 c_warning['insertion_res'].append((resn, resname))
139 | 
140 |             atom_pos=np.zeros([residue_constants.atom_type_num,3])
141 |             atom_msk=np.zeros(residue_constants.atom_type_num)
142 |             b_fact=np.zeros(residue_constants.atom_type_num)
143 | 
144 |             for atom in residue.get_unpacked_list():
145 |                 x, y, z = atom.coord
146 |                 name = atom.fullname.strip()
147 |                 if name not in residue_constants.atom_order: continue
148 |                 bfactor = atom.bfactor
149 |                 idx=residue_constants.atom_order[name]
150 |                 atom_pos[idx]=[x, y, z]
151 |                 atom_msk[idx]=1
152 |                 b_fact[idx]=bfactor
153 |             # If no known atom positions are reported for the residue then skip it.
154 |             if np.sum(atom_msk)<0.5: continue
155 |             atom_positions.append(atom_pos)
156 |             atom_mask.append(atom_msk)
157 |             b_factors.append(b_fact)
158 | 
159 |             residue_index.append(resid)
160 |             chain_ids.append(chain_id)
161 |             restype_idx = residue_constants.restype_order.get(res_shortname, residue_constants.restype_num)
162 |             aatype.append(restype_idx)
163 | 
164 |     unique_chain_ids=[]
165 |     for x in chain_ids:
166 |         if x not in unique_chain_ids:
167 |             unique_chain_ids.append(x)
168 |     chain_id_mapping = {cid: n for n, cid in enumerate(unique_chain_ids)}
169 |     chain_index = np.array([chain_id_mapping[cid] for cid in chain_ids])
170 |     p=Protein(
171 |       atom_positions=np.array(atom_positions),
172 |       atom_mask=np.array(atom_mask),
173 |       aatype=np.array(aatype),
174 |       residue_index=np.array(residue_index, dtype=np.dtype("<U6")),
175 |       chain_index=chain_index,
176 |       b_factors=np.array(b_factors))
177 |     p.chain_id=unique_chain_ids
178 |     p.warning(c_warning)
179 |     if sum([len(v) for k,v in c_warning.items()]):
180 |         print(f"Warning: {c_warning}")
181 |     p.header=structure.header
182 |     return p
183 | 
184 | 
def pdb2structure(fn:str):
    """Parse a PDB file, optionally gzip-compressed, into a BioPython structure.

    Args:
      fn: path to the file. A ``.gz`` extension (any letter case) makes the
        file be read through gzip in text mode; every other path is handed to
        the parser as-is.

    Returns:
      The structure returned by ``PDBParser.get_structure`` (id 'none').
    """
    parser = PDBParser(QUIET=True)
    ext=os.path.splitext(fn)[1]
    # Exact comparison: "ext in ('.gz')" was a substring test against the
    # string '.gz', which is also true for an empty extension.
    if ext.lower() == '.gz':
        import gzip
        with gzip.open(fn, 'rt') as f:
            structure = parser.get_structure('none', f)
    else:
        structure = parser.get_structure('none', fn)
    return structure
195 | 
def from_pdb_string(pdb_str: str, chain_id: Optional[str] = None, model:int = 0) -> Protein:
  """Build a `Protein` from the text of a PDB file.

  The text is parsed with BioPython's PDBParser (quiet mode) and the resulting
  structure is converted by `Protein.from_biopython`, which decides how
  residues and atoms are filtered.

  Args:
    pdb_str: The contents of the pdb file.
    chain_id: Forwarded to `Protein.from_biopython` as `chains`; None means
      all chains are parsed.
    model: Forwarded to `Protein.from_biopython` as `model` (default 0).

  Returns:
    A new `Protein` parsed from the pdb contents.
  """
  biopython_structure = PDBParser(QUIET=True).get_structure(
      'none', io.StringIO(pdb_str))
  return Protein.from_biopython(
      biopython_structure, model=model, chains=chain_id)
214 | 
215 | def _chain_end(atom_index, end_resname, chain_name, residue_index) -> str:
216 |   chain_end = 'TER'
217 |   return (f'{chain_end:<6}{atom_index:>5}      {end_resname:>3} '
218 |           f'{chain_name:>1}{residue_index:>4}')
219 | 
220 | 
221 | 
def to_pdb(prot: "Protein") -> str:
  """Converts a `Protein` instance to a PDB string.

  The output starts with a HEADER line (today's date), followed by one MODEL
  holding an ATOM line for every atom whose mask is >= 0.5, a TER line at the
  end of each chain, then ENDMDL and END. All lines are padded to 80 columns.
  Chain names come from `prot.chain_id` when that attribute exists, otherwise
  from PDB_CHAIN_IDS. A protein without residues produces a model with no
  ATOM/TER lines.

  Args:
    prot: The protein to convert to PDB.

  Returns:
    PDB string.

  Raises:
    ValueError: if an aatype is out of range or a chain index is not below
      PDB_MAX_CHAINS.
  """
  restypes = residue_constants.restypes + ['X']
  res_1to3 = lambda r: residue_constants.restype_1to3.get(restypes[r], 'UNK')
  atom_types = residue_constants.atom_types

  pdb_lines = []

  # ZHOU Y
  # mkdssp requires the presence of the header line
  from datetime import datetime
  today=datetime.now().strftime("%d-%b-%y")
  pdb_lines.append(f"HEADER    AFPDB PROTEIN                           {today}   XXXX")
  # END

  atom_mask = prot.atom_mask
  aatype = prot.aatype
  atom_positions = prot.atom_positions
  ##YZresidue_index = prot.residue_index.astype(np.int32)
  residue_index = np.copy(prot.residue_index)
  ##
  chain_index = prot.chain_index.astype(np.int32)
  b_factors = prot.b_factors

  if np.any(aatype > residue_constants.restype_num):
    raise ValueError('Invalid aatypes.')

  # Construct a mapping from chain integer indices to chain ID strings.
  chain_ids = {}
  #YZ: use chain_id if available
  CHAIN_IDS=prot.chain_id if hasattr(prot, 'chain_id') else PDB_CHAIN_IDS
  ##
  for i in np.unique(chain_index):  # np.unique gives sorted output.
    if i >= PDB_MAX_CHAINS:
      raise ValueError(
          f'The PDB format supports at most {PDB_MAX_CHAINS} chains.')
    #chain_ids[i] = PDB_CHAIN_IDS[i]
    #YZ: use chain_id if available
    chain_ids[i] = CHAIN_IDS[i]

  pdb_lines.append('MODEL     1')
  atom_index = 1
  num_res = aatype.shape[0]
  # Guard the [0] / [-1] lookups: a protein may contain no residues at all.
  last_chain_index = chain_index[0] if num_res > 0 else None
  # Add all atom sites.
  for i in range(num_res):
    # Close the previous chain if in a multichain PDB.
    if last_chain_index != chain_index[i]:
      pdb_lines.append(_chain_end(
          atom_index, res_1to3(aatype[i - 1]), chain_ids[chain_index[i - 1]],
          residue_index[i - 1]))
      last_chain_index = chain_index[i]
      atom_index += 1  # Atom index increases at the TER symbol.

    res_name_3 = res_1to3(aatype[i])
    chain_name = chain_ids[chain_index[i]]
    ##YZ, modified to support insertion code
    # Per-residue values are computed once here, not once per atom.
    if (str(residue_index[i])[-1].isalpha()):
      residue_idx, insertion_code=residue_index[i][:-1], residue_index[i][-1]
    else:
      residue_idx, insertion_code=residue_index[i], ''
    ##
    for atom_name, pos, mask, b_factor in zip(
        atom_types, atom_positions[i], atom_mask[i], b_factors[i]):
      if mask < 0.5:
        continue

      record_type = 'ATOM'
      name = atom_name if len(atom_name) == 4 else f' {atom_name}'
      alt_loc = ''
      occupancy = 1.00
      element = atom_name[0]  # Protein supports only C, N, O, S, this works.
      charge = ''
      # PDB is a columnar format, every space matters here!
      atom_line = (f'{record_type:<6}{atom_index:>5} {name:<4}{alt_loc:>1}'
                   f'{res_name_3:>3} {chain_name:>1}'
                   f'{residue_idx:>4}{insertion_code:>1}   '
                   f'{pos[0]:>8.3f}{pos[1]:>8.3f}{pos[2]:>8.3f}'
                   f'{occupancy:>6.2f}{b_factor:>6.2f}          '
                   f'{element:>2}{charge:>2}')
      pdb_lines.append(atom_line)
      atom_index += 1

  # Close the final chain.
  if num_res > 0:
    pdb_lines.append(_chain_end(atom_index, res_1to3(aatype[-1]),
                                chain_ids[chain_index[-1]], residue_index[-1]))
  pdb_lines.append('ENDMDL')
  pdb_lines.append('END   ')

  # Pad all lines to 80 characters.
  pdb_lines = [line.ljust(80) for line in pdb_lines]
  return '\n'.join(pdb_lines) + '\n'  # Add terminating newline.
322 | 
def ideal_atom_mask(prot: Protein) -> np.ndarray:
  """Look up the standard atom mask for every residue of `prot`.

  Unlike `Protein.atom_mask`, which typically reflects the atoms reported in
  the PDB, the result depends only on the amino-acid types: it is
  `residue_constants.STANDARD_ATOM_MASK` indexed by `prot.aatype`.

  Args:
    prot: `Protein` whose fields are `numpy.ndarray` objects.

  Returns:
    An ideal atom mask.
  """
  mask_by_restype = residue_constants.STANDARD_ATOM_MASK
  residue_types = prot.aatype
  return mask_by_restype[residue_types]
337 | 
338 | 
def from_prediction(
    features: FeatureDict,
    result: ModelOutput,
    b_factors: Optional[np.ndarray] = None,
    remove_leading_feature_dimension: bool = True) -> Protein:
  """Build a `Protein` from model inputs and the structure-module output.

  Args:
    features: Dictionary holding model inputs; 'aatype' and 'residue_index'
      are read, and 'asym_id' is used as the chain index when present.
    result: Dictionary holding model outputs; its 'structure_module' entry
      supplies 'final_atom_positions' and 'final_atom_mask'.
    b_factors: (Optional) B-factors to use for the protein. Defaults to zeros
      shaped like the final atom mask.
    remove_leading_feature_dimension: Whether to remove the leading dimension
      of the `features` values.

  Returns:
    A protein instance. The residue index is the feature value plus 1.
  """
  structure_out = result['structure_module']

  # Either drop the leading axis of a feature array or pass it through.
  if remove_leading_feature_dimension:
    squeeze = lambda arr: arr[0]
  else:
    squeeze = lambda arr: arr

  # Without asym_id every residue is assigned chain index 0.
  chain_index = (squeeze(features['asym_id']) if 'asym_id' in features
                 else np.zeros_like(squeeze(features['aatype'])))

  if b_factors is None:
    b_factors = np.zeros_like(structure_out['final_atom_mask'])

  return Protein(
      aatype=squeeze(features['aatype']),
      atom_positions=structure_out['final_atom_positions'],
      atom_mask=structure_out['final_atom_mask'],
      residue_index=squeeze(features['residue_index']) + 1,
      chain_index=chain_index,
      b_factors=b_factors)
376 | 


--------------------------------------------------------------------------------
/src/afpdb/myalphafold/common/residue_constants.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2021 DeepMind Technologies Limited
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #      http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | """Constants used in AlphaFold."""
 16 | 
 17 | import collections
 18 | import functools
 19 | import os
 20 | from typing import List, Mapping, Tuple
 21 | 
 22 | import numpy as np
 23 | #pip install dm-tree
 24 | import tree
 25 | 
 26 | # Internal import (35fd).
 27 | 
#YZ: the package-relative import below is commented out; the data file is
# located through this module's own __file__ instead.
#from . import __file__
# Path of stereo_chemical_props.txt, expected in the same directory as this module.
stereo_chemical_props_path = os.path.join(os.path.dirname(__file__), f'stereo_chemical_props.txt')
 31 | 
 32 | # Distance from one CA to next CA [trans configuration: omega = 180].
 33 | ca_ca = 3.80209737096
 34 | 
# Format: The list for each AA type contains chi1, chi2, chi3, chi4 in
# this order (or a relevant subset from chi1 onwards). ALA and GLY don't have
# chi angles so their chi angle lists are empty.
# Keys are three-letter residue names; each chi angle is given as the list of
# the four atom names that define the dihedral.
chi_angles_atoms = {
    'ALA': [],
    # Chi5 in arginine is always 0 +- 5 degrees, so ignore it.
    'ARG': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD'],
            ['CB', 'CG', 'CD', 'NE'], ['CG', 'CD', 'NE', 'CZ']],
    'ASN': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'OD1']],
    'ASP': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'OD1']],
    'CYS': [['N', 'CA', 'CB', 'SG']],
    'GLN': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD'],
            ['CB', 'CG', 'CD', 'OE1']],
    'GLU': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD'],
            ['CB', 'CG', 'CD', 'OE1']],
    'GLY': [],
    'HIS': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'ND1']],
    'ILE': [['N', 'CA', 'CB', 'CG1'], ['CA', 'CB', 'CG1', 'CD1']],
    'LEU': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD1']],
    'LYS': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD'],
            ['CB', 'CG', 'CD', 'CE'], ['CG', 'CD', 'CE', 'NZ']],
    'MET': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'SD'],
            ['CB', 'CG', 'SD', 'CE']],
    'PHE': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD1']],
    'PRO': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD']],
    'SER': [['N', 'CA', 'CB', 'OG']],
    'THR': [['N', 'CA', 'CB', 'OG1']],
    'TRP': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD1']],
    'TYR': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD1']],
    'VAL': [['N', 'CA', 'CB', 'CG1']],
}
 66 | 
# If chi angles given in fixed-length array, this matrix determines how to mask
# them for each AA type. The order is as per restype_order (see below).
# Each row is [chi1, chi2, chi3, chi4]; the number of 1.0 entries in a row
# equals the number of chi angles listed for that residue in chi_angles_atoms.
# NOTE: this table has 20 rows (no UNK row), whereas chi_pi_periodic has 21.
chi_angles_mask = [
    [0.0, 0.0, 0.0, 0.0],  # ALA
    [1.0, 1.0, 1.0, 1.0],  # ARG
    [1.0, 1.0, 0.0, 0.0],  # ASN
    [1.0, 1.0, 0.0, 0.0],  # ASP
    [1.0, 0.0, 0.0, 0.0],  # CYS
    [1.0, 1.0, 1.0, 0.0],  # GLN
    [1.0, 1.0, 1.0, 0.0],  # GLU
    [0.0, 0.0, 0.0, 0.0],  # GLY
    [1.0, 1.0, 0.0, 0.0],  # HIS
    [1.0, 1.0, 0.0, 0.0],  # ILE
    [1.0, 1.0, 0.0, 0.0],  # LEU
    [1.0, 1.0, 1.0, 1.0],  # LYS
    [1.0, 1.0, 1.0, 0.0],  # MET
    [1.0, 1.0, 0.0, 0.0],  # PHE
    [1.0, 1.0, 0.0, 0.0],  # PRO
    [1.0, 0.0, 0.0, 0.0],  # SER
    [1.0, 0.0, 0.0, 0.0],  # THR
    [1.0, 1.0, 0.0, 0.0],  # TRP
    [1.0, 1.0, 0.0, 0.0],  # TYR
    [1.0, 0.0, 0.0, 0.0],  # VAL
]
 91 | 
# The following chi angles are pi periodic: they can be rotated by a multiple
# of pi without affecting the structure.
# Each row is [chi1, chi2, chi3, chi4], with 1.0 marking a pi-periodic angle
# (only ASP chi2, GLU chi3, PHE chi2 and TYR chi2 are set). Rows follow the
# same residue order as chi_angles_mask, plus a trailing all-zero UNK row.
chi_pi_periodic = [
    [0.0, 0.0, 0.0, 0.0],  # ALA
    [0.0, 0.0, 0.0, 0.0],  # ARG
    [0.0, 0.0, 0.0, 0.0],  # ASN
    [0.0, 1.0, 0.0, 0.0],  # ASP
    [0.0, 0.0, 0.0, 0.0],  # CYS
    [0.0, 0.0, 0.0, 0.0],  # GLN
    [0.0, 0.0, 1.0, 0.0],  # GLU
    [0.0, 0.0, 0.0, 0.0],  # GLY
    [0.0, 0.0, 0.0, 0.0],  # HIS
    [0.0, 0.0, 0.0, 0.0],  # ILE
    [0.0, 0.0, 0.0, 0.0],  # LEU
    [0.0, 0.0, 0.0, 0.0],  # LYS
    [0.0, 0.0, 0.0, 0.0],  # MET
    [0.0, 1.0, 0.0, 0.0],  # PHE
    [0.0, 0.0, 0.0, 0.0],  # PRO
    [0.0, 0.0, 0.0, 0.0],  # SER
    [0.0, 0.0, 0.0, 0.0],  # THR
    [0.0, 0.0, 0.0, 0.0],  # TRP
    [0.0, 1.0, 0.0, 0.0],  # TYR
    [0.0, 0.0, 0.0, 0.0],  # VAL
    [0.0, 0.0, 0.0, 0.0],  # UNK
]
117 | 
118 | # Atoms positions relative to the 8 rigid groups, defined by the pre-omega, phi,
119 | # psi and chi angles:
120 | # 0: 'backbone group',
121 | # 1: 'pre-omega-group', (empty)
122 | # 2: 'phi-group', (currently empty, because it defines only hydrogens)
123 | # 3: 'psi-group',
124 | # 4,5,6,7: 'chi1,2,3,4-group'
125 | # The atom positions are relative to the axis-end-atom of the corresponding
126 | # rotation axis. The x-axis is in direction of the rotation axis, and the y-axis
127 | # is defined such that the dihedral-angle-defining atom (the last entry in
128 | # chi_angles_atoms above) is in the xy-plane (with a positive y-coordinate).
129 | # format: [atomname, group_idx, rel_position]
130 | rigid_group_atom_positions = {
131 |     'ALA': [
132 |         ['N', 0, (-0.525, 1.363, 0.000)],
133 |         ['CA', 0, (0.000, 0.000, 0.000)],
134 |         ['C', 0, (1.526, -0.000, -0.000)],
135 |         ['CB', 0, (-0.529, -0.774, -1.205)],
136 |         ['O', 3, (0.627, 1.062, 0.000)],
137 |     ],
138 |     'ARG': [
139 |         ['N', 0, (-0.524, 1.362, -0.000)],
140 |         ['CA', 0, (0.000, 0.000, 0.000)],
141 |         ['C', 0, (1.525, -0.000, -0.000)],
142 |         ['CB', 0, (-0.524, -0.778, -1.209)],
143 |         ['O', 3, (0.626, 1.062, 0.000)],
144 |         ['CG', 4, (0.616, 1.390, -0.000)],
145 |         ['CD', 5, (0.564, 1.414, 0.000)],
146 |         ['NE', 6, (0.539, 1.357, -0.000)],
147 |         ['NH1', 7, (0.206, 2.301, 0.000)],
148 |         ['NH2', 7, (2.078, 0.978, -0.000)],
149 |         ['CZ', 7, (0.758, 1.093, -0.000)],
150 |     ],
151 |     'ASN': [
152 |         ['N', 0, (-0.536, 1.357, 0.000)],
153 |         ['CA', 0, (0.000, 0.000, 0.000)],
154 |         ['C', 0, (1.526, -0.000, -0.000)],
155 |         ['CB', 0, (-0.531, -0.787, -1.200)],
156 |         ['O', 3, (0.625, 1.062, 0.000)],
157 |         ['CG', 4, (0.584, 1.399, 0.000)],
158 |         ['ND2', 5, (0.593, -1.188, 0.001)],
159 |         ['OD1', 5, (0.633, 1.059, 0.000)],
160 |     ],
161 |     'ASP': [
162 |         ['N', 0, (-0.525, 1.362, -0.000)],
163 |         ['CA', 0, (0.000, 0.000, 0.000)],
164 |         ['C', 0, (1.527, 0.000, -0.000)],
165 |         ['CB', 0, (-0.526, -0.778, -1.208)],
166 |         ['O', 3, (0.626, 1.062, -0.000)],
167 |         ['CG', 4, (0.593, 1.398, -0.000)],
168 |         ['OD1', 5, (0.610, 1.091, 0.000)],
169 |         ['OD2', 5, (0.592, -1.101, -0.003)],
170 |     ],
171 |     'CYS': [
172 |         ['N', 0, (-0.522, 1.362, -0.000)],
173 |         ['CA', 0, (0.000, 0.000, 0.000)],
174 |         ['C', 0, (1.524, 0.000, 0.000)],
175 |         ['CB', 0, (-0.519, -0.773, -1.212)],
176 |         ['O', 3, (0.625, 1.062, -0.000)],
177 |         ['SG', 4, (0.728, 1.653, 0.000)],
178 |     ],
179 |     'GLN': [
180 |         ['N', 0, (-0.526, 1.361, -0.000)],
181 |         ['CA', 0, (0.000, 0.000, 0.000)],
182 |         ['C', 0, (1.526, 0.000, 0.000)],
183 |         ['CB', 0, (-0.525, -0.779, -1.207)],
184 |         ['O', 3, (0.626, 1.062, -0.000)],
185 |         ['CG', 4, (0.615, 1.393, 0.000)],
186 |         ['CD', 5, (0.587, 1.399, -0.000)],
187 |         ['NE2', 6, (0.593, -1.189, -0.001)],
188 |         ['OE1', 6, (0.634, 1.060, 0.000)],
189 |     ],
190 |     'GLU': [
191 |         ['N', 0, (-0.528, 1.361, 0.000)],
192 |         ['CA', 0, (0.000, 0.000, 0.000)],
193 |         ['C', 0, (1.526, -0.000, -0.000)],
194 |         ['CB', 0, (-0.526, -0.781, -1.207)],
195 |         ['O', 3, (0.626, 1.062, 0.000)],
196 |         ['CG', 4, (0.615, 1.392, 0.000)],
197 |         ['CD', 5, (0.600, 1.397, 0.000)],
198 |         ['OE1', 6, (0.607, 1.095, -0.000)],
199 |         ['OE2', 6, (0.589, -1.104, -0.001)],
200 |     ],
201 |     'GLY': [
202 |         ['N', 0, (-0.572, 1.337, 0.000)],
203 |         ['CA', 0, (0.000, 0.000, 0.000)],
204 |         ['C', 0, (1.517, -0.000, -0.000)],
205 |         ['O', 3, (0.626, 1.062, -0.000)],
206 |     ],
207 |     'HIS': [
208 |         ['N', 0, (-0.527, 1.360, 0.000)],
209 |         ['CA', 0, (0.000, 0.000, 0.000)],
210 |         ['C', 0, (1.525, 0.000, 0.000)],
211 |         ['CB', 0, (-0.525, -0.778, -1.208)],
212 |         ['O', 3, (0.625, 1.063, 0.000)],
213 |         ['CG', 4, (0.600, 1.370, -0.000)],
214 |         ['CD2', 5, (0.889, -1.021, 0.003)],
215 |         ['ND1', 5, (0.744, 1.160, -0.000)],
216 |         ['CE1', 5, (2.030, 0.851, 0.002)],
217 |         ['NE2', 5, (2.145, -0.466, 0.004)],
218 |     ],
219 |     'ILE': [
220 |         ['N', 0, (-0.493, 1.373, -0.000)],
221 |         ['CA', 0, (0.000, 0.000, 0.000)],
222 |         ['C', 0, (1.527, -0.000, -0.000)],
223 |         ['CB', 0, (-0.536, -0.793, -1.213)],
224 |         ['O', 3, (0.627, 1.062, -0.000)],
225 |         ['CG1', 4, (0.534, 1.437, -0.000)],
226 |         ['CG2', 4, (0.540, -0.785, -1.199)],
227 |         ['CD1', 5, (0.619, 1.391, 0.000)],
228 |     ],
229 |     'LEU': [
230 |         ['N', 0, (-0.520, 1.363, 0.000)],
231 |         ['CA', 0, (0.000, 0.000, 0.000)],
232 |         ['C', 0, (1.525, -0.000, -0.000)],
233 |         ['CB', 0, (-0.522, -0.773, -1.214)],
234 |         ['O', 3, (0.625, 1.063, -0.000)],
235 |         ['CG', 4, (0.678, 1.371, 0.000)],
236 |         ['CD1', 5, (0.530, 1.430, -0.000)],
237 |         ['CD2', 5, (0.535, -0.774, 1.200)],
238 |     ],
239 |     'LYS': [
240 |         ['N', 0, (-0.526, 1.362, -0.000)],
241 |         ['CA', 0, (0.000, 0.000, 0.000)],
242 |         ['C', 0, (1.526, 0.000, 0.000)],
243 |         ['CB', 0, (-0.524, -0.778, -1.208)],
244 |         ['O', 3, (0.626, 1.062, -0.000)],
245 |         ['CG', 4, (0.619, 1.390, 0.000)],
246 |         ['CD', 5, (0.559, 1.417, 0.000)],
247 |         ['CE', 6, (0.560, 1.416, 0.000)],
248 |         ['NZ', 7, (0.554, 1.387, 0.000)],
249 |     ],
250 |     'MET': [
251 |         ['N', 0, (-0.521, 1.364, -0.000)],
252 |         ['CA', 0, (0.000, 0.000, 0.000)],
253 |         ['C', 0, (1.525, 0.000, 0.000)],
254 |         ['CB', 0, (-0.523, -0.776, -1.210)],
255 |         ['O', 3, (0.625, 1.062, -0.000)],
256 |         ['CG', 4, (0.613, 1.391, -0.000)],
257 |         ['SD', 5, (0.703, 1.695, 0.000)],
258 |         ['CE', 6, (0.320, 1.786, -0.000)],
259 |     ],
260 |     'PHE': [
261 |         ['N', 0, (-0.518, 1.363, 0.000)],
262 |         ['CA', 0, (0.000, 0.000, 0.000)],
263 |         ['C', 0, (1.524, 0.000, -0.000)],
264 |         ['CB', 0, (-0.525, -0.776, -1.212)],
265 |         ['O', 3, (0.626, 1.062, -0.000)],
266 |         ['CG', 4, (0.607, 1.377, 0.000)],
267 |         ['CD1', 5, (0.709, 1.195, -0.000)],
268 |         ['CD2', 5, (0.706, -1.196, 0.000)],
269 |         ['CE1', 5, (2.102, 1.198, -0.000)],
270 |         ['CE2', 5, (2.098, -1.201, -0.000)],
271 |         ['CZ', 5, (2.794, -0.003, -0.001)],
272 |     ],
273 |     'PRO': [
274 |         ['N', 0, (-0.566, 1.351, -0.000)],
275 |         ['CA', 0, (0.000, 0.000, 0.000)],
276 |         ['C', 0, (1.527, -0.000, 0.000)],
277 |         ['CB', 0, (-0.546, -0.611, -1.293)],
278 |         ['O', 3, (0.621, 1.066, 0.000)],
279 |         ['CG', 4, (0.382, 1.445, 0.0)],
280 |         # ['CD', 5, (0.427, 1.440, 0.0)],
281 |         ['CD', 5, (0.477, 1.424, 0.0)],  # manually made angle 2 degrees larger
282 |     ],
283 |     'SER': [
284 |         ['N', 0, (-0.529, 1.360, -0.000)],
285 |         ['CA', 0, (0.000, 0.000, 0.000)],
286 |         ['C', 0, (1.525, -0.000, -0.000)],
287 |         ['CB', 0, (-0.518, -0.777, -1.211)],
288 |         ['O', 3, (0.626, 1.062, -0.000)],
289 |         ['OG', 4, (0.503, 1.325, 0.000)],
290 |     ],
291 |     'THR': [
292 |         ['N', 0, (-0.517, 1.364, 0.000)],
293 |         ['CA', 0, (0.000, 0.000, 0.000)],
294 |         ['C', 0, (1.526, 0.000, -0.000)],
295 |         ['CB', 0, (-0.516, -0.793, -1.215)],
296 |         ['O', 3, (0.626, 1.062, 0.000)],
297 |         ['CG2', 4, (0.550, -0.718, -1.228)],
298 |         ['OG1', 4, (0.472, 1.353, 0.000)],
299 |     ],
300 |     'TRP': [
301 |         ['N', 0, (-0.521, 1.363, 0.000)],
302 |         ['CA', 0, (0.000, 0.000, 0.000)],
303 |         ['C', 0, (1.525, -0.000, 0.000)],
304 |         ['CB', 0, (-0.523, -0.776, -1.212)],
305 |         ['O', 3, (0.627, 1.062, 0.000)],
306 |         ['CG', 4, (0.609, 1.370, -0.000)],
307 |         ['CD1', 5, (0.824, 1.091, 0.000)],
308 |         ['CD2', 5, (0.854, -1.148, -0.005)],
309 |         ['CE2', 5, (2.186, -0.678, -0.007)],
310 |         ['CE3', 5, (0.622, -2.530, -0.007)],
311 |         ['NE1', 5, (2.140, 0.690, -0.004)],
312 |         ['CH2', 5, (3.028, -2.890, -0.013)],
313 |         ['CZ2', 5, (3.283, -1.543, -0.011)],
314 |         ['CZ3', 5, (1.715, -3.389, -0.011)],
315 |     ],
316 |     'TYR': [
317 |         ['N', 0, (-0.522, 1.362, 0.000)],
318 |         ['CA', 0, (0.000, 0.000, 0.000)],
319 |         ['C', 0, (1.524, -0.000, -0.000)],
320 |         ['CB', 0, (-0.522, -0.776, -1.213)],
321 |         ['O', 3, (0.627, 1.062, -0.000)],
322 |         ['CG', 4, (0.607, 1.382, -0.000)],
323 |         ['CD1', 5, (0.716, 1.195, -0.000)],
324 |         ['CD2', 5, (0.713, -1.194, -0.001)],
325 |         ['CE1', 5, (2.107, 1.200, -0.002)],
326 |         ['CE2', 5, (2.104, -1.201, -0.003)],
327 |         ['OH', 5, (4.168, -0.002, -0.005)],
328 |         ['CZ', 5, (2.791, -0.001, -0.003)],
329 |     ],
330 |     'VAL': [
331 |         ['N', 0, (-0.494, 1.373, -0.000)],
332 |         ['CA', 0, (0.000, 0.000, 0.000)],
333 |         ['C', 0, (1.527, -0.000, -0.000)],
334 |         ['CB', 0, (-0.533, -0.795, -1.213)],
335 |         ['O', 3, (0.627, 1.062, -0.000)],
336 |         ['CG1', 4, (0.540, 1.429, -0.000)],
337 |         ['CG2', 4, (0.533, -0.776, 1.203)],
338 |     ],
339 | }
340 | 
# Heavy (non-hydrogen) atom names for each amino-acid type, keyed by the
# three-letter residue name and spelled using the PDB naming convention.
# Only the 20 residue names below have entries; there is no 'UNK' key.
# _make_standard_atom_mask reads this table to mark which atom_types columns
# exist for each residue type.
residue_atoms = {
    'ALA': ['C', 'CA', 'CB', 'N', 'O'],
    'ARG': ['C', 'CA', 'CB', 'CG', 'CD', 'CZ', 'N', 'NE', 'O', 'NH1', 'NH2'],
    'ASP': ['C', 'CA', 'CB', 'CG', 'N', 'O', 'OD1', 'OD2'],
    'ASN': ['C', 'CA', 'CB', 'CG', 'N', 'ND2', 'O', 'OD1'],
    'CYS': ['C', 'CA', 'CB', 'N', 'O', 'SG'],
    'GLU': ['C', 'CA', 'CB', 'CG', 'CD', 'N', 'O', 'OE1', 'OE2'],
    'GLN': ['C', 'CA', 'CB', 'CG', 'CD', 'N', 'NE2', 'O', 'OE1'],
    'GLY': ['C', 'CA', 'N', 'O'],
    'HIS': ['C', 'CA', 'CB', 'CG', 'CD2', 'CE1', 'N', 'ND1', 'NE2', 'O'],
    'ILE': ['C', 'CA', 'CB', 'CG1', 'CG2', 'CD1', 'N', 'O'],
    'LEU': ['C', 'CA', 'CB', 'CG', 'CD1', 'CD2', 'N', 'O'],
    'LYS': ['C', 'CA', 'CB', 'CG', 'CD', 'CE', 'N', 'NZ', 'O'],
    'MET': ['C', 'CA', 'CB', 'CG', 'CE', 'N', 'O', 'SD'],
    'PHE': ['C', 'CA', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ', 'N', 'O'],
    'PRO': ['C', 'CA', 'CB', 'CG', 'CD', 'N', 'O'],
    'SER': ['C', 'CA', 'CB', 'N', 'O', 'OG'],
    'THR': ['C', 'CA', 'CB', 'CG2', 'N', 'O', 'OG1'],
    'TRP': ['C', 'CA', 'CB', 'CG', 'CD1', 'CD2', 'CE2', 'CE3', 'CZ2', 'CZ3',
            'CH2', 'N', 'NE1', 'O'],
    'TYR': ['C', 'CA', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ', 'N', 'O',
            'OH'],
    'VAL': ['C', 'CA', 'CB', 'CG1', 'CG2', 'N', 'O']
}
366 | 
# Naming swaps for ambiguous atom names.
# Due to symmetries in the amino acids, the naming of atoms is ambiguous in
# 4 of the 20 amino acids.
# (The LDDT paper lists 7 amino acids as ambiguous, but the naming ambiguities
# in LEU, VAL and ARG can be resolved by using the 3D constellations of
# the 'ambiguous' atoms and their neighbours.)
# Layout: residue name -> {atom name: the atom name it can be swapped with}.
# Each pair is listed once (e.g. 'OD1' -> 'OD2', with no reverse entry).
residue_atom_renaming_swaps = {
    'ASP': {'OD1': 'OD2'},
    'GLU': {'OE1': 'OE2'},
    'PHE': {'CD1': 'CD2', 'CE1': 'CE2'},
    'TYR': {'CD1': 'CD2', 'CE1': 'CE2'},
}
379 | 
# Van der Waals radii in Angstrom, keyed by element symbol (values taken from
# Wikipedia). make_atom14_dists_bounds looks a radius up by the first character
# of an atom name, so only the four elements below are covered.
van_der_waals_radius = {
    'C': 1.7,
    'N': 1.55,
    'O': 1.52,
    'S': 1.8,
}
387 | 
# A bond between two named atoms: the two atom names plus the bond length and
# its standard deviation. load_stereo_chemical_props also uses this tuple for
# "virtual bonds", i.e. the distance between the two outer atoms of a bond
# angle.
Bond = collections.namedtuple(
    'Bond', ['atom1_name', 'atom2_name', 'length', 'stddev'])
# A bond angle over three named atoms (atom2 is the middle atom).
# load_stereo_chemical_props stores both angle_rad and stddev in radians.
# Note the third field is spelled `atom3name` (no underscore before "name");
# code that reads it must use that exact spelling.
BondAngle = collections.namedtuple(
    'BondAngle',
    ['atom1_name', 'atom2_name', 'atom3name', 'angle_rad', 'stddev'])
393 | 
394 | 
@functools.lru_cache(maxsize=None)
def load_stereo_chemical_props() -> Tuple[Mapping[str, List[Bond]],
                                          Mapping[str, List[Bond]],
                                          Mapping[str, List[BondAngle]]]:
  """Parse stereo_chemical_props.txt into per-residue lookup tables.

  The file at `stereo_chemical_props_path` is read as two consecutive tables:
  a header line followed by `bond resname mean stddev` rows, then (after a
  line containing only '-') one skipped line, a header line and
  `angle resname mean_deg stddev_deg` rows, again terminated by a '-' line.
  Angles are converted from degrees to radians. Every bond angle is also
  turned into a "virtual bond": the distance between its two outer atoms,
  obtained from the two adjacent bond lengths via the law of cosines.

  The result is memoised by `functools.lru_cache`, so the file is parsed once
  and every caller receives the same (mutable) dictionaries.

  Returns:
    residue_bonds: Dict that maps resname -> list of Bond tuples.
    residue_virtual_bonds: Dict that maps resname -> list of Bond tuples.
    residue_bond_angles: Dict that maps resname -> list of BondAngle tuples.
    All three contain an empty list under 'UNK'.
  """
  with open(stereo_chemical_props_path, 'rt') as handle:
    raw_text = handle.read()
  rows = iter(raw_text.splitlines())

  # --- Table 1: bond lengths -------------------------------------------------
  next(rows)  # Column header.
  residue_bonds = {}
  for row in rows:
    if row.strip() == '-':  # Section terminator.
      break
    bond_name, resname, mean, sigma = row.split()
    first, second = bond_name.split('-')
    residue_bonds.setdefault(resname, []).append(
        Bond(first, second, float(mean), float(sigma)))
  residue_bonds['UNK'] = []

  # --- Table 2: bond angles --------------------------------------------------
  next(rows)  # Separator line between the two tables.
  next(rows)  # Column header.
  residue_bond_angles = {}
  for row in rows:
    if row.strip() == '-':  # Section terminator.
      break
    angle_name, resname, mean_deg, sigma_deg = row.split()
    first, middle, last = angle_name.split('-')
    residue_bond_angles.setdefault(resname, []).append(
        BondAngle(first, middle, last,
                  float(mean_deg) / 180. * np.pi,
                  float(sigma_deg) / 180. * np.pi))
  residue_bond_angles['UNK'] = []

  def pair_key(name_a, name_b):
    """Order-independent lookup key for the bond between two atoms."""
    return '-'.join(sorted([name_a, name_b]))

  # --- Derived table: virtual bonds spanning each bond angle ------------------
  residue_virtual_bonds = {}
  for resname, angles in residue_bond_angles.items():
    # Index this residue's real bonds by their (unordered) atom pair.
    bonds_by_pair = {pair_key(b.atom1_name, b.atom2_name): b
                     for b in residue_bonds[resname]}
    virtual = []
    residue_virtual_bonds[resname] = virtual
    for angle in angles:
      leg_a = bonds_by_pair[pair_key(angle.atom1_name, angle.atom2_name)]
      leg_b = bonds_by_pair[pair_key(angle.atom2_name, angle.atom3name)]

      # Law of cosines, c^2 = a^2 + b^2 - 2ab*cos(gamma), gives the distance
      # between the two outer atoms of the angle.
      gamma = angle.angle_rad
      span = np.sqrt(leg_a.length**2 + leg_b.length**2
                     - 2 * leg_a.length * leg_b.length * np.cos(gamma))

      # First-order error propagation, treating the angle and the two bond
      # lengths as uncorrelated. `half_inv` is the outer derivative of sqrt.
      half_inv = 0.5 / span
      d_gamma = (2 * leg_a.length * leg_b.length * np.sin(gamma)) * half_inv
      d_leg_a = (
          2 * leg_a.length - 2 * leg_b.length * np.cos(gamma)) * half_inv
      d_leg_b = (
          2 * leg_b.length - 2 * leg_a.length * np.cos(gamma)) * half_inv
      span_sigma = np.sqrt((d_gamma * angle.stddev)**2 +
                           (d_leg_a * leg_a.stddev)**2 +
                           (d_leg_b * leg_b.stddev)**2)
      virtual.append(
          Bond(angle.atom1_name, angle.atom3name, span, span_sigma))

  return (residue_bonds,
          residue_virtual_bonds,
          residue_bond_angles)
480 | 
481 | 
# Between-residue bond lengths for general bonds (first element) and for Proline
# (second element). The *_stddev_* list holds the matching standard deviations
# in the same [general, proline] order.
between_res_bond_length_c_n = [1.329, 1.341]
between_res_bond_length_stddev_c_n = [0.014, 0.016]

# Between-residue cos_angles.
# The first element of each list is the cosine of the angle quoted in the
# trailing comment. NOTE(review): the second element is presumably the
# corresponding tolerance expressed in cosine units -- confirm against callers.
between_res_cos_angles_c_n_ca = [-0.5203, 0.0353]  # degrees: 121.352 +- 2.315
between_res_cos_angles_ca_c_n = [-0.4473, 0.0311]  # degrees: 116.568 +- 1.995
490 | 
# This mapping is used when we need to store atom data in a format that requires
# fixed atom data size for every residue (e.g. a numpy array).
# The position of a name in `atom_types` is its column in such an array;
# `atom_order` is the reverse lookup (atom name -> column index).
atom_types = [
    'N', 'CA', 'C', 'CB', 'O', 'CG', 'CG1', 'CG2', 'OG', 'OG1', 'SG', 'CD',
    'CD1', 'CD2', 'ND1', 'ND2', 'OD1', 'OD2', 'SD', 'CE', 'CE1', 'CE2', 'CE3',
    'NE', 'NE1', 'NE2', 'OE1', 'OE2', 'CH2', 'NH1', 'NH2', 'OH', 'CZ', 'CZ2',
    'CZ3', 'NZ', 'OXT'
]
atom_order = {atom_type: i for i, atom_type in enumerate(atom_types)}
atom_type_num = len(atom_types)  # := 37.
501 | 
# A compact atom encoding with 14 columns.
# Each residue name maps to exactly 14 slots: the residue's atom names first,
# then '' for every unused slot. 'UNK' has all 14 slots empty. Code that walks
# a row must skip the '' entries (make_atom14_dists_bounds does so).
# pylint: disable=line-too-long
# pylint: disable=bad-whitespace
restype_name_to_atom14_names = {
    'ALA': ['N', 'CA', 'C', 'O', 'CB', '',    '',    '',    '',    '',    '',    '',    '',    ''],
    'ARG': ['N', 'CA', 'C', 'O', 'CB', 'CG',  'CD',  'NE',  'CZ',  'NH1', 'NH2', '',    '',    ''],
    'ASN': ['N', 'CA', 'C', 'O', 'CB', 'CG',  'OD1', 'ND2', '',    '',    '',    '',    '',    ''],
    'ASP': ['N', 'CA', 'C', 'O', 'CB', 'CG',  'OD1', 'OD2', '',    '',    '',    '',    '',    ''],
    'CYS': ['N', 'CA', 'C', 'O', 'CB', 'SG',  '',    '',    '',    '',    '',    '',    '',    ''],
    'GLN': ['N', 'CA', 'C', 'O', 'CB', 'CG',  'CD',  'OE1', 'NE2', '',    '',    '',    '',    ''],
    'GLU': ['N', 'CA', 'C', 'O', 'CB', 'CG',  'CD',  'OE1', 'OE2', '',    '',    '',    '',    ''],
    'GLY': ['N', 'CA', 'C', 'O', '',   '',    '',    '',    '',    '',    '',    '',    '',    ''],
    'HIS': ['N', 'CA', 'C', 'O', 'CB', 'CG',  'ND1', 'CD2', 'CE1', 'NE2', '',    '',    '',    ''],
    'ILE': ['N', 'CA', 'C', 'O', 'CB', 'CG1', 'CG2', 'CD1', '',    '',    '',    '',    '',    ''],
    'LEU': ['N', 'CA', 'C', 'O', 'CB', 'CG',  'CD1', 'CD2', '',    '',    '',    '',    '',    ''],
    'LYS': ['N', 'CA', 'C', 'O', 'CB', 'CG',  'CD',  'CE',  'NZ',  '',    '',    '',    '',    ''],
    'MET': ['N', 'CA', 'C', 'O', 'CB', 'CG',  'SD',  'CE',  '',    '',    '',    '',    '',    ''],
    'PHE': ['N', 'CA', 'C', 'O', 'CB', 'CG',  'CD1', 'CD2', 'CE1', 'CE2', 'CZ',  '',    '',    ''],
    'PRO': ['N', 'CA', 'C', 'O', 'CB', 'CG',  'CD',  '',    '',    '',    '',    '',    '',    ''],
    'SER': ['N', 'CA', 'C', 'O', 'CB', 'OG',  '',    '',    '',    '',    '',    '',    '',    ''],
    'THR': ['N', 'CA', 'C', 'O', 'CB', 'OG1', 'CG2', '',    '',    '',    '',    '',    '',    ''],
    'TRP': ['N', 'CA', 'C', 'O', 'CB', 'CG',  'CD1', 'CD2', 'NE1', 'CE2', 'CE3', 'CZ2', 'CZ3', 'CH2'],
    'TYR': ['N', 'CA', 'C', 'O', 'CB', 'CG',  'CD1', 'CD2', 'CE1', 'CE2', 'CZ',  'OH',  '',    ''],
    'VAL': ['N', 'CA', 'C', 'O', 'CB', 'CG1', 'CG2', '',    '',    '',    '',    '',    '',    ''],
    'UNK': ['',  '',   '',  '',  '',   '',    '',    '',    '',    '',    '',    '',    '',    ''],

}
# pylint: enable=line-too-long
# pylint: enable=bad-whitespace
531 | 
532 | 
# This is the standard residue order when coding AA type as a number.
# Reproduce it by taking 3-letter AA codes and sorting them alphabetically.
# (The list itself holds the one-letter codes; the position of a letter is the
# residue's integer id.)
restypes = [
    'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P',
    'S', 'T', 'W', 'Y', 'V'
]
# Reverse lookup: one-letter code -> integer id.
restype_order = {restype: i for i, restype in enumerate(restypes)}
restype_num = len(restypes)  # := 20.
unk_restype_index = restype_num  # Catch-all index for unknown restypes.

# Same ordering with 'X' (unknown) appended, so 'X' receives id
# `unk_restype_index`.
restypes_with_x = restypes + ['X']
restype_order_with_x = {restype: i for i, restype in enumerate(restypes_with_x)}
545 | 
546 | 
def sequence_to_onehot(
    sequence: str,
    mapping: Mapping[str, int],
    map_unknown_to_x: bool = False) -> np.ndarray:
  """One-hot encodes an amino acid sequence according to `mapping`.

  Args:
    sequence: An amino acid sequence, one character per residue.
    mapping: Dictionary from residue character to integer id. Several
      characters may share an id, but the set of ids must be exactly
      0..max_id with no gaps.
    map_unknown_to_x: If True, every upper-case alphabetic character missing
      from `mapping` is encoded as 'X'; `mapping` must then contain 'X'
      (it is looked up for every residue, so a KeyError is raised otherwise),
      and any character that is not an upper-case letter raises ValueError.
      If False, a character missing from `mapping` raises KeyError.

  Returns:
    An int32 numpy array of shape (len(sequence), max_id + 1) holding a single
    1 per row, in the column given by the residue's id.

  Raises:
    ValueError: If the ids in `mapping` do not cover 0..max_id without gaps,
      or (with `map_unknown_to_x`) the sequence holds an invalid character.
  """
  width = max(mapping.values()) + 1

  distinct_ids = sorted(set(mapping.values()))
  if distinct_ids != list(range(width)):
    raise ValueError('The mapping must have values from 0 to num_unique_aas-1 '
                     'without any gaps. Got: %s' % sorted(mapping.values()))

  encoded = np.zeros((len(sequence), width), dtype=np.int32)

  for row, residue in enumerate(sequence):
    if not map_unknown_to_x:
      # Strict mode: unknown residues surface as KeyError.
      column = mapping[residue]
    elif residue.isalpha() and residue.isupper():
      # Lenient mode: fall back to the id of 'X' for unlisted letters.
      column = mapping.get(residue, mapping['X'])
    else:
      raise ValueError(f'Invalid character in the sequence: {residue}')
    encoded[row, column] = 1

  return encoded
588 | 
589 | 
# One-letter -> three-letter residue names for the 20 standard amino acids,
# listed in the same order as `restypes`.
restype_1to3 = {
    'A': 'ALA',
    'R': 'ARG',
    'N': 'ASN',
    'D': 'ASP',
    'C': 'CYS',
    'Q': 'GLN',
    'E': 'GLU',
    'G': 'GLY',
    'H': 'HIS',
    'I': 'ILE',
    'L': 'LEU',
    'K': 'LYS',
    'M': 'MET',
    'F': 'PHE',
    'P': 'PRO',
    'S': 'SER',
    'T': 'THR',
    'W': 'TRP',
    'Y': 'TYR',
    'V': 'VAL',
}


# NB: restype_3to1 differs from Bio.PDB.protein_letters_3to1 by being a simple
# 1-to-1 mapping of 3 letter names to one letter names. The latter contains
# many more, and less common, three letter names as keys and maps many of these
# to the same one letter name (including 'X' and 'U' which we don't use here).
restype_3to1 = {v: k for k, v in restype_1to3.items()}

# Define a restype name for all unknown residues.
unk_restype = 'UNK'

# Three-letter names in `restypes` order with 'UNK' appended as the last entry,
# plus the reverse lookup (three-letter name -> index).
resnames = [restype_1to3[r] for r in restypes] + [unk_restype]
resname_to_idx = {resname: i for i, resname in enumerate(resnames)}
625 | 
626 | 
# The mapping here uses hhblits convention, so that B is mapped to D, J and O
# are mapped to X, U is mapped to C, and Z is mapped to E. Other than that the
# remaining 20 amino acids are kept in alphabetical order (of their one-letter
# codes).
# There are 2 non-amino acid codes, X (representing any amino acid) and
# "-" representing a missing amino acid in an alignment.  The id for these
# codes is put at the end (20 and 21) so that they can easily be ignored if
# desired.
HHBLITS_AA_TO_ID = {
    'A': 0,
    'B': 2,
    'C': 1,
    'D': 2,
    'E': 3,
    'F': 4,
    'G': 5,
    'H': 6,
    'I': 7,
    'J': 20,
    'K': 8,
    'L': 9,
    'M': 10,
    'N': 11,
    'O': 20,
    'P': 12,
    'Q': 13,
    'R': 14,
    'S': 15,
    'T': 16,
    'U': 1,
    'V': 17,
    'W': 18,
    'X': 20,
    'Y': 19,
    'Z': 3,
    '-': 21,
}

# Partial inversion of HHBLITS_AA_TO_ID.
# Where several letters share an id, only the canonical letter is kept; the
# aliases are noted in the trailing comments.
ID_TO_HHBLITS_AA = {
    0: 'A',
    1: 'C',  # Also U.
    2: 'D',  # Also B.
    3: 'E',  # Also Z.
    4: 'F',
    5: 'G',
    6: 'H',
    7: 'I',
    8: 'K',
    9: 'L',
    10: 'M',
    11: 'N',
    12: 'P',
    13: 'Q',
    14: 'R',
    15: 'S',
    16: 'T',
    17: 'V',
    18: 'W',
    19: 'Y',
    20: 'X',  # Includes J and O.
    21: '-',
}

# `restypes` order extended with 'X' and the gap character '-'.
restypes_with_x_and_gap = restypes + ['X', '-']
# Tuple indexed by hhblits id: entry i is the position within
# `restypes_with_x_and_gap` of the letter that hhblits id i decodes to.
MAP_HHBLITS_AATYPE_TO_OUR_AATYPE = tuple(
    restypes_with_x_and_gap.index(ID_TO_HHBLITS_AA[i])
    for i in range(len(restypes_with_x_and_gap)))
694 | 
695 | 
def _make_standard_atom_mask() -> np.ndarray:
  """Returns [num_res_types, num_atom_types] mask array.

  Entry [r, a] is 1 when residue type `r` (an index into `restypes`) contains
  the atom whose column in `atom_types` is `a`, according to `residue_atoms`.
  The array has one extra, final row for the unknown residue type; it is
  never written and therefore stays all zeros. The dtype is int32.
  """
  # One row per known residue type, plus the all-zero "unknown" row.
  mask = np.zeros([restype_num + 1, atom_type_num], dtype=np.int32)
  for row, letter in enumerate(restypes):
    for atom_name in residue_atoms[restype_1to3[letter]]:
      mask[row, atom_order[atom_name]] = 1
  return mask


STANDARD_ATOM_MASK = _make_standard_atom_mask()
710 | 
711 | 
# One-hot encodings of the first and second atoms that define the rotation
# axis of every chi angle, for every residue type.
def chi_angle_atom(atom_index: int) -> np.ndarray:
  """Define chi-angle rigid groups via one-hot representations.

  Args:
    atom_index: Which atom of each chi angle's atom list to encode (this
      module calls the function with 1 and 2).

  Returns:
    A float array built from one [4, atom_type_num] block per entry of
    `restypes` plus a trailing all-zero block for residue `X`, with the last
    two axes swapped, i.e. indexed as [residue, atom_type, chi_slot]. This
    assumes no residue defines more than 4 chi angles in `chi_angles_atoms`.

    Residues with fewer than 4 chi angles are padded with index -1.
    NOTE(review): `np.eye(n)[-1]` is the LAST row of the identity matrix, so
    padded slots carry a one-hot for the final entry of `atom_types` rather
    than zeros -- confirm this is what consumers expect.
  """
  # Atom-type index of the requested atom for every chi angle, padded to 4.
  padded_indices = {}
  for resname, chi_quads in chi_angles_atoms.items():
    picked = [atom_types.index(quad[atom_index]) for quad in chi_quads]
    padded_indices[resname] = picked + [-1] * (4 - len(picked))

  # Row i of the identity matrix is the one-hot vector for atom type i.
  identity = np.eye(atom_type_num)
  per_residue = [identity[padded_indices[restype_1to3[letter]]]
                 for letter in restypes]
  per_residue.append(np.zeros([4, atom_type_num]))  # Zeros for residue `X`.

  return np.transpose(np.stack(per_residue, axis=0), [0, 2, 1])

chi_atom_1_one_hot = chi_angle_atom(1)
chi_atom_2_one_hot = chi_angle_atom(2)
737 | 
# An array like chi_angles_atoms but using indices rather than names.
# Built in three steps: (1) collect the per-residue chi atom-name lists in
# `restypes` order, (2) replace every atom name by its `atom_order` index,
# (3) pad each residue with [0, 0, 0, 0] rows up to 4 chi angles and convert
# to a numpy array.
# NOTE(review): step 3 relies on list concatenation, so each entry of
# chi_angles_atoms is presumably a list of 4-name lists -- confirm.
chi_angles_atom_indices = [chi_angles_atoms[restype_1to3[r]] for r in restypes]
chi_angles_atom_indices = tree.map_structure(
    lambda atom_name: atom_order[atom_name], chi_angles_atom_indices)
chi_angles_atom_indices = np.array([
    chi_atoms + ([[0, 0, 0, 0]] * (4 - len(chi_atoms)))
    for chi_atoms in chi_angles_atom_indices])

# Mapping from (res_name, atom_name) pairs to the atom's chi group index
# and atom index within that group.
# An atom that takes part in several chi angles gets one (chi_group_i, atom_i)
# tuple per occurrence. The defaultdict is converted to a plain dict at the end
# so that looking up a missing key raises KeyError instead of silently adding
# an empty list.
chi_groups_for_atom = collections.defaultdict(list)
for res_name, chi_angle_atoms_for_res in chi_angles_atoms.items():
  for chi_group_i, chi_group in enumerate(chi_angle_atoms_for_res):
    for atom_i, atom in enumerate(chi_group):
      chi_groups_for_atom[(res_name, atom)].append((chi_group_i, atom_i))
chi_groups_for_atom = dict(chi_groups_for_atom)
754 | 
755 | 
756 | def _make_rigid_transformation_4x4(ex, ey, translation):
757 |   """Create a rigid 4x4 transformation matrix from two axes and transl."""
758 |   # Normalize ex.
759 |   ex_normalized = ex / np.linalg.norm(ex)
760 | 
761 |   # make ey perpendicular to ex
762 |   ey_normalized = ey - np.dot(ey, ex_normalized) * ex_normalized
763 |   ey_normalized /= np.linalg.norm(ey_normalized)
764 | 
765 |   # compute ez as cross product
766 |   eznorm = np.cross(ex_normalized, ey_normalized)
767 |   m = np.stack([ex_normalized, ey_normalized, eznorm, translation]).transpose()
768 |   m = np.concatenate([m, [[0., 0., 0., 1.]]], axis=0)
769 |   return m
770 | 
771 | 
# create an array with (restype, atomtype) --> rigid_group_idx
# and an array with (restype, atomtype, coord) for the atom positions
# and compute affine transformation matrices (4,4) from one rigid group to the
# previous group
#
# All arrays below are allocated as zeros here and filled in place by
# _make_rigid_group_constants(). Leading dimension 21 = 20 residue types plus
# one row for the unknown type, which is never written and stays zero.
# Second dimension: 37 = atom_types columns, 14 = compact atom14 slots,
# 8 = rigid groups per residue (backbone, pre-omega, phi, psi, chi1..chi4).
restype_atom37_to_rigid_group = np.zeros([21, 37], dtype=int)
restype_atom37_mask = np.zeros([21, 37], dtype=np.float32)
restype_atom37_rigid_group_positions = np.zeros([21, 37, 3], dtype=np.float32)
restype_atom14_to_rigid_group = np.zeros([21, 14], dtype=int)
restype_atom14_mask = np.zeros([21, 14], dtype=np.float32)
restype_atom14_rigid_group_positions = np.zeros([21, 14, 3], dtype=np.float32)
restype_rigid_group_default_frame = np.zeros([21, 8, 4, 4], dtype=np.float32)
783 | 
784 | 
def _make_rigid_group_constants():
  """Fill the module-level rigid-group arrays in place.

  Pass 1 copies, for every atom listed in `rigid_group_atom_positions`, its
  rigid group index, a presence flag and its position into both the atom37
  and the atom14 layouts. Pass 2 writes the eight default 4x4 frames of each
  residue type into `restype_rigid_group_default_frame`. Frames for chi
  angles that a residue does not have (per `chi_angles_mask`) are left as
  zeros. Nothing is returned.
  """
  # Pass 1: per-atom group index, mask and position in both atom layouts.
  for res_idx, letter in enumerate(restypes):
    resname = restype_1to3[letter]
    atom14_names = restype_name_to_atom14_names[resname]
    for atom_name, group, position in rigid_group_atom_positions[resname]:
      col37 = atom_order[atom_name]
      restype_atom37_to_rigid_group[res_idx, col37] = group
      restype_atom37_mask[res_idx, col37] = 1
      restype_atom37_rigid_group_positions[res_idx, col37, :] = position

      col14 = atom14_names.index(atom_name)
      restype_atom14_to_rigid_group[res_idx, col14] = group
      restype_atom14_mask[res_idx, col14] = 1
      restype_atom14_rigid_group_positions[res_idx, col14, :] = position

  # Pass 2: default frame of every rigid group, relative to its parent group.
  for res_idx, letter in enumerate(restypes):
    resname = restype_1to3[letter]
    coords = {}
    for atom_name, _, position in rigid_group_atom_positions[resname]:
      coords[atom_name] = np.array(position)

    # View onto this residue's [8, 4, 4] slice; writes go to the global array.
    frames = restype_rigid_group_default_frame[res_idx]

    # Group 0: backbone to backbone is the identity transform.
    frames[0, :, :] = np.eye(4)

    # Group 1: pre-omega-frame to backbone (currently dummy identity matrix).
    frames[1, :, :] = np.eye(4)

    # Group 2: phi-frame to backbone.
    frames[2, :, :] = _make_rigid_transformation_4x4(
        ex=coords['N'] - coords['CA'],
        ey=np.array([1., 0., 0.]),
        translation=coords['N'])

    # Group 3: psi-frame to backbone.
    frames[3, :, :] = _make_rigid_transformation_4x4(
        ex=coords['C'] - coords['CA'],
        ey=coords['CA'] - coords['N'],
        translation=coords['C'])

    # Group 4: chi1-frame to backbone, built from the first three atoms of
    # the chi1 definition.
    if chi_angles_mask[res_idx][0]:
      chi1_coords = [coords[name] for name in chi_angles_atoms[resname][0]]
      frames[4, :, :] = _make_rigid_transformation_4x4(
          ex=chi1_coords[2] - chi1_coords[1],
          ey=chi1_coords[0] - chi1_coords[1],
          translation=chi1_coords[2])

    # Groups 5-7: chi2 -> chi1, chi3 -> chi2, chi4 -> chi3.
    # Luckily all rotation axes for the next frame start at (0,0,0) of the
    # previous frame, so the axis end atom's position doubles as the x
    # direction and as the translation.
    for chi_idx in range(1, 4):
      if not chi_angles_mask[res_idx][chi_idx]:
        continue
      axis_tip = coords[chi_angles_atoms[resname][chi_idx][2]]
      frames[4 + chi_idx, :, :] = _make_rigid_transformation_4x4(
          ex=axis_tip,
          ey=np.array([-1., 0., 0.]),
          translation=axis_tip)


_make_rigid_group_constants()
854 | 
855 | 
def make_atom14_dists_bounds(overlap_tolerance=1.5,
                             bond_length_tolerance_factor=15):
  """Compute upper and lower distance bounds used to assess violations.

  For every residue type in `restypes` and every ordered pair of distinct
  atoms in its atom14 layout, a clash bound is stored first (lower bound =
  sum of the two van der Waals radii minus `overlap_tolerance`, upper bound =
  1e10). Pairs joined by a bond or a virtual bond from
  load_stereo_chemical_props() are then overwritten with
  length -/+ `bond_length_tolerance_factor` * stddev, and their stddev is
  recorded.

  Args:
    overlap_tolerance: Amount subtracted from the summed van der Waals radii
      to obtain the clash lower bound.
    bond_length_tolerance_factor: Number of standard deviations allowed on
      either side of a (virtual) bond length.

  Returns:
    Dict with float32 arrays of shape (21, 14, 14) under the keys
    'lower_bound', 'upper_bound' and 'stddev'. The last row (unknown residue
    type) and all empty atom14 slots stay zero.
  """
  lower_bound = np.zeros([21, 14, 14], np.float32)
  upper_bound = np.zeros([21, 14, 14], np.float32)
  bond_stddev = np.zeros([21, 14, 14], np.float32)
  residue_bonds, residue_virtual_bonds, _ = load_stereo_chemical_props()

  for res_idx, letter in enumerate(restypes):
    resname = restype_1to3[letter]
    slot_names = restype_name_to_atom14_names[resname]

    # Occupied atom14 slots with the vdW radius of their element (the first
    # character of the atom name).
    occupied = [(slot, van_der_waals_radius[name[0]])
                for slot, name in enumerate(slot_names) if name]

    # Clash bounds for every pair of distinct atoms.
    for slot_a, radius_a in occupied:
      for slot_b, radius_b in occupied:
        if slot_a == slot_b:
          continue
        clash = radius_a + radius_b - overlap_tolerance
        lower_bound[res_idx, slot_a, slot_b] = clash
        lower_bound[res_idx, slot_b, slot_a] = clash
        upper_bound[res_idx, slot_a, slot_b] = 1e10
        upper_bound[res_idx, slot_b, slot_a] = 1e10

    # Bonds and bond angles (as virtual bonds) replace the clash bounds.
    for bond in residue_bonds[resname] + residue_virtual_bonds[resname]:
      slot_a = slot_names.index(bond.atom1_name)
      slot_b = slot_names.index(bond.atom2_name)
      slack = bond_length_tolerance_factor * bond.stddev
      # Written symmetrically so that [a, b] and [b, a] agree.
      for i, j in ((slot_a, slot_b), (slot_b, slot_a)):
        lower_bound[res_idx, i, j] = bond.length - slack
        upper_bound[res_idx, i, j] = bond.length + slack
        bond_stddev[res_idx, i, j] = bond.stddev

  return {'lower_bound': lower_bound,  # shape (21,14,14)
          'upper_bound': upper_bound,  # shape (21,14,14)
          'stddev': bond_stddev,  # shape (21,14,14)
         }
899 | 


--------------------------------------------------------------------------------
/src/afpdb/myalphafold/common/stereo_chemical_props.txt:
--------------------------------------------------------------------------------
  1 | Bond			Residue		Mean		StdDev
  2 | CA-CB			ALA		1.520		0.021
  3 | N-CA			ALA		1.459		0.020
  4 | CA-C			ALA		1.525		0.026
  5 | C-O			ALA		1.229		0.019
  6 | CA-CB			ARG		1.535		0.022
  7 | CB-CG			ARG		1.521		0.027
  8 | CG-CD			ARG		1.515		0.025
  9 | CD-NE			ARG		1.460		0.017
 10 | NE-CZ			ARG		1.326		0.013
 11 | CZ-NH1			ARG		1.326		0.013
 12 | CZ-NH2			ARG		1.326		0.013
 13 | N-CA			ARG		1.459		0.020
 14 | CA-C			ARG		1.525		0.026
 15 | C-O			ARG		1.229		0.019
 16 | CA-CB			ASN		1.527		0.026
 17 | CB-CG			ASN		1.506		0.023
 18 | CG-OD1			ASN		1.235		0.022
 19 | CG-ND2			ASN		1.324		0.025
 20 | N-CA			ASN		1.459		0.020
 21 | CA-C			ASN		1.525		0.026
 22 | C-O			ASN		1.229		0.019
 23 | CA-CB			ASP		1.535		0.022
 24 | CB-CG			ASP		1.513		0.021
 25 | CG-OD1			ASP		1.249		0.023
 26 | CG-OD2			ASP		1.249		0.023
 27 | N-CA			ASP		1.459		0.020
 28 | CA-C			ASP		1.525		0.026
 29 | C-O			ASP		1.229		0.019
 30 | CA-CB			CYS		1.526		0.013
 31 | CB-SG			CYS		1.812		0.016
 32 | N-CA			CYS		1.459		0.020
 33 | CA-C			CYS		1.525		0.026
 34 | C-O			CYS		1.229		0.019
 35 | CA-CB			GLU		1.535		0.022
 36 | CB-CG			GLU		1.517		0.019
 37 | CG-CD			GLU		1.515		0.015
 38 | CD-OE1			GLU		1.252		0.011
 39 | CD-OE2			GLU		1.252		0.011
 40 | N-CA			GLU		1.459		0.020
 41 | CA-C			GLU		1.525		0.026
 42 | C-O			GLU		1.229		0.019
 43 | CA-CB			GLN		1.535		0.022
 44 | CB-CG			GLN		1.521		0.027
 45 | CG-CD			GLN		1.506		0.023
 46 | CD-OE1			GLN		1.235		0.022
 47 | CD-NE2			GLN		1.324		0.025
 48 | N-CA			GLN		1.459		0.020
 49 | CA-C			GLN		1.525		0.026
 50 | C-O			GLN		1.229		0.019
 51 | N-CA			GLY		1.456		0.015
 52 | CA-C			GLY		1.514		0.016
 53 | C-O			GLY		1.232		0.016
 54 | CA-CB			HIS		1.535		0.022
 55 | CB-CG			HIS		1.492		0.016
 56 | CG-ND1			HIS		1.369		0.015
 57 | CG-CD2			HIS		1.353		0.017
 58 | ND1-CE1			HIS		1.343		0.025
 59 | CD2-NE2			HIS		1.415		0.021
 60 | CE1-NE2			HIS		1.322		0.023
 61 | N-CA			HIS		1.459		0.020
 62 | CA-C			HIS		1.525		0.026
 63 | C-O			HIS		1.229		0.019
 64 | CA-CB			ILE		1.544		0.023
 65 | CB-CG1			ILE		1.536		0.028
 66 | CB-CG2			ILE		1.524		0.031
 67 | CG1-CD1			ILE		1.500		0.069
 68 | N-CA			ILE		1.459		0.020
 69 | CA-C			ILE		1.525		0.026
 70 | C-O			ILE		1.229		0.019
 71 | CA-CB			LEU		1.533		0.023
 72 | CB-CG			LEU		1.521		0.029
 73 | CG-CD1			LEU		1.514		0.037
 74 | CG-CD2			LEU		1.514		0.037
 75 | N-CA			LEU		1.459		0.020
 76 | CA-C			LEU		1.525		0.026
 77 | C-O			LEU		1.229		0.019
 78 | CA-CB			LYS		1.535		0.022
 79 | CB-CG			LYS		1.521		0.027
 80 | CG-CD			LYS		1.520		0.034
 81 | CD-CE			LYS		1.508		0.025
 82 | CE-NZ			LYS		1.486		0.025
 83 | N-CA			LYS		1.459		0.020
 84 | CA-C			LYS		1.525		0.026
 85 | C-O			LYS		1.229		0.019
 86 | CA-CB			MET		1.535		0.022
 87 | CB-CG			MET		1.509		0.032
 88 | CG-SD			MET		1.807		0.026
 89 | SD-CE			MET		1.774		0.056
 90 | N-CA			MET		1.459		0.020
 91 | CA-C			MET		1.525		0.026
 92 | C-O			MET		1.229		0.019
 93 | CA-CB			PHE		1.535		0.022
 94 | CB-CG			PHE		1.509		0.017
 95 | CG-CD1			PHE		1.383		0.015
 96 | CG-CD2			PHE		1.383		0.015
 97 | CD1-CE1			PHE		1.388		0.020
 98 | CD2-CE2			PHE		1.388		0.020
 99 | CE1-CZ			PHE		1.369		0.019
100 | CE2-CZ			PHE		1.369		0.019
101 | N-CA			PHE		1.459		0.020
102 | CA-C			PHE		1.525		0.026
103 | C-O			PHE		1.229		0.019
104 | CA-CB			PRO		1.531		0.020
105 | CB-CG			PRO		1.495		0.050
106 | CG-CD			PRO		1.502		0.033
107 | CD-N			PRO		1.474		0.014
108 | N-CA			PRO		1.468		0.017
109 | CA-C			PRO		1.524		0.020
110 | C-O			PRO		1.228		0.020
111 | CA-CB			SER		1.525		0.015
112 | CB-OG			SER		1.418		0.013
113 | N-CA			SER		1.459		0.020
114 | CA-C			SER		1.525		0.026
115 | C-O			SER		1.229		0.019
116 | CA-CB			THR		1.529		0.026
117 | CB-OG1			THR		1.428		0.020
118 | CB-CG2			THR		1.519		0.033
119 | N-CA			THR		1.459		0.020
120 | CA-C			THR		1.525		0.026
121 | C-O			THR		1.229		0.019
122 | CA-CB			TRP		1.535		0.022
123 | CB-CG			TRP		1.498		0.018
124 | CG-CD1			TRP		1.363		0.014
125 | CG-CD2			TRP		1.432		0.017
126 | CD1-NE1			TRP		1.375		0.017
127 | NE1-CE2			TRP		1.371		0.013
128 | CD2-CE2			TRP		1.409		0.012
129 | CD2-CE3			TRP		1.399		0.015
130 | CE2-CZ2			TRP		1.393		0.017
131 | CE3-CZ3			TRP		1.380		0.017
132 | CZ2-CH2			TRP		1.369		0.019
133 | CZ3-CH2			TRP		1.396		0.016
134 | N-CA			TRP		1.459		0.020
135 | CA-C			TRP		1.525		0.026
136 | C-O			TRP		1.229		0.019
137 | CA-CB			TYR		1.535		0.022
138 | CB-CG			TYR		1.512		0.015
139 | CG-CD1			TYR		1.387		0.013
140 | CG-CD2			TYR		1.387		0.013
141 | CD1-CE1			TYR		1.389		0.015
142 | CD2-CE2			TYR		1.389		0.015
143 | CE1-CZ			TYR		1.381		0.013
144 | CE2-CZ			TYR		1.381		0.013
145 | CZ-OH			TYR		1.374		0.017
146 | N-CA			TYR		1.459		0.020
147 | CA-C			TYR		1.525		0.026
148 | C-O			TYR		1.229		0.019
149 | CA-CB			VAL		1.543		0.021
150 | CB-CG1			VAL		1.524		0.021
151 | CB-CG2			VAL		1.524		0.021
152 | N-CA			VAL		1.459		0.020
153 | CA-C			VAL		1.525		0.026
154 | C-O			VAL		1.229		0.019
155 | -
156 | 
157 | Angle			Residue		Mean		StdDev
158 | N-CA-CB			ALA		110.1		1.4
159 | CB-CA-C			ALA		110.1		1.5
160 | N-CA-C			ALA		111.0		2.7
161 | CA-C-O			ALA		120.1		2.1
162 | N-CA-CB			ARG		110.6		1.8
163 | CB-CA-C			ARG		110.4		2.0
164 | CA-CB-CG		ARG		113.4		2.2
165 | CB-CG-CD		ARG		111.6		2.6
166 | CG-CD-NE		ARG		111.8		2.1
167 | CD-NE-CZ		ARG		123.6		1.4
168 | NE-CZ-NH1		ARG		120.3		0.5
169 | NE-CZ-NH2		ARG		120.3		0.5
170 | NH1-CZ-NH2		ARG		119.4		1.1
171 | N-CA-C			ARG		111.0		2.7
172 | CA-C-O			ARG		120.1		2.1
173 | N-CA-CB			ASN		110.6		1.8
174 | CB-CA-C			ASN		110.4		2.0
175 | CA-CB-CG		ASN		113.4		2.2
176 | CB-CG-ND2		ASN		116.7		2.4
177 | CB-CG-OD1		ASN		121.6		2.0
178 | ND2-CG-OD1		ASN		121.9		2.3
179 | N-CA-C			ASN		111.0		2.7
180 | CA-C-O			ASN		120.1		2.1
181 | N-CA-CB			ASP		110.6		1.8
182 | CB-CA-C			ASP		110.4		2.0
183 | CA-CB-CG		ASP		113.4		2.2
184 | CB-CG-OD1		ASP		118.3		0.9
185 | CB-CG-OD2		ASP		118.3		0.9
186 | OD1-CG-OD2		ASP		123.3		1.9
187 | N-CA-C			ASP		111.0		2.7
188 | CA-C-O			ASP		120.1		2.1
189 | N-CA-CB			CYS		110.8		1.5
190 | CB-CA-C			CYS		111.5		1.2
191 | CA-CB-SG		CYS		114.2		1.1
192 | N-CA-C			CYS		111.0		2.7
193 | CA-C-O			CYS		120.1		2.1
194 | N-CA-CB			GLU		110.6		1.8
195 | CB-CA-C			GLU		110.4		2.0
196 | CA-CB-CG		GLU		113.4		2.2
197 | CB-CG-CD		GLU		114.2		2.7
198 | CG-CD-OE1		GLU		118.3		2.0
199 | CG-CD-OE2		GLU		118.3		2.0
200 | OE1-CD-OE2		GLU		123.3		1.2
201 | N-CA-C			GLU		111.0		2.7
202 | CA-C-O			GLU		120.1		2.1
203 | N-CA-CB			GLN		110.6		1.8
204 | CB-CA-C			GLN		110.4		2.0
205 | CA-CB-CG		GLN		113.4		2.2
206 | CB-CG-CD		GLN		111.6		2.6
207 | CG-CD-OE1		GLN		121.6		2.0
208 | CG-CD-NE2		GLN		116.7		2.4
209 | OE1-CD-NE2		GLN		121.9		2.3
210 | N-CA-C			GLN		111.0		2.7
211 | CA-C-O			GLN		120.1		2.1
212 | N-CA-C			GLY		113.1		2.5
213 | CA-C-O			GLY		120.6		1.8
214 | N-CA-CB			HIS		110.6		1.8
215 | CB-CA-C			HIS		110.4		2.0
216 | CA-CB-CG		HIS		113.6		1.7
217 | CB-CG-ND1		HIS		123.2		2.5
218 | CB-CG-CD2		HIS		130.8		3.1
219 | CG-ND1-CE1		HIS		108.2		1.4
220 | ND1-CE1-NE2		HIS		109.9		2.2
221 | CE1-NE2-CD2		HIS		106.6		2.5
222 | NE2-CD2-CG		HIS		109.2		1.9
223 | CD2-CG-ND1		HIS		106.0		1.4
224 | N-CA-C			HIS		111.0		2.7
225 | CA-C-O			HIS		120.1		2.1
226 | N-CA-CB			ILE		110.8		2.3
227 | CB-CA-C			ILE		111.6		2.0
228 | CA-CB-CG1		ILE		111.0		1.9
229 | CB-CG1-CD1		ILE		113.9		2.8
230 | CA-CB-CG2		ILE		110.9		2.0
231 | CG1-CB-CG2		ILE		111.4		2.2
232 | N-CA-C			ILE		111.0		2.7
233 | CA-C-O			ILE		120.1		2.1
234 | N-CA-CB			LEU		110.4		2.0
235 | CB-CA-C			LEU		110.2		1.9
236 | CA-CB-CG		LEU		115.3		2.3
237 | CB-CG-CD1		LEU		111.0		1.7
238 | CB-CG-CD2		LEU		111.0		1.7
239 | CD1-CG-CD2		LEU		110.5		3.0
240 | N-CA-C			LEU		111.0		2.7
241 | CA-C-O			LEU		120.1		2.1
242 | N-CA-CB			LYS		110.6		1.8
243 | CB-CA-C			LYS		110.4		2.0
244 | CA-CB-CG		LYS		113.4		2.2
245 | CB-CG-CD		LYS		111.6		2.6
246 | CG-CD-CE		LYS		111.9		3.0
247 | CD-CE-NZ		LYS		111.7		2.3
248 | N-CA-C			LYS		111.0		2.7
249 | CA-C-O			LYS		120.1		2.1
250 | N-CA-CB			MET		110.6		1.8
251 | CB-CA-C			MET		110.4		2.0
252 | CA-CB-CG		MET		113.3		1.7
253 | CB-CG-SD		MET		112.4		3.0
254 | CG-SD-CE		MET		100.2		1.6
255 | N-CA-C			MET		111.0		2.7
256 | CA-C-O			MET		120.1		2.1
257 | N-CA-CB			PHE		110.6		1.8
258 | CB-CA-C			PHE		110.4		2.0
259 | CA-CB-CG		PHE		113.9		2.4
260 | CB-CG-CD1		PHE		120.8		0.7
261 | CB-CG-CD2		PHE		120.8		0.7
262 | CD1-CG-CD2		PHE		118.3		1.3
263 | CG-CD1-CE1		PHE		120.8		1.1
264 | CG-CD2-CE2		PHE		120.8		1.1
265 | CD1-CE1-CZ		PHE		120.1		1.2
266 | CD2-CE2-CZ		PHE		120.1		1.2
267 | CE1-CZ-CE2		PHE		120.0		1.8
268 | N-CA-C			PHE		111.0		2.7
269 | CA-C-O			PHE		120.1		2.1
270 | N-CA-CB			PRO		103.3		1.2
271 | CB-CA-C			PRO		111.7		2.1
272 | CA-CB-CG		PRO		104.8		1.9
273 | CB-CG-CD		PRO		106.5		3.9
274 | CG-CD-N			PRO		103.2		1.5
275 | CA-N-CD			PRO		111.7		1.4
276 | N-CA-C			PRO		112.1		2.6
277 | CA-C-O			PRO		120.2		2.4
278 | N-CA-CB			SER		110.5		1.5
279 | CB-CA-C			SER		110.1		1.9
280 | CA-CB-OG		SER		111.2		2.7
281 | N-CA-C			SER		111.0		2.7
282 | CA-C-O			SER		120.1		2.1
283 | N-CA-CB			THR		110.3		1.9
284 | CB-CA-C			THR		111.6		2.7
285 | CA-CB-OG1		THR		109.0		2.1
286 | CA-CB-CG2		THR		112.4		1.4
287 | OG1-CB-CG2		THR		110.0		2.3
288 | N-CA-C			THR		111.0		2.7
289 | CA-C-O			THR		120.1		2.1
290 | N-CA-CB			TRP		110.6		1.8
291 | CB-CA-C			TRP		110.4		2.0
292 | CA-CB-CG		TRP		113.7		1.9
293 | CB-CG-CD1		TRP		127.0		1.3
294 | CB-CG-CD2		TRP		126.6		1.3
295 | CD1-CG-CD2		TRP		106.3		0.8
296 | CG-CD1-NE1		TRP		110.1		1.0
297 | CD1-NE1-CE2		TRP		109.0		0.9
298 | NE1-CE2-CD2		TRP		107.3		1.0
299 | CE2-CD2-CG		TRP		107.3		0.8
300 | CG-CD2-CE3		TRP		133.9		0.9
301 | NE1-CE2-CZ2		TRP		130.4		1.1
302 | CE3-CD2-CE2		TRP		118.7		1.2
303 | CD2-CE2-CZ2		TRP		122.3		1.2
304 | CE2-CZ2-CH2		TRP		117.4		1.0
305 | CZ2-CH2-CZ3		TRP		121.6		1.2
306 | CH2-CZ3-CE3		TRP		121.2		1.1
307 | CZ3-CE3-CD2		TRP		118.8		1.3
308 | N-CA-C			TRP		111.0		2.7
309 | CA-C-O			TRP		120.1		2.1
310 | N-CA-CB			TYR		110.6		1.8
311 | CB-CA-C			TYR		110.4		2.0
312 | CA-CB-CG		TYR		113.4		1.9
313 | CB-CG-CD1		TYR		121.0		0.6
314 | CB-CG-CD2		TYR		121.0		0.6
315 | CD1-CG-CD2		TYR		117.9		1.1
316 | CG-CD1-CE1		TYR		121.3		0.8
317 | CG-CD2-CE2		TYR		121.3		0.8
318 | CD1-CE1-CZ		TYR		119.8		0.9
319 | CD2-CE2-CZ		TYR		119.8		0.9
320 | CE1-CZ-CE2		TYR		119.8		1.6
321 | CE1-CZ-OH		TYR		120.1		2.7
322 | CE2-CZ-OH		TYR		120.1		2.7
323 | N-CA-C			TYR		111.0		2.7
324 | CA-C-O			TYR		120.1		2.1
325 | N-CA-CB			VAL		111.5		2.2
326 | CB-CA-C			VAL		111.4		1.9
327 | CA-CB-CG1		VAL		110.9		1.5
328 | CA-CB-CG2		VAL		110.9		1.5
329 | CG1-CB-CG2		VAL		110.9		1.6
330 | N-CA-C			VAL		111.0		2.7
331 | CA-C-O			VAL		120.1		2.1
332 | -
333 | 
334 | Non-bonded distance     Minimum Dist    Tolerance
335 | C-C                     3.4             1.5
336 | C-N                     3.25            1.5
337 | C-S                     3.5             1.5
338 | C-O                     3.22            1.5
339 | N-N                     3.1             1.5
340 | N-S                     3.35            1.5
341 | N-O                     3.07            1.5
342 | O-S                     3.32            1.5
343 | O-O                     3.04            1.5
344 | S-S                     2.03            1.0
345 | -
346 | 


--------------------------------------------------------------------------------
/src/afpdb/myalphafold/model/utils.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2021 DeepMind Technologies Limited
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #      http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | """A collection of JAX utility functions for use in protein folding."""
 16 | 
 17 | import collections
 18 | import contextlib
 19 | import functools
 20 | import numbers
 21 | from typing import Mapping
 22 | 
 23 | import haiku as hk
 24 | import jax
 25 | import jax.numpy as jnp
 26 | import numpy as np
 27 | 
 28 | 
 29 | def bfloat16_creator(next_creator, shape, dtype, init, context):
 30 |   """Creates float32 variables when bfloat16 is requested."""
 31 |   if context.original_dtype == jnp.bfloat16:
 32 |     dtype = jnp.float32
 33 |   return next_creator(shape, dtype, init)
 34 | 
 35 | 
 36 | def bfloat16_getter(next_getter, value, context):
 37 |   """Casts float32 to bfloat16 when bfloat16 was originally requested."""
 38 |   if context.original_dtype == jnp.bfloat16:
 39 |     assert value.dtype == jnp.float32
 40 |     value = value.astype(jnp.bfloat16)
 41 |   return next_getter(value)
 42 | 
 43 | 
 44 | @contextlib.contextmanager
 45 | def bfloat16_context():
 46 |   with hk.custom_creator(bfloat16_creator), hk.custom_getter(bfloat16_getter):
 47 |     yield
 48 | 
 49 | 
 50 | def final_init(config):
 51 |   if config.zero_init:
 52 |     return 'zeros'
 53 |   else:
 54 |     return 'linear'
 55 | 
 56 | 
 57 | def batched_gather(params, indices, axis=0, batch_dims=0):
 58 |   """Implements a JAX equivalent of `tf.gather` with `axis` and `batch_dims`."""
 59 |   take_fn = lambda p, i: jnp.take(p, i, axis=axis, mode='clip')
 60 |   for _ in range(batch_dims):
 61 |     take_fn = jax.vmap(take_fn)
 62 |   return take_fn(params, indices)
 63 | 
 64 | 
 65 | def mask_mean(mask, value, axis=None, drop_mask_channel=False, eps=1e-10):
 66 |   """Masked mean."""
 67 |   if drop_mask_channel:
 68 |     mask = mask[..., 0]
 69 | 
 70 |   mask_shape = mask.shape
 71 |   value_shape = value.shape
 72 | 
 73 |   assert len(mask_shape) == len(value_shape)
 74 | 
 75 |   if isinstance(axis, numbers.Integral):
 76 |     axis = [axis]
 77 |   elif axis is None:
 78 |     axis = list(range(len(mask_shape)))
 79 |   assert isinstance(axis, collections.abc.Iterable), (
 80 |       'axis needs to be either an iterable, integer or "None"')
 81 | 
 82 |   broadcast_factor = 1.
 83 |   for axis_ in axis:
 84 |     value_size = value_shape[axis_]
 85 |     mask_size = mask_shape[axis_]
 86 |     if mask_size == 1:
 87 |       broadcast_factor *= value_size
 88 |     else:
 89 |       assert mask_size == value_size
 90 | 
 91 |   return (jnp.sum(mask * value, axis=axis) /
 92 |           (jnp.sum(mask, axis=axis) * broadcast_factor + eps))
 93 | 
 94 | 
 95 | def flat_params_to_haiku(params, fuse=True):
 96 |   """Convert a dictionary of NumPy arrays to Haiku parameters."""
 97 |   P = {}
 98 |   for path, array in params.items():
 99 |     scope, name = path.split('//')
100 |     if scope not in P:
101 |       P[scope] = {}
102 |     P[scope][name] = jnp.array(array)
103 |   for a in ["evoformer_iteration",
104 |             "extra_msa_stack",
105 |             "template_embedding/single_template_embedding/template_embedding_iteration",
106 |             "template_embedding/single_template_embedding/template_pair_stack/__layer_stack_no_state"]:
107 |     for b in ["triangle_multiplication_incoming","triangle_multiplication_outgoing"]:
108 |       k = f"alphafold/alphafold_iteration/evoformer/{a}/{b}"
109 |       
110 |       if fuse and f"{k}/center_layer_norm" in P:
111 |         for c in ["gate","projection"]:
112 |           L = P.pop(f"{k}/left_{c}")
113 |           R = P.pop(f"{k}/right_{c}")
114 |           P[f"{k}/{c}"] = {}
115 |           for d in ["bias","weights"]:
116 |             P[f"{k}/{c}"][d] = jnp.concatenate([L[d],R[d]],-1)
117 |         P[f"{k}/center_norm"] = P.pop(f"{k}/center_layer_norm")
118 |         P[f"{k}/left_norm_input"] = P.pop(f"{k}/layer_norm_input")
119 |       
120 |       if not fuse and f"{k}/center_norm" in P:
121 |         for c in ["gate","projection"]:
122 |           LR = P.pop(f"{k}/{c}")
123 |           P[f"{k}/left_{c}"] = {}
124 |           P[f"{k}/right_{c}"] = {}
125 |           for d in ["bias","weights"]:
126 |             half = LR[d].shape[-1] // 2
127 |             P[f"{k}/left_{c}"][d] = LR[d][...,:half]
128 |             P[f"{k}/right_{c}"][d] = LR[d][...,half:]
129 |         P[f"{k}/center_layer_norm"] = P.pop(f"{k}/center_norm")
130 |         P[f"{k}/layer_norm_input"] = P.pop(f"{k}/left_norm_input")
131 |   return P
132 | 
def padding_consistent_rng(f):
  """Wrap an element-wise random function so padding does not change values.

  A plain call such as jax.random.normal(key, (10, 10)) does not in general
  produce the same numbers as the leading (10, 10) corner of a larger, padded
  draw. The wrapper returned here derives one key per output element from the
  element's index, so the value at a given index is the same whatever the
  requested shape.

  Note: the wrapped function is likely slower to compile and run than `f`,
  though this is expected to be negligible inside a large network.

  Args:
    f: An element-wise function taking (PRNG key, shape) as its first two
      arguments.

  Returns:
    A function with signature (key, shape, **kwargs) equivalent to `f` but
    consistent across different amounts of padding.
  """
  def grid_keys(key, shape):
    """Build an array of keys of shape `shape`, one per output element.

    Each key is obtained by folding the element's index along every dimension
    into `key`, so it does not depend on the extent of any dimension.

    Args:
      key: A PRNG key.
      shape: Shape of the grid of keys to generate.

    Returns:
      An array of keys with leading dimensions `shape`.
    """
    if not shape:
      return key
    leading, trailing = shape[0], shape[1:]
    fold_index = lambda idx: jax.random.fold_in(key, idx)
    row_keys = jax.vmap(fold_index)(jnp.arange(leading))
    return jax.vmap(lambda row_key: grid_keys(row_key, trailing))(row_keys)

  def inner(key, shape, **kwargs):
    # Draw one scalar per key; the '(2)' core dimension is the key itself.
    draw_scalar = jnp.vectorize(
        lambda key: f(key, shape=(), **kwargs),
        signature='(2)->()')
    return draw_scalar(grid_keys(key, shape))
  return inner
180 | 


--------------------------------------------------------------------------------
/src/afpdb/mycolabdesign/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data2code/afpdb/e18ff796c64ab25e29faf2d091b4f826b7169dcc/src/afpdb/mycolabdesign/__init__.py


--------------------------------------------------------------------------------
/src/afpdb/mycolabdesign/getpdb.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | # from ColabDesign
 3 | # https://colab.research.google.com/github/sokrypton/ColabDesign/blob/main/rf/examples/diffusion.ipynb#scrollTo=pZQnHLuDCsZm
 4 | def get_pdb(pdb_code=None, assembly1=False):
 5 |   if os.path.isfile(pdb_code):
 6 |     return pdb_code
 7 |   elif len(pdb_code) == 4:
 8 |     #fn=f"{pdb_code}.pdb1" if not assembly1 else f"{pdb_code}-assembly1.cif"
 9 |     fn=f"{pdb_code}.cif" if not assembly1 else f"{pdb_code}-assembly1.cif"
10 |     if not os.path.isfile(fn):
11 |       cmd=f"wget -nc --no-check-certificate https://files.rcsb.org/download/{fn}.gz && gunzip {fn}.gz"
12 |       os.system(cmd)
13 |       if not os.path.exists(fn):
14 |         print(f"Fail to download PDB file: {pdb_code}.pdb1!")
15 |         print(cmd)
16 |         return None
17 |     return fn
18 |   else:
19 |     os.system(f"wget -nc --no-check-certificate https://alphafold.ebi.ac.uk/files/AF-{pdb_code}-F1-model_v3.pdb")
20 |     return f"AF-{pdb_code}-F1-model_v3.pdb"
21 | 
# Manual smoke test when run as a script: resolve the 4-character code "1crn"
# and print whatever get_pdb returns (a file name, or None on failure).
if __name__=="__main__":
    print(get_pdb("1crn"))
24 | 
25 | 


--------------------------------------------------------------------------------
/src/afpdb/mycolabdesign/protein.py:
--------------------------------------------------------------------------------
import numpy as np
# `jnp` is an alias for NumPy in this module; JAX itself is not imported here,
# so every `jnp.` call below is a plain NumPy call.
jnp=np

from ..myalphafold.common import residue_constants
from string import ascii_uppercase, ascii_lowercase
# Single-letter chain identifiers: 'A'..'Z' followed by 'a'..'z' (52 entries).
alphabet_list = list(ascii_uppercase+ascii_lowercase)
  7 | 
  8 | MODRES = {'MSE':'MET','MLY':'LYS','FME':'MET','HYP':'PRO',
  9 |           'TPO':'THR','CSO':'CYS','SEP':'SER','M3L':'LYS',
 10 |           'HSK':'HIS','SAC':'SER','PCA':'GLU','DAL':'ALA',
 11 |           'CME':'CYS','CSD':'CYS','OCS':'CYS','DPR':'PRO',
 12 |           'B3K':'LYS','ALY':'LYS','YCM':'CYS','MLZ':'LYS',
 13 |           '4BF':'TYR','KCX':'LYS','B3E':'GLU','B3D':'ASP',
 14 |           'HZP':'PRO','CSX':'CYS','BAL':'ALA','HIC':'HIS',
 15 |           'DBZ':'ALA','DCY':'CYS','DVA':'VAL','NLE':'LEU',
 16 |           'SMC':'CYS','AGM':'ARG','B3A':'ALA','DAS':'ASP',
 17 |           'DLY':'LYS','DSN':'SER','DTH':'THR','GL3':'GLY',
 18 |           'HY3':'PRO','LLP':'LYS','MGN':'GLN','MHS':'HIS',
 19 |           'TRQ':'TRP','B3Y':'TYR','PHI':'PHE','PTR':'TYR',
 20 |           'TYS':'TYR','IAS':'ASP','GPL':'LYS','KYN':'TRP',
 21 |           'CSD':'CYS','SEC':'CYS',
 22 |           ## YZ: addition from MOE models
 23 |           'HID':'HIS', # This refers to histidine with the proton on the delta nitrogen (Nδ).
 24 |           'HIE':'HIS', # This refers to histidine with the proton on the epsilon nitrogen (Nε).
 25 |           'HIP':'HIS', # This refers to histidine with both nitrogens (Nδ and Nε) protonated, giving it a +1 charge
 26 |          }
 27 | 
 28 | def pdb_to_string(pdb_file, chains=None, models=None):
 29 |   '''read pdb file and return as string'''
 30 | 
 31 |   if chains is not None:
 32 |     if "," in chains: chains = chains.split(",")
 33 |     if not isinstance(chains,list): chains = [chains]
 34 |   if models is not None:
 35 |     if not isinstance(models,list): models = [models]
 36 | 
 37 |   modres = {**MODRES}
 38 |   lines = []
 39 |   seen = []
 40 |   model = 1
 41 | 
 42 |   if "\n" in pdb_file:
 43 |     old_lines = pdb_file.split("\n")
 44 |   else:
 45 |     with open(pdb_file,"rb") as f:
 46 |       old_lines = [line.decode("utf-8","ignore").rstrip() for line in f]
 47 |   for line in old_lines:
 48 |     if line[:5] == "MODEL":
 49 |       model = int(line[5:])
 50 |     if models is None or model in models:
 51 |       if line[:6] == "MODRES":
 52 |         k = line[12:15]
 53 |         v = line[24:27]
 54 |         if k not in modres and v in residue_constants.restype_3to1:
 55 |           modres[k] = v
 56 |       if line[:6] == "HETATM":
 57 |         k = line[17:20]
 58 |         if k in modres:
 59 |           line = "ATOM  "+line[6:17]+modres[k]+line[20:]
 60 |       if line[:4] == "ATOM":
 61 |         chain = line[21:22]
 62 |         if chains is None or chain in chains:
 63 |           atom = line[12:12+4].strip()
 64 |           resi = line[17:17+3]
 65 |           resn = line[22:22+5].strip()
 66 |           # Let's support insertion
 67 |           #if resn[-1].isalpha(): # alternative atom
 68 |             # Let's support insertion
 69 |             #YZ: rather raise an exception
 70 |             #raise ValueError(
 71 |             #print("Warning:"
 72 |             #f'PDB contains an insertion code at chain {chain} and residue '
 73 |             #f'index {resn}. These are not supported.')
 74 |             ##resn = resn[:-1]
 75 |             #line = line[:26]+" "+line[27:]
 76 |           key = f"{model}_{chain}_{resn}_{resi}_{atom}"
 77 |           if key not in seen: # skip alternative placements
 78 |             lines.append(line)
 79 |             seen.append(key)
 80 |       if line[:5] == "MODEL" or line[:3] == "TER" or line[:6] == "ENDMDL":
 81 |         lines.append(line)
 82 |   return "\n".join(lines)
 83 | 
 84 | def renum_pdb_str(pdb_str, Ls=None, renum=True, offset=1):
 85 |   if Ls is not None:
 86 |     L_init = 0
 87 |     new_chain = {}
 88 |     for L,c in zip(Ls, alphabet_list):
 89 |       new_chain.update({i:c for i in range(L_init,L_init+L)})
 90 |       L_init += L
 91 | 
 92 |   n,num,pdb_out = 0,offset,[]
 93 |   resnum_ = None
 94 |   chain_ = None
 95 |   new_chain_ = new_chain[0]
 96 |   for line in pdb_str.split("\n"):
 97 |     if line[:4] == "ATOM":
 98 |       chain = line[21:22]
 99 |       resnum = int(line[22:22+5])
100 |       if resnum_ is None: resnum_ = resnum
101 |       if chain_ is None: chain_ = chain
102 |       if resnum != resnum_ or chain != chain_:
103 |         num += (resnum - resnum_)
104 |         n += 1
105 |         resnum_,chain_ = resnum,chain
106 |       if Ls is not None:
107 |         if new_chain[n] != new_chain_:
108 |           num = offset
109 |           new_chain_ = new_chain[n]
110 |       N = num if renum else resnum
111 |       if Ls is None: pdb_out.append("%s%4i%s" % (line[:22],N,line[26:]))
112 |       else: pdb_out.append("%s%s%4i%s" % (line[:21],new_chain[n],N,line[26:]))
113 |   return "\n".join(pdb_out)
114 | 
115 | #################################################################################
116 | 
117 | def _np_len_pw(x, use_jax=True):
118 |   '''compute pairwise distance'''
119 |   _np = jnp if use_jax else np
120 | 
121 |   x_norm = _np.square(x).sum(-1)
122 |   xx = _np.einsum("...ia,...ja->...ij",x,x)
123 |   sq_dist = x_norm[...,:,None] + x_norm[...,None,:] - 2 * xx
124 | 
125 |   # due to precision errors the values can sometimes be negative
126 |   if use_jax: sq_dist = jax.nn.relu(sq_dist)
127 |   else: sq_dist[sq_dist < 0] = 0
128 | 
129 |   # return euclidean pairwise distance matrix
130 |   return _np.sqrt(sq_dist + 1e-8)
131 | 
def _np_rmsdist(true, pred, use_jax=True):
  '''Root-mean-square difference between two pairwise distance matrices.

  The distance matrices of `true` and `pred` are computed with `_np_len_pw`;
  the result is sqrt(mean((D_true - D_pred)^2) + 1e-8), where the mean is taken
  over all elements.
  '''
  _np = jnp if use_jax else np
  dist_true = _np_len_pw(true, use_jax=use_jax)
  dist_pred = _np_len_pw(pred, use_jax=use_jax)
  mean_sq_err = _np.square(dist_true - dist_pred).mean()
  return _np.sqrt(mean_sq_err + 1e-8)
138 | 
139 | def _np_kabsch(a, b, return_v=False, use_jax=True):
140 |   '''get alignment matrix for two sets of coodinates'''
141 |   _np = jnp if use_jax else np
142 |   ab = a.swapaxes(-1,-2) @ b
143 |   u, s, vh = _np.linalg.svd(ab, full_matrices=False)
144 |   flip = _np.linalg.det(u @ vh) < 0
145 |   u_ = _np.where(flip, -u[...,-1].T, u[...,-1].T).T
146 |   if use_jax: u = u.at[...,-1].set(u_)
147 |   else: u[...,-1] = u_
148 |   return u if return_v else (u @ vh)
149 | 
def _np_rmsd(true, pred, use_jax=True):
  '''RMSD between two coordinate sets after centering and alignment.

  Both inputs are centered on their mean over the points axis (-2); `true` is
  then multiplied by the matrix from `_np_kabsch` and compared with `pred`.
  Returns sqrt(mean_over_points(sum_over_coords(diff^2)) + 1e-8).
  '''
  _np = jnp if use_jax else np
  true_centered = true - true.mean(-2,keepdims=True)
  pred_centered = pred - pred.mean(-2,keepdims=True)
  rotation = _np_kabsch(true_centered, pred_centered, use_jax=use_jax)
  aligned = true_centered @ rotation
  sq_dev_per_point = _np.square(aligned - pred_centered).sum(-1)
  return _np.sqrt(sq_dev_per_point.mean(-1) + 1e-8)
157 | 
158 | def _np_norm(x, axis=-1, keepdims=True, eps=1e-8, use_jax=True):
159 |   '''compute norm of vector'''
160 |   _np = jnp if use_jax else np
161 |   return _np.sqrt(_np.square(x).sum(axis,keepdims=keepdims) + 1e-8)
162 | 
def _np_len(a, b, use_jax=True):
  '''Distance between coordinates a and b: `_np_norm` of their difference.'''
  separation = a - b
  return _np_norm(separation, use_jax=use_jax)
166 | 
def _np_ang(a, b, c, use_acos=False, use_jax=True):
  '''Angle at b formed by coordinates a-b-c.

  Returns the cosine of the angle (last axis kept with size 1) unless
  `use_acos` is set, in which case the angle itself is returned via arccos.
  '''
  _np = jnp if use_jax else np
  ba = b - a
  bc = b - c
  dot = (ba * bc).sum(-1,keepdims=True)
  lengths = _np_norm(ba, use_jax=use_jax) * _np_norm(bc, use_jax=use_jax)
  cos_ang = dot / lengths
  # note the derivative at acos(-1 or 1) is inf, to avoid nans we use cos(ang)
  if not use_acos:
    return cos_ang
  return _np.arccos(cos_ang)
176 | 
def _np_dih(a, b, c, d, use_atan2=False, standardize=False, use_jax=True):
  '''Dihedral defined by coordinates a-b-c-d.

  Returns:
    With `use_atan2`, the angle arctan2(sin, cos) (last axis of size 1).
    Otherwise the pair [sin, cos] concatenated on the last axis, divided by
    its `_np_norm` when `standardize` is set.
  '''
  _np = jnp if use_jax else np

  def unit(v):
    return v/_np_norm(v, use_jax=use_jax)

  ab = unit(a-b)
  bc = unit(b-c)
  cd = unit(c-d)
  # Normals of the planes (a,b,c) and (b,c,d).
  n1 = _np.cross(ab, bc)
  n2 = _np.cross(bc, cd)
  sin_ang = (_np.cross(n1, bc) * n2).sum(-1,keepdims=True)
  cos_ang = (n1 * n2).sum(-1,keepdims=True)
  if use_atan2:
    return _np.arctan2(sin_ang, cos_ang)
  sin_cos = _np.concatenate([sin_ang, cos_ang],-1)
  return unit(sin_cos) if standardize else sin_cos
191 | 
def _np_extend(a,b,c, L,A,D, use_jax=True):
  '''
  Place a fourth coordinate d from three coordinates a-b-c and internal
  coordinates: c-d (L)ength, b-c-d (A)ngle, and a-b-c-d (D)ihedral.
  A and D are passed straight to cos/sin.
  Returns the coordinate d.
  '''
  _np = jnp if use_jax else np

  def unit(v):
    return v/_np_norm(v, use_jax=use_jax)

  bc = unit(b-c)
  n = unit(_np.cross(b-a, bc))
  # Components of c->d along bc, along n x bc, and along -n.
  along_bc = L * _np.cos(A) * bc
  in_plane = L * _np.sin(A) * _np.cos(D) * _np.cross(n, bc)
  out_of_plane = L * _np.sin(A) * _np.sin(D) * -n
  return c + sum([along_bc, in_plane, out_of_plane])
205 | 
def _np_get_cb(N,CA,C, use_jax=True):
  '''Compute the CB placement from N, CA, C via `_np_extend`.

  Uses fixed internal coordinates (length 1.522, angle 1.927,
  dihedral -2.143) relative to the C-N-CA frame.
  '''
  cb_length, cb_angle, cb_dihedral = 1.522, 1.927, -2.143
  return _np_extend(C, N, CA, cb_length, cb_angle, cb_dihedral, use_jax=use_jax)
209 | 
def _np_get_6D(all_atom_positions, all_atom_mask=None, use_jax=True, for_trrosetta=False):
  '''Get 6D features (see TrRosetta paper).

  Args:
    all_atom_positions: coordinates indexed as [..., atom, xyz], where the
      atom axis follows `residue_constants.atom_order`.
    all_atom_mask: optional mask indexed as [..., atom]; when given, the
      product of the N, CA, C entries is returned as "CB_mask".
    use_jax: forwarded to the geometry helpers.
    for_trrosetta: if true, "phi" is returned as an angle (arccos) and
      "omega"/"theta" as angles (arctan2); otherwise "phi" is a cosine and
      "omega"/"theta" are [sin, cos] pairs.

  Returns:
    Dict with "N", "CA", "C", "CB" coordinates, optional "CB_mask", and the
    pairwise features "dist", "phi", "omega", "theta".
  '''

  # get CB coordinate
  atom_idx = {name: residue_constants.atom_order[name] for name in ("N","CA","C")}
  out = {}
  for name, col in atom_idx.items():
    out[name] = all_atom_positions[...,col,:]
  out["CB"] = _np_get_cb(N=out["N"], CA=out["CA"], C=out["C"], use_jax=use_jax)

  if all_atom_mask is not None:
    backbone_cols = np.fromiter(atom_idx.values(),int)
    out["CB_mask"] = all_atom_mask[...,backbone_cols].prod(-1)

  # get pairwise features: index 0 varies along rows, index 1 along columns
  N, A, B = out["N"], out["CA"], out["CB"]
  n0 = N[...,:,None,:]
  a0 = A[...,:,None,:]
  a1 = A[...,None,:,:]
  b0 = B[...,:,None,:]
  b1 = B[...,None,:,:]

  as_angles = True if for_trrosetta else False
  pairwise = {
      "dist":  _np_len(b0,b1,       use_jax=use_jax),
      "phi":   _np_ang(a0,b0,b1,    use_jax=use_jax, use_acos=as_angles),
      "omega": _np_dih(a0,b0,b1,a1, use_jax=use_jax, use_atan2=as_angles),
      "theta": _np_dih(n0,a0,b0,b1, use_jax=use_jax, use_atan2=as_angles),
  }
  out.update(pairwise)
  return out
239 | 
240 | ####################
241 | # losses
242 | ####################
243 | 
244 | # RMSD
def jnp_rmsdist(true, pred):
  '''RMS difference of pairwise distance matrices; `_np_rmsdist` with defaults.'''
  return _np_rmsdist(true=true, pred=pred)
247 | 
def jnp_rmsd(true, pred, add_dist=False):
  '''Aligned RMSD from `_np_rmsd`; with `add_dist`, averaged with `_np_rmsdist`.'''
  aligned_rmsd = _np_rmsd(true, pred)
  if not add_dist:
    return aligned_rmsd
  return (aligned_rmsd + _np_rmsdist(true, pred))/2
252 | 
def jnp_kabsch_w(a, b, weights):
  '''Weighted alignment matrix: `_np_kabsch` with each row of `a` scaled by its weight.'''
  weighted_a = a * weights[:,None]
  return _np_kabsch(weighted_a, b)
255 | 
def jnp_rmsd_w(true, pred, weights):
  '''Weighted RMSD after weighted centering and alignment.

  Both coordinate sets are centered on their weighted mean over axis 0,
  `true` is aligned onto `pred` with a weighted `_np_kabsch`, and the result
  is sqrt(sum(w * squared_deviation) / sum(w) + 1e-8).
  '''
  w_col = weights[:,None]
  true_centered = true - (true * w_col).sum(0,keepdims=True)/weights.sum()
  pred_centered = pred - (pred * w_col).sum(0,keepdims=True)/weights.sum()
  aligned = true_centered @ _np_kabsch(true_centered * w_col, pred_centered)
  weighted_sq_dev = (weights*jnp.square(aligned-pred_centered).sum(-1)).sum()
  return jnp.sqrt(weighted_sq_dev/weights.sum() + 1e-8)
261 | 
262 | # 6D (see TrRosetta paper)
def _np_get_6D_loss(true, pred, mask=None, use_theta=True, use_dist=False, use_jax=True):
  '''Loss between the 6D features of `true` and `pred` coordinates.

  Features come from `_np_get_6D`; "dist" is divided by 10 before use. The
  squared differences of "omega" and "phi" (plus "theta" / "dist" when
  enabled) are summed, averaged over residue pairs weighted by the pair mask,
  and the mean of sqrt(loss + 1e-8) is returned. When `mask` is None every
  residue counts; otherwise the "CB_mask" of `true` is used.
  '''
  _np = jnp if use_jax else np

  feats_true = _np_get_6D(true, mask, use_jax=use_jax)
  feats_pred = _np_get_6D(pred, use_jax=use_jax)

  # Rescale distances in place for both feature sets.
  for feats in (feats_true, feats_pred):
    feats["dist"] /= 10.0

  keys = ["omega","phi"]
  if use_theta:
    keys.append("theta")
  if use_dist:
    keys.append("dist")
  sq_diff = sum([_np.square(feats_true[k]-feats_pred[k]).sum(-1) for k in keys])

  if mask is None:
    residue_mask = _np.ones(true.shape[0])
  else:
    residue_mask = feats_true["CB_mask"]
  pair_mask = residue_mask[:,None] * residue_mask[None,:]
  loss = (sq_diff * pair_mask).sum((-1,-2)) / pair_mask.sum((-1,-2))

  return _np.sqrt(loss + 1e-8).mean()
281 | 
def _np_get_6D_binned(all_atom_positions, all_atom_mask, use_jax=None):
  '''One-hot binned 6D features computed with NumPy.

  Args:
    all_atom_positions, all_atom_mask: forwarded to `_np_get_6D`
      (called with use_jax=False, for_trrosetta=True).
    use_jax: currently unused; kept for interface compatibility.

  Returns:
    Dict with one-hot arrays "dist" (37 bins over 2..20), "omega" and
    "theta" (25 bins over -pi..pi) and "phi" (13 bins over 0..pi). Pairs with
    distance > 20 and the diagonal are assigned to bin 0.
  '''
  # TODO: make differentiable, add use_jax option
  ref = _np_get_6D(all_atom_positions,
                   all_atom_mask,
                   use_jax=False, for_trrosetta=True)
  # Drop size-1 axes from every feature. This replaces jax.tree_map, which
  # referenced the `jax` module that this file never imports.
  ref = {name: np.squeeze(feature) for name, feature in ref.items()}

  def mtx2bins(x_ref, start, end, nbins, mask):
    bins = np.linspace(start, end, nbins)
    x_true = np.digitize(x_ref, bins).astype(np.uint8)
    x_true = np.where(mask,0,x_true)
    # One-hot encode over nbins+1 classes, then drop the last (overflow) class.
    return np.eye(nbins+1)[x_true][...,:-1]

  mask = (ref["dist"] > 20) | (np.eye(ref["dist"].shape[0]) == 1)
  return {"dist": mtx2bins(ref["dist"],    2.0,  20.0,  37,  mask=mask),
          "omega":mtx2bins(ref["omega"], -np.pi, np.pi, 25,  mask=mask),
          "theta":mtx2bins(ref["theta"], -np.pi, np.pi, 25,  mask=mask),
          "phi":  mtx2bins(ref["phi"],      0.0, np.pi, 13,  mask=mask)}
300 | 


--------------------------------------------------------------------------------
/src/afpdb/mycolabdesign/utils.py:
--------------------------------------------------------------------------------
  1 | from string import ascii_uppercase, ascii_lowercase
  2 | alphabet_list = list(ascii_uppercase+ascii_lowercase)
  3 | import numpy as np
  4 | 
  5 | def fix_partial_contigs(contigs, parsed_pdb):
  6 |   INF = float("inf")
  7 | 
  8 |   # get unique chains
  9 |   chains = []
 10 |   for c, i in parsed_pdb["pdb_idx"]:
 11 |     if c not in chains: chains.append(c)
 12 | 
 13 |   # get observed positions and chains
 14 |   ok = []
 15 |   for contig in contigs:
 16 |     for x in contig.split("/"):
 17 |       if x[0].isalpha:
 18 |         C,x = x[0],x[1:]
 19 |         S,E = -INF,INF
 20 |         if x.startswith("-"):
 21 |           E = int(x[1:])
 22 |         elif x.endswith("-"):
 23 |           S = int(x[:-1])
 24 |         elif "-" in x:
 25 |           (S,E) = (int(y) for y in x.split("-"))
 26 |         elif x.isnumeric():
 27 |           S = E = int(x)
 28 |         for c, i in parsed_pdb["pdb_idx"]:
 29 |           if c == C and i >= S and i <= E:
 30 |             if [c,i] not in ok: ok.append([c,i])
 31 | 
 32 |   # define new contigs
 33 |   new_contigs = []
 34 |   for C in chains:
 35 |     new_contig = []
 36 |     unseen = []
 37 |     seen = []
 38 |     for c,i in parsed_pdb["pdb_idx"]:
 39 |       if c == C:
 40 |         if [c,i] in ok:
 41 |           L = len(unseen)
 42 |           if L > 0:
 43 |             new_contig.append(f"{L}-{L}")
 44 |             unseen = []
 45 |    #       #YZ: in case residue numbering jumps
 46 |    #       elif len(seen)>0 and seen[-1][1]!=i-1:
 47 |    #           new_contig.append(f"{seen[0][0]}{seen[0][1]}-{seen[-1][1]}")
 48 |    #           seen = []
 49 |    #       ##
 50 |           seen.append([c,i])
 51 |         else:
 52 |           L = len(seen)
 53 |           if L > 0:
 54 |             new_contig.append(f"{seen[0][0]}{seen[0][1]}-{seen[-1][1]}")
 55 |             seen = []
 56 |           unseen.append([c,i])
 57 | 
 58 |     L = len(unseen)
 59 |     if L > 0:
 60 |       new_contig.append(f"{L}-{L}")
 61 |     L = len(seen)
 62 |     if L > 0:
 63 |       new_contig.append(f"{seen[0][0]}{seen[0][1]}-{seen[-1][1]}")
 64 |     new_contigs.append("/".join(new_contig))
 65 | 
 66 |   return new_contigs
 67 | 
 68 | def fix_contigs(contigs,parsed_pdb):
 69 |   def fix_contig(contig):
 70 |     INF = float("inf")
 71 |     X = contig.split("/")
 72 |     Y = []
 73 |     for n,x in enumerate(X):
 74 |       if x[0].isalpha():
 75 |         C,x = x[0],x[1:]
 76 |         S,E = -INF,INF
 77 |         if x.startswith("-"):
 78 |           E = int(x[1:])
 79 |         elif x.endswith("-"):
 80 |           S = int(x[:-1])
 81 |         elif "-" in x:
 82 |           (S,E) = (int(y) for y in x.split("-"))
 83 |         elif x.isnumeric():
 84 |           S = E = int(x)
 85 |         new_x = ""
 86 |         c_,i_ = None,0
 87 |         for c, i in parsed_pdb["pdb_idx"]:
 88 |           if c == C and i >= S and i <= E:
 89 |             if c_ is None:
 90 |               new_x = f"{c}{i}"
 91 |             else:
 92 |               if c != c_ or i != i_+1:
 93 |                 new_x += f"-{i_}/{c}{i}"
 94 |             c_,i_ = c,i
 95 |         Y.append(new_x + f"-{i_}")
 96 |       elif "-" in x:
 97 |         # sample length
 98 |         s,e = x.split("-")
 99 |         m = np.random.randint(int(s),int(e)+1)
100 |         Y.append(f"{m}-{m}")
101 |       elif x.isnumeric() and x != "0":
102 |         Y.append(f"{x}-{x}")
103 |     return "/".join(Y)
104 |   return [fix_contig(x) for x in contigs]
105 | 
def fix_pdb(pdb_str, contigs):
  """Renumber and re-chain the ATOM records of a PDB string to follow `contigs`.

  Each contig becomes one output chain, lettered from `alphabet_list` in
  contig order. Residue numbers are regenerated from the contig segments;
  a gap is left between two consecutive segments of the same source chain.
  Only ATOM, MODEL, TER and ENDMDL lines are kept; the residue counter
  restarts after every MODEL/TER/ENDMDL line.

  Args:
    pdb_str: PDB file content as a single string.
    contigs: list of contig strings whose segments all have the form
      "<chain><first>-<last>" or "<n>-<n>".

  Returns:
    The rewritten PDB content, lines joined with "\\n".
  """
  def get_range(contig):
    cursor = 1
    numbers = []
    parts = [seg.split("-") for seg in contig.split("/")]
    for idx,(first,last) in enumerate(parts):
      if first[0].isalpha():
        if idx > 0:
          prev_first,prev_last = parts[idx-1]
          # keep the numbering gap between two segments of the same chain
          if prev_first[0].isalpha() and first[0] == prev_first[0]:
            cursor += int(first[1:]) - int(prev_last) - 1
        length = int(last)-int(first[1:]) + 1
      else:
        length = int(last)
      numbers.extend(range(cursor,cursor+length))
      cursor += length
    return numbers

  contig_ranges = [get_range(contig) for contig in contigs]
  new_resnum,new_chain = [],[]
  for chain_no,numbers in enumerate(contig_ranges):
    new_resnum.extend(numbers)
    new_chain.extend([alphabet_list[chain_no]] * len(numbers))

  out_lines = []
  prev_res = prev_chain = None
  pos = 0
  for line in pdb_str.split("\n"):
    if line.startswith("ATOM"):
      chain = line[21:22]
      resnum = int(line[22:27])
      if prev_res is None:
        prev_res = resnum
      if prev_chain is None:
        prev_chain = chain
      # a change of residue number or chain id means the next residue started
      if resnum != prev_res or chain != prev_chain:
        pos += 1
        prev_res,prev_chain = resnum,chain
      out_lines.append("%s%s%4i%s" % (line[:21],new_chain[pos],new_resnum[pos],line[26:]))
    if line.startswith(("MODEL","TER","ENDMDL")):
      out_lines.append(line)
      prev_res = prev_chain = None
      pos = 0
  return "\n".join(out_lines)
146 | 
def get_ca(pdb_filename, get_bfact=False):
  """Read the CA coordinates (and optionally B-factors) from a PDB file.

  Only lines starting with "ATOM" whose atom-name field (columns 13-16)
  strips to "CA" are used.

  Args:
    pdb_filename: path of the PDB file to read.
    get_bfact: when true, also return the B-factor column (columns 61-66).

  Returns:
    `np.array` of [x, y, z] rows, one per CA atom; or the tuple
    `(xyz, bfact)` when `get_bfact` is true.
  """
  xyz = []
  bfact = []
  # context manager so the file handle is closed deterministically
  with open(pdb_filename, "r") as handle:
    for line in handle:
      line = line.rstrip()
      if line[:4] != "ATOM":
        continue
      if line[12:16].strip() != "CA":
        continue
      xyz.append([float(line[30:38]), float(line[38:46]), float(line[46:54])])
      if get_bfact:
        bfact.append(float(line[60:66].strip()))
  if get_bfact:
    return np.array(xyz), np.array(bfact)
  return np.array(xyz)
166 | 
def get_Ls(contigs):
  """Total residue count described by each contig.

  Every "/"-separated segment must have the form "<a>-<b>". A segment whose
  first character is a letter ("A10-20") contributes b - start + 1 residues;
  any other segment ("5-5") contributes b residues.

  Args:
    contigs: list of contig strings.

  Returns:
    List of integer lengths, one per contig.
  """
  def segment_length(segment):
    a,b = segment.split("-")
    if a[0].isalpha():
      return int(b)-int(a[1:]) + 1
    return int(b)

  return [sum(segment_length(seg) for seg in contig.split("/"))
          for contig in contigs]
178 | 
179 | 


--------------------------------------------------------------------------------
/src/afpdb/mycolabfold/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data2code/afpdb/e18ff796c64ab25e29faf2d091b4f826b7169dcc/src/afpdb/mycolabfold/__init__.py


--------------------------------------------------------------------------------
/src/afpdb/mycolabfold/utils.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | from Bio.PDB import MMCIFIO
  3 | from Bio.PDB.Polypeptide import standard_aa_names
  4 | #YZ
  5 | from collections import defaultdict
  6 | #
  7 | 
# mmCIF text written verbatim by CFMMCIFIO._save_dict: a one-row
# `_pdbx_audit_revision_history` loop with a hard-coded revision date.
CIF_REVISION_DATE = """loop_
_pdbx_audit_revision_history.ordinal
_pdbx_audit_revision_history.data_content_type
_pdbx_audit_revision_history.major_revision
_pdbx_audit_revision_history.minor_revision
_pdbx_audit_revision_history.revision_date
1 'Structure model' 1 0 1971-01-01
#\n"""
 16 | 
 17 | ### begin section copied from Bio.PDB
# Preferred column order per mmCIF category. CFMMCIFIO._save_dict sorts the
# keys of a listed category by their position here; keys that are not listed
# are placed after all listed ones.
mmcif_order = {
    "_atom_site": [
        "group_PDB",
        "id",
        "type_symbol",
        "label_atom_id",
        "label_alt_id",
        "label_comp_id",
        "label_asym_id",
        "label_entity_id",
        "label_seq_id",
        "pdbx_PDB_ins_code",
        "Cartn_x",
        "Cartn_y",
        "Cartn_z",
        "occupancy",
        "B_iso_or_equiv",
        "pdbx_formal_charge",
        "auth_seq_id",
        "auth_comp_id",
        "auth_asym_id",
        "auth_atom_id",
        "pdbx_PDB_model_num",
    ]
}
 43 | 
## YZ
# Module-level switch read by CFMMCIFIO.
# if True: keep the original chain name in PDB, i.e. label_asym_id is set to
#   the chain id (see https://github.com/sokrypton/ColabFold/issues/449)
# if False: rename chain to A, B, C, but add a mapping section in cif to fix AF error:
#   the _struct_asym rows are written with the generated label_asym_id
#   (see https://github.com/speleo3/ColabFold/commit/68e7090c0c2401257e4a9392370f66b28ac82543)
_KEEP_CHAIN=False
##
 51 | 
 52 | class CFMMCIFIO(MMCIFIO):
 53 |     def _save_dict(self, out_file):
 54 |         ## YZ
 55 |         # alternatively, use fix provided by
 56 |         #  https://github.com/speleo3/ColabFold/commit/68e7090c0c2401257e4a9392370f66b28ac82543
 57 |         if not _KEEP_CHAIN:
 58 |             asym_id_auth_to_label = dict(
 59 |                 zip(self.dic.get("_atom_site.auth_asym_id", ()),
 60 |                     self.dic.get("_atom_site.label_asym_id", ())))
 61 |         ##
 62 |         # Form dictionary where key is first part of mmCIF key and value is list
 63 |         # of corresponding second parts
 64 |         key_lists = {}
 65 |         for key in self.dic:
 66 |             if key == "data_":
 67 |                 data_val = self.dic[key]
 68 |             else:
 69 |                 s = re.split(r"\.", key)
 70 |                 if len(s) == 2:
 71 |                     if s[0] in key_lists:
 72 |                         key_lists[s[0]].append(s[1])
 73 |                     else:
 74 |                         key_lists[s[0]] = [s[1]]
 75 |                 else:
 76 |                     raise ValueError("Invalid key in mmCIF dictionary: " + key)
 77 | 
 78 |         # Re-order lists if an order has been specified
 79 |         # Not all elements from the specified order are necessarily present
 80 |         for key, key_list in key_lists.items():
 81 |             if key in mmcif_order:
 82 |                 inds = []
 83 |                 for i in key_list:
 84 |                     try:
 85 |                         inds.append(mmcif_order[key].index(i))
 86 |                     # Unrecognised key - add at end
 87 |                     except ValueError:
 88 |                         inds.append(len(mmcif_order[key]))
 89 |                 key_lists[key] = [k for _, k in sorted(zip(inds, key_list))]
 90 | 
 91 |         # Write out top data_ line
 92 |         if data_val:
 93 |             out_file.write("data_" + data_val + "\n#\n")
 94 |             ### end section copied from Bio.PDB
 95 |             # Add poly_seq as default MMCIFIO doesn't handle this
 96 |             out_file.write(
 97 |                 """loop_
 98 | _entity_poly_seq.entity_id
 99 | _entity_poly_seq.num
100 | _entity_poly_seq.mon_id
101 | _entity_poly_seq.hetero
102 | #\n"""
103 |             )
104 |             poly_seq = []
105 |             chain_idx = 1
106 |             for model in self.structure:
107 |                 for chain in model:
108 |                     res_idx = 1
109 |                     for residue in chain:
110 |                         poly_seq.append(
111 |                             (chain_idx, res_idx, residue.get_resname(), "n")
112 |                         )
113 |                         res_idx += 1
114 |                     chain_idx += 1
115 |             for seq in poly_seq:
116 |                 out_file.write(f"{seq[0]} {seq[1]} {seq[2]}  {seq[3]}\n")
117 |             out_file.write("#\n")
118 |             out_file.write(
119 |                 """loop_
120 | _chem_comp.id
121 | _chem_comp.type
122 | #\n"""
123 |             )
124 |             for three in standard_aa_names:
125 |                 out_file.write(f'{three} "peptide linking"\n')
126 |             out_file.write("#\n")
127 |             out_file.write(
128 |                 """loop_
129 | _struct_asym.id
130 | _struct_asym.entity_id
131 | #\n"""
132 |             )
133 |             chain_idx = 1
134 |             for model in self.structure:
135 |                 for chain in model:
136 |                     ## YZ
137 |                     if _KEEP_CHAIN:
138 |                         out_file.write(f"{chain.get_id()} {chain_idx}\n")
139 |                     else:
140 |                         label_asym_id = asym_id_auth_to_label[chain.get_id()]
141 |                         out_file.write(f"{label_asym_id} {chain_idx}\n")
142 |                     ####
143 |                     chain_idx += 1
144 |             out_file.write("#\n")
145 | 
146 |         ### begin section copied from Bio.PDB
147 |         for key, key_list in key_lists.items():
148 |             # Pick a sample mmCIF value, which can be a list or a single value
149 |             sample_val = self.dic[key + "." + key_list[0]]
150 |             n_vals = len(sample_val)
151 |             # Check the mmCIF dictionary has consistent list sizes
152 |             for i in key_list:
153 |                 val = self.dic[key + "." + i]
154 |                 if (
155 |                     isinstance(sample_val, list)
156 |                     and (isinstance(val, str) or len(val) != n_vals)
157 |                 ) or (isinstance(sample_val, str) and isinstance(val, list)):
158 |                     raise ValueError(
159 |                         "Inconsistent list sizes in mmCIF dictionary: " + key + "." + i
160 |                     )
161 |             # If the value is a single value, write as key-value pairs
162 |             if isinstance(sample_val, str) or (
163 |                 isinstance(sample_val, list) and len(sample_val) == 1
164 |             ):
165 |                 m = 0
166 |                 # Find the maximum key length
167 |                 for i in key_list:
168 |                     if len(i) > m:
169 |                         m = len(i)
170 |                 for i in key_list:
171 |                     # If the value is a single item list, just take the value
172 |                     if isinstance(sample_val, str):
173 |                         value_no_list = self.dic[key + "." + i]
174 |                     else:
175 |                         value_no_list = self.dic[key + "." + i][0]
176 |                     out_file.write(
177 |                         "{k: <{width}}".format(k=key + "." + i, width=len(key) + m + 4)
178 |                         + self._format_mmcif_col(value_no_list, len(value_no_list))
179 |                         + "\n"
180 |                     )
181 |             # If the value is more than one value, write as keys then a value table
182 |             elif isinstance(sample_val, list):
183 |                 out_file.write("loop_\n")
184 |                 col_widths = {}
185 |                 # Write keys and find max widths for each set of values
186 |                 for i in key_list:
187 |                     out_file.write(key + "." + i + "\n")
188 |                     col_widths[i] = 0
189 |                     for val in self.dic[key + "." + i]:
190 |                         len_val = len(val)
191 |                         # If the value requires quoting it will add 2 characters
192 |                         if self._requires_quote(val) and not self._requires_newline(
193 |                             val
194 |                         ):
195 |                             len_val += 2
196 |                         if len_val > col_widths[i]:
197 |                             col_widths[i] = len_val
198 |                 # Technically the max of the sum of the column widths is 2048
199 | 
200 |                 # Write the values as rows
201 |                 for i in range(n_vals):
202 |                     for col in key_list:
203 |                         out_file.write(
204 |                             self._format_mmcif_col(
205 |                                 self.dic[key + "." + col][i], col_widths[col] + 1
206 |                             )
207 |                         )
208 |                     out_file.write("\n")
209 |             else:
210 |                 raise ValueError(
211 |                     "Invalid type in mmCIF dictionary: " + str(type(sample_val))
212 |                 )
213 |             out_file.write("#\n")
214 |             ### end section copied from Bio.PDB
215 |             out_file.write(CIF_REVISION_DATE)
216 | 
217 |     # Preserve chain_id
218 |     # https://github.com/sokrypton/ColabFold/issues/449
219 |     def _save_structure(self, out_file, select, preserve_atom_numbering):
220 |         atom_dict = defaultdict(list)
221 | 
222 |         for model in self.structure.get_list():
223 |             if not select.accept_model(model):
224 |                 continue
225 |             # mmCIF files with a single model have it specified as model 1
226 |             if model.serial_num == 0:
227 |                 model_n = "1"
228 |             else:
229 |                 model_n = str(model.serial_num)
230 |             # This is used to write label_entity_id and label_asym_id and
231 |             # increments from 1, changing with each molecule
232 |             entity_id = 0
233 |             if not preserve_atom_numbering:
234 |                 atom_number = 1
235 |             for chain in model.get_list():
236 |                 if not select.accept_chain(chain):
237 |                     continue
238 |                 chain_id = chain.get_id()
239 |                 if chain_id == " ":
240 |                     chain_id = "."
241 |                 # This is used to write label_seq_id and increments from 1,
242 |                 # remaining blank for hetero residues
243 |                 residue_number = 1
244 |                 prev_residue_type = ""
245 |                 prev_resname = ""
246 |                 for residue in chain.get_unpacked_list():
247 |                     if not select.accept_residue(residue):
248 |                         continue
249 |                     hetfield, resseq, icode = residue.get_id()
250 |                     if hetfield == " ":
251 |                         residue_type = "ATOM"
252 |                         label_seq_id = str(residue_number)
253 |                         residue_number += 1
254 |                     else:
255 |                         residue_type = "HETATM"
256 |                         label_seq_id = "."
257 |                     resseq = str(resseq)
258 |                     if icode == " ":
259 |                         icode = "?"
260 |                     resname = residue.get_resname()
261 |                     # Check if the molecule changes within the chain
262 |                     # This will always increment for the first residue in a
263 |                     # chain due to the starting values above
264 |                     if residue_type != prev_residue_type or (
265 |                         residue_type == "HETATM" and resname != prev_resname
266 |                     ):
267 |                         entity_id += 1
268 |                     prev_residue_type = residue_type
269 |                     prev_resname = resname
270 |                     label_asym_id = self._get_label_asym_id(entity_id)
271 |                     ##YZ
272 |                     if _KEEP_CHAIN:
273 |                         label_asym_id = chain.id
274 |                     ##YZ END
275 |                     for atom in residue.get_unpacked_list():
276 |                         if select.accept_atom(atom):
277 |                             atom_dict["_atom_site.group_PDB"].append(residue_type)
278 |                             if preserve_atom_numbering:
279 |                                 atom_number = atom.get_serial_number()
280 |                             atom_dict["_atom_site.id"].append(str(atom_number))
281 |                             if not preserve_atom_numbering:
282 |                                 atom_number += 1
283 |                             element = atom.element.strip()
284 |                             if element == "":
285 |                                 element = "?"
286 |                             atom_dict["_atom_site.type_symbol"].append(element)
287 |                             atom_dict["_atom_site.label_atom_id"].append(
288 |                                 atom.get_name().strip()
289 |                             )
290 |                             altloc = atom.get_altloc()
291 |                             if altloc == " ":
292 |                                 altloc = "."
293 |                             atom_dict["_atom_site.label_alt_id"].append(altloc)
294 |                             atom_dict["_atom_site.label_comp_id"].append(
295 |                                 resname.strip()
296 |                             )
297 |                             atom_dict["_atom_site.label_asym_id"].append(label_asym_id)
298 |                             # The entity ID should be the same for similar chains
299 |                             # However this is non-trivial to calculate so we write "?"
300 |                             atom_dict["_atom_site.label_entity_id"].append("?")
301 |                             atom_dict["_atom_site.label_seq_id"].append(label_seq_id)
302 |                             atom_dict["_atom_site.pdbx_PDB_ins_code"].append(icode)
303 |                             coord = atom.get_coord()
304 |                             atom_dict["_atom_site.Cartn_x"].append(f"{coord[0]:.3f}")
305 |                             atom_dict["_atom_site.Cartn_y"].append(f"{coord[1]:.3f}")
306 |                             atom_dict["_atom_site.Cartn_z"].append(f"{coord[2]:.3f}")
307 |                             atom_dict["_atom_site.occupancy"].append(
308 |                                 str(atom.get_occupancy())
309 |                             )
310 |                             atom_dict["_atom_site.B_iso_or_equiv"].append(
311 |                                 str(atom.get_bfactor())
312 |                             )
313 |                             atom_dict["_atom_site.auth_seq_id"].append(resseq)
314 |                             atom_dict["_atom_site.auth_asym_id"].append(chain_id)
315 |                             atom_dict["_atom_site.pdbx_PDB_model_num"].append(model_n)
316 | 
317 |         # Data block name is the structure ID with special characters removed
318 |         structure_id = self.structure.id
319 |         for c in ["#", "$", "'", '"', "[", "]", " ", "\t", "\n"]:
320 |             structure_id = structure_id.replace(c, "")
321 |         atom_dict["data_"] = structure_id
322 | 
323 |         # Set the dictionary and write out using the generic dictionary method
324 |         self.dic = atom_dict
325 |         self._save_dict(out_file)
326 | 
327 | 


--------------------------------------------------------------------------------
/src/afpdb/mypymol.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import pymol2
 3 | import pymol.util as pyutil
 4 | import os,ssl,sys
 5 | from .afpdb import util
 6 | ssl._create_default_https_context = ssl._create_unverified_context
 7 | 
 8 | class PyMOL:
 9 | 
10 |     def __init__(self):
11 |         self.p=pymol2.PyMOL()
12 |         self.p.start()
13 | 
14 |     def __del__(self):
15 |         self.close()
16 | 
17 |     def close(self):
18 |         if self.p is not None and hasattr(self.p, '_stop'):
19 |             self.p.stop()
20 |         self.p=None
21 | 
22 |     def __call__(self, cmd, *args, **kwargs):
23 |         # use API
24 |         try:
25 |             f=getattr(self.p.cmd, cmd)
26 |         except:
27 |             print(f"Invalid PyMOL command: {cmd}")
28 |         return f(*args, **kwargs)
29 | 
30 |     def cmd(self, cmd_str):
31 |         return self.run(cmd_str)
32 | 
33 |     def rs(self, rs_name, p):
34 |         """Return residue index array for a PyMOL selection object 'rs_name'
35 |             p is an afpdb.afpdb.Protein object
36 |         """
37 |         out=[]
38 |         def f(chain, resi): out.append(chain+resi)
39 |         self.p.cmd.iterate(rs_name, "f(chain, resi)", space={'f':f})
40 |         out=[p.res_map.get(x) for x in out if x in p.res_map]
41 |         return p.rs(out)
42 | 
43 |     def run(self, cmd_str):
44 |         """cmd_str can be one command or multi-line string
45 |         empty line or a line starts with # will be ignored"""
46 |         # use pymol command line string
47 |         def is_empty(s): s=='' or s.startswith('#')
48 |         out=None
49 |         if "\n" in cmd_str:
50 |             for _cmd in cmd_str.split("\n"):
51 |                 _cmd=_cmd.strip()
52 |                 if is_empty(_cmd): continue
53 |                 out=self.p.cmd.do(_cmd)
54 |             return out
55 |         cmd_str=cmd_str.strip()
56 |         if not is_empty(cmd_str):
57 |             out=self.p.cmd.do(cmd_str)
58 |         return out
59 | 
60 |     def script(self, fn):
61 |         S=util.read_list(fn)
62 |         out=None
63 |         for cmd_str in S:
64 |             cmd_str=cmd_str.strip()
65 |             if cmd_str=='' or cmd_str.stratswith('#'): continue
66 |             out=self.cmd(cmd_str)
67 |         return out
68 | 
if __name__=="__main__":
    # see example_pymol.py in the same folder for a better example
    x=PyMOL()
    # run a multi-line PyMOL command string (one command per line)
    out=x.cmd("""
fetch 8a47, myobj
save x.pdb
color red
select a, /myobj//A
color white, a
save x.png
""")
81 | 


--------------------------------------------------------------------------------
/src/afpdb/thread_seq.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # code taken from
  3 | # https://github.com/nrbennet/dl_binder_design/blob/main/mpnn_fr/dl_interface_design.py#L185
  4 | import sys
  5 | import os, shutil
  6 | import re
  7 | import tempfile
  8 | from Bio.PDB import PDBParser
  9 | from .afpdb import Protein,util
 10 | import json
 11 | import numpy as np
 12 | import traceback
 13 | 
# One-letter -> three-letter residue names for the 20 standard amino acids.
restype_1to3 = {
    'A': 'ALA',
    'R': 'ARG',
    'N': 'ASN',
    'D': 'ASP',
    'C': 'CYS',
    'Q': 'GLN',
    'E': 'GLU',
    'G': 'GLY',
    'H': 'HIS',
    'I': 'ILE',
    'L': 'LEU',
    'K': 'LYS',
    'M': 'MET',
    'F': 'PHE',
    'P': 'PRO',
    'S': 'SER',
    'T': 'THR',
    'W': 'TRP',
    'Y': 'TYR',
    'V': 'VAL',
}
 36 | 
 37 | def remove_files(S_file):
 38 |     for x in S_file:
 39 |         if os.path.exists(x): os.remove(x)
 40 | 
 41 | class ThreadSeq:
 42 | 
 43 |     USE_AMBER=True # pyrosetta relax does not seem to work, use amber instead
 44 | 
 45 |     def __init__(self, pdb_file):
 46 |         self.pdb=pdb_file
 47 |         self.p=Protein(pdb_file)
 48 |         self.old_seq=self.p.seq_dict()
 49 |         self.chains=self.p.chain_list()
 50 |         # identify residues with missing atoms
 51 |         # these residues need to be mutated regardless
 52 |         self.miss_res=self.p.rs_missing_atoms()
 53 | 
 54 |     def run(self, out_file, seq, replace_X_with='A', relax=0, seq2bfactor=False, amber_gpu=False, cores=1, side_chain_pdb=None, rl_from=None, rl_to=None):
 55 |         """replace_X_with, if there is unrecognized residue in seq, use A or G,
 56 |             A is preferred. If the original PDB has CB, G might fail (have not verified)
 57 |             otherwise, give an error
 58 | 
 59 |             sometimes not all residues can be successfully thethers, so we return the final sequence
 60 |             Example: in AbDb, 2J6E_1.pdb contains P and G at the end of chain A & B, however, they miss Ca,
 61 |             so these two terminal aa will be dropped in the output .pdb by PyMOL
 62 | 
 63 |             seq can be a str for monomer, a json str or a dict for multimer
 64 |             {"A": "VAPLHLGKCNIAG", "L":"IVGGTASVRGEWPWQVTLHTT"}
 65 |         """
 66 | 
 67 |         # parse seq object
 68 |         if type(seq) is str:
 69 |             if seq=='': # empty sequence means we want to fix residues with missing atoms
 70 |                 seq=self.old_seq
 71 |             elif '{' in seq:
 72 |                 seq=json.loads(seq)
 73 |             else: #multi-chain format
 74 |                 S=re.split(r'\W', seq)
 75 |                 if len(self.chains)!=len(S):
 76 |                     raise Exception(f"""When using a colon-delimited string, there are {len(S)} chains in your sequence, but {len(self.chains)} chains in PDB!""")
 77 |                 seq={k:v for k,v in zip(self.chains, S)}
 78 |         if len(seq)==0: seq=self.old_seq
 79 | 
 80 |         # check seq is sensible
 81 |         for k,v in seq.items():
 82 |             if k not in self.old_seq:
 83 |                 print(f"ERROR ThreadSeq> Bad chain id: {k}")
 84 |                 return {"ok":False}
 85 |             self.old_seq[k]=self.old_seq[k].replace('X', '')
 86 |             if len(v)!=len(self.old_seq[k]):
 87 |                 print(f"Sequence length mistach for chain {k}: PDB has {len(self.old_seq[k])} residues, -s contains {len(v)}!")
 88 |                 print(f"PDB:  {self.old_seq[k]}")
 89 |                 print(f"-s:   {v}")
 90 |                 return {"ok":False}
 91 |         #print(self.miss_res)
 92 | 
 93 |         if side_chain_pdb is not None: # we need to copy side chains first
 94 |             tmp=tempfile.NamedTemporaryFile(delete=False, prefix="_THREAD", suffix=".pdb")
 95 |             shutil.copyfile(self.pdb, tmp.name)
 96 |             self.p=Protein.copy_side_chain(tmp.name, side_chain_pdb, rl_from=rl_from, rl_to=rl_to)
 97 |             if self.p is None:
 98 |                 print(f"ERROR ThreadSeq> Fail to copy side chains: {out_file}, not found!")
 99 |                 return {"ok":False}
100 |             self.p.save(tmp.name)
101 |             self.pdb=tmp.name
102 |             print("Generated new input PDB: ", tmp.name)
103 |             # these should not change
104 |             self.old_seq=self.p.seq_dict()
105 |             self.chains=self.p.chain_list()
106 |             #
107 |             self.miss_res=self.p.rs_missing_atoms()
108 |             #self.pdb=tmp.name
109 | 
110 |         # threading
111 |         try:
112 |             mutations=self.mutate_seq_pymol(out_file, seq, replace_X_with=replace_X_with)
113 |         except Exception as e:
114 |             print(traceback.format_exc())
115 |             print("ERROR ThreadSeq> PyMOL failed to mutate the protein. No output is generated.")
116 |             return {"ok":False}
117 | 
118 |         if not os.path.exists(out_file):
119 |             print(f"ERROR ThreadSeq> expecting mutated output file: {out_file}, not found!")
120 |             return {"ok":False}
121 | 
122 |         # relaxation
123 |         relax_flag=False
124 |         if relax>0 and len(mutations)>0: # if there is no mutation, we don't need relax
125 |             if ThreadSeq.USE_AMBER:
126 |                 relax_flag=self.relax_amber(out_file, amber_gpu, cores)
127 | 
128 |         # extra steps
129 |         if seq2bfactor:
130 |             try:
131 |                 if side_chain_pdb is not None and rl_from is not None and rl_to is not None:
132 |                     self.add_b_factor_rl(out_file, rl_to)
133 |                 else:
134 |                     self.add_b_factor(out_file, rl_to, seq)
135 |             except Exception as e:
136 |                 print(traceback.format_exc())
137 |                 print("Skip b factor!!!!!")
138 |         else:
139 |             self.remove_hydrogen(out_file)
140 | 
141 |         # return JSON
142 |         data={}
143 |         p=Protein(out_file)
144 |         out_seq=p.seq_dict()
145 |         same=True
146 |         for k,v in seq.items():
147 |             if v.upper()!=out_seq.get(k, ""): same=False
148 |         data["output_pdb"]=out_file
149 |         data["ok"]=True
150 |         data["output_equal_target"] = same
151 |         data["input"]=self.old_seq
152 |         data["output"]=out_seq
153 |         data["target"]=seq
154 |         data["relax"]=relax_flag
155 |         data["residues_with_missing_atom"]=str(self.miss_res)
156 |         data["mutations"]=mutations
157 |         print("###JSON STARTS")
158 |         print(json.dumps(data))
159 |         print("###JSON END")
160 |         return data
161 | 
162 |     def relax_amber(self, out_file, amber_gpu=False, cores=1):
163 |         if not os.path.exists("/da/NBC/ds/lib/protein/amberrelax.py"): return False
164 |         cuda_env='' if amber_gpu else 'CUDA_VISIBLE_DEVICES= ' # looks like we need to unset CUDA env to prevent it from using GPU
165 |         out_file2=out_file.replace('.pdb', '.relaxed.pdb')
166 |         # we use different names to make sure amber relax indeed runs
167 |         cmd=f"source /da/NBC/ds/bin/envs/colabfold.env && {cuda_env}OPENMM_CPU_THREADS={cores} OMP_NUM_THREADS={cores} MKL_NUM_THREADS={cores} NUMEXPR_MAX_THREADS={cores} python /da/NBC/ds/lib/protein/amberrelax.py {'--gpu' if amber_gpu else ''} {out_file} {out_file2}"
168 |         print(cmd)
169 |         util.unix(cmd)
170 |         if os.path.exists(out_file2):
171 |             print("INFO> Amber relax is successful!")
172 |             relax_flag=True
173 |             os.replace(out_file2, out_file)
174 |             return True
175 |         return False
176 | 
177 |     def mutate_seq_pymol(self, out_file, seq, replace_X_with='A'):
178 |         #from pymol import cmd
179 |         # make it thread safe, see https://pymolwiki.org/index.php/Launching_From_a_Script
180 |         try:
181 |             import pymol2
182 |         except:
183 |             print("Please install PyMOL with:\nconda install conda-forge::pymol-open-source")
184 |             exit()
185 | 
186 |         def mutate(session, molecule, chain, resi, target="CYS", mutframe="1"):
187 |             target = target.upper()
188 |             session.cmd.wizard("mutagenesis")
189 |             session.cmd.do("refresh_wizard")
190 |             session.cmd.get_wizard().set_mode("%s" % target)
191 |             selection = "/%s//%s/%s" % (molecule, chain, resi)
192 |             session.cmd.get_wizard().do_select(selection)
193 |             session.cmd.frame(str(mutframe))
194 |             session.cmd.get_wizard().apply()
195 |             # cmd.set_wizard("done")
196 |             session.cmd.set_wizard()
197 | 
198 |         with pymol2.PyMOL() as session:
199 |             session.cmd.load(self.pdb, "myobj")
200 |             parser = PDBParser(QUIET=True)
201 |             structure = parser.get_structure("myobj", self.pdb)
202 |             res_dict = {}
203 |             mutations=[]
204 |             missing=self.miss_res
205 |             for model in structure:
206 |                 for chain in model:
207 |                     c=chain.id
208 |                     if c not in seq: continue # this chain is skipped
209 |                     cnt=0
210 |                     #missing=self.miss_res.get(c, [])
211 |                     for residue in chain:
212 |                         resi=residue.id[1]
213 |                         mut_to=seq[c][cnt].upper()
214 |                         cnt+=1
215 |                         if mut_to not in restype_1to3:
216 |                             if replace_X_with in restype_1to3:
217 |                                 mut_to=replace_X_with
218 |                             else:
219 |                                 raise Exception(f"ERROR PyMOL> Unrecognized residue {mut_to} at position {resi+1}!")
220 |                         name3 = restype_1to3[ mut_to ]
221 |                         # skip if no mutation, use existing sidechain
222 |                         # but force mutation if there were missing atoms in the original PDB
223 |                         if name3==residue.resname and (resi-1) not in missing: continue
224 |                         print("MUTATE PyMOL> Old ", chain.id, resi, residue.resname, ">>> New", name3)
225 |                         mutate(session, "myobj", chain.id, resi=resi, target=name3, mutframe="1")
226 |                         mutations.append((chain.id, resi, residue.resname, name3))
227 |             # pymol reorder the chain by alphabetic order
228 |             session.cmd.save(out_file)
229 |         return mutations
230 | 
231 |     def remove_hydrogen(self, out_file):
232 |         # Remove hydrogens
233 |         pdb_str=util.read_list(out_file)
234 |         out=[]
235 |         for i,s in enumerate(pdb_str):
236 |             if s.startswith("ATOM ") and s[76:78]==" H":
237 |                 continue
238 |             else:
239 |                 out.append(s)
240 | 
241 |         data_str="\n".join(out)
242 |         p = Protein(pdb_str=data_str)
243 |         p.save(out_file)
244 | 
245 |     def add_b_factor(self, out_file, seq):
246 |         print(out_file)
247 |         p=Protein(out_file)
248 |         x=p.seq_dict()
249 |         C_b={}
250 |         print(seq)
251 |         for k,v in seq.items():
252 |             C_b[k]=np.zeros(len(v))
253 |             for i,res in enumerate(v):
254 |                 C_b[k][i]=0.5 if res==res.lower() else 1.0
255 |         p.b_factors_by_chain(C_b)
256 |         p.save(out_file)
257 | 
258 |     def add_b_factor_rl(self, out_file, rl_to):
259 |         print(out_file)
260 |         p=Protein(out_file)
261 |         p.b_factors(0.5)
262 |         p.b_factors(1, rs=rl_to)
263 |         p.save(out_file)
264 | 
if __name__=="__main__":
    # Command-line entry point: print usage hints, parse options, run ThreadSeq once.
    print("INFO> For multi-chain structures, provide one sequence in the order of how they appear in PDB, chain sequences can be optionally separated by space or colon for the sake of clarity.\n")
    print("INFO> if sequence is not specified, no mutation is done")
    print("Input PDB only needs to contain N, CA, C")
    print('EXAMPLE> ./thread_seq.py -i myexample/mybb.pdb -o my_fill.pdb -s "LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLKKKKKKKKKKTTTT:SS"')
    print('EXAMPLE> ./thread_seq.py -i myexample/1crn.pdb1 -o my_fill.pdb -s "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"')
    import argparse as arg
    import sys
    opt=arg.ArgumentParser(description='Thread a sequence onto a PDB template')
    opt.add_argument('-i', '--input', type=str, default=None, help='input PDB file', required=True)
    opt.add_argument('-o', '--output', type=str, default=None, help='output PDB file', required=True)
    opt.add_argument('-s','--sequence', type=str, default="", help='sequence, empty sequence means we will fix residues with missing atoms')
    opt.add_argument('-r','--relax', type=int, default=100, help='relax cycles')
    opt.add_argument('-x','--replace_X', type=str, default='', help='replace unknown residues in sequence by specified one-letter code (e.g., A or G for Alanine/Glycine)')
    opt.add_argument('-b','--seq2bfactor', action='store_true', default=False, help='Convert upper/lower case in sequence into bfactor 1.0 and 0.5, useful to highlight fixed/hallucinated residues from RFDiffusion output')
    opt.add_argument('-g','--gpu', action='store_true', default=False, help='Use GPU in amber')
    opt.add_argument('-c','--cores', type=int, default=1, help='Number of CPU cores used for amber')
    opt.add_argument('--scpdb', type=str, default=None, help='PDB file provides side chain coordinate')
    # scmap is replaced by contig_scpdb and contig_input, two contigs for residue lists afpdb.RL
    #opt.add_argument('--scmap', type=str, default=None, help='JSON maps chain names, scpdb into input pdb')
    opt.add_argument('--contig_scpdb', type=str, default=None, help='Contig string specifying the fixed residues in scpdb')
    opt.add_argument('--contig_input', type=str, default=None, help='Contig string specifying the fixed residues in input')

    #thread_seq("1cmp.pdb1", "my_fill.pdb", "L"*44+":"+"TK")
    args = opt.parse_args()

    if args.gpu:
        util.warn_msg('GPU for amber does not work most of the time!!!')
    ts=ThreadSeq(args.input)
    result=ts.run(args.output, args.sequence, relax=args.relax, replace_X_with=args.replace_X, seq2bfactor=args.seq2bfactor, amber_gpu=args.gpu, cores=args.cores, side_chain_pdb=args.scpdb, rl_from=args.contig_scpdb, rl_to=args.contig_input)
    # run() signals failure with {"ok": False}; surface that to the shell as a non-zero exit status
    if isinstance(result, dict) and result.get("ok") is False:
        sys.exit(1)
294 |     # thread_seq('/da/NBC/ds/zhoubi1/ides/data/init_guess/a.pdb',
295 |     #            '/da/NBC/ds/zhoubi1/ides/data/init_guess/6a4k.pdb',
296 |     #            'VAPLHLGKCNIAGWILGNPECESLSTASSWSYIVETPSSDNGTCYPGDFIDYEELREQLSSVSSFERFEIFPKTSSWPNHDSNKGVTAACPHAGAKSFYKNLIWLVKKGNSYPKLSKSYINDKGKEVLVLWGIHHPSTSADQQSLYQNADAYVFVGSSRYSKKFKPEIAIRPKVRDQEGRMNYYWTLVEPGDKITFEATGNLVVPRYAFAMERNAGSGIIISD:QVQLQESGPGLVKPSETLSLTCTVSGGSVNTGSYYWSWIRQPPGKGLEWIAYSSVSGTSNYNPSLKSRVTLTVDTSKNQFSLSVRSVTAADTAVYFCARLNYDILTGYYFFDFWGQGTLVIVSSASTKGPSVFPLAPSSKSASGGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSSGTQTYICNVNHKPSNTKVDKRVEPKSCDKT:QVELTQSPSASASLGTSVKLTCTLSSGHSTYAIAWHQQRPGKGPRYLMNLSSGGRHTRGDGIPDRFSGSSSGADRYLIISSLQSEDEADYYCQTWDAGMVFGGGTKLTVLGQSKAAPSVTLFPPSSEELQANKATLVCLISDFYPGAVTVAWKADSSPVKAGVETTTPSKQSNNKYAASSYLSLTPEQWKSHRSYSCQVTHEGSTVEKTVAPTECS',
297 |     #            10)
298 | 
299 | 
300 | 


--------------------------------------------------------------------------------
/tests/fake.pdb:
--------------------------------------------------------------------------------
 1 | MODEL     1
 2 | ATOM      1  N   GLU L   5       5.195 -14.817 -19.187  1.00  1.00           N  
 3 | ATOM      2  CA  GLU L   5       6.302 -14.276 -18.361  1.00  1.00           C  
 4 | ATOM      3  C   GLU L   5       7.148 -15.388 -17.731  1.00  1.00           C  
 5 | ATOM      4  CB  GLU L   5       5.794 -13.368 -17.248  1.00  1.00           C  
 6 | ATOM      5  O   GLU L   5       6.658 -16.231 -17.006  1.00  1.00           O  
 7 | ATOM      6  CG  GLU L   5       6.934 -12.664 -16.494  1.00  1.00           C  
 8 | ATOM      7  CD  GLU L   5       6.461 -11.817 -15.327  1.00  1.00           C  
 9 | ATOM      8  OE1 GLU L   5       7.282 -11.138 -14.677  1.00  1.00           O  
10 | ATOM      9  OE2 GLU L   5       5.243 -11.804 -15.070  1.00  1.00           O  
11 | ATOM     10  N   ILE L   6A      8.444 -15.321 -17.934  1.00  1.00           N  
12 | ATOM     11  CA  ILE L   6A      9.340 -16.291 -17.336  1.00  1.00           C  
13 | ATOM     12  C   ILE L   6A      9.657 -15.849 -15.925  1.00  1.00           C  
14 | ATOM     13  CB  ILE L   6A     10.604 -16.433 -18.162  1.00  1.00           C  
15 | ATOM     14  O   ILE L   6A     10.192 -14.739 -15.685  1.00  1.00           O  
16 | ATOM     15  CG1 ILE L   6A     10.228 -16.847 -19.590  1.00  1.00           C  
17 | ATOM     16  CG2 ILE L   6A     11.540 -17.469 -17.523  1.00  1.00           C  
18 | ATOM     17  CD1 ILE L   6A     11.401 -17.319 -20.426  1.00  1.00           C  
19 | ATOM     18  N   VAL L   6B      9.339 -16.725 -14.982  1.00  1.00           N  
20 | ATOM     19  CA  VAL L   6B      9.678 -16.518 -13.599  1.00  1.00           C  
21 | ATOM     20  C   VAL L   6B     11.024 -17.188 -13.330  1.00  1.00           C  
22 | ATOM     21  CB  VAL L   6B      8.569 -17.028 -12.666  1.00  1.00           C  
23 | ATOM     22  O   VAL L   6B     11.242 -18.372 -13.679  1.00  1.00           O  
24 | ATOM     23  CG1 VAL L   6B      8.960 -16.919 -11.194  1.00  1.00           C  
25 | ATOM     24  CG2 VAL L   6B      7.268 -16.234 -12.927  1.00  1.00           C  
26 | ATOM     40  N   GLN L  10      15.587 -17.776  -7.649  1.00  1.00           N  
27 | ATOM     41  CA  GLN L  10      16.895 -17.892  -7.030  1.00  1.00           C  
28 | ATOM     42  C   GLN L  10      16.721 -18.330  -5.569  1.00  1.00           C  
29 | ATOM     43  CB  GLN L  10      17.616 -16.572  -7.093  1.00  1.00           C  
30 | ATOM     44  O   GLN L  10      16.270 -17.557  -4.746  1.00  1.00           O  
31 | ATOM     45  CG  GLN L  10      17.963 -16.094  -8.483  1.00  1.00           C  
32 | ATOM     46  CD  GLN L  10      18.781 -14.822  -8.460  1.00  1.00           C  
33 | ATOM     47  NE2 GLN L  10      20.052 -14.951  -8.083  1.00  1.00           N  
34 | ATOM     48  OE1 GLN L  10      18.284 -13.727  -8.786  1.00  1.00           O  
35 | ATOM    862  N   LEU H   3      27.368   6.440 -19.107  1.00  1.00           N  
36 | ATOM    863  CA  LEU H   3      25.970   6.871 -19.038  1.00  1.00           C  
37 | ATOM    864  C   LEU H   3      25.761   7.794 -17.840  1.00  1.00           C  
38 | ATOM    865  CB  LEU H   3      25.089   5.647 -18.873  1.00  1.00           C  
39 | ATOM    866  O   LEU H   3      25.979   7.398 -16.661  1.00  1.00           O  
40 | ATOM    867  CG  LEU H   3      25.225   4.606 -19.964  1.00  1.00           C  
41 | ATOM    868  CD1 LEU H   3      24.282   3.430 -19.748  1.00  1.00           C  
42 | ATOM    869  CD2 LEU H   3      24.962   5.190 -21.355  1.00  1.00           C  
43 | ATOM    870  N   VAL H   4      25.291   9.008 -18.090  1.00  1.00           N  
44 | ATOM    871  CA  VAL H   4      25.112   9.983 -16.986  1.00  1.00           C  
45 | ATOM    872  C   VAL H   4      23.672  10.399 -16.963  1.00  1.00           C  
46 | ATOM    873  CB  VAL H   4      25.976  11.257 -17.222  1.00  1.00           C  
47 | ATOM    874  O   VAL H   4      23.143  10.875 -17.973  1.00  1.00           O  
48 | ATOM    875  CG1 VAL H   4      25.686  12.319 -16.177  1.00  1.00           C  
49 | ATOM    876  CG2 VAL H   4      27.466  10.902 -17.258  1.00  1.00           C  
50 | TER     877      VAL H   4
51 | ENDMDL
52 | END   
53 | 


--------------------------------------------------------------------------------
/tests/test_all.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | import afpdb.util as util
  3 | from afpdb.afpdb import Protein,RS,ATS
  4 | import os
  5 | import numpy as np
  6 | import afpdb.myalphafold.common.residue_constants as afres
  7 | 
  8 | fn = os.path.join(os.path.dirname(__file__), "5cil.pdb")
  9 | fk = os.path.join(os.path.dirname(__file__), "fake.pdb")
 10 | f3 = os.path.join(os.path.dirname(__file__), "1a3d.pdb")
 11 | f4 = os.path.join(os.path.dirname(__file__), "5cil_100.pdb")
 12 | 
 13 | def check_p(p):
 14 |     return ((set(p.chain_id()) == {"L","H","P"}) and p.seq()=='EIVLTQSPGTQSLSPGERATLSCRASQSVGNNKLAWYQQRPGQAPRLLIYGASSRPSGVADRFSGSGSGTDFTLTISRLEPEDFAVYYCQQYGQSLSTFGQGTKVEVKRTV:VQLVQSGAEVKRPGSSVTVSCKASGGSFSTYALSWVRQAPGRGLEWMGGVIPLLTITNYAPRFQGRITITADRSTSTAYLELNSLRPEDTAVYYCAREGTTGDGDLGKPIGAFAHWGQGTLVTVSS:NWFDITNWLWYIK')
 15 | 
 16 | ### Demo
 17 | def test_demo():
 18 |     p=Protein(fn)
 19 |     c=p.seq_dict()
 20 |     assert c['P']=='NWFDITNWLWYIK', "Load object and seq_dict()"
 21 | 
 22 |     rs_binders, rs_seed, df_dist=p.rs_around("P", dist=4, drop_duplicates=True)
 23 |     assert str(rs_binders)=="L33,92-95,97:H30,32,46,49-51,54,56-58,98,106-109", "rs_around, rs2str"
 24 | 
 25 |     assert(len(df_dist)==21), "df_dist"
 26 |     p=p.extract(rs_binders | "P")
 27 |     p.save("test.pdb")
 28 |     assert(len(p)==34)
 29 | 
 30 | ### Data structure
 31 | def test_data():
 32 |     p=Protein(fk)
 33 |     c=p.len_dict()
 34 |     assert(c["H"]==2 and c["L"]==4)
 35 | 
 36 |     g=p.data_prt
 37 |     assert(np.array_equal(g.chain_index, np.array([0,0,0,0,1,1]))), "chain_index"
 38 |     assert(np.array_equal(g.chain_id, np.array(["L","H"]))), "chain_id"
 39 |     assert(np.array_equal(g.aatype, np.array([6,9,19,5,10,19]))), "aatype"
 40 |     assert(np.array_equal(np.array(g.atom_positions.shape), np.array([6,37,3]))), "atom_positoins"
 41 |     assert(np.all(g.atom_mask[:, afres.atom_order['CA']]==1)), "atom_mask, CA"
 42 |     assert(np.all(g.atom_mask[np.array([1,2,5]), afres.atom_order['CG']]==0)), "atom_mask CG"
 43 |     assert(np.all(np.abs(g.atom_positions[:, afres.atom_order['CA']])>1)), "atom_positions, CA"
 44 |     assert(np.all(np.abs(g.atom_positions[np.array([1,2,5]), afres.atom_order['CG']])<1)), "atom_positions, CG"
 45 | 
 46 | ### Contig
 47 | def test_contig():
 48 |     p=Protein(fn)
 49 |     assert(len(p.rs("L11"))==1)
 50 |     assert(len(p.rs("H-5:H10-15:L-10"))==21)
 51 |     assert(np.array_equal(p.rs("H-5:H10-15:L-10").data, p.rs("H-5,10-15:L-10").data))
 52 | 
 53 | ### inplace & clone
 54 | def test_inplace():
 55 |     p=Protein(fn)
 56 |     q=p.extract("H:L")
 57 |     assert (p!=q), "inplace 1"
 58 | 
 59 |     q=p.extract("H:L", inplace=True)
 60 |     assert p==q, "inplace 2"
 61 |     assert not check_p(p)
 62 | 
 63 |     p=Protein(fn)
 64 |     q=p.clone()
 65 |     q.extract("H:L", inplace=True)
 66 |     assert check_p(p), "clone"
 67 | 
 68 | ### Read/Write
 69 | 
 70 | def test_local():
 71 |     p=Protein(fn)
 72 |     assert check_p(p)
 73 | 
 74 | def test_save():
 75 |     p=Protein(fn)
 76 |     p.save("test.pdb")
 77 |     p=Protein("test.pdb")
 78 |     assert check_p(p), "save to pdb file"
 79 |     p.save("test.cif")
 80 |     p=Protein("test.cif")
 81 |     assert check_p(p), "save to and read from cif file"
 82 |     if os.path.exists("test.pdb"): os.remove("test.pdb")
 83 |     if os.path.exists("test.cif"): os.remove("test.cif")
 84 | 
 85 | def test_pdb():
 86 |     p=Protein("1crn")
 87 |     assert len(p)==46, f"create from PDB"
 88 | 
 89 | def test_embl():
 90 |     p=Protein("Q2M403")
 91 |     print(len(p))
 92 |     assert len(p)==458, f"create from EMBL"
 93 | 
 94 | def test_alphafold():
 95 |     p=Protein(fn)
 96 |     p=p.data_prt
 97 |     p=Protein(p)
 98 |     assert check_p(p), "create from DeepMind protein object"
 99 | 
def test_biopython():
    """A Protein rebuilt from to_biopython() output keeps chains and sequence."""
    bio_obj=Protein(fn).to_biopython()
    rebuilt=Protein(bio_obj)
    assert check_p(rebuilt), "create from BioPython object"
104 | 
def test_pdb_str():
    """A Protein rebuilt from to_pdb_str() output keeps chains and sequence."""
    pdb_text=Protein(fn).to_pdb_str()
    rebuilt=Protein(pdb_text)
    assert check_p(rebuilt), "create from PDB string"
109 | 
def test_missing():
    """rs_insertion() and rs_next2missing() on structures with residues removed."""
    full=Protein(fn)
    without_h10_14=full.extract(~full.rs("H10-14"))
    inserted=full.rs_insertion(without_h10_14)
    assert str(inserted)=="H10-14"
    gapped=full.extract("H1-10,15-:L-20,25-")
    flanking=gapped.rs_next2missing()
    assert str(flanking) == "H10-15:L20-25"
117 | 
118 | ### Sequence, missing residues, insertion code
def test_sequence():
    """Sequences, missing residues and insertion codes.

    This function used to be a second `test_inplace`, which replaced the earlier
    definition of that name at import time so the earlier test never ran; it now has
    its own name. The first check also lacked its `assert`.
    """
    p=Protein(fn)
    assert check_p(p), "Sequence 1"

    c=p.seq_dict()
    assert c["P"]=='NWFDITNWLWYIK', "Sequence 2"
    assert c["H"]=='VQLVQSGAEVKRPGSSVTVSCKASGGSFSTYALSWVRQAPGRGLEWMGGVIPLLTITNYAPRFQGRITITADRSTSTAYLELNSLRPEDTAVYYCAREGTTGDGDLGKPIGAFAHWGQGTLVTVSS', "Sequence 3"
    assert c["L"]=='EIVLTQSPGTQSLSPGERATLSCRASQSVGNNKLAWYQQRPGQAPRLLIYGASSRPSGVADRFSGSGSGTDFTLTISRLEPEDFAVYYCQQYGQSLSTFGQGTKVEVKRTV', "Sequence 4"

    p=Protein(fk)

    assert np.array_equal(p.data_prt.residue_index, np.array(['5','6A','6B','10','3','4'])), "Insertion code"

    # fake.pdb jumps from residue 6B to 10 in chain L; the gap is rendered as X by default
    assert p.seq()=="EIVXXXQ:LV", "missing X"

    assert p.seq(gap="")=="EIVQ:LV", "gap for missing"

    p=Protein(fn)
    assert len(p)==250, "len"
    assert p.len_dict()["H"]==126, "len_dict"
    out=p.rs_seq('LTI')
    for x in out:
        print(str(x), x.seq())
    assert str(RS._or(* p.rs_seq('LTI')))=='L74-76:H54-56', "seq_search"
143 | 
144 | ### Mutagenesis
def test_thread():
    """Placeholder: thread_seq needs PyMOL, which is currently not a forced dependency."""
    return None
148 | 
149 | ### Chain
def test_chain():
    """Chain order, chain positions, renumbering modes, and merge/split of chains."""
    prot=Protein(fn)
    assert "".join(prot.chain_id())=="LHP", "chain 1"

    prot.reorder_chains(["P","L","H"], inplace=True)
    assert not check_p(prot), "chain 2"
    assert "".join(prot.chain_id())=="PLH", "chain 3"

    prot=Protein(fn)
    pos=prot.chain_pos()
    for chain, (first, last) in (("L", (0, 110)), ("H", (111, 236)), ("P", (237, 249))):
        assert pos[chain][0]==first and pos[chain][1]==last, "chain_pos 1"

    prot=Protein(fk)
    raw=prot.data_prt
    # each mode is applied in place on top of the previous one, so the order matters
    renumber_cases=(
        (None, "5,6A,6B,10,3,4", "_renumber None"),
        ("RESTART", "1,2A,2B,6,1,2", "_renumber RESTART"),
        ("CONTINUE", "1,2A,2B,6,7,8", "_renumber CONTINUE"),
        ("GAP33", "1,2A,2B,6,40,41", "_renumber GAP33"),
        ("NOCODE", "1,2,3,7,8,9", "_renumber NOCODE"),
    )
    for mode, expected, label in renumber_cases:
        prot.renumber(mode, inplace=True)
        assert ",".join(raw.residue_index)==expected, label

    prot=Protein(fn)
    merged, c_pos=prot.merge_chains(gap=200, inplace=False)
    assert c_pos["H"][0]==111 and c_pos["H"][1]==236, "merge_chain 1"
    assert len(merged.chain_id())==1, "merge chain 2"
    restored=merged.split_chains(c_pos, inplace=False)
    c_pos=restored.chain_pos()
    assert len(restored.chain_id())==3 and c_pos["H"][0]==111 and c_pos["H"][1]==236, "merge_chain 3"
184 | 
185 | ### Selection
def test_ats():
    """Atom selections: string and list forms, the empty/ALL/NULL keywords, and negation."""
    prot=Protein()
    backbone_idx=np.array([0,1,2,4])
    assert np.array_equal(ATS("N,CA,C,O").data, backbone_idx), "ats 1"
    assert np.array_equal(ATS(["N","CA","C","O"]).data, backbone_idx), "ats 2"
    assert len(prot.ats(""))==0, "ats 3"
    assert len(prot.ats(None))==37, "ats 4"
    assert len(prot.ats("ALL"))==37, "ats 5"
    assert len(prot.ats("NULL"))==0, "ats 6"
    non_backbone="CB,CG,CG1,CG2,OG,OG1,SG,CD,CD1,CD2,ND1,ND2,OD1,OD2,SD,CE,CE1,CE2,CE3,NE,NE1,NE2,OE1,OE2,CH2,NH1,NH2,OH,CZ,CZ2,CZ3,NZ,OXT"
    negated=prot.ats_not(["N","CA","C","O"]).data
    assert np.array_equal(negated, prot.ats(non_backbone).data), "ats 6"
195 | 
def test_rs():
    """Residue-selection set operations through the Protein methods, plus rs+ats extraction."""
    prot=Protein(fn)
    first=prot.rs("H1-3")
    second=prot.rs("H3-5:L-3")
    assert len(prot.rs_and(first, second))==1, "rs_and"
    union=prot.rs_or(first, second)
    assert len(union)==8, "rs_or"
    assert len(prot.rs_not("H:L"))==13, "rs_not"
    assert len(prot.rs_not("L-100", rs_full="L"))==11, "rs_notin"
    assert prot.rs2str(prot.rs_or(first, second))=="L1-3:H1-5", "rs2str"

    prot=Protein(fn)
    prot=prot.extract(rs="H-5:L-5", ats="N,CA,C,O")
    assert len(prot)==10, "rs and ats"
    # no side chain
    cb_mask=prot.data_prt.atom_mask[:, prot.ats("CB").data]
    assert np.sum(cb_mask)==0, "backbone only"
211 | 
def test_rs2():
    """Residue-selection set operations through the RS operators and static helpers."""
    prot=Protein(fn)
    first=RS(prot, "H1-3")
    second=RS(prot, "H3-5:L-3")
    assert len(first & second)==1, "rs_and"
    assert len(first | second)==8, "rs_or"
    assert len(~RS(prot, "H:L"))==13, "rs_not"
    assert len(RS(prot, "L-100")._not(rs_full="L"))==11, "rs_notin"
    assert str(first | second)=="L1-3:H1-5", "rs2str"
    assert len(RS._or(RS(prot,"H1-3"), "H10", "H11", "H12", "H13"))==7, "rs_or"
    assert len(RS._and(RS(prot,"H1-10"), "H3-20", "H5-22"))==6, "rs_and"
    assert len(first - second)==2, "rs_minus"
    # the augmented assignments below rebind `first`, so their order matters
    first |= second
    assert (len(first) == 8), "rs_ior"
    first &= second
    assert (len(first) == len(second)), "rs_iand"
def test_rs2str():
    """rs2str() on 5cil_100.pdb: default format and PYMOL format with insertion codes."""
    prot=Protein(f4)
    assert prot.rs2str("L5-10,100:H")=="H:L5-10,100", "rs2str 1"

    # (selection, expected PyMOL command, assertion label)
    pymol_cases=(
        ("L5-10,100:H", "select rs, (chain H) or (chain L and resi 5-10+100)", "rs2str 2"),
        ("H100A-102", "select rs, (chain H and resi 100A+100B+100C+100D+100E+100F+100G+100H+100I+100J+101-102)", "rs2str 3"),
        ("H100-102", "select rs, (chain H and resi 100-102)", "rs2str 4"),
        ("H100-100D", "select rs, (chain H and resi 100+100A+100B+100C+100D)", "rs2str 5"),
        ("H100D-100F", "select rs, (chain H and resi 100D+100E+100F)", "rs2str 6"),
        ("H98-100,100D,100F-102", "select rs, (chain H and resi 98-99+100+100D+100F+100G+100H+100I+100J+101-102)", "rs2str 7"),
    )
    for selection, expected, label in pymol_cases:
        assert prot.rs2str(selection, format="PYMOL")==expected, label
238 | 
def test_canonicalize_rs():
    """extract() is expected to reject an out-of-order selection but accept its rs2str() form.

    The "must raise" check uses try/except/else: an `assert False` placed inside the
    `try` body raises AssertionError, which `except Exception` would swallow, so the
    check could never fail.
    """
    p=Protein(fn)
    # an unusual selection: chain L residues are listed on both sides of chain P
    rs=p.rs("L1-5:P12-13:L6-10")
    try:
        p.extract(rs)
    except Exception:
        pass
    else:
        assert False, "unusal selection 1"
    try:
        p.extract(p.rs2str(rs))
    except Exception:
        assert False, "unusal selection 2"
253 | 
def test_rs_around():
    """rs_around() with dist=3.5 around chain P: neighbor selection and one table row per neighbor."""
    prot=Protein(fn)
    seed=prot.rs("P")
    neighbors, _seed_hits, table=prot.rs_around(seed, dist=3.5, drop_duplicates=True)
    assert prot.rs2str(neighbors)=="L33,92-95:H30,56,58,98,106-109", "rs_around 1"
    assert len(table)==len(neighbors), "rs_around 2"
260 | 
def test_residue_id():
    """Filtering the rs_around() table by the string column resn_b vs the integer column resn_i_b."""
    prot=Protein(fn)
    seed=prot.rs("P")
    rs_nbr, rs_seed, table=prot.rs_around(seed, dist=3.5, drop_duplicates=True)
    table.display()
    on_h=(table.chain_b=="H")
    # resn_b is compared against strings here, so the range test is lexicographic
    by_text=table[on_h&(table.resn_b>="95")&(table.resn_b<="106")]
    by_text.display()
    assert len(by_text)==0, "residue filter 1"
    by_number=table[(table.chain_b=="H")&(table.resn_i_b>=95)&(table.resn_i_b<=106)]
    by_number.display()
    assert len(by_number)==2, "residue filter 2"
272 | 
273 | ### Display
def test_html():
    """html() output contains the expected first ATOM record."""
    markup=Protein(fn).html()
    first_atom="ATOM      1  N   GLU A   1"
    assert first_atom in markup, "html"
278 | 
279 | ### B-factors
def test_b_factors():
    """b_factors() with an array, per-selection values, and b_factors_by_chain()."""
    prot=Protein(fn)
    n_res=len(prot)
    profile=np.sin(np.arange(n_res)/n_res*np.pi)
    prot.b_factors(profile)
    assert np.array_equal(prot.data_prt.b_factors[:,0], profile), "b_factors 1"

    def chain_l_values():
        # first-atom b-factor column of every chain L residue
        return prot.data_prt.b_factors[prot.rs("L").data, 0]

    prot.b_factors(np.zeros(len(prot.rs("H"))), rs="H")
    prot.b_factors(0.5, rs="L")
    prot.b_factors(1, rs="P")
    assert np.all(np.abs(chain_l_values()-0.5)<0.001), "b_factors 2"

    prot.b_factors_by_chain({"H":0.1,"L":0.2,"P":0.3})
    assert np.all(np.abs(chain_l_values()-0.2)<0.001), "b_factors 3"
294 | 
295 | ### Distance
def test_dist():
    """rs_dist() and atom_dist() on fake.pdb; only the first row of each table is checked."""
    prot=Protein(fk)
    closest=prot.rs_dist("L","H").iloc[0]
    assert np.abs(closest['dist']-21.2251)<0.001, "distance 1"
    assert closest['atom_a']=="OE1" and closest['atom_b']=="CD1", "distance 2"
    assert closest['resi_a']==3 and closest['resi_b']==4, "distance 2"

    all_atoms=prot.atom_dist("L6A", "H3")
    assert np.abs(all_atoms.iloc[0]["dist"]-23.3484)<0.001, "distance 3"

    backbone_only=prot.atom_dist("L6A", "H3", ats="N,CA,C,O")
    assert np.abs(backbone_only.iloc[0]["dist"]-26.9663)<0.001, "distance 4"
309 | 
310 | ### RMSD, translate, rotate
def test_rmsd():
    """A copy translated by (3, 4, 0) has its center shifted by that vector and an RMSD of 5.

    The same three checks used to be repeated verbatim on a freshly loaded structure;
    the duplicate is removed and each assertion has its own label.
    """
    p=Protein(fk)
    q=p.translate([3.0,4.0,0.0], inplace=False)
    assert np.all(np.isclose(q.center()-p.center(), np.array([3.,4.,0.]))), "rmsd 1"

    assert np.abs(q.rmsd(p)-5)<0.001, "rmsd 2"
    assert np.abs(q.rmsd(p, ats="CA")-5)<0.001, "rmsd 3"
325 | 
326 | ### SASA
def test_sasa():
    """Compare SASA of chain L rows 90-104 between p.sasa() and p.sasa("H:L").

    NOTE(review): presumably sasa("H:L") computes SASA with only chains H and L
    present, so the difference reflects burial by chain P — confirm against Protein.sasa.
    """
    p=Protein(fn)
    t=p.sasa()
    t_all=t[(t.chain=="L")][90:105]
    t=p.sasa("H:L")
    # copy the slice before adding a column, so the assignment does not target a
    # view of `t` (chained assignment)
    t_ag=t[(t.chain=="L")][90:105].copy()
    t_ag['DELTA_SASA']=t_ag['SASA'].values-t_all['SASA'].values
    t_ag.display()
    x=t_ag.DELTA_SASA.values
    assert np.all(x[1:4]>8), "SASA 1"
    assert np.all(x[7:]<10), "SASA 2"
339 | 
340 | ### DSSP
def test_dssp():
    """Placeholder: DSSP is currently not a forced dependency, so nothing is tested."""
    return None
344 | 
345 | ### Internal Coordinate
def test_ic():
    """internal_coord() on chain L of fake.pdb, with and without a huge MaxPeptideBond."""
    prot=Protein(fk)
    default_ic=prot.internal_coord(rs="L")
    last_bond=default_ic['-1C:N'].values[-1]
    assert np.isnan(last_bond), "ic 1"

    relaxed_ic=prot.internal_coord(rs="L", MaxPeptideBond=1e8)
    assert np.abs(relaxed_ic['-1C:N'].values[-1]-7.310)<0.001, "ic 2"

    assert np.abs(relaxed_ic['chi1'].values[0]+172.925)<0.001, "ic 3"
355 | 
356 | ### Move Object
def test_move():
    """center(), center_at(), translate(), rotate() and reset_pos() on fake.pdb."""
    prot=Protein(fk)
    expected_center=np.array([ 15.54949999,-8.0205001,-15.39166681])
    assert np.all(np.isclose(prot.center(), expected_center)), "center"
    moved=prot.center_at([3.0,4.0,5.0], inplace=False)
    print(moved.center())
    assert np.all(np.isclose(moved.center(), np.array([ 3.,4.,5.]))), "center_at"
    moved=prot.translate([1.,0.,-1.], inplace=False)
    assert np.abs(moved.rmsd(prot, None, None, "CA")-1.414)<0.001, "translate"
    moved=prot.rotate([1,1,1], 5, inplace=False)
    assert moved.rmsd(prot, ats="CA")>2.0, "rotate 1"
    # same rotation, but applied about the structure's own center
    original_center=prot.center()
    moved=prot.center_at([0,0,0], inplace=False)
    moved.rotate([1,1,1], 5, inplace=True)
    moved.center_at(original_center, inplace=True)
    assert moved.rmsd(prot, None, None, "CA") < 1, "rotate 2"

    random_axis=np.random.random(3)
    random_angle=np.random.random()*180
    prot.rotate(random_axis, random_angle)
    prot.reset_pos(inplace=True)
    assert np.all(np.isclose(prot.center(), np.zeros(3))), "reset_pos"
376 | 
def test_align():
    """align() of a translated-then-rotated copy back onto the original returns the expected R and t."""
    reference=Protein(fn)
    mobile=reference.translate([3.,4.,0.], inplace=False)
    mobile.rotate([1,1,1], 90, inplace=True)
    rotation, shift=mobile.align(reference, ats="CA")
    expected_shift=np.array([-3,-4,0])
    assert np.all(np.isclose(shift, expected_shift, atol=1e-3)), "align 1"
    expected_diag=np.array([1/3,1/3,1/3])
    assert np.all(np.isclose(np.diagonal(rotation), expected_diag, atol=1e-3)), "align 1"
384 | 
385 | ### Merge Object
def test_merge():
    """Split 1a3d into per-chain objects, align chains 2 and 3 onto the first, then merge them."""
    # 1a3d.pdb is a three-chain homotrimer.
    trimer=Protein(f3)
    # one new object per chain
    parts=[ trimer.extract(chain, inplace=False) for chain in trimer.chain_id() ]
    reference=parts[0]
    backbone='N,CA,C,O'
    for idx,part in enumerate(parts[1:], start=1):
        part.rename_chains({'B':'A', 'C':'A'})
        assert part.rmsd(reference, ats=backbone)>10, f"rmsd obj {idx} 1"
        part.align(reference, ats=backbone)
        assert part.rmsd(reference, ats=backbone)<0.001, f"rsmd obj {idx} 1 after align"
    merged=Protein.merge(parts)
    assert len(merged.chain_id())==3, "merge 1"
    assert len(merged)==3*len(parts[0]), "merge 2"
399 | 
if __name__ == "__main__":
    # Ad-hoc entry point for running one test directly; uncomment another call to switch.
    #test_align()
    #test_rs2str()
    test_b_factors()
404 | 


--------------------------------------------------------------------------------
/tutorial/AI.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data2code/afpdb/e18ff796c64ab25e29faf2d091b4f826b7169dcc/tutorial/AI.pdf


--------------------------------------------------------------------------------
/tutorial/Afpdb_Tutorial.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data2code/afpdb/e18ff796c64ab25e29faf2d091b4f826b7169dcc/tutorial/Afpdb_Tutorial.docx


--------------------------------------------------------------------------------
/tutorial/Afpdb_Tutorial.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data2code/afpdb/e18ff796c64ab25e29faf2d091b4f826b7169dcc/tutorial/Afpdb_Tutorial.pdf


--------------------------------------------------------------------------------
/tutorial/Developer.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "ff23b777-c657-4956-b4f1-bf510c8b7a80",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "<b><font size=\"6\">Afpdb - Developer's Note</font></b>"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "markdown",
 13 |    "id": "db1a3ee8-8b93-4f71-8d01-500022f3e4a6",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "# Installation\n",
 17 |     "TODO: When the package is considered stable, we need to publish afpdb to PyPI; installation then simply becomes \"pip install afpdb\".\n",
 18 |     "\n",
 19 |     "For Colab users, please skip this cell.\n",
 20 |     "\n",
 21 |     "The instructions below are for users who would like to install **Afpdb** locally and for developers.\n",
 22 |     "\n",
 23 |     "1. Python\n",
 24 |     "If you do not have Python installed, follow the instructions on https://docs.anaconda.com/free/miniconda/ to install the latest miniconda.\n",
 25 |     "Typing the command ```python``` should launch the Python programming shell, if it is installed successfully.\n",
 26 |     "\n",
 27 |     "2. Install Afpdb\n",
 28 |     "```\n",
 29 |     "pip install git+https://github.com/data2code/afpdb.git\n",
 30 |     "```\n",
 31 |     "\n",
 32 |     "3. Jupyter Notebook (optional)\n",
 33 |     "To view and run this tutorial, Jupyter should be installed:\n",
 34 |     "```\n",
 35 |     "pip install notebook\n",
 36 |     "```\n",
 37 |     "&emsp;&emsp;Type the command ```jupyter notebook``` to launch the Jupyter Notebook, if it is installed successfully.\n",
 38 |     "\n",
 39 |     "&emsp;&emsp;This is no longer needed. However, if the embedded protein structures do not display in Jupyter after rerunning the cell, install the required plugin:\n",
 40 |     "```\n",
 41 |     "jupyter labextension install jupyterlab_3dmol\n",
 42 |     "```\n",
 43 |     "\n",
 44 |     "4. PyMOL (optional)\n",
 45 |     "PyMOL is the preferred application for visualizing protein structures.\n",
 46 |     "It is required by examples using ```thread_sequence()``` or ```PyMOL()```.\n",
 47 |     "To install the open source PyMOL:\n",
 48 |     "\n",
 49 |     "```\n",
 50 |     "conda install conda-forge::pymol-open-source\n",
 51 |     "```\n",
 52 |     "&emsp;&emsp;In Colab, we also need to run:\n",
 53 |     "```\n",
 54 |     "conda install conda-forge::openssl=3.2.0 \n",
 55 |     "```\n",
 56 |     "\n",
 57 |     "5. DSSP (optional)\n",
 58 |     "Required for the secondary structure assignment with method ```dssp()```.\n",
 59 |     "```\n",
 60 |     "conda install sbl::dssp\n",
 61 |     "```\n",
 62 |     "There are multiple options, sbl::dssp suits Apple Silicon.\n",
 63 |     "\n",
 64 |     "6. matplotlib (optional)\n",
 65 |     "Required for the Ramachandran plot example.\n",
 66 |     "```\n",
 67 |     "pip install matplotlib\n",
 68 |     "```\n",
 69 |     "\n",
 70 |     "7. Install pytest as a developer\n",
 71 |     "```\n",
 72 |     "pip install pytest\n",
 73 |     "```\n",
 74 |     "&emsp;&emsp;Typing the command ```pytest``` within the root folder of the Afpdb package will run all test examples in ```tests/test_all.py```.\n",
 75 |     "\n",
 76 |     "&emsp;&emsp;For developers, after we fixed the bugs and passed ```pytest```,  we run ```pip install .``` to update the package under the conda installation."
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "code",
 81 |    "execution_count": 1,
 82 |    "id": "55140b73-dee9-4d29-a25e-48b968f3f04b",
 83 |    "metadata": {},
 84 |    "outputs": [],
 85 |    "source": [
 86 |     "from pathlib import Path\n",
 87 |     "import os\n",
 88 |     "pwd=Path(os.getcwd())\n",
 89 |     "IN_COLAB=str(pwd)==\"/content\" # we are in Google Colab\n",
 90 |     "\n",
 91 |     "if IN_COLAB:\n",
 92 |     "    pwd=Path(\"/content/afpdb/tutorial\")\n",
 93 |     "    # remove local proxy setting\n",
 94 |     "    os.environ[\"https_proxy\"]=\"\"\n",
 95 |     "    os.environ[\"http_proxy\"]=\"\"\n",
 96 |     "    os.environ[\"ftp_proxy\"]=\"\"\n",
 97 |     "    # install afpdb\n",
 98 |     "    if not os.path.isfile(\"INSTALL_AFPDB\"):\n",
 99 |     "        ! git clone https://github.com/data2code/afpdb.git && cd afpdb && pip install .\n",
100 |     "        ! touch INSTALL_AFPDB\n",
101 |     "    from IPython.display import Javascript\n",
102 |     "    display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 50000})'''))\n",
103 |     "    from IPython.display import HTML, display\n",
104 |     "    def set_css():\n",
105 |     "        display(HTML('''\n",
106 |     "          <style>\n",
107 |     "            pre {\n",
108 |     "                white-space: pre-wrap;\n",
109 |     "            }\n",
110 |     "          </style>\n",
111 |     "        '''))\n",
112 |     "    get_ipython().events.register('pre_run_cell', set_css)\n",
113 |     "else: # in a local jupyter notebook\n",
114 |     "    %reload_ext autoreload\n",
115 |     "    %autoreload 2\n",
116 |     "    # we assume afpdb has been preinstalled\n",
117 |     "\n",
118 |     "def install_pymol():\n",
119 |     "    try:\n",
120 |     "        import pymol2\n",
121 |     "    except Exception as e:\n",
122 |     "        if not IN_COLAB:\n",
123 |     "            print(\"Please install PyMOL first!\")\n",
124 |     "        else:\n",
125 |     "            !pip install -q condacolab\n",
126 |     "            import condacolab\n",
127 |     "            condacolab.install()\n",
128 |     "            ! conda install conda-forge::pymol-open-source\n",
129 |     "            print(\"Colab does not have openssl 3.2.0, install it...\")\n",
130 |     "            ! conda install conda-forge::openssl=3.2.0\n",
131 |     "            import pymol2\n",
132 |     "\n",
133 |     "from afpdb.afpdb import Protein,util,RS,RL,ATS\n",
134 |     "import numpy as np\n",
135 |     "import pandas as pd\n",
136 |     "import re\n",
137 |     "# two example PDB files used in this tutorial\n",
138 |     "fn = pwd / \"example_files/5cil.pdb\"\n",
139 |     "fk = pwd / \"example_files/fake.pdb\""
140 |    ]
141 |   },
142 |   {
143 |    "cell_type": "markdown",
144 |    "id": "5e9d9282-8892-49e5-82a6-975c1dc22111",
145 |    "metadata": {},
146 |    "source": [
147 |     "# Selection\n",
148 |     "When creating a method that takes a selection argument named 'rs', the first step is to convert it into an internal selection object using:\n",
149 |     "```rs = self.rs(rs)```. This converts the argument into an RS object, whose ```data``` member stores the residue indices. Similarly, if we have an atom selection argument named 'ats', do ```ats=self.ats(ats)```. Likewise, if we take a residue list object, we do ```rl=self.rl(rl)```. When we use a residue/atom selection to index atom_positions or atom_mask, check if the selection is empty/full with ats.is_empty() and ats.is_full(). An empty selection often implies an error on the user's side; a full selection means you can skip the indexing, as the original array is already good.\n",
150 |     "\n",
151 |     "Please use ```extract()``` as an example to see how we support selection arguments.\n",
152 |     "\n",
153 |     "# Change in residue/chain\n",
154 |     "The Protein class contains a data structure called ```res_map```, which is a dictionary that maps a full residue name \"{chain}{residue_id}{code}\" into its internal ndarray index. A few methods rely on this mapping. Therefore, whenever a method renames a chain, changes chain orders, mutates a residue, or changes the full residue name and its internal index, ```self._make_res_map()``` should be called at the end. This is also needed in ```extract()``` as the underlying arrays have been changed.\n",
155 |     "\n",
156 |     "# Residue Identifier\n",
157 |     "When outputting a dataframe containing a residue, our recommendation is to provide all residue ID formats. This includes chain, resn, resn_i, resi. Please use ```rs_dist``` as an example. We often use the resi column to create a Residue List object, then use its ```name, namei, chain, aa``` methods to add additional residue annotation data. See the example under ```rs_dist()```.\n",
158 |     "\n",
159 |     "# inplace\n",
160 |     "To support ```inplace```, the idiom is to use: ```obj = self if inplace else self.clone()```, then use obj to manipulate the structure.\n",
161 |     "Please set inplace=False as the default, so that users do not have to memorize what the default is.\n",
162 |     "\n",
163 |     "# Extract Atom Coordinates\n",
164 |     "\n",
165 |     "```p.data.atom_positions``` contains non-existent atoms. It is often faster to compute distances between two residue sets, if we only keep the coordinates for real atoms. This is done with ```_get_xyz()``` method, which returns three variables: (residue_indices, atom_indices, XYZ_array)."
166 |    ]
167 |   },
168 |   {
169 |    "cell_type": "code",
170 |    "execution_count": 2,
171 |    "id": "45e25018-805a-4428-b155-2ac9bffeb93f",
172 |    "metadata": {},
173 |    "outputs": [
174 |     {
175 |      "name": "stdout",
176 |      "output_type": "stream",
177 |      "text": [
178 |       "Warning: residues with insertion code: L6A, L6B\n",
179 |       "Residue ID: [4 4 5 5] ['3', '3', '4', '4'] \n",
180 |       "\n",
181 |       "Atom ID: [0 1 0 1] ['N', 'CA', 'N', 'CA'] \n",
182 |       "\n",
183 |       "XYZ: [[ 27.36800003   6.44000006 -19.10700035]\n",
184 |       " [ 25.96999931   6.87099981 -19.03800011]\n",
185 |       " [ 25.29100037   9.00800037 -18.09000015]\n",
186 |       " [ 25.11199951   9.9829998  -16.98600006]]\n"
187 |      ]
188 |     }
189 |    ],
190 |    "source": [
191 |     "p=Protein(fk)\n",
192 |     "rs_i, atom_i, xyz=p._get_xyz(p.rs(\"H\"), p.ats(\"N,CA\"))\n",
193 |     "print(\"Residue ID:\", rs_i, p.rl(rs_i).name(), \"\\n\")\n",
194 |     "print(\"Atom ID:\", atom_i, [str(p.ats(x)) for x in atom_i], \"\\n\")\n",
195 |     "print(\"XYZ:\", xyz)"
196 |    ]
197 |   },
198 |   {
199 |    "cell_type": "markdown",
200 |    "id": "42592323-e4f2-41b0-99ca-81c4eba49cd6",
201 |    "metadata": {},
202 |    "source": [
203 |     "Note: To extract a rectangular subarray of rows and columns, we need to use ```np.ix_```."
204 |    ]
205 |   },
206 |   {
207 |    "cell_type": "code",
208 |    "execution_count": 3,
209 |    "id": "46b1a5fc-1af5-4aa7-8661-424049b599e9",
210 |    "metadata": {},
211 |    "outputs": [
212 |     {
213 |      "name": "stdout",
214 |      "output_type": "stream",
215 |      "text": [
216 |       "Warning: residues with insertion code: L6A, L6B\n",
217 |       "shape mismatch: indexing arrays could not be broadcast together with shapes (2,) (6,) \n",
218 |       "\n",
219 |       "\n",
220 |       "(2, 6) \n",
221 |       "a [[1. 1. 1. 1. 1. 0.]\n",
222 |       " [1. 1. 1. 1. 1. 1.]] \n",
223 |       "\n",
224 |       "[[1. 1. 1. 1. 1. 0.]\n",
225 |       " [1. 1. 1. 1. 1. 1.]]\n"
226 |      ]
227 |     }
228 |    ],
229 |    "source": [
230 |     "p=Protein(fk)\n",
231 |     "# the following is an error, as the row indices have two residues but the column indices have 6 atoms\n",
232 |     "# NumPy tries to pair the indices\n",
233 |     "try:\n",
234 |     "    p.data.atom_mask[np.array([2,3]), ATS(\"N,CA,C,O,CB,CG\").data]\n",
235 |     "except Exception as e:\n",
236 |     "    print(e)\n",
237 |     "# The correct way is to generate a mesh indices\n",
238 |     "print(\"\\n\")\n",
239 |     "x=p.data.atom_mask[np.ix_(np.array([2,3]), ATS(\"N,CA,C,O,CB,CG\").data)]\n",
240 |     "print(x.shape, \"\\na\", x, \"\\n\")\n",
241 |     "# or\n",
242 |     "print(p.data.atom_mask[np.array([2,3])][:, ATS(\"N,CA,C,O,CB,CG\").data])"
243 |    ]
244 |   },
245 |   {
246 |    "cell_type": "markdown",
247 |    "id": "b5669d71-b3dc-49b1-9ba7-eb9785396ff1",
248 |    "metadata": {},
249 |    "source": [
250 |     "## Extract Atom Pair Coordinates\n",
251 |     "\n",
252 |     "For ```align``` and ```rmsd```, we need to extract atom coordinates in pairs, we can use ```_get_xyz_pair```.\n",
253 |     "\n",
254 |     "Note: If two residues have different types (their side chain atoms are different), only the common atoms are included."
255 |    ]
256 |   },
257 |   {
258 |    "cell_type": "code",
259 |    "execution_count": 4,
260 |    "id": "a9330d5a-3167-40a7-92ae-ad84b8e55da1",
261 |    "metadata": {},
262 |    "outputs": [
263 |     {
264 |      "name": "stdout",
265 |      "output_type": "stream",
266 |      "text": [
267 |       "Warning: residues with insertion code: L6A, L6B\n",
268 |       "Protein i\n",
269 |       "\n",
270 |       "Residue ID: [4 4 5 5] ['3', '3', '4', '4'] \n",
271 |       "\n",
272 |       "Atom ID: [0 1 0 1] ['N', 'CA', 'N', 'CA'] \n",
273 |       "\n",
274 |       "XYZ: [[ 27.36800003   6.44000006 -19.10700035]\n",
275 |       " [ 25.96999931   6.87099981 -19.03800011]\n",
276 |       " [ 25.29100037   9.00800037 -18.09000015]\n",
277 |       " [ 25.11199951   9.9829998  -16.98600006]]\n",
278 |       "\n",
279 |       "\n",
280 |       "\n",
281 |       "Protein j\n",
282 |       "\n",
283 |       "Residue ID: [4 4 5 5] ['3', '3', '4', '4'] \n",
284 |       "\n",
285 |       "Atom ID: [0 1 0 1] ['N', 'CA', 'N', 'CA'] \n",
286 |       "\n",
287 |       "XYZ: [[ 28.36800003   6.44000006 -20.10700035]\n",
288 |       " [ 26.96999931   6.87099981 -20.03800011]\n",
289 |       " [ 26.29100037   9.00800037 -19.09000015]\n",
290 |       " [ 26.11199951   9.9829998  -17.98600006]]\n"
291 |      ]
292 |     }
293 |    ],
294 |    "source": [
295 |     "p=Protein(fk)\n",
296 |     "# move X by 1 and Z by -1, Y remains the same\n",
297 |     "q=p.translate(np.array([1,0,-1]), inplace=False)\n",
298 |     "rs_i, atom_i, rs_j, atom_j, xyz_i, xyz_j=p._get_xyz_pair(q, p.rs(\"H\"), q.rs(\"H\"), ATS(\"N,CA\"))\n",
299 |     "print(\"Protein i\\n\")\n",
300 |     "print(\"Residue ID:\", rs_i, p.rl(rs_i).name(), \"\\n\")\n",
301 |     "print(\"Atom ID:\", atom_i, [str(p.ats(x)) for x in atom_i], \"\\n\")\n",
302 |     "print(\"XYZ:\", xyz_i)\n",
303 |     "print(\"\\n\\n\")\n",
304 |     "print(\"Protein j\\n\")\n",
305 |     "print(\"Residue ID:\", rs_j, p.rl(rs_j).name(), \"\\n\")\n",
306 |     "print(\"Atom ID:\", atom_j, [str(p.ats(x)) for x in atom_j], \"\\n\")\n",
307 |     "print(\"XYZ:\", xyz_j)"
308 |    ]
309 |   },
310 |   {
311 |    "cell_type": "markdown",
312 |    "id": "f0166b8a-d024-46a1-a27f-bca6a06430da",
313 |    "metadata": {},
314 |    "source": [
315 |     "## Caution\n",
316 |     "When we add a new method, please keep in mind that the residue index may not start from 1, a residue index may contain insertion code, there can be gaps in the residue index (missing residues), the integer part of the residue index may not be unique within a chain (e.g. 6A and 6B). You should use the file \"fk\" to test your method. Please also add a corresponding test method into ```tests/test_all.py```."
317 |    ]
318 |   },
319 |   {
320 |    "cell_type": "code",
321 |    "execution_count": null,
322 |    "id": "f9c06119-24a1-48cc-b45a-fc7ff693b329",
323 |    "metadata": {},
324 |    "outputs": [],
325 |    "source": []
326 |   }
327 |  ],
328 |  "metadata": {
329 |   "kernelspec": {
330 |    "display_name": "Python 3 (ipykernel)",
331 |    "language": "python",
332 |    "name": "python3"
333 |   },
334 |   "language_info": {
335 |    "codemirror_mode": {
336 |     "name": "ipython",
337 |     "version": 3
338 |    },
339 |    "file_extension": ".py",
340 |    "mimetype": "text/x-python",
341 |    "name": "python",
342 |    "nbconvert_exporter": "python",
343 |    "pygments_lexer": "ipython3",
344 |    "version": "3.11.8"
345 |   }
346 |  },
347 |  "nbformat": 4,
348 |  "nbformat_minor": 5
349 | }
350 | 


--------------------------------------------------------------------------------
/tutorial/Developer.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data2code/afpdb/e18ff796c64ab25e29faf2d091b4f826b7169dcc/tutorial/Developer.pdf


--------------------------------------------------------------------------------
/tutorial/afpdb.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data2code/afpdb/e18ff796c64ab25e29faf2d091b4f826b7169dcc/tutorial/afpdb.pdf


--------------------------------------------------------------------------------
/tutorial/example_files/fake.pdb:
--------------------------------------------------------------------------------
 1 | MODEL     1
 2 | ATOM      1  N   GLU L   5       5.195 -14.817 -19.187  1.00  1.00           N  
 3 | ATOM      2  CA  GLU L   5       6.302 -14.276 -18.361  1.00  1.00           C  
 4 | ATOM      3  C   GLU L   5       7.148 -15.388 -17.731  1.00  1.00           C  
 5 | ATOM      4  CB  GLU L   5       5.794 -13.368 -17.248  1.00  1.00           C  
 6 | ATOM      5  O   GLU L   5       6.658 -16.231 -17.006  1.00  1.00           O  
 7 | ATOM      6  CG  GLU L   5       6.934 -12.664 -16.494  1.00  1.00           C  
 8 | ATOM      7  CD  GLU L   5       6.461 -11.817 -15.327  1.00  1.00           C  
 9 | ATOM      8  OE1 GLU L   5       7.282 -11.138 -14.677  1.00  1.00           O  
10 | ATOM      9  OE2 GLU L   5       5.243 -11.804 -15.070  1.00  1.00           O  
11 | ATOM     10  N   ILE L   6A      8.444 -15.321 -17.934  1.00  1.00           N  
12 | ATOM     11  CA  ILE L   6A      9.340 -16.291 -17.336  1.00  1.00           C  
13 | ATOM     12  C   ILE L   6A      9.657 -15.849 -15.925  1.00  1.00           C  
14 | ATOM     13  CB  ILE L   6A     10.604 -16.433 -18.162  1.00  1.00           C  
15 | ATOM     14  O   ILE L   6A     10.192 -14.739 -15.685  1.00  1.00           O  
16 | ATOM     15  CG1 ILE L   6A     10.228 -16.847 -19.590  1.00  1.00           C  
17 | ATOM     16  CG2 ILE L   6A     11.540 -17.469 -17.523  1.00  1.00           C  
18 | ATOM     17  CD1 ILE L   6A     11.401 -17.319 -20.426  1.00  1.00           C  
19 | ATOM     18  N   VAL L   6B      9.339 -16.725 -14.982  1.00  1.00           N  
20 | ATOM     19  CA  VAL L   6B      9.678 -16.518 -13.599  1.00  1.00           C  
21 | ATOM     20  C   VAL L   6B     11.024 -17.188 -13.330  1.00  1.00           C  
22 | ATOM     21  CB  VAL L   6B      8.569 -17.028 -12.666  1.00  1.00           C  
23 | ATOM     22  O   VAL L   6B     11.242 -18.372 -13.679  1.00  1.00           O  
24 | ATOM     23  CG1 VAL L   6B      8.960 -16.919 -11.194  1.00  1.00           C  
25 | ATOM     24  CG2 VAL L   6B      7.268 -16.234 -12.927  1.00  1.00           C  
26 | ATOM     40  N   GLN L  10      15.587 -17.776  -7.649  1.00  1.00           N  
27 | ATOM     41  CA  GLN L  10      16.895 -17.892  -7.030  1.00  1.00           C  
28 | ATOM     42  C   GLN L  10      16.721 -18.330  -5.569  1.00  1.00           C  
29 | ATOM     43  CB  GLN L  10      17.616 -16.572  -7.093  1.00  1.00           C  
30 | ATOM     44  O   GLN L  10      16.270 -17.557  -4.746  1.00  1.00           O  
31 | ATOM     45  CG  GLN L  10      17.963 -16.094  -8.483  1.00  1.00           C  
32 | ATOM     46  CD  GLN L  10      18.781 -14.822  -8.460  1.00  1.00           C  
33 | ATOM     47  NE2 GLN L  10      20.052 -14.951  -8.083  1.00  1.00           N  
34 | ATOM     48  OE1 GLN L  10      18.284 -13.727  -8.786  1.00  1.00           O  
35 | ATOM    862  N   LEU H   3      27.368   6.440 -19.107  1.00  1.00           N  
36 | ATOM    863  CA  LEU H   3      25.970   6.871 -19.038  1.00  1.00           C  
37 | ATOM    864  C   LEU H   3      25.761   7.794 -17.840  1.00  1.00           C  
38 | ATOM    865  CB  LEU H   3      25.089   5.647 -18.873  1.00  1.00           C  
39 | ATOM    866  O   LEU H   3      25.979   7.398 -16.661  1.00  1.00           O  
40 | ATOM    867  CG  LEU H   3      25.225   4.606 -19.964  1.00  1.00           C  
41 | ATOM    868  CD1 LEU H   3      24.282   3.430 -19.748  1.00  1.00           C  
42 | ATOM    869  CD2 LEU H   3      24.962   5.190 -21.355  1.00  1.00           C  
43 | ATOM    870  N   VAL H   4      25.291   9.008 -18.090  1.00  1.00           N  
44 | ATOM    871  CA  VAL H   4      25.112   9.983 -16.986  1.00  1.00           C  
45 | ATOM    872  C   VAL H   4      23.672  10.399 -16.963  1.00  1.00           C  
46 | ATOM    873  CB  VAL H   4      25.976  11.257 -17.222  1.00  1.00           C  
47 | ATOM    874  O   VAL H   4      23.143  10.875 -17.973  1.00  1.00           O  
48 | ATOM    875  CG1 VAL H   4      25.686  12.319 -16.177  1.00  1.00           C  
49 | ATOM    876  CG2 VAL H   4      27.466  10.902 -17.258  1.00  1.00           C  
50 | TER     877      VAL H   4
51 | ENDMDL
52 | END   
53 | 


--------------------------------------------------------------------------------
/tutorial/img/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data2code/afpdb/e18ff796c64ab25e29faf2d091b4f826b7169dcc/tutorial/img/.DS_Store


--------------------------------------------------------------------------------
/tutorial/img/afpdb.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data2code/afpdb/e18ff796c64ab25e29faf2d091b4f826b7169dcc/tutorial/img/afpdb.png


--------------------------------------------------------------------------------
/tutorial/img/afpdb_numpy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data2code/afpdb/e18ff796c64ab25e29faf2d091b4f826b7169dcc/tutorial/img/afpdb_numpy.png


--------------------------------------------------------------------------------
/tutorial/img/demo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data2code/afpdb/e18ff796c64ab25e29faf2d091b4f826b7169dcc/tutorial/img/demo.png


--------------------------------------------------------------------------------
/tutorial/img/drSASA.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data2code/afpdb/e18ff796c64ab25e29faf2d091b4f826b7169dcc/tutorial/img/drSASA.png


--------------------------------------------------------------------------------
/tutorial/img/mypm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data2code/afpdb/e18ff796c64ab25e29faf2d091b4f826b7169dcc/tutorial/img/mypm.png


--------------------------------------------------------------------------------
/tutorial/mypm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data2code/afpdb/e18ff796c64ab25e29faf2d091b4f826b7169dcc/tutorial/mypm.png


--------------------------------------------------------------------------------