├── utils
    ├── __pycache__
    │   ├── util.cpython-37.pyc
    │   ├── util.cpython-38.pyc
    │   ├── util.cpython-39.pyc
    │   ├── util.cpython-310.pyc
    │   ├── parsers.cpython-310.pyc
    │   ├── parsers.cpython-37.pyc
    │   ├── parsers.cpython-38.pyc
    │   ├── parsers.cpython-39.pyc
    │   ├── align_pdbs.cpython-310.pyc
    │   ├── align_pdbs.cpython-37.pyc
    │   ├── align_pdbs.cpython-38.pyc
    │   ├── align_pdbs.cpython-39.pyc
    │   ├── kinematics.cpython-310.pyc
    │   ├── kinematics.cpython-37.pyc
    │   ├── kinematics.cpython-38.pyc
    │   ├── kinematics.cpython-39.pyc
    │   ├── kabsch_align.cpython-37.pyc
    │   ├── dunbrack_rotlib.cpython-310.pyc
    │   ├── dunbrack_rotlib.cpython-37.pyc
    │   ├── dunbrack_rotlib.cpython-38.pyc
    │   └── dunbrack_rotlib.cpython-39.pyc
    ├── kabsch_align.py
    ├── dunbrack_rotlib.py
    ├── align_pdbs.py
    └── util.py
├── examples
    ├── P450
    │   ├── command
    │   └── inputs
    │   │   ├── HBA_CYS_P450_nosample.cst
    │   │   ├── P450_motif.pdb
    │   │   └── HBA_unique.params
    └── Kemp_eliminase
    │   ├── command
    │   └── inputs
    │       ├── BIO.params
    │       └── BIO_His_ED_oxy_nosample.cst
├── utils.py
├── README.md
├── invrotzyme.py
└── protocol.py


/utils/__pycache__/util.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ikalvet/invrotzyme/HEAD/utils/__pycache__/util.cpython-37.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/util.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ikalvet/invrotzyme/HEAD/utils/__pycache__/util.cpython-38.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/util.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ikalvet/invrotzyme/HEAD/utils/__pycache__/util.cpython-39.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/util.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ikalvet/invrotzyme/HEAD/utils/__pycache__/util.cpython-310.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/parsers.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ikalvet/invrotzyme/HEAD/utils/__pycache__/parsers.cpython-310.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/parsers.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ikalvet/invrotzyme/HEAD/utils/__pycache__/parsers.cpython-37.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/parsers.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ikalvet/invrotzyme/HEAD/utils/__pycache__/parsers.cpython-38.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/parsers.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ikalvet/invrotzyme/HEAD/utils/__pycache__/parsers.cpython-39.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/align_pdbs.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ikalvet/invrotzyme/HEAD/utils/__pycache__/align_pdbs.cpython-310.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/align_pdbs.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ikalvet/invrotzyme/HEAD/utils/__pycache__/align_pdbs.cpython-37.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/align_pdbs.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ikalvet/invrotzyme/HEAD/utils/__pycache__/align_pdbs.cpython-38.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/align_pdbs.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ikalvet/invrotzyme/HEAD/utils/__pycache__/align_pdbs.cpython-39.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/kinematics.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ikalvet/invrotzyme/HEAD/utils/__pycache__/kinematics.cpython-310.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/kinematics.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ikalvet/invrotzyme/HEAD/utils/__pycache__/kinematics.cpython-37.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/kinematics.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ikalvet/invrotzyme/HEAD/utils/__pycache__/kinematics.cpython-38.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/kinematics.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ikalvet/invrotzyme/HEAD/utils/__pycache__/kinematics.cpython-39.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/kabsch_align.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ikalvet/invrotzyme/HEAD/utils/__pycache__/kabsch_align.cpython-37.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/dunbrack_rotlib.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ikalvet/invrotzyme/HEAD/utils/__pycache__/dunbrack_rotlib.cpython-310.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/dunbrack_rotlib.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ikalvet/invrotzyme/HEAD/utils/__pycache__/dunbrack_rotlib.cpython-37.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/dunbrack_rotlib.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ikalvet/invrotzyme/HEAD/utils/__pycache__/dunbrack_rotlib.cpython-38.pyc


--------------------------------------------------------------------------------
/utils/__pycache__/dunbrack_rotlib.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ikalvet/invrotzyme/HEAD/utils/__pycache__/dunbrack_rotlib.cpython-39.pyc


--------------------------------------------------------------------------------
/examples/P450/command:
--------------------------------------------------------------------------------
1 | python ../../invrotzyme.py --cstfile inputs/HBA_CYS_P450_nosample.cst --params inputs/HBA_unique.params --motif_for_cst 1:3:inputs/P450_motif.pdb --frac_random_rotamers 0.1 --prefix outputs/
2 | 


--------------------------------------------------------------------------------
/examples/Kemp_eliminase/command:
--------------------------------------------------------------------------------
1 | python ../../invrotzyme.py --cstfile inputs/BIO_His_ED_oxy_nosample.cst --params inputs/BIO.params --dunbrack_prob 0.6 --frac_random_rotamers_per_cst 0.5 0.5 0.5 0.5 --secstruct_per_cst H H E --prefix outputs/ --suffix HHE
2 | 


--------------------------------------------------------------------------------
/examples/P450/inputs/HBA_CYS_P450_nosample.cst:
--------------------------------------------------------------------------------
 1 | # cst constraint descriptor for ferryl intermediate C-H abstraction TS from methoxybiphenyl
 2 | # CYS coordinating to the Heme Fe based on P450 geometry
 3 | # I. Kalvet, Baker lab, UW,   ikalvet@uw.edu
 4 | 
 5 | 
 6 | #block 1 for CYS coordinated to Fe
 7 | 
 8 | CST::BEGIN
 9 | 
10 |   TEMPLATE::   ATOM_MAP: 1 atom_name: FE1 N4 C19
11 |   TEMPLATE::   ATOM_MAP: 1 residue3:  HBA
12 | 
13 |   TEMPLATE::   ATOM_MAP: 2 atom_type: SH1
14 |   TEMPLATE::   ATOM_MAP: 2 residue3: CYS
15 | 
16 |   CONSTRAINT:: distanceAB:    2.5   0.15  100.   1   0
17 |   CONSTRAINT::    angle_A:   85.9   5.0  100.0  360. 0
18 |   CONSTRAINT::    angle_B:  111.0   5.0   75.0  360. 0
19 |   CONSTRAINT::  torsion_A:   84.5   5.0   75.0  360. 0
20 |   CONSTRAINT:: torsion_AB:  108.0  15.0    0.0  360. 0
21 |   CONSTRAINT::  torsion_B:   82.4  20.0   25.0  360. 0
22 | 
23 |   ALGORITHM_INFO:: match
24 |      MAX_DUNBRACK_ENERGY 5.0
25 |      IGNORE_UPSTREAM_PROTON_CHI
26 |   ALGORITHM_INFO::END
27 | 
28 | CST::END
29 | 
30 | 
31 | 


--------------------------------------------------------------------------------
/examples/Kemp_eliminase/inputs/BIO.params:
--------------------------------------------------------------------------------
 1 | NAME BIO
 2 | IO_STRING BIO Z
 3 | TYPE LIGAND
 4 | AA UNK
 5 | ATOM  C1  aroC  X   -0.01
 6 | ATOM  C6  aroC  X   -0.03
 7 | ATOM  C5  aroC  X   0.13
 8 | ATOM  N2  Npro  X   0.06
 9 | ATOM  O3  ONH2  X   -0.13
10 | ATOM  O2  ONH2  X   -0.13
11 | ATOM  C4  aroC  X   -0.02
12 | ATOM  C3  aroC  X   0.04
13 | ATOM  C7  aroC  X   0.08
14 | ATOM  N1  Nhis  X   -0.09
15 | ATOM  O1  ONH2  X   -1.05
16 | ATOM  C2  aroC  X   0.17
17 | ATOM  H4  Haro  X   0.09
18 | ATOM  H2  Haro  X   0.06
19 | ATOM  H3  Haro  X   0.06
20 | ATOM  H1  Haro  X   0.07
21 | BOND_TYPE  O3   N2  2   
22 | BOND_TYPE  N2   O2  2   
23 | BOND_TYPE  N2   C5  1   
24 | BOND_TYPE  H2   C4  1   
25 | BOND_TYPE  C4   C5  4   
26 | BOND_TYPE  C4   C3  4   
27 | BOND_TYPE  C5   C6  4   
28 | BOND_TYPE  H4   C7  1   
29 | BOND_TYPE  C3   C7  4   
30 | BOND_TYPE  C3   C2  4   
31 | BOND_TYPE  C6   H3  1   
32 | BOND_TYPE  C6   C1  4   
33 | BOND_TYPE  C7   N1  4   
34 | BOND_TYPE  C2   C1  4   
35 | BOND_TYPE  C2   O1  4   
36 | BOND_TYPE  C1   H1  1   
37 | BOND_TYPE  N1   O1  4   
38 | CHI 1  C6   C5   N2   O3 
39 | NBR_ATOM  C4 
40 | NBR_RADIUS 4.083104
41 | ICOOR_INTERNAL    C1     0.000000    0.000000    0.000000   C1    C6    C5 
42 | ICOOR_INTERNAL    C6     0.000000  180.000000    1.382716   C1    C6    C5 
43 | ICOOR_INTERNAL    C5     0.000000   59.182789    1.409222   C6    C1    C5 
44 | ICOOR_INTERNAL    N2  -179.998004   61.281665    1.447079   C5    C6    C1 
45 | ICOOR_INTERNAL    O3  -179.984056   61.433633    1.236799   N2    C5    C6 
46 | ICOOR_INTERNAL    O2   179.986424   61.557617    1.239696   N2    C5    O3 
47 | ICOOR_INTERNAL    C4   179.889661   57.795737    1.396329   C5    C6    N2 
48 | ICOOR_INTERNAL    C3     0.095219   63.144774    1.385375   C4    C5    C6 
49 | ICOOR_INTERNAL    C7  -179.747781   48.470187    1.449305   C3    C4    C5 
50 | ICOOR_INTERNAL    N1   179.814748   63.412331    1.251719   C7    C3    C4 
51 | ICOOR_INTERNAL    O1     0.121877   78.864331    1.794487   N1    C7    C3 
52 | ICOOR_INTERNAL    C2    -0.218402   78.350293    1.303685   O1    N1    C7 
53 | ICOOR_INTERNAL    H4   179.979921   55.243279    1.277743   C7    C3    N1 
54 | ICOOR_INTERNAL    H2  -179.886000   56.975238    1.084893   C4    C5    C3 
55 | ICOOR_INTERNAL    H3  -179.972679   59.482293    1.082576   C6    C1    C5 
56 | ICOOR_INTERNAL    H1   179.798701   58.039008    1.083917   C1    C6    C5 
57 | 


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | """
 4 | Created on Sun Aug 25 23:12:52 2024
 5 | 
 6 | @author: indrek
 7 | """
 8 | import numpy as np
 9 | 
10 | 
# Side-chain chi count for each residue name3; proton chis are not counted.
N_chis = dict(
    ALA=0, ARG=4, TRP=2, GLY=0, ASP=2, HIS=2, GLU=3,
    GLN=3, ASN=2, LEU=2, ILE=2, THR=1, VAL=1, SER=1,
    MET=3, CYS=1, PRO=3, LYS=4, PHE=2, TYR=2, CYX=1,
)


# Ideal backbone PHI/PSI per secondary-structure letter. Each tuple holds
# (ideal value, tolerance used for randomization). "-" shares the "E" values.
idealized_SS_phi_psi = {
    "H": dict(phi=(-57.0, 10.0), psi=(-47.0, 10.0)),
    "E": dict(phi=(-140.0, 20.0), psi=(130.0, 20.0)),
    "-": dict(phi=(-140.0, 20.0), psi=(130.0, 20.0)),
}
21 | 
22 | 
def get_dist(a, b):
    """
    Returns the Euclidean distance between points a and b.

    a, b (array-like)
    Coordinates of the two points. Lists and tuples are accepted as well as
    numpy arrays (same convention as get_angle).
    """
    # Coerce first so that plain Python sequences can be subtracted.
    return np.linalg.norm(np.asarray(a, dtype=float) - np.asarray(b, dtype=float))
25 | 
26 | 
def get_angle(a1, a2, a3):
    """
    Returns the angle a1-a2-a3 (vertex at a2) in degrees, rounded to one
    decimal place.

    a1, a2, a3 (array-like)
    Coordinates of the three points.

    If a1 or a3 coincides with a2 the result is NaN (zero-length vector).
    """
    a1 = np.array(a1)
    a2 = np.array(a2)
    a3 = np.array(a3)

    ba = a1 - a2
    bc = a3 - a2

    cosine_angle = np.dot(ba, bc) / (np.linalg.norm(ba) * np.linalg.norm(bc))
    # Round-off can push the cosine marginally outside [-1, 1] for collinear
    # points, which would make arccos return NaN.
    cosine_angle = np.clip(cosine_angle, -1.0, 1.0)
    angle = np.arccos(cosine_angle)

    return round(np.degrees(angle), 1)
39 | 
40 | 
41 | 
def get_dihedral(a1, a2, a3, a4):
    """
    a1, a2, a3, a4 (array-like)
    Each has to contain 3 numbers corresponding to X, Y and Z of an atom.
    Solution by 'Praxeolitic' from Stackoverflow:
    https://stackoverflow.com/questions/20305272/dihedral-torsion-angle-from-four-points-in-cartesian-coordinates-in-python#
    1 sqrt, 1 cross product
    Calculates the dihedral/torsion between atoms a1, a2, a3 and a4
    Output is in degrees
    """
    # Work on float arrays: the in-place normalisation below cannot be
    # performed on integer arrays, and lists cannot be subtracted.
    a1 = np.asarray(a1, dtype=float)
    a2 = np.asarray(a2, dtype=float)
    a3 = np.asarray(a3, dtype=float)
    a4 = np.asarray(a4, dtype=float)

    b0 = a1 - a2
    b1 = a3 - a2
    b2 = a4 - a3

    # normalize b1 so that it does not influence magnitude of vector
    # rejections that come next (b1 is a fresh array, inputs are untouched)
    b1 /= np.linalg.norm(b1)

    # vector rejections
    # v = projection of b0 onto plane perpendicular to b1
    #   = b0 minus component that aligns with b1
    # w = projection of b2 onto plane perpendicular to b1
    #   = b2 minus component that aligns with b1
    v = b0 - np.dot(b0, b1)*b1
    w = b2 - np.dot(b2, b1)*b1

    # angle between v and w in a plane is the torsion angle
    # v and w may not be normalized but that's fine since tan is y/x
    x = np.dot(v, w)
    y = np.dot(np.cross(b1, v), w)
    return np.degrees(np.arctan2(y, x))
74 | 
75 | 
def rmsd(geom, target):
    """
    Root of the mean squared difference between geom and target.
    The mean runs over every element of the difference (all coordinates),
    not over rows.
    """
    deviation = geom - target
    return np.sqrt(np.mean(np.square(deviation)))
78 | 
79 | 
80 | 


--------------------------------------------------------------------------------
/examples/Kemp_eliminase/inputs/BIO_His_ED_oxy_nosample.cst:
--------------------------------------------------------------------------------
  1 | # Rosetta matcher/enzdes CST description for Kemp Eliminase
  2 | # Active site consisting of a HIS-GLU/ASP dyad and SER/THR/TYR/GLN/ASN oxyanion hole
  3 | # CYS coordinating to the Heme Fe based on UPO geometry
  4 | # I. Kalvet, Baker lab, UW,   ikalvet@uw.edu
  5 | 
  6 | 
  7 | ################## CST_1 ( His base ) ###############
  8 | CST::BEGIN
  9 | 
 10 |   TEMPLATE::   ATOM_MAP: 1 atom_name: C7 N1 O1
 11 |   TEMPLATE::   ATOM_MAP: 1 residue3:  BIO
 12 | 
 13 |   TEMPLATE::   ATOM_MAP: 2 atom_type: Nhis
 14 |   TEMPLATE::   ATOM_MAP: 2 residue1: H
 15 | 
 16 |   CONSTRAINT:: distanceAB:    2.68   0.15  100.   1   0
 17 |   CONSTRAINT::    angle_A:   125.8  5.0  100.0  360. 0
 18 |   CONSTRAINT::    angle_B:  114.7   5.0   75.0  360. 0
 19 |   CONSTRAINT::  torsion_A:  180.0   5.0   75.0  360. 0
 20 |   CONSTRAINT:: torsion_AB:   58.5  45.0    0.0   90. 0
 21 |   CONSTRAINT::  torsion_B:  180.0   5.0   25.0  360. 0
 22 | 
 23 | CST::END
 24 | 
 25 | ################## CST_2 ( GLU/ASP activating His ) ###############
 26 | CST::BEGIN
 27 | 
 28 | TEMPLATE::   ATOM_MAP: 1 atom_type: Ntrp
 29 | TEMPLATE::   ATOM_MAP: 1 residue3:  HIS
 30 | 
 31 | TEMPLATE::   ATOM_MAP: 2 atom_type: OOC
 32 | TEMPLATE::   ATOM_MAP: 2 residue1: ED
 33 | 
 34 | 
 35 | CONSTRAINT:: distanceAB:   2.62  0.2   100.  1    0
 36 | CONSTRAINT::    angle_A:  126.0  15.0   50.0  360. 0
 37 | CONSTRAINT::    angle_B:  106.5  25.0   50.0  180. 0
 38 | CONSTRAINT::  torsion_A:    0.0  25.0   50.0  180. 0
 39 | CONSTRAINT:: torsion_AB:   90.0  10.0    0.0  180. 0
 40 | CONSTRAINT::  torsion_B:  180.0  60.0   25.0  360. 0
 41 | 
 42 |   ALGORITHM_INFO:: match
 43 |      SECONDARY_MATCH: UPSTREAM_CST 1
 44 |   ALGORITHM_INFO::END
 45 | 
 46 | CST::END
 47 | 
 48 | 
 49 | 
 50 | ################## CST_3 ( oxyanion hole ) ###############
 51 | ############  either SER/THR or TYR or ASN/GLN  ##########
 52 | VARIABLE_CST::BEGIN
 53 | 
 54 |  CST::BEGIN
 55 |   TEMPLATE::   ATOM_MAP: 1 atom_name: O1 N1 C7
 56 |   TEMPLATE::   ATOM_MAP: 1 residue3: BIO
 57 |  
 58 |   TEMPLATE::   ATOM_MAP: 2 atom_type: OH 
 59 |   TEMPLATE::   ATOM_MAP: 2 residue1:  ST
 60 |  
 61 |   CONSTRAINT:: distanceAB:    2.81    0.2   80.0    0   0 
 62 |   CONSTRAINT::    angle_A:   150.0    5.0   10.0  360   0 
 63 |   CONSTRAINT::    angle_B:   100.0    5.0   10.0  360   0 
 64 |   CONSTRAINT::  torsion_A:   180.0   10.0   10.0  360   0 
 65 |   CONSTRAINT:: torsion_AB:    71.0   10.0   10.0   90   0
 66 |   CONSTRAINT::  torsion_B:   180.0   10.0   10.0  120   0 
 67 |  
 68 |   ALGORITHM_INFO:: match
 69 |    SECONDARY_MATCH: DOWNSTREAM
 70 |   ALGORITHM_INFO::END
 71 |  CST::END
 72 | 
 73 |  CST::BEGIN
 74 |   TEMPLATE::   ATOM_MAP: 1 atom_name: O1 N1 C7
 75 |   TEMPLATE::   ATOM_MAP: 1 residue3: BIO
 76 |  
 77 |   TEMPLATE::   ATOM_MAP: 2 atom_name: OH CZ CE2 
 78 |   TEMPLATE::   ATOM_MAP: 2 residue3:  TYR
 79 |  
 80 |   CONSTRAINT:: distanceAB:    2.81    0.2   80.0    0   0 
 81 |   CONSTRAINT::    angle_A:   150.0    5.0   10.0  360   0 
 82 |   CONSTRAINT::    angle_B:   100.0    5.0   10.0  360   0 
 83 |   CONSTRAINT::  torsion_A:   180.0   10.0   10.0  360   0 
 84 |   CONSTRAINT:: torsion_AB:    71.0   10.0   10.0   90   0
 85 |   CONSTRAINT::  torsion_B:    90.0   10.0   10.0  180   0 
 86 |  
 87 |   ALGORITHM_INFO:: match
 88 |    SECONDARY_MATCH: DOWNSTREAM
 89 |   ALGORITHM_INFO::END
 90 |  CST::END
 91 |  
 92 |  CST::BEGIN
 93 |   TEMPLATE::   ATOM_MAP: 1 atom_name: O1 N1 C7
 94 |   TEMPLATE::   ATOM_MAP: 1 residue3: BIO
 95 |  
 96 |   TEMPLATE::   ATOM_MAP: 2 atom_type: NH2O
 97 |   TEMPLATE::   ATOM_MAP: 2 residue1: NQ
 98 |  
 99 |   CONSTRAINT:: distanceAB:    2.81    0.2   80.0    0   0 
100 |   CONSTRAINT::    angle_A:   150.0    5.0   10.0  360   0 
101 |   CONSTRAINT::    angle_B:   100.0    5.0   10.0  360   0 
102 |   CONSTRAINT::  torsion_A:   180.0   10.0   10.0  360   0 
103 |   CONSTRAINT:: torsion_AB:    71.0   10.0   10.0   90   0
104 |   CONSTRAINT::  torsion_B:   180.0   10.0   10.0  180   0 
105 | 
106 |   ALGORITHM_INFO:: match
107 |    SECONDARY_MATCH: DOWNSTREAM
108 |   ALGORITHM_INFO::END
109 |  CST::END
110 | 
111 | VARIABLE_CST::END
112 | 
113 | 


--------------------------------------------------------------------------------
/utils/kabsch_align.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import copy
  3 | #Gyu Rie Lee
  4 | #Borrowed kabsch code and modified slightly for superimposition
  5 | 
  6 | #Use kabsch algorithm to align van der Mers with mainchain atoms (or given subset of coord)
  7 | #get transformation matrix from xyz1 and xyz2 (could be N-CA-C of residues)
  8 | #then use this to align residue+functional group
  9 | #xyz1/coord_for_align1 would be the reference
 10 | #IMPORTANT: xyz1_in is being copied inside as xyz1 because xyz1_in will be used repeatedly outside of this code
 11 | 
 12 | 
 13 | def np_kabsch(A,B):
 14 |     """
 15 |     Numpy version of kabsch algorithm. Superimposes B onto A
 16 | 
 17 |     Parameters:
 18 |         (A,B) np.array - shape (N,3) arrays of xyz crds of points
 19 | 
 20 | 
 21 |     Returns:
 22 |         rms - rmsd between A and B
 23 |         R - rotation matrix to superimpose B onto A
 24 |         rB - the rotated B coordinates
 25 |     """
 26 |     A = np.copy(A)
 27 |     B = np.copy(B)
 28 | 
 29 |     def centroid(X):
 30 |         # return the mean X,Y,Z down the atoms
 31 |         return np.mean(X, axis=0, keepdims=True)
 32 | 
 33 |     def rmsd(V,W, eps=1e-6):
 34 |         # First sum down atoms, then sum down xyz
 35 |         N = V.shape[-2]
 36 |         return np.sqrt(np.sum((V-W)*(V-W), axis=(-2,-1)) / N + eps)
 37 | 
 38 | 
 39 |     N, ndim = A.shape
 40 | 
 41 |     # move to centroid
 42 |     A = A - centroid(A)
 43 |     B = B - centroid(B)
 44 | 
 45 |     # computation of the covariance matrix
 46 |     C = np.matmul(A.T, B)
 47 | 
 48 |     # compute optimal rotation matrix using SVD
 49 |     U,S,Vt = np.linalg.svd(C)
 50 | 
 51 | 
 52 |     # ensure right handed coordinate system
 53 |     d = np.eye(3)
 54 |     d[-1,-1] = np.sign(np.linalg.det(Vt.T@U.T))
 55 | 
 56 |     # construct rotation matrix
 57 |     R = Vt.T@d@U.T
 58 | 
 59 |     # get rotated coords
 60 |     rB = B@R
 61 | 
 62 |     # calculate rmsd
 63 |     rms = rmsd(A,rB)
 64 | 
 65 |     return rms, rB, R
 66 | 
 67 | 
 68 | def kabsch_align_coords(xyz1, xyz2_in, mobile_coord):
 69 | 
 70 | #    xyz1 = copy.deepcopy(xyz1_in)
 71 |     xyz2 = copy.deepcopy(xyz2_in)
 72 |     # check dimensions
 73 |     #print(len(xyz1), len(xyz2))
 74 |     assert len(xyz1) == len(xyz2)
 75 |     L = len(xyz1)
 76 |     assert L > 2
 77 | 
 78 |     # move two both sets of points to their
 79 |     # centers of masses (COM)
 80 |     COM1 = np.sum(xyz1, axis=0) / float(L)
 81 |     COM2 = np.sum(xyz2, axis=0) / float(L)
 82 |     xyz1 -= COM1
 83 |     xyz2 -= COM2
 84 | 
 85 |     # Initial residual, see Kabsch.
 86 |     E0 = np.sum( np.sum(xyz1*xyz1,axis=0),axis=0) + np.sum( np.sum(xyz2*xyz2,axis=0),axis=0 )
 87 | 
 88 |     # SVD of the covariance matrix
 89 |     V, S, Wt = np.linalg.svd( np.dot( np.transpose(xyz2), xyz1))
 90 | 
 91 |     # check parity of the transformation
 92 |     reflect = float(str(float(np.linalg.det(V) * np.linalg.det(Wt))))
 93 |     if reflect == -1.0:
 94 |         S[-1] = -S[-1]
 95 |         V[:,-1] = -V[:,-1]
 96 | 
 97 |     RMSD = E0 - (2.0 * sum(S))
 98 |     RMSD = np.sqrt(abs(RMSD / L))
 99 | 
100 |     # U is simply V*Wt
101 |     U = np.dot(V, Wt)
102 | 
103 |     # translation vector
104 |     t = COM1 - COM2
105 | 
106 |     superimposed_coord = np.dot((mobile_coord-COM2), U)
107 |     superimposed_coord += COM1
108 | #    rot_coord_2 = np.dot((coord_for_align2 - COM2), U)
109 | #    rot_coord_1 = coord_for_align1 - COM1
110 |     
111 | #    rot_coord_2 = np.dot((coord_for_align2 - COM2), U) + COM1
112 |     
113 | #    return coord_for_align1, rot_coord_2
114 |     return superimposed_coord
115 | #    return RMSD, t, U
116 | 
def kabsch_rmsd(xyz1_in,xyz2_in):
    """
    Returns the minimal RMSD between two point sets after optimal
    superposition (proper rotation + translation) with the Kabsch algorithm.

    Parameters:
        xyz1_in, xyz2_in - (L,3) coordinates with matching point order.
                           Neither input is modified.

    Raises:
        AssertionError if the two point sets differ in length or contain
        fewer than 3 points.
    """
    # np.array() copies and guarantees float storage for in-place centring
    xyz1 = np.array(xyz1_in, dtype=float)
    xyz2 = np.array(xyz2_in, dtype=float)

    # check dimensions
    assert len(xyz1) == len(xyz2)
    L = len(xyz1)
    assert L > 2

    # move both sets of points to their
    # centers of masses (COM)
    COM1 = np.sum(xyz1, axis=0) / float(L)
    COM2 = np.sum(xyz2, axis=0) / float(L)
    xyz1 -= COM1
    xyz2 -= COM2

    # Initial residual, see Kabsch.
    E0 = np.sum(xyz1*xyz1) + np.sum(xyz2*xyz2)

    # SVD of the covariance matrix
    V, S, Wt = np.linalg.svd(np.dot(np.transpose(xyz2), xyz1))

    # check parity of the transformation. The determinant product is only
    # approximately +/-1 in floating point, so test the sign rather than
    # comparing against -1.0 exactly. For an improper optimum the smallest
    # singular value counts negatively.
    if np.linalg.det(V) * np.linalg.det(Wt) < 0.0:
        S[-1] = -S[-1]

    # abs() guards against tiny negative residuals caused by round-off
    RMSD = E0 - (2.0 * np.sum(S))
    RMSD = np.sqrt(abs(RMSD / L))

    return RMSD
156 | 
157 | 


--------------------------------------------------------------------------------
/utils/dunbrack_rotlib.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | import pandas as pd
  3 | import os
  4 | 
  5 | comparisons = {'<=': '__le__',
  6 |                '<': '__lt__',
  7 |                '>': '__gt__',
  8 |                '>=': '__ge__',
  9 |                '=': '__eq__'}
 10 | 
 11 | chi_psi_SS = {"H": {"phi": (-72.0, -50.0),
 12 |                     "psi": (-50.0, -30.0)},
 13 |               "E": {"phi": (-161.0, -89.0),
 14 |                     "psi": (109.0, 151.0)},
 15 |               "L": {"phi": (),
 16 |                     "psi": ()},
 17 |               "-": {"phi": (-180.0, 180.0),
 18 |                     "psi": (-180.0, 180.0)}}
 19 | 
 20 | 
 21 | def load_rotamer_df(dunbrack_database):
 22 |     header = ["restype", "phi", "psi", "N", "r1", "r2", "r3", "r4", "prob", "chi1", "chi2", "chi3", "chi4", "std1", "std2", "std3", "std4"]
 23 |     rotlib = pd.read_csv(dunbrack_database, sep="\s+", names=header)
 24 |     for n in range(1, 5):
 25 |         rotlib[f"chi{n}_min"] = rotlib[f"chi{n}"]-rotlib[f"std{n}"]
 26 |         rotlib[f"chi{n}_max"] = rotlib[f"chi{n}"]+rotlib[f"std{n}"]
 27 |     return rotlib
 28 | 
 29 | 
 30 | def filter_rotlib(scores, filters):
 31 |     filtered_scores = scores.copy()
 32 | 
 33 |     for s in filters.keys():
 34 |         _fltrs = []
 35 |         if isinstance(filters[s][0], list):
 36 |             _fltrs = filters[s]
 37 |         else:
 38 |             _fltrs.append(filters[s])
 39 |         for fltr in _fltrs:
 40 |             if fltr is not None and s in scores.keys():
 41 |                 val = fltr[0]
 42 |                 sign = comparisons[fltr[1]]
 43 |                 filtered_scores =\
 44 |                   filtered_scores.loc[(filtered_scores[s].__getattribute__(sign)(val))]
 45 |     return filtered_scores
 46 | 
 47 | 
 48 | def find_good_rotamers(rotlib, restype, cumulative_prob=1.0, secstruct=None, phi=None, psi=None, keep_only_best=False):
 49 |     """
 50 |     Arugments:
 51 |         rotlib (pandas.DataFrame)
 52 |         restype (str) :: name3 of an amino acid in the rotamer library
 53 |         cumulative_prob (float) :: cumulative probability up to which rotamers are returned
 54 |         secstruct (str, ('H', 'E')) :: secondary structure type for which rotamers are searched.
 55 |         phi (tuple, (float, float)) :: min and max phi value for defining a subset of the library
 56 |         psi (tuple, (float, float)) :: min and max psi value for defining a subset of the library
 57 |         keep_only_best (bool) :: only the highest probability rotamer is returned for each phi/psi bin
 58 |     """
 59 |     assert isinstance(phi, (tuple, type(None)))
 60 |     assert isinstance(psi, (tuple, type(None)))
 61 |     assert secstruct in ("H", "E", "-", None), "Not implemented for other secondary structures yet"
 62 |     # assert restype not in ["ALA", "GLY"], "No rotamer library for ALA and GLY"
 63 |     assert not all([x is None for x in [secstruct, phi]]), "Must provide either secstruct letter OR phi and psi values"
 64 |     assert not all([x is None for x in [secstruct, psi]]), "Must provide either secstruct letter OR phi and psi values"
 65 | 
 66 |     if secstruct is not None:
 67 |         phi_limits = chi_psi_SS[secstruct]["phi"]
 68 |         psi_limits = chi_psi_SS[secstruct]["psi"]
 69 |     elif phi is not None and psi is not None:
 70 |         phi_limits = phi
 71 |         psi_limits = psi
 72 |     else:
 73 |         print("Both phi and psi need to be defined")
 74 |         return None
 75 | 
 76 |     filters = {'restype': [restype, '='],
 77 |                'phi': [[phi_limits[0], '>='], [phi_limits[1], '<=']],
 78 |                'psi': [[psi_limits[0], '>='], [psi_limits[1], '<=']]}
 79 | 
 80 |     SS_rotlib = filter_rotlib(rotlib, filters)
 81 |     phi_psi_bins = list(set([(row.phi, row.psi) for idx, row in SS_rotlib.iterrows()]))
 82 |     df = pd.DataFrame()
 83 |     for phi_psi_bin in phi_psi_bins:
 84 |         _df = SS_rotlib.loc[(SS_rotlib["phi"] == phi_psi_bin[0]) & (SS_rotlib["psi"] == phi_psi_bin[1])]
 85 |         if keep_only_best is True:
 86 |             _df2 = _df.iloc[0]
 87 |         else:
 88 |             if cumulative_prob == 1.0:
 89 |                 _df2 = _df.copy()
 90 |             else:
 91 |                 _df2 = _df.loc[_df.prob.cumsum() <= cumulative_prob]
 92 | 
 93 |                 # Also adding the next most probable rotamer that would push the cumulative sum over the cutoff
 94 |                 # This fixes the issue where no rotamers are returned when the cutoff is lower than the prob of the most likely rotamer
 95 |                 if len(_df2) == 0:
 96 |                     idx_to_add = 0
 97 |                 elif len(_df2) < len(_df):
 98 |                     idx_to_add = len(_df2)
 99 |                 else:
100 |                     idx_to_add = None
101 |                 if idx_to_add is not None:
102 |                     _df2 = pd.concat([_df2, _df.iloc[idx_to_add].to_frame().T], ignore_index=True)
103 |         df = pd.concat([df, _df2], ignore_index=True)
104 |     return df
105 | 
106 | 
def find_bb_from_inverse(rotlib, chis):
    """
    Finds the rotamer library entries whose chi angles are compatible with the given chi values.

    Arguments:
        rotlib (pandas.DataFrame) :: rotamer library with "chi{n}" and "std{n}" columns.
        chis (list) :: list of chi values; the i-th value is compared against
                       [chi{i+1} - std{i+1}, chi{i+1} + std{i+1}] (bounds inclusive).

    Returns:
        pandas.DataFrame with the matching rows of rotlib (original index and columns kept).
    """
    matches = pd.Series(True, index=rotlib.index)
    for n, ch in enumerate(chis, start=1):
        lower = rotlib[f"chi{n}"] - rotlib[f"std{n}"]
        upper = rotlib[f"chi{n}"] + rotlib[f"std{n}"]
        matches &= (lower <= ch) & (ch <= upper)
    # positional boolean mask: avoids any index alignment on selection
    return rotlib.loc[matches.values].copy()
117 | 
118 | 
def find_bb_from_inverse_loc(rotlib, chis):
    """
    Returns the library rows whose precomputed chi windows contain the given chi values.
    Arguments:
        rotlib (pandas.DataFrame) :: rotamer library with "chi{n}_min" / "chi{n}_max" columns.
                                     Preferably for a given amino acid.
        chis (list) :: list of chi values; the i-th value is tested against chi{i+1}_min/max
                       (bounds inclusive)
    """
    assert isinstance(rotlib, pd.DataFrame)
    selected = rotlib.copy()
    for n, chi_value in enumerate(chis, start=1):
        above_min = selected[f"chi{n}_min"] <= chi_value
        below_max = selected[f"chi{n}_max"] >= chi_value
        selected = selected.loc[above_min & below_max]
    return selected
131 | 
132 | 
133 | 
134 | 


--------------------------------------------------------------------------------
/examples/P450/inputs/P450_motif.pdb:
--------------------------------------------------------------------------------
 1 | ATOM      1  N   HIS A 363       4.913 -43.057  15.166  1.00 11.88      A    N  
 2 | ATOM      2  CA  HIS A 363       4.586 -41.709  15.616  1.00 11.40      A    C  
 3 | ATOM      3  C   HIS A 363       3.735 -40.925  14.609  1.00 12.52      A    C  
 4 | ATOM      4  O   HIS A 363       3.491 -39.745  14.804  1.00 12.41      A    O  
 5 | ATOM      5  CB  HIS A 363       5.847 -40.946  15.976  1.00 12.94      A    C  
 6 | ATOM      6  CG  HIS A 363       6.460 -41.421  17.253  1.00 12.72      A    C  
 7 | ATOM      7  CD2 HIS A 363       7.320 -42.431  17.522  1.00 12.58      A    C  
 8 | ATOM      8  ND1 HIS A 363       6.167 -40.861  18.473  1.00 12.29      A    N  
 9 | ATOM      9  CE1 HIS A 363       6.827 -41.482  19.434  1.00 11.30      A    C  
10 | ATOM     10  NE2 HIS A 363       7.514 -42.464  18.877  1.00 13.49      A    N  
11 | ATOM     11  N   ARG A 364       3.235 -41.553  13.544  1.00 12.12      A    N  
12 | ATOM     12  CA  ARG A 364       2.319 -40.877  12.622  1.00 11.10      A    C  
13 | ATOM     13  C   ARG A 364       1.202 -40.146  13.356  1.00 12.71      A    C  
14 | ATOM     14  O   ARG A 364       0.640 -40.610  14.344  1.00 12.68      A    O  
15 | ATOM     15  CB  ARG A 364       1.685 -41.899  11.649  1.00 11.87      A    C  
16 | ATOM     16  CG  ARG A 364       0.917 -41.342  10.487  1.00 13.05      A    C  
17 | ATOM     17  CD  ARG A 364       0.408 -42.437   9.552  1.00 14.31      A    C  
18 | ATOM     18  NE  ARG A 364      -0.194 -41.802   8.400  1.00 15.24      A    N  
19 | ATOM     19  CZ  ARG A 364       0.121 -42.006   7.135  1.00 15.87      A    C  
20 | ATOM     20  NH1 ARG A 364       1.008 -42.903   6.794  1.00 15.19      A    N1+
21 | ATOM     21  NH2 ARG A 364      -0.453 -41.271   6.209  1.00 16.74      A    N  
22 | ATOM     22  N   CYS A 365       0.870 -38.980  12.842  1.00 11.45      A    N  
23 | ATOM     23  CA  CYS A 365      -0.133 -38.095  13.440  1.00 10.72      A    C  
24 | ATOM     24  C   CYS A 365      -1.398 -38.810  13.866  1.00 11.45      A    C  
25 | ATOM     25  O   CYS A 365      -2.130 -39.345  13.038  1.00 13.42      A    O  
26 | ATOM     26  CB  CYS A 365      -0.499 -37.044  12.396  1.00 10.95      A    C  
27 | ATOM     27  SG  CYS A 365      -1.632 -35.790  12.940  1.00 12.75      A    S  
28 | ATOM     28  N   ALA A 366      -1.739 -38.680  15.149  1.00 12.58      A    N  
29 | ATOM     29  CA  ALA A 366      -2.981 -39.272  15.628  1.00 14.03      A    C  
30 | ATOM     30  C   ALA A 366      -4.183 -38.592  15.020  1.00 15.69      A    C  
31 | ATOM     31  O   ALA A 366      -5.249 -39.210  14.915  1.00 15.05      A    O  
32 | ATOM     32  CB  ALA A 366      -3.101 -39.141  17.134  1.00 13.26      A    C  
33 | ATOM     33  N   GLY A 367      -4.073 -37.328  14.670  1.00 12.82      A    N  
34 | ATOM     34  CA  GLY A 367      -5.151 -36.485  14.210  1.00 14.57      A    C  
35 | ATOM     35  C   GLY A 367      -5.299 -36.322  12.702  1.00 13.52      A    C  
36 | ATOM     36  O   GLY A 367      -5.966 -35.395  12.227  1.00 13.45      A    O  
37 | ATOM     37  N   GLU A 368      -4.747 -37.251  11.929  1.00 14.10      A    N  
38 | ATOM     38  CA  GLU A 368      -4.816 -37.140  10.474  1.00 12.64      A    C  
39 | ATOM     39  C   GLU A 368      -6.252 -37.199   9.966  1.00 15.94      A    C  
40 | ATOM     40  O   GLU A 368      -6.635 -36.418   9.083  1.00 15.41      A    O  
41 | ATOM     41  CB  GLU A 368      -3.961 -38.215   9.828  1.00 15.10      A    C  
42 | ATOM     42  CG  GLU A 368      -3.784 -38.032   8.359  1.00 15.16      A    C  
43 | ATOM     43  CD  GLU A 368      -2.640 -38.795   7.750  1.00 15.22      A    C  
44 | ATOM     44  OE1 GLU A 368      -2.460 -39.970   8.159  1.00 15.91      A    O  
45 | ATOM     45  OE2 GLU A 368      -1.967 -38.239   6.860  1.00 16.51      A    O1-
46 | ATOM     46  N   TRP A 369      -7.044 -38.135  10.472  1.00 16.59      A    N  
47 | ATOM     47  CA  TRP A 369      -8.454 -38.191  10.058  1.00 16.63      A    C  
48 | ATOM     48  C   TRP A 369      -9.248 -36.984  10.533  1.00 15.46      A    C  
49 | ATOM     49  O   TRP A 369     -10.033 -36.427   9.756  1.00 18.41      A    O  
50 | ATOM     50  CB  TRP A 369      -9.036 -39.532  10.485  1.00 18.58      A    C  
51 | ATOM     51  CG  TRP A 369      -8.425 -40.568   9.565  1.00 37.49      A    C  
52 | ATOM     52  CD1 TRP A 369      -7.501 -41.507   9.903  1.00 40.53      A    C  
53 | ATOM     53  CD2 TRP A 369      -8.593 -40.683   8.131  1.00 40.81      A    C  
54 | ATOM     54  CE2 TRP A 369      -7.773 -41.750   7.700  1.00 42.82      A    C  
55 | ATOM     55  CE3 TRP A 369      -9.366 -40.004   7.180  1.00 41.25      A    C  
56 | ATOM     56  NE1 TRP A 369      -7.152 -42.253   8.808  1.00 38.35      A    N  
57 | ATOM     57  CZ2 TRP A 369      -7.710 -42.161   6.367  1.00 49.47      A    C  
58 | ATOM     58  CZ3 TRP A 369      -9.304 -40.417   5.854  1.00 46.90      A    C  
59 | ATOM     59  CH2 TRP A 369      -8.470 -41.477   5.461  1.00 41.19      A    C  
60 | ATOM     60  N   VAL A 370      -8.981 -36.486  11.744  1.00 15.47      A    N  
61 | ATOM     61  CA  VAL A 370      -9.591 -35.231  12.183  1.00 16.31      A    C  
62 | ATOM     62  C   VAL A 370      -9.294 -34.108  11.199  1.00 16.02      A    C  
63 | ATOM     63  O   VAL A 370     -10.169 -33.321  10.823  1.00 16.65      A    O  
64 | ATOM     64  CB  VAL A 370      -9.137 -34.851  13.606  1.00 15.82      A    C  
65 | ATOM     65  CG1 VAL A 370      -9.382 -33.345  13.933  1.00 17.55      A    C  
66 | ATOM     66  CG2 VAL A 370      -9.801 -35.759  14.636  1.00 17.86      A    C  
67 | ATOM     67  N   THR A 371      -8.020 -33.997  10.805  1.00 13.79      A    N  
68 | ATOM     68  CA  THR A 371      -7.593 -32.922   9.932  1.00 13.46      A    C  
69 | ATOM     69  C   THR A 371      -8.322 -32.993   8.592  1.00 12.99      A    C  
70 | ATOM     70  O   THR A 371      -8.839 -31.976   8.099  1.00 13.80      A    O  
71 | ATOM     71  CB  THR A 371      -6.089 -32.985   9.710  1.00 13.13      A    C  
72 | ATOM     72  CG2 THR A 371      -5.608 -31.874   8.840  1.00 14.33      A    C  
73 | ATOM     73  OG1 THR A 371      -5.358 -32.902  10.943  1.00 13.94      A    O  
74 | TER   
75 | END
76 | 


--------------------------------------------------------------------------------
/utils/align_pdbs.py:
--------------------------------------------------------------------------------
  1 | import os,sys 
  2 | sys.path.append(os.path.dirname(os.path.realpath(__file__)))
  3 | import kabsch_align 
  4 | import util 
  5 | import numpy as np
  6 | import pyrosetta as pyr
  7 | import pyrosetta.rosetta
  8 | 
  9 | 
 10 | def find_atom_idx(atom, mapping):
 11 |     for i,A in enumerate(mapping):
 12 |         try:
 13 |             if A.strip() == atom:
 14 |                 return i
 15 |         except AttributeError:
 16 |             print('This is atom ',A)
 17 | 
 18 |     raise KeyError(f'Could not find atom {atom} in mapping {mapping}')
 19 | 
 20 | 
 21 | def align_pose_to_residue(ref_residue, mobile_pose, ref_atoms):
 22 |     xyz1, parsed1 = get_xyz_stack_residue(ref_residue, ref_atoms["atoms1"])
 23 |     xyz2, parsed2 = get_xyz_stack_pose(mobile_pose, ref_atoms["atoms2"])
 24 | 
 25 |     # run Kabsch to get rotation matrix for atoms and rmsd
 26 |     # aligns xyz2 onto xyz1
 27 |     rmsd, _, R = kabsch_align.np_kabsch(xyz1, xyz2)
 28 |     print('RMSD between atoms: ',rmsd)
 29 | 
 30 |     # (1) now translate both proteins such that centroid(xyz1/xyz2) is at origin
 31 |     # (2) rorate xyz2 onto xyz1 with R
 32 |     # (3) write pdbs into outdir
 33 | 
 34 |     def centroid(X):
 35 |         # return the mean X,Y,Z down the atoms
 36 |         return np.mean(X, axis=0, keepdims=True)
 37 | 
 38 |     # centroid of just the points being aligned
 39 |     centroid1 = centroid(xyz1)
 40 |     centroid2 = centroid(xyz2)
 41 | 
 42 |     # (1)
 43 |     #xyz_protein1 = np.copy(parsed1['xyz']) - centroid1
 44 |     xyz_protein2 = np.copy(parsed2) - centroid2
 45 | 
 46 |     # (2)
 47 |     xyz_protein2 = xyz_protein2 @ R
 48 | 
 49 |     # Translate protein 2 to where it aligns with original protein 1
 50 |     xyz_protein2 += centroid1
 51 |     
 52 |     out_pose = mobile_pose.clone()
 53 |     for resno, res_coords in enumerate(xyz_protein2):
 54 |         for i, ac in enumerate(res_coords):
 55 |             if np.isnan(ac[0]):
 56 |                 break
 57 |             out_pose.residue(resno+1).set_xyz(i+1, pyrosetta.rosetta.numeric.xyzVector_double_t(*ac))
 58 |             continue
 59 |     return out_pose
 60 | 
 61 | 
 62 | def align_residue_to_residue(ref_residue, mobile_residue, ref_atoms):
 63 |     xyz1, parsed1 = get_xyz_stack_residue(ref_residue, ref_atoms["atoms1"])
 64 |     xyz2, parsed2 = get_xyz_stack_residue(mobile_residue, ref_atoms["atoms2"])
 65 | 
 66 |     # run Kabsch to get rotation matrix for atoms and rmsd
 67 |     # aligns xyz2 onto xyz1
 68 |     rmsd, _, R = kabsch_align.np_kabsch(xyz1, xyz2)
 69 |     if rmsd > 0.1:
 70 |         print('RMSD between atoms: ',rmsd)
 71 | 
 72 |     # (1) now translate both proteins such that centroid(xyz1/xyz2) is at origin
 73 |     # (2) rorate xyz2 onto xyz1 with R
 74 |     # (3) write pdbs into outdir
 75 | 
 76 |     def centroid(X):
 77 |         # return the mean X,Y,Z down the atoms
 78 |         return np.mean(X, axis=0, keepdims=True)
 79 | 
 80 |     # centroid of just the points being aligned
 81 |     centroid1 = centroid(xyz1)
 82 |     centroid2 = centroid(xyz2)
 83 | 
 84 |     # (1)
 85 |     #xyz_protein1 = np.copy(parsed1['xyz']) - centroid1
 86 |     xyz_protein2 = np.copy(parsed2) - centroid2
 87 | 
 88 |     # (2)
 89 |     xyz_protein2 = xyz_protein2 @ R
 90 | 
 91 |     # Translate protein 2 to where it aligns with original protein 1
 92 |     xyz_protein2 += centroid1
 93 |     
 94 |     out_residue = mobile_residue.clone()
 95 | 
 96 |     for i, ac in enumerate(xyz_protein2[0]):
 97 |         if np.isnan(ac[0]):
 98 |             break
 99 |         out_residue.set_xyz(i+1, pyrosetta.rosetta.numeric.xyzVector_double_t(*ac))
100 |         continue
101 |     return out_residue
102 | 
103 | 
def get_xyz_stack_residue(residue, atoms_list):
    """
    Extracts the xyz crds of the named atoms of a single residue.

    Arguments:
        residue :: residue to read the coordinates from
        atoms_list (list) :: atom names (strings); surrounding whitespace is ignored.
                             format: [atomname, atomname, ...]
    Returns:
        (xyz_out, xyz_all), where xyz_out is an array with one xyz row per
        entry of atoms_list, and xyz_all is the NaN-padded array of all atom
        coordinates of the residue, as returned by parse_residue_coords.
        (None, None) if the residue is a ligand or a virtual residue.
    Raises:
        KeyError (from find_atom_idx) if an atom name is not found in the
        atom name map of the residue.
    """
    if residue.is_ligand() or residue.is_virtual_residue():
        return None, None

    xyz_all = parse_residue_coords(residue)
    AA_int = util.alpha_1.index(residue.name1())

    # The atom name map depends only on the residue (its type and terminus
    # state), so it is looked up once instead of once per requested atom.
    if residue.is_lower_terminus():
        AA_long_map = util.aa2longH_Nterm[AA_int]
    elif residue.is_upper_terminus():
        AA_long_map = util.aa2longH_Cterm[AA_int]
    else:
        AA_long_map = util.aa2longH[AA_int]

    # for each atom, find its index in the residue and store its crds
    xyz_out = [xyz_all[0, find_atom_idx(atom.strip(), AA_long_map), :] for atom in atoms_list]

    return np.array(xyz_out), xyz_all
138 | 
139 | 
def get_xyz_stack_pose(pose, atoms_list):
    """
    Extracts the xyz crds of selected atoms from a pose.

    Arguments:
        pose :: pose to read the coordinates from
        atoms_list (list) :: format: [(resno, atomname), (resno, atomname), ...]
                             resno is 1-based.
    Returns:
        (xyz_out, xyz_all), where xyz_out is an array with one xyz row per
        entry of atoms_list, and xyz_all is the NaN-padded coordinate array
        of the pose, as returned by parse_pose_coords.
    """
    xyz_all = parse_pose_coords(pose)

    # amino acid indices of all non-ligand and non-virtual residues
    aa_indices = []
    for res in pose.residues:
        if res.is_ligand() or res.is_virtual_residue():
            continue
        aa_indices.append(util.alpha_1.index(res.name1()))

    picked_xyz = []
    for (resn, atom) in atoms_list:
        AA_int = aa_indices[resn-1]
        target = pose.residue(resn)

        # terminal residues use their own atom name maps
        if target.is_lower_terminus():
            atom_names = util.aa2longH_Nterm[AA_int]
        elif target.is_upper_terminus():
            atom_names = util.aa2longH_Cterm[AA_int]
        else:
            atom_names = util.aa2longH[AA_int]

        # index of the atom within the residue, and its crds
        atom_idx0 = find_atom_idx(atom.strip(), atom_names)
        picked_xyz.append(xyz_all[resn-1, atom_idx0, :])

    return np.array(picked_xyz), xyz_all
171 | 
172 | 
def parse_pose_coords(pose):
    """
    Collects the atom coordinates of a pose into a single NaN-padded array.

    Ligands and virtual residues are skipped. The coordinates of a residue are
    stored in the row seqpos-1, and atom number n is stored in the slot n-1.

    Returns:
        float32 array of shape (number of non-ligand/non-virtual residues, 26, 3).
        Slots without an atom are NaN.

    If a coordinate can not be stored (IndexError), the residue name, seqpos
    and atom count are printed and the program exits with status 1.
    """
    kept_positions = []
    for res in pose.residues:
        if not res.is_ligand() and not res.is_virtual_residue():
            kept_positions.append(res.seqpos())

    coords = np.full((len(kept_positions), 26, 3), np.nan, dtype=np.float32)
    for res in pose.residues:
        if res.is_ligand() or res.is_virtual_residue():
            continue
        for atom_no in range(1, res.natoms() + 1):
            try:
                coords[res.seqpos()-1][atom_no-1] = res.xyz(atom_no)
            except IndexError:
                print(res.name())
                print(res.seqpos())
                print(res.natoms())
                sys.exit(1)
    return coords
189 | 
190 | 
def parse_residue_coords(residue):
    """
    Collects the atom coordinates of one residue into a NaN-padded array.

    Returns:
        float32 array of shape (1, 26, 3), where atom number n is stored in
        the slot n-1 and the slots without an atom are NaN.
        None if the residue is a ligand or a virtual residue.
    """
    if residue.is_ligand() or residue.is_virtual_residue():
        return None

    coords = np.full((1, 26, 3), np.nan, dtype=np.float32)
    for atom_no in range(1, residue.natoms() + 1):
        coords[0][atom_no-1] = residue.xyz(atom_no)
    return coords
199 | 
200 | 
201 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # InvrotZyme
  2 | 
  3 | Script for building inverse rotamer assemblies out of a Rosetta matcher/enzdes constraint file.
  4 | 
  5 | This script will place sidechains according to the constraint file definitions, sample backbone positions, and optionally grow out extended backbone stubs (idealized helix or strand).
  6 | This script will perform an exhaustive analysis of all allowed rotamers and CST samplings.
  7 | 
  8 | You can also provide a motif PDB that will serve as a host for a particular constrained catalytic residue. That residue must exist in the PDB file, and only the rotamer will then be used for that residue.
  9 | 
 10 | The purpose of this tool is to find combinations of inverse rotamers that can be placed (on small extended backbones) without clashes. The outputs of this script can subsequently be used as inputs for RFdiffusion All-Atom to create protein backbones that host these active sites.
 11 | 
 12 | 
 13 | 
 14 | ## Examples
 15 | 
 16 | A few usage examples are provided in `examples/`
 17 | 
 18 | **Kemp eliminase example:**
 19 | Places three catalytic residues around a benzisoxazole substrate. A HIS-GLU/ASP dyad on one side, and a SER/THR/TYR/GLN/ASN H-bond donor on the other side.
 20 | `cd examples/Kemp_eliminase ; python ../../invrotzyme.py --cstfile inputs/BIO_His_ED_oxy_nosample.cst --params inputs/BIO.params --dunbrack_prob 0.6 --frac_random_rotamers_per_cst 0.5 0.5 0.5 0.5 --secstruct_per_cst H H E --prefix outputs/ --suffix HHE`
 21 | 
 22 | 
 23 | **P450 example:**
 24 | Places a custom Heme ligand in complex with a substrate against a CYS-containing motif from a cytochrome P450 enzyme.
 25 | `cd examples/P450 ; python ../../invrotzyme.py --cstfile inputs/HBA_CYS_P450_nosample.cst --params inputs/HBA_unique.params --motif_for_cst 1:3:inputs/P450_motif.pdb --frac_random_rotamers 0.1 --prefix outputs/`
 26 | 
 27 | 
 28 | ## Usage
 29 | 
 30 | First prepare a matcher/enzdes Constraint file according to the standard format outlined in Rosetta documentation:<br>
 31 | https://docs.rosettacommons.org/docs/latest/rosetta_basics/file_types/match-cstfile-format
 32 | 
 33 | This script requires all six degrees of freedom to be defined, so you must provide distance, 2 angles, and 3 torsions for each interaction.
 34 | 
 35 | You can then run the script using many of the options below, perhaps taking inspiration from the provided examples.
 36 | 
 37 | ```
 38 | options:
 39 |   -h, --help            show this help message and exit
 40 |   --cstfile CSTFILE     CST file used for matching. Keep sampling to minimum to avoid combinatorial explosion.
 41 |   --params PARAMS [PARAMS ...]
 42 |                         params files used by ligands and residues
 43 |   --keep_his_tautomer KEEP_HIS_TAUTOMER
 44 |                         Per cst, should a specific HIS tautomer (HIS or HIS_D) be used. Keeps only one the requested HIS tautomers. Format: 'cst_no:HIS/HIS_D,..'
 45 |   --dunbrack_prob DUNBRACK_PROB
 46 |                         Cumulative Dunbrack probability of used rotamers for any residue. As used by the -packing:dunbrack_prob_... flag in Rosetta.
 47 |   --dunbrack_prob_per_cst DUNBRACK_PROB_PER_CST [DUNBRACK_PROB_PER_CST ...]
 48 |                         Cumulative Dunbrack probability of used rotamers for each CST residue.
 49 |   --N_len N_LEN         Number of residues added to the stub N-term
 50 |   --C_len C_LEN         Number of residues added to the stub C-term
 51 |   --N_len_per_cst N_LEN_PER_CST [N_LEN_PER_CST ...]
 52 |                         Number of residues added to the stub N-term, per CST
 53 |   --C_len_per_cst C_LEN_PER_CST [C_LEN_PER_CST ...]
 54 |                         Number of residues added to the stub C-term, per CST
 55 |   --prune_ligand_rotamers PRUNE_LIGAND_ROTAMERS
 56 |                         Prunes the set of used ligand rotamers based on clashcheck, AND rmsd similarity cutoff.
 57 |   --max_random_rotamers MAX_RANDOM_ROTAMERS
 58 |                         Number of random rotamers picked for each residue for the sampling. Reasonable number would be below 20 for quick sampling.
 59 |   --max_random_rotamers_per_cst MAX_RANDOM_ROTAMERS_PER_CST [MAX_RANDOM_ROTAMERS_PER_CST ...]
 60 |                         Number of random rotamers picked for each CST block for the sampling. First value is for the ligand.
 61 |   --frac_random_rotamers FRAC_RANDOM_ROTAMERS
 62 |                         Fraction of rotamers that are randomly picked for each residue for the sampling.
 63 |   --frac_random_rotamers_per_cst FRAC_RANDOM_ROTAMERS_PER_CST [FRAC_RANDOM_ROTAMERS_PER_CST ...]
 64 |                         Fraction of rotamers that are randomly picked for each CST block for the sampling. First value is for the ligand.
 65 |   --secstruct SECSTRUCT
 66 |                         What secondary structure stub should be generated for each residue.
 67 |   --secstruct_per_cst SECSTRUCT_PER_CST [SECSTRUCT_PER_CST ...]
 68 |                         Per CST, what secondary structure stub should be generated for each residue.
 69 |   --motif_for_cst MOTIF_FOR_CST [MOTIF_FOR_CST ...]
 70 |                         Per CST, an external motif that should be used, instead of inverse rotamers. Only works for the first CST right now.
 71 |                         Format: cst_no:resno_in_motif:filepath ...
 72 |   --use_best_rotamer_cstids USE_BEST_ROTAMER_CSTIDS [USE_BEST_ROTAMER_CSTIDS ...]
 73 |                         CST ID's that should only use the best rotamer from each secondary structure bin. Numbering starts from 1.
 74 |   --extra_chi EXTRA_CHI
 75 |                         Enables extra CHI sampling on a given level for all CST's. Input format: chino:level,chino2:level2
 76 |   --extra_chi_per_cst EXTRA_CHI_PER_CST [EXTRA_CHI_PER_CST ...]
 77 |                         Enables extra CHI sampling on a given level for specific CST's. Input format: CSTNO1-chino:level,chino2:level2 CSTNO2-chino:level,chino2:level2
 78 |                         Sampling levels:
 79 |                           0 Default original dihedral only; same as using no flag at all
 80 |                           1 +/- one standard deviation (sd); 3 samples 
 81 |                           2 +/- 0.5 sd; 3 samples 
 82 |                           3 +/- 1 & 2 sd; 5 samples 
 83 |                           4 +/- 0.5 & 1 sd; 5 samples
 84 |                           5 +/- 0.5, 1, 1.5 & 2 sd; 9 samples 
 85 |                           6 +/- 0.33, 0.67, 1 sd; 7 samples 
 86 |                           7 +/- 0.25, 0.5, 0.75, 1, 1.25 & 1.5 sd; 13 samples.
 87 |   --suffix SUFFIX       Suffix to be added to the end of output PDB files
 88 |   --prefix PREFIX       Prefix to be added to the beginning of output PDB files
 89 |   --tip_atom            Inverse rotamers will be pre-selected based on whether the tip atoms are placed geometrically differently. Rotamer diversity is ignored.
 90 |   --debug               Debug mode. Printing more stuff out and running single-threaded
 91 | ```
 92 | 
 93 | The script runs by default on multiple CPU cores using python multiprocessing. When submitted as a Slurm job, it will adjust the number of cores based on the environment variable `SLURM_CPUS_ON_NODE`.
 94 | 
 95 | 
 96 | ### Best practices
 97 | 
 98 | Keep conformational sampling levels in the CST file to a minimum to avoid combinatorial explosion. Only sample torsions that are expected to lead to different valid assemblies.<br>
 99 | 
100 | It's possible to limit the sampling by randomly picking rotamers for each residue, and limiting how the sidechain placements are sampled in the CST file.<br>
101 | It's possible to control the length of the generated idealized backbone stub (from zero to ...).<br>
102 | It's possible to control most of the parameters separately for each constraint block.<br>
103 | With using the `--tip_atom` argument it is possible to skip the inverse rotamer clash analysis, and only output assemblies based on their unique placement of catalytic atoms.
104 | 
105 | The output PDB files of this script will also contain the `REMARK 666 ...` lines which are required by the Rosetta enzdes constraint parser. As such, the outputs are suitable for building more complex enzyme design pipelines.<br>
106 | For example, the published all-atom diffusion pipeline (https://github.com/ikalvet/heme_binder_diffusion) is directly compatible with the outputs of this script.
107 | 
108 | 
109 | ### Requirements
110 | 
111 | Python packages that are required:
112 | ```
113 | pyrosetta
114 | numpy
115 | pandas
116 | scipy
117 | ```
118 | 


--------------------------------------------------------------------------------
/examples/P450/inputs/HBA_unique.params:
--------------------------------------------------------------------------------
  1 | NAME HBA
  2 | IO_STRING HBA Z
  3 | TYPE LIGAND
  4 | AA UNK
  5 | ATOM FE1  Fe3p  X   3.00
  6 | ATOM  N2  Npro  X   -0.37
  7 | ATOM  C33 aroC  X   -0.11
  8 | ATOM  C32 aroC  X   -0.11
  9 | ATOM  C34 CH3   X   -0.27
 10 | ATOM  H8  Hapo  X   0.10
 11 | ATOM  H9  Hapo  X   0.10
 12 | ATOM  H10 Hapo  X   0.10
 13 | ATOM  C2  aroC  X   -0.11
 14 | ATOM  C3  CH2   X   -0.18
 15 | ATOM  C4  CH2   X   -0.18
 16 | ATOM  C5  COO   X   0.62
 17 | ATOM  O1  OOC   X   -0.76
 18 | ATOM  O3  OOC   X   -0.76
 19 | ATOM  H27 Hapo  X   0.10
 20 | ATOM  H28 Hapo  X   0.10
 21 | ATOM  H21 Hapo  X   0.10
 22 | ATOM  H25 Hapo  X   0.10
 23 | ATOM  C1  aroC  X   -0.11
 24 | ATOM  C28 aroC  X   -0.11
 25 | ATOM  C6  aroC  X   -0.11
 26 | ATOM  C7  aroC  X   -0.11
 27 | ATOM  C8  CH2   X   -0.18
 28 | ATOM  C9  CH2   X   -0.18
 29 | ATOM  C10 COO   X   0.62
 30 | ATOM  O2  OOC   X   -0.76
 31 | ATOM  O4  OOC   X   -0.76
 32 | ATOM  H29 Hapo  X   0.10
 33 | ATOM  H30 Hapo  X   0.10
 34 | ATOM  H26 Hapo  X   0.10
 35 | ATOM  H3  Hapo  X   0.10
 36 | ATOM  C11 aroC  X   -0.11
 37 | ATOM  C12 aroC  X   -0.11
 38 | ATOM  N1  Npro  X   -0.37
 39 | ATOM  C31 aroC  X   -0.11
 40 | ATOM  C14 aroC  X   -0.11
 41 | ATOM  N4  Npro  X   -0.37
 42 | ATOM  C19 aroC  X   -0.11
 43 | ATOM  C30 aroC  X   -0.11
 44 | ATOM  C21 aroC  X   -0.11
 45 | ATOM  N3  Npro  X   -0.37
 46 | ATOM  C26 aroC  X   -0.11
 47 | ATOM  C29 aroC  X   -0.11
 48 | ATOM  H20 Haro  X   0.12
 49 | ATOM  C25 aroC  X   -0.11
 50 | ATOM  C27 CH3   X   -0.27
 51 | ATOM  H13 Hapo  X   0.10
 52 | ATOM  H12 Hapo  X   0.10
 53 | ATOM  H11 Hapo  X   0.10
 54 | ATOM  C22 aroC  X   -0.11
 55 | ATOM  C23 aroC  X   -0.11
 56 | ATOM  C24 aroC  X   -0.11
 57 | ATOM  H5  Haro  X   0.12
 58 | ATOM  H4  Haro  X   0.12
 59 | ATOM  H1  Haro  X   0.12
 60 | ATOM  H24 Haro  X   0.12
 61 | ATOM  C18 aroC  X   -0.11
 62 | ATOM  C15 aroC  X   -0.11
 63 | ATOM  C16 aroC  X   -0.11
 64 | ATOM  C17 aroC  X   -0.11
 65 | ATOM  H6  Haro  X   0.12
 66 | ATOM  H7  Haro  X   0.12
 67 | ATOM  H2  Haro  X   0.12
 68 | ATOM  C20 CH3   X   -0.27
 69 | ATOM  H14 Hapo  X   0.10
 70 | ATOM  H15 Hapo  X   0.10
 71 | ATOM  H16 Hapo  X   0.10
 72 | ATOM  H23 Haro  X   0.12
 73 | ATOM  C13 CH3   X   -0.27
 74 | ATOM  H19 Hapo  X   0.10
 75 | ATOM  H18 Hapo  X   0.10
 76 | ATOM  H17 Hapo  X   0.10
 77 | ATOM  H22 Haro  X   0.12
 78 | ATOM  O5  OH    X   -0.66
 79 | ATOM  C35 CH2   X   -0.18
 80 | ATOM  O6  OH    X   -0.66
 81 | ATOM  C36 aroC  X   -0.11
 82 | ATOM  C38 aroC  X   -0.11
 83 | ATOM  C40 aroC  X   -0.11
 84 | ATOM  C41 aroC  X   -0.11
 85 | ATOM  C39 aroC  X   -0.11
 86 | ATOM  C37 aroC  X   -0.11
 87 | ATOM  H34 Haro  X   0.12
 88 | ATOM  H37 Haro  X   0.12
 89 | ATOM  C42 aroC  X   -0.11
 90 | ATOM  C44 aroC  X   -0.11
 91 | ATOM  C46 aroC  X   -0.11
 92 | ATOM  C47 aroC  X   -0.11
 93 | ATOM  C45 aroC  X   -0.11
 94 | ATOM  C43 aroC  X   -0.11
 95 | ATOM  H38 Haro  X   0.12
 96 | ATOM  H40 Haro  X   0.12
 97 | ATOM  H42 Haro  X   0.12
 98 | ATOM  H41 Haro  X   0.12
 99 | ATOM  H39 Haro  X   0.12
100 | ATOM  H36 Haro  X   0.12
101 | ATOM  H35 Haro  X   0.12
102 | ATOM  H33 Hapo  X   0.10
103 | ATOM  H32 Hapo  X   0.10
104 | ATOM  H31 Hpol  X   0.43
105 | BOND_TYPE  O1   C5  4   
106 | BOND_TYPE  O3   C5  4   
107 | BOND_TYPE  C5   C4  1   
108 | BOND_TYPE  H8   C34 1   
109 | BOND_TYPE  H21  C3  1   
110 | BOND_TYPE  C4   C3  1   
111 | BOND_TYPE  C4   H27 1   
112 | BOND_TYPE  C4   H28 1   
113 | BOND_TYPE  H9   C34 1   
114 | BOND_TYPE  C34  H10 1   
115 | BOND_TYPE  C34  C32 1   
116 | BOND_TYPE  C3   H25 1   
117 | BOND_TYPE  C3   C2  1   
118 | BOND_TYPE  O2   C10 4   
119 | BOND_TYPE  C32  C2  4   
120 | BOND_TYPE  C32  C33 4   
121 | BOND_TYPE  C2   C1  4   
122 | BOND_TYPE  O4   C10 4   
123 | BOND_TYPE  C10  C9  1   
124 | BOND_TYPE  H20  C29 1   
125 | BOND_TYPE  H22  C28 1   
126 | BOND_TYPE  C33  C29 2   
127 | BOND_TYPE  C33  N2  4   
128 | BOND_TYPE  C1   C28 2   
129 | BOND_TYPE  C1   N2  4   
130 | BOND_TYPE  H26  C8  1   
131 | BOND_TYPE  C29  C26 1   
132 | BOND_TYPE  H13  C27 1   
133 | BOND_TYPE  C28  C6  1   
134 | BOND_TYPE  H12  C27 1   
135 | BOND_TYPE  C9   H29 1   
136 | BOND_TYPE  C9   C8  1   
137 | BOND_TYPE  C9   H30 1   
138 | BOND_TYPE  N2  FE1  1   
139 | BOND_TYPE  C27  H11 1   
140 | BOND_TYPE  C27  C25 1   
141 | BOND_TYPE  C8   H3  1   
142 | BOND_TYPE  C8   C7  1   
143 | BOND_TYPE  C26  C25 1   
144 | BOND_TYPE  C26  N3  2   
145 | BOND_TYPE  C6   C7  1   
146 | BOND_TYPE  C6   N1  2   
147 | BOND_TYPE  H33  C35 1   
148 | BOND_TYPE  C25  C22 2   
149 | BOND_TYPE  C7   C11 2   
150 | BOND_TYPE  C35  H32 1   
151 | BOND_TYPE  C35  O6  1   
152 | BOND_TYPE  H31  O5  1   
153 | BOND_TYPE  N3  FE1  1   
154 | BOND_TYPE  N3   C21 1   
155 | BOND_TYPE  N1  FE1  1   
156 | BOND_TYPE  N1   C12 1   
157 | BOND_TYPE FE1   O5  1   
158 | BOND_TYPE FE1   N4  1   
159 | BOND_TYPE  C22  C21 1   
160 | BOND_TYPE  C22  C23 1   
161 | BOND_TYPE  C11  C12 1   
162 | BOND_TYPE  C11  C13 1   
163 | BOND_TYPE  O6   C36 1   
164 | BOND_TYPE  H1   C23 1   
165 | BOND_TYPE  C21  C30 2   
166 | BOND_TYPE  C12  C31 2   
167 | BOND_TYPE  H19  C13 1   
168 | BOND_TYPE  C23  C24 2   
169 | BOND_TYPE  C13  H18 1   
170 | BOND_TYPE  C13  H17 1   
171 | BOND_TYPE  H35  C38 1   
172 | BOND_TYPE  N4   C19 4   
173 | BOND_TYPE  N4   C14 4   
174 | BOND_TYPE  C36  C38 4   
175 | BOND_TYPE  C36  C37 4   
176 | BOND_TYPE  C30  C19 1   
177 | BOND_TYPE  C30  H24 1   
178 | BOND_TYPE  C31  C14 1   
179 | BOND_TYPE  C31  H23 1   
180 | BOND_TYPE  C38  C40 4   
181 | BOND_TYPE  C24  H5  1   
182 | BOND_TYPE  C24  H4  1   
183 | BOND_TYPE  C19  C18 4   
184 | BOND_TYPE  C14  C15 4   
185 | BOND_TYPE  H34  C37 1   
186 | BOND_TYPE  C37  C39 4   
187 | BOND_TYPE  C40  H36 1   
188 | BOND_TYPE  C40  C41 4   
189 | BOND_TYPE  C18  C15 4   
190 | BOND_TYPE  C18  C20 1   
191 | BOND_TYPE  C15  C16 1   
192 | BOND_TYPE  C39  C41 4   
193 | BOND_TYPE  C39  H37 1   
194 | BOND_TYPE  C41  C42 1   
195 | BOND_TYPE  H14  C20 1   
196 | BOND_TYPE  H2   C16 1   
197 | BOND_TYPE  C16  C17 2   
198 | BOND_TYPE  C20  H15 1   
199 | BOND_TYPE  C20  H16 1   
200 | BOND_TYPE  H39  C44 1   
201 | BOND_TYPE  C42  C44 4   
202 | BOND_TYPE  C42  C43 4   
203 | BOND_TYPE  C17  H6  1   
204 | BOND_TYPE  C17  H7  1   
205 | BOND_TYPE  H38  C43 1   
206 | BOND_TYPE  C44  C46 4   
207 | BOND_TYPE  C43  C45 4   
208 | BOND_TYPE  C46  H41 1   
209 | BOND_TYPE  C46  C47 4   
210 | BOND_TYPE  C45  C47 4   
211 | BOND_TYPE  C45  H40 1   
212 | BOND_TYPE  C47  H42 1   
213 | BOND_TYPE  O5   C35 1   
214 | CHI 1  C3   C4   C5   O1 
215 | CHI 2  C2   C3   C4   C5 
216 | CHI 3  C32  C2   C3   C4 
217 | CHI 4  C8   C9   C10  O2 
218 | CHI 5  C7   C8   C9   C10
219 | CHI 6  C6   C7   C8   C9 
220 | CHI 7  O5   C35  O6   C36
221 | CHI 8  N2  FE1   O5   C35
222 | CHI 9  C25  C22  C23  C24
223 | CHI 10  C35  O6   C36  C38
224 | CHI 11  C18  C15  C16  C17
225 | CHI 12  C40  C41  C42  C44
226 | CHI 13 FE1   O5   C35  O6 
227 | NBR_ATOM  O5 
228 | NBR_RADIUS 13.456399
229 | ICOOR_INTERNAL   FE1     0.000000    0.000000    0.000000  FE1    N2    C33
230 | ICOOR_INTERNAL    N2     0.000000  180.000000    2.018878  FE1    N2    C33
231 | ICOOR_INTERNAL    C33    0.000001   53.581195    1.373135   N2   FE1    C33
232 | ICOOR_INTERNAL    C32 -178.502193   69.602248    1.452331   C33   N2   FE1 
233 | ICOOR_INTERNAL    C34 -179.986284   55.631927    1.496810   C32   C33   N2 
234 | ICOOR_INTERNAL    H8   179.523543   68.748802    1.092786   C34   C32   C33
235 | ICOOR_INTERNAL    H9  -120.214586   68.550017    1.096877   C34   C32   H8 
236 | ICOOR_INTERNAL    H10 -119.584492   68.563454    1.096894   C34   C32   H9 
237 | ICOOR_INTERNAL    C2  -179.832410   73.531587    1.369725   C32   C33   C34
238 | ICOOR_INTERNAL    C3   179.828563   50.915427    1.499732   C2    C32   C33
239 | ICOOR_INTERNAL    C4  -102.596835   67.625659    1.538258   C3    C2    C32
240 | ICOOR_INTERNAL    C5   146.999694   65.222093    1.554685   C4    C3    C2 
241 | ICOOR_INTERNAL    O1   -31.997497   64.516933    1.262209   C5    C4    C3 
242 | ICOOR_INTERNAL    O3  -178.654391   62.681761    1.262040   C5    C4    O1 
243 | ICOOR_INTERNAL    H27 -122.780968   70.057389    1.096230   C4    C3    C5 
244 | ICOOR_INTERNAL    H28 -115.836979   70.552845    1.100306   C4    C3    H27
245 | ICOOR_INTERNAL    H21  119.769291   68.760642    1.092767   C3    C2    C4 
246 | ICOOR_INTERNAL    H25  120.234202   68.569298    1.096805   C3    C2    H21
247 | ICOOR_INTERNAL    C1  -179.655565   73.427377    1.452820   C2    C32   C3 
248 | ICOOR_INTERNAL    C28  178.719920   55.957567    1.384487   C1    C2    C32
249 | ICOOR_INTERNAL    C6  -178.930708   55.033211    1.384065   C28   C1    C2 
250 | ICOOR_INTERNAL    C7   179.988903   55.835882    1.453199   C6    C28   C1 
251 | ICOOR_INTERNAL    C8    -0.860763   55.699372    1.501415   C7    C6    C28
252 | ICOOR_INTERNAL    C9   -78.998795   65.749088    1.546648   C8    C7    C6 
253 | ICOOR_INTERNAL    C10  141.999037   67.347387    1.551252   C9    C8    C7 
254 | ICOOR_INTERNAL    O2   -33.999719   62.282349    1.259720   C10   C9    C8 
255 | ICOOR_INTERNAL    O4  -179.957769   65.001979    1.266190   C10   C9    O2 
256 | ICOOR_INTERNAL    H29 -118.434948   72.078025    1.099244   C9    C8    C10
257 | ICOOR_INTERNAL    H30 -117.376528   69.709170    1.095638   C9    C8    H29
258 | ICOOR_INTERNAL    H26  120.034261   68.570555    1.096912   C8    C7    C9 
259 | ICOOR_INTERNAL    H3   120.192030   68.728840    1.092802   C8    C7    H26
260 | ICOOR_INTERNAL    C11  179.868008   73.441608    1.369284   C7    C6    C8 
261 | ICOOR_INTERNAL    C12   -0.154467   73.482580    1.452883   C11   C7    C6 
262 | ICOOR_INTERNAL    N1     0.435801   69.675550    1.374007   C12   C11   C7 
263 | ICOOR_INTERNAL    C31 -178.693261   55.893570    1.380984   C12   C11   N1 
264 | ICOOR_INTERNAL    C14  178.357907   54.599034    1.386888   C31   C12   C11
265 | ICOOR_INTERNAL    N4     0.520628   54.286519    1.372941   C14   C31   C12
266 | ICOOR_INTERNAL    C19 -179.055731   73.706335    1.377697   N4    C14   C31
267 | ICOOR_INTERNAL    C30 -178.505888   54.779402    1.384031   C19   N4    C14
268 | ICOOR_INTERNAL    C21    2.654589   54.793775    1.382403   C30   C19   N4 
269 | ICOOR_INTERNAL    N3    -4.661808   55.020478    1.378742   C21   C30   C19
270 | ICOOR_INTERNAL    C26 -175.324645   73.700213    1.380305   N3    C21   C30
271 | ICOOR_INTERNAL    C29  177.683167   53.999984    1.385889   C26   N3    C21
272 | ICOOR_INTERNAL    H20 -179.979397   62.804618    1.083332   C29   C26   N3 
273 | ICOOR_INTERNAL    C25 -178.928272   69.621569    1.440461   C26   N3    C29
274 | ICOOR_INTERNAL    C27 -178.514185   55.444074    1.496833   C25   C26   N3 
275 | ICOOR_INTERNAL    H13  -57.561069   68.564267    1.096437   C27   C25   C26
276 | ICOOR_INTERNAL    H12  119.456698   68.786471    1.096955   C27   C25   H13
277 | ICOOR_INTERNAL    H11  119.928154   68.567917    1.092300   C27   C25   H12
278 | ICOOR_INTERNAL    C22  178.673006   73.118881    1.379818   C25   C26   C27
279 | ICOOR_INTERNAL    C23 -179.979763   54.615808    1.459849   C22   C25   C26
280 | ICOOR_INTERNAL    C24  150.023835   51.869958    1.342020   C23   C22   C25
281 | ICOOR_INTERNAL    H5  -179.640676   59.615916    1.085871   C24   C23   C22
282 | ICOOR_INTERNAL    H4   178.171547   56.914633    1.083754   C24   C23   H5 
283 | ICOOR_INTERNAL    H1  -177.989675   65.539017    1.089079   C23   C22   C24
284 | ICOOR_INTERNAL    H24 -179.987889   62.579510    1.079747   C30   C19   C21
285 | ICOOR_INTERNAL    C18  178.982124   69.399441    1.443148   C19   N4    C30
286 | ICOOR_INTERNAL    C15   -0.483648   73.586246    1.381379   C18   C19   N4 
287 | ICOOR_INTERNAL    C16 -177.712163   50.665146    1.457704   C15   C18   C19
288 | ICOOR_INTERNAL    C17  -28.973816   53.788201    1.342234   C16   C15   C18
289 | ICOOR_INTERNAL    H6    -2.582383   57.411776    1.085027   C17   C16   C15
290 | ICOOR_INTERNAL    H7  -178.671386   59.329067    1.085731   C17   C16   H6 
291 | ICOOR_INTERNAL    H2   179.942344   64.071680    1.088635   C16   C15   C17
292 | ICOOR_INTERNAL    C20 -176.831231   54.601971    1.495128   C18   C19   C15
293 | ICOOR_INTERNAL    H14   27.956740   68.323585    1.093567   C20   C18   C19
294 | ICOOR_INTERNAL    H15 -120.526429   68.078446    1.097641   C20   C18   H14
295 | ICOOR_INTERNAL    H16 -119.213017   69.806158    1.094147   C20   C18   H15
296 | ICOOR_INTERNAL    H23 -178.630689   62.934282    1.082653   C31   C12   C14
297 | ICOOR_INTERNAL    C13 -179.788688   50.851531    1.496813   C11   C7    C12
298 | ICOOR_INTERNAL    H19   -0.846953   68.746995    1.092826   C13   C11   C7 
299 | ICOOR_INTERNAL    H18  120.198842   68.537795    1.096911   C13   C11   H19
300 | ICOOR_INTERNAL    H17  119.584023   68.569414    1.096832   C13   C11   H18
301 | ICOOR_INTERNAL    H22  179.655343   62.492735    1.083429   C28   C1    C6 
302 | ICOOR_INTERNAL    O5    93.551083   88.797704    1.766351  FE1    N2    C33
303 | ICOOR_INTERNAL    C35  -88.167337   58.378822    2.508687   O5   FE1    N2 
304 | ICOOR_INTERNAL    O6   -57.034305   64.528007    1.383909   C35   O5   FE1 
305 | ICOOR_INTERNAL    C36  -55.550936   60.118970    1.362558   O6    C35   O5 
306 | ICOOR_INTERNAL    C38   17.111617   56.522838    1.401330   C36   O6    C35
307 | ICOOR_INTERNAL    C40  175.458107   60.992428    1.388896   C38   C36   O6 
308 | ICOOR_INTERNAL    C41    1.079143   60.518840    1.395052   C40   C38   C36
309 | ICOOR_INTERNAL    C39    0.432097   58.242537    1.398985   C41   C40   C38
310 | ICOOR_INTERNAL    C37   -0.958172   61.160057    1.384293   C39   C41   C40
311 | ICOOR_INTERNAL    H34 -179.126725   58.776514    1.083687   C37   C39   C41
312 | ICOOR_INTERNAL    H37 -179.384445   60.046487    1.082011   C39   C41   C37
313 | ICOOR_INTERNAL    C42 -179.992532   60.881265    1.540038   C41   C40   C39
314 | ICOOR_INTERNAL    C44  -45.000184   59.059228    1.405006   C42   C41   C40
315 | ICOOR_INTERNAL    C46 -179.996457   59.039759    1.393954   C44   C42   C41
316 | ICOOR_INTERNAL    C47   -0.001829   59.754353    1.396227   C46   C44   C42
317 | ICOOR_INTERNAL    C45    0.000386   60.546632    1.395743   C47   C46   C44
318 | ICOOR_INTERNAL    C43    0.003095   59.739786    1.393927   C45   C47   C46
319 | ICOOR_INTERNAL    H38  179.998065   60.342795    1.085673   C43   C45   C47
320 | ICOOR_INTERNAL    H40  179.993256   59.919618    1.086355   C45   C47   C43
321 | ICOOR_INTERNAL    H42 -179.994438   59.724947    1.086054   C47   C46   C45
322 | ICOOR_INTERNAL    H41 -179.993764   60.344214    1.086348   C46   C44   C47
323 | ICOOR_INTERNAL    H39  179.999701   60.615603    1.085631   C44   C42   C46
324 | ICOOR_INTERNAL    H36 -178.439340   59.339446    1.081631   C40   C38   C41
325 | ICOOR_INTERNAL    H35 -174.807835   59.206472    1.082225   C38   C36   C40
326 | ICOOR_INTERNAL    H33  122.944578   70.550409    1.088203   C35   O5    O6 
327 | ICOOR_INTERNAL    H32  118.437744   85.616119    1.094307   C35   O5    H33
328 | ICOOR_INTERNAL    H31    1.092413   66.031827    1.245030   O5   FE1    C35
329 | PDB_ROTAMERS HBA_conformers_unique.pdb
330 | 


--------------------------------------------------------------------------------
/invrotzyme.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | Created on Fri Apr 15 12:48:51 2022
  5 | 
  6 | @author: ikalvet
  7 | """
  8 | import argparse
  9 | import pyrosetta as pyr
 10 | import pyrosetta.rosetta
 11 | import pyrosetta.distributed.io
 12 | import sys, os
 13 | import itertools
 14 | import functools
 15 | import operator
 16 | import time
 17 | import numpy as np
 18 | import pandas as pd
 19 | import multiprocessing
 20 | import random
 21 | import scipy.spatial
 22 | script_dir = os.path.dirname(os.path.realpath(__file__))
 23 | sys.path.append(script_dir)
 24 | sys.path.append(script_dir+'/utils/')
 25 | import protocol
 26 | import utils
 27 | import dunbrack_rotlib
 28 | import align_pdbs
 29 | 
 30 | 
 31 | 
 32 | 
 33 | def process_rotamer_set_queue(q, early_stop, prefix, bad_rotamers, rotamers, cst_io, cst_atoms, motifs, results_found):
 34 |     while True:
 35 |         i_ids = q.get()
 36 |         if i_ids is None:
 37 |             return
 38 | 
 39 |         i = i_ids[0]
 40 |         ids = i_ids[1]
 41 |         # Grabbing a combination of inverse rotamers based on the provided
 42 |         # per-cst inverse rotamer ids.
 43 |         c = [rotamers[n][i] for n, i in enumerate(ids)]
 44 | 
 45 |         if any([rot_id in bad_rotamers[j] for j, rot_id in enumerate(ids)]):
 46 |             # print(f"Bad rotamer in set {i}")
 47 |             continue
 48 | 
 49 |         # TODO: implement symmetry here
 50 |         # Take the list "c" and apply some symmetric transform to the residues there
 51 |         # Then the rest of the code should take care of it appropriately
 52 | 
 53 |         pose = pyrosetta.rosetta.core.pose.Pose()
 54 |         bad_rotamer = False
 55 |         catres_resnos = {n: 0 for n,r in enumerate(c) if not isinstance(r, pyrosetta.rosetta.core.pose.Pose) and r.is_ligand()}
 56 |         ligands = [r for r in c if not isinstance(r, pyrosetta.rosetta.core.pose.Pose) and r.is_ligand()]
 57 |         for j, res in enumerate(c):
 58 |             if args.debug:
 59 |                 if isinstance(res, pyrosetta.rosetta.core.conformation.Residue):
 60 |                     print(i, j, res.name())
 61 |                 elif isinstance(res, pyrosetta.rosetta.core.pose.Pose):
 62 |                     print(i, j, res.pdb_info().name())
 63 | 
 64 |             if not isinstance(res, pyrosetta.rosetta.core.pose.Pose) and res.is_ligand():  # ligand
 65 |                 continue
 66 | 
 67 |             # If we have already seen that it's a bad rotamer then let's just skip it
 68 |             if ids[j] in bad_rotamers[j]:
 69 |                 if args.debug: print(f"{j}, previously seen as a bad rotamer")
 70 |                 bad_rotamer = True
 71 |                 break
 72 | 
 73 |             if isinstance(res, pyrosetta.rosetta.core.conformation.Residue):
 74 |                 _res_pose = pyrosetta.rosetta.core.pose.Pose()
 75 |                 _res_pose.append_residue_by_jump(res, 0)
 76 |                 if res.is_protein():
 77 |                     _res_pose = protocol.extend_SS(pose=_res_pose, ref_seqpos=1,
 78 |                                           secstruct=args.secstruct_per_cst[j], AAA=AAA,
 79 |                                           nres_Nterm=args.N_len_per_cst[j],
 80 |                                           nres_Cterm=args.C_len_per_cst[j])
 81 |                     _res_pose.fold_tree().clear()
 82 |                     _res_pose.fold_tree().add_edge(1, _res_pose.size(), -1)  # This will avoid FoldTree reordering error showing up
 83 |                 catres_resno = args.N_len_per_cst[j]+1
 84 | 
 85 |             elif isinstance(res, pyrosetta.rosetta.core.pose.Pose):
 86 |                 _res_pose = res.clone()
 87 |                 catres_resno = motifs[j]["resno"]
 88 | 
 89 |             # Figuring out information about which CST atoms are used for this residue
 90 |             catres_cst_atoms = protocol.identify_cst_atoms_for_res(res, j, catres_resno, _res_pose, cst_atoms[j], motifs, ligands)
 91 | 
 92 | 
 93 |             # Adding ligand to the extended chain and checking for clashes
 94 |             for ligand in ligands:
 95 |                 # _res_pose.append_residue_by_jump(ligand, 1)  # this doesn't turn ligand into new chain
 96 |                 _res_pose.append_residue_by_jump(ligand, catres_resno,
 97 |                                                  jump_anchor_atom=_res_pose.residue(catres_resno).atom_name(_res_pose.residue(catres_resno).nbr_atom()),
 98 |                                                  jump_root_atom=ligand.atom_name(ligand.nbr_atom()),
 99 |                                                  start_new_chain=True)
100 | 
101 |             if protocol.check_clash(_res_pose, catres_resnos=[catres_resno]+[r.seqpos() for r in _res_pose.residues if r.is_ligand()], cst_atoms=catres_cst_atoms, tip_atom=args.tip_atom, debug=args.debug) is True:
102 |                 if args.debug: print(f"{j}, clash after extension")
103 |                 # Only adding the residude object to the bad residues
104 |                 # The motif pose will never be dumped
105 |                 if isinstance(res, pyrosetta.rosetta.core.conformation.Residue):
106 |                     if ids[j] not in bad_rotamers[j]:
107 |                         bad_rotamers[j].append(ids[j])
108 |                 elif isinstance(res, pyrosetta.rosetta.core.pose.Pose):
109 |                     if args.debug: print("MOTIF POSE SEEMS TO GIVE CLASH!!!! PLEASE INVESTIGATE!!!")
110 |                 bad_rotamer = True
111 | 
112 |                 # Giving up if all rotamers are bad
113 |                 if len(set(bad_rotamers[j])) == len(rotamers[j]):
114 |                     print(f"All rotamers for CST {j} are bad...")
115 |                 break
116 | 
117 |             if isinstance(res, pyrosetta.rosetta.core.conformation.Residue):
118 |                 catres_resnos[j] = pose.size() + args.N_len_per_cst[j]+1
119 |             else:
120 |                 catres_resnos[j] = motifs[j]["resno"]
121 | 
122 |             pyrosetta.rosetta.core.pose.append_subpose_to_pose(pose, _res_pose, 1, _res_pose.size()-len(ligands), new_chain=True)
123 | 
124 |         # Finished individual evaluation of residues
125 |         # Now putting the whole thing together
126 |         if bad_rotamer is True:
127 |             if args.debug: print(f"{j}, bad rotamer")
128 |             continue
129 | 
130 |         # Adding ligand as the last residue
131 |         for _n,res in enumerate(c):
132 |             if isinstance(res, pyrosetta.rosetta.core.pose.Pose):
133 |                 continue
134 |             if res.is_ligand():
135 |                 lig_pose = pyrosetta.rosetta.core.pose.Pose()
136 |                 lig_pose.append_residue_by_jump(res, 0)
137 |                 pyrosetta.rosetta.core.pose.append_subpose_to_pose(pose, lig_pose, 1, 1, new_chain=True)
138 |                 catres_resnos[_n] = pose.size()
139 | 
140 |         # Checking for clashes
141 |         # Ignoring clashes between catalytic residues and the ligand
142 |         ignore_clash_respairs = []
143 |         for j in catres_resnos:
144 |             if isinstance(c[j], pyrosetta.rosetta.core.conformation.Residue):
145 |                 assert pose.residue(catres_resnos[j]).name3() == c[j].name3(), f"cst {j}: resno {catres_resnos[j]}, {c[j].name3()} != {pose.residue(catres_resnos[j]).name3()}"
146 |             if j == 0:
147 |                 continue
148 |             if args.debug: print(f"clashcheck exclude cst atoms, cst {j}, resno {catres_resnos[j]}, name {pose.residue(catres_resnos[j]).name()}")
149 |             ignore_clash_respairs.append((catres_resnos[0], catres_resnos[j]))
150 | 
151 |         clash = protocol.check_clash(pose, catres_resnos=catres_resnos.values(), ignore_respairs=ignore_clash_respairs, tip_atom=args.tip_atom, debug=args.debug)
152 |         if clash is True:
153 |             if args.debug: print(f"{i}, clash in the final assembly")
154 |             continue
155 |         if args.debug: print(j, pose.sequence())
156 | 
157 |         # TODO: Need to implement checking whether the pose actually respects the CST's
158 |         # This is an issue when the ligand has any chi sampling enabled, and another residue is matched downstream of that.
159 |         # Some combinations of rotamers are not meant to work together
160 |         ## I think this is now managed in the REMARK 666 generation stage
161 | 
162 |         pose_name = args.prefix
163 |         for res in c:
164 |             if isinstance(res, pyrosetta.rosetta.core.conformation.Residue):
165 |                 if res.is_protein():
166 |                     pose_name += res.name1() + "_"
167 |                 else:
168 |                     pose_name += res.name3() + "_"
169 |             elif isinstance(res, pyrosetta.rosetta.core.pose.Pose):
170 |                 pose_name += os.path.basename(res.pdb_info().name()).replace(".pdb", "") + "_"
171 |         pose_name += f"{prefix}_{i}{args.suffix}.pdb"
172 |         if os.path.exists(pose_name):
173 |             print(f"Found existing file with name {pose_name}")
174 |             pose_name.replace(".pdb", "a.pdb")
175 | 
176 | 
177 |         remarks = protocol.create_remark_lines(pose, catres_resnos, cst_io)
178 | 
179 |         if len(remarks) != len(catres_resnos) - 1:
180 |             if args.debug: print(f"{i}: Could not build all REMARK 666 lines")
181 |             continue
182 | 
183 |         print(f"Found good rotamer: {pose_name.replace('.pdb', '')}")
184 | 
185 |         pdbstr = pyrosetta.distributed.io.to_pdbstring(pose).split("\n")
186 | 
187 |         pdbstr_new = []
188 |         for l in pdbstr:
189 |             pdbstr_new.append(l)
190 |             if "HEADER" in l:
191 |                 for rmrk in remarks:
192 |                     pdbstr_new.append(rmrk)
193 |         with open(pose_name, "w") as file:
194 |             file.write("\n".join(pdbstr_new))
195 | 
196 |         results_found.append(ids)
197 |         if args.max_outputs is not None and len(results_found) > args.max_outputs:
198 |             early_stop.value = True
199 |             print(f"Reached the output limit of {args.max_outputs}")
200 | 
201 | 
202 | 
203 | 
def parallelize_mp(iterables, rotset, prefix, cst_io, cst_atoms, motifs, results_found):
    """
    Feed rotamer id combinations to a pool of worker processes.

    Each pool process runs ``process_rotamer_set_queue`` as its initializer,
    consuming ``(index, combination)`` items from a bounded queue until it
    reads a ``None`` sentinel. Uses the module-level ``args`` for the number
    of processes.

    Arguments:
        iterables: iterable of rotamer id combinations (one id per CST block).
        rotset: per-CST lists of inverse rotamers, forwarded to the workers.
        prefix: identifier of this rotamer set, used in output names and logs.
        cst_io, cst_atoms, motifs: forwarded to the workers unchanged.
        results_found: shared list of accepted combinations, or None to have
            a new managed list created.

    Returns:
        The (possibly newly created) shared ``results_found`` list.
    """
    n_workers = args.nproc
    work_queue = multiprocessing.Queue(maxsize=n_workers)  # Queue stores the iterables

    t_start = time.time()
    manager = multiprocessing.Manager()
    bad_rotamers = manager.dict()
    early_stop = multiprocessing.Value("b", False)

    if results_found is None:
        results_found = manager.list()

    print(f"Starting to generate inverse rotamer assemblies using {args.nproc} parallel processes.")
    worker_args = (work_queue, early_stop, prefix, bad_rotamers, rotset,
                   cst_io, cst_atoms, motifs, results_found)
    pool = multiprocessing.Pool(processes=n_workers,
                                initializer=process_rotamer_set_queue,
                                initargs=worker_args)

    for idx, combo in enumerate(iterables):
        if idx == 0:
            # The number of CST blocks is only known once the first
            # combination is seen; one shared list per block.
            for cst_idx, _ in enumerate(combo):
                bad_rotamers[cst_idx] = manager.list()
        if early_stop.value:
            work_queue.put(None)
            break
        work_queue.put((idx, combo))

    # None to end each process
    for _ in range(n_workers):
        work_queue.put(None)

    # Closing the queue and the pool
    work_queue.close()
    work_queue.join_thread()
    pool.close()
    pool.join()

    print(f"Bad rotamers from set {prefix}:")
    for j in bad_rotamers:
        unique_bad = list(set(bad_rotamers[j]))
        print(f"   CST {j}: {unique_bad}")

    t_end = time.time()
    print(f"Processing all the rotamers in set {prefix} took {(t_end - t_start):.2f} seconds")
    return results_found
247 | 
248 | 
249 | 
250 | 
def main(args):
    """
    Generate inverse rotamer assemblies for the CST file described by ``args``.

    Steps, in order: initialize PyRosetta (with any ``.params`` files), load
    the Dunbrack rotamer library, parse the CST file, work out the allowed
    residue types and good rotamers per CST block, collect inverse rotamers
    from a ``TheozymeInvrotTree``, filter/expand them according to the
    command line options, and hand every combination of rotamer ids to
    ``parallelize_mp``.

    Arguments:
        args: ``argparse.Namespace`` produced by the parser in the
            ``__main__`` section. ``args.suffix`` is modified in place.

    Side effects:
        Sets the module-level global ``AAA`` (a three-alanine pose).
    """
    if args.suffix != "":
        args.suffix = f"_{args.suffix}"
    # args.prefix is used exactly as given at the start of output file names.

    assert os.path.exists(args.cstfile), f"CST file not found: {args.cstfile}"
    extra_res_fa = ""
    if args.params is not None:
        params = [p for p in args.params if ".params" in p]
        extra_res_fa = "-extra_res_fa " + ' '.join(params)

    # Setting up PyRosetta
    # pyr.init(f"{extra_res_fa} -run:preserve_header -output_virtual true")
    pyr.init(f"{extra_res_fa} -run:preserve_header")

    # Loading the backbone-dependent Dunbrack rotamer library into a dataframe
    dunbrack_database = os.path.dirname(pyr.__file__) + "/database/rotamer/bbdep02.May.sortlib-correct.12.2010"
    rotlib = dunbrack_rotlib.load_rotamer_df(dunbrack_database)

    global AAA  # making it global so that functions downstream can see it
    AAA = pyr.pose_from_sequence("AAA")

    ###### CST PARSING ########
    # Parsing the CST file
    # NOTE(review): addcst_mover is not used below; kept in case constructing
    # it has side effects inside Rosetta - confirm before removing.
    addcst_mover = pyrosetta.rosetta.protocols.enzdes.AddOrRemoveMatchCsts()
    chem_manager = pyrosetta.rosetta.core.chemical.ChemicalManager.get_instance()
    residue_type_set = chem_manager.residue_type_set("fa_standard")
    cst_io = pyrosetta.rosetta.protocols.toolbox.match_enzdes_util.EnzConstraintIO(residue_type_set)
    cst_io.read_enzyme_cstfile(args.cstfile)

    # Figuring out which residue atoms are used for each cst
    # Using the MCFI (MatcherConstraintFileInfo) object for that
    # cst_atoms will be a dict where each cst_block contains a list of variable CST's? and then a list of residue types
    cst_atoms = protocol.get_cst_atoms(cst_io)

    # Storing information about which residues are matched for each CST block
    restypes = {}
    for n in range(1, cst_io.mcfi_lists_size()+1):
        restypes[n] = [restype.name3() for restype in cst_io.mcfi_list(n).upstream_restypes()]

    ### PROCESS ARGUMENTS A BIT FURTHER ###
    # NOTE(review): worker functions read the module-level ``args``; this only
    # rebinds the local name, so parse_arguments is presumably expected to
    # update the namespace in place - confirm.
    args = protocol.parse_arguments(args, restypes)

    #### PARSING HIS TAUTOMER RESTRICTIONS #####
    keep_his_tautomer_per_cst = None
    if args.keep_his_tautomer is not None:
        keep_his_tautomer_per_cst = {int(x.split(":")[0]): x.split(":")[1] for x in args.keep_his_tautomer.split(",")}
        assert all(val in ["HIS", "HIS_D"] for val in keep_his_tautomer_per_cst.values()), "Invalid input for --keep_his_tautomer"

    ### ROTAMER SUBSAMPLING ####
    chi_subsampling_levels = protocol.parse_rotamer_subsampling(args, cst_atoms)

    ### Putting together a dictionary listing good rotamers for each residue in each CST
    restype_good_rotamers = {}
    for n in restypes:
        restype_good_rotamers[n] = {}
        use_only_best_rotamer = n in args.use_best_rotamer_cstids
        for restyp in restypes[n]:
            if restyp not in utils.N_chis.keys():
                continue
            # Several residue types can share a name3; the rotamers of a
            # given name3 only need to be looked up once per CST block.
            if restyp in restype_good_rotamers[n]:
                continue
            restype_good_rotamers[n][restyp] = dunbrack_rotlib.find_good_rotamers(rotlib, restyp, args.dunbrack_prob_per_cst[n],
                                                                                  args.secstruct_per_cst[n],
                                                                                  keep_only_best=use_only_best_rotamer)

    ### PARSING EXTERNAL MOTIFS ####
    # TODO: make external motifs usable with other CST id's, not just the 1st one
    motifs = None
    if args.motif_for_cst is not None:
        motifs = protocol.parse_motif_input(args.motif_for_cst, cst_atoms, restypes)

    ### GETTING INVERSE ROTAMERS ####
    ### This is where half of the work gets done ###
    invrot_tree = pyrosetta.rosetta.protocols.toolbox.match_enzdes_util.TheozymeInvrotTree(cst_io)
    invrot_tree.generate_targets_and_inverse_rotamers()
    all_inverse_rotamers_per_cst = invrot_tree.collect_all_inverse_rotamers()

    ## There is a way to get inverse rotamers from cst_io
    ## need to investigate this, because this allows keeping the sub-cst information
    #
    # target_ats = pyrosetta.rosetta.utility.vector1_unsigned_long()
    # invrot_ats = pyrosetta.rosetta.utility.vector1_unsigned_long()
    #
    # _mcfi.inverse_rotamers_against_residue(target_conf=lig, invrot_restype=_mcfi.allowed_restypes(_mcfi.upstream_res())[1],
    #                                        target_ats=target_ats, invrot_ats=invrot_ats, flip_exgs_upstream_downstream_samples=False, backbone_interaction=False)

    time.sleep(1)

    print(f"{len(all_inverse_rotamers_per_cst)} rotamer sets to process")

    results_found = None
    for xx, rotset in enumerate(all_inverse_rotamers_per_cst):
        print(f"Non-redundant rotamer set {xx+1}")
        for cst_block, invrots in enumerate(rotset.invrots()):
            print(f"CST {cst_block}: {len(invrots)} inverse rotamers.")

        # Listify the inverse rotamer dataset
        rotset_sub = [list(invrots) for invrots in rotset.invrots()]

        # Pruning all other inverse rotamers based on proton-chis.
        # Removing duplicate rotamers where the only difference is in the value of the proton_chi
        for rotset_id in range(len(rotset_sub)):
            if isinstance(rotset_sub[rotset_id][0], pyrosetta.rosetta.core.pose.Pose) or rotset_sub[rotset_id][0].is_ligand():
                continue
            _n_before = len(rotset_sub[rotset_id])
            rotset_sub[rotset_id] = protocol.prune_residue_rotamers(rotset_sub[rotset_id])
            if len(rotset_sub[rotset_id]) != _n_before:
                print(f"CST {rotset_id}: {len(rotset_sub[rotset_id])} inverse rotamers after pruning for proton-chi")

        # Loading any external motifs, if provided and aligning them to the appropriate CST atoms
        if args.motif_for_cst is not None:
            for cstno in motifs:
                # TODO: implement for not-first CST's (or CST's with additional sampling from CST file),
                # Picking rotamers with unique subsampling defined in CST
                to_align_rotamers = protocol.find_unique_rotamers_for_motif([r if i==cstno else [] for i, r in enumerate(rotset_sub)], motifs)
                rotset_sub[cstno] = [align_pdbs.align_pose_to_residue(rotamer, motifs[cstno]["pose"],
                                                                     {"atoms1": motifs[cstno]["atoms"],
                                                                      "atoms2": [(motifs[cstno]["resno"], a) for a in motifs[cstno]["atoms"]]}) for rotamer in to_align_rotamers[cstno]]

        # Pruning inverse rotamers based on Dunbrack probabilites
        rotset_sub = protocol.preselect_inverse_rotamers(rotset_sub, restype_good_rotamers, keep_his_tautomer_per_cst)
        if rotset_sub is None:
            continue

        # Culling ligand rotamers based on RMSD cutoff
        if args.prune_ligand_rotamers != 0.0:
            for rotset_id in range(len(rotset_sub)):
                if isinstance(rotset_sub[rotset_id][0], pyrosetta.rosetta.core.pose.Pose):
                    continue
                if rotset_sub[rotset_id][0].is_ligand():
                    rotset_sub[rotset_id] = protocol.prune_ligand_rotamers(rotset_sub[rotset_id], args.prune_ligand_rotamers, args.nproc)

        # Performing rotamer subsampling (expanding CHI's)
        # Only needed when at least one chi of any CST has a non-zero sampling level
        if any(level != 0 for levels in chi_subsampling_levels.values() for level in levels.values()):
            rotset_sub = protocol.subsample_rotamers(rotset_sub, chi_subsampling_levels, restype_good_rotamers, cst_atoms)

        # Picking random rotamers if requested
        if args.frac_random_rotamers_per_cst is not None or args.max_random_rotamers_per_cst is not None:
            print("Picking a random subset of inverse rotamers")
            rotset_sub = protocol.pick_random_rotamers_set(rotset_sub, max_random_rotamers_per_cst=args.max_random_rotamers_per_cst,
                                                           frac_random_rotamers_per_cst=args.frac_random_rotamers_per_cst)

        for cst_block, invrots in enumerate(rotset_sub):
            print(f"CST {cst_block}: {len(invrots)} inverse rotamers after filtering.")

        # Only the rotamer indices are sent through the queue; workers look
        # the actual rotamers up in rotset_sub.
        rotset_ids = [[i for i, y in enumerate(x)] for x in rotset_sub]
        rotamer_id_combinations = itertools.product(*rotset_ids)

        # Processing this subset of rotamers
        print(f"{functools.reduce(operator.mul, map(len, rotset_ids), 1)} inverse rotamer combinations to process in this set.")
        results_found = parallelize_mp(iterables=rotamer_id_combinations, rotset=rotset_sub, prefix=xx+1, cst_io=cst_io, cst_atoms=cst_atoms, motifs=motifs, results_found=results_found)
426 | 
427 | 
428 | 
if __name__ == "__main__":
    # Command line interface. The parsed namespace is stored in the
    # module-level name ``args``, which the worker functions read directly.
    parser = argparse.ArgumentParser()

    # Inputs
    parser.add_argument(
        "--cstfile", type=str, required=True,
        help="CST file used for matching. Keep sampling to minimum to avoid combinatorial explosion.")
    parser.add_argument(
        "--params", nargs="+", required=False,
        help="params files used by ligands and residues")
    parser.add_argument(
        "--keep_his_tautomer", type=str,
        help="Per cst, should a specific HIS tautomer (HIS or HIS_D) be used. Keeps only one the requested HIS tautomers. Format: 'cst_no:HIS/HIS_D,..'")

    # Rotamer probability cutoffs
    parser.add_argument(
        "--dunbrack_prob", type=float, default=0.85,
        help="Cumulative Dunbrack probability of used rotamers for any residue\n."
             "As used by the -packing:dunbrack_prob_... flag in Rosetta.")
    parser.add_argument(
        "--dunbrack_prob_per_cst", type=float, nargs="+",
        help="Cumulative Dunbrack probability of used rotamers for each CST residue.")

    # Backbone stub lengths
    parser.add_argument(
        "--N_len", type=int, default=4,
        help="Number of residues added to the stub N-term")
    parser.add_argument(
        "--C_len", type=int, default=5,
        help="Number of residues added to the stub C-term")
    parser.add_argument(
        "--N_len_per_cst", type=int, nargs="+",
        help="Number of residues added to the stub N-term, per CST")
    parser.add_argument(
        "--C_len_per_cst", type=int, nargs="+",
        help="Number of residues added to the stub C-term, per CST")

    # Rotamer pruning and random selection
    parser.add_argument(
        "--prune_ligand_rotamers", type=float, default=0.0,
        help="Prunes the set of used ligand rotamers based on clashcheck, AND rmsd similarity cutoff.")
    parser.add_argument(
        "--max_random_rotamers", type=int,
        help="Number of random rotamers picked for each residue for the sampling. Reasonable number would be below 20 for quick sampling.")
    parser.add_argument(
        "--max_random_rotamers_per_cst", nargs="+", type=int,
        help="Number of random rotamers picked for each CST block for the sampling. First value is for the ligand.")
    parser.add_argument(
        "--frac_random_rotamers", type=float,
        help="Fraction of rotamers that are randomly picked for each residue for the sampling.")
    parser.add_argument(
        "--frac_random_rotamers_per_cst", nargs="+", type=float,
        help="Fraction of rotamers that are randomly picked for each CST block for the sampling. First value is for the ligand.")

    # Secondary structure and motifs
    parser.add_argument(
        "--secstruct", type=str, default="H", choices=["E", "H"],
        help="What secondary structure stub should be generated for each residue.")
    parser.add_argument(
        "--secstruct_per_cst", nargs="+", type=str,
        help="Per CST, what secondary structure stub should be generated for each residue.")
    parser.add_argument(
        "--motif_for_cst", type=str, nargs="+",
        help="Per CST, an external motif that should be used, instead of inverse rotamers. Only works for the first CST right now. Format: cst_no:resno_in_motif:filepath ...")
    parser.add_argument(
        "--use_best_rotamer_cstids", nargs="+", type=int, default=[],
        help="CST ID's that should only use the best rotamer from each secondary structure bin. Numbering starts from 1.")

    # Extra chi sampling
    parser.add_argument(
        "--extra_chi", type=str,
        help="Enables extra CHI sampling on a given level for all CST's. Input format: chino:level,chino2:level2")
    parser.add_argument(
        "--extra_chi_per_cst", nargs="+",
        help=f"Enables extra CHI sampling on a given level for specific CST's. Input format: CSTNO1-chino:level,chino2:level2 CSTNO2-chino:level,chino2:level2\nSampling levels:\n{protocol.calculate_samplings.__doc__}")

    # Output naming and run control
    parser.add_argument(
        "--suffix", type=str, default="",
        help="Suffix to be added to the end of output files")
    parser.add_argument(
        "--prefix", type=str, default="",
        help="Prefix to be added to the beginning of output files")
    parser.add_argument(
        "--tip_atom", action="store_true", default=False,
        help="Inverse rotamers will be pre-selected based on whether the tip atoms are placed geometrically differently. Rotamer diversity is ignored.")
    parser.add_argument(
        "--nproc", type=int,
        help="Number of CPU cores used.")
    parser.add_argument(
        "--max_outputs", type=int,
        help="Maximum number of output structures that will be produced.")
    parser.add_argument(
        "--debug", action="store_true", default=False,
        help="Debug mode. Will print out more output at each step. Will run in single-core mode.")

    args = parser.parse_args()

    # Number of processes: the SLURM allocation takes precedence over --nproc,
    # then all local cores are used; debug mode always runs single-core.
    if "SLURM_CPUS_ON_NODE" in os.environ:
        args.nproc = int(os.environ["SLURM_CPUS_ON_NODE"])
    elif args.nproc is None:
        args.nproc = os.cpu_count()
    if args.debug:
        args.nproc = 1

    main(args)
469 | 
470 | 


--------------------------------------------------------------------------------
/utils/util.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import numpy as np
  3 | 
  4 | 
  5 | num2aa=[
  6 |     'ALA','ARG','ASN','ASP','CYS',
  7 |     'GLN','GLU','GLY','HIS','ILE',
  8 |     'LEU','LYS','MET','PHE','PRO',
  9 |     'SER','THR','TRP','TYR','VAL',
 10 |     ]
 11 | 
 12 | aa2num= {x:i for i,x in enumerate(num2aa)}
 13 | 
 14 | alpha_1 = list("ARNDCQEGHILKMFPSTWYV-")
 15 | aa_N_1 = {n:a for n,a in enumerate(alpha_1)}
 16 | aa_1_N = {a:n for n,a in enumerate(alpha_1)}
 17 | 
 18 | aa123 = {aa1: aa3 for aa1, aa3 in zip(alpha_1, num2aa)}
 19 | aa321 = {aa3: aa1 for aa1, aa3 in zip(alpha_1, num2aa)}
 20 | 
 21 | def N_to_AA(x):
 22 |     x = np.array(x);
 23 |     if x.ndim == 1: x = x[None]
 24 |     return ["".join([aa_N_1.get(a,"-") for a in y]) for y in x]
 25 | 
 26 |  
 27 | def alphabet_mapping(seq_list, alphabet_dict):
 28 |     """
 29 |     Args:
 30 |     seq_list: a list of sequences ['ABADSDAS', 'AABSDVDDV']
 31 |     Returns:
 32 |     encoded: a list of np.arrays
 33 |     """
 34 |     encoded = [[alphabet_dict[token] for token in seq] for seq in seq_list]
 35 |     return encoded
 36 | 
 37 | 
 38 | def alphabet_onehot_2_onehot(alphabet1, alphabet2):
 39 |     '''
 40 |     Args:
 41 |     alphabet1: List of amino acids in order (A characters)
 42 |     alphabet2: List of amino acids in different order
 43 |     
 44 |     Returns:
 45 |     map: AxA matrix to map one-hot encoding from alphabet1 to alphabet2
 46 |     '''
 47 |     assert len(alphabet1) == len(alphabet2), 'The alphabets must be the same length'
 48 |     
 49 |     alpha1_2_int = {aa: i for i, aa in enumerate(alphabet1)}
 50 |     alpha2_2_int = {aa: i for i, aa in enumerate(alphabet2)}
 51 |     
 52 |     A = len(alphabet1)
 53 |     map = np.zeros((A,A))
 54 |     
 55 |     for aa in alphabet1:
 56 |         j = alpha1_2_int[aa]
 57 |         i = alpha2_2_int[aa]
 58 |         map[i, j] = 1
 59 |         
 60 |     return map
 61 |         
 62 | 
# minimal sc atom representation (Nx8)
# One row per residue type, in the order given by the trailing comments
# (ala ... val, i.e. the same order as num2aa). Every row is a fixed-width
# tuple of 8 entries: four-character atom names, padded with None where the
# residue has fewer atoms. The position of a name inside its row is the atom
# index that the short2long construction below looks up with .index().
aa2short=[
    (" N  "," CA "," C  "," CB ",  None,  None,  None,  None), # ala
    (" N  "," CA "," C  "," CB "," CG "," CD "," NE "," CZ "), # arg
    (" N  "," CA "," C  "," CB "," CG "," OD1",  None,  None), # asn
    (" N  "," CA "," C  "," CB "," CG "," OD1",  None,  None), # asp
    (" N  "," CA "," C  "," CB "," SG ",  None,  None,  None), # cys
    (" N  "," CA "," C  "," CB "," CG "," CD "," OE1",  None), # gln
    (" N  "," CA "," C  "," CB "," CG "," CD "," OE1",  None), # glu
    (" N  "," CA "," C  ",  None,  None,  None,  None,  None), # gly
    (" N  "," CA "," C  "," CB "," CG "," ND1",  None,  None), # his
    (" N  "," CA "," C  "," CB "," CG1"," CD1",  None,  None), # ile
    (" N  "," CA "," C  "," CB "," CG "," CD1",  None,  None), # leu
    (" N  "," CA "," C  "," CB "," CG "," CD "," CE "," NZ "), # lys
    (" N  "," CA "," C  "," CB "," CG "," SD "," CE ",  None), # met
    (" N  "," CA "," C  "," CB "," CG "," CD1",  None,  None), # phe
    (" N  "," CA "," C  "," CB "," CG "," CD ",  None,  None), # pro
    (" N  "," CA "," C  "," CB "," OG ",  None,  None,  None), # ser
    (" N  "," CA "," C  "," CB "," OG1",  None,  None,  None), # thr
    (" N  "," CA "," C  "," CB "," CG "," CD1",  None,  None), # trp
    (" N  "," CA "," C  "," CB "," CG "," CD1",  None,  None), # tyr
    (" N  "," CA "," C  "," CB "," CG1",  None,  None,  None), # val
]
 86 | 
 87 | # full sc atom representation (Nx14)
 88 | aa2long=[
 89 |     (" N  "," CA "," C  "," O  "," CB ",  None,  None,  None,  None,  None,  None,  None,  None,  None), # ala
 90 |     (" N  "," CA "," C  "," O  "," CB "," CG "," CD "," NE "," CZ "," NH1"," NH2",  None,  None,  None), # arg
 91 |     (" N  "," CA "," C  "," O  "," CB "," CG "," OD1"," ND2",  None,  None,  None,  None,  None,  None), # asn
 92 |     (" N  "," CA "," C  "," O  "," CB "," CG "," OD1"," OD2",  None,  None,  None,  None,  None,  None), # asp
 93 |     (" N  "," CA "," C  "," O  "," CB "," SG ",  None,  None,  None,  None,  None,  None,  None,  None), # cys
 94 |     (" N  "," CA "," C  "," O  "," CB "," CG "," CD "," OE1"," NE2",  None,  None,  None,  None,  None), # gln
 95 |     (" N  "," CA "," C  "," O  "," CB "," CG "," CD "," OE1"," OE2",  None,  None,  None,  None,  None), # glu
 96 |     (" N  "," CA "," C  "," O  ",  None,  None,  None,  None,  None,  None,  None,  None,  None,  None), # gly
 97 |     (" N  "," CA "," C  "," O  "," CB "," CG "," ND1"," CD2"," CE1"," NE2",  None,  None,  None,  None), # his
 98 |     (" N  "," CA "," C  "," O  "," CB "," CG1"," CG2"," CD1",  None,  None,  None,  None,  None,  None), # ile
 99 |     (" N  "," CA "," C  "," O  "," CB "," CG "," CD1"," CD2",  None,  None,  None,  None,  None,  None), # leu
100 |     (" N  "," CA "," C  "," O  "," CB "," CG "," CD "," CE "," NZ ",  None,  None,  None,  None,  None), # lys
101 |     (" N  "," CA "," C  "," O  "," CB "," CG "," SD "," CE ",  None,  None,  None,  None,  None,  None), # met
102 |     (" N  "," CA "," C  "," O  "," CB "," CG "," CD1"," CD2"," CE1"," CE2"," CZ ",  None,  None,  None), # phe
103 |     (" N  "," CA "," C  "," O  "," CB "," CG "," CD ",  None,  None,  None,  None,  None,  None,  None), # pro
104 |     (" N  "," CA "," C  "," O  "," CB "," OG ",  None,  None,  None,  None,  None,  None,  None,  None), # ser
105 |     (" N  "," CA "," C  "," O  "," CB "," OG1"," CG2",  None,  None,  None,  None,  None,  None,  None), # thr
106 |     (" N  "," CA "," C  "," O  "," CB "," CG "," CD1"," CD2"," CE2"," CE3"," NE1"," CZ2"," CZ3"," CH2"), # trp
107 |     (" N  "," CA "," C  "," O  "," CB "," CG "," CD1"," CD2"," CE1"," CE2"," CZ "," OH ",  None,  None), # tyr
108 |     (" N  "," CA "," C  "," O  "," CB "," CG1"," CG2",  None,  None,  None,  None,  None,  None,  None), # val
109 | ]
110 | 
111 | # build the "alternate" sc mapping
112 | aa2longalt=[
113 |     (" N  "," CA "," C  "," O  "," CB ",  None,  None,  None,  None,  None,  None,  None,  None,  None), # ala
114 |     (" N  "," CA "," C  "," O  "," CB "," CG "," CD "," NE "," CZ "," NH2"," NH1",  None,  None,  None), # arg
115 |     (" N  "," CA "," C  "," O  "," CB "," CG "," OD1"," ND2",  None,  None,  None,  None,  None,  None), # asn
116 |     (" N  "," CA "," C  "," O  "," CB "," CG "," OD2"," OD1",  None,  None,  None,  None,  None,  None), # asp
117 |     (" N  "," CA "," C  "," O  "," CB "," SG ",  None,  None,  None,  None,  None,  None,  None,  None), # cys
118 |     (" N  "," CA "," C  "," O  "," CB "," CG "," CD "," OE1"," NE2",  None,  None,  None,  None,  None), # gln
119 |     (" N  "," CA "," C  "," O  "," CB "," CG "," CD "," OE2"," OE1",  None,  None,  None,  None,  None), # glu
120 |     (" N  "," CA "," C  "," O  ",  None,  None,  None,  None,  None,  None,  None,  None,  None,  None), # gly
121 |     (" N  "," CA "," C  "," O  "," CB "," CG "," ND1"," CD2"," CE1"," NE2",  None,  None,  None,  None), # his
122 |     (" N  "," CA "," C  "," O  "," CB "," CG1"," CG2"," CD1",  None,  None,  None,  None,  None,  None), # ile
123 |     (" N  "," CA "," C  "," O  "," CB "," CG "," CD2"," CD1",  None,  None,  None,  None,  None,  None), # leu
124 |     (" N  "," CA "," C  "," O  "," CB "," CG "," CD "," CE "," NZ ",  None,  None,  None,  None,  None), # lys
125 |     (" N  "," CA "," C  "," O  "," CB "," CG "," SD "," CE ",  None,  None,  None,  None,  None,  None), # met
126 |     (" N  "," CA "," C  "," O  "," CB "," CG "," CD2"," CD1"," CE2"," CE1"," CZ ",  None,  None,  None), # phe
127 |     (" N  "," CA "," C  "," O  "," CB "," CG "," CD ",  None,  None,  None,  None,  None,  None,  None), # pro
128 |     (" N  "," CA "," C  "," O  "," CB "," OG ",  None,  None,  None,  None,  None,  None,  None,  None), # ser
129 |     (" N  "," CA "," C  "," O  "," CB "," OG1"," CG2",  None,  None,  None,  None,  None,  None,  None), # thr
130 |     (" N  "," CA "," C  "," O  "," CB "," CG "," CD1"," CD2"," CE2"," CE3"," NE1"," CZ2"," CZ3"," CH2"), # trp
131 |     (" N  "," CA "," C  "," O  "," CB "," CG "," CD2"," CD1"," CE2"," CE1"," CZ "," OH ",  None,  None), # tyr
132 |     (" N  "," CA "," C  "," O  "," CB "," CG2"," CG1",  None,  None,  None,  None,  None,  None,  None), # val
133 | ]
134 | 
135 | # full sc & H atom representation (Nx22)
136 | aa2longH = [
137 | (' N  ', ' CA ', ' C  ', ' O  ', ' CB ', ' H  ', ' HA ', '1HB ', '2HB ', '3HB ',   None,   None,   None,   None,   None,   None,   None,   None,   None,   None,   None,   None,   None,   None), # ala
138 | (' N  ', ' CA ', ' C  ', ' O  ', ' CB ', ' CG ', ' CD ', ' NE ', ' CZ ', ' NH1', ' NH2', ' H  ', ' HA ', '1HB ', '2HB ', '1HG ', '2HG ', '1HD ', '2HD ', ' HE ', '1HH1', '2HH1', '1HH2', '2HH2'), # arg
139 | (' N  ', ' CA ', ' C  ', ' O  ', ' CB ', ' CG ', ' OD1', ' ND2', ' H  ', ' HA ', '1HB ', '2HB ', '1HD2', '2HD2',   None,   None,   None,   None,   None,   None,   None,   None,   None,   None), # asn
140 | (' N  ', ' CA ', ' C  ', ' O  ', ' CB ', ' CG ', ' OD1', ' OD2', ' H  ', ' HA ', '1HB ', '2HB ',   None,   None,   None,   None,   None,   None,   None,   None,   None,   None,   None,   None), # asp
141 | (' N  ', ' CA ', ' C  ', ' O  ', ' CB ', ' SG ', ' H  ', ' HA ', '1HB ', '2HB ', ' HG ',   None,   None,   None,   None,   None,   None,   None,   None,   None,   None,   None,   None,   None), # cys
142 | (' N  ', ' CA ', ' C  ', ' O  ', ' CB ', ' CG ', ' CD ', ' OE1', ' NE2', ' H  ', ' HA ', '1HB ', '2HB ', '1HG ', '2HG ', '1HE2', '2HE2',   None,   None,   None,   None,   None,   None,   None), # gln
143 | (' N  ', ' CA ', ' C  ', ' O  ', ' CB ', ' CG ', ' CD ', ' OE1', ' OE2', ' H  ', ' HA ', '1HB ', '2HB ', '1HG ', '2HG ',   None,   None,   None,   None,   None,   None,   None,   None,   None), # glu
144 | (' N  ', ' CA ', ' C  ', ' O  ', ' H  ', '1HA ', '2HA ',   None,   None,   None,   None,   None,   None,   None,   None,   None,   None,   None,   None,   None,   None,   None,   None,   None), # gly
145 | (' N  ', ' CA ', ' C  ', ' O  ', ' CB ', ' CG ', ' ND1', ' CD2', ' CE1', ' NE2', ' H  ', ' HA ', '1HB ', '2HB ', ' HD2', ' HE1', ' HE2',   None,   None,   None,   None,   None,   None,   None), # his
146 | (' N  ', ' CA ', ' C  ', ' O  ', ' CB ', ' CG1', ' CG2', ' CD1', ' H  ', ' HA ', ' HB ', '1HG1', '2HG1', '1HG2', '2HG2', '3HG2', '1HD1', '2HD1', '3HD1',   None,   None,   None,   None,   None), # ile
147 | (' N  ', ' CA ', ' C  ', ' O  ', ' CB ', ' CG ', ' CD1', ' CD2', ' H  ', ' HA ', '1HB ', '2HB ', ' HG ', '1HD1', '2HD1', '3HD1', '1HD2', '2HD2', '3HD2',   None,   None,   None,   None,   None), # leu
148 | (' N  ', ' CA ', ' C  ', ' O  ', ' CB ', ' CG ', ' CD ', ' CE ', ' NZ ', ' H  ', ' HA ', '1HB ', '2HB ', '1HG ', '2HG ', '1HD ', '2HD ', '1HE ', '2HE ', '1HZ ', '2HZ ', '3HZ ',   None,   None), # lys
149 | (' N  ', ' CA ', ' C  ', ' O  ', ' CB ', ' CG ', ' SD ', ' CE ', ' H  ', ' HA ', '1HB ', '2HB ', '1HG ', '2HG ', '1HE ', '2HE ', '3HE ',   None,   None,   None,   None,   None,   None,   None), # met
150 | (' N  ', ' CA ', ' C  ', ' O  ', ' CB ', ' CG ', ' CD1', ' CD2', ' CE1', ' CE2', ' CZ ', ' H  ', ' HA ', '1HB ', '2HB ', ' HD1', ' HD2', ' HE1', ' HE2', ' HZ ',   None,   None,   None,   None), # phe
151 | (' N  ', ' CA ', ' C  ', ' O  ', ' CB ', ' CG ', ' CD ', ' NV ', ' HA ', '1HB ', '2HB ', '1HG ', '2HG ', '1HD ', '2HD ',   None,   None,   None,   None,   None,   None,   None,   None,   None), # pro
152 | (' N  ', ' CA ', ' C  ', ' O  ', ' CB ', ' OG ', ' H  ', ' HA ', '1HB ', '2HB ', ' HG ',   None,   None,   None,   None,   None,   None,   None,   None,   None,   None,   None,   None,   None), # ser
153 | (' N  ', ' CA ', ' C  ', ' O  ', ' CB ', ' OG1', ' CG2', ' H  ', ' HA ', ' HB ', ' HG1', '1HG2', '2HG2', '3HG2',   None,   None,   None,   None,   None,   None,   None,   None,   None,   None), # thr
154 | (' N  ', ' CA ', ' C  ', ' O  ', ' CB ', ' CG ', ' CD1', ' CD2', ' NE1', ' CE2', ' CE3', ' CZ2', ' CZ3', ' CH2', ' H  ', ' HA ', '1HB ', '2HB ', ' HD1', ' HE1', ' HE3', ' HZ2',  'HZ3',  'HH2'), # trp
155 | (' N  ', ' CA ', ' C  ', ' O  ', ' CB ', ' CG ', ' CD1', ' CD2', ' CE1', ' CE2', ' CZ ', ' OH ', ' H  ', ' HA ', '1HB ', '2HB ', ' HD1', ' HD2', ' HE1', ' HE2', ' HH ',   None,   None,   None), # tyr
156 | (' N  ', ' CA ', ' C  ', ' O  ', ' CB ', ' CG1', ' CG2', ' H  ', ' HA ', ' HB ', '1HG1', '2HG1', '3HG1', '1HG2', '2HG2', '3HG2',   None,   None,   None,   None,   None,   None,   None,   None)  # val
157 | ]
158 | 
# Variant of the heavy-atom + hydrogen table in which the single ' H  ' entry
# is replaced by '1H  ', '2H  ', '3H  ' (the pro row instead carries
# ' NV ', 'CAV ' and '1H  ', '2H  '). Only the real names are written out;
# the rows are right-padded with None below to a common width of 26.
# Previously the ala row was only 24 entries wide while the other rows were
# 26, which made the table ragged.
_aa2longH_Nterm_names = [
    (' N  ', ' CA ', ' C  ', ' O  ', ' CB ', '1H  ', '2H  ', '3H  ', ' HA ', '1HB ', '2HB ', '3HB '), # ala
    (' N  ', ' CA ', ' C  ', ' O  ', ' CB ', ' CG ', ' CD ', ' NE ', ' CZ ', ' NH1', ' NH2', '1H  ', '2H  ', '3H  ', ' HA ', '1HB ', '2HB ', '1HG ', '2HG ', '1HD ', '2HD ', ' HE ', '1HH1', '2HH1', '1HH2', '2HH2'), # arg
    (' N  ', ' CA ', ' C  ', ' O  ', ' CB ', ' CG ', ' OD1', ' ND2', '1H  ', '2H  ', '3H  ', ' HA ', '1HB ', '2HB ', '1HD2', '2HD2'), # asn
    (' N  ', ' CA ', ' C  ', ' O  ', ' CB ', ' CG ', ' OD1', ' OD2', '1H  ', '2H  ', '3H  ', ' HA ', '1HB ', '2HB '), # asp
    (' N  ', ' CA ', ' C  ', ' O  ', ' CB ', ' SG ', '1H  ', '2H  ', '3H  ', ' HA ', '1HB ', '2HB ', ' HG '), # cys
    (' N  ', ' CA ', ' C  ', ' O  ', ' CB ', ' CG ', ' CD ', ' OE1', ' NE2', '1H  ', '2H  ', '3H  ', ' HA ', '1HB ', '2HB ', '1HG ', '2HG ', '1HE2', '2HE2'), # gln
    (' N  ', ' CA ', ' C  ', ' O  ', ' CB ', ' CG ', ' CD ', ' OE1', ' OE2', '1H  ', '2H  ', '3H  ', ' HA ', '1HB ', '2HB ', '1HG ', '2HG '), # glu
    (' N  ', ' CA ', ' C  ', ' O  ', '1H  ', '2H  ', '3H  ', '1HA ', '2HA '), # gly
    (' N  ', ' CA ', ' C  ', ' O  ', ' CB ', ' CG ', ' ND1', ' CD2', ' CE1', ' NE2', '1H  ', '2H  ', '3H  ', ' HA ', '1HB ', '2HB ', ' HD2', ' HE1', ' HE2'), # his
    (' N  ', ' CA ', ' C  ', ' O  ', ' CB ', ' CG1', ' CG2', ' CD1', '1H  ', '2H  ', '3H  ', ' HA ', ' HB ', '1HG1', '2HG1', '1HG2', '2HG2', '3HG2', '1HD1', '2HD1', '3HD1'), # ile
    (' N  ', ' CA ', ' C  ', ' O  ', ' CB ', ' CG ', ' CD1', ' CD2', '1H  ', '2H  ', '3H  ', ' HA ', '1HB ', '2HB ', ' HG ', '1HD1', '2HD1', '3HD1', '1HD2', '2HD2', '3HD2'), # leu
    (' N  ', ' CA ', ' C  ', ' O  ', ' CB ', ' CG ', ' CD ', ' CE ', ' NZ ', '1H  ', '2H  ', '3H  ', ' HA ', '1HB ', '2HB ', '1HG ', '2HG ', '1HD ', '2HD ', '1HE ', '2HE ', '1HZ ', '2HZ ', '3HZ '), # lys
    (' N  ', ' CA ', ' C  ', ' O  ', ' CB ', ' CG ', ' SD ', ' CE ', '1H  ', '2H  ', '3H  ', ' HA ', '1HB ', '2HB ', '1HG ', '2HG ', '1HE ', '2HE ', '3HE '), # met
    (' N  ', ' CA ', ' C  ', ' O  ', ' CB ', ' CG ', ' CD1', ' CD2', ' CE1', ' CE2', ' CZ ', '1H  ', '2H  ', '3H  ', ' HA ', '1HB ', '2HB ', ' HD1', ' HD2', ' HE1', ' HE2', ' HZ '), # phe
    (' N  ', ' CA ', ' C  ', ' O  ', ' CB ', ' CG ', ' CD ', ' NV ', 'CAV ', ' HA ', '1HB ', '2HB ', '1HG ', '2HG ', '1HD ', '2HD ', '1H  ', '2H  '), # pro
    (' N  ', ' CA ', ' C  ', ' O  ', ' CB ', ' OG ', '1H  ', '2H  ', '3H  ', ' HA ', '1HB ', '2HB ', ' HG '), # ser
    (' N  ', ' CA ', ' C  ', ' O  ', ' CB ', ' OG1', ' CG2', '1H  ', '2H  ', '3H  ', ' HA ', ' HB ', ' HG1', '1HG2', '2HG2', '3HG2'), # thr
    # trp: the last two names were previously the 3-character strings 'HZ3' and
    # 'HH2'; they are restored to the 4-character form used by every other entry.
    (' N  ', ' CA ', ' C  ', ' O  ', ' CB ', ' CG ', ' CD1', ' CD2', ' NE1', ' CE2', ' CE3', ' CZ2', ' CZ3', ' CH2', '1H  ', '2H  ', '3H  ', ' HA ', '1HB ', '2HB ', ' HD1', ' HE1', ' HE3', ' HZ2', ' HZ3', ' HH2'), # trp
    (' N  ', ' CA ', ' C  ', ' O  ', ' CB ', ' CG ', ' CD1', ' CD2', ' CE1', ' CE2', ' CZ ', ' OH ', '1H  ', '2H  ', '3H  ', ' HA ', '1HB ', '2HB ', ' HD1', ' HD2', ' HE1', ' HE2', ' HH '), # tyr
    (' N  ', ' CA ', ' C  ', ' O  ', ' CB ', ' CG1', ' CG2', '1H  ', '2H  ', '3H  ', ' HA ', ' HB ', '1HG1', '2HG1', '3HG1', '1HG2', '2HG2', '3HG2'), # val
]

# Pad programmatically so the table cannot become ragged by a miscounted None.
aa2longH_Nterm = [names + (None,) * (26 - len(names)) for names in _aa2longH_Nterm_names]
181 | 
# Variant of the heavy-atom + hydrogen table with an extra ' OXT' entry
# inserted directly after ' O  ' in every row. Only the real names are
# written out; the rows are right-padded with None below to a common width
# of 25.
_aa2longH_Cterm_names = [
    (' N  ', ' CA ', ' C  ', ' O  ', ' OXT', ' CB ', ' H  ', ' HA ', '1HB ', '2HB ', '3HB '), # ala
    (' N  ', ' CA ', ' C  ', ' O  ', ' OXT', ' CB ', ' CG ', ' CD ', ' NE ', ' CZ ', ' NH1', ' NH2', ' H  ', ' HA ', '1HB ', '2HB ', '1HG ', '2HG ', '1HD ', '2HD ', ' HE ', '1HH1', '2HH1', '1HH2', '2HH2'), # arg
    (' N  ', ' CA ', ' C  ', ' O  ', ' OXT', ' CB ', ' CG ', ' OD1', ' ND2', ' H  ', ' HA ', '1HB ', '2HB ', '1HD2', '2HD2'), # asn
    (' N  ', ' CA ', ' C  ', ' O  ', ' OXT', ' CB ', ' CG ', ' OD1', ' OD2', ' H  ', ' HA ', '1HB ', '2HB '), # asp
    (' N  ', ' CA ', ' C  ', ' O  ', ' OXT', ' CB ', ' SG ', ' H  ', ' HA ', '1HB ', '2HB ', ' HG '), # cys
    (' N  ', ' CA ', ' C  ', ' O  ', ' OXT', ' CB ', ' CG ', ' CD ', ' OE1', ' NE2', ' H  ', ' HA ', '1HB ', '2HB ', '1HG ', '2HG ', '1HE2', '2HE2'), # gln
    (' N  ', ' CA ', ' C  ', ' O  ', ' OXT', ' CB ', ' CG ', ' CD ', ' OE1', ' OE2', ' H  ', ' HA ', '1HB ', '2HB ', '1HG ', '2HG '), # glu
    (' N  ', ' CA ', ' C  ', ' O  ', ' OXT', ' H  ', '1HA ', '2HA '), # gly
    (' N  ', ' CA ', ' C  ', ' O  ', ' OXT', ' CB ', ' CG ', ' ND1', ' CD2', ' CE1', ' NE2', ' H  ', ' HA ', '1HB ', '2HB ', ' HD2', ' HE1', ' HE2'), # his
    (' N  ', ' CA ', ' C  ', ' O  ', ' OXT', ' CB ', ' CG1', ' CG2', ' CD1', ' H  ', ' HA ', ' HB ', '1HG1', '2HG1', '1HG2', '2HG2', '3HG2', '1HD1', '2HD1', '3HD1'), # ile
    (' N  ', ' CA ', ' C  ', ' O  ', ' OXT', ' CB ', ' CG ', ' CD1', ' CD2', ' H  ', ' HA ', '1HB ', '2HB ', ' HG ', '1HD1', '2HD1', '3HD1', '1HD2', '2HD2', '3HD2'), # leu
    (' N  ', ' CA ', ' C  ', ' O  ', ' OXT', ' CB ', ' CG ', ' CD ', ' CE ', ' NZ ', ' H  ', ' HA ', '1HB ', '2HB ', '1HG ', '2HG ', '1HD ', '2HD ', '1HE ', '2HE ', '1HZ ', '2HZ ', '3HZ '), # lys
    (' N  ', ' CA ', ' C  ', ' O  ', ' OXT', ' CB ', ' CG ', ' SD ', ' CE ', ' H  ', ' HA ', '1HB ', '2HB ', '1HG ', '2HG ', '1HE ', '2HE ', '3HE '), # met
    (' N  ', ' CA ', ' C  ', ' O  ', ' OXT', ' CB ', ' CG ', ' CD1', ' CD2', ' CE1', ' CE2', ' CZ ', ' H  ', ' HA ', '1HB ', '2HB ', ' HD1', ' HD2', ' HE1', ' HE2', ' HZ '), # phe
    (' N  ', ' CA ', ' C  ', ' O  ', ' OXT', ' CB ', ' CG ', ' CD ', ' NV ', ' HA ', '1HB ', '2HB ', '1HG ', '2HG ', '1HD ', '2HD '), # pro
    (' N  ', ' CA ', ' C  ', ' O  ', ' OXT', ' CB ', ' OG ', ' H  ', ' HA ', '1HB ', '2HB ', ' HG '), # ser
    (' N  ', ' CA ', ' C  ', ' O  ', ' OXT', ' CB ', ' OG1', ' CG2', ' H  ', ' HA ', ' HB ', ' HG1', '1HG2', '2HG2', '3HG2'), # thr
    # trp: the last two names were previously the 3-character strings 'HZ3' and
    # 'HH2'; they are restored to the 4-character form used by every other entry.
    (' N  ', ' CA ', ' C  ', ' O  ', ' OXT', ' CB ', ' CG ', ' CD1', ' CD2', ' NE1', ' CE2', ' CE3', ' CZ2', ' CZ3', ' CH2', ' H  ', ' HA ', '1HB ', '2HB ', ' HD1', ' HE1', ' HE3', ' HZ2', ' HZ3', ' HH2'), # trp
    (' N  ', ' CA ', ' C  ', ' O  ', ' OXT', ' CB ', ' CG ', ' CD1', ' CD2', ' CE1', ' CE2', ' CZ ', ' OH ', ' H  ', ' HA ', '1HB ', '2HB ', ' HD1', ' HD2', ' HE1', ' HE2', ' HH '), # tyr
    (' N  ', ' CA ', ' C  ', ' O  ', ' OXT', ' CB ', ' CG1', ' CG2', ' H  ', ' HA ', ' HB ', '1HG1', '2HG1', '3HG1', '1HG2', '2HG2', '3HG2'), # val
]

# Pad programmatically so the table cannot become ragged by a miscounted None.
aa2longH_Cterm = [names + (None,) * (25 - len(names)) for names in _aa2longH_Cterm_names]
204 | 
205 | 
# build "deterministic" atoms
# see notebook (se3_experiments.ipynb for derivation)
# One list per residue type (ala ... val); an empty list means nothing is
# built for that residue. Each entry has five fields:
#   [atom to build, ref atom 1, ref atom 2, ref atom 3, [x, y, z]]
# The short2long construction below stores the aa2short indices of the three
# reference atoms as base / parent / grandparent and copies the three numbers
# into its x, y, z slots.
aa2frames=[
    [], # ala
    [   # arg
        [' NH1', ' CZ ', ' NE ', ' CD ', [-0.7218378782272339, 1.0856682062149048, -0.006118079647421837]],
        [' NH2', ' CZ ', ' NE ', ' CD ', [-0.6158039569854736, -1.1400136947631836, 0.006467342376708984]]],
    [   # asn
        [' ND2', ' CG ', ' CB ', ' OD1', [-0.6304131746292114, -1.1431225538253784, 0.02364802360534668]]],
    [   # asp
        [' OD2', ' CG ', ' CB ', ' OD1', [-0.5972501039505005, -1.0955055952072144, 0.04530305415391922]]],
    [], # cys
    [   # gln
        [' NE2', ' CD ', ' CG ', ' OE1', [-0.6558755040168762, -1.1324536800384521, 0.026521772146224976]]],
    [   # glu
        [' OE2', ' CD ', ' CG ', ' OE1', [-0.5578438639640808, -1.1161314249038696, -0.015464287251234055]]],
    [], # gly
    [   # his
        [' CD2', ' CG ', ' CB ', ' ND1', [-0.7502505779266357, -1.1680538654327393, 0.0005368441343307495]],
        [' CE1', ' CG ', ' CB ', ' ND1', [-2.0262467861175537, 0.539483368396759, -0.004495501518249512]],
        [' NE2', ' CG ', ' CB ', ' ND1', [-2.0761325359344482, -0.8199722766876221, -0.0018703639507293701]]],
    [   # ile
        [' CG2', ' CB ', ' CA ', ' CG1', [-0.6059935688972473, -0.8108057379722595, 1.1861376762390137]]],
    [   # leu
        [' CD2', ' CG ', ' CB ', ' CD1', [-0.5942193269729614, -0.7693282961845398, -1.1914138793945312]]],
    [], # lys
    [], # met
    [   # phe
        [' CD2', ' CG ', ' CB ', ' CD1', [-0.7164441347122192, -1.197853446006775, 0.06416648626327515]],
        [' CE1', ' CG ', ' CB ', ' CD1', [-2.0785865783691406, 1.2366485595703125, 0.08100450038909912]],
        [' CE2', ' CG ', ' CB ', ' CD1', [-2.107091188430786, -1.178497076034546, 0.13524535298347473]],
        [' CZ ', ' CG ', ' CB ', ' CD1', [-2.786630630493164, 0.03873880207538605, 0.14633776247501373]]],
    [], # pro
    [], # ser
    [   # thr
        [' CG2', ' CB ', ' CA ', ' OG1', [-0.6842088103294373, -0.6709619164466858, 1.2105456590652466]]],
    [   # trp
        [' CD2', ' CG ', ' CB ', ' CD1', [-0.8550368547439575, -1.0790592432022095, 0.09017711877822876]],
        [' NE1', ' CG ', ' CB ', ' CD1', [-2.1863200664520264, 0.8064242601394653, 0.08350661396980286]],
        [' CE2', ' CG ', ' CB ', ' CD1', [-2.1801204681396484, -0.5795643329620361, 0.14015203714370728]],
        [' CE3', ' CG ', ' CB ', ' CD1', [-0.605582594871521, -2.4733362197875977, 0.16200461983680725]],
        # NOTE(review): this ' CE2' entry repeats the one two lines above
        # verbatim; lookups by name (names.index) only ever see the first.
        [' CE2', ' CG ', ' CB ', ' CD1', [-2.1801204681396484, -0.5795643329620361, 0.14015203714370728]],
        [' CZ2', ' CG ', ' CB ', ' CD1', [-3.2672977447509766, -1.473116159439087, 0.250858873128891]],
        [' CZ3', ' CG ', ' CB ', ' CD1', [-1.6969941854476929, -3.3360071182250977, 0.264143705368042]],
        [' CH2', ' CG ', ' CB ', ' CD1', [-3.009331703186035, -2.8451972007751465, 0.3059283494949341]]],
    [   # tyr
        [' CD2', ' CG ', ' CB ', ' CD1', [-0.69439297914505, -1.2123756408691406, -0.009198814630508423]],
        [' CE1', ' CG ', ' CB ', ' CD1', [-2.104464054107666, 1.1910505294799805, -0.014679580926895142]],
        [' CE2', ' CG ', ' CB ', ' CD1', [-2.0857787132263184, -1.2231677770614624, -0.024517983198165894]],
        [' CZ ', ' CG ', ' CB ', ' CD1', [-2.7897322177886963, -0.021470561623573303, -0.026979409158229828]],
        [' OH ', ' CG ', ' CB ', ' CD1', [-4.1559271812438965, -0.029129385948181152, -0.044720835983753204]]],
    [   # val
        [' CG2', ' CB ', ' CA ', ' CG1', [-0.6258467435836792, -0.7654698491096497, -1.1894742250442505]]],
]
260 | 
# O from frame (C,N-1,CA)
# The three numbers are copied into the x, y, z slots of every ' O  ' entry
# when short2long is built below.
bb2oframe=[-0.5992066264152527, -1.0820008516311646, 0.0001476481556892395]
263 | 
# Lookup table from the reduced (aa2short) atom set to the full (aa2long) one.
#  20 x 14 x 6 = <base-idx | parent-idx | gparent-idx | x | y | z >
#    base-idx < 0 ==> slot holds no atom
#    xyz = 0      ==> atom is copied straight from the reduced set
short2long = np.zeros((20,14,6))
for res in range(20):
    base_names, full_names = aa2short[res], aa2long[res]
    for slot, atom in enumerate(full_names):
        entry = short2long[res, slot]  # writable view into the table

        # empty slot
        if atom is None:
            entry[0] = -1
            continue

        # atom already present in the reduced set: record where it lives and
        # pick two of the first three reduced-set atoms as its references
        if atom in base_names:
            base = base_names.index(atom)
            if base == 0:
                parent, gparent = 1, 2
            elif base == 1:
                parent, gparent = 0, 2
            else:
                parent, gparent = 0, 1
            entry[0] = base
            entry[1] = parent
            entry[2] = gparent
            continue

        # carbonyl oxygen, placed with the shared bb2oframe offsets
        if atom == " O  ":
            entry[0] = 2
            entry[1] = 0 #Nprev (will pre-roll N as nothing else needs it)
            entry[2] = 1
            entry[3:] = np.array(bb2oframe)
            continue

        # any other atom is built from its aa2frames entry (first match by name)
        frames = aa2frames[res]
        frame = frames[[f[0] for f in frames].index(atom)]
        entry[0] = base_names.index(frame[1])
        entry[1] = base_names.index(frame[2])
        entry[2] = base_names.index(frame[3])
        entry[3:] = np.array(frame[4])
303 | 
# For every slot of the full rep (20x14), the slot index holding the same atom
# name in the "alternate" rep; empty slots map to themselves.
long2alt = np.zeros((20,14))
for res in range(20):
    canonical, alternate = aa2long[res], aa2longalt[res]
    long2alt[res] = [
        slot if atom is None else alternate.index(atom)
        for slot, atom in enumerate(canonical)
    ]
313 | 
314 | 


--------------------------------------------------------------------------------
/protocol.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | Created on Sun Aug 25 23:14:34 2024
  5 | 
  6 | @author: indrek
  7 | """
  8 | import pyrosetta as pyr
  9 | import pyrosetta.rosetta
 10 | import os, sys
 11 | import random
 12 | import numpy as np
 13 | import itertools
 14 | import multiprocessing
 15 | import time
 16 | import scipy.spatial
 17 | 
 18 | script_dir = os.path.dirname(os.path.realpath(__file__))
 19 | sys.path.append(script_dir)
 20 | sys.path.append(script_dir+'/utils/')
 21 | import utils
 22 | import dunbrack_rotlib
 23 | import align_pdbs
 24 | 
 25 | 
 26 | """
 27 | PARSING FUNCTIONS
 28 | """
 29 | def parse_arguments(args, restypes):
 30 |     # Limiting Dunbrack library as requested. 
 31 |     if args.dunbrack_prob_per_cst is None:
 32 |         args.dunbrack_prob_per_cst = [None]+[args.dunbrack_prob for r in restypes]
 33 |     else:
 34 |         assert all([isinstance(x, float) for x in args.dunbrack_prob_per_cst])
 35 |         args.dunbrack_prob_per_cst = [None]+ args.dunbrack_prob_per_cst
 36 | 
 37 | 
 38 |     ######### IF REQUESTED... ############
 39 |     ### RANDOM ROTAMER SELECTION SETUP ###
 40 |     if args.max_random_rotamers_per_cst is not None:
 41 |         assert all([isinstance(x, int) for x in args.max_random_rotamers_per_cst])
 42 |         assert len(args.max_random_rotamers_per_cst) == len(restypes)+1, "Invalid number of per-cst max_random_rotamers_per_cst"
 43 |     
 44 |     if args.frac_random_rotamers_per_cst is not None:
 45 |         assert all([isinstance(x, float) for x in args.frac_random_rotamers_per_cst])
 46 |         assert len(args.frac_random_rotamers_per_cst) == len(restypes)+1, "Invalid number of per-cst frac_random_rotamers_per_cst"
 47 |     
 48 |     if args.max_random_rotamers is not None:
 49 |         args.max_random_rotamers_per_cst = [args.max_random_rotamers]+[args.max_random_rotamers for r in restypes]
 50 |     
 51 |     if args.frac_random_rotamers is not None:
 52 |         args.frac_random_rotamers_per_cst = [args.frac_random_rotamers]+[args.frac_random_rotamers for r in restypes]
 53 |     
 54 |         # In case best rotamer is requested for a given CST id then set randomness to 1.0
 55 |         for i, frac in enumerate(args.frac_random_rotamers_per_cst):
 56 |             if i in args.use_best_rotamer_cstids:
 57 |                 args.frac_random_rotamers_per_cst[i] = 1.0
 58 |     
 59 |     
 60 |     #### PARSING SECONDARY STRUCTURE LENGTHS #####
 61 |     if args.N_len_per_cst is None:
 62 |         args.N_len_per_cst = [None]+[args.N_len for r in restypes]
 63 |     else:
 64 |         assert all([isinstance(x, int) for x in args.N_len_per_cst])
 65 |         args.N_len_per_cst = [None]+ args.N_len_per_cst
 66 |     
 67 |     if args.C_len_per_cst is None:
 68 |         args.C_len_per_cst = [None]+[args.C_len for r in restypes]
 69 |     else:
 70 |         assert all([isinstance(x, int) for x in args.C_len_per_cst])
 71 |         args.C_len_per_cst = [None]+ args.C_len_per_cst
 72 | 
 73 | 
 74 |     # Loading favored rotamers for each used residue type in each CST block
 75 |     # This allows different rotamer sets to be stored if same residue type should be
 76 |     # on different secondary structures in different CST blocks
 77 |     # TODO: could also consider enabling different probabilities for different CST's or AA's?  <-- partially done
 78 |     if args.secstruct_per_cst is None:
 79 |         args.secstruct_per_cst = [None]+[args.secstruct for r in restypes]
 80 |     else:
 81 |         assert all([x in "EH-" for x in args.secstruct_per_cst])
 82 |         args.secstruct_per_cst = [None]+ args.secstruct_per_cst
 83 |     return args
 84 | 
 85 | 
 86 | def parse_motif_input(motif_input, cst_atoms, restypes):
 87 |     motifs = {}
 88 |     for motif_txt in motif_input:
 89 |         motif_cst_no = int(motif_txt.split(":")[0])
 90 |         if motif_cst_no != 1:
 91 |             sys.exit("External motif not supported for not-first CST's right now.")
 92 |         motif_resno = int(motif_txt.split(":")[1])
 93 |         motif_fp = motif_txt.split(":")[2]
 94 |         motifs[motif_cst_no] = {"resno": motif_resno,
 95 |                                 "pose": pyr.pose_from_file(motif_fp),
 96 |                                 "fp": motif_fp,
 97 |                                 "atoms": None}
 98 |         motif_resname = motifs[motif_cst_no]["pose"].residue(motif_resno).name3()
 99 |         assert motif_resname in restypes[motif_cst_no], f"{motif_resname} not found in {restypes}"
100 | 
101 |         # Finding the CST atoms for a given CST
102 |         for sub_cst_block in cst_atoms[motif_cst_no]:
103 |             for per_aa_cstset in sub_cst_block:
104 |                 if motif_resname in [aa.split("-")[0] for aa in per_aa_cstset.keys()]:
105 |                     motif_resname_full = [aa for aa in per_aa_cstset.keys() if aa.split("-")[0]==motif_resname][0]
106 |                     motifs[motif_cst_no]["atoms"] = per_aa_cstset[motif_resname_full]
107 |         if motifs[motif_cst_no]["atoms"] is None:
108 |             print(cst_atoms)
109 |             sys.exit("Unable to find correct motif atoms based on the corresponding CST definition")
110 |     return motifs
111 | 
112 | 
def parse_rotamer_subsampling(args, cst_atoms):
    """
    Builds the chi subsampling level of chi 1-4 for every CST.

    The levels are read from:
        args.extra_chi          "CHINO:LEVEL,..." (e.g. "1:2,2:2,3:1,4:1"), applied to every CST
        args.extra_chi_per_cst  list of "CSTNO-CHINO:LEVEL,..." (e.g. ["1-1:2,2:2", "2-1:1,2:1"]),
                                only consulted when args.extra_chi is None
    Any chi without a definition gets level 0. Levels must be within 0...7.

    Arguments:
        args: object with attributes `extra_chi` and `extra_chi_per_cst`
        cst_atoms (iterable): CST numbers (iterating a dict yields its keys)
    Returns:
        dict: {cstno: {1: level, 2: level, 3: level, 4: level}}
    """
    def _parse_levels(definitions):
        # ["1:2", "3:1"] -> {1: 2, 3: 1}
        return {int(item.split(":")[0]): int(item.split(":")[1]) for item in definitions}

    global_levels = {}
    per_cst_levels = {}
    if args.extra_chi is not None:
        global_levels = _parse_levels(args.extra_chi.split(","))
    elif args.extra_chi_per_cst is not None:
        raw_per_cst = {}
        for definition in args.extra_chi_per_cst:
            raw_per_cst[int(definition.split("-")[0])] = definition.split("-")[1].split(",")
        for cst_id, definitions in raw_per_cst.items():
            per_cst_levels[cst_id] = _parse_levels(definitions)

    chi_subsampling_levels = {}
    for cstno in cst_atoms:
        cst_specific = per_cst_levels.get(cstno, {})
        levels = {}
        chi_subsampling_levels[cstno] = levels
        for chino in (1, 2, 3, 4):
            # CST-specific definition takes priority over the global one
            if chino in cst_specific:
                level = cst_specific[chino]
            elif chino in global_levels:
                level = global_levels[chino]
            else:
                level = 0
            levels[chino] = level
            assert 0 <= level <= 7, f"Invalid sampling level for cst {cstno}, chi {chino}: {level}"

    print("Using CHI sampling levels for CST's:")
    for cstno, levels in chi_subsampling_levels.items():
        print(f"    CST {cstno} :: {levels}")

    return chi_subsampling_levels
143 | 
144 | 
def get_cst_atoms(cst_io):
    """
    Collects the names of the three constraint-defining atoms of both residues
    for every CST block and every allowed residue type combination.

    Arguments:
        cst_io: constraint IO object providing mcfi_lists_size() and mcfi_list(n)
    Returns:
        dict: {cst_no: [variable_block, ...]}, where each variable_block is a list with
              one dict per (downstream restype, upstream restype) combination:
                  {"<downstream name>-<downstream CST no>": (atom1, atom2, atom3),
                   "<upstream name>-<cst_no>": (atom1, atom2, atom3)}
    """
    template_atom_nos = (1, 2, 3)
    cst_atoms = {}
    for cst_no in range(1, cst_io.mcfi_lists_size()+1):
        variable_blocks = []
        cst_atoms[cst_no] = variable_blocks
        for block_no in range(1, cst_io.mcfi_list(cst_no).num_mcfis()+1):
            mcfi = cst_io.mcfi_list(cst_no).mcfi(block_no)

            # Figuring out which CST the downstream residue belongs to.
            # It stays at 1 unless a secondary match against an upstream CST is declared.
            partner_cst_no = 1
            algo_inputs = mcfi.algorithm_inputs()
            if "match" in algo_inputs:
                match_lines = algo_inputs["match"]
                if any("DOWNSTREAM" in line for line in match_lines):
                    partner_cst_no = 1  # I think this is always 1, right?
                else:
                    for line in match_lines:
                        if "SECONDARY_MATCH:" in line and "UPSTREAM_CST" in line:
                            partner_cst_no = int(line.split()[2])
                            break

            restype_combinations = itertools.product(mcfi.allowed_restypes(mcfi.downstream_res()),
                                                     mcfi.allowed_restypes(mcfi.upstream_res()))
            combinations = []
            for ds_res, us_res in restype_combinations:
                ds_atoms = tuple(ds_res.atom_name(mcfi.template_atom_inds(mcfi.downstream_res(), k, ds_res)[1])
                                 for k in template_atom_nos)
                us_atoms = tuple(us_res.atom_name(mcfi.template_atom_inds(mcfi.upstream_res(), k, us_res)[1])
                                 for k in template_atom_nos)

                # CST numbers are appended to the residue names
                combinations.append({f"{ds_res.name()}-{partner_cst_no}": ds_atoms,
                                     f"{us_res.name()}-{cst_no}": us_atoms})
            variable_blocks.append(combinations)

    return cst_atoms
179 | 
180 | 
181 | 
182 | """
183 | ROTAMER-RELATED FUNCTIONS
184 | """
def preselect_inverse_rotamers(rotset, restype_good_rotamers, keep_his_tautomer_per_cst, tip_atom=False):
    """
    Filters every inverse rotamer set down to the rotamers that should be used.

    tip_atom=False:
        Motif poses and ligand residues are always kept. ALA and GLY are always kept.
        HIS rotamers are dropped if a tautomer is requested for that set and the residue
        name differs from it. Any other residue is kept only if
        dunbrack_rotlib.find_bb_from_inverse_loc finds at least one library entry for its
        non-proton chi angles.
    tip_atom=True:
        Sets starting with a motif pose or a ligand are kept as they are. In other sets a
        rotamer is dropped if an already kept rotamer with the same name has its CA or its
        CB atom closer than 0.2 to the corresponding atom of this rotamer.

    Empty rotamer sets are passed through as empty lists in both modes.

    Arguments:
        rotset (list): one list of rotamers per CST block
        restype_good_rotamers: rotamer library data, indexed as [set index][name3]
        keep_his_tautomer_per_cst: None, or a mapping {set index: full HIS residue name}
        tip_atom (bool): selects the filtering mode
    Returns:
        list: filtered rotamer lists, in the same order as `rotset`.
        None: (tip_atom=False only) when a non-empty set has no acceptable rotamers left.
    """
    if tip_atom is False:
        print("Preselecting inverse rotamers based on Dunbrack probability")
        good_rotamers = [[] for x in rotset]
        for i, invrots in enumerate(rotset):
            if len(invrots) == 0:
                continue
            for res in invrots:
                if isinstance(res, pyrosetta.rosetta.core.pose.Pose):  # motif pose
                    good_rotamers[i].append(res)
                    continue
                if res.is_ligand():
                    good_rotamers[i].append(res)
                    continue
                if res.name3() == "HIS" and keep_his_tautomer_per_cst is not None and i in keep_his_tautomer_per_cst:
                    if res.name() != keep_his_tautomer_per_cst[i]:
                        continue
                if res.name3() in ["ALA", "GLY"]:
                    good_rotamers[i].append(res)
                    continue
                # Need to exclude proton CHIs
                _chis = [res.chi(n+1) for n in range(res.nchi())
                         if "H" not in [res.atom_type(an).element() for an in res.chi_atoms(n+1)]]
                rotlib_matches = dunbrack_rotlib.find_bb_from_inverse_loc(restype_good_rotamers[i][res.name3()], _chis)
                if len(rotlib_matches) > 0:
                    good_rotamers[i].append(res)
            if len(good_rotamers[i]) == 0 and len(rotset[i]) != 0:
                print(f"Failed to find compatible rotamers for constraint {i}: {res.name()}")
                return None
    else:
        print("Preselecting inverse rotamers only based whether the tip atoms are different")
        good_rotamers = []
        for i, invrots in enumerate(rotset):
            # Empty sets are tolerated the same way as in the Dunbrack-based mode,
            # instead of failing on invrots[0] below.
            if len(invrots) == 0:
                good_rotamers.append([])
                continue
            if isinstance(invrots[0], pyrosetta.rosetta.core.pose.Pose) or invrots[0].is_ligand():
                good_rotamers.append(invrots)
                continue
            good_rotamers.append([])
            for invrot in invrots:
                # Only rotamers of the same residue type are compared with each other
                is_duplicate = any(
                    rot.name() == invrot.name()
                    and ((rot.xyz("CA")-invrot.xyz("CA")).norm() < 0.2
                         or (rot.xyz("CB")-invrot.xyz("CB")).norm() < 0.2)
                    for rot in good_rotamers[i])
                if not is_duplicate:
                    good_rotamers[i].append(invrot)
    return good_rotamers
244 | 
245 | 
def find_unique_rotamers_for_motif(rotset, motifs):
    """
    Picks, from every inverse rotamer set, the rotamers that differ from each other
    in the positions of the motif atoms, so that a motif can be aligned to them.

    A rotamer is accepted when, compared to every rotamer accepted before it, the sum
    of the distances between the corresponding motif atoms is larger than 0.1.
    The first rotamer of a set is always accepted.

    Arguments:
        rotset (list): one list of rotamers per set
        motifs (dict): motifs[set index]["atoms"] lists the atom names used for the comparison
    Returns:
        list: one list of unique rotamers per input set, in the same order
    """
    print("Preselecting inverse rotamers for motif alignment, based on unique CST subsampling")
    unique_rotset = []

    for set_no, invrots in enumerate(rotset):
        if len(invrots) == 0:
            unique_rotset.append([])
            continue

        accepted = []
        for candidate in invrots:
            if accepted:
                # Summed motif atom displacement against each accepted rotamer
                displacements = [sum((candidate.xyz(atom)-other.xyz(atom)).norm() for atom in motifs[set_no]["atoms"])
                                 for other in accepted]
                if not all(total > 0.1 for total in displacements):
                    continue
            accepted.append(candidate)

        print(f"    CST {set_no}, {len(accepted)}/{len(invrots)} after unique selection")
        unique_rotset.append(accepted)
    return unique_rotset
272 | 
273 | 
def pick_random_rotamers(invrots, N_max=None, frac=None):
    """
    Returns a random subset of the given rotamers as a new list.

    If `N_max` is given, `N_max` rotamers are drawn, unless the set holds fewer than
    `N_max` rotamers or starts with a motif pose; then the whole set is returned.
    Otherwise, if `frac` is given, round(frac * set size) rotamers are drawn, unless the
    set holds at most one rotamer or starts with a motif pose.
    `N_max` takes priority over `frac`. With neither given, None is returned.

    Arguments:
        invrots (list): rotamers to pick from
        N_max (int, optional): number of rotamers to draw
        frac (float, optional): fraction of rotamers to draw
    """
    if N_max is not None:
        if len(invrots) >= N_max and not isinstance(invrots[0], pyrosetta.rosetta.core.pose.Pose):
            return random.sample(list(invrots), N_max)
        return list(invrots)

    if frac is not None:
        if len(invrots) > 1 and not isinstance(invrots[0], pyrosetta.rosetta.core.pose.Pose):
            return random.sample(list(invrots), int(round(frac*len(invrots), 0)))
        return list(invrots)

    return None
289 | 
290 | 
def pick_random_rotamers_set(rotset, max_random_rotamers_per_cst=None, frac_random_rotamers_per_cst=None):
    """
    Picks a random subset from each inverse rotamer set with pick_random_rotamers().

    Exactly one of the two limits must be given, with one entry per rotamer set.
    The program exits with "Bad setup" if both or neither are given.

    Arguments:
        rotset (list): one list of rotamers per set
        max_random_rotamers_per_cst (list, int): number of rotamers to draw from each set
        frac_random_rotamers_per_cst (list, float): fraction of rotamers to draw from each set
    Returns:
        list: the selected rotamers of each set, in the order of `rotset`
    """
    max_missing = max_random_rotamers_per_cst is None
    frac_missing = frac_random_rotamers_per_cst is None
    if max_missing == frac_missing:
        # Either both limits or neither of them was provided
        sys.exit("Bad setup")

    # The missing limit is filled with None placeholders
    if max_missing:
        max_random_rotamers_per_cst = [None for _ in frac_random_rotamers_per_cst]
    else:
        frac_random_rotamers_per_cst = [None for _ in max_random_rotamers_per_cst]

    assert len(rotset) == len(frac_random_rotamers_per_cst)
    assert len(rotset) == len(max_random_rotamers_per_cst)

    return [pick_random_rotamers(invrots,
                                 N_max=max_random_rotamers_per_cst[set_no],
                                 frac=frac_random_rotamers_per_cst[set_no])
            for set_no, invrots in enumerate(rotset)]
317 | 
318 | 
def subsample_rotamers(rotamers, subsample_levels, per_cst_rotlib, cst_atoms):
    """
    Expands every inverse rotamer into a set of rotamers with perturbed chi angles.

    For each rotamer the chi standard deviations are taken from the Dunbrack library hits,
    each chi is sampled with calculate_samplings() at the requested level, and every
    combination of the sampled chi values is built. Each new rotamer is passed through
    align_pdbs.align_residue_to_residue() together with the original rotamer and the
    CST atoms of the residue.

    The first set (index 0, ligand) and motif poses are copied over unchanged.
    Rotamers with no Dunbrack library hits are kept as they are, without expansion.

    Arguments:
        rotamers (list): one list of rotamers per CST block; index 0 is the ligand
        subsample_levels (dict): subsample_levels[cst_block][chino] -> sampling level
        per_cst_rotlib: rotamer library data, indexed as [cst_block][name3]
        cst_atoms (dict): cst_atoms[cst_block] -> list of variable CST blocks, each a list of
                          dicts keyed by "RESNAME-CSTNO"
    Returns:
        list: expanded rotamer lists, in the same order as `rotamers`
    """
    expanded_rotset = []
    for cst_block, invrots in enumerate(rotamers):
        expanded_rotset.append([])
        if cst_block == 0:  # Ligand
            expanded_rotset[0] = [r for r in invrots]
            continue
        for n, invrot in enumerate(invrots):
            if isinstance(invrot, pyrosetta.rosetta.core.pose.Pose):  # motif pose
                expanded_rotset[cst_block].append(invrot)
                continue
            # NOTE(review): all chi angles are passed here, including any proton chis - confirm this is intended
            _asd = dunbrack_rotlib.find_bb_from_inverse_loc(per_cst_rotlib[cst_block][invrot.name3()], list(invrot.chi()))
            if len(_asd) == 0:
                print(f"CST {cst_block}: rotamer {n} found no hits from Dunbrack library!?")
                expanded_rotset[cst_block].append(invrot)
                continue
            # Right now taking STDEV just as an average of all found rotamers in desired secondary structure bins
            # presumably _asd has one "std<N>" column per chi - verify against dunbrack_rotlib
            stdevs = {chino+1: _asd[f"std{chino+1}"].mean() for chino in range(invrot.nchi())}

            # Expanding all chi's based on user request
            chi_samplings = {chino: calculate_samplings(invrot.chi(chino), stdevs[chino], subsample_levels[cst_block][chino]) for chino in stdevs}
            # Every combination of the sampled chi values becomes one new rotamer
            for chiset in itertools.product(*chi_samplings.values()):
                _rot = invrot.clone()
                for chino, _chi in enumerate(chiset):
                    _rot.set_chi(chino+1, _chi)
                
                # Need to realign coordinates
                # First let's find what are the CST atoms used
                # The second key of each restype dict is compared against the name of this residue
                align_atoms = [[restype_block[f"{invrot.name()}-{cst_block}"] for restype_block in var_cst if invrot.name() == list(restype_block.keys())[1].split("-")[0]] for var_cst in cst_atoms[cst_block]]
                # Flattening and removing duplicate atom sets
                align_atoms = list(set([item for sublist in align_atoms for item in sublist]))
                # An unexpected number of atom sets is only reported; the first set is used regardless
                if len(align_atoms) != 1:
                    print(f"Bad choice for alignment atoms: {align_atoms}")
                __rot = align_pdbs.align_residue_to_residue(invrot, _rot, {"atoms1": align_atoms[0],
                                                                           "atoms2": align_atoms[0]})
                expanded_rotset[cst_block].append(__rot)
        print(f"Expanded CST-{cst_block} rotamers from {len(invrots)} to {len(expanded_rotset[cst_block])}")
    return expanded_rotset
356 | 
357 | 
358 | 
def prune_ligand_rotamers(rotset, rmsd_cutoff=None, nproc=None):
    """
    Removes ligand rotamers that clash with themselves and, optionally, rotamers
    that are too similar to an already accepted one.

    Clash check (run in `nproc` worker processes): a rotamer is rejected if any pair of
    non-virtual, non-bonded atoms is closer than 2.1 (both heavy atoms) or 1.7 (otherwise).

    Similarity check (only if `rmsd_cutoff` is not None or 0.0): for every rotamer the
    pairwise distances between its non-hydrogen atoms are calculated, and the rotamer is
    rejected if utils.rmsd() between its distances and those of an accepted rotamer is
    below `rmsd_cutoff`.

    Arguments:
        rotset (list): ligand rotamers (residue objects)
        rmsd_cutoff (float, optional): similarity cutoff; None or 0.0 disables the check
        nproc (int, optional): number of worker processes.
                               Defaults to multiprocessing.cpu_count()
    Returns:
        list: the rotamers that passed the checks
    """
    print("Pruning ligand rotamers based on intramolecular clashes")

    # The same number is needed below for sending one stop signal to every worker,
    # so an unset value can't be left for the Pool to resolve on its own.
    if nproc is None:
        nproc = multiprocessing.cpu_count()

    # Clashcheck
    # NOTE(review): the worker reads rotset, the_queue and good_rotamers from the enclosing
    # scope - presumably this relies on a fork-based process start method, confirm.
    def process():
        while True:
            i = the_queue.get(block=True)
            if i is None:
                return
            res = rotset[i]
            for p in itertools.combinations(range(1, res.natoms()+1), 2):
                if any([res.is_virtual(n) for n in p]):
                    continue
                # Skipping over bonded atoms
                if p[0] in res.bonded_neighbor(p[1]) or p[1] in res.bonded_neighbor(p[0]):
                    continue
                distance = (res.xyz(p[0]) - res.xyz(p[1])).norm()

                if all([res.atom_type(n).is_heavyatom() for n in p]):
                    cutoff = 2.1
                else:
                    cutoff = 1.7

                if distance < cutoff:
                    good_rotamers[i] = False
                    break

    print(f"{len(rotset)} conformers to process")
    the_queue = multiprocessing.Queue()  # Queue stores the iterables

    start = time.time()
    manager = multiprocessing.Manager()
    good_rotamers = manager.dict()  # Need a special dictionary to store outputs from multiple processes

    for i in range(len(rotset)):
        the_queue.put(i)
        good_rotamers[i] = True

    # The work is done inside the initializer of each worker
    pool = multiprocessing.Pool(processes=nproc,
                                initializer=process)

    # None to end each process
    for _i in range(nproc):
        the_queue.put(None)

    # Closing the queue and the pool
    the_queue.close()
    the_queue.join_thread()
    pool.close()
    pool.join()

    end = time.time()
    print(f"Found {len([i for i in good_rotamers.keys() if good_rotamers[i] is True])} good ligand rotamers.\n"
          f"Processing all the rotamers for clashes took {(end - start):.2f} seconds")

    ## RMSD
    if rmsd_cutoff in [None, 0.0]:
        return [rotset[i] for i in good_rotamers.keys() if good_rotamers[i] is True]

    unique_rotamers = {}
    DMs = {}
    for i in good_rotamers.keys():
        if good_rotamers[i] is False:
            continue
        res = rotset[i]

        # Internal heavy-atom distances describe the conformation independent of its placement
        xyz = np.array([res.xyz(n+1) for n in range(res.natoms()) if res.atom_type(n+1).element() != "H"])
        DMs[i] = scipy.spatial.distance.pdist(xyz, 'euclidean')

        # With nothing accepted yet this is False, so the first rotamer is always accepted
        is_duplicate = any(utils.rmsd(DMs[i], DMs[j]) < rmsd_cutoff for j in unique_rotamers)
        if not is_duplicate:
            unique_rotamers[i] = res

    print(f"Found {len(unique_rotamers)}/{len(good_rotamers)} unique ligand rotamers based on RMSD cutoff {rmsd_cutoff}.")
    return [rot for i, rot in unique_rotamers.items()]
448 | 
449 | 
def prune_residue_rotamers(rotset):
    """
    Pruning based on proton chi similarity.

    Rotamers without proton chis (all chis counted as heavy-atom chis) and the first
    rotamer of the set are always kept. Any other rotamer is kept only if, against every
    previously kept rotamer with the same three-letter name, the largest distance between
    corresponding non-hydrogen atoms is at least 0.02.

    The number of heavy-atom chis is taken from utils.N_chis when the residue is listed
    there, and otherwise counted from the chi atom elements.

    Arguments:
        rotset (list): rotamers (residue objects)
    Returns:
        list: kept rotamers, in their original order
    """
    kept = []
    for position, res in enumerate(rotset):
        name3 = res.name3()
        if name3 in utils.N_chis:
            n_heavy_chis = utils.N_chis[name3]
        else:
            n_heavy_chis = sum(1 for chino in range(1, res.nchi()+1)
                               if all(res.atom_type(atom).element() != "H" for atom in res.chi_atoms(chino)))

        if res.nchi() == n_heavy_chis or position == 0:
            kept.append(res)
            continue

        # largest atom-atom distance between heavyatoms of RES and all parsed residues
        largest_deviations = []
        for other in kept:
            if name3 != other.name3():
                continue
            largest_deviations.append(max([(res.xyz(atomno) - other.xyz(atomno)).norm()
                                           for atomno in range(1, res.natoms()+1)
                                           if res.atom_type(atomno).element() != "H"]))
            if largest_deviations[-1] < 0.02:
                break

        if len(largest_deviations) == 0 or min(largest_deviations) >= 0.02:
            kept.append(res)

    return kept
481 | 
482 | 
def calculate_samplings(chi_value, std, sampling_level):
    """
    Returns the chi values to sample around `chi_value` for a given sampling level.
    The values are ordered from the lowest to the highest (for a positive `std`).

    0 Default  original dihedral only; same as using no flag at all
    1          +/- one standard deviation (sd); 3 samples
    2          +/- 0.5 sd; 3 samples
    3          +/- 1 & 2 sd; 5 samples
    4          +/- 0.5 & 1 sd; 5 samples
    5          +/- 0.5, 1, 1.5 & 2 sd; 9 samples
    6          +/- 0.33, 0.67, 1 sd; 7 samples
    7          +/- 0.25, 0.5, 0.75, 1, 1.25 & 1.5 sd; 13 samples.

    Arguments:
        chi_value (float): the original chi angle
        std (float): standard deviation of the chi angle
        sampling_level (int): 0...7, see above
    Returns:
        list: sampled chi values, including `chi_value` itself
    Exits the program if the sampling level is not one of 0...7.
    """
    # Multiples of the standard deviation that are applied in both directions
    sd_multiples = {0: (),
                    1: (1,),
                    2: (0.5,),
                    3: (1, 2),
                    4: (0.5, 1),
                    5: (0.5, 1, 1.5, 2),
                    6: (0.333, 0.667, 1),
                    7: (0.25, 0.5, 0.75, 1, 1.25, 1.5)}

    if sampling_level not in sd_multiples:
        sys.exit(f"Invalid sampling level: {sampling_level}")

    multiples = sd_multiples[sampling_level]
    below = [chi_value - k*std for k in reversed(multiples)]
    above = [chi_value + k*std for k in multiples]
    return below + [chi_value] + above
519 | 
520 | 
521 | """
522 | Functions used during inverse rotamer assembly generation
523 | """
def identify_cst_atoms_for_res(res, cst_no, catres_resno, _res_pose, cst_atompair_sets, motifs, ligands):
    """
    Finds the constraint atoms of a catalytic residue (or motif) and of the ligand
    it is constrained to, using the first CST definition that fits both.

    The catalytic residue fits a definition if "<residue name>-<cst_no>" is one of its keys.
    For CST 1 the target is the first ligand. For other CSTs the definition is only used
    if its single other key names a ligand; the number after the dash of that key selects
    the ligand (1 = first ligand).

    Arguments:
        res: residue, or a motif pose
        cst_no (int): CST number of `res`
        catres_resno (int): residue number used as the key for the atoms of `res`
        _res_pose: pose of `res`; ligand number k gets the key _res_pose.size()+k
                   (presumably the ligands are appended after this pose - verify against caller)
        cst_atompair_sets (list): variable CST blocks of this CST, each a list of dicts
                                  keyed by "RESNAME-CSTNO"
        motifs (dict): motif definitions, used when `res` is a pose
        ligands (list): ligand residues
    Returns:
        dict: {catres_resno: atoms of res, ligand residue number: atoms of ligand}
        None: if no CST definition fits
    """
    j = cst_no
    for subcst in cst_atompair_sets:
        for respair in subcst:
            # residue 1
            if isinstance(res, pyrosetta.rosetta.core.conformation.Residue) and f"{res.name()}-{j}" in respair:
                _this_res = {catres_resno: respair[f"{res.name()}-{j}"]}
            elif isinstance(res, pyrosetta.rosetta.core.pose.Pose) and f"{res.residue(motifs[j]['resno']).name()}-{j}" in respair:
                _this_res = {catres_resno: motifs[j]["atoms"]}
            else:
                continue

            # residue 2    (that residue 1 is constrained to)
            _trgt = None
            if j == 1:
                _trgt = {_res_pose.size()+1: respair[ligands[0].name3()+f"-{j}"]}
            else:
                partners = [rn for rn in respair if f"-{j}" not in rn]
                ligand_names = [lig.name3() for lig in ligands]
                # target must be a ligand, and there must be exactly one candidate
                if len(partners) == 1 and partners[0].split("-")[0] in ligand_names:
                    ligand_no = int(partners[0].split("-")[1])
                    # A number that doesn't point to any ligand means this definition can't be used
                    if 1 <= ligand_no <= len(ligands):
                        _trgt = {_res_pose.size()+ligand_no: respair[partners[0]]}

            if _trgt is None:
                continue

            # No validation is done whether correct CST atoms are used for this particular residue
            # i.e. situation where a variable CST is used with different sets of atoms from the same residue
            catres_cst_atoms = {}
            catres_cst_atoms.update(_this_res)
            catres_cst_atoms.update(_trgt)
            return catres_cst_atoms
    return None
572 | 
573 | 
def check_clash(pose, catres_resnos, cutoff=1.7, ignore_respairs=None, cst_atoms=None, tip_atom=False, debug=False):
    """
    Checks for clashes between residue atoms.
    Only considers residue pairs that are on different chains, are not bonded to each other,
    and have their nbr_atom within 10 angstrom of each other.
    Two atoms clash if they are closer than 1.8 angstrom (both heavy atoms) or 1.5 angstrom (otherwise).
    A contact where the atom of the lower-numbered residue is N or O and the atom of the
    higher-numbered residue is H is not counted as a clash.

    NOTE(review): the `cutoff` argument is overwritten before it is ever read,
    so the value passed in has no effect - confirm whether that is intended.

    Arguments:
        pose: pose to check
        catres_resnos: residue numbers of catalytic residues (used with tip_atom)
        cutoff (float): see the note above
        ignore_respairs: None, or an iterable of residue number containers;
                         a residue pair is skipped if both residues are in the same container
        cst_atoms: {resno1: (a1, a2, a3), resno2: (a1, a2, a3)}
                   named atoms that are excluded from the clash check
        tip_atom (bool): if True then CA, CB, C, N, O and the hydrogens attached to them are excluded
                         for catalytic residues (except ligands, GLY, PRO and ALA)
        debug (bool): prints the clashing atom pair
    Returns:
        bool: True as soon as the first clash is found, False if there are no clashes
    """

    combs = itertools.combinations(range(1, pose.size()+1), 2)
    for c in combs:
        res1 = pose.residue(c[0])
        res2 = pose.residue(c[1])
        # Going through a bunch of conditions that would allow us to skip
        # checking clashes in a given pair of residues
        
        # Atom indices of both residues that are left out of the distance check
        _ignore_atoms = {res1.seqpos():[],res2.seqpos():[]}
        if tip_atom is True:
            # Ignoring any of the backbone-ish atoms
            for r in [res1, res2]:
                if r.is_ligand():
                    continue
                if r.seqpos() in catres_resnos:
                    if r.name3() in ["GLY", "PRO", "ALA"]:
                        continue
                    for a in ["CA", "CB", "C", "N", "O"]:
                        _ignore_atoms[r.seqpos()].append(r.atom_index(a))
                        # presumably 0 means that no hydrogens are attached to this atom - confirm
                        if r.attached_H_begin(r.atom_index(a)) == 0:
                            continue
                        for _n in range(r.attached_H_begin(r.atom_index(a)), r.attached_H_end(r.atom_index(a))+1):
                            _ignore_atoms[r.seqpos()].append(_n)
        if cst_atoms is not None:
            for r in [res1, res2]:
                if r.seqpos() not in cst_atoms.keys():
                    continue
                # Atom names may be padded with whitespace
                for a in cst_atoms[r.seqpos()]:
                    _ignore_atoms[r.seqpos()].append( r.atom_index(a.strip()) )


        if ignore_respairs is not None:
            if any([res1.seqpos() in p and res2.seqpos() in p for p in ignore_respairs]):
                continue

        # Residues of the same chain are never checked against each other
        if res1.chain() == res2.chain():
            continue
        if res1.seqpos() == res2.seqpos():
            continue
        if res1.is_bonded(res2):
            continue
        if (res1.nbr_atom_xyz() - res2.nbr_atom_xyz()).norm() > 10.0:
            continue
        if res1.is_virtual_residue() or res2.is_virtual_residue():
            continue

        for atm1 in range(1, res1.natoms()+1):
            if res1.is_virtual(atm1):
                continue
            if atm1 in  _ignore_atoms[res1.seqpos()]:
                continue
            for atm2 in range(1, res2.natoms()+1):
                if res2.is_virtual(atm2):
                    continue
                if atm2 in  _ignore_atoms[res2.seqpos()]:
                    continue

                # This replaces whatever cutoff was passed in by the caller
                if all([res1.atom_type(atm1).is_heavyatom(), res2.atom_type(atm2).is_heavyatom()]):
                    cutoff = 1.8
                else:
                    cutoff = 1.5
                _dist = (res1.xyz(atm1) - res2.xyz(atm2)).norm()
                if _dist < cutoff:
                    if res1.atom_type(atm1).element() in "NO" and res2.atom_type(atm2).element() == "H":  # H-bonds are not clashes
                        continue
                    # The reverse case (H on res1, N/O on res2) is disabled:
                    # elif res1.atom_type(atm1).element() == "H" and res2.atom_type(atm2).element() in "NO":
                    #     continue
                    else:
                        if debug: print(f"Clashing atoms: {res1.name()}-{res1.seqpos()}-{res1.atom_name(atm1)} -- {res2.name()}-{res2.seqpos()}-{res2.atom_name(atm2)}: {_dist}")
                        return True
    return False
653 | 
654 | 
def adjust_bb(pose, resno, phi, psi):
    """
    Sets the backbone torsions of residue `resno` of the pose in place:
    phi and psi to the given values, and omega to 180.0.
    """
    torsions = (("set_phi", phi), ("set_psi", psi), ("set_omega", 180.0))
    for setter_name, angle in torsions:
        getattr(pose, setter_name)(resno, angle)
659 | 
660 | 
def extend_SS(pose, ref_seqpos, secstruct, AAA, nres_Nterm=4, nres_Cterm=5):
    """
    Extends the stubs around a given residue in a pose by a number of residues on N-term and C-term side.
    The backbone torsions of the added residues and of the reference residue are set to the
    idealized phi/psi values stored in utils.idealized_SS_phi_psi for the given secondary structure.
    The input pose is not modified; the extension is built on a clone.

    Parameters
    ----------
    pose : pyrosetta.rosetta.core.pose.Pose
        Pose containing the residue to be extended.
    ref_seqpos : int
        Residue number (in the input pose) of the residue that the stubs are built around.
    secstruct : str
        Key of utils.idealized_SS_phi_psi, e.g. "E" or "H".
    AAA : pyrosetta.rosetta.core.pose.Pose
        pose object with 3 alanines. Its residue 2 is used as the template for added residues.
    nres_Nterm : int, optional
        How many residues are added to N terminus. The default is 4.
    nres_Cterm : int, optional
        How many residues are added to C terminus. The default is 5.

    Returns
    -------
    pose2 : pyrosetta.rosetta.core.pose.Pose
        The extended copy of the input pose. Residues are prepended before `ref_seqpos`,
        so presumably the reference residue ends up at ref_seqpos + nres_Nterm - TODO confirm.

    """
    # assert nres_Nterm >= 2, "Too short N-term extension"
    # assert nres_Cterm >= 2, "Too short C-term extension"
    pose2 = pose.clone()
    # C-terminal side: each new residue goes after the previously added one
    for n in range(nres_Cterm):
        pose2.append_polymer_residue_after_seqpos(AAA.residue(2), ref_seqpos+n, True)
        adjust_bb(pose2, ref_seqpos+n, phi=utils.idealized_SS_phi_psi[secstruct]["phi"][0], psi=utils.idealized_SS_phi_psi[secstruct]["psi"][0])

    if nres_Cterm > 0:
        # NOTE(review): this adjusts the last residue of the whole pose - confirm this is the
        # last added residue also when other chains follow
        adjust_bb(pose2, pose2.size(), phi=utils.idealized_SS_phi_psi[secstruct]["phi"][0], psi=utils.idealized_SS_phi_psi[secstruct]["psi"][0])
    else:
        # If no C-term stub included then adding temporarily one
        # (it is deleted again at the end of this function)
        pose2.append_polymer_residue_after_seqpos(AAA.residue(2), ref_seqpos, True)

    # N-terminal side: every new residue is inserted at position ref_seqpos
    for n in range(nres_Nterm):
        pose2.prepend_polymer_residue_before_seqpos(AAA.residue(2), ref_seqpos, True)
        if n == 0:
            # Building foldtree to have a center point at the reference residue
            ft = pyrosetta.rosetta.core.kinematics.FoldTree()
            ft.add_edge(ref_seqpos+2, pose2.chain_begin(pose2.chain(ref_seqpos)), -1)
            ft.add_edge(ref_seqpos+2, pose2.chain_end(pose2.chain(ref_seqpos)), -1)
            for j in range(1, pose2.num_chains()+1):
                if j == pose2.chain(ref_seqpos):
                    continue
                else:  # adding foldtree edges for other chains
                    ft.add_edge(pose2.fold_tree().get_residue_edge(pose2.chain_begin(j)))
            pose2.fold_tree().clear()
            pose2.fold_tree(ft)
        adjust_bb(pose2, ref_seqpos+1, phi=utils.idealized_SS_phi_psi[secstruct]["phi"][0], psi=utils.idealized_SS_phi_psi[secstruct]["psi"][0])

    adjust_bb(pose2, ref_seqpos, phi=utils.idealized_SS_phi_psi[secstruct]["phi"][0], psi=utils.idealized_SS_phi_psi[secstruct]["psi"][0])

    # Removing the temporary C-term residue
    if nres_Cterm == 0:
        pose2.delete_residue_slow(pose2.size())

    return pose2
722 | 
723 | 
def _angular_difference(a, b):
    """Return the smallest absolute difference between two angles in degrees.

    The result lies in [0, 180], so values on either side of the 0/360 seam
    (e.g. 359.8 and 0.2, or -0.2 and 359.8) are treated as close.
    """
    diff = abs(a - b) % 360.0
    return min(diff, 360.0 - diff)


def _downstream_cst_block(mcfi):
    """Return the index of the CST block that acts as downstream partner for `mcfi`.

    The index is 0 unless the "match" algorithm inputs contain a
    "SECONDARY_MATCH: UPSTREAM_CST <n>" line (and no "DOWNSTREAM" line),
    in which case <n> is returned.
    """
    algorithm_inputs = mcfi.algorithm_inputs()
    if "match" not in algorithm_inputs:
        return 0

    match_inputs = algorithm_inputs["match"]
    if any("DOWNSTREAM" in line for line in match_inputs):
        return 0

    for line in match_inputs:
        if "SECONDARY_MATCH:" in line and "UPSTREAM_CST" in line:
            return int(line.split()[2])
    return 0


def _constraint_is_satisfied(constraint, cst_atomsets, calculators):
    """Check whether measured geometry agrees with one constraint's sampled values.

    Each of the six parameters must have at least one sampled value within
    0.1 (distance) or within 1.0 on the circle (angles and torsions) of the
    measured value.
    """
    passed = []
    for cst_par, coords in cst_atomsets.items():
        kind = cst_par[:3]
        samples = getattr(constraint, cst_par).create_sample_vector()
        val = calculators[kind](*coords)
        if kind == "dis":
            passed.append(any(abs(val - x) < 0.1 for x in samples))
        else:
            # Circular comparison: a plain abs() misses matches that straddle
            # the 0/360 seam or samples expressed outside [0, 360).
            passed.append(any(_angular_difference(val, x) < 1.0 for x in samples))
    # All parameters are evaluated before deciding, as every one of them is required.
    return all(passed)


def _respair_matches_mcfi(mcfi, ds_residue, us_residue, calculators):
    """Return True if the residue pair geometrically satisfies any constraint of `mcfi`.

    Every allowed (downstream, upstream) residue type combination whose
    upstream type name equals the base name of `us_residue` is tried.
    """
    # Base name without variant/patch suffixes (the part before the first ':')
    us_base_name = us_residue.name().split(":")[0]

    rt_combs = itertools.product(mcfi.allowed_restypes(mcfi.downstream_res()),
                                 mcfi.allowed_restypes(mcfi.upstream_res()))
    for ds_type, us_type in rt_combs:
        if us_base_name != us_type.name():  # skipping the wrong residue types
            continue

        # Names of the three template atoms on each side of the constraint
        ds_atoms = [ds_type.atom_name(mcfi.template_atom_inds(mcfi.downstream_res(), k, ds_type)[1])
                    for k in range(1, 4)]
        us_atoms = [us_type.atom_name(mcfi.template_atom_inds(mcfi.upstream_res(), k, us_type)[1])
                    for k in range(1, 4)]

        # Coordinates of these atoms in the actual pose residues
        d1, d2, d3 = [ds_residue.xyz(name) for name in ds_atoms]
        u1, u2, u3 = [us_residue.xyz(name) for name in us_atoms]

        cst_atomsets = {'dis_U1D1': np.array([d1, u1]),
                        'ang_U1D2': np.array([d2, d1, u1]),
                        'ang_U2D1': np.array([d1, u1, u2]),
                        'tor_U1D3': np.array([d3, d2, d1, u1]),
                        'tor_U2D2': np.array([d2, d1, u1, u2]),
                        'tor_U3D1': np.array([d1, u1, u2, u3])}

        # Measuring whether a particular respair geometrically matches the CST
        for constraint in mcfi.constraints():
            if _constraint_is_satisfied(constraint, cst_atomsets, calculators):
                return True
    return False


def create_remark_lines(pose, catalytic_residues, cst_io):
    """Build the REMARK 666 header lines for the catalytic residues of a pose.

    For every catalytic residue the variable CST blocks (MCFIs) of its
    constraint are scanned in order, and the first one whose sampled geometry
    agrees with the geometry measured in `pose` is written into the remark.
    Angles and torsions are compared on a circle, so values on either side of
    the 0/360 seam are recognized as matching.

    Arguments:
        pose: pose containing the catalytic residues and the ligand(s).
        catalytic_residues (dict): maps CST block number -> residue number in `pose`.
        cst_io: constraint IO object providing `mcfi_list(<cst block number>)`.

    Returns:
        list of str: one REMARK 666 line per catalytic residue for which a
        matching variable CST block was found. Residues without a geometric
        match produce no line. A ligand residue stored under key 0 is skipped.

    Raises:
        KeyError: if a constraint refers to a downstream CST block that is not
            a key of `catalytic_residues`.
    """
    ## Adding REMARK 666 lines to the PDB's
    ## This is actually quite arduous since we need to figure out which variable CST block a particular residue came from

    pdb_info = pyrosetta.rosetta.core.pose.PDBInfo(pose)  # can this be added to pose somehow?

    ligands = [r for r in pose.residues if r.is_ligand()]

    calculators = {"dis": utils.get_dist, "ang": utils.get_angle, "tor": utils.get_dihedral}
    remarks = []
    for j, resno in catalytic_residues.items():
        if pose.residue(resno).is_ligand() and j == 0:
            continue

        mcfi_list = cst_io.mcfi_list(j)
        for m in range(1, mcfi_list.num_mcfis() + 1):
            mcfi = mcfi_list.mcfi(m)

            # Residues in the final pose
            DS_RES = pose.residue(catalytic_residues[_downstream_cst_block(mcfi)])
            US_RES = pose.residue(resno)

            if not _respair_matches_mcfi(mcfi, DS_RES, US_RES, calculators):
                continue

            # if there's only one ligand then it will be stored as chain X residue 0
            if len(ligands) == 1 and DS_RES.name3() == ligands[0].name3():
                template_chain = "X"
                template_resno = 0
            else:
                template_chain = pdb_info.chain(DS_RES.seqpos())
                template_resno = DS_RES.seqpos()

            remarks.append(f"REMARK 666 MATCH TEMPLATE {template_chain} {DS_RES.name3()}"
                           f" {template_resno:>4} MATCH MOTIF {pdb_info.chain(resno)} "
                           f"{US_RES.name3()} {resno:>4}  {j}  {m}               ")
            # Only the first matching variable CST block is reported per residue
            break
    return remarks
803 | 
804 | 


--------------------------------------------------------------------------------