├── webserver
    ├── web.md
    ├── tests.py
    ├── admin.py
    ├── apps.py
    ├── models.py
    ├── views.py
    ├── pred_torsion.py
    ├── rebulid.py
    └── model.py
├── Training models
    ├── read.md
    ├── fetch_top_models.py
    ├── cross_val_dataset_sep.py
    ├── coordinates_extraction.py
    ├── batch_test.py
    ├── distance_map.py
    ├── batch_validation.py
    ├── make_dataset.py
    └── modelable_assess.py
├── process.jpg
├── rotation.jpg
├── our_process .jpg
├── README.md
├── ncbi_spydier.py
├── extraction.py
├── extract_coord.py
├── computation_rmsd.py
├── transform.py
└── angle_computation.ipynb


/webserver/web.md:
--------------------------------------------------------------------------------
## Web source code
2 | 


--------------------------------------------------------------------------------
/Training models/read.md:
--------------------------------------------------------------------------------
1 | Model training and evaluation and fusion code
2 | 


--------------------------------------------------------------------------------
/process.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ElvinJun/DeepPBS/HEAD/process.jpg


--------------------------------------------------------------------------------
/rotation.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ElvinJun/DeepPBS/HEAD/rotation.jpg


--------------------------------------------------------------------------------
/our_process .jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ElvinJun/DeepPBS/HEAD/our_process .jpg


--------------------------------------------------------------------------------
/webserver/tests.py:
--------------------------------------------------------------------------------
1 | from django.test import TestCase
2 | 
3 | # Create your tests here.
4 | 


--------------------------------------------------------------------------------
/webserver/admin.py:
--------------------------------------------------------------------------------
1 | from django.contrib import admin
2 | 
3 | # Register your models here.
4 | 


--------------------------------------------------------------------------------
/webserver/apps.py:
--------------------------------------------------------------------------------
1 | from django.apps import AppConfig
2 | 
3 | 
class FileoperationConfig(AppConfig):
    """Django application configuration for the ``fileoperation`` app."""
    # Dotted path of the application this configuration applies to.
    name = 'fileoperation'
6 | 


--------------------------------------------------------------------------------
/webserver/models.py:
--------------------------------------------------------------------------------
 1 | from django.db import models
 2 | 
 3 | # Create your models here.
 4 | class Files(models.Model):
 5 |     id = models.AutoField(max_length=10, primary_key=True, verbose_name='id')
 6 |     file = models.FileField(upload_to='./files')
 7 |     def __unicode__(self):  # __str__ on Python 3
 8 |         return (self.id,self.file)
 9 | 
class Files_name(models.Model):
    """A short name associated with any number of ``Files`` rows."""
    id = models.AutoField(max_length=10, primary_key=True, verbose_name='id')
    name = models.CharField(max_length=10)
    files = models.ManyToManyField(Files, related_name='files')

    def __str__(self):
        """Return a readable ``"<id>: <name>"`` label.

        The previous implementation returned a tuple (including the
        many-to-many manager), which is not a valid string representation.
        The related files are left out on purpose: listing them would need a
        database query every time the object is printed.
        """
        return '%s: %s' % (self.id, self.name)

    # Python 2 spelling kept so any existing caller of ``__unicode__`` works.
    __unicode__ = __str__


--------------------------------------------------------------------------------
/Training models/fetch_top_models.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pathlib
 3 | import numpy as np
 4 | import shutil
 5 | 
 6 | 
 7 | train_name = 'nr40_Split_LeL2_Drop05'
 8 | cross_validation_fold = 10
 9 | 
10 | top_models_folder = os.path.join('./outputs/%s_top_models_2' % train_name)
11 | pathlib.Path(top_models_folder).mkdir(parents=True, exist_ok=True)
12 | 
13 | val_losses = []
14 | for subset_index in range(cross_validation_fold):
15 |     with open('./outputs/%s_%d/validation_map_2.txt' % (train_name, subset_index)) as file:
16 |         lines = file.readlines()
17 |     for line in lines[1::4]:
18 |         val_losses.append(float(line.split('=')[1][:-1]))
19 | 
20 | total_epochs = len(val_losses) // cross_validation_fold
21 | val_losses = np.array(val_losses).reshape(-1, total_epochs)
22 | 
23 | top_num = 5
24 | for subset_index in range(cross_validation_fold):
25 |     for top_index in np.argsort(val_losses[subset_index])[:top_num]:
26 |         model_path = './outputs/%s_%d/%d_Linear.pth' % (train_name, subset_index, top_index)
27 |         new_model_path = os.path.join(top_models_folder, 'model_%d.pth' % (top_index + subset_index * total_epochs))
28 |         shutil.copy(model_path, new_model_path)
29 | 


--------------------------------------------------------------------------------
/Training models/cross_val_dataset_sep.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import shutil
 3 | import random
 4 | import pathlib
 5 | 
 6 | 
 7 | DATA_PATH = '/share/Data/processed/nr40/bitorsion'
 8 | SUBSET_PATH = '/share/Data/processed/nr40/10fold_val_subset'
 9 | subset_fold = 10
10 | 
11 | 
12 | filenames = os.listdir(DATA_PATH)
13 | random.shuffle(filenames)
14 | print('Total file number = %d' % len(filenames))
15 | 
16 | for i in range(subset_fold):
17 |     print('subset', i)
18 |     train_path = os.path.join(SUBSET_PATH, 'subset_%d/train' % i)
19 |     val_path = os.path.join(SUBSET_PATH, 'subset_%d/val' % i)
20 |     pathlib.Path(train_path).mkdir(parents=True, exist_ok=True)
21 |     pathlib.Path(val_path).mkdir(parents=True, exist_ok=True)
22 | 
23 |     start_index = i / 10 * len(filenames) // 1
24 |     end_index = (i + 1) / 10 * len(filenames) // 1
25 |     print('from %d to %d' % (start_index, end_index))
26 | 
27 |     for k, filename in enumerate(filenames):
28 |         if start_index <= k < end_index:
29 |             shutil.copy(os.path.join(DATA_PATH, filename), os.path.join(val_path, filename))
30 |         else:
31 |             shutil.copy(os.path.join(DATA_PATH, filename), os.path.join(train_path, filename))
32 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # DeepPBS
**Motivation:** Accurate all-atom protein structures play an important role in various research and applications. However, in most cases, only coarse-grained models can be obtained, for various reasons. Precisely predicting protein backbone structures from alpha-carbon traces, the most widely used coarse-grained model, is a pivotal step towards precise all-atom modeling of protein structures.
 3 | 
**Results:** In this study, we propose a deep learning-based method to predict protein backbone structures from alpha-carbon traces. Our method achieves performance comparable to that of the best previous method, measured by the cRMSD between predicted and reference coordinates.
 5 | 
 6 | # Workflow
 7 | ![image](https://user-images.githubusercontent.com/46809259/115357912-b3290e00-a1ef-11eb-8b82-0b58706c48a9.png)
 8 | 
 9 | # Webserver
10 | [点击进入骨架结构预测网页](http://deeppbs.com/)
11 | * Python / Pytorch / Django
* KNN / Bi-LSTM / Rodrigues
13 | 
14 | 
15 | # Protein structure prediction process
16 | ![](https://github.com/ElvinJun/DeepPBS/blob/master/process.jpg?raw=true)
17 | 
18 | 
# Protein backbone structure prediction based on Bi-LSTM
20 | ![deeppbs](https://github.com/ElvinJun/DeepPBS/blob/master/our_process%20.jpg?raw=true)
21 | 
22 | 
23 | # Method of rotation repetition
24 | ![rotation](https://github.com/ElvinJun/DeepPBS/blob/master/rotation.jpg?raw=true)
25 | 


--------------------------------------------------------------------------------
/Training models/coordinates_extraction.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pathlib
 3 | import numpy as np
 4 | 
 5 | 
 6 | def extract_cif(path, filename):
 7 |     with open(os.path.join(path, filename), 'r') as file:
 8 |         message = file.readlines()
 9 |     coos = []
10 |     for line in message:
11 |         line = line.split()
12 |         if line[3] != 'CB':
13 |             x = line[10]
14 |             y = line[11]
15 |             z = line[12]
16 |             coos.append([float(x), float(y), float(z)])
17 |     coos = np.array(coos)
18 |     return coos.astype('float32')
19 | 
20 | 
dataset_name = 'nr_40'
# Raw strings: "\p" and "\d" are not valid escape sequences, so the plain
# literals only worked by accident (and warn on recent Python versions).
LIST_PATH = r'D:\protein_structure_prediction\data\dataset/nr_list/best_rebuild_nr40.txt'  # % dataset_name
DATA_PATH = r'D:\protein_structure_prediction\data\dataset/cif_remove_again'
COOR_PATH = r'D:\protein_structure_prediction\data\dataset/processed_data/%s/coordinates' % dataset_name

pathlib.Path(COOR_PATH).mkdir(parents=True, exist_ok=True)

with open(LIST_PATH, 'r') as file:
    # Drop empty entries (e.g. from a trailing newline); an empty name would
    # otherwise be processed as the non-existent file ".cif".
    filenames = [name for name in file.read().split('\n') if name]

# np.save appends ".npy", so directory entries look like "<name>.npy" while
# the list holds bare names. Compare on the stem as well as the raw name so
# already-processed entries are actually skipped on a re-run.
finished_filenames = os.listdir(COOR_PATH)
finished = set(finished_filenames)
finished.update(name[:-len('.npy')] for name in finished_filenames if name.endswith('.npy'))

finished_num = sum(1 for name in filenames if name in finished)
filenames = [name for name in filenames if name not in finished]
print('%d finished! %d to go!' % (finished_num, len(filenames)))


failed_filename = []
for filename in filenames:
    print(filename)

    coos = extract_cif(DATA_PATH, filename + '.cif')

    np.save(os.path.join(COOR_PATH, filename), coos)
46 | 


--------------------------------------------------------------------------------
/ncbi_spydier.py:
--------------------------------------------------------------------------------
 1 | from Bio import Entrez
 2 | import os
 3 | Entrez.email = 'xxxxxxxxxxx@qq.com'  # always tell who you are
 4 | # handle = Entrez.egquery(term="E.coli")
 5 | # record = Entrez.read(handle)
 6 | # for row in record["eGQueryResult"]:
 7 | #     if row["DbName"]=="pubmed":
 8 | #         print row["Count"]
 9 | handle = Entrez.esearch(db="pubmed", term="growth phase" , retmax=500000)
10 | record = Entrez.read(handle)
11 | idlist = record["IdList"]
12 | list3_2= idlist
13 | print(len(list3_2))
14 | 
15 | Entrez.email = 'xxxxxxxxxxx@qq.com'  # always tell who you are
16 | # handle = Entrez.egquery(term="promoter")
17 | # record = Entrez.read(handle)
18 | # for row in record["eGQueryResult"]:
19 | #     if row["DbName"]=="pubmed":
20 | #         print row["Count"]
21 | handle = Entrez.esearch(db="pubmed", term="stress response", retmax=500000)
22 | record = Entrez.read(handle)
23 | idlist = record["IdList"]
24 | list3_3 = idlist
25 | print(len(list3_3))
26 | 
27 | Entrez.email = 'xxxxxxxxxxx@qq.com'  # always tell who you are
28 | # handle = Entrez.egquery(term="stationary phase")
29 | # record = Entrez.read(handle)
30 | # for row in record["eGQueryResult"]:
31 | #     if row["DbName"]=="pubmed":
32 | #         print row["Count"]
33 | 
34 | handle = Entrez.esearch(db="pubmed", term="acid response", retmax=500000)
35 | record = Entrez.read(handle)
36 | idlist = record["IdList"]
37 | list3_4 = idlist
38 | print(len(list3_4))
39 | 
40 | 
41 | handle = Entrez.esearch(db="pubmed", term="pH response", retmax=5000000 )
42 | record = Entrez.read(handle)
43 | idlist = record["IdList"]
44 | list3_5 = idlist
45 | print(len(list3_5))
46 | 
47 | f3_2 = open(os.path.join(os.getcwd(),'growth phase.txt'), 'w')
48 | f3_3 = open(os.path.join(os.getcwd(),'stress response.txt'), 'w')
49 | f3_4 = open(os.path.join(os.getcwd(),'acid response.txt'), 'w')
50 | f3_5 = open(os.path.join(os.getcwd(),'pH response.txt'), 'w')
51 | 
52 | for i in list3_2:
53 |     f3_2.write(i + ' \n')
54 | for i in list3_3:
55 |     f3_3.write(i + ' \n')
56 | for i in list3_4:
57 |     f3_4.write(i + ' \n')
58 | for i in list3_5:
59 |     f3_5.write(i + '\n ')
60 | 
61 | f3_2.close()
62 | f3_3.close()
63 | f3_4.close()
64 | f3_5.close()
65 | 
66 | 


--------------------------------------------------------------------------------
/webserver/views.py:
--------------------------------------------------------------------------------
 1 | from django.shortcuts import render
 2 | from django.views.decorators.http import require_GET, require_POST
 3 | from django.http import HttpResponse
 4 | from django.conf import settings
 5 | from django.core.files import File
 6 | import logging
 7 | import subprocess
 8 | import random
 9 | import os
10 | import time
11 | logger = logging.getLogger('django')
12 | 
13 | 
def save_dir():
    """Create a fresh per-upload directory and return its ``CA_info`` folder.

    The directory is ``files/<timestamp>`` with second resolution. If that
    name is already taken, a random ``-<1..1000>`` suffix is appended and
    creation is retried until it succeeds. Creation itself is the existence
    check, which avoids the race in a separate ``os.path.exists`` test and the
    ``FileExistsError`` a suffix collision used to raise.

    Returns:
        Path of the newly created ``CA_info`` sub-directory.
    """
    local_time = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time()))
    base_dir = os.path.join(r'files', local_time)
    files_dir = base_dir
    while True:
        try:
            os.makedirs(files_dir)
            break
        except FileExistsError:
            # Name taken (same second or earlier collision): try a suffix.
            files_dir = base_dir + '-' + str(random.randint(1, 1000))
    saved_files_dir = os.path.join(files_dir, 'CA_info')
    os.makedirs(saved_files_dir)
    return saved_files_dir
26 | 
27 | # SAVED_FILES_DIR = save_dir()
28 | # files = os.listdir(SAVED_FILES_DIR)
29 | # for file in files:
30 | #     file_pathname = os.path.join(SAVED_FILES_DIR, file)
31 | #     os.unlink(file_pathname)
32 | 
33 | # Create your views here.
def render_home_template(request):
    """Render ``home.html`` for the given request."""
    template_name = 'home.html'
    return render(request, template_name)
37 | 
def render_home_template1(request):
    """Render ``download.html`` with the names found in ``SAVED_FILES_DIR``.

    Reads the module-level ``SAVED_FILES_DIR``, which ``upload`` assigns.
    """
    context = {'files': os.listdir(SAVED_FILES_DIR)}
    return render(request, 'download.html', context)
41 | 
def home(request):
    """Render ``home.html`` for the given request."""
    template_name = 'home.html'
    return render(request, template_name)
45 | 
46 | 
def download(request, filename):
    """Send one result file from the current ``backbone`` folder as a download.

    The folder is derived from the module-level ``SAVED_FILES_DIR`` (assigned
    by ``upload``) by replacing ``CA_info`` with ``backbone``.

    NOTE(review): ``SAVED_FILES_DIR`` is one process-wide global, so
    concurrent users share the directory of the most recent upload.

    Args:
        request: The incoming Django request.
        filename: Name of the file to send; it must be a bare file name.

    Returns:
        An ``HttpResponse`` carrying the file as an attachment.

    Raises:
        Http404: If nothing has been uploaded yet, if ``filename`` contains
            path components, or if the file does not exist.
    """
    # Local import keeps this block self-contained; django.http is already a
    # dependency of this module.
    from django.http import Http404

    saved_dir = globals().get('SAVED_FILES_DIR')
    if saved_dir is None:
        # No upload has happened in this process yet (used to be a NameError).
        raise Http404('File not found')

    # ``filename`` comes from the request: reject anything that is not a bare
    # name so it cannot escape the results folder (e.g. "../../settings.py").
    safe_name = os.path.basename(filename)
    if not safe_name or safe_name != filename:
        raise Http404('File not found')

    file_pathname = os.path.join(saved_dir.replace('CA_info', 'backbone'), safe_name)
    if not os.path.isfile(file_pathname):
        raise Http404('File not found')

    with open(file_pathname, 'rb') as f:
        file = File(f)

        # HttpResponse consumes the chunk iterator while the file is open.
        response = HttpResponse(file.chunks(),
                                content_type='APPLICATION/OCTET-STREAM')
        response['Content-Disposition'] = 'attachment; filename=' + safe_name
        response['Content-Length'] = os.path.getsize(file_pathname)
    return response
59 | 
60 | 
def upload(request):
    """Store the uploaded files, run the prediction script, list the results.

    Files posted under the form field ``filename`` are written into a fresh
    directory from ``save_dir()``. ``model.py`` is then run on that directory
    and its stderr is logged.

    Args:
        request: The incoming Django request.

    Returns:
        The home page when no file was submitted, otherwise the download page.
    """
    global SAVED_FILES_DIR
    files = request.FILES.getlist('filename')
    if not files:
        # Nothing to process: do not create an empty directory or repoint
        # SAVED_FILES_DIR away from the previous upload's results.
        return render_home_template(request)

    SAVED_FILES_DIR = save_dir()
    for file in files:
        # basename() guards against client-supplied path components.
        target = os.path.join(SAVED_FILES_DIR, os.path.basename(file.name))
        # The context manager closes the file even if a chunk write fails.
        with open(target, 'wb+') as destination:
            for chunk in file.chunks():
                destination.write(chunk)

    # Argument list without a shell: no quoting problems, no shell injection.
    command = ['python', 'D:/python/webserver/fileoperation/model.py', SAVED_FILES_DIR]
    child = subprocess.Popen(command,
                             stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = child.communicate()
    # Keep the script's stderr for the log. Undecodable bytes are replaced
    # rather than failing the whole request.
    result = stderr.decode('utf-8', errors='replace')
    logger.info(result)
    return render_home_template1(request)
82 | 
83 | 
def index(request):
    """Render ``index.html`` for the given request."""
    template_name = 'index.html'
    return render(request, template_name)
86 | 


--------------------------------------------------------------------------------
/webserver/pred_torsion.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | from torch.utils.data import DataLoader
 4 | import os
 5 | from torch.nn import functional as F
 6 | from torch.utils.data import Dataset
 7 | import numpy as np
 8 | import pathlib
 9 | 
class DistanceWindow(Dataset):
    """Dataset yielding one saved array per file in a directory."""

    def __init__(self, distance_window_path):
        """Remember the directory and list its files (in ``os.listdir`` order)."""
        self.distance_window_path = distance_window_path
        self.file_list = os.listdir(distance_window_path)

    def __len__(self):
        """Return the number of files in the directory."""
        return len(self.file_list)

    def __getitem__(self, idx):
        """Load file ``idx`` and return ``(arrays, torsions, filename)``.

        ``torsions`` is the array exactly as stored. ``arrays`` is the same
        data reshaped to ``(-1, 60)``. The file is read once (it used to be
        read twice), so ``arrays`` is a view onto ``torsions``.
        """
        filename = self.file_list[idx]
        torsions = np.load(os.path.join(self.distance_window_path, filename))
        arrays = torsions.reshape((-1, 60))
        # mix_arrays = np.concatenate((arrays[:-1], arrays[1:]), 1)
        print(arrays.shape)

        return arrays, torsions, filename
28 | 
29 | # torch.cuda.set_device(0)
30 | 
31 | 
# ``device`` was printed below without ever being defined, which raised
# NameError on any machine with CUDA. Define it with a CPU fallback.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

if torch.cuda.is_available():
    print('GPU available!!!')
    print('MainDevice=', device)
35 | 
36 | 
def swish_fn(x):
    """Swish activation: ``x`` scaled element-wise by ``sigmoid(x)``."""
    gate = torch.sigmoid(x)
    return x * gate
40 | 
41 | 
class SplitModel(nn.Module):
    """Feed-forward feature extractor followed by a bidirectional LSTM.

    Four ``Linear -> BatchNorm1d -> swish`` stages feed a bidirectional LSTM
    whose output is batch-normalised and passed through swish once more.
    ``output_dim`` is accepted but unused here: the two output heads that
    would consume it are not part of this version of the class.
    """

    def __init__(self, input_dim, hidden_dim, feature_dim, output_dim):
        super().__init__()
        wide_dim = 2 * hidden_dim

        # Layers are created in this fixed order with these attribute names,
        # so parameter initialisation and state_dict keys stay the same.
        self.hidden1 = nn.Linear(input_dim, hidden_dim)
        self._bn1 = nn.BatchNorm1d(hidden_dim)
        self.hidden2 = nn.Linear(hidden_dim, wide_dim)
        self._bn2 = nn.BatchNorm1d(wide_dim)
        self.hidden3 = nn.Linear(wide_dim, hidden_dim)
        self._bn3 = nn.BatchNorm1d(hidden_dim)
        self.extract_feature = nn.Linear(hidden_dim, feature_dim)
        self._bn4 = nn.BatchNorm1d(feature_dim)
        self.lstm = nn.LSTM(feature_dim, hidden_dim, bidirectional=True)
        self._bn5 = nn.BatchNorm1d(wide_dim)

    def forward(self, arrays):
        """Return the activated LSTM output, ``2 * hidden_dim`` wide per row."""
        stages = (
            (self.hidden1, self._bn1),
            (self.hidden2, self._bn2),
            (self.hidden3, self._bn3),
            (self.extract_feature, self._bn4),
        )
        activations = arrays
        for linear, norm in stages:
            activations = swish_fn(norm(linear(activations)))

        # The rows become one sequence with a singleton middle dimension.
        sequence = activations.view(len(activations), 1, -1)
        hidden, _ = self.lstm(sequence)
        return swish_fn(self._bn5(hidden.squeeze(1)))
90 | 
91 | 
92 | 
93 | 


--------------------------------------------------------------------------------
/Training models/batch_test.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | from npy_data_loader import DistanceWindow
  4 | from torch.utils.data import DataLoader
  5 | import os
  6 | from torch.nn import functional as F
  7 | import numpy as np
  8 | import pathlib
  9 | 
 10 | 
 11 | torch.cuda.set_device(0)
 12 | device = torch.device('cuda:0')
 13 | 
 14 | if torch.cuda.is_available():
 15 |     print('GPU available!!!')
 16 |     print('MainDevice=', device)
 17 | 
 18 | 
 19 | train_name = 'nr40_2_Split_L1_Drop05'
 20 | save_dir = './outputs/' + train_name
 21 | val_dir = os.path.join(save_dir, 'val')
 22 | 
 23 | is_cross_validation = True
 24 | cross_validation_fold = 10
 25 | 
 26 | 
 27 | test_dataset = DistanceWindow(
 28 |     distance_window_path='/share/Data/processed/test_set/distance_window',
 29 |     torsion_path='/share/Data/processed/test_set/bitorsion')
 30 | test_data_loader = DataLoader(dataset=test_dataset)
 31 | 
 32 | 
 33 | def swish_fn(x):
 34 |     """ Swish activation function """
 35 |     return x * torch.sigmoid(x)
 36 | 
 37 | 
class SplitModel(nn.Module):
    """Feed-forward feature extractor, bidirectional LSTM and two output heads.

    Four ``Linear -> swish -> BatchNorm1d`` stages feed a bidirectional LSTM.
    Its batch-normalised output goes through two parallel sub-networks, and
    their outputs are concatenated along dimension 1.
    """

    def __init__(self, input_dim, hidden_dim, feature_dim, output_dim):
        super().__init__()

        self.hidden1 = nn.Linear(input_dim, hidden_dim)
        self._bn1 = nn.BatchNorm1d(hidden_dim)

        self.hidden2 = nn.Linear(hidden_dim, 2*hidden_dim)
        self._bn2 = nn.BatchNorm1d(2*hidden_dim)

        self.hidden3 = nn.Linear(2*hidden_dim, hidden_dim)
        self._bn3 = nn.BatchNorm1d(hidden_dim)

        self.extract_feature = nn.Linear(hidden_dim, feature_dim)
        self._bn4 = nn.BatchNorm1d(feature_dim)

        # Bidirectional, so the LSTM output is 2 * hidden_dim wide.
        self.lstm = nn.LSTM(feature_dim, hidden_dim, bidirectional=True)
        self._bn5 = nn.BatchNorm1d(2 * hidden_dim)

        # First output head.
        self.sub_net1 = nn.Linear(2 * hidden_dim, hidden_dim)
        self.output1 = nn.Linear(hidden_dim, output_dim)

        # Second output head.
        self.sub_net2 = nn.Linear(2 * hidden_dim, hidden_dim)
        self.output2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, arrays):
        """Return both heads' outputs concatenated along dimension 1."""
        # Batch norm is applied after the activation in every stage.
        hidden1 = self._bn1(swish_fn(self.hidden1(arrays)))
        hidden2 = self._bn2(swish_fn(self.hidden2(hidden1)))
        hidden3 = self._bn3(swish_fn(self.hidden3(hidden2)))
        features = self._bn4(swish_fn(self.extract_feature(hidden3)))

        # The rows become one sequence with a singleton middle dimension.
        hidden, _ = self.lstm(features.view(len(features), 1, -1))
        hidden = self._bn5(hidden.squeeze(1))

        sub_hidden1 = self.sub_net1(hidden)
        sub_hidden1 = F.dropout(sub_hidden1, p=0.5, training=self.training)
        output1 = self.output1(sub_hidden1)

        sub_hidden2 = self.sub_net2(hidden)
        sub_hidden2 = F.dropout(sub_hidden2, p=0.5, training=self.training)
        # NOTE(review): this uses ``self.output1`` although ``self.output2``
        # exists and is otherwise unused - looks like a copy-paste slip.
        # Left unchanged because saved checkpoints may have been trained with
        # this exact forward pass; confirm before switching to ``output2``.
        output2 = self.output1(sub_hidden2)

        output = torch.cat([output1, output2], 1)
        return output
 82 | 
 83 | 
 84 | def test(model, data_loader):
 85 |     model.eval()
 86 |     model.is_training = False
 87 |     with torch.no_grad():
 88 |         for arrays, torsions, output_filename in data_loader:
 89 |             torsions = torsions.to(device)
 90 |             arrays = arrays.to(device)
 91 |             pred_sincos = model(arrays[0]).squeeze(1).transpose(0, 1)
 92 | 
 93 |             output = np.concatenate((pred_sincos.data.cpu().numpy(), torsions.data.cpu().numpy()[0]), 0)
 94 |             np.save(os.path.join(test_output_folder, output_filename[0]), output)
 95 | 
 96 | 
 97 | if __name__ == '__main__':
 98 |     models_path = os.path.join(os.getcwd(), 'top_models')
 99 |     for model_name in os.listdir(models_path):
100 |         if model_name[-4:] == '.pth':
101 |             print(model_name)
102 |             # test_output_folder = os.path.join(models_path, 'test_outputs/%s' % model_name[:-4])
103 |             test_output_folder = os.path.join(os.getcwd(), 'comparison_test_outputs/%s' % model_name[:-4])
104 |             pathlib.Path(test_output_folder).mkdir(parents=True, exist_ok=True)
105 |             test_model = torch.load(os.path.join(models_path, model_name)).to(device)
106 | 
107 |             test(test_model, test_data_loader)
108 | 


--------------------------------------------------------------------------------
/extraction.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import math
  3 | import os
  4 | from numpy import *
  5 | from scipy.spatial.distance import pdist
  6 | from scipy.spatial.distance import squareform
  7 | import pathlib
  8 | 
  9 | 
 10 | ALPHABET = {'A': 'ALA', 'F': 'PHE', 'C': 'CYS', 'D': 'ASP', 'N': 'ASN',
 11 |             'E': 'GLU', 'Q': 'GLN', 'G': 'GLY', 'H': 'HIS', 'L': 'LEU',
 12 |             'I': 'ILE', 'K': 'LYS', 'M': 'MET', 'P': 'PRO', 'R': 'ARG',
 13 |             'S': 'SER', 'T': 'THR', 'V': 'VAL', 'W': 'TRP', 'Y': 'TYR'}
 14 | AA_HYDROPATHICITY_INDEX = {'ARG': -4.5, 'LYS': -3.9, 'ASN': -3.5, 'ASP': -3.5, 'GLN': -3.5,
 15 |                            'GLU': -3.5, 'HIS': -3.2, 'PRO': -1.6, 'TYR': -1.3, 'TRP': -0.9,
 16 |                            'SER': -0.8, 'THR': -0.7, 'GLY': -0.4, 'ALA': 1.8, 'MET': 1.9,
 17 |                            'CYS': 2.5, 'PHE': 2.8, 'LEU': 3.8, 'VAL': 4.2, 'ILE': 4.5}
 18 | AA_BULKINESS_INDEX = {'ARG': 14.28, 'LYS': 15.71, 'ASN': 12.82, 'ASP': 11.68, 'GLN': 14.45,
 19 |                       'GLU': 13.57, 'HIS': 13.69,  'PRO': 17.43, 'TYR': 18.03, 'TRP': 21.67,
 20 |                       'SER': 9.47, 'THR': 15.77, 'GLY': 3.4, 'ALA': 11.5, 'MET': 16.25,
 21 |                       'CYS': 13.46, 'PHE': 19.8, 'LEU': 21.4, 'VAL': 21.57, 'ILE': 21.4}
 22 | AA_FLEXIBILITY_INDEX = {'ARG': 2.6, 'LYS': 1.9, 'ASN': 14., 'ASP': 12., 'GLN': 4.8,
 23 |                         'GLU': 5.4, 'HIS': 4., 'PRO': 0.05, 'TYR': 0.05, 'TRP': 0.05,
 24 |                         'SER': 19., 'THR': 9.3, 'GLY': 23., 'ALA': 14., 'MET': 0.05,
 25 |                         'CYS': 0.05, 'PHE': 7.5, 'LEU': 5.1, 'VAL': 2.6, 'ILE': 1.6}
 26 | AA_MESSAGE = {}
 27 | for aa_short in ALPHABET.keys():
 28 |     aa_long = ALPHABET[aa_short]
 29 |     AA_MESSAGE.update({aa_short: [(5.5 - AA_HYDROPATHICITY_INDEX[aa_long]) / 10,
 30 |                                   AA_BULKINESS_INDEX[aa_long] / 21.67,
 31 |                                   (25. - AA_FLEXIBILITY_INDEX[aa_long]) / 25.]})
 32 |     AA_MESSAGE.update({aa_long: [(5.5 - AA_HYDROPATHICITY_INDEX[aa_long]) / 10,
 33 |                                  AA_BULKINESS_INDEX[aa_long] / 21.67,
 34 |                                  (25. - AA_FLEXIBILITY_INDEX[aa_long]) / 25.]})
 35 | DISTANCE_WINDOW_PATH = 'D:\\database\\rmsd_compare\\real'
 36 | # filename = input()
 37 | # path = os.path.join(os.getcwd(),filename)
 38 | path = 'D:\\database\\rmsd_compare\\real\\4f7v.pdb'
 39 | 
 40 | #提取CA原子信息
 41 | def atoms_infos(path):
 42 |     file = open(path, 'r')
 43 |     lines = file.readlines()
 44 | 
 45 |     atoms_info = [line.strip('\n') for line in lines if line.split()[0] == 'ATOM' and line.split()[2] == 'CA']
 46 |     delet = []
 47 |     # 筛掉重复概率小的氨基酸
 48 |     for i in range(len(atoms_info)):
 49 |         if atoms_info[i - 1].split()[2] == atoms_info[i].split()[2] and atoms_info[i - 1].split()[5] == atoms_info[i].split()[5]:
 50 |             if atoms_info[i - 1].split()[-3] <= atoms_info[i].split()[-3]:
 51 |                 delet.append(i - 1)
 52 |             else:
 53 |                 delet.append(i)
 54 |     for i in delet[::-1]:
 55 |         del atoms_info[i]
 56 |     # atoms_info = array(atoms_info)
 57 |     return atoms_info
 58 | 
# Open question: should gaps be filled in when the chain is broken?
 60 | 
 61 | #提取坐标信息
 62 | def extract_coord(atoms_info):
 63 |     coord_array = np.zeros((len(atoms_info), 3))
 64 |     acid_list = []
 65 |     for i in range(len(atoms_info)):
 66 |         coord_array[i] = [float(atoms_info[i].split()[j]) for j in range(6, 9)]
 67 |         acid_list.append(atoms_info[i].split()[3][-3::])
 68 |     acid_array = array(acid_list)
 69 |     return coord_array, acid_array
 70 | 
 71 | 
def torsion():
    """Fill ``torsion_training`` with angles computed from sine/cosine pairs.

    NOTE(review): ``torsion_sin``, ``torsion_cos`` and ``torsion_training``
    are read as globals but are not defined in the visible part of this file.
    Calling this as-is would raise ``NameError`` - confirm where they are
    meant to come from.
    """
    for n in range(len(torsion_sin)):
        # atan2(sin, cos) gives the angle in radians.
        torsion_training[n] = math.atan2(torsion_sin[n], torsion_cos[n])
 75 | 
 76 | def distance_window(coord_array, acid_array):
 77 |     WINDOW_SIZE = 15
 78 |     distCA = pdist(coord_array, metric='euclidean')
 79 |     distCA = squareform(distCA).astype('float32')
 80 |     save_name = 'out.npy'
 81 |     mark_type = [('distance', float), ('aa', 'S10')]
 82 |     dist_windows = []
 83 | 
 84 |     for i in range(len(distCA)):
 85 |         marked_array = []
 86 |         new_array = []
 87 |         for j in range(len(distCA[i])):
 88 |             marked_array.append((distCA[i, j], acid_array[j]))
 89 |         marked_array = np.array(marked_array, dtype=mark_type)
 90 |         marked_array = np.sort(marked_array, order='distance')[:WINDOW_SIZE]
 91 |         for j in range(len(marked_array)):
 92 |             aa = marked_array[j][1].decode('utf-8')
 93 |             new_array.append([marked_array[j][0]] + AA_MESSAGE[aa])
 94 |         dist_windows.append(new_array)
 95 |     dist_windows = np.array(dist_windows).astype('float32')
 96 | 
 97 |     np.save(os.path.join(DISTANCE_WINDOW_PATH, save_name), dist_windows)
 98 |     print('successful')
 99 | 
if __name__ == "__main__":
    # Pipeline: CA lines -> coordinates and residue names -> saved windows.
    distance_window(*extract_coord(atoms_infos(path)))
104 | 


--------------------------------------------------------------------------------
/Training models/distance_map.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import numpy as np
  3 | from scipy.spatial.distance import pdist
  4 | from scipy.spatial.distance import squareform
  5 | import pathlib
  6 | 
  7 | 
  8 | ALPHABET = {'A': 'ALA', 'F': 'PHE', 'C': 'CYS', 'D': 'ASP', 'N': 'ASN',
  9 |             'E': 'GLU', 'Q': 'GLN', 'G': 'GLY', 'H': 'HIS', 'L': 'LEU',
 10 |             'I': 'ILE', 'K': 'LYS', 'M': 'MET', 'P': 'PRO', 'R': 'ARG',
 11 |             'S': 'SER', 'T': 'THR', 'V': 'VAL', 'W': 'TRP', 'Y': 'TYR'}
 12 | AA_HYDROPATHICITY_INDEX = {'ARG': -4.5, 'LYS': -3.9, 'ASN': -3.5, 'ASP': -3.5, 'GLN': -3.5,
 13 |                            'GLU': -3.5, 'HIS': -3.2, 'PRO': -1.6, 'TYR': -1.3, 'TRP': -0.9,
 14 |                            'SER': -0.8, 'THR': -0.7, 'GLY': -0.4, 'ALA': 1.8, 'MET': 1.9,
 15 |                            'CYS': 2.5, 'PHE': 2.8, 'LEU': 3.8, 'VAL': 4.2, 'ILE': 4.5}
 16 | AA_BULKINESS_INDEX = {'ARG': 14.28, 'LYS': 15.71, 'ASN': 12.82, 'ASP': 11.68, 'GLN': 14.45,
 17 |                       'GLU': 13.57, 'HIS': 13.69,  'PRO': 17.43, 'TYR': 18.03, 'TRP': 21.67,
 18 |                       'SER': 9.47, 'THR': 15.77, 'GLY': 3.4, 'ALA': 11.5, 'MET': 16.25,
 19 |                       'CYS': 13.46, 'PHE': 19.8, 'LEU': 21.4, 'VAL': 21.57, 'ILE': 21.4}
 20 | AA_FLEXIBILITY_INDEX = {'ARG': 2.6, 'LYS': 1.9, 'ASN': 14., 'ASP': 12., 'GLN': 4.8,
 21 |                         'GLU': 5.4, 'HIS': 4., 'PRO': 0.05, 'TYR': 0.05, 'TRP': 0.05,
 22 |                         'SER': 19., 'THR': 9.3, 'GLY': 23., 'ALA': 14., 'MET': 0.05,
 23 |                         'CYS': 0.05, 'PHE': 7.5, 'LEU': 5.1, 'VAL': 2.6, 'ILE': 1.6}
 24 | AA_MESSAGE = {}
 25 | for aa_short in ALPHABET.keys():
 26 |     aa_long = ALPHABET[aa_short]
 27 |     AA_MESSAGE.update({aa_short: [(5.5 - AA_HYDROPATHICITY_INDEX[aa_long]) / 10,
 28 |                                   AA_BULKINESS_INDEX[aa_long] / 21.67,
 29 |                                   (25. - AA_FLEXIBILITY_INDEX[aa_long]) / 25.]})
 30 |     AA_MESSAGE.update({aa_long: [(5.5 - AA_HYDROPATHICITY_INDEX[aa_long]) / 10,
 31 |                                  AA_BULKINESS_INDEX[aa_long] / 21.67,
 32 |                                  (25. - AA_FLEXIBILITY_INDEX[aa_long]) / 25.]})
 33 | 
 34 | 
 35 | def extract_pn(path, filename):
 36 |     with open(os.path.join(path, filename), 'r') as file:
 37 |         message = file.readlines()
 38 |     ca_coos = []
 39 |     seq_array = []
 40 |     seq = message[3][:-1]
 41 |     x = message[27][:-1].split('\t')
 42 |     y = message[28][:-1].split('\t')
 43 |     z = message[29][:-1].split('\t')
 44 |     mask = message[31][:-1]
 45 |     for i in range(len(mask)):
 46 |         if mask[i] == '+':
 47 |             ca_coos.append([float(x[3 * i + 1]) / 100., float(y[3 * i + 1]) / 100., float(z[3 * i + 1]) / 100.])
 48 |             aa = seq[i]
 49 |             seq_array.append(aa)
 50 |     ca_coos = np.array(ca_coos)
 51 |     seq_array = np.array(seq_array)
 52 |     return ca_coos, seq_array
 53 | 
 54 | 
 55 | def extract_cif(path, filename):
 56 |     with open(os.path.join(path, filename), 'r') as file:
 57 |         message = file.readlines()
 58 |     ca_coos = []
 59 |     seq_array = []
 60 |     # for line in message[1::3]:
 61 |     for line in message:
 62 |         line = line.split()
 63 |         if line[3] == 'CA':
 64 |             x = line[10]
 65 |             y = line[11]
 66 |             z = line[12]
 67 |             ca_coos.append([float(x), float(y) , float(z)])
 68 |             aa = line[5]
 69 |             seq_array.append(aa)
 70 |     ca_coos = np.array(ca_coos)
 71 |     seq_array = np.array(seq_array)
 72 |     return ca_coos, seq_array
 73 | 
 74 | 
 75 | dataset_name = 'test_set'
 76 | 
 77 | DATA_PATH = 'D:\protein_structure_prediction\data\dataset/test_set_atom_text'
 78 | DISTANCE_MAP_PATH = 'D:\protein_structure_prediction\data\dataset/processed_data/%s/distance_map' % dataset_name
 79 | DISTANCE_WINDOW_PATH = 'D:\protein_structure_prediction\data\dataset/processed_data/%s/distance_window' % dataset_name
 80 | 
 81 | pathlib.Path(DISTANCE_MAP_PATH).mkdir(parents=True, exist_ok=True)
 82 | pathlib.Path(DISTANCE_WINDOW_PATH).mkdir(parents=True, exist_ok=True)
 83 | 
 84 | 
 85 | failed_filename = []
 86 | 
 87 | for filename in ['4FBR.npy']:
 88 |     filename = filename.replace('.npy', '.cif')
 89 |     print(filename)
 90 | 
 91 |     ca_coo_test, seq_test = extract_cif(DATA_PATH, filename)
 92 |     
 93 | def distance_window(coord_array):
 94 |     WINDOW_SIZE = 15
 95 |     distCA = pdist(ca_coo_test, metric='euclidean')
 96 |     distCA = squareform(distCA).astype('float32')
 97 | 
 98 |     save_name = filename.replace('.cif', '.npy')
 99 |     np.save(os.path.join(DISTANCE_MAP_PATH, save_name), distCA)
100 | 
101 |     mark_type = [('distance', float), ('aa', 'S10')]
102 |     dist_windows = []
103 |     for i in range(len(distCA)):
104 |         marked_array = []
105 |         new_array = []
106 |         for j in range(len(distCA[i])):
107 |             marked_array.append((distCA[i, j], seq_test[j]))
108 |         marked_array = np.array(marked_array, dtype=mark_type)
109 |         marked_array = np.sort(marked_array, order='distance')[:WINDOW_SIZE]
110 |         for j in range(len(marked_array)):
111 |             aa = marked_array[j][1].decode('utf-8')
112 |             new_array.append([marked_array[j][0]] + AA_MESSAGE[aa])
113 |         dist_windows.append(new_array)
114 |     dist_windows = np.array(dist_windows).astype('float32')
115 |     
116 |     np.save(os.path.join(DISTANCE_WINDOW_PATH, save_name), dist_windows)
117 |     
118 | 
119 | 
120 | 
121 | 
122 | 


--------------------------------------------------------------------------------
/Training models/batch_validation.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | from npy_data_loader import DistanceWindow
  4 | from torch.utils.data import DataLoader
  5 | import os
  6 | from torch.nn import functional as F
  7 | import math
  8 | 
  9 | 
 10 | torch.cuda.set_device(0)
 11 | device = torch.device('cuda:0')
 12 | 
 13 | batch_size = 1
 14 | loss_function_2 = nn.MSELoss()
 15 | val_epoch = 200
 16 | 
 17 | 
 18 | if torch.cuda.is_available():
 19 |     print('GPU available!!!')
 20 |     print('MainDevice=', device)
 21 | 
 22 | 
 23 | train_name = 'nr40_Split_L1_Drop05'
 24 | save_dir = './outputs/' + train_name
 25 | val_dir = os.path.join(save_dir, 'val')
 26 | 
 27 | is_cross_validation = True
 28 | cross_validation_fold = 10
 29 | 
 30 | 
 31 | def swish_fn(x):
 32 |     """ Swish activation function """
 33 |     return x * torch.sigmoid(x)
 34 | 
 35 | 
 36 | class SplitModel(nn.Module):
 37 |     def __init__(self, input_dim, hidden_dim, feature_dim, output_dim):
 38 |         super().__init__()
 39 | 
 40 |         self.hidden1 = nn.Linear(input_dim, hidden_dim)
 41 |         self._bn1 = nn.BatchNorm1d(hidden_dim)
 42 | 
 43 |         self.hidden2 = nn.Linear(hidden_dim, 2*hidden_dim)
 44 |         self._bn2 = nn.BatchNorm1d(2*hidden_dim)
 45 | 
 46 |         self.hidden3 = nn.Linear(2*hidden_dim, hidden_dim)
 47 |         self._bn3 = nn.BatchNorm1d(hidden_dim)
 48 | 
 49 |         self.extract_feature = nn.Linear(hidden_dim, feature_dim)
 50 |         self._bn4 = nn.BatchNorm1d(feature_dim)
 51 | 
 52 |         self.lstm = nn.LSTM(feature_dim, hidden_dim, bidirectional=True)
 53 |         self._bn5 = nn.BatchNorm1d(2 * hidden_dim)
 54 | 
 55 |         self.sub_net1 = nn.Linear(2 * hidden_dim, hidden_dim)
 56 |         self.output1 = nn.Linear(hidden_dim, output_dim)
 57 | 
 58 |         self.sub_net2 = nn.Linear(2 * hidden_dim, hidden_dim)
 59 |         self.output2 = nn.Linear(hidden_dim, output_dim)
 60 | 
 61 |     def forward(self, arrays):
 62 |         hidden1 = self._bn1(swish_fn(self.hidden1(arrays)))
 63 |         hidden2 = self._bn2(swish_fn(self.hidden2(hidden1)))
 64 |         hidden3 = self._bn3(swish_fn(self.hidden3(hidden2)))
 65 |         features = self._bn4(swish_fn(self.extract_feature(hidden3)))
 66 | 
 67 |         hidden, _ = self.lstm(features.view(len(features), 1, -1))
 68 |         hidden = self._bn5(hidden.squeeze(1))
 69 | 
 70 |         sub_hidden1 = self.sub_net1(hidden)
 71 |         sub_hidden1 = F.dropout(sub_hidden1, p=0.5, training=self.training)
 72 |         output1 = self.output1(sub_hidden1)
 73 | 
 74 |         sub_hidden2 = self.sub_net2(hidden)
 75 |         sub_hidden2 = F.dropout(sub_hidden2, p=0.5, training=self.training)
 76 |         output2 = self.output1(sub_hidden2)
 77 | 
 78 |         output = torch.cat([output1, output2], 1)
 79 |         return output
 80 | 
 81 | 
 82 | def validation(model, data_loader):
 83 |     model.eval()
 84 |     model.is_training = False
 85 |     with torch.no_grad():
 86 |         loss_sum = 0
 87 | 
 88 |         for arrays, torsions, output_filename in data_loader:
 89 |             torsions = torsions.to(device)
 90 |             arrays = arrays.to(device)
 91 |             sincos = torsions[0][2:]
 92 |             pred_sincos = model(arrays[0]).squeeze(1).transpose(0, 1)
 93 | 
 94 |             inner_error = (sincos[:2] - sincos[2:]).abs()
 95 |             weight = torch.pow(math.e, -inner_error)
 96 |             loss = torch.add(
 97 |                 torch.add(
 98 |                     (torch.pow((pred_sincos[:2] - sincos[:2]).abs() + 1e-10, weight)).mean(),
 99 |                     (torch.pow((pred_sincos[2:] - sincos[2:]).abs() + 1e-10, weight)).mean()),
100 |                 torch.sqrt(loss_function_2(pred_sincos[:2], pred_sincos[2:])))
101 | 
102 |             loss_sum += float(loss)
103 |         return loss_sum
104 | 
105 | 
def main():
    """Validate every saved epoch checkpoint of every cross-validation fold.

    Does nothing unless ``is_cross_validation`` is true. For each fold the
    validation subset is loaded, each ``<epoch>_Linear.pth`` checkpoint is
    evaluated with :func:`validation`, and the mean loss per epoch is
    written to ``./outputs/<train_name>_<fold>/validation_map.txt`` and
    printed.

    The report file is managed by a ``with`` block so it is closed even if
    loading or evaluating a checkpoint raises.
    """
    if not is_cross_validation:
        return

    for subset_index in range(cross_validation_fold):
        val_dataset = DistanceWindow(
            distance_window_path='/share/Data/processed/cif_190917/distance_window/',
            torsion_path='/share/Data/processed/nr40/10fold_val_subset/subset_%d/val' % subset_index)
        val_loader = DataLoader(dataset=val_dataset, pin_memory=True)

        with open('./outputs/%s_%d/validation_map.txt' % (train_name, subset_index), 'w') as writer:
            for epoch in range(val_epoch):
                writer.write('epoch %d\n' % epoch)
                # NOTE(review): torch.load unpickles a whole model object; only
                # load checkpoints produced by this project.
                val_model = torch.load('./outputs/%s_%d/%d_Linear.pth' % (train_name, subset_index, epoch)).to(device)

                loss_sum = validation(val_model, val_loader)
                mean_loss = loss_sum / len(val_dataset)

                writer.write('mean_val_loss=%f\n\n' % mean_loss)
                print('epoch %d, mean_val_loss=%f\n' % (epoch, mean_loss))
127 | 
128 | 
def collect_result():
    """Rewrite each fold's ``val_loss.txt`` as a ``validation_map.txt`` report.

    For every fold, the first line of ``val_loss.txt`` that starts with
    ``'v'`` marks the first result line; result lines then repeat every
    ``index_of_first_result + 3`` lines. For each result line the text after
    the first ``'='`` (including its line ending) is written to
    ``validation_map.txt`` as::

        epoch <n>
        mean_val_loss=<value>

    Changes from the earlier version:

    * the input is read before the output is opened, so a missing
      ``val_loss.txt`` no longer leaves a truncated ``validation_map.txt``;
    * both files are closed via ``with``;
    * a fold with no result line raises ``ValueError`` instead of a
      ``NameError`` or silently reusing the previous fold's offsets;
    * the printed epoch now matches the epoch written to the file
      (it used to be one higher).

    Raises:
        ValueError: If a ``val_loss.txt`` contains no line starting with 'v'.
    """
    for subset_index in range(cross_validation_fold):
        subset_dir = './outputs/%s_%d' % (train_name, subset_index)

        with open(subset_dir + '/val_loss.txt', 'r') as file:
            lines = file.readlines()

        result_index = next(
            (i for i, line in enumerate(lines) if line.startswith('v')), None)
        if result_index is None:
            raise ValueError(
                "no result line starting with 'v' in %s/val_loss.txt" % subset_dir)
        # Stride between consecutive result lines, derived from the first one.
        epoch_len = result_index + 3

        with open(subset_dir + '/validation_map.txt', 'w') as writer:
            for epoch, result_line in enumerate(lines[result_index::epoch_len]):
                mean_loss = result_line.split('=')[1]
                writer.write('epoch %d\n' % epoch)
                writer.write('mean_val_loss=%s\n\n' % mean_loss)
                print('subset_index %d, epoch %d, mean_val_loss=%s\n' % (subset_index, epoch, mean_loss))
147 | 
148 | 
if __name__ == '__main__':
    # Script entry point. The checkpoint re-validation pass (main) is
    # currently disabled; only the val_loss.txt -> validation_map.txt
    # conversion is run.
    # main()
    collect_result()
152 | 
153 | 


--------------------------------------------------------------------------------
/webserver/rebulid.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import math
  3 | import os
  4 | from scipy.spatial.distance import pdist
  5 | from scipy.spatial.distance import squareform
  6 | import pathlib
  7 | import matplotlib.pyplot as plt
  8 | 
# Geometry constants for placing backbone atoms relative to a CA -> CA bond.
# In backbone_rebuild_separated_torsion the L* values scale the CA -> CA unit
# vector (offset along the bond) and the R* values scale the rotated
# perpendicular unit vector (offset away from the bond).
# NOTE(review): units are presumably Angstrom — confirm.
# L1_C / L1_N are not referenced elsewhere in this file.
L1_C = 0.5511235634596036
L1_N = 0.5275157666844377
# trans: used when the CA-CA distance is greater than 3.4
L2_C_trans = 1.4281242923706199
R_C_trans = 0.5298886988235514
L2_N_trans = 1.4076846053568244
R_N_trans = 0.3797594132360668
L_O_trans = 1.669968615090273
R_O_trans = 1.735878468087069
# cis: used when the CA-CA distance is 3.4 or less
L2_C_cis = 0.7914339670632375
R_C_cis = 1.309401495255961
L2_N_cis = 0.7973937679940248
R_N_cis = 1.2349344835918588
L_O_cis = 0.17424337647795887
R_O_cis = 2.384890116717385
 25 | 
 26 | 
 27 | class Coordinate(object):
 28 |     def __init__(self, coo):
 29 |         self.coo = coo
 30 |         self.x = self.coo[0]
 31 |         self.y = self.coo[1]
 32 |         self.z = self.coo[2]
 33 |         self.len = np.linalg.norm(self.coo)
 34 |         if self.len != 0:
 35 |             self.orient = self.coo/self.len
 36 | 
 37 | 
 38 | def vec(a, b):
 39 |     return Coordinate(b.coo - a.coo)
 40 | 
 41 | 
 42 | def get_coo(line):
 43 |     items = line.split()
 44 |     x = float(items[10])
 45 |     y = float(items[11])
 46 |     z = float(items[12])
 47 |     return Coordinate(np.array([x, y, z]))
 48 | 
 49 | 
 50 | def get_coos(lines):
 51 |     atom_coos = []
 52 |     for line in lines:
 53 |         atom_coos.append(get_coo(line))
 54 |     return atom_coos
 55 | 
 56 | 
 57 | def read_pn(lines):
 58 |     x = lines[27].split('\t')
 59 |     y = lines[28].split('\t')
 60 |     z = lines[29].split('\t')
 61 |     mask = lines[31]
 62 |     atoms_coo = []
 63 |     for i in range(len(mask) * 3):
 64 |         if mask[i // 3] == '+':
 65 |             atoms_coo.append(Coordinate(np.array([float(x[i]) / 100., float(y[i]) / 100., float(z[i]) / 100.])))
 66 |             if atoms_coo[-1].len == 0:
 67 |                 return None
 68 |     return atoms_coo
 69 | 
 70 | 
 71 | def get_cos(cb, cd):
 72 |     return np.dot(cb.coo, cd.coo)/(cb.len * cd.len)
 73 | 
 74 | 
 75 | def get_angle(cb, cd):
 76 |     return math.acos(get_cos(cb, cd))
 77 | 
 78 | 
 79 | def angle_norm(angle):
 80 |     return math.atan2(math.sin(angle), math.cos(angle))
 81 | 
 82 | 
 83 | def batch_angle_norm(array):
 84 |     return np.arctan2(np.sin(array), np.cos(array))
 85 | 
 86 | 
 87 | def get_projection(vector, axis):
 88 |     return Coordinate(vector.len * get_cos(vector, axis) * axis.orient)
 89 | 
 90 | 
 91 | def get_sign(vector, axis):
 92 |     return Coordinate(vector.coo - get_projection(vector, axis).coo)
 93 | 
 94 | 
 95 | # 计算以axis为轴，向量A到向量B的旋转角
 96 | def get_torsion(vector_A, vector_B, axis):
 97 |     N = Coordinate(np.cross(axis, vector_B))
 98 |     N_1 = Coordinate(np.cross(vector_A, axis))
 99 |     torsion = np.sign(np.dot(vector_A, N.orient)) * math.acos(np.dot(N_1.orient, N.orient))
100 |     return torsion
101 | 
102 | 
def distance_martix(coordinates):
    """Return the square matrix of pairwise Euclidean distances between rows."""
    condensed = pdist(coordinates, metric='euclidean')
    return squareform(condensed)
105 | 
106 | 
# Compute the angle to the axis and the basis used for the coordinate transform.
def torsion_m(vector_A, axis):
    """Return a 3x3 basis built from ``axis`` and ``vector_A``, plus their angle.

    Returns:
        Tuple ``(m_weight, angle)`` where the rows of ``m_weight`` are
        ``axis``, ``normal x axis`` and ``normal`` (``normal`` being the
        unit vector along ``vector_A x axis``), and ``angle`` is
        ``acos(axis . vector_A)``.
    """
    # Unit normal of the plane spanned by vector_A and axis.
    normal = Coordinate(np.cross(vector_A, axis)).orient
    # Rows form the rotated basis vectors.
    in_plane = np.cross(normal, axis)
    m_weight = np.array([axis, in_plane, normal])
    angle = math.acos(np.dot(axis, vector_A))
    return m_weight, angle
115 | 
116 | 
# Given a vector, a rotation axis and a rotation angle, compute the rotated vector.
def rotation(vector_A, axis, torsion):
    """Return ``vector_A`` rotated about ``axis`` by ``torsion`` radians.

    The result is expressed through the basis from ``torsion_m``: its
    components are ``cos(pi - angle)`` along the axis, and
    ``sin(pi - angle)`` split by ``cos(torsion)`` / ``sin(torsion)`` over
    the two perpendicular basis vectors.
    """
    basis, angle = torsion_m(vector_A, axis)
    supplement = math.pi - angle
    radial = math.sin(supplement)
    components = [math.cos(supplement),
                  radial * math.cos(torsion),
                  radial * math.sin(torsion)]

    # Map the components back to the original coordinate frame.
    return np.dot(basis.T, components)
127 | 
128 | 
def _place_peptide_atoms(start, end, bond, initial_orient, torsion_C, torsion_N):
    """Place C and O after ``start`` and N before ``end`` along one CA -> CA bond.

    Returns the list ``[C, O, N, end.coo]``. The trans constants are used
    when the bond is longer than 3.4 (and the N torsion is shifted by -pi),
    otherwise the cis constants.
    """
    axis = bond.orient
    if bond.len > 3.4:
        L2_C, L2_N, L_O, R_C, R_N, R_O = L2_C_trans, L2_N_trans, L_O_trans, R_C_trans, R_N_trans, R_O_trans
        torsion_N = angle_norm(torsion_N - math.pi)
    else:
        L2_C, L2_N, L_O, R_C, R_N, R_O = L2_C_cis, L2_N_cis, L_O_cis, R_C_cis, R_N_cis, R_O_cis

    # C and O share the same rotated direction; N has its own torsion.
    radial_C = rotation(initial_orient, axis, torsion_C)
    radial_N = rotation(initial_orient, axis, torsion_N)

    atom_C = start.coo + L2_C * axis + R_C * radial_C
    atom_O = start.coo + L_O * axis + R_O * radial_C
    atom_N = end.coo - L2_N * axis + R_N * radial_N
    return [atom_C, atom_O, atom_N, end.coo]


def backbone_rebuild_separated_torsion(coos, torsions_C, torsions_N):
    """Rebuild C, O and N atoms between consecutive CA atoms from torsion angles.

    Args:
        coos: Sequence of Coordinate objects, CA atoms only (at least 3).
        torsions_C: Rotation angles for the C/O placement; entry ``k`` is
            used for the bond ``coos[k] -> coos[k + 1]`` and entry ``-1``
            for the last bond.
        torsions_N: Rotation angles for the N placement, indexed likewise.

    Returns:
        numpy array whose rows are the first CA followed, for every CA -> CA
        bond, by ``C, O, N, CA`` (``4 * len(coos) - 3`` rows).
    """
    # coos: coordinates of CA only
    output_coos = [coos[0].coo]

    # Interior bonds: the reference direction is the part of the NEXT bond
    # perpendicular to the current one.
    for k in range(len(coos) - 2):
        CA1, CA2, CA3 = coos[k], coos[k + 1], coos[k + 2]
        bond = vec(CA1, CA2)
        initial_orient = get_sign(vec(CA2, CA3), bond).orient
        output_coos += _place_peptide_atoms(CA1, CA2, bond, initial_orient,
                                            torsions_C[k], torsions_N[k])

    # Last bond: no following CA exists, so the reference direction comes
    # from the PREVIOUS bond (reversed).
    CA1, CA2, CA3 = coos[-3], coos[-2], coos[-1]
    bond = vec(CA2, CA3)
    initial_orient = get_sign(vec(CA2, CA1), bond).orient
    output_coos += _place_peptide_atoms(CA2, CA3, bond, initial_orient,
                                        torsions_C[-1], torsions_N[-1])

    return np.array(output_coos)
177 | 


--------------------------------------------------------------------------------
/Training models/make_dataset.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*
  2 | import os
  3 | import time
  4 | import pathlib
  5 | import argparse
  6 | from process import Process
  7 | 
  8 | parser = argparse.ArgumentParser(description='manual to this script')
  9 | parser.add_argument('--resolution', type=int, default='3000',
 10 |                     help='output resolution')
 11 | parser.add_argument('--dataset_path', type=str, default='D:\protein_structure_prediction\data\dataset/casp12_sep',
 12 |                     help='path of dataset')
 13 | parser.add_argument('--output_path', type=str, default='D:\protein_structure_prediction\data\dataset\processed_data',
 14 |                     help='path of output')
 15 | parser.add_argument('--dataset', type=str, default='training_95',
 16 |                     help='name of dataset folder, training_95|bc-30-1_CA|bc-30-1_chains|cif_filtered|cif_fragment|nr90')
 17 | parser.add_argument('--input_type', type=str, default='pn',
 18 |                     help='type of input file, pn|cif|pdb')
 19 | parser.add_argument('--output_type', type=str, default='image',
 20 |                     help='output_format, images|distance_map|relocated_coordinate')
 21 | parser.add_argument('--axis_range', type=int, default='100',
 22 |                     help='map range of structures, 42|64|84')
 23 | parser.add_argument('--multi_process', type=bool, default=False,
 24 |                     help='multi process or not')
 25 | parser.add_argument('--multi_atom', type=bool, default=False,
 26 |                     help='input all backbone atoms or CA only')
 27 | parser.add_argument('--self_norm_ser_num', type=bool, default=False,
 28 |                     help='self normalized serial number')
 29 | parser.add_argument('--draw_connection', type=bool, default=True,
 30 |                     help='draw dots connection or not')
 31 | parser.add_argument('--crop', type=bool, default=True,
 32 |                     help='crop image before output')
 33 | parser.add_argument('--aminoacid_message', type=bool, default=True,
 34 |                     help='mark amino acid with hydropathicity, bulkiness and flexibility or 1.')
 35 | parser.add_argument('--z_norm', type=float, default=64.,
 36 |                     help='normalize range of z value')
 37 | parser.add_argument('--pairs_data', action='store_true', default=False,
 38 |                     help='pairs_data')
 39 | parser.add_argument('--test', action='store_true', default=True,
 40 |                     help='test mode')
 41 | parser.add_argument('--filenames_list', type=str, default='validation_len_under_200.txt',
 42 |                     help='read input filenames in list')
 43 | parser.add_argument('--sliding_window', action='store_true', default=True,
 44 |                     help='save outputs as sliding window')
 45 | parser.add_argument('--window_reorient', action='store_true', default=True,
 46 |                     help='reorientation for normalize every sliding window')
 47 | argparses = parser.parse_args()
 48 | 
 49 | 
 50 | class MakeDataset(object):
 51 |     def __init__(self, args):
 52 |         self.args = args
 53 |         self.input_folder = os.path.join(args.dataset_path, args.dataset)
 54 |         if args.filenames_list:
 55 |             with open(os.path.join(args.dataset_path, args.filenames_list), 'r') as file:
 56 |                 self.filenames = file.read().split('\n')
 57 |         else:
 58 |             self.filenames = os.listdir(self.input_folder)
 59 |         self.output_folders = {}
 60 | 
 61 |     def run(self):
 62 |         output_folder = os.path.join(self.args.output_path, self.args.dataset, time.strftime("%Y%m%d_%H%M",
 63 |                                                                                              time.localtime()))
 64 |         log_folder = os.path.join(self.args.output_path, self.args.dataset)
 65 |         self.make_folders(output_folder)
 66 |         self.write_log(log_folder)
 67 |         for filename in self.filenames:
 68 |             Process(self.args, filename, self.output_folders)
 69 | 
 70 |     def test(self, sample_num=5):
 71 |         output_folder = self.args.output_path + '/test_sample'
 72 |         self.make_folders(output_folder)
 73 |         self.write_log(output_folder)
 74 |         for filename in ['4KE2_1_A.pn']:  # self.filenames[:sample_num]:
 75 |             Process(self.args, filename, self.output_folders).process_for_data_loader_test()
 76 | 
 77 |     def make_folders(self, output_folder):
 78 |         self.output_folders.update({'output': output_folder})
 79 |         if self.args.pairs_data:
 80 |             query_folder = output_folder + '/query'
 81 |             target_folder = output_folder + '/target'
 82 |             pathlib.Path(query_folder).mkdir(parents=True, exist_ok=True)
 83 |             pathlib.Path(target_folder).mkdir(parents=True, exist_ok=True)
 84 |             self.output_folders.update({'query': query_folder, 'target': target_folder})
 85 |         else:
 86 |             pathlib.Path(output_folder).mkdir(parents=True, exist_ok=True)
 87 | 
 88 |     def write_log(self, path):
 89 |         args = self.args
 90 |         write_list = [time.strftime("%Y%m%d_%H%M", time.localtime())]
 91 |         arg_name_list = ['dataset',
 92 |                          'resolution',
 93 |                          'input_type',
 94 |                          'output_type',
 95 |                          'axis_range',
 96 |                          'multi_atom',
 97 |                          'self_norm_ser_num',
 98 |                          'draw_connection',
 99 |                          'z_norm']
100 |         arg_list = [args.dataset,
101 |                     args.resolution,
102 |                     args.input_type,
103 |                     args.output_type,
104 |                     args.axis_range,
105 |                     args.multi_atom,
106 |                     args.self_norm_ser_num,
107 |                     args.draw_connection,
108 |                     args.z_norm]
109 |         for i in range(len(arg_name_list)):
110 |             print("%s = %s" % (arg_name_list[i], str(arg_list[i])))
111 |             write_list.append("%s = %s" % (arg_name_list[i], str(arg_list[i])))
112 |         write_list.append('\n\n\n')
113 |         with open(path + '/args_log.txt', 'a') as log_writer:
114 |             log_writer.write('\n'.join(write_list))
115 | 
116 | 
if __name__ == '__main__':
    # Build the driver once, then pick test mode or the full run.
    dataset_builder = MakeDataset(argparses)
    if argparses.test:
        dataset_builder.test()
    else:
        dataset_builder.run()
122 | 


--------------------------------------------------------------------------------
/extract_coord.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import math
  3 | import os
  4 | from numpy import *
  5 | from scipy.spatial.distance import pdist
  6 | from scipy.spatial.distance import squareform
  7 | import pathlib
  8 | from model import *
  9 | from rebulid import *
 10 | path = 'D:\\backbone_prediction'
 11 | 
 12 | ALPHABET = {'A': 'ALA', 'F': 'PHE', 'C': 'CYS', 'D': 'ASP', 'N': 'ASN',
 13 |             'E': 'GLU', 'Q': 'GLN', 'G': 'GLY', 'H': 'HIS', 'L': 'LEU',
 14 |             'I': 'ILE', 'K': 'LYS', 'M': 'MET', 'P': 'PRO', 'R': 'ARG',
 15 |             'S': 'SER', 'T': 'THR', 'V': 'VAL', 'W': 'TRP', 'Y': 'TYR'}
 16 | 
 17 | AA_HYDROPATHICITY_INDEX = {'ARG': -4.5, 'LYS': -3.9, 'ASN': -3.5, 'ASP': -3.5, 'GLN': -3.5,
 18 |                            'GLU': -3.5, 'HIS': -3.2, 'PRO': -1.6, 'TYR': -1.3, 'TRP': -0.9,
 19 |                            'SER': -0.8, 'THR': -0.7, 'GLY': -0.4, 'ALA': 1.8, 'MET': 1.9,
 20 |                            'CYS': 2.5, 'PHE': 2.8, 'LEU': 3.8, 'VAL': 4.2, 'ILE': 4.5}
 21 | 
 22 | AA_BULKINESS_INDEX = {'ARG': 14.28, 'LYS': 15.71, 'ASN': 12.82, 'ASP': 11.68, 'GLN': 14.45,
 23 |                       'GLU': 13.57, 'HIS': 13.69, 'PRO': 17.43, 'TYR': 18.03, 'TRP': 21.67,
 24 |                       'SER': 9.47, 'THR': 15.77, 'GLY': 3.4, 'ALA': 11.5, 'MET': 16.25,
 25 |                       'CYS': 13.46, 'PHE': 19.8, 'LEU': 21.4, 'VAL': 21.57, 'ILE': 21.4}
 26 | 
 27 | AA_FLEXIBILITY_INDEX = {'ARG': 2.6, 'LYS': 1.9, 'ASN': 14., 'ASP': 12., 'GLN': 4.8,
 28 |                         'GLU': 5.4, 'HIS': 4., 'PRO': 0.05, 'TYR': 0.05, 'TRP': 0.05,
 29 |                         'SER': 19., 'THR': 9.3, 'GLY': 23., 'ALA': 14., 'MET': 0.05,
 30 |                         'CYS': 0.05, 'PHE': 7.5, 'LEU': 5.1, 'VAL': 2.6, 'ILE': 1.6}
 31 | 
 32 | AA_MESSAGE = {}
 33 | 
 34 | for aa_short in ALPHABET.keys():
 35 |     aa_long = ALPHABET[aa_short]
 36 |     AA_MESSAGE.update({aa_short: [(5.5 - AA_HYDROPATHICITY_INDEX[aa_long]) / 10,
 37 |                                   AA_BULKINESS_INDEX[aa_long] / 21.67,
 38 |                                   (25. - AA_FLEXIBILITY_INDEX[aa_long]) / 25.]})
 39 | 
 40 |     AA_MESSAGE.update({aa_long: [(5.5 - AA_HYDROPATHICITY_INDEX[aa_long]) / 10,
 41 |                                  AA_BULKINESS_INDEX[aa_long] / 21.67,
 42 |                                  (25. - AA_FLEXIBILITY_INDEX[aa_long]) / 25.]})
 43 | 
 44 | distance_window_path = os.path.join(path, 'distance_window')
 45 | 
 46 | path_CA = 'D:\\backbone prediction\\CA_info'
 47 | atoms_type = ['N', 'CA', 'C', 'O']
 48 | 
 49 | 
 50 | # 提取CA原子信息
 51 | def atoms_infos(file_name):
 52 |     file = open(os.path.join(path_CA, file_name), 'r')
 53 |     lines = file.readlines()
 54 |     array_head_tail = np.zeros((3, 3))
 55 |     atoms_info = [line.strip('\n') for line in lines
 56 |                   if line.split()[0] == 'ATOM' and line.split()[2] in atoms_type]
 57 |     for line in lines:
 58 |         if line.split()[0] == 'ATOM' and line.split()[2] == 'N':
 59 |             array_head_tail[0] = [float(line.split()[j]) for j in range(6, 9)]
 60 |             break
 61 | 
 62 |     for line in lines[::-1]:
 63 |         if line.split()[0] == 'ATOM' and line.split()[2] == 'C':
 64 |             array_head_tail[1] = [float(line.split()[j]) for j in range(6, 9)]
 65 |             break
 66 | 
 67 |     for line in lines[::-1]:
 68 |         if line.split()[0] == 'ATOM' and line.split()[2] == 'O':
 69 |             array_head_tail[2] = [float(line.split()[j]) for j in range(6, 9)]
 70 |             break
 71 | 
 72 |     delet = []
 73 |     # 筛掉重复概率小的氨基酸
 74 |     for i in range(len(atoms_info)):
 75 |         if atoms_info[i - 1].split()[2] == atoms_info[i].split()[2] and \
 76 |                 atoms_info[i - 1].split()[5] == atoms_info[i].split()[5]:
 77 |             if atoms_info[i - 1].split()[-3] <= atoms_info[i].split()[-3]:
 78 |                 delet.append(i - 1)
 79 |             else:
 80 |                 delet.append(i)
 81 |     for i in delet[::-1]:
 82 |         del atoms_info[i]
 83 |     # atoms_info = array(atoms_info)
 84 |     return atoms_info, array_head_tail
 85 | 
 86 | 
 87 | # 断链情况是否进行补全
 88 | # 提取坐标信息
 89 | def extract_coord(atoms_info):
 90 |     coord_array = np.zeros((len(atoms_info) // 4, 3))
 91 |     coord_all = np.zeros((len(atoms_info), 3))
 92 |     acid_list = []
 93 |     for i in range(len(atoms_info)):
 94 | 
 95 |         coord_all[i] = [float(atoms_info[i].split()[j]) for j in range(6, 9)]
 96 |         if i % 4 == 1:
 97 |             coord_array[i // 4] = [float(atoms_info[i].split()[j]) for j in range(6, 9)]
 98 |             acid_list.append(atoms_info[i].split()[3][-3::])
 99 |     acid_array = array(acid_list)
100 |     return coord_array, acid_array, coord_all
101 | 
102 | 
def torsion():
    """Fill ``torsion_training[n]`` with ``atan2(torsion_sin[n], torsion_cos[n])``.

    # NOTE(review): torsion_sin, torsion_cos and torsion_training are not
    # defined in this file; presumably they come from a star import or are
    # leftovers — confirm before calling.
    """
    for n, sin_value in enumerate(torsion_sin):
        torsion_training[n] = math.atan2(sin_value, torsion_cos[n])
106 | 
107 | 
def distance_window(coord_array, acid_array):
    """Save the nearest-neighbour distance windows of one structure.

    For every residue, the ``WINDOW_SIZE`` closest residues (itself
    included) are listed in ascending distance, each as
    ``[distance] + AA_MESSAGE[residue_name]``. The stacked float32 array is
    saved under ``distance_window_path``; the file name is the module-level
    ``file_name`` with every ``'pdb'`` replaced by ``'npy'``.

    Args:
        coord_array: Per-residue coordinates, one row per residue.
        acid_array: Residue names, index-aligned with ``coord_array``.
    """
    WINDOW_SIZE = 15
    condensed = pdist(coord_array, metric='euclidean')
    distCA = squareform(condensed).astype('float32')
    save_name = file_name.replace('pdb', 'npy')
    # Structured dtype: sort by distance while carrying the residue label.
    mark_type = [('distance', float), ('aa', 'S10')]

    dist_windows = []
    for row in distCA:
        labelled = np.array([(row[j], acid_array[j]) for j in range(len(row))],
                            dtype=mark_type)
        nearest = np.sort(labelled, order='distance')[:WINDOW_SIZE]
        window = []
        for record in nearest:
            # Labels are stored as bytes ('S10'); decode before the lookup.
            aa = record[1].decode('utf-8')
            window.append([record[0]] + AA_MESSAGE[aa])
        dist_windows.append(window)
    dist_windows = np.array(dist_windows).astype('float32')

    np.save(os.path.join(distance_window_path, save_name), dist_windows)
    print('successful')
131 | 
132 | 
# Pipeline: (1) extract coordinates and distance windows for every input file,
# (2) sum the predictions of all models in top_models, (3) rebuild backbone
# coordinates from the predicted angles.
# NOTE(review): torch, device, DistanceWindow, DataLoader and PATH_PRED are not
# defined in this file; presumably they arrive through the star imports of
# `model` / `rebulid` — confirm.
if __name__ == "__main__":
    # Extract coordinate information and compute the distance windows.
    COOR_PATH = 'D:\\backbone_prediction\\coord'
    for file_name in os.listdir(path_CA):
        atoms_info, array_head_tail = atoms_infos(file_name)
        coord_array, acid_array, coord_all = extract_coord(atoms_info)
        distance_window(coord_array, acid_array)
        # The dataset and loader are rebuilt on every iteration; only the
        # ones from the last iteration are used below.
        test_dataset = DistanceWindow(
            distance_window_path='D:/backbone_prediction/distance_window')
        data_loader = DataLoader(dataset=test_dataset)
        # Keep all backbone coordinates for the rebuild step.
        np.save(os.path.join(COOR_PATH, file_name.replace('pdb', 'npy')), coord_all)
    # Fuse the angle predictions of the models (original comment: 50 models).

    models_path = os.path.join(path, 'top_models')
    with torch.no_grad():
        for arrays, torsions, output_filename in data_loader:
            # Running sum of the per-model outputs (a sum, not a mean).
            total_file = 0
            for model_name in os.listdir(models_path):
                model = torch.load(os.path.join(models_path, model_name), map_location='cuda:0')
                model.eval()
                model.is_training = False
                arrays = arrays.to(device)
                pred_sincos = model(arrays[0]).squeeze(1).transpose(0, 1)
                output = pred_sincos.data.cpu().numpy()
                total_file += output
            np.save(os.path.join('D:\\backbone_prediction\\eric_rebulid', output_filename[0]), total_file)
            # Rebuild the coordinates from the predicted angles.
            filename = output_filename[0]
            coos = []
            ground_true_coos = np.load(os.path.join(COOR_PATH, filename))
            # Every 4th row starting at index 1 is taken as a CA atom.
            for coo in ground_true_coos[1::4]:
                coos.append(Coordinate(coo))

            PATH_OUTPUT = PATH_PRED + '_backbone'
            pathlib.Path(PATH_OUTPUT).mkdir(parents=True, exist_ok=True)
            # NOTE(review): predictions are loaded from PATH_PRED, while the
            # summed output above was saved under 'eric_rebulid' — presumably
            # the same directory; confirm.
            pred = np.load(os.path.join(PATH_PRED, filename))

            # Rows 0/1 and 2/3 are passed to arctan2 as (sin, cos) pairs.
            torsions_C = np.arctan2(pred[0], pred[1])
            torsions_N = np.arctan2(pred[2], pred[3])

            backbone_pred = backbone_rebuild_separated_torsion(coos, torsions_C, torsions_N)
            # Add the first row and the last two rows of the stored coordinates
            # around the rebuilt atoms.
            backbone_pred = np.concatenate((ground_true_coos[0].reshape([1, 3]),
                                            backbone_pred, ground_true_coos[-2:]), axis=0).astype('float32')
            np.save(os.path.join(PATH_OUTPUT, filename), backbone_pred)
177 | 
178 | 


--------------------------------------------------------------------------------
/Training models/modelable_assess.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import math
  3 | import os
  4 | from scipy.spatial.distance import pdist
  5 | from scipy.spatial.distance import squareform
  6 | import matplotlib.pyplot as plt
  7 | 
  8 | 
# Fixed geometric parameters used when rebuilding backbone atoms from CA atoms.
# backbone_rebuild_separated_torsion uses the L* values as offsets along the
# CA->CA unit vector and the R* values as multipliers of a rotated vector.
# NOTE(review): units are presumably angstroms and the values look
# statistically derived -- confirm with the author.
# L1_C / L1_N are not referenced by the code in this module.
L1_C = 0.5511235634596036
L1_N = 0.5275157666844377
# trans: selected when the CA-CA distance is greater than 3.4
L2_C_trans = 1.4281242923706199
R_C_trans = 0.5298886988235514
L2_N_trans = 1.4076846053568244
R_N_trans = 0.3797594132360668
L_O_trans = 1.6340680296346668
R_O_trans = 1.7458955685095028
# cis: selected when the CA-CA distance is 3.4 or less
L2_C_cis = 0.7914339670632375
R_C_cis = 1.309401495255961
L2_N_cis = 0.7973937679940248
R_N_cis = 1.2349344835918588
L_O_cis = 0.17424337647795887
R_O_cis = 2.384890116717385
 25 | 
 26 | 
 27 | class Coordinate(object):
 28 |     def __init__(self, coo):
 29 |         self.coo = coo
 30 |         self.x = self.coo[0]
 31 |         self.y = self.coo[1]
 32 |         self.z = self.coo[2]
 33 |         self.len = np.linalg.norm(self.coo)
 34 |         if self.len != 0:
 35 |             self.orient = self.coo/self.len
 36 | 
 37 | 
 38 | def vec(a, b):
 39 |     ab = b.coo - a.coo
 40 |     return Coordinate(ab)
 41 | 
 42 | 
 43 | def get_coo(line):
 44 |     items = line.split()
 45 |     x = float(items[10])
 46 |     y = float(items[11])
 47 |     z = float(items[12])
 48 |     return Coordinate(np.array([x, y, z]))
 49 | 
 50 | 
 51 | def get_coos(lines):
 52 |     atom_coos = []
 53 |     for line in lines:
 54 |         if line.split()[3] != 'CB':
 55 |             atom_coos.append(get_coo(line))
 56 |             # print(line.split()[3], len(atom_coos) % 4)
 57 |     return atom_coos
 58 | 
 59 | 
 60 | def read_pn(lines):
 61 |     x = lines[27].split('\t')
 62 |     y = lines[28].split('\t')
 63 |     z = lines[29].split('\t')
 64 |     mask = lines[31]
 65 |     atoms_coo = []
 66 |     for i in range(len(mask) * 3):
 67 |         if mask[i // 3] == '+':
 68 |             atoms_coo.append(Coordinate(np.array([float(x[i]) / 100., float(y[i]) / 100., float(z[i]) / 100.])))
 69 |             if atoms_coo[-1].len == 0:
 70 |                 return None
 71 |     return atoms_coo
 72 | 
 73 | 
 74 | def get_cos(cb, cd):
 75 |     cos = np.dot(cb.coo, cd.coo)/(cb.len * cd.len)
 76 |     return cos
 77 | 
 78 | 
 79 | def get_angle(cb, cd):
 80 |     angle = math.acos(get_cos(cb, cd))
 81 |     return angle
 82 | 
 83 | 
 84 | def angle_norm(angle):
 85 |     normed_angle = math.atan2(math.sin(angle), math.cos(angle))
 86 |     return normed_angle
 87 | 
 88 | 
 89 | def array_angle_norm(array):
 90 |     normed_array = []
 91 |     for angle in array:
 92 |         normed_array.append(angle_norm(angle))
 93 |     return np.array(normed_array)
 94 | 
 95 | 
 96 | def get_projection(vec, axis):
 97 |     projection = Coordinate(vec.len * get_cos(vec, axis) * axis.orient)
 98 |     return projection
 99 | 
100 | 
def get_sign(vec, axis):
    """Return the component of ``vec`` perpendicular to ``axis``."""
    along_axis = get_projection(vec, axis)
    return Coordinate(vec.coo - along_axis.coo)
104 | 
105 | 
# Rotation angle from vector A to vector B about the given axis.
def torsion(vector_A, vector_B, axis):
    """Return the signed rotation angle (radians) from A to B about ``axis``.

    Args:
        vector_A, vector_B, axis: plain 3-component arrays (not Coordinate
            objects); they are passed directly to ``np.cross`` / ``np.dot``.

    Returns:
        The angle between the normals ``axis x B`` and ``A x axis``, signed by
        the side of the first normal on which A lies.
    """
    N = Coordinate(np.cross(axis, vector_B))
    N_1 = Coordinate(np.cross(vector_A, axis))
    # Both normals are unit length, but rounding can push their dot product
    # marginally outside [-1, 1], which would make math.acos raise ValueError.
    cos_value = np.clip(np.dot(N_1.orient, N.orient), -1.0, 1.0)
    return np.sign(np.dot(vector_A, N.orient)) * math.acos(cos_value)
112 | 
113 | 
# Compute the included angle and the basis used for the coordinate transform.
def torsion_m(vector_A, axis):
    """Return a local basis built from ``axis`` and the angle to ``vector_A``.

    Returns:
        (m_weight, angle): ``m_weight`` stacks ``axis``, ``N_1 x axis`` and
        ``N_1`` as rows, where ``N_1`` is the unit normal of ``vector_A x
        axis``; ``angle`` is ``acos(axis . vector_A)``.
        NOTE(review): the acos assumes both inputs are unit vectors -- confirm
        at call sites.
    """
    # Unit normal of the plane spanned by vector_A and axis.
    normal = Coordinate(np.cross(vector_A, axis)).orient
    # Rows of the rotation basis.
    basis = np.array([axis, np.cross(normal, axis), normal])
    return basis, math.acos(np.dot(axis, vector_A))
122 | 
123 | 
# Given a vector, a rotation axis and a torsion angle, compute the rotated vector.
def rotation(vector_A, axis, torsion):
    """Return ``vector_A`` re-expressed after rotating by ``torsion`` about ``axis``.

    The result is assembled in the local basis returned by ``torsion_m`` from
    the supplement of the angle between ``axis`` and ``vector_A``.
    """
    basis, angle = torsion_m(vector_A, axis)
    supplement = math.pi - angle
    components = [math.cos(supplement),
                  math.sin(supplement) * math.cos(torsion),
                  math.sin(supplement) * math.sin(torsion)]
    # Map the local components back to the global frame.
    return np.dot(basis.T, components)
134 | 
135 | 
def distance_martix(A):
    """Return the square matrix of pairwise Euclidean distances between rows of ``A``."""
    # pdist yields the condensed form; squareform expands it to a full matrix.
    return squareform(pdist(A, metric='euclidean'))
142 | 
143 | 
def backbone_rebuild_separated_torsion(coos, torsions_C, torsions_N):
    """Rebuild C, O and N positions between consecutive CA atoms.

    Args:
        coos: Coordinate objects for the CA atoms only (at least three).
        torsions_C: per-bond torsion used to place C and O.
        torsions_N: per-bond torsion used to place N.

    Returns:
        Array whose first row is the first CA, followed for every CA->CA bond
        by the rows C, O, N and the bond's end CA.
    """

    def place_atoms(start, end, bond, reference, index):
        # ``bond`` is the CA->CA vector being filled in; ``reference`` is the
        # neighbouring CA->CA vector that fixes the zero of the torsion.
        initial_orient = get_sign(reference, bond).orient
        axis = bond.orient
        torsion_pred_C = torsions_C[index]
        torsion_pred_N = torsions_N[index]
        if bond.len > 3.4:
            L2_C, L2_N, L_O = L2_C_trans, L2_N_trans, L_O_trans
            R_C, R_N, R_O = R_C_trans, R_N_trans, R_O_trans
            torsion_pred_N = angle_norm(torsion_pred_N - math.pi)
        else:
            L2_C, L2_N, L_O = L2_C_cis, L2_N_cis, L_O_cis
            R_C, R_N, R_O = R_C_cis, R_N_cis, R_O_cis

        toward_C = rotation(initial_orient, axis, torsion_pred_C)
        toward_N = rotation(initial_orient, axis, torsion_pred_N)
        return [start.coo + L2_C * axis + R_C * toward_C,
                start.coo + L_O * axis + R_O * toward_C,
                end.coo - L2_N * axis + R_N * toward_N,
                end.coo]

    rebuilt = [coos[0].coo]

    # Every bond except the last uses the following bond as its reference.
    for k in range(len(coos) - 2):
        first, second, third = coos[k], coos[k + 1], coos[k + 2]
        rebuilt += place_atoms(first, second,
                               vec(first, second), vec(second, third), k)

    # The last bond has no successor, so the preceding CA is the reference.
    before, start, end = coos[-3], coos[-2], coos[-1]
    rebuilt += place_atoms(start, end, vec(start, end), vec(start, before), -1)
    return np.array(rebuilt)
192 | 
193 | 
# Input and output directories for the batch run below.
# Raw strings keep the Windows backslashes literal: "\p", "\d" and "\c" are
# invalid escape sequences in ordinary string literals and trigger warnings.
# PATH = r'D:\protein_structure_prediction\data\dataset/test_set_withO'
PATH = r'D:\protein_structure_prediction\data\dataset/processed_data/test_set\coordinates'
REBUILD_PATH = r'D:\protein_structure_prediction\data\dataset/processed_data/test_set/rebuild_coordinates'
BITORSION_PATH = r'D:\protein_structure_prediction\data\dataset/processed_data/test_set/bitorsions_'
198 | 
199 | 
# Batch driver: for every coordinate file in PATH, derive the C/N torsions
# from the stored backbone, save them under BITORSION_PATH, then rebuild the
# backbone from the CA atoms plus those torsions and save it under
# REBUILD_PATH.
# NOTE(review): judging by the variable names, rows are ordered N, CA, C, O
# per residue -- confirm against the file writer.
atom_missed_filenames = []  # files whose row count is not a multiple of 4
failed_filenames = []       # files that raised while being processed
filenames = os.listdir(PATH)
# for filename in [filenames[0]]:
for filename in filenames:
    print(filename)
    try:
        gt_coos = np.load(os.path.join(PATH, filename))
        if np.shape(gt_coos)[0] % 4 != 0:
            atom_missed_filenames.append(filename)

        else:
            torsions_C = []
            torsions_N = []
            coos = []
            for coo in gt_coos:
                coos.append(Coordinate(coo))

            # All bonds except the last: the next CA->CA bond is the reference.
            for residue in range(len(coos) // 4 - 2):
                k = residue * 4
                CA1 = coos[1 + k]
                C1 = coos[2 + k]
                N2 = coos[4 + k]
                CA2 = coos[5 + k]
                CA3 = coos[9 + k]

                CA2CA3 = vec(CA2, CA3)
                CA1CA2 = vec(CA1, CA2)
                CA1C1 = vec(CA1, C1)
                CA2N2 = vec(CA2, N2)

                torsions_C.append(torsion(CA2CA3.orient, CA1C1.orient, CA1CA2.orient))
                if CA1CA2.len > 3.4:
                    # Long (trans) bonds store the N torsion shifted by pi.
                    torsions_N.append(angle_norm(torsion(CA2CA3.orient, CA2N2.orient, CA1CA2.orient) - math.pi))
                else:
                    torsions_N.append(torsion(CA2CA3.orient, CA2N2.orient, CA1CA2.orient))

            # Last bond: there is no following CA, so the previous one is used.
            k = (len(coos) // 4 - 3) * 4
            CA1 = coos[1 + k]
            CA2 = coos[5 + k]
            C2 = coos[6 + k]
            N3 = coos[8 + k]
            CA3 = coos[9 + k]

            CA2CA3 = vec(CA2, CA3)
            CA2CA1 = vec(CA2, CA1)
            CA2C2 = vec(CA2, C2)
            CA3N3 = vec(CA3, N3)

            torsions_C.append(torsion(CA2CA1.orient, CA2C2.orient, CA2CA3.orient))
            if CA2CA3.len > 3.4:
                torsions_N.append(angle_norm(torsion(CA2CA1.orient, CA3N3.orient, CA2CA3.orient) - math.pi))
            else:
                torsions_N.append(torsion(CA2CA1.orient, CA3N3.orient, CA2CA3.orient))

            torsions_N = np.array(torsions_N)
            torsions_C = np.array(torsions_C)
            # Rows: C/pi, N/pi, sin C, cos C, sin N, cos N.
            bitorsions = np.array([torsions_C / math.pi,
                                   torsions_N / math.pi,
                                   np.sin(torsions_C),
                                   np.cos(torsions_C),
                                   np.sin(torsions_N),
                                   np.cos(torsions_N)]).astype('float32')
            np.save(os.path.join(BITORSION_PATH, filename), bitorsions)

            # Keep the first row and the last two rows from the input; the
            # rest comes from the rebuild.
            rebuild_coos = np.concatenate((gt_coos[:1],
                                           backbone_rebuild_separated_torsion(coos[1::4], torsions_C, torsions_N),
                                           gt_coos[-2:]), axis=0).astype('float32')
            np.save(os.path.join(REBUILD_PATH, filename), rebuild_coos)
            # print(np.shape(rebuild_coos), np.shape(gt_coos))
            # print(np.linalg.norm(rebuild_coos - gt_coos, axis=1))
            # print(np.linalg.norm(rebuild_coos[::4] - gt_coos[::4], axis=1).mean())
            # print(np.linalg.norm(rebuild_coos[2::4] - gt_coos[2::4], axis=1).mean())

    except Exception as error:
        # Best-effort batch run: keep going, but report why this file failed
        # instead of discarding the reason silently.
        print('failed:', filename, repr(error))
        failed_filenames.append(filename)
280 | 


--------------------------------------------------------------------------------
/computation_rmsd.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import os
  3 | import numpy as np
  4 | import xlrd
  5 | import xlwt
  6 | from numpy import *
  7 | import pandas as pd
  8 | 
  9 | #通过两坐标计算单位向量
 10 | def vector_unit(vector_1,vector_2):
 11 |     bond_vector_2 = vector_1 - vector_2
 12 |     bond_length_2 = np.linalg.norm(bond_vector_2)
 13 |     return bond_vector_2 / bond_length_2
 14 | 
 15 | 
 16 | #计算法向量和旋转角
 17 | def torsion_angle(A, B, C, D):
 18 |     #计算法向量
 19 |     U_2 = vector_unit(B,A); U_1 = vector_unit(C,B); U = vector_unit(D,C)
 20 |     N   = np.cross(U_1, U)   / np.linalg.norm(np.cross(U_1, U))
 21 |     N_1 = np.cross(U_2, U_1) / np.linalg.norm(np.cross(U_2, U_1))
 22 |     m_weight = np.array([U_1, np.cross(N_1, U_1), N_1])
 23 |     #torsion_angle
 24 |     angle = np.sign(np.dot(U_2, N)) * math.acos(np.dot(N_1, N))  
 25 |     return angle, m_weight
 26 | 
 27 | 
 28 | #根据真实角度或训练角度预测下一个坐标
 29 | def next_coord(A, B, C, D, R, angle_confirm,torsion_pred):
 30 |     #torsion_angle
 31 |     angle_real , m = torsion_angle(A, B, C, D)
 32 |     #将真实角度或预测角度赋值给torsion
 33 |     torsion = torsion_pred
 34 | #     print("N——angle:",angle_real,angle_train)
 35 |     angle_martix=[math.cos(math.pi-angle_confirm),
 36 |                   math.sin(math.pi-angle_confirm) * math.cos(torsion),
 37 |                   math.sin(math.pi-angle_confirm) * math.sin(torsion)]
 38 |     #计算下一个坐标
 39 |     next_corrd = C + R * np.dot(m.T, angle_martix)
 40 |     return next_corrd, torsion
 41 | 
 42 | 
 43 | #计算预测的CB位置
 44 | def pred_CBcoord(N_coord, CA_coord, C_coord, CB_coord):
 45 |     # N和C的中间向量
 46 |     vector_midleline = (vector_unit(N_coord, CA_coord) + vector_unit(C_coord, CA_coord))
 47 |     vector_midleline_unit = vector_midleline / np.linalg.norm(vector_midleline)
 48 | 
 49 |     C = CA_coord + vector_midleline_unit * 0.841829775235248
 50 |     # C2 = N_coord + vector_unit(C_coord, N_coord) * 1.190426725853957
 51 |     angle_confirm = math.pi / 2
 52 | 
 53 |     # 统计CA到CB的距离: R = np.linalg.norm(C1 - CB_coord)
 54 |     R = 2.1545175870366853  # 统计得到的CA到CB的距离
 55 |     torsion = 0.5999114448494303  # 根据统计得到的旋转角
 56 | 
 57 |     # 计算得到预测的CB位置
 58 |     next_CB_coord, t = next_coord(CA_coord, N_coord, C, CB_coord, R, angle_confirm, torsion)
 59 |     return next_CB_coord
 60 | 
 61 | 
 62 | #从真实的pdb中提取对应原子坐标信息
 63 | def extract_info_from_pdb(path_file_real):
 64 |     delet = []
 65 |     # 获取真实pdb文件坐标
 66 |     f_real = open(path_file_real, 'r');
 67 |     real_lines = f_real.readlines()
 68 | 
 69 |     # 提取对应原子的信息存到列表real
 70 |     real = [line for line in real_lines if line.split()[0] == 'ATOM' and line.split()[2] in  atoms_type]
 71 |     f_real.close()
 72 | 
 73 |     # 筛掉重复概率小的氨基酸
 74 |     for i in range(len(real)):
 75 |         if real[i - 1].split()[2] == real[i].split()[2] and real[i - 1].split()[5] == real[i].split()[5]:
 76 |             if real[i - 1].split()[-3] <= real[i].split()[-3]:
 77 |                 delet.append(i - 1)
 78 |             else:
 79 |                 delet.append(i)
 80 |     for i in delet[::-1]:
 81 |         del real[i]
 82 |     return real
 83 | 
 84 | 
 85 | # 记录该氨基酸是否存在CB
 86 | def CB_determine(real):
 87 |     real_CB = []
 88 |     CB_whether_exist = []
 89 |     real_array_without_CB = []
 90 |     real_array = np.zeros((len(real), 3))
 91 |     # real_with_CB = np.zeros((len(real), 3))
 92 |     for i in range(len(real)):
 93 |         real_array[i] = np.array([float(real[i].split()[j]) for j in range(6, 9)])
 94 |         #为了判定该CA处是否存在CB
 95 |         if real[i].split()[2] == atoms_type[1]:
 96 |             for line in range(-1, len(atoms_type) - 2):
 97 |                 real_array_without_CB.append([float(real[i + line].split()[j]) for j in range(6, 9)])
 98 | 
 99 |             if real[i].split()[3] == 'GLY':
100 |                 CB_whether_exist.append('-')
101 |             else:
102 |                 CB_coord = np.array([float(real[i + len(atoms_type) - 2].split()[j]) for j in range(6, 9)])
103 |                 # 检查重构CB和真实CB的误差
104 |                 # next_CB = pred_CBcoord(N_coord, CA_coord, C_coord, CB_coord)
105 |                 real_CB.append(CB_coord)
106 |                 CB_whether_exist.append('+')
107 |     real_array_without_CB = array(real_array_without_CB)
108 | 
109 |     return real_CB, CB_whether_exist, real_array, real_array_without_CB
110 | 
111 | 
# Extract the coordinates of the selected atoms from the prediction file.
def extract_info_from_pred(CB_whether_exist, path_pred):
    """Load predicted coordinates from a ``.pdb`` or ``.npy`` prediction.

    Args:
        CB_whether_exist: list of '+'/'-' flags, one per residue, marking
            whether the residue has a CB atom.
        path_pred: prediction path; 'real' in it is replaced by 'pd2_out'
            (``.pdb``) or 'our_out' (anything else, loaded with ``np.load``).

    Returns:
        (pred_array, pred_array_without_CB, pred_CB)
    """
    n_with_cb = CB_whether_exist.count('+')
    length_file = n_with_cb * len(atoms_type) + CB_whether_exist.count('-') * (len(atoms_type) - 1)
    all_atoms = {'N': 0, 'CA': 1, 'C': 2, 'O': 3}
    if path_pred.endswith('.pdb'):
        # PDB input.
        path_pred = path_pred.replace('real', 'pd2_out')
        # The context manager closes the file even if parsing fails.
        with open(path_pred, 'r') as f_gen:
            gen_lines = f_gen.readlines()

        # NOTE(review): only CB records are kept, so the non-CB branch of the
        # loop below never runs and pred_array_without_CB stays all zeros.
        # The filter was possibly meant to be ``in atoms_type`` -- confirm.
        gen = []
        for line in gen_lines:
            fields = line.split()
            # Skip blank/short lines instead of failing with IndexError.
            if len(fields) > 2 and fields[0] == 'ATOM' and fields[2] == 'CB':
                gen.append(line)

        # All coordinates go to pred_array, CB coordinates to pred_CB.
        pred_array = np.zeros((length_file, 3))
        pred_CB = np.zeros((n_with_cb, 3))
        pred_array_without_CB = np.zeros((length_file - n_with_cb, 3))
        count = 0
        count_CB = 0
        print(length_file, len(gen), len(CB_whether_exist))
        for i, record in enumerate(gen):
            fields = record.split()
            coords = np.array([float(fields[j]) for j in range(6, 9)])
            pred_array[i] = coords
            if fields[2] == 'CB':
                pred_CB[count_CB] = coords
                count_CB += 1
            else:
                pred_array_without_CB[count] = coords
                count += 1
        return pred_array, pred_array_without_CB, pred_CB
    else:
        # NPY input: rows are read in groups of four per residue.
        path_pred = path_pred.replace('real', 'our_out')
        pred_array = []
        number = [all_atoms[atom] for atom in atoms_type[:-1]]
        pred_npy_without_CB = np.load(path_pred)
        pred_CB = np.zeros((n_with_cb, 3))
        pred_array_without_CB = []
        count = 0
        for j in range(0, pred_npy_without_CB.shape[0]):
            # Act once per residue, on the row of the last selected atom.
            if j % 4 == int(number[-1]):
                for line in number:
                    pred_array.append(pred_npy_without_CB[j + line - number[-1]])
                    pred_array_without_CB.append(pred_npy_without_CB[j + line - number[-1]])

                if CB_whether_exist[j // 4] == '+':
                    N_coord_pred = pred_npy_without_CB[j - number[-1]]
                    CA_coord_pred = pred_npy_without_CB[j - number[-1] + 1]
                    C_coord_pred = pred_npy_without_CB[j - number[-1] + 2]
                    CB_coord_pred = pred_npy_without_CB[j - number[-1] + 3]
                    # Derive the CB position from the predicted N, CA and C.
                    next_CB = pred_CBcoord(N_coord_pred, CA_coord_pred, C_coord_pred, CB_coord_pred)
                    pred_CB[count] = np.array(next_CB)
                    pred_array.append(next_CB)
                    count += 1
        pred_array = np.array(pred_array)
        pred_array_without_CB = np.array(pred_array_without_CB)
        return pred_array, pred_array_without_CB, pred_CB
171 | 
172 | 
# Given the locations of the real PDB file and of the pdb/npy prediction,
# return the arrays with all atoms, with the non-CB atoms only, and with CB only.
def extraction_coord(path_real, path_pred):
    """Load matching real and predicted coordinate arrays.

    Returns:
        (real_array, pred_array, real_array_without_CB,
        pred_array_without_CB, real_CB, pred_CB)
    """
    # Real structure: filter the records, then split them into arrays.
    records = extract_info_from_pdb(path_real)
    real_CB, CB_whether_exist, real_array, real_array_without_CB = CB_determine(records)

    # Prediction: loaded using the per-residue CB flags from the real file.
    predicted = extract_info_from_pred(CB_whether_exist, path_pred)
    pred_array, pred_array_without_CB, pred_CB = predicted

    return real_array, pred_array, real_array_without_CB, pred_array_without_CB, real_CB, pred_CB
186 | 
187 | 
def test(real, pred):
    """Debug helper: print the element-wise difference of each coordinate pair.

    The squared distances are accumulated but not returned.
    """
    GC = 0
    for i, A in enumerate(real):
        B = pred[i]
        delta = np.array(A) - np.array(B)
        GC += np.square(np.linalg.norm(delta))
        print(A-B)
195 | 
196 | 
# real: array of true coordinates; pred: array of generated coordinates.
def computation_rmsd(real, pred):
    """Return the RMSD between two point sets after the best rotation.

    The largest eigenvalue ``u`` of a symmetric 4x4 matrix built from the
    cross-covariance sums gives ``sqrt(|GA + GB - 2u| / n)``, where GA and GB
    are the summed squared norms of the two sets.
    NOTE(review): the coordinates are not centred first, so the rotation is
    about the origin -- confirm that this is intended.

    Args:
        real: sequence/array of n points (x, y, z).
        pred: sequence/array of at least n points; only the first n are used.

    Returns:
        The RMSD as a NumPy float; NaN when ``real`` is empty.

    Raises:
        IndexError: if ``pred`` holds fewer points than ``real``.
    """
    n_points = len(real)
    if n_points == 0:
        # No points to average over.
        return np.float64('nan')
    if len(pred) < n_points:
        raise IndexError('pred holds fewer points than real')

    A = np.asarray(real, dtype=float)
    B = np.asarray(pred, dtype=float)[:n_points]

    GA = np.sum(A * A)
    GB = np.sum(B * B)

    # S[i][j] = sum over points of B[:, i] * A[:, j]
    S = np.dot(B.T, A)
    Sxx, Sxy, Sxz = S[0]
    Syx, Syy, Syz = S[1]
    Szx, Szy, Szz = S[2]

    # Build the symmetric key matrix.
    K = np.array([
        [Sxx + Syy + Szz, Syz - Szy, Szx - Sxz, Sxy - Syx],
        [Syz - Szy, Sxx - Syy - Szz, Sxy + Syx, Szx + Sxz],
        [Szx - Sxz, Sxy + Syx, -Sxx + Syy - Szz, Syz + Szy],
        [Sxy - Syx, Szx + Sxz, Syz + Szy, -Sxx - Syy + Szz],
    ])

    # K is symmetric, so eigvalsh returns real eigenvalues in ascending
    # order; the general eig solver may return complex values.
    u = np.linalg.eigvalsh(K)[-1]

    return np.sqrt(np.abs(GA + GB - 2 * u) / n_points)
236 | 
# Compute the matrix of all RMSD values.
def computation_rmsd_array(pred_end, sheet):
    """Compute per-atom, CB and all-atom RMSDs for every file in ``./real``.

    Args:
        pred_end: prediction format; must end with 'pdb' or 'npy'.
        sheet: worksheet-like object; file names and the 'mean' label are
            written to its first column via ``sheet.write(row, col, value)``.

    Returns:
        Array with one row per file plus a final row of column means.
        Columns: one per atom in ``atoms_type[:-1]``, then CB, then all atoms.

    Raises:
        ValueError: if ``pred_end`` ends with neither 'pdb' nor 'npy'.
    """
    if not pred_end.endswith(('pdb', 'npy')):
        raise ValueError("pred_end must end with 'pdb' or 'npy': %r" % (pred_end,))

    atoms = atoms_type[0:-1]
    path_file = os.path.join(os.getcwd(), 'real')
    file_names = os.listdir(path_file)
    # Known-problematic structure, excluded for now; tolerate its absence.
    # file_names.remove('4fbr.pdb')
    if '4avz.pdb' in file_names:
        file_names.remove('4avz.pdb')

    rmsd_array = np.zeros((len(file_names), len(atoms) + 2))
    stride = len(atoms_type) - 1  # atoms per residue in the non-CB arrays

    for position, file_real in enumerate(file_names):
        sheet.write(position + 1, 0, file_real)

        path_real = os.path.join(path_file, file_real)
        if pred_end.endswith('pdb'):
            path_pred = path_real.replace('.pdb', '_out.pdb')
        else:
            # Suffix of the file to compare against the real data set.
            path_pred = path_real.replace('.pdb', '.npy')

        real, pred, real_without_CB, pred_without_CB, real_CB, pred_CB = extraction_coord(path_real, path_pred)

        # RMSD of each atom type.
        for column in range(len(atoms)):
            rmsd_array[position][column] = computation_rmsd(real_without_CB[column::stride],
                                                            pred_without_CB[column::stride])

        # RMSD of CB: compare the real CB atoms with the predicted CB atoms
        # (the non-CB prediction array was previously passed by mistake).
        rmsd_array[position][len(atoms)] = computation_rmsd(real_CB, pred_CB)
        # RMSD over all atoms.
        rmsd_array[position][len(atoms) + 1] = computation_rmsd(real, pred)

    sheet.write(len(file_names) + 1, 0, 'mean')
    means = np.mean(rmsd_array, axis=0)
    rmsd_array = np.insert(rmsd_array, rmsd_array.shape[0], values=means, axis=0)
    return rmsd_array
281 | 
# Script entry point: write the computed RMSD values to an Excel workbook.
if __name__ == "__main__":
    # Create the workbook; column titles are written further below.
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    # Prompt for the prediction file format (expected to end with 'pdb' or 'npy').
    pred_end = input('请输入文件格式：')
    # Create a sheet object, a sheet object corresponding to a table in the Excel file.
    # NOTE(review): `sheet` stays undefined if the answer ends with neither suffix.
    if pred_end.endswith('pdb'):
        sheet = book.add_sheet('PD2', cell_overwrite_ok=True)
    elif pred_end.endswith('npy'):
        sheet = book.add_sheet('our', cell_overwrite_ok=True)

    # Prompt for the comma-separated atom names, in PDB order; an empty
    # answer selects the default list. `atoms_type` becomes the module-level
    # name read by the functions above.
    atoms_type = input('请按pdb原子排列顺序输入需要计算的原子(逗号隔开）：')
    if atoms_type == '':
        atoms_type= ['N', 'CA', 'C', 'O', 'CB']
    else:
        atoms_type = atoms_type.split(",")
    # Write the title of every column.
    names = ['file_name'] + atoms_type + ['scut']
    for i in range(len(names)):
        sheet.write(0, i, names[i])

    # Compute the RMSD values for all files.
    rmsd_array = computation_rmsd_array(pred_end, sheet)

    # Write the matrix into the sheet, offset by the header row and name column.
    for i in range(rmsd_array.shape[0]):
        for j in range(rmsd_array.shape[1]):
            sheet.write(i+1, j+1, rmsd_array[i][j])

    # The output file name depends on the prediction format.
    if pred_end.endswith('pdb'):
        book.save('D://database//rmsd_compare//backbone_PD2.xls')
    elif pred_end.endswith('npy'):
        book.save('D://database//rmsd_compare//backbone1_our.xls')
315 | 
316 | 
317 | 
318 | 


--------------------------------------------------------------------------------
/webserver/model.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import numpy as np
  3 | import os
  4 | from numpy import *
  5 | from scipy.spatial.distance import pdist
  6 | from scipy.spatial.distance import squareform
  7 | import pathlib
  8 | from pred_torsion import *
  9 | from rebulid import *
 10 | import shutil
 11 | import time
 12 | import random
 13 | import sys
 14 | 
# Module-level configuration. Earlier alternatives are kept below, commented out.
# path = 'D:\\backbone_prediction'
# distance_window_path = os.path.join(path, 'distance_window')
# path_CA = sys.argv[1]
# logger.info(path_CA)
# path_CA = '/data/wwwroot/webserver/files/2019-11-19-07-20-19/CA_info'
# Directory from which atoms_infos reads its input files.
path_CA = 'D:\\backbone_prediction\\CA_infoglp'
# Output directory for distance_window, derived from path_CA by name substitution.
distance_window_path = path_CA.replace('CA_infoglp', 'distance_window_test1')
# Side effect at import time: the output directory is created if missing.
pathlib.Path(distance_window_path).mkdir(parents=True, exist_ok=True)
atoms_type = ['N', 'CA', 'C', 'O']
 24 | 
 25 | acid_normol = ['ALA', 'PHE', 'CYS', 'ASP', 'ASN',
 26 |                 'GLU', 'GLN', 'GLY', 'HIS', 'LEU',
 27 |                 'ILE', 'LYS', 'MET', 'PRO', 'ARG',
 28 |                 'SER', 'THR', 'VAL', 'TRP', 'TYR']
 29 | 
 30 | ALPHABET = {'A': 'ALA', 'F': 'PHE', 'C': 'CYS', 'D': 'ASP', 'N': 'ASN',
 31 |             'E': 'GLU', 'Q': 'GLN', 'G': 'GLY', 'H': 'HIS', 'L': 'LEU',
 32 |             'I': 'ILE', 'K': 'LYS', 'M': 'MET', 'P': 'PRO', 'R': 'ARG',
 33 |             'S': 'SER', 'T': 'THR', 'V': 'VAL', 'W': 'TRP', 'Y': 'TYR'}
 34 | 
 35 | AA_HYDROPATHICITY_INDEX = {'ARG': -4.5, 'LYS': -3.9, 'ASN': -3.5, 'ASP': -3.5, 'GLN': -3.5,
 36 |                            'GLU': -3.5, 'HIS': -3.2, 'PRO': -1.6, 'TYR': -1.3, 'TRP': -0.9,
 37 |                            'SER': -0.8, 'THR': -0.7, 'GLY': -0.4, 'ALA': 1.8, 'MET': 1.9,
 38 |                            'CYS': 2.5, 'PHE': 2.8, 'LEU': 3.8, 'VAL': 4.2, 'ILE': 4.5}
 39 | 
 40 | AA_BULKINESS_INDEX = {'ARG': 14.28, 'LYS': 15.71, 'ASN': 12.82, 'ASP': 11.68, 'GLN': 14.45,
 41 |                       'GLU': 13.57, 'HIS': 13.69, 'PRO': 17.43, 'TYR': 18.03, 'TRP': 21.67,
 42 |                       'SER': 9.47, 'THR': 15.77, 'GLY': 3.4, 'ALA': 11.5, 'MET': 16.25,
 43 |                       'CYS': 13.46, 'PHE': 19.8, 'LEU': 21.4, 'VAL': 21.57, 'ILE': 21.4}
 44 | 
 45 | AA_FLEXIBILITY_INDEX = {'ARG': 2.6, 'LYS': 1.9, 'ASN': 14., 'ASP': 12., 'GLN': 4.8,
 46 |                         'GLU': 5.4, 'HIS': 4., 'PRO': 0.05, 'TYR': 0.05, 'TRP': 0.05,
 47 |                         'SER': 19., 'THR': 9.3, 'GLY': 23., 'ALA': 14., 'MET': 0.05,
 48 |                         'CYS': 0.05, 'PHE': 7.5, 'LEU': 5.1, 'VAL': 2.6, 'ILE': 1.6}
 49 | 
 50 | AA_MESSAGE = {}
 51 | 
 52 | for aa_short in ALPHABET.keys():
 53 |     aa_long = ALPHABET[aa_short]
 54 |     AA_MESSAGE.update({aa_short: [(5.5 - AA_HYDROPATHICITY_INDEX[aa_long]) / 10,
 55 |                                   AA_BULKINESS_INDEX[aa_long] / 21.67,
 56 |                                   (25. - AA_FLEXIBILITY_INDEX[aa_long]) / 25.]})
 57 | 
 58 |     AA_MESSAGE.update({aa_long: [(5.5 - AA_HYDROPATHICITY_INDEX[aa_long]) / 10,
 59 |                                  AA_BULKINESS_INDEX[aa_long] / 21.67,
 60 |                                  (25. - AA_FLEXIBILITY_INDEX[aa_long]) / 25.]})
 61 | 
 62 | 
 63 | # 提取CA原子信息
 64 | def atoms_infos(file_name):
 65 |     file = open(os.path.join(path_CA, file_name), 'r')
 66 |     lines = file.readlines()
 67 |     atoms_info = [line.strip('\n') for line in lines
 68 |                   if line.split()[0] == 'ATOM' and line.split()[2] == 'CA']
 69 |     array_head_tail = np.zeros((5, 3))
 70 | 
 71 |     for line in lines:
 72 |         if line.split()[0] == 'ATOM' and line.split()[2] == 'N':
 73 |             array_head_tail[0] = [float(line.split()[j]) for j in range(6, 9)]
 74 |             break
 75 | 
 76 |     for line in lines[::-1]:
 77 |         if line.split()[0] == 'ATOM' and line.split()[2] == 'C':
 78 |             array_head_tail[1] = [float(line.split()[j]) for j in range(6, 9)]
 79 |             break
 80 | 
 81 |     for line in lines[::-1]:
 82 |         if line.split()[0] == 'ATOM' and line.split()[2] == 'O':
 83 |             array_head_tail[2] = [float(line.split()[j]) for j in range(6, 9)]
 84 |             break
 85 | 
 86 |     for line in lines:
 87 |         if line.split()[0] == 'ATOM' and line.split()[2] == 'C':
 88 |             array_head_tail[3] = [float(line.split()[j]) for j in range(6, 9)]
 89 |             break
 90 | 
 91 |     for line in lines[::-1]:
 92 |         if line.split()[0] == 'ATOM' and line.split()[2] == 'N':
 93 |             array_head_tail[4] = [float(line.split()[j]) for j in range(6, 9)]
 94 |             break
 95 | 
 96 |     delet = []
 97 |     # 筛掉重复概率小的氨基酸
 98 |     for i in range(len(atoms_info)):
 99 |         if atoms_info[i - 1].split()[2] == atoms_info[i].split()[2] and \
100 |                 atoms_info[i - 1].split()[5] == atoms_info[i].split()[5]:
101 |             if atoms_info[i - 1].split()[-3] <= atoms_info[i].split()[-3]:
102 |                 delet.append(i - 1)
103 |             else:
104 |                 delet.append(i)
105 |     for i in delet[::-1]:
106 |         del atoms_info[i]
107 |     return atoms_info,array_head_tail
108 | 
109 | 
# Extract the coordinate information.
def extract_coord(atoms_info):
    """Parse coordinates and residue names from PDB-style atom lines.

    Returns:
        (coord_array, acid_array, CB_whether_exist): an (n, 3) float array of
        fields 6-8, an array with the last three characters of field 3, and a
        list with '-' for GLY and '+' otherwise for every CA line.
    """
    coord_array = np.zeros((len(atoms_info), 3))
    residue_names = []
    cb_flags = []
    for row, record in enumerate(atoms_info):
        fields = record.split()

        # Flag whether this residue has a CB atom (glycine does not).
        if fields[2] == 'CA':
            cb_flags.append('-' if fields[3] == 'GLY' else '+')
        coord_array[row] = [float(fields[col]) for col in range(6, 9)]
        residue_names.append(fields[3][-3::])

    return coord_array, np.array(residue_names), cb_flags
129 | 
130 | 
def torsion():
    """Fill ``torsion_training`` with atan2(sin, cos) for each stored pair.

    Reads and writes the module-level names ``torsion_sin``, ``torsion_cos``
    and ``torsion_training``, none of which is defined in the visible part of
    this module.
    """
    for n, sin_value in enumerate(torsion_sin):
        torsion_training[n] = math.atan2(sin_value, torsion_cos[n])
134 | 
135 | 
def distance_window(coord_array, acid_array, i):
    """Save the nearest-neighbour distance window of every point.

    A full Euclidean distance matrix is built from ``coord_array``.  Each row
    is paired element-wise with ``acid_array``, sorted by distance and cut to
    the ``WINDOW_SIZE`` closest entries; every kept entry becomes
    ``[distance] + AA_MESSAGE[label]``.  The stacked float32 result is written
    to ``<i>.npy`` inside ``distance_window_path``.
    """
    WINDOW_SIZE = 15
    pairwise = squareform(pdist(coord_array, metric='euclidean')).astype('float32')
    save_name = str(i) + '.npy'
    mark_type = [('distance', float), ('aa', 'S10')]

    dist_windows = []
    for row in pairwise:
        labelled = np.array(
            [(dist, acid_array[col]) for col, dist in enumerate(row)],
            dtype=mark_type)
        # Structured sort keyed on the 'distance' field, then keep the closest.
        nearest = np.sort(labelled, order='distance')[:WINDOW_SIZE]
        window = []
        for entry in nearest:
            label = entry[1].decode('utf-8')
            window.append([entry[0]] + AA_MESSAGE[label])
        dist_windows.append(window)
    dist_windows = np.array(dist_windows).astype('float32')

    np.save(os.path.join(distance_window_path, save_name), dist_windows)
159 | 
160 | 
161 | #通过两坐标计算单位向量
def vector_unit(vector_1, vector_2):
    """Return the unit vector pointing from ``vector_2`` towards ``vector_1``."""
    difference = vector_1 - vector_2
    return difference / np.linalg.norm(difference)
166 | 
167 | 
def torsion_angle(A, B, C):
    """Return the 3x3 local frame spanned by three consecutive points.

    Despite its name this returns a matrix, not an angle.  Its rows are the
    unit bond vector B->C, the cross product of the plane normal with that
    bond, and the unit normal of the plane through A, B and C.
    """
    bond_ab = vector_unit(B, A)
    bond_bc = vector_unit(C, B)

    # Unit normal of the plane containing both bonds.
    normal = np.cross(bond_ab, bond_bc)
    normal = normal / np.linalg.norm(normal)

    return np.array([bond_bc, np.cross(normal, bond_bc), normal])
181 | 
182 | 
183 | #根据真实角度或训练角度预测下一个坐标
def next_coord(A, B, C, R, angle_confirm, torsion_pred):
    """Place the next point at distance ``R`` from ``C``.

    The direction is expressed in the local frame returned by
    ``torsion_angle(A, B, C)`` using the angle ``pi - angle_confirm`` and the
    torsion ``torsion_pred`` (a measured or a predicted value).
    """
    frame = torsion_angle(A, B, C)
    supplement = math.pi - angle_confirm
    direction = [
        math.cos(supplement),
        math.sin(supplement) * math.cos(torsion_pred),
        math.sin(supplement) * math.sin(torsion_pred),
    ]
    # Map the local direction back to global coordinates and step from C.
    return C + R * np.dot(frame.T, direction)
197 | 
198 | 
199 | #计算预测的CB位置
def pred_CBcoord(N_coord, CA_coord, C_coord, CB_coord):
    """Predict a CB position from the N, CA and C coordinates of one residue.

    ``CB_coord`` is accepted for interface compatibility but never read.
    """
    # Unit bisector of the CA->N and CA->C directions.
    bisector = vector_unit(N_coord, CA_coord) + vector_unit(C_coord, CA_coord)
    bisector_unit = bisector / np.linalg.norm(bisector)

    # Auxiliary point on the bisector used as the origin of the last step.
    anchor = CA_coord + bisector_unit * 0.841829775235248

    R = 2.1545175870366853  # distance constant (original note: obtained from statistics)
    torsion = 0.5999114448494303  # rotation angle (original note: obtained from statistics)
    angle_confirm = math.pi / 2

    return next_coord(CA_coord, N_coord, anchor, R, angle_confirm, torsion)
216 | 
217 | 
218 | #将预测的CB加入数组
def add_pred_CB(pred_npy_without_CB, CB_whether_exist):
    """Insert a predicted CB row after every 4-row residue that has one.

    ``pred_npy_without_CB`` is read in groups of four rows (taken as N, CA, C
    and O).  Each group is copied to the output; when the matching entry of
    ``CB_whether_exist`` is ``'+'`` the row returned by ``pred_CBcoord`` is
    appended right after it.
    """
    rows = []
    for start in range(0, pred_npy_without_CB.shape[0], 4):
        for offset in range(4):
            rows.append(pred_npy_without_CB[start + offset])
        if CB_whether_exist[start // 4] != '+':
            continue
        n_atom = pred_npy_without_CB[start]
        ca_atom = pred_npy_without_CB[start + 1]
        c_atom = pred_npy_without_CB[start + 2]
        o_atom = pred_npy_without_CB[start + 3]
        # The CB position is derived from the predicted backbone atoms.
        rows.append(pred_CBcoord(n_atom, ca_atom, c_atom, o_atom))
    return array(rows)
235 | 
236 | 
def recovery_infos(pred_array, CA_infos, backbone_path):
    """Write a backbone PDB file from predicted coordinates and CA template lines.

    Every line of ``CA_infos`` is expanded into N, CA, C and O records, plus a
    CB record unless its residue field is ``GLY``.  Record ``k`` takes its
    coordinates from ``pred_array[k]``, its serial number from its position
    and its element column from the first letter of its atom name; the
    remaining columns are copied from the template line.

    Fields are edited as whitespace-split tokens instead of through
    ``str.replace`` on the whole line, so text that also occurs elsewhere in
    the line (chain ``C``, residue ``CYS``, an occupancy equal to a
    coordinate) is no longer rewritten by accident.  The output file is opened
    only once all lines are built and is closed even if writing fails.

    Args:
        pred_array: coordinates indexable as ``[k][0..2]``, one row per output
            record.  Values are rounded to three decimals in place.
        CA_infos: template lines with at least 12 whitespace-separated fields.
        backbone_path: path of the file to write.

    Raises:
        IndexError: if a template line has fewer than 12 fields or
            ``pred_array`` has fewer rows than there are records to write.
    """
    # Build the PDB skeleton: one token list per output atom.
    records = []
    for template in CA_infos:
        tokens = template.split()
        atom_names = ["N", tokens[2], "C", "O"]
        if tokens[3] != "GLY":
            atom_names.append("CB")
        for atom_name in atom_names:
            record = list(tokens)
            record[2] = atom_name
            records.append(record)

    # Round the predicted coordinates to three decimals (in place).
    for row in range(len(pred_array)):
        for axis in range(3):
            pred_array[row][axis] = "%.3f" % pred_array[row][axis]

    lines = []
    for position, t in enumerate(records):
        # Substitute the coordinates, padding the fraction to three digits.
        for axis in range(3):
            text = str(pred_array[position][axis])
            parts = text.split(".")
            if len(parts[1]) < 3:
                text = parts[0] + "." + parts[1].ljust(3, '0')
            t[axis + 6] = text

        # The element column follows the first letter of the atom name.
        t[11] = t[2][0]

        # Serial number and column layout.
        lines.append(
            t[0].ljust(7, ' ') + str(position + 1).rjust(4, ' ') + "  "
            + t[2].ljust(3, ' ') + t[3].rjust(4, ' ') + " "
            + t[4].ljust(2, ' ') + t[5].rjust(3, ' ')
            + t[6].rjust(12, ' ') + t[7].rjust(8, ' ') + t[8].rjust(8, ' ')
            + "  " + t[9].ljust(5, ' ') + t[10].ljust(16, ' ') + t[11]
        )

    with open(backbone_path, 'w') as backbone:
        backbone.writelines(line + "\n" for line in lines)
297 | 
if __name__ == "__main__":
    # Script entry point.  Stage 1 builds distance-window files, stage 2 loads
    # the saved models, stage 3 averages their sin/cos predictions and rebuilds
    # backbone coordinates, which are written out as PDB files.
    # NOTE(review): atoms_infos, DistanceWindow, DataLoader, Coordinate,
    # backbone_rebuild_separated_torsion, device, path_CA and
    # distance_window_path are not defined in this part of the file.
    CB_whether_exist_all = []
    # Extract the coordinates and compute the distance windows.
    # time_statics = np.zeros((100, 4))
    count = 0
    # book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    # sheet = book.add_sheet('time_statics', cell_overwrite_ok=True)
    # for iter in range(100):
    start1 = time.time()
    # for file_name in os.listdir(path_CA):
    # NOTE(review): this handle is never closed.
    f = open('D:\\backbone_prediction\\CA_infoglp\\orign.txt', 'r')
    file = f.readlines()
    # First whitespace-separated token of every line.
    seqs = [seq.split()[0] for seq in file]
    # filename = [seq.split()[0] for seq in file]
    for i, seq in enumerate(seqs):
        try:
            # file_name = filename[i] + '.pdb'
            # NOTE(review): the structure file and the 223-row cut below are
            # hard-coded, so every iteration reads the same PDB file.
            file_name = '4avz.pdb'
            atoms_info, ground_true_coos = atoms_infos(file_name)
            coord_array, acid_array, CB_whether_exist = extract_coord(atoms_info)
            # CB_whether_exist_all.append(CB_whether_exist)

            distance_window(coord_array[:223], seq, str(i))
            test_dataset = DistanceWindow(
                distance_window_path=distance_window_path)
            data_loader = DataLoader(dataset=test_dataset)
        except Exception as e:
            # Best effort: report the failure and move on to the next entry.
            # NOTE(review): if every iteration fails, data_loader is never
            # bound and the prediction loop below raises NameError.
            print(e)

    end1 = time.time()
    # Fuse the angles predicted by the 50 models.

    models_path = 'D:\\backbone_prediction\\top_models'

    total_acid = 0

    with torch.no_grad():

        models = []
        start2 = time.time()
        for model_name in os.listdir(models_path):
            # NOTE(review): torch.load unpickles the file; only load trusted
            # model files.
            model = torch.load(os.path.join(models_path, model_name), map_location='cuda:0')
            model.eval()
            model.is_training = False
            models.append(model)
        end2 = time.time()

        start3 = time.time()
        for arrays, torsions, output_filename in data_loader:
            total_file = 0

            # Sum the predictions of all loaded models.
            for model in models:
                arrays = arrays.to(device)
                pred_sincos = model(arrays[0]).squeeze(1).transpose(0, 1)
                output = pred_sincos.data.cpu().numpy()
                total_file += output
            # NOTE(review): divides by a fixed 50 rather than len(models).
            total_file = total_file / 50

            # Rebuild the coordinates from the predicted angles.
            start4 = time.time()
            filename = output_filename[0]
            coos = []

            # Read the CA data.
            atoms_info, ground_true_coos_real = atoms_infos(filename.replace('npy', 'pdb'))
            coord_array, acid_array, CB_whether_exist = extract_coord(atoms_info)
            for coo in coord_array:
                coos.append(Coordinate(coo))

            PATH_OUTPUT = path_CA.replace('CA_info', 'backbone')
            pathlib.Path(PATH_OUTPUT).mkdir(parents=True, exist_ok=True)

            pred = total_file
            # Rows 0/1 and 2/3 are used as the two arguments of arctan2 for
            # the C and N torsions respectively.
            torsions_C = np.arctan2(pred[0], pred[1])
            torsions_N = np.arctan2(pred[2], pred[3])
            # Rebuild the backbone structure.
            backbone_pred_without_CB = backbone_rebuild_separated_torsion(coos, torsions_C, torsions_N)

            # The terminal atoms start as zeros, so each check below is true
            # and the three positions are always generated with next_coord.
            ground_true_coos = np.zeros((3, 3))
            # print(backbone_pred_without_CB[1],backbone_pred_without_CB[-2])
            if (ground_true_coos[0] == np.zeros((1, 3))).all():
                ground_true_coos[0] = next_coord(coord_array[1], backbone_pred_without_CB[1], coord_array[0],
                                                             1.45801, 2.124, 2.7)

            if (ground_true_coos[1] == np.zeros((1, 3))).all():
                ground_true_coos[1] = next_coord(coord_array[-2], backbone_pred_without_CB[-2], coord_array[-1],
                                                             1.52326, 1.941, -1.4)

            if (ground_true_coos[2] == np.zeros((1, 3))).all():
                ground_true_coos[2] = next_coord(coord_array[-2], backbone_pred_without_CB[-2], coord_array[-1],
                                                             2.408748478225743, 1.4915450962173677, -1.4)

            # loss[iter][count] = np.array([np.linalg.norm(ground_true_coos[i] - ground_true_coos_real[i]) for i in range(3)])

            # Prepend the first generated atom and append the last two.
            pred_npy_without_CB = np.concatenate((ground_true_coos[0].reshape([1, 3]),
                                                  backbone_pred_without_CB, ground_true_coos[-2:]), axis=0).astype(
                'float32')

            # NOTE(review): CB_whether_exist_all is never filled (the append
            # above is commented out), so this lookup raises IndexError.
            CB_whether_exist = CB_whether_exist_all[count]
            count += 1
            pred_array = add_pred_CB(pred_npy_without_CB, CB_whether_exist)

            backbone_path = os.path.join(PATH_OUTPUT, filename.replace('npy', 'pdb'))
            recovery_infos(pred_array, atoms_info, backbone_path)
            end4 = time.time()
            # Rebuild time of the most recent file only.
            time_total = end4 - start4
        end3 = time.time()
        # time_statics[iter] = np.array([end1-start1, end2-start2, end3-start3-time_total, time_total])
        # NOTE(review): time_total is unbound when data_loader yields nothing.
        print(end1-start1, end2-start2, end3-start3-time_total, time_total)

    # Write the loss to an Excel sheet.
    # loss_mean = np.mean(loss, axis=0)
    # for i in range(loss_mean.shape[0]):
    #     for j in range(loss_mean.shape[1]):
    #         sheet.write(i+1,j+1,loss_mean[i][j])
    # book.save('D://backbone_prediction//NC_random1.xls') n
    # print(time_statics)
    # for i in range(100):
    #     sheet.write(i + 1, 0, i+1)
    #     for j in range(4):
    #         sheet.write(i+1, j+1, time_statics[i][j])
    # book.save('D://backbone_prediction//time_stattics_cpu.xls')
420 | 


--------------------------------------------------------------------------------
/transform.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import os
  3 | import time
  4 | import numpy as np
  5 | import tensorflow as tf
  6 | import matplotlib.pyplot as plt
  7 | import pathlib
  8 | #from scipy.stats import norm as nm
  9 | from multiprocessing import Pool
 10 | import argparse
 11 | import matplotlib.image
 12 | # from .arraylize import Arraylize
 13 | 
 14 | 
 15 | parser = argparse.ArgumentParser(description='manual to this script')
 16 | parser.add_argument('--resolution', type=int, default='256',
 17 |                     help='output resolution')
 18 | parser.add_argument('--dataset_path', type=str, default=os.getcwd(),
 19 |                     help='path of dataset')
 20 | parser.add_argument('--output_path', type=str, default=os.getcwd()+'/processed_data',
 21 |                     help='path of output')
 22 | parser.add_argument('--dataset', type=str, default='cif_filtered',
 23 |                     help='name of dataset folder, bc-30-1_CA|bc-30-1_chains|cif_filtered')
 24 | parser.add_argument('--input_type', type=str, default='cif',
 25 |                     help='type of input file, cif|pdb')
 26 | parser.add_argument('--output_type', type=str, default='image',
 27 |                     help='image or distance_map, images|distance_map')
 28 | parser.add_argument('--axis_range', type=int, default='64',
 29 |                     help='map range of structures, 42|64')
 30 | parser.add_argument('--multi_process', type=bool, default=True,
 31 |                     help='multi process or not')
 32 | parser.add_argument('--multi_atom', type=bool, default=False,
 33 |                     help='input all backbone atoms or CA only')
 34 | parser.add_argument('--move2center', type=bool, default=True,
 35 |                     help='relocate the center of proteins to the center of coordinate system')
 36 | parser.add_argument('--redistribute', type=bool, default=False,
 37 |                     help='redistribute the original distribution according to normal distribution')
 38 | parser.add_argument('--relative_number', type=bool, default=False,
 39 |                     help='mark dots with relative serial number')
 40 | parser.add_argument('--draw_connection', type=bool, default=True,
 41 |                     help='draw dots connection or not')
 42 | parser.add_argument('--aminoacid_message', type=bool, default=True,
 43 |                     help='mark amino acid with hydropathicity, bulkiness and flexibility or 1.')
 44 | parser.add_argument('--redistribute_rate', type=float, default='1.4',
 45 |                     help='coefficient of redistribution amplitude')
 46 | args = parser.parse_args()
 47 | 
 48 | res = args.resolution
 49 | ar = args.axis_range
 50 | s = ar / res  # scale=axis_range/resolution
 51 | input_folder = args.dataset_path + '/' + args.dataset
 52 | AMINO_ACIDS = ['ALA', 'ARG', 'ASN', 'ASP', 'CYS',
 53 |                'GLN', 'GLU', 'GLY', 'HIS', 'ILE',
 54 |                'LEU', 'LYS', 'MET', 'PHE', 'PRO',
 55 |                'SER', 'THR', 'TRP', 'TYR', 'VAL']
 56 | AA_HYDROPATHICITY_INDEX = {
 57 |     'ARG': -4.5,
 58 |     'LYS': -3.9,
 59 |     'ASN': -3.5,
 60 |     'ASP': -3.5,
 61 |     'GLN': -3.5,
 62 |     'GLU': -3.5,
 63 |     'HIS': -3.2,
 64 |     'PRO': -1.6,
 65 |     'TYR': -1.3,
 66 |     'TRP': -0.9,
 67 |     'SER': -0.8,
 68 |     'THR': -0.7,
 69 |     'GLY': -0.4,
 70 |     'ALA': 1.8,
 71 |     'MET': 1.9,
 72 |     'CYS': 2.5,
 73 |     'PHE': 2.8,
 74 |     'LEU': 3.8,
 75 |     'VAL': 4.2,
 76 |     'ILE': 4.5,
 77 | }
 78 | AA_BULKINESS_INDEX = {
 79 |     'ARG': 14.28,
 80 |     'LYS': 15.71,
 81 |     'ASN': 12.82,
 82 |     'ASP': 11.68,
 83 |     'GLN': 14.45,
 84 |     'GLU': 13.57,
 85 |     'HIS': 13.69,
 86 |     'PRO': 17.43,
 87 |     'TYR': 18.03,
 88 |     'TRP': 21.67,
 89 |     'SER': 9.47,
 90 |     'THR': 15.77,
 91 |     'GLY': 3.4,
 92 |     'ALA': 11.5,
 93 |     'MET': 16.25,
 94 |     'CYS': 13.46,
 95 |     'PHE': 19.8,
 96 |     'LEU': 21.4,
 97 |     'VAL': 21.57,
 98 |     'ILE': 21.4,
 99 | }
100 | AA_FLEXIBILITY_INDEX = {
101 |     'ARG': 2.6,
102 |     'LYS': 1.9,
103 |     'ASN': 14.,
104 |     'ASP': 12.,
105 |     'GLN': 4.8,
106 |     'GLU': 5.4,
107 |     'HIS': 4.,
108 |     'PRO': 0.05,
109 |     'TYR': 0.05,
110 |     'TRP': 0.05,
111 |     'SER': 19.,
112 |     'THR': 9.3,
113 |     'GLY': 23.,
114 |     'ALA': 14.,
115 |     'MET': 0.05,
116 |     'CYS': 0.05,
117 |     'PHE': 7.5,
118 |     'LEU': 5.1,
119 |     'VAL': 2.6,
120 |     'ILE': 1.6,
121 | }
122 | AMINO_ACID_NUMBERS = {}
123 | if args.aminoacid_message:
124 |     for aa in AMINO_ACIDS:
125 |         AMINO_ACID_NUMBERS.update({aa: [(5.5-AA_HYDROPATHICITY_INDEX[aa]) / 10 * 255.,
126 |                                         AA_BULKINESS_INDEX[aa] / 21.67 * 255.,
127 |                                         (25.-AA_FLEXIBILITY_INDEX[aa]) / 25. * 255.]})
128 | else:
129 |     for aa in AMINO_ACIDS:
130 |         AMINO_ACID_NUMBERS.update({aa: [1.]})
131 | ary_dim = 2 + len(AMINO_ACID_NUMBERS[AMINO_ACIDS[0]])
132 | 
133 | 
class Atom:
    """One atom record: residue name, index, position, atom type and element.

    ``index`` is converted with ``int`` and the coordinates with ``float``;
    the other values are stored as given.
    """

    def __init__(self, aminoacid, index, x, y, z, atom_type='CA', element='C'):
        self.index = int(index)
        self.aa = aminoacid
        self.x, self.y, self.z = float(x), float(y), float(z)
        self.type = atom_type
        self.element = element
143 | 
144 | 
def readfile(filename, path):
    """Return the lines of a ``.cif`` or ``.pdb`` file.

    The file ``path + '/' + filename`` is always opened, so a missing file
    raises for any extension.  It is now closed on every path; previously the
    handle leaked whenever the lines were returned.

    Returns:
        The list of lines for ``.cif``/``.pdb`` names, ``None`` for any other
        extension.
    """
    with open(path + '/' + filename, 'r') as handle:
        if os.path.splitext(filename)[1] in ('.cif', '.pdb'):
            return handle.readlines()
    return None
152 | 
153 | 
154 | 
def extract_cif(cif_message):
    """Collect the N, CA and C atoms from whitespace-separated cif lines.

    Field 3 of each line is the atom name used for filtering; fields 5, 8,
    10-12, 3 and 2 are passed to ``Atom`` as residue, index, x, y, z, atom
    type and element.
    """
    rows = (line.split() for line in cif_message)
    return [
        Atom(row[5], row[8], row[10], row[11], row[12], row[3], row[2])
        for row in rows
        if row[3] in ['CA', 'C', 'N']
    ]
163 | 
164 | 
def extract_ca_cif(cif_message):
    """Collect only the CA atoms from whitespace-separated cif lines.

    Fields 5, 8 and 10-12 of each matching line are passed to ``Atom`` as
    residue, index, x, y and z; atom type and element keep their defaults.
    """
    rows = (line.split() for line in cif_message)
    return [
        Atom(row[5], row[8], row[10], row[11], row[12])
        for row in rows
        if row[3] == 'CA'
    ]
172 | 
173 | 
def extract_pdb(pdb_message):
    """Collect the N, CA and C atoms from fixed-column PDB lines.

    The atom index is read from the residue sequence number columns
    (``line[22:26]``).  The previous code passed the atom-name columns to
    ``Atom``, whose ``int()`` conversion fails on text such as ``'CA '``.
    Atom type and element are stripped of padding so that comparisons like
    ``atom.type == 'CA'`` work, matching the cif extractors.  Only ``ATOM``
    and ``HETATM`` records are considered.
    """
    atoms = []
    for line in pdb_message:
        if not line.startswith(('ATOM', 'HETATM')):
            continue
        if line[13:15] in ('N ', 'CA', 'C '):
            atoms.append(Atom(line[17:20], line[22:26], line[30:38],
                              line[38:46], line[46:54],
                              line[13:16].strip(), line[76:78].strip()))
    return atoms
181 | 
182 | 
def extract_ca_pdb(pdb_message):
    """Collect only the CA atoms from fixed-column PDB lines.

    The atom index is read from the residue sequence number columns
    (``line[22:26]``).  The previous code passed the atom-name columns to
    ``Atom``, whose ``int()`` conversion fails on text such as ``'CA '``.
    Only ``ATOM`` and ``HETATM`` records are considered.
    """
    atoms = []
    for line in pdb_message:
        if not line.startswith(('ATOM', 'HETATM')):
            continue
        if line[13:15] == 'CA':
            atoms.append(Atom(line[17:20], line[22:26], line[30:38],
                              line[38:46], line[46:54]))
    return atoms
189 | 
190 | 
def extract_message(message, message_type):
    """Dispatch to the extractor matching the file type and ``--multi_atom``.

    Returns ``None`` when ``message_type`` is neither ``'pdb'`` nor ``'cif'``.
    """
    if message_type == 'pdb':
        extractor = extract_pdb if args.multi_atom else extract_ca_pdb
    elif message_type == 'cif':
        extractor = extract_cif if args.multi_atom else extract_ca_cif
    else:
        return None
    return extractor(message)
202 | 
203 | 
def find_head(atoms):
    """Return the first atom whose type is ``'CA'``, or ``None`` if there is none."""
    return next((atom for atom in atoms if atom.type == 'CA'), None)
208 | 
209 | 
def find_tail(atoms):
    """Return the last atom whose type is ``'CA'``, or ``None`` if there is none."""
    for atom in reversed(atoms):
        if atom.type == 'CA':
            return atom
    return None
214 | 
215 | 
def rotation_axis(head):
    """Return the two opposite candidate rotation axes for the head atom.

    The result is ``[(a, b, c), (-a, -b, -c)]``, computed from the head
    position and the module-level ``res``, ``ar`` and ``s``.
    NOTE(review): the formula divides by ``x - y`` and ``y - x``, so it fails
    when the head has equal x and y.
    """
    x, y, z = head.x, head.y, head.z
    root = (x ** 2 + y ** 2 + z ** 2 - 2 * s ** 2) ** 0.5
    term_y = y * res * root / ar - z
    term_x = x * res * root / ar - z
    c = ((y - x) ** 2 / (term_y ** 2 + term_x ** 2 + (y - x) ** 2)) ** 0.5
    a = term_y / (x - y) * c
    b = term_x / (y - x) * c
    return [(a, b, c), (-a, -b, -c)]  # rotation axes
228 | 
229 | 
def rotation_angle(head):
    """Return the rotation angle, in radians, computed for the head atom.

    It is the arc cosine of ``((x + y) * s + z * root) / (x² + y² + z²)``
    with ``root = sqrt(x² + y² + z² - 2 s²)``.
    """
    x, y, z = head.x, head.y, head.z
    root = (x ** 2 + y ** 2 + z ** 2 - 2 * s ** 2) ** 0.5
    numerator = (x + y) * s + z * root
    return math.acos(numerator / (x ** 2 + y ** 2 + z ** 2))  # rotation angle
238 | 
239 | 
def rotation(u, v, w, t, axis):
    """Rotate the point ``(u, v, w)`` by angle ``t`` about ``axis``.

    Uses Rodrigues' rotation formula; ``axis`` is ``(a, b, c)`` and is used
    as given.  Returns the rotated ``(x, y, z)`` tuple.
    """
    a, b, c = axis
    cos_t = math.cos(t)
    sin_t = math.sin(t)
    along_axis = a*u+b*v+c*w  # projection of the point onto the axis
    rx = u*cos_t+(b*w-c*v)*sin_t+a*along_axis*(1-cos_t)
    ry = v*cos_t+(c*u-a*w)*sin_t+b*along_axis*(1-cos_t)
    rz = w*cos_t+(a*v-b*u)*sin_t+c*along_axis*(1-cos_t)
    return rx, ry, rz
247 | 
248 | 
def relocate(atoms):
    """Centre the atoms on the head/tail midpoint, then rotate them in place.

    The midpoint of the first and last CA atoms is moved to the origin.  The
    head is then rotated about both candidate axes, and the axis that brings
    its x and y closest to ``s`` is applied to every atom.  Returns the same
    list.
    """
    head = find_head(atoms)
    tail = find_tail(atoms)
    x_o = (head.x + tail.x) / 2
    y_o = (head.y + tail.y) / 2
    z_o = (head.z + tail.z) / 2
    for atom in atoms:
        atom.x -= x_o
        atom.y -= y_o
        atom.z -= z_o

    axes = rotation_axis(head)
    angle = rotation_angle(head)
    trial_heads = [rotation(head.x, head.y, head.z, angle, axis) for axis in axes]

    def miss(point):
        # Combined distance of the rotated head's x and y from s.
        return abs(point[0] - s) + abs(point[1] - s)

    chosen = axes[0] if miss(trial_heads[0]) < miss(trial_heads[1]) else axes[1]
    for atom in atoms:
        atom.x, atom.y, atom.z = rotation(atom.x, atom.y, atom.z, angle, chosen)
    return atoms
271 | 
272 | 
def move2center(atoms):
    """Shift all atoms so a centre fitted to the CA atoms sits at the origin.

    The centre is found by gradient descent on the mean distance from a point
    to the CA atoms, using the TensorFlow 1.x graph API.  The session is now
    closed before ``tf.reset_default_graph()`` runs, and the graph is reset
    even if the optimisation raises; previously the session was left open.

    NOTE(review): the loop stops only when two losses four steps apart are
    exactly equal, so it may not terminate if the loss keeps oscillating.
    Returns the same list of atoms.
    """
    coordinates = np.array(
        [[atom.x, atom.y, atom.z] for atom in atoms if atom.type == 'CA'])
    try:
        center = tf.Variable(tf.zeros([1, 3]))
        distances = coordinates-center
        loss = tf.reduce_mean(tf.sqrt(tf.reduce_sum(tf.square(distances), 1)))
        optimizer = tf.train.GradientDescentOptimizer(0.5)
        train = optimizer.minimize(loss)
        init = tf.global_variables_initializer()
        with tf.Session() as sess:
            sess.run(init)
            losses = []
            # Warm-up steps so the comparison four steps back is defined.
            for step in range(10):
                sess.run(train)
                losses.append(sess.run(loss))
            while losses[-1] != losses[-5]:
                sess.run(train)
                losses.append(sess.run(loss))
            final_center = sess.run(center)[0]
    finally:
        # Drop the graph built above so repeated calls do not accumulate nodes.
        tf.reset_default_graph()
    for atom in atoms:
        atom.x -= final_center[0]
        atom.y -= final_center[1]
        atom.z -= final_center[2]
    return atoms
301 | 
302 | 
def sign(x):
    """Return ``-1`` for negative ``x`` and ``1`` otherwise (zero included)."""
    return -1 if x < 0 else 1
308 | 
309 | 
def close_neibor(array, x_ary, y_ary, dot, dis_x, dis_y, rec):
    """Place ``dot`` in the first free cell found around ``(x_ary, y_ary)``.

    The eight directions are tried in an order that depends on the signs of
    ``dis_x``/``dis_y`` and on which of them is smaller in magnitude, at
    growing step sizes.  A cell is free when its channel 2 is zero.  The
    chosen cell is written, recorded in ``rec`` and ``array`` is returned.
    Positions that raise ``IndexError`` are reported and skipped.
    NOTE(review): negative indices wrap around on numpy arrays instead of
    raising, so cells on the opposite edge can be selected — confirm intent.
    """
    x_step = sign(dis_x)
    y_step = sign(dis_y)
    if abs(dis_x) < abs(dis_y):
        directions = [(0, y_step), (x_step, 0), (x_step, y_step), (-x_step, 0),
                      (0, -y_step), (-x_step, y_step), (x_step, -y_step), (-x_step, -y_step)]
    else:
        directions = [(x_step, 0), (0, y_step), (x_step, y_step), (0, -y_step),
                      (-x_step, 0), (x_step, -y_step), (-x_step, y_step), (-x_step, -y_step)]

    step = 1
    while True:
        for dx, dy in directions:
            shift_x = dx * step
            shift_y = dy * step
            try:
                if array[x_ary + shift_x, y_ary + shift_y, 2] == 0:
                    array[x_ary + shift_x, y_ary + shift_y] = (
                        [dot.z, dot.index] + AMINO_ACID_NUMBERS.get(dot.aa))
                    rec.update({(x_ary + shift_x, y_ary + shift_y): dot})
                    return array
            except IndexError:
                print('dot(%d+%d,%d+%d) is out of the edge' % (x_ary, shift_x, y_ary, shift_y))
        # Every neighbour at this distance is taken; widen the search.
        step += 1
332 | 
333 | 
def lattice_battle(array, x_ary, y_ary, dot1, dot2, rec):
    """Resolve two atoms competing for the cell ``(x_ary, y_ary)``.

    ``dot1`` is the current occupant and ``dot2`` the newcomer.  The atom
    whose x/y position lies closer to the cell centre keeps the cell; the
    other one is moved to a nearby free cell by ``close_neibor``.
    """
    def centre_offset(value):
        # Position inside a cell of width 2*s, measured from the cell centre.
        return value / (2 * s) % 1 - 0.5

    old_dx, old_dy = centre_offset(dot1.x), centre_offset(dot1.y)
    new_dx, new_dy = centre_offset(dot2.x), centre_offset(dot2.y)

    if old_dx ** 2 + old_dy ** 2 > new_dx ** 2 + new_dy ** 2:
        # The newcomer is closer: evict the occupant, then take the cell.
        array = close_neibor(array, x_ary, y_ary, dot1, old_dx, old_dy, rec)
        array[x_ary, y_ary] = [dot2.z, dot2.index] + AMINO_ACID_NUMBERS[dot2.aa]
        rec[(x_ary, y_ary)] = dot2
        return array
    return close_neibor(array, x_ary, y_ary, dot2, new_dx, new_dy, rec)
347 | 
348 | 
def draw_atom(x, y, dot, array, rec):
    """Write ``dot`` into cell ``(x, y)`` if the cell's last channel is zero.

    On success the atom is also recorded in ``rec`` under ``(x, y)``;
    otherwise nothing changes.
    """
    if array[x, y, -1] != 0:
        return
    array[x, y] = [dot.z, dot.index] + AMINO_ACID_NUMBERS[dot.aa]
    rec[(x, y)] = dot
353 | 
354 | 
def arraylize(atoms, array_dim):
    """Rasterise atoms onto a ``res x res x array_dim`` grid.

    Each atom is mapped to a cell from its x/y position; when the cell is
    already recorded as occupied the conflict is settled by
    ``lattice_battle``.  Returns the grid and the ``{(x, y): atom}`` record.
    """
    grid = np.zeros([res, res, array_dim], dtype=float, order='C')
    rec = {}  # atoms record
    for atom in atoms:
        x_ary = int((atom.x + ar) // (2 * s))
        y_ary = int((atom.y + ar) // (2 * s))
        occupant = rec.get((x_ary, y_ary))
        if occupant:
            grid = lattice_battle(grid, x_ary, y_ary, occupant, atom, rec)
        else:
            draw_atom(x_ary, y_ary, atom, grid, rec)
    return grid, rec
366 | 
367 | 
368 | # def values_sta(path):
369 | #     xs = []
370 | #     ys = []
371 | #     for filename in os.listdir(path):
372 | #         atoms = move2center(relocate(extract_cif(readfile(filename, path))))
373 | #         for atom in atoms:
374 | #             xs.append(atom.x)
375 | #             ys.append(atom.y)
376 | #     return xs, ys
377 | 
378 | 
def normal_dis(values, var, coefficient):
    """Split ``values`` into ``res`` bins bounded by normal-distribution quantiles.

    ``values`` is sorted in place.  Bin ``i`` ends at the ``(i + 1) / res``
    quantile of a zero-mean normal distribution with standard deviation
    ``sqrt(var) * coefficient``; bins reached after all values are consumed
    are empty lists.

    ``nm`` is imported here because the module-level import of
    ``scipy.stats.norm`` is commented out, which made this function raise
    ``NameError``.

    Returns:
        A list of ``res`` bins, each a slice of the sorted ``values``.
    """
    from scipy.stats import norm as nm

    values.sort()
    scale = var ** 0.5 * coefficient
    dis = []
    mark = 0
    idx = 0
    for i in range(res):
        cut_point = nm.ppf((i + 1) / res, 0, scale)
        if idx == len(values):
            dis.append([])
            continue
        # Advance past every value that falls below this cut point.
        while idx < len(values) and values[idx] < cut_point:
            idx += 1
        dis.append(values[mark:idx])
        mark = idx
    return dis
400 | 
401 | 
402 | # def redistribute():
403 | 
404 | 
def visual_values_dis(values):
    """Show a bar chart of how many values fall in each of ``res`` equal bins.

    The bins span ``[-ar, ar]``.  ``values`` is scanned in its given order
    and is not sorted here.
    """
    axis_length = 2*ar
    dis = []
    mark = 0
    idx = 0
    for i in range(1, res+1):
        cut_point = (i-res/2)*axis_length/res
        if idx == len(values):
            dis.append([])
            continue
        # Advance past every value that falls below this cut point.
        while idx < len(values) and values[idx] < cut_point:
            idx += 1
        dis.append(values[mark:idx])
        mark = idx
    dis_count = [len(dis[i]) for i in range(res)]
    plt.bar(range(res), dis_count)
    plt.show()
428 | 
429 | 
def vis_normal_dis(values, var, coefficient):
    """Show a bar chart of value counts per normal-quantile bin.

    ``values`` is sorted in place and split into ``res`` bins whose upper
    bounds are the ``(i + 1) / res`` quantiles of a zero-mean normal
    distribution with standard deviation ``sqrt(var) * coefficient``.

    ``nm`` is imported here because the module-level import of
    ``scipy.stats.norm`` is commented out, which made this function raise
    ``NameError``.
    """
    from scipy.stats import norm as nm

    values.sort()
    scale = var**0.5*coefficient
    dis_count = []
    mark = 0
    idx = 0
    for i in range(res):
        cut_point = nm.ppf((i+1)/res, 0, scale)
        # Advance past every value that falls below this cut point.
        while idx < len(values) and values[idx] < cut_point:
            idx += 1
        dis_count.append(len(values[mark:idx]))
        mark = idx
    plt.bar(range(res), dis_count)
    plt.show()
454 | 
455 | 
def draw_dot(x, y, dot1, z_add, idx_add, array):
    """Write an interpolated connection dot into cell ``(x, y)`` if it is free.

    The cell receives ``dot1.z + z_add`` and ``dot1.index + idx_add`` followed
    by zeros.  The number of zeros follows the channel count of ``array``
    instead of being fixed at three, so 3-channel grids (built when
    ``--aminoacid_message`` is off) no longer fail on assignment.  A cell
    counts as free when its channel 2 is zero.
    """
    if array[x, y, 2] == 0:
        padding = [0] * (array.shape[-1] - 2)
        array[x, y] = [dot1.z + z_add, dot1.index + idx_add] + padding
459 | 
460 | 
def dots_connection(dot1, dot2, array, site):
    """Draw interpolated dots on the grid between two placed atoms.

    ``site`` maps each atom to its ``[x, y]`` cell.  Nothing is drawn when
    the cells are at most two steps apart in total.  Otherwise one dot is
    drawn per step along the longer axis, with z and index offsets growing
    linearly from ``dot1`` towards ``dot2``.
    """
    x = site[dot1][0]
    y = site[dot1][1]
    z_s = dot2.z - dot1.z
    x_r = sign(site[dot2][0] - x)
    y_r = sign(site[dot2][1] - y)
    x_s = abs(site[dot2][0] - x)
    y_s = abs(site[dot2][1] - y)
    long_side = max(x_s, y_s)
    short_side = min(x_s, y_s)
    dis_c = long_side + 1
    if x_s + y_s <= 2:
        return

    for i in range(long_side):
        l = i + 1
        z_add = z_s*l/dis_c
        idx_add = l/dis_c
        if short_side <= 1:
            # Nearly axis-aligned: walk along the longer axis only.
            if x_s > y_s:
                draw_dot(x + l*x_r, y, dot1, z_add, idx_add, array)
            else:
                draw_dot(x, y + l*y_r, dot1, z_add, idx_add, array)
            continue

        t = long_side // short_side
        remainder = long_side % short_side
        if x_s > y_s:
            j = [l, i//t, l, y_s]
        else:
            j = [i//t, l, x_s, l]
        # The last `remainder` steps pin the shorter axis to its end value.
        if i < long_side - remainder:
            draw_dot(x + j[0] * x_r, y + j[1] * y_r, dot1, z_add, idx_add, array)
        else:
            draw_dot(x + j[2] * x_r, y + j[3] * y_r, dot1, z_add, idx_add, array)
489 | 
490 | 
def draw_connection(atoms, array, rec):
    """Connect every pair of consecutive atoms on the grid.

    ``rec`` maps ``(x, y)`` cells to atoms; it is inverted into an
    atom -> ``[x, y]`` lookup and handed to ``dots_connection`` for each
    neighbouring pair ``atoms[i], atoms[i + 1]``.

    Args:
        atoms: ordered sequence of atoms (supports ``len`` and indexing).
        array: grid that ``dots_connection`` draws into.
        rec: dict keyed by ``(x, y)`` tuples with atoms as values.
    """
    site = {atom: [x, y] for (x, y), atom in rec.items()}
    for nxt in range(1, len(atoms)):
        dots_connection(atoms[nxt - 1], atoms[nxt], array, site)
497 | 
498 | 
def write_log(path):
    """Print the run's argument values and append them to a log file.

    Reads the listed attributes from the module-level ``args`` object
    (presumably the parsed command-line namespace -- confirm at its
    definition), prints one ``name = value`` line per attribute, and appends
    a timestamp followed by those lines to ``<path>/args_log.txt``.

    Args:
        path: directory (as a string) that holds ``args_log.txt``.
    """
    arg_names = ('dataset', 'resolution', 'input_type', 'output_type', 'axis_range', 'multi_atom',
                 'move2center', 'redistribute', 'redistribute_rate', 'relative_number', 'draw_connection',
                 'aminoacid_message')
    arg_values = [getattr(args, name) for name in arg_names]
    entries = [time.strftime("%Y%m%d_%H%M", time.localtime())]
    for name, value in zip(arg_names, arg_values):
        entry = "%s = %s" % (name, str(value))
        print(entry)
        entries.append(entry)
    # Trailing blank lines separate successive runs in the appended log.
    entries.append('\n\n\n')
    with open(path + '/args_log.txt', 'a') as log_writer:
        log_writer.write('\n'.join(entries))
513 | 
514 | 
def process():
    """Convert every file in ``input_folder`` according to the global ``args``.

    Creates ``<output_path>/<dataset>/<timestamp>/``, logs the arguments via
    ``write_log`` and then dispatches on ``args.output_type``:

    * ``'image'`` without ``args.redistribute``: each file is parsed,
      optionally centred, turned into an array by ``arraylize`` and saved
      with ``np.save`` under the input name with ``.cif`` replaced by
      ``.npy``.
    * ``'image'`` with ``args.redistribute``: files are parsed and their
      x/y coordinates collected, but nothing is written.
    * ``'distance_map'`` with ``args.multi_atom``: files are parsed only.

    Relies on the module-level names ``args``, ``input_folder`` and
    ``ary_dim``.
    """
    log_dir = args.output_path + '/' + args.dataset
    output_dir = log_dir + '/' + time.strftime("%Y%m%d_%H%M", time.localtime())
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
    write_log(log_dir)

    output_type = args.output_type
    if output_type == 'distance_map':
        if args.multi_atom:
            # NOTE(review): the parsed atoms are discarded; this branch
            # produces no output yet.
            for filename in os.listdir(input_folder):
                extract_message(readfile(filename, input_folder), args.input_type)
        return
    if output_type != 'image':
        return

    if args.redistribute:
        # NOTE(review): coordinates are gathered but never used afterwards;
        # this branch produces no output yet.
        atoms_dic = {}
        xs, ys = [], []
        for filename in os.listdir(input_folder):
            atoms = relocate(extract_message(readfile(filename, input_folder), args.input_type))
            if args.move2center:
                atoms = move2center(atoms)
            for atom in atoms:
                xs.append(atom.x)
                ys.append(atom.y)
            atoms_dic[filename] = atoms
        return

    for filename in os.listdir(input_folder):
        atoms = relocate(extract_message(readfile(filename, input_folder), args.input_type))
        if args.move2center:
            atoms = move2center(atoms)
            # Affine rescale of z after centring: z -> (z + 64) * 2 - 2.
            for atom in atoms:
                atom.z = (atom.z + 64.) * 2. - 2.
        array, rec = arraylize(atoms, ary_dim)
        if args.draw_connection:
            draw_connection(atoms, array, rec)
        if args.relative_number:
            # Scale channel 1 by the atom count (+1).
            array[:, :, 1] /= (len(atoms) + 1)
        output_name = filename.replace('.cif', '.npy')
        np.save(output_dir + '/' + output_name, array)
561 | 
562 | 
if __name__ == '__main__':
    # Script entry point: run `process` in a worker process and wait for it.
    print('Parent process %s.' % os.getpid())
    p = Pool(3)
    # Submit the callable itself. Writing `process()` here would run the
    # whole job synchronously in the parent and hand its return value
    # (None) to the pool as the "function" to execute.
    result = p.apply_async(process)
    print('Waiting for all subprocesses done...')
    p.close()
    p.join()
    # Re-raise any exception from the worker; without this a failure in
    # `process` would pass silently.
    result.get()
    print('All subprocesses done.')
571 | 
572 | 


--------------------------------------------------------------------------------
/angle_computation.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import numpy as np\n",
 10 |     "import math\n",
 11 |     "import os \n",
 12 |     "import random\n",
 13 |     "import argparse\n",
 14 |     "from scipy.spatial.distance import pdist\n",
 15 |     "from scipy.spatial.distance import squareform\n",
 16 |     "from scipy.spatial.distance import cdist\n",
 17 |     "\n",
 18 |     "parser = argparse.ArgumentParser()\n",
 19 |     "parser.add_argument(\"--verbosity\", help=\"increase output verbosity\")\n",
 20 |     "parser.add_argument('--input_type', type=str, default='pn',\n",
 21 |     "                    help='type of input file, cif|pdb')\n",
 22 |     "args = parser.parse_args(args=[])\n",
 23 |     "\n",
 24 |     "\n",
 25 |     "#读取Protein-Net坐标\n",
 26 |     "def atom_net(coord,atom_id):\n",
 27 |     "    atom = np.array([float(coord[0].split()[atom_id])] + [float(coord[1].split()[atom_id])]+ [float(coord[2].split()[atom_id])])/100\n",
 28 |     "    return atom\n",
 29 |     "\n",
 30 |     "\n",
 31 |     "#读取cif坐标\n",
 32 |     "def atom_cif(atoms, id_):\n",
 33 |     "    atom = np.array([float(atoms[id_].split()[10])] + [float(atoms[id_].split()[11])] + [float(atoms[id_].split()[12])])\n",
 34 |     "    return atom\n",
 35 |     "\n",
 36 |     "\n",
 37 |     "#根据两点求单位向量\n",
 38 |     "def vector_unit(vector_1,vector_2):\n",
 39 |     "    bond_vector_2 = vector_1 - vector_2 \n",
 40 |     "    bond_length_2 = np.linalg.norm(bond_vector_2)\n",
 41 |     "    return bond_vector_2 / bond_length_2\n",
 42 |     "\n",
 43 |     "\n",
 44 |     "#计算标准法向量 \n",
 45 |     "def normal_vector_(B, C, D):\n",
 46 |     "    U_1 = vector_unit(C,B) ; U = vector_unit(D,B)    \n",
 47 |     "    N   = np.cross(U_1,U) / np.linalg.norm(np.cross(U_1,U)) \n",
 48 |     "    return  N  \n",
 49 |     "\n",
 50 |     "\n",
 51 |     "#根据矩阵坐标求距离矩阵 \n",
 52 |     "def contact_martix(A):\n",
 53 |     "    # A是一个向量矩阵：euclidean代表欧式距离\n",
 54 |     "    distA=pdist(A,metric='euclidean')\n",
 55 |     "    # 将distA数组变成一个矩阵\n",
 56 |     "    distB = squareform(distA)\n",
 57 |     "    return distB\n",
 58 |     "\n",
 59 |     "\n",
 60 |     "#计算旋转角和坐标转换权重\n",
 61 |     "def torsion_angle(A, B, C, D):\n",
 62 |     "    #计算法向量\n",
 63 |     "    U_2 = vector_unit(B,A) ; U_1 = vector_unit(C,B) ; U = vector_unit(D,C)    \n",
 64 |     "    N   = np.cross(U_1,U)   / np.linalg.norm(np.cross(U_1,U)) \n",
 65 |     "    N_1 = np.cross(U_2,U_1) / np.linalg.norm(np.cross(U_2,U_1))\n",
 66 |     "    m_weight = np.array([U_1 , np.cross(N_1,U_1) , N_1]) \n",
 67 |     "    #torsion_angle\n",
 68 |     "    angle = np.sign(np.dot(U_2,N)) * math.acos(np.dot(N_1,N))#+np.random.normal(loc=0.0, scale=1, size=None)*math.pi/18\n",
 69 |     "    return angle, m_weight\n",
 70 |     "\n",
 71 |     "\n",
 72 |     "#根据真实角度或训练角度预测下一个坐标\n",
 73 |     "def next_coord(A, B, C, D, R, angle_confirm, angle_train):\n",
 74 |     "    #torsion_angle\n",
 75 |     "    angle_real , m = torsion_angle(A, B, C, D)\n",
 76 |     "    #将真实角度或预测角度赋值给torsion\n",
 77 |     "    torsion = angle_real\n",
 78 |     "#     print(\"N——angle:\",angle_real,angle_train)\n",
 79 |     "    angle_martix=[math.cos(math.pi-angle_confirm),\n",
 80 |     "                  math.sin(math.pi-angle_confirm) * math.cos(torsion),\n",
 81 |     "                  math.sin(math.pi-angle_confirm) * math.sin(torsion)]\n",
 82 |     "    #计算下一个坐标\n",
 83 |     "    next_corrd = C + R * np.dot(m.T, angle_martix)\n",
 84 |     "    return next_corrd, torsion\n",
 85 |     "\n",
 86 |     "\n",
 87 |     "#计算旋转角和坐标转换权重\n",
 88 |     "def torsion_angle_C(A, B, C, D):\n",
 89 |     "    #计算法向量\n",
 90 |     "    U_2 = vector_unit(B,A) ; U_1 = vector_unit(C,B) ; U = vector_unit(D,B)    \n",
 91 |     "    N   = np.cross(U_1,U)   / np.linalg.norm(np.cross(U_1,U)) \n",
 92 |     "    N_1 = np.cross(U_2,U_1) / np.linalg.norm(np.cross(U_2,U_1))\n",
 93 |     "    m_weight = np.array([U_1 , np.cross(N_1,U_1) , N_1]) \n",
 94 |     "    #torsion_angle\n",
 95 |     "    angle = np.sign(np.dot(U_2,N)) * math.acos(np.dot(N_1,N))#+np.random.normal(loc=0.0, scale=1, size=None)*math.pi/18\n",
 96 |     "    return angle, m_weight\n",
 97 |     "\n",
 98 |     "\n",
 99 |     "#根据真实角度或训练角度 沿CA-CA轴旋转 预测同一平面的C\n",
100 |     "def next_coord_C(A, B, C, D, R, angle_confirm , angle_train):\n",
101 |     "    #torsion_angle\n",
102 |     "    angle_real , m = torsion_angle_C(A, B, C, D)\n",
103 |     "    #将真实角度或预测角度赋值给torsion\n",
104 |     "    torsion = angle_real\n",
105 |     "#     print(\"C——angle:\",angle_real,angle_train)\n",
106 |     "    angle_martix=[math.cos(math.pi-angle_confirm),\n",
107 |     "                  math.sin(math.pi-angle_confirm) * math.cos(torsion),\n",
108 |     "                  math.sin(math.pi-angle_confirm) * math.sin(torsion)]\n",
109 |     "    #计算下一个坐标\n",
110 |     "    next_corrd = B + R * np.dot(m.T, angle_martix)\n",
111 |     "    return next_corrd, torsion\n",
112 |     "\n",
113 |     "\n",
114 |     "#根据真实坐标计算旋转角\n",
115 |     "def Cartesian_to_angle(path_file):\n",
116 |     "    if args.input_type == 'cif':\n",
117 |     "        f = open(path_file, 'r') ; lines = f.readlines() ; atoms = len(lines)\n",
118 |     "    else:\n",
119 |     "        f     = open(path_file, 'r')   ;   f_line = f.readlines()\n",
120 |     "        xyz   = f_line[-6:-3]          ;   chain  = f_line[-2]\n",
121 |     "        atoms = chain.count('+') * 3   ;   n_zero = chain[0:len(chain)//2].count('-') * 3; \n",
122 |     "        \n",
123 |     "    for i in range(2,atoms-1):\n",
124 |     "        #读取数据\n",
125 |     "        if args.input_type == 'cif':\n",
126 |     "            A = atom_cif(lines, i-2)    ; B = atom_cif(lines, i-1)     ; C = atom_cif(lines, i)      ; D = atom_cif(lines, i+1)\n",
127 |     "        else:\n",
128 |     "            atom_id = n_zero+i  \n",
129 |     "            A = atom_net(xyz, atom_id-2); B = atom_net(xyz, atom_id-1) ; C = atom_net(xyz, atom_id)  ; D = atom_net(xyz, atom_id+1)\n",
130 |     "        #计算单位向量\n",
131 |     "        U_2 = vector_unit(B,A) ; U_1 = vector_unit(C,B) ; U = vector_unit(D,C)  \n",
132 |     "        #计算法向量\n",
133 |     "        N = np.cross(U_1,U) / np.linalg.norm(np.cross(U_1,U)) ; N_1 = np.cross(U_2,U_1) / np.linalg.norm(np.cross(U_2,U_1))\n",
134 |     "        #计算角度\n",
135 |     "        angle = np.sign(np.dot(U_2,N)) * math.acos(np.dot(N_1,N))/math.pi * 180 ; print(angle)\n",
136 |     "    f.close() \n",
137 |     "          \n",
138 |     "        \n",
139 |     "#复现05-文献 基于cif格式文件  \n",
140 |     "def angle_to_Cartesian_cif(path, path_angle):\n",
141 |     "    f=open(path,'r') ; lines=f.readlines() ; total = 0 ; distance = 0\n",
142 |     "    A = atom_cif(lines, 0)   ; B = atom_cif(lines, 1)  ; C = atom_cif(lines, 2) \n",
143 |     "    #构建接触矩阵\n",
144 |     "    true = np.zeros(shape=(len(lines),3))      \n",
145 |     "    true[0] =  A       ;  true[1] =  B       ; true[2] =  C ;\n",
146 |     "    \n",
147 |     "    generation = np.zeros(shape=(len(lines),3))\n",
148 |     "    generation[0] =  A ;  generation[1] =  B ; generation[2] =  C\n",
149 |     "    \n",
150 |     "    #读取预测的角度信息\n",
151 |     "    if not os.path.exists(path_angle):\n",
152 |     "        torsion_training = 0\n",
153 |     "    else:\n",
154 |     "        torsion_training = np.load(path_angle)\n",
155 |     "        \n",
156 |     "    #计算下一个原子坐标\n",
157 |     "    for i in range(2,len(lines)-1):\n",
158 |     "            #给定键长键角\n",
159 |     "        if lines[i].split()[3]   == 'CA':\n",
160 |     "            R = 1.52326  ; angle_confirm = 1.941\n",
161 |     "        elif lines[i].split()[3] == 'C':\n",
162 |     "            R = 1.32868; angle_confirm= 2.028\n",
163 |     "        elif lines[i].split()[3] == 'N':\n",
164 |     "            R = 1.45801 ; angle_confirm = 2.124 \n",
165 |     "         \n",
166 |     "        #获取当前原子的预测旋转角     \n",
167 |     "        torsion_train   = torsion_training[i//3]  \n",
168 |     "        \n",
169 |     "        D = atom_cif(lines, i+1) ; next_xyz, angle = next_coord(A, B, C, D, R, angle_confirm, torsion_training)\n",
170 |     "        true[i+1] = D            ; generation[i+1] = next_xyz\n",
171 |     "        total += np.square(np.linalg.norm(next_xyz - D))\n",
172 |     "        A = B ; B = C ; C = next_xyz\n",
173 |     "    \n",
174 |     "    #根据接触矩阵计算rmsd \n",
175 |     "    T = contact_martix(true) ; G = contact_martix(generation)\n",
176 |     "    distance = np.square(np.linalg.norm(T - G))\n",
177 |     "    rmsd = np.sqrt(distance/(len(lines)-1)/len(lines))\n",
178 |     "    print(rmsd,len(lines))\n",
179 |     "    #由于误差会传递，所以rmsd较大\n",
180 |     "    \n",
181 |     "    #求接触矩阵的差的接触矩阵\n",
182 |     "#     dist=cdist(true,generation,metric='euclidean')\n",
183 |     "# #     print(dist)\n",
184 |     "#     rmsd = np.square(np.linalg.norm(dist))/len(lines)/(len(lines)-1)\n",
185 |     "#     print(rmsd,len(lines)) ; f.close()、\n",
186 |     "    f.close()\n",
187 |     "\n",
188 |     "    \n",
189 |     "#复现05-文献 基于Protein-Net\n",
190 |     "def angle_to_Cartesian(path_file, path_angle):\n",
191 |     "    total = 0                      ;   distance = 0\n",
192 |     "    f     = open(path_file, 'r')   ;   f_line = f.readlines()\n",
193 |     "    xyz   = f_line[-6:-3]          ;   chain  = f_line[-2]\n",
194 |     "    atoms = chain.count('+') * 3   ;   n_zero = chain[0:len(chain)//2].count('-') * 3; \n",
195 |     "    # Constants\n",
196 |     "    A = atom_net(xyz, n_zero + 0)         ; B = atom_net(xyz, n_zero + 1)            ; C = atom_net(xyz, n_zero + 2) \n",
197 |     "    true = np.zeros(shape=(atoms,3))      ; true[0] =  A       ;  true[1] =  B       ; true[2] =  C ;\n",
198 |     "    generation = np.zeros(shape=(atoms,3)); generation[0] =  A ;  generation[1] =  B ; generation[2] =  C\n",
199 |     "        \n",
200 |     "    #读取预测的角度信息\n",
201 |     "    if not os.path.exists(path_angle):\n",
202 |     "        torsion_training = 0\n",
203 |     "    else:\n",
204 |     "        torsion_training = np.load(path_angle)\n",
205 |     "        \n",
206 |     "    #计算下一个原子坐标\n",
207 |     "    for i in range(2,atoms-1):\n",
208 |     "        D = atom_net(xyz, atom_id+1)  \n",
209 |     "        atom_id = n_zero+i  \n",
210 |     "        \n",
211 |     "        #给定键长键角\n",
212 |     "        if   atom_id % 3 == 1 :\n",
213 |     "            R = 1.52326  ; angle_confirm = 1.941\n",
214 |     "        elif atom_id % 3 == 2 :\n",
215 |     "            R = 1.32868; angle_confirm= 2.028\n",
216 |     "        elif atom_id % 3 == 0:\n",
217 |     "            R = 1.45801 ; angle_confirm = 2.124 \n",
218 |     "            \n",
219 |     "        #获取当前原子的预测旋转角     \n",
220 |     "        torsion_train   = torsion_training[i//3]  \n",
221 |     "        #预测下一个原子坐标\n",
222 |     "        next_xyz, angle = next_coord(A, B, C, D, R, angle_confirm, torsion_training)\n",
223 |     "        \n",
224 |     "        #构建接触矩阵\n",
225 |     "        true[i+1]       = D       \n",
226 |     "        generation[i+1] = next_xyz\n",
227 |     "        total += np.square(np.linalg.norm(next_xyz - D))\n",
228 |     "        A = B ; B = C ; C = next_xyz\n",
229 |     "        \n",
230 |     "    #根据接触矩阵计算rmsd \n",
231 |     "    T = contact_martix(true) ; G = contact_martix(generation)\n",
232 |     "    distance = np.square(np.linalg.norm(T - G))\n",
233 |     "    rmsd = np.sqrt(distance/(atoms-1)/atoms)\n",
234 |     "    print(rmsd,atoms)\n",
235 |     "    #由于误差会传递，所以rmsd较大\n",
236 |     "    f.close()\n",
237 |     "    \n",
238 |     "    \n",
239 |     "#以CA-CA轴为旋转轴 根据旋转角预测原子坐标 （三种情况判断误差最小的情况）\n",
240 |     "def angle_to_Cartesian_CA_compare(path_file,path_angle):\n",
241 |     "    total = 0 ;  distance = 0; a=0; b=0; c=0\n",
242 |     "    \n",
243 |     "    if args.input_type == 'cif':\n",
244 |     "        f = open(path_file, 'r');   lines  = f.readlines();  atoms = len(lines)\n",
245 |     "        next_N = atom_cif(lines,0)\n",
246 |     "        \n",
247 |     "    else:\n",
248 |     "        f = open(path_file, 'r');            f_line = f.readlines()\n",
249 |     "        xyz = f_line[-6:-3];                 chain  = f_line[-2]\n",
250 |     "        atoms = chain.count('+') * 3;        n_zero = chain[0:len(chain)//2].count('-') * 3;\n",
251 |     "        \n",
252 |     "    #读取预测的角度信息\n",
253 |     "    if not os.path.exists(path_angle):\n",
254 |     "        torsion_training = 0\n",
255 |     "    else:\n",
256 |     "        torsion_training = np.load(path_angle)\n",
257 |     "        \n",
258 |     "    for i in range(2,atoms-4,3):         \n",
259 |     "        if args.input_type == 'cif':\n",
260 |     "            A = atom_cif(lines,i-2);   B = atom_cif(lines,i-1);     C = atom_cif(lines,i+2)\n",
261 |     "            D_C = atom_cif(lines, i);  D_N = atom_cif(lines, i+1);  C_2 = atom_cif(lines,i+3)      \n",
262 |     "            \n",
263 |     "        else:\n",
264 |     "            atom_id = n_zero+i \n",
265 |     "            A = atom_net(xyz, atom_id-2); B = atom_net(xyz, atom_id-1);    C   = atom_net(xyz, atom_id + 2)  \n",
266 |     "            D_C = atom_net(xyz, atom_id) ; D_N = atom_net(xyz, atom_id+1);  C_2 = atom_net(xyz, atom_id+3)  \n",
267 |     "            \n",
268 |     "        #获取当前原子的预测旋转角     \n",
269 |     "        torsion_train = torsion_training[i//3]     \n",
270 |     "        \n",
271 |     "        #CA_next - C沿CA - CA轴做旋转\n",
272 |     "        #0.21941264623804932：C-CA轴和CA-CA轴的夹角\n",
273 |     "        R_C = 2.4345193937977068  ; angle_confirm_C = 0.21941264623804932\n",
274 |     "        next_C , angle_C = next_coord(A, B, C, D_C, R_C, angle_confirm_C, torsion_training)\n",
275 |     "        \n",
276 |     "        #CA - C沿CA - CA轴做旋转\n",
277 |     "        #0.35529281510453287：CA-C轴和CA-CA轴的夹角\n",
278 |     "        R_C_2 = 1.52326            ; angle_confirm_C_2 = math.pi -  0.35529281510453287\n",
279 |     "        next_C_2 , angle_C_2 = next_coord_C(A, B, C, D_C, R_C_2, angle_confirm_C_2, torsion_training)\n",
280 |     "        \n",
281 |     "        #CA_next - N沿CA - CA轴做旋转\n",
282 |     "        #0.263502970667963：CA-N轴和CA-CA轴的夹角\n",
283 |     "        R_N = 1.45801              ; angle_confirm_N =  0.263502970667963\n",
284 |     "        next_N , angle_N = next_coord(A, B, C, D_N, R_N, angle_confirm_N, torsion_training)\n",
285 |     "        \n",
286 |     "        total_C = np.linalg.norm(next_C - D_C)\n",
287 |     "        total_C_2 = np.linalg.norm(next_C_2 - D_C)\n",
288 |     "        total_N = np.linalg.norm(next_N - D_N)\n",
289 |     "        a += total_C;  b += total_C_2;  c += total_N\n",
290 |     "        x=(atoms - 6) // 3 \n",
291 |     "        print(a/x, b/x, c/x)\n",
292 |     "        f.close()\n",
293 |     "\n",
294 |     "    \n",
295 |     "def angle_to_Cartesian_CA_CA(path_file,path_angle):\n",
296 |     "    total = 0;   distance = 0;  a = 0;   b = 0\n",
297 |     "    #读取数据\n",
298 |     "    if args.input_type == 'cif':\n",
299 |     "        f = open(path_file, 'r');   lines  = f.readlines();  atoms = len(lines)\n",
300 |     "        next_N = atom_cif(lines,0)\n",
301 |     "        \n",
302 |     "        #建造接触矩阵\n",
303 |     "        true = np.zeros(shape=(atoms,3));    generation = np.zeros(shape=(atoms,3))\n",
304 |     "        true[0]   = atom_cif(lines,0);       generation[0] = atom_cif(lines,0)\n",
305 |     "        true[-1]  = atom_cif(lines,atoms-1); generation[-1] = atom_cif(lines,atoms-1)\n",
306 |     "        \n",
307 |     "    else:\n",
308 |     "        f = open(path_file, 'r');            f_line = f.readlines()\n",
309 |     "        xyz = f_line[-6:-3];                 chain  = f_line[-2]\n",
310 |     "        atoms = chain.count('+') * 3;        n_zero = chain[0:len(chain)//2].count('-') * 3;\n",
311 |     "        \n",
312 |     "        #建造接触矩阵\n",
313 |     "        true = np.zeros(shape=(atoms,3));    generation = np.zeros(shape=(atoms,3))\n",
314 |     "        true[0] = atom_net(xyz,0);           generation[0] = atom_net(xyz,0)\n",
315 |     "        true[-1] = atom_net(xyz,atoms-1);    generation[-1] = atom_net(xyz,atoms-1)\n",
316 |     "        \n",
317 |     "    #读取预测的角度信息\n",
318 |     "    torsion_training = np.zeros(shape=(len(torsion_sin),1))\n",
319 |     "    if not os.path.exists(path_angle):\n",
320 |     "        torsion_training = np.zeros(shape=(len(torsion_sin),1))\n",
321 |     "        torsion_training[0] = 'none'\n",
322 |     "    else:\n",
323 |     "        torsion_sin = np.load(path_angle)[0] \n",
324 |     "        torsion_cos = np.load(path_angle)[1] \n",
325 |     "    #         torsion_training = np.load(path_angle)[2] * math.pi\n",
326 |     "        torsion_training = np.zeros(shape=(len(torsion_sin),1))\n",
327 |     "    \n",
328 |     "    for n in range(len(torsion_sin)):\n",
329 |     "        torsion_training[n] = math.atan2(torsion_sin[n],torsion_cos[n])\n",
330 |     "    \n",
331 |     "    for i in range(5,atoms-1,3):\n",
332 |     "        \n",
333 |     "        if args.input_type == 'cif':\n",
334 |     "            A = atom_cif(lines, i-4);      B = atom_cif(lines,i-1);       C = atom_cif(lines,i+2)\n",
335 |     "            D_C = atom_cif(lines, i);      D_N = atom_cif(lines, i+1) \n",
336 |     "        else:\n",
337 |     "            atom_id = n_zero+i \n",
338 |     "            A = atom_net(xyz, atom_id-4);  B = atom_net(xyz, atom_id-1);  C = atom_net(xyz, atom_id + 2)  \n",
339 |     "            D_C = atom_net(xyz, atom_id);  D_N = atom_net(xyz, atom_id+1)\n",
340 |     "        \n",
341 |     "        if torsion_training[0] != 'none':\n",
342 |     "            torsion_train_N = torsion_training[i//3]   #介入训练的角度\n",
343 |     "\n",
344 |     "            if torsion_train_N >0:\n",
345 |     "                torsion_train_C = torsion_train_N - math.pi \n",
346 |     "            else:\n",
347 |     "                torsion_train_C = math.pi + torsion_train_N\n",
348 |     "                \n",
349 |     "        #CA - C沿CA - CA轴做旋转    \n",
350 |     "        #0.35529281510453287：CA-C轴和CA-CA轴的夹角\n",
351 |     "        R_C = 1.52326          \n",
352 |     "        angle_confirm_C = math.pi -  0.35529281510453287\n",
353 |     "        next_C , angle_C = next_coord_C(A, B, C, D_C, R_C, angle_confirm_C, torsion_train_C)\n",
354 |     "        \n",
355 |     "        #CA_next - N沿CA - CA轴做旋转\n",
356 |     "        #0.263502970667963：CA-N轴和CA-CA轴的夹角\n",
357 |     "        R_N = 1.45801\n",
358 |     "        angle_confirm_N =  0.263502970667963\n",
359 |     "        next_N , angle_N =  next_coord (A, B, C, D_N, R_N, angle_confirm_N, torsion_train_N)\n",
360 |     "        \n",
361 |     "        true[i-1] = B;   generation[i-1] = B\n",
362 |     "        true[i]   = D_C; generation[i]   = next_C\n",
363 |     "        true[i+1] = D_N; generation[i+1] = next_N\n",
364 |     "#         print(next_C-D_C)\n",
365 |     "\n",
366 |     "    #构建接触矩阵，计算rmsd     \n",
367 |     "    T = contact_martix(true) ; G = contact_martix(generation)\n",
368 |     "    distance = np.square(np.linalg.norm(T - G))\n",
369 |     "    rmsd = np.sqrt(distance/((atoms-1)*atoms))\n",
370 |     "    print(rmsd,atoms)\n",
371 |     "    \n",
372 |     "    \n",
373 |     "    #另一种度量损失方式以及建立文档记录的代码\n",
374 |     "#     write_rmsd = ('ProteinNet-文件名:'+ path_file.split('\\\\')[-1] +'\\n' + \n",
375 |     "#                   '接触矩阵的rmsd: ' + str(rmsd) + '\\n------------------------------------------')\n",
376 |     "#     total_C = np.square(np.linalg.norm(next_C - D_C))\n",
377 |     "#     total_N = np.square(np.linalg.norm(next_N - D_N))\n",
378 |     "#     a += total_C  ; b += total_N\n",
379 |     "\n",
380 |     "#     x=(atoms-6)//6 ;  write_rmsd = ('ProteinNet-文件名:'+path_file.split('\\\\')[-1]+'\\n'+'  C原子的平均误差:'+str(np.sqrt(a/(x)))+'\\n'\n",
381 |     "#                             '  N原子的平均误差:'+ str(np.sqrt(b/(x)))+'\\n------------------------------------------')\n",
382 |     "#     print(write_rmsd)\n",
383 |     "#     f_rmsd.write(write_rmsd +'\\n')\n",
384 |     "#     print(np.linalg.norm(next_N-next_C) )\n",
385 |     "#     print(angle_C,angle_N) ;\n",
386 |     "    f.close()\n",
387 |     "    \n",
388 |     "\n",
389 |     "def angle_to_Cartesian_intersection(path_file,path_angle):\n",
390 |     "    total = 0;   distance = 0;  a = 0;   b = 0\n",
391 |     "    #读取数据\n",
392 |     "    if args.input_type == 'cif':\n",
393 |     "        f = open(path_file, 'r');   lines  = f.readlines();  atoms = len(lines)\n",
394 |     "        next_N = atom_cif(lines,0)\n",
395 |     "        \n",
396 |     "    else:\n",
397 |     "        f = open(path_file, 'r');            f_line = f.readlines()\n",
398 |     "        xyz = f_line[-6:-3];                 chain  = f_line[-2]\n",
399 |     "        atoms = chain.count('+') * 3;        n_zero = chain[0:len(chain)//2].count('-') * 3;\n",
400 |     "\n",
401 |     "    for i in range(5,atoms-1,3):\n",
402 |     "        \n",
403 |     "        if args.input_type == 'cif':\n",
404 |     "            A   = atom_cif(lines, i-4);    B   = atom_cif(lines,i-1);       C = atom_cif(lines,i+2)\n",
405 |     "            D_C = atom_cif(lines, i);      D_N = atom_cif(lines, i+1);      N = atom_cif(lines,i-2)\n",
406 |     "        else:\n",
407 |     "            atom_id = n_zero+i \n",
408 |     "            A = atom_net(xyz, atom_id-4);  B   = atom_net(xyz, atom_id-1);    C = atom_net(xyz, atom_id + 2)  \n",
409 |     "            D_C = atom_net(xyz, atom_id);  D_N = atom_net(xyz, atom_id+1) ;   N = atom_net(xyz, atom_id-2);\n",
410 |     "         \n",
411 |     "        #根据键长键角计算圆环的半径\n",
412 |     "        #0.35529281510453287：CA-C轴和CA-CA轴的夹角\n",
413 |     "        R_C = 1.52326 * math.cos(0.35529281510453287)\n",
414 |     "        R_C_2 = 1.52326 * math.cos(math.pi - 1.941)\n",
415 |     "        \n",
416 |     "        #求第一个法向量    CA-CA轴\n",
417 |     "        normal_vector_1 =  (C-B)/np.linalg.norm(C-B)\n",
418 |     "        #求第一个圆心    CA-CA轴\n",
419 |     "        next_CA_mid = B +  R_C * normal_vector_1\n",
420 |     "        \n",
421 |     "        #求第二个法向量    C-N轴\n",
422 |     "        normal_vector_2 =  (B-N)/np.linalg.norm(B-N)\n",
423 |     "        #求第二个圆心    CA-CA轴\n",
424 |     "        next_C_mid =  B +  R_C_2 * normal_vector_2\n",
425 |     "        #两个法向量之间的角度不是固定的\n",
426 |     "        \n",
427 |     "#         print(np.dot(normal_vector_2,vector_unit(next_C_2,next_C_mid)))#验证法向量\n",
428 |     "        #半径大的误差大\n",
429 |     "    \n",
430 |     "        #根据求出的半径和圆心复原坐标 ，验证半径和圆心是否计算正确 是否可还原坐标\n",
431 |     "        true_C = next_C_mid + ((D_C - next_C_mid) / np.linalg.norm(D_C - next_C_mid)) * 1.52326 * math.sin(math.pi - 1.941)\n",
432 |     "        true_C_1 = next_CA_mid + ((D_C - next_CA_mid) / np.linalg.norm(D_C - next_CA_mid)) * 0.5298886988235514\n",
433 |     "        #0.5298886988235514：CA-C轴绕CA-CA轴旋转的半径\n",
434 |     "        \n",
435 |     "        \n",
436 |     "        #两平面相交直线L0的方向向量\n",
437 |     "        L0_dir = np.cross(normal_vector_1,normal_vector_2) / np.linalg.norm(np.cross(normal_vector_1,normal_vector_2)) \n",
438 |     "        \n",
439 |     "        #平面1上于L0垂直的L1的方向向量\n",
440 |     "        L1_dir = np.cross(L0_dir,normal_vector_1) / np.linalg.norm(np.cross(L0_dir,normal_vector_1)) \n",
441 |     "        \n",
442 |     "        #两圆心连起来的的向量ps\n",
443 |     "        ps = next_C_mid - next_CA_mid #p-s     \n",
444 |     "        \n",
445 |     "        #计算平面1的圆心到平面2的距离D\n",
446 |     "        D = np.dot(ps,normal_vector_2)\n",
447 |     "        \n",
448 |     "        #计算平面1圆心到交线L0的距离\n",
449 |     "        cos_a_n = np.dot(L1_dir,normal_vector_2)\n",
450 |     "        t = D / cos_a_n\n",
451 |     "        \n",
452 |     "        #求出L1和L0的交点\n",
453 |     "        R = next_CA_mid + t* L1_dir\n",
454 |     "        print(D,t)\n",
455 |     "        \n",
456 |     "        #从两个交点中找出和正确坐标更近的点\n",
457 |     "        L0  = R - np.sqrt((0.5298886988235514**2)-t**2) * L0_dir\n",
458 |     "        L0_ = R + np.sqrt((0.5298886988235514**2)-t**2) * L0_dir\n",
459 |     "        \n",
460 |     "        print(np.linalg.norm(L0 - next_CA_mid))\n",
461 |     "        print(min(np.linalg.norm(L0 - D_C),np.linalg.norm(L0_ - D_C)))\n",
462 |     "\n",
463 |     "        print('--------------------------------------------------')\n",
464 |     "    f.close()\n",
465 |     "\n",
466 |     "#根据局部原子坐标，找到训练出来的npy匹配的真实的文件名\n",
467 |     "#path：真实文件的存放路径\n",
468 |     "#path_test：训练出来需要更改为真实文件名的npy文件\n",
469 |     "def change_filename(path,path_test): \n",
470 |     "    for filename in os.listdir(path):\n",
471 |     "        path_angle = os.path.join(path,filename)\n",
472 |     "        load = np.array(np.load(path_angle))\n",
473 |     "        for file in os.listdir(path_test):\n",
474 |     "            path_fake = os.path.join(path_test,file)\n",
475 |     "            load_fake = np.array(np.load(path_fake))\n",
476 |     "            if np.linalg.norm(load[0][0:3]-load_fake[2][0:3]) == 0:\n",
477 |     "                print(filename)\n",
478 |     "                path_new = os.path.join(path_test,filename)\n",
479 |     "                os.rename(path_fake,path_new)             \n",
480 |     "            \n",
481 |     "# path = 'G:\\\\Computational reconstruction\\\\plane_torsion'\n",
482 |     "# path_test =  'G:\\\\Computational reconstruction\\\\torsion_validation'\n",
483 |     "# change_filename(path,path_test)\n",
484 |     "\n",
485 |     "\n",
486 |     "def traverse_file(path):\n",
487 |     "    ask = input('是否要计算旋转角？ y/n:')\n",
488 |     "    ask_repetition = input('是否采用05-文献方法计算坐标？ y/n:')\n",
489 |     "    ask_CA_CA = input('是否以CA_CA为旋转轴？ y/n:')\n",
490 |     "    ask_intersection = input('是否通过求两个环的交点预测原子坐标？ y/n:')\n",
491 |     "    \n",
492 |     "    for i in os.listdir(path):\n",
493 |     "        path_file = os.path.join(path, i)\n",
494 |     "        #替换路径为角度npy的路径（训练出来的旋转角度）\n",
495 |     "        #需和原始数据放在同一个大文件夹下 且文件名相匹配\n",
496 |     "        path_angle_ = path_file.replace('test','torsion_validation')\n",
497 |     "        path_angle = path_angle_.replace('.pn','.npy')\n",
498 |     "        \n",
499 |     "        #根据不同情况执行对应情况函数\n",
500 |     "        if ask == 'y':\n",
501 |     "            Cartesian_to_angle(path_file)\n",
502 |     "            \n",
503 |     "        elif ask_repetition == 'y':\n",
504 |     "            if i.split('.')[-1] == 'cif':\n",
505 |     "                angle_to_Cartesian_cif(path_file, path_angle)\n",
506 |     "            elif i.split('.')[-1] == 'pn':\n",
507 |     "                angle_to_Cartesian(path_file, path_angle)\n",
508 |     "                \n",
509 |     "        elif ask_CA_CA == 'y':\n",
510 |     "            angle_to_Cartesian_CA_CA(path_file, path_angle)\n",
511 |     "            \n",
512 |     "        elif ask_intersection == 'y':   \n",
513 |     "            angle_to_Cartesian_intersection(path_file,path_angle)\n",
514 |     "            \n",
515 |     "\n",
516 |     "#         angle_to_Cartesian_CA_C(path_cif,path_angle)#计算CA-C为轴长的复原方法\n",
517 |     "#         angle_to_Cartesian_CA_CA(path_cif,path_angle)#3-CA为旋转基准面的复原方法\n",
518 |     "\n",
519 |     "traverse_file('G:\\Computational reconstruction\\\\plam\\\\test' )"
520 |    ]
521 |   }
522 |  ],
523 |  "metadata": {
524 |   "kernelspec": {
525 |    "display_name": "Python 3",
526 |    "language": "python",
527 |    "name": "python3"
528 |   },
529 |   "language_info": {
530 |    "codemirror_mode": {
531 |     "name": "ipython",
532 |     "version": 3
533 |    },
534 |    "file_extension": ".py",
535 |    "mimetype": "text/x-python",
536 |    "name": "python",
537 |    "nbconvert_exporter": "python",
538 |    "pygments_lexer": "ipython3",
539 |    "version": "3.6.5"
540 |   }
541 |  },
542 |  "nbformat": 4,
543 |  "nbformat_minor": 2
544 | }
545 | 


--------------------------------------------------------------------------------