├── AMP_Classification.py
├── AMP_Classification_Prediction.py
├── ECD_Attention.py
├── README.md
├── attribute_selection.py
├── data
│   ├── 6-pc
│   ├── AAindex.txt
│   ├── AMPdb_data.csv
│   ├── PAAC.txt
│   ├── blosum62.txt
│   ├── training_data.csv
│   └── val_data.csv
├── data_feature.py
├── examples
│   ├── readme.txt
│   └── samples.fasta
├── gan_diff.py
├── gan_generate.py
├── gan_update.py
├── generate_pos.py
├── model.png
├── model.py
├── models
│   └── readme.txt
├── predict.py
├── requirements.txt
├── run.sh
├── tmp_save
│   ├── anti_mammalian_cells
│   │   └── readme.txt
│   ├── antibacterial
│   │   └── readme.txt
│   ├── antibiofilm
│   │   └── readme.txt
│   ├── anticancer
│   │   └── readme.txt
│   ├── anticandida
│   │   └── readme.txt
│   ├── antifungal
│   │   └── readme.txt
│   ├── antigram-negative
│   │   └── readme.txt
│   ├── antigram-positive
│   │   └── readme.txt
│   ├── antihiv
│   │   └── readme.txt
│   ├── antimalarial
│   │   └── readme.txt
│   ├── antimrsa
│   │   └── readme.txt
│   ├── antiparasitic
│   │   └── readme.txt
│   ├── antiplasmodial
│   │   └── readme.txt
│   ├── antiprotozoal
│   │   └── readme.txt
│   ├── antitb
│   │   └── readme.txt
│   ├── antiviral
│   │   └── readme.txt
│   ├── anurandefense
│   │   └── readme.txt
│   ├── chemotactic
│   │   └── readme.txt
│   └── cytotoxic
│       └── readme.txt
├── tools.py
├── vocab.py
└── weight
    └── readme.txt
/AMP_Classification.py:
--------------------------------------------------------------------------------
1 | import numpy as np 2 | from transformers import AutoTokenizer, AutoModelForSequenceClassification 3 | import pandas as pd 4 | from transformers import set_seed 5 | from torch.utils.data import Dataset, DataLoader 6 | import torch 7 | import torch.nn as nn 8 | import warnings 9 | import tqdm 10 | import torch.nn.functional as F 11 | from sklearn.metrics import auc, roc_curve, precision_recall_curve, average_precision_score 12 | from sklearn.metrics import roc_auc_score 13 | import matplotlib.pyplot as plt 14 | 15 | warnings.filterwarnings('ignore') 16 | device = "cuda:0" 17 | model_checkpoint1 = "facebook/esm2_t12_35M_UR50D" # initial checkpoint 18 | 19 | 20 | 21 | df_train1 = pd.read_csv('data/training_data.csv') 22 | df_val = pd.read_csv('data/val_data.csv') 23 | 24 | train_sequences1 = df_train1["Seq"].tolist() 25 | train_labels1 = df_train1["Label"].tolist() 26 | val_sequences = df_val["Seq"].tolist() 27 | val_labels = df_val["Label"].tolist() 28 | 29 | 30 | class MyDataset(Dataset): 31 | def __init__(self, dict_data) -> None: 32 | super(MyDataset, self).__init__() 33 | self.data = dict_data 34 | 35 | def __getitem__(self, index): 36 | return [self.data['text'][index], self.data['labels'][index]] 37 | 38 | def __len__(self): 39 | return len(self.data['text']) 40 | 41 | 42 | train_dict1 = {"text": train_sequences1, 'labels': train_labels1} 43 | val_dict = {"text": val_sequences, 'labels': val_labels} 44 | 45 | epochs = 500 46 | learning_rate = 0.0005 47 | batch_size = 2048 # 1024 48 | 49 | tokenizer1 = AutoTokenizer.from_pretrained(model_checkpoint1) # model_checkpoint1 = "facebook/esm2_t12_35M_UR50D" (the initial checkpoint) 50 | 51 | 52 | def collate_fn(batch): 53 | max_len = 30 # 30 54 | pt_batch = tokenizer1([b[0] for b in batch], max_length=max_len, padding="max_length", truncation=True, 55 | return_tensors='pt') 56 | 57 | labels = [b[1] for b in batch] 58 | return {'labels': labels, 'input_ids': pt_batch['input_ids'], 59 | 'attention_mask': pt_batch['attention_mask']} 60 | 61 | 62 | train_data1 = MyDataset(train_dict1) 63 | val_data = MyDataset(val_dict) 64 | train_dataloader1 = DataLoader(train_data1, batch_size=batch_size, shuffle=True, collate_fn=collate_fn) 65 | 66 | val_dataloader = DataLoader(val_data, batch_size=batch_size, 
shuffle=True, collate_fn=collate_fn) 67 | 68 | 69 | 70 | class MyModel(nn.Module): 71 | def __init__(self): 72 | super().__init__() 73 | self.bert1 = AutoModelForSequenceClassification.from_pretrained(model_checkpoint1, num_labels=3000) # 3000 74 | for param in self.bert1.parameters(): 75 | param.requires_grad = False 76 | self.bn1 = nn.BatchNorm1d(256) 77 | self.bn2 = nn.BatchNorm1d(128) 78 | self.bn3 = nn.BatchNorm1d(64) 79 | self.relu = nn.LeakyReLU() 80 | self.fc1 = nn.Linear(3000, 256) 81 | self.fc2 = nn.Linear(256, 128) 82 | self.fc3 = nn.Linear(128, 64) 83 | self.output_layer = nn.Linear(64, 2) 84 | self.dropout = nn.Dropout(0.2) # 0.3 85 | 86 | def forward(self, x): 87 | with torch.no_grad(): 88 | bert_output = self.bert1(input_ids=x['input_ids'].to(device), 89 | attention_mask=x['attention_mask'].to(device)) 90 | output_feature = self.dropout(bert_output["logits"]) 91 | output_feature = self.dropout(self.relu(self.bn1(self.fc1(output_feature)))) 92 | output_feature = self.dropout(self.relu(self.bn2(self.fc2(output_feature)))) 93 | output_feature = self.dropout(self.relu(self.bn3(self.fc3(output_feature)))) 94 | output_feature = self.dropout(self.output_layer(output_feature)) 95 | return torch.softmax(output_feature, dim=1), output_feature 96 | 97 | 98 | model = MyModel().cuda() 99 | model = model.to(device) 100 | # model.load_state_dict(torch.load("best_model.pth")) 101 | 102 | # nn.BCELoss() 103 | criterion = nn.CrossEntropyLoss() 104 | # criterion = nn.BCELoss() 105 | optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) 106 | 107 | train_loss = [] 108 | valid_loss = [] 109 | train_epochs_loss = [] 110 | valid_epochs_loss = [] 111 | train_epochs_acc = [] 112 | valid_epochs_acc = [] 113 | 114 | best_acc = 0 115 | for epoch in range(epochs): 116 | model.train() 117 | train_epoch_loss = [] 118 | tp1 = 0 119 | fn1 = 0 120 | tn1 = 0 121 | fp1 = 0 122 | for index, batch in enumerate(train_dataloader1): 123 | batchs = {k: v for k, v in batch.items()} 124 | optimizer.zero_grad() 125 | outputs, _ = model(batchs) 126 | label = torch.nn.functional.one_hot(torch.tensor(batchs["labels"]).to(torch.int64), 127 | num_classes=2).float() # 原始int64 128 | loss = criterion(outputs.to(device), label.to(device)) 129 | 130 | loss.backward() 131 | optimizer.step() 132 | train_epoch_loss.append(loss.item()) 133 | train_loss.append(loss.item()) 134 | train_argmax = np.argmax(outputs.cpu().detach().numpy(), axis=1) 135 | for j in range(0, len(train_argmax)): 136 | if batchs["labels"][j] == 1: 137 | if batchs["labels"][j] == train_argmax[j]: 138 | tp1 += 1 139 | else: 140 | fn1 = fn1 + 1 141 | else: 142 | if batchs["labels"][j] == train_argmax[j]: 143 | tn1 = tn1 + 1 144 | else: 145 | fp1 = fp1 + 1 146 | 147 | train_acc = float(tp1 + tn1) / len(train_labels1) 148 | train_epochs_acc.append(train_acc) 149 | train_epochs_loss.append(np.average(train_epoch_loss)) 150 | 151 | model.eval() 152 | valid_epoch_loss = [] 153 | tp = 0 154 | fn = 0 155 | tn = 0 156 | fp = 0 157 | Sensitivity = 0 158 | Specificity = 0 159 | MCC = 0 160 | AUC = 0 161 | true_labels = [] 162 | features_list = [] 163 | 164 | pred_prob = [] 165 | with torch.no_grad(): 166 | for index, batch in enumerate(val_dataloader): 167 | batchs = {k: v for k, v in batch.items()} 168 | outputs, output_feature = model(batchs) 169 | 170 | features_list.append(output_feature.cpu().numpy()) 171 | 172 | label = torch.nn.functional.one_hot(torch.tensor(batchs["labels"]).to(torch.int64), num_classes=2).float() 173 | loss = 
criterion(outputs.to(device), label.to(device)) 174 | valid_epoch_loss.append(loss.item()) 175 | valid_loss.append(loss.item()) 176 | val_argmax = np.argmax(outputs.cpu(), axis=1) 177 | true_labels += batchs["labels"] # 收集真实标签 178 | pred_prob += outputs[:, 1].tolist() 179 | # print("\n") 180 | # print(pred_prob) 181 | for j in range(0, len(val_argmax)): 182 | if batchs["labels"][j] == 1: 183 | if batchs["labels"][j] == val_argmax[j]: 184 | tp = tp + 1 185 | else: 186 | fn = fn + 1 187 | else: 188 | if batchs["labels"][j] == val_argmax[j]: 189 | tn = tn + 1 190 | else: 191 | fp = fp + 1 192 | if tp + fn == 0: 193 | Recall = Sensitivity = 0 194 | else: 195 | Recall = Sensitivity = float(tp) / (tp + fn) 196 | if tn + fp == 0: 197 | Specificity = 0 198 | else: 199 | Specificity = float(tn) / (tn + fp) 200 | if (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn) == 0: 201 | MCC = 0 202 | else: 203 | MCC = float(tp * tn - fp * fn) / (np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))) 204 | auc_score = roc_auc_score(true_labels, pred_prob) 205 | # Precision 206 | if tp + fp == 0: 207 | Precision = 0 208 | else: 209 | Precision = float(tp) / (tp + fp) 210 | # F1-score 211 | if Recall + Precision == 0: 212 | F1 = 0 213 | else: 214 | F1 = 2 * Recall * Precision / (Recall + Precision) 215 | valid_epochs_loss.append(np.average(valid_epoch_loss)) 216 | val_acc = float(tp + tn) / len(val_labels) 217 | if val_acc >= best_acc: 218 | best_acc = val_acc 219 | print("best_acc is {}".format(best_acc)) 220 | # torch.save(model.state_dict(), f"weight/best_model.pth") 221 | 222 | print( 223 | f'epoch:{epoch}, train_acc:{train_acc}, val_acc:{val_acc}, prec:{Precision} SE:{Sensitivity}, SP:{Specificity} ,f1:{F1} ,MCC:{MCC}, AUC:{auc_score}') 224 | -------------------------------------------------------------------------------- /AMP_Classification_Prediction.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from transformers import AutoTokenizer, AutoModelForSequenceClassification 3 | from transformers import set_seed 4 | import torch 5 | import torch.nn as nn 6 | import warnings 7 | from tqdm import tqdm 8 | 9 | warnings.filterwarnings('ignore') 10 | device = "cuda:0" 11 | model_checkpoint1 = "facebook/esm2_t12_35M_UR50D" 12 | tokenizer = AutoTokenizer.from_pretrained(model_checkpoint1) 13 | 14 | 15 | class MyModel(nn.Module): 16 | def __init__(self): 17 | super().__init__() 18 | self.bert1 = AutoModelForSequenceClassification.from_pretrained(model_checkpoint1, num_labels=3000).cuda()#3000 19 | # for param in self.bert1.parameters(): 20 | # param.requires_grad = False 21 | self.bn1 = nn.BatchNorm1d(256) 22 | self.bn2 = nn.BatchNorm1d(128) 23 | self.bn3 = nn.BatchNorm1d(64) 24 | self.relu = nn.LeakyReLU() 25 | self.fc1 = nn.Linear(3000, 256) 26 | self.fc2 = nn.Linear(256, 128) 27 | self.fc3 = nn.Linear(128, 64) 28 | self.output_layer = nn.Linear(64, 2) 29 | self.dropout = nn.Dropout(0.3) # 0.3 30 | 31 | def forward(self, x): 32 | with torch.no_grad(): 33 | bert_output = self.bert1(input_ids=x['input_ids'].to(device), 34 | attention_mask=x['attention_mask'].to(device)) 35 | # output_feature = bert_output["logits"] 36 | # print(output_feature.size()) 37 | # output_feature = self.bn1(self.fc1(output_feature)) 38 | # output_feature = self.bn2(self.fc1(output_feature)) 39 | # output_feature = self.relu(self.bn3(self.fc3(output_feature))) 40 | # output_feature = self.dropout(self.output_layer(output_feature)) 41 | output_feature = 
self.dropout(bert_output["logits"]) 42 | output_feature = self.dropout(self.relu(self.bn1(self.fc1(output_feature)))) 43 | output_feature = self.dropout(self.relu(self.bn2(self.fc2(output_feature)))) 44 | output_feature = self.dropout(self.relu(self.bn3(self.fc3(output_feature)))) 45 | output_feature = self.dropout(self.output_layer(output_feature)) 46 | # return torch.sigmoid(output_feature),output_feature 47 | return torch.softmax(output_feature, dim=1) 48 | 49 | 50 | def AMP(test_sequences, model): 51 | # keep the AMP function unchanged; it only processes the test_sequences passed in 52 | max_len = 18 53 | test_data = tokenizer(test_sequences, max_length=max_len, padding="max_length", truncation=True, 54 | return_tensors='pt') 55 | model = model.to(device) 56 | model.eval() 57 | out_probability = [] 58 | with torch.no_grad(): 59 | predict = model(test_data).cuda() 60 | out_probability.extend(np.max(np.array(predict.cpu()), axis=1).tolist()) 61 | test_argmax = np.argmax(predict.cpu(), axis=1).tolist() 62 | id2str = {0: "non-AMP", 1: "AMP"} 63 | return id2str[test_argmax[0]], out_probability[0] 64 | 65 | 66 | input_file = "seq.txt" 67 | output_file = "seq_.txt" 68 | pos_file = "pos.txt" 69 | 70 | amp_count = 0 71 | non_amp_count = 0 72 | 73 | 74 | # read the whole input file at once 75 | with open(input_file, 'r') as infile: 76 | lines = infile.readlines() 77 | 78 | # load the trained classification model 79 | model = MyModel() 80 | model.load_state_dict(torch.load("weight/best_model.pth")) 81 | print('\nGeneration Start') 82 | # process each line (one sequence per line) 83 | for line in tqdm(lines, total=len(lines), desc="Processing"): 84 | line = line.strip() 85 | result, probability = AMP(line, model) 86 | 87 | # write the result to the output file 88 | with open(output_file, 'a') as outfile: 89 | outfile.write(f"{line} {result} {probability}\n") 90 | 91 | # count AMP and non-AMP sequences 92 | if result == "AMP" and not any(char in line for char in ["0", "X", "Z", "x", "z"]): 93 | amp_count += 1 94 | with open(pos_file, 'a') as posfile: 95 | posfile.write(f"{line} {result} {probability}\n") 96 | else: 97 | non_amp_count += 1 98 | 99 | print("\n AMP Generation Finished") 100 | -------------------------------------------------------------------------------- /ECD_Attention.py: -------------------------------------------------------------------------------- 1 | import math,os 2 | import torch 3 | import numpy as np 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | # from torch_sparse import SparseTensor, matmul  # note: gcn_conv below uses SparseTensor and matmul, so this import must be enabled when use_graph=True 7 | from torch_geometric.utils import degree 8 | 9 | 10 | def full_attention_conv(qs, ks, vs, kernel, output_attn=False): 11 | ''' 12 | qs: query tensor [N, H, M] 13 | ks: key tensor [L, H, M] 14 | vs: value tensor [L, H, D] 15 | 16 | return output [N, H, D] 17 | ''' 18 | if kernel == 'simple': 19 | # normalize input 20 | qs = qs / torch.norm(qs, p=2) # [N, H, M] 21 | ks = ks / torch.norm(ks, p=2) # [L, H, M] 22 | N = qs.shape[0] 23 | 24 | # numerator 25 | kvs = torch.einsum("lhm,lhd->hmd", ks, vs) 26 | attention_num = torch.einsum("nhm,hmd->nhd", qs, kvs) # [N, H, D] 27 | all_ones = torch.ones([vs.shape[0]]).to(vs.device) 28 | vs_sum = torch.einsum("l,lhd->hd", all_ones, vs) # [H, D] 29 | attention_num += vs_sum.unsqueeze(0).repeat(vs.shape[0], 1, 1) # [N, H, D] 30 | 31 | # denominator 32 | all_ones = torch.ones([ks.shape[0]]).to(ks.device) 33 | ks_sum = torch.einsum("lhm,l->hm", ks, all_ones) 34 | attention_normalizer = torch.einsum("nhm,hm->nh", qs, ks_sum) # [N, H] 35 | 36 | # attentive aggregated results 37 | attention_normalizer = torch.unsqueeze(attention_normalizer, len(attention_normalizer.shape)) # [N, H, 1] 38 | 
attention_normalizer += torch.ones_like(attention_normalizer) * N 39 | attn_output = attention_num / attention_normalizer # [N, H, D] 40 | 41 | # compute attention for visualization if needed 42 | if output_attn: 43 | attention = torch.einsum("nhm,lhm->nlh", qs, ks) / attention_normalizer # [N, L, H] 44 | 45 | elif kernel == 'sigmoid': 46 | # numerator 47 | attention_num = torch.sigmoid(torch.einsum("nhm,lhm->nlh", qs, ks)) # [N, L, H] 48 | 49 | # denominator 50 | all_ones = torch.ones([ks.shape[0]]).to(ks.device) 51 | attention_normalizer = torch.einsum("nlh,l->nh", attention_num, all_ones) 52 | attention_normalizer = attention_normalizer.unsqueeze(1).repeat(1, ks.shape[0], 1) # [N, L, H] 53 | 54 | # compute attention and attentive aggregated results 55 | attention = attention_num / attention_normalizer 56 | attn_output = torch.einsum("nlh,lhd->nhd", attention, vs) # [N, H, D] 57 | 58 | if output_attn: 59 | return attn_output, attention 60 | else: 61 | return attn_output 62 | 63 | def gcn_conv(x, edge_index, edge_weight): 64 | N, H = x.shape[0], x.shape[1] 65 | row, col = edge_index 66 | d = degree(col, N).float() 67 | d_norm_in = (1. / d[col]).sqrt() 68 | d_norm_out = (1. / d[row]).sqrt() 69 | gcn_conv_output = [] 70 | if edge_weight is None: 71 | value = torch.ones_like(row) * d_norm_in * d_norm_out 72 | else: 73 | value = edge_weight * d_norm_in * d_norm_out 74 | value = torch.nan_to_num(value, nan=0.0, posinf=0.0, neginf=0.0) 75 | adj = SparseTensor(row=col, col=row, value=value, sparse_sizes=(N, N)) 76 | for i in range(x.shape[1]): 77 | gcn_conv_output.append( matmul(adj, x[:, i]) ) # [N, D] 78 | gcn_conv_output = torch.stack(gcn_conv_output, dim=1) # [N, H, D] 79 | return gcn_conv_output 80 | 81 | class DIFFormerConv(nn.Module): 82 | ''' 83 | one DIFFormer layer 84 | ''' 85 | def __init__(self, in_channels, 86 | out_channels, 87 | num_heads, 88 | kernel='simple', 89 | use_graph=True, 90 | use_weight=True): 91 | super(DIFFormerConv, self).__init__() 92 | self.Wk = nn.Linear(in_channels, out_channels * num_heads) 93 | self.Wq = nn.Linear(in_channels, out_channels * num_heads) 94 | if use_weight: 95 | self.Wv = nn.Linear(in_channels, out_channels * num_heads) 96 | 97 | self.out_channels = out_channels 98 | self.num_heads = num_heads 99 | self.kernel = kernel 100 | self.use_graph = use_graph 101 | self.use_weight = use_weight 102 | 103 | def reset_parameters(self): 104 | self.Wk.reset_parameters() 105 | self.Wq.reset_parameters() 106 | if self.use_weight: 107 | self.Wv.reset_parameters() 108 | 109 | def forward(self, query_input, source_input, edge_index=None, edge_weight=None, output_attn=False): 110 | # feature transformation 111 | query = self.Wq(query_input).reshape(-1, self.num_heads, self.out_channels) 112 | key = self.Wk(source_input).reshape(-1, self.num_heads, self.out_channels) 113 | if self.use_weight: 114 | value = self.Wv(source_input).reshape(-1, self.num_heads, self.out_channels) 115 | else: 116 | value = source_input.reshape(-1, 1, self.out_channels) 117 | 118 | # compute full attentive aggregation 119 | if output_attn: 120 | attention_output, attn = full_attention_conv(query, key, value, self.kernel, output_attn) # [N, H, D] 121 | else: 122 | attention_output = full_attention_conv(query,key,value,self.kernel) # [N, H, D] 123 | 124 | # use input graph for gcn conv 125 | if self.use_graph: 126 | final_output = attention_output + gcn_conv(value, edge_index, edge_weight) 127 | else: 128 | final_output = attention_output 129 | final_output = final_output.mean(dim=1) 130 | 
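# final_output is [N, D]: the attentive aggregation (plus the GCN branch when use_graph=True), averaged over the H attention heads; attn is only computed when output_attn=True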
131 | if output_attn: 132 | return final_output, attn 133 | else: 134 | return final_output 135 | 136 | class DIFFormer(nn.Module): 137 | ''' 138 | DIFFormer model class 139 | x: input node features [N, D] 140 | edge_index: 2-dim indices of edges [2, E] 141 | return y_hat predicted logits [N, C] 142 | ''' 143 | def __init__(self, in_channels, hidden_channels, out_channels, num_layers=2, num_heads=1, kernel='simple', 144 | alpha=0.5, dropout=0.5, use_bn=True, use_residual=True, use_weight=True, use_graph=True): 145 | super(DIFFormer, self).__init__() 146 | 147 | self.convs = nn.ModuleList() 148 | self.fcs = nn.ModuleList() 149 | self.fcs.append(nn.Linear(in_channels, hidden_channels)) 150 | self.bns = nn.ModuleList() 151 | self.bns.append(nn.LayerNorm(hidden_channels)) 152 | for i in range(num_layers): 153 | self.convs.append( 154 | DIFFormerConv(hidden_channels, hidden_channels, num_heads=num_heads, kernel=kernel, use_graph=use_graph, use_weight=use_weight)) 155 | self.bns.append(nn.LayerNorm(hidden_channels)) 156 | 157 | self.fcs.append(nn.Linear(hidden_channels, out_channels)) 158 | 159 | self.dropout = dropout 160 | self.activation = F.relu 161 | self.use_bn = use_bn 162 | self.residual = use_residual 163 | self.alpha = alpha 164 | 165 | def reset_parameters(self): 166 | for conv in self.convs: 167 | conv.reset_parameters() 168 | for bn in self.bns: 169 | bn.reset_parameters() 170 | for fc in self.fcs: 171 | fc.reset_parameters() 172 | 173 | def forward(self, x, edge_index, edge_weight=None): 174 | layer_ = [] 175 | 176 | # input MLP layer 177 | x = self.fcs[0](x) 178 | if self.use_bn: 179 | x = self.bns[0](x) 180 | x = self.activation(x) 181 | x = F.dropout(x, p=self.dropout, training=self.training) 182 | 183 | # store as residual link 184 | layer_.append(x) 185 | 186 | for i, conv in enumerate(self.convs): 187 | # graph convolution with DIFFormer layer 188 | x = conv(x, x, edge_index, edge_weight) 189 | if self.residual: 190 | x = self.alpha * x + (1-self.alpha) * layer_[i] 191 | if self.use_bn: 192 | x = self.bns[i+1](x) 193 | x = F.dropout(x, p=self.dropout, training=self.training) 194 | layer_.append(x) 195 | 196 | # output MLP layer 197 | x_out = self.fcs[-1](x) 198 | return x_out 199 | 200 | def get_attentions(self, x): 201 | layer_, attentions = [], [] 202 | x = self.fcs[0](x) 203 | if self.use_bn: 204 | x = self.bns[0](x) 205 | x = self.activation(x) 206 | layer_.append(x) 207 | for i, conv in enumerate(self.convs): 208 | x, attn = conv(x, x, output_attn=True) 209 | attentions.append(attn) 210 | if self.residual: 211 | x = self.alpha * x + (1 - self.alpha) * layer_[i] 212 | if self.use_bn: 213 | x = self.bns[i + 1](x) 214 | layer_.append(x) 215 | return torch.stack(attentions, dim=0) # [layer num, N, N] 216 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Diff-AMP: An Integrated Tool for Antimicrobial Peptide Generation and Design Based on Diffusion and Reinforcement Learning Strategies 2 | 3 | ## Introduction 4 | We have developed a unified AMP (Antimicrobial Peptide) design and optimization framework called Diff-AMP, which integrates various deep learning techniques to efficiently accomplish the tasks of AMP generation, evaluation, and optimization. 
As depicted in the following diagram, this framework encompasses the entire process of automatic molecule design and optimization, including AMP generation, evaluation, property prediction, and multi-objective iterative optimization. The framework comprises four modules, each built on advanced deep learning techniques. Module A combines, for the first time, diffusion strategies with an attention mechanism (Module B) inside a GAN, giving a novel generation model that produces AMPs with the desired properties. Module C introduces a large-parameter pre-trained model that acquires general knowledge and efficiently recognizes potential AMPs among the generated sequences. Module D introduces a CNN model for multi-property prediction of AMPs. Module E introduces reinforcement learning strategies for the iterative optimization of the generated AMPs.
5 | 
6 | ![image](https://github.com/wrab12/diff-amp/blob/main/model.png)
7 | ## Environment
8 | First, clone and navigate to the repository.
9 | ```bash
10 | git clone https://github.com/wrab12/diff-amp
11 | cd diff-amp
12 | ```
13 | This process can take several minutes, depending on network speed.
14 | 
15 | Create and activate a Python 3.9 virtual environment with `virtualenv` or `conda`:
16 | ```bash
17 | # virtualenv (python 3.9)
18 | virtualenv env
19 | source env/bin/activate
20 | 
21 | # conda
22 | conda create -n diff-amp python=3.9
23 | conda activate diff-amp
24 | ```
25 | 
26 | Install the dependencies with `pip`.
27 | ```bash
28 | 
29 | pip install -r requirements.txt
30 | 
31 | ```
32 | This process usually takes around 5 minutes.
33 | ## Datasets
34 | - Find the datasets for the generation model (Diff-RLGen) and the recognition model in the `data` directory.
35 | - Download the datasets for multi-property prediction from this [Google Drive link](https://drive.google.com/drive/folders/1ZAr3149wxE-362TsxjATwtdRVOPClk37?usp=drive_link).
36 | 
37 | 
38 | ## Usage
39 | ### Generation Model
40 | - You can use our generation model directly through Hugging Face: [Diff-AMP Antimicrobial Peptide Generation](https://huggingface.co/spaces/jackrui/diff-amp-antimicrobial_peptide_generation)
41 | - Train the generation model: Run `gan_diff.py`.
42 | - Generate antimicrobial peptides: Use `gan_generate.py`. Get the weight files from this [Google Drive link](https://drive.google.com/drive/folders/1vb_vvso29CQHMt43WpTGxoXTki16oNSm?usp=drive_link) and place them in the `weight` directory to use the model without retraining.
43 | ### Classification Model
44 | - You can use our recognition model directly through Hugging Face: [Diff-AMP AMP Sequence Detector](https://huggingface.co/spaces/jackrui/diff-amp-AMP_Sequence_Detector)
45 | - Train the classification model: Run `AMP_Classification.py`.
46 | - If you want to check whether your own peptides are antimicrobial, run `AMP_Classification_Prediction.py`. Prepare a file named `seq.txt` with one sequence per line; you can modify the input format if needed. To use pre-trained weights, download them from this [Google Drive link](https://drive.google.com/drive/folders/1vb_vvso29CQHMt43WpTGxoXTki16oNSm?usp=drive_link) and place them in the `weight` directory.
47 | ### Multi-Attribute Prediction Model
48 | - You can use our multi-attribute prediction model directly through Hugging Face: [Diff-AMP AMP Prediction Model](https://huggingface.co/spaces/jackrui/AMP_Prediction_Model)
49 | - Perform property prediction on the generated antimicrobial peptides: Run `predict.py`. Get the necessary weight files from this [Google Drive link](https://drive.google.com/drive/folders/1iLzwYbq0R3lwJum4laG1KshXs7oXD9fv?usp=drive_link) and place them in the `models` directory.
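- To filter prediction results by specific properties, the repository also ships `attribute_selection.py`. A minimal usage sketch is shown below; the `-i/-o/-c/-v` flags come from that script's argparse options, while the file names and column names are hypothetical placeholders.
```bash
# Keep only the rows whose antibacterial and antiviral columns both equal 1.
# predictions.csv and selected.csv are placeholder file names; use the column
# names that actually appear in your prediction CSV.
python attribute_selection.py -i predictions.csv -o selected.csv -c antibacterial antiviral -v 1 1
```
The `-c` column list and the `-v` value list must have the same length; the script raises a ValueError otherwise.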
50 | ## Run
51 | - Run the full generation, recognition, and optimization pipeline:
52 | ```shell
53 | bash run.sh
54 | ```
55 | 
56 | 
--------------------------------------------------------------------------------
/attribute_selection.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import argparse
3 | 
4 | def select_attributes(input_csv, output_csv, attribute_columns, attribute_values):
5 |     # read the input CSV file
6 |     data = pd.read_csv(input_csv)
7 | 
8 |     # build a boolean mask used to filter the rows
9 |     mask = data[attribute_columns].eq(attribute_values).all(axis=1)
10 | 
11 |     # keep only the rows matching the mask
12 |     selected_data = data[mask]
13 | 
14 |     # save the selected rows to the output CSV file
15 |     selected_data.to_csv(output_csv, index=False)
16 | 
17 | if __name__ == "__main__":
18 |     parser = argparse.ArgumentParser(description='Attribute selection tool')
19 | 
20 |     parser.add_argument('-i', '--input_csv', required=True, help='input CSV file name')
21 |     parser.add_argument('-o', '--output_csv', required=True, help='output CSV file name')
22 |     parser.add_argument('-c', '--columns', nargs='+', required=True, help='names of the attribute columns')
23 |     parser.add_argument('-v', '--values', nargs='+', required=True, help='attribute values to filter on')
24 | 
25 |     args = parser.parse_args()
26 | 
27 |     # make sure the numbers of attribute columns and attribute values match
28 |     if len(args.columns) != len(args.values):
29 |         raise ValueError("The number of attribute columns does not match the number of attribute values")
30 | 
31 |     select_attributes(args.input_csv, args.output_csv, args.columns, args.values)
32 | 
--------------------------------------------------------------------------------
/data/6-pc:
--------------------------------------------------------------------------------
1 | H1 V P1 Pl PKa NCI
2 | A 0.62 27.5 8.1 6 2.34 0.007187
3 | C 0.29 44.6 5.5 5.07 1.96 -0.03661
4 | D -0.9 40 13 2.77 1.88 -0.02382
5 | E -0.74 62 12.3 3.22 2.19 0.006802
6 | F 1.19 115.5 5.2 5.48 1.83 0.037552
7 | G 0.48 0 9 5.97 2.34 0.179052
8 | H -0.4 79 10.4 7.59 1.82 -0.01069
9 | I 1.38 93.5 5.2 6.02 2.36 0.021631
10 | K -1.5 100 11.3 9.74 2.18 0.017708
11 | L 1.06 93.5 4.9 5.98 2.36 0.051672
12 | M 0.64 94.1 5.7 5.74 2.28 0.002683
13 | N -0.78 58.7 11.6 5.41 2.02 0.005392
14 | P 0.12 41.9 8 6.3 1.99 0.239531
15 | Q -0.85 80.7 10.5 5.65 2.17 0.049211
16 | R -2.53 105 10.5 10.76 2.17 0.043587
17 | S -0.18 29.3 9.2 5.68 2.21 0.004627
18 | T -0.05 51.3 8.6 5.6 2.09 0.003352
19 | V 1.08 71.5 5.9 5.96 2.32 0.057004
20 | W 0.81 145.5 5.4 5.89 2.83 0.037977
21 | Y 0.26 117.3 6.2 5.66 2.2 0.023599
22 | 
--------------------------------------------------------------------------------
/data/PAAC.txt:
--------------------------------------------------------------------------------
1 | # A R N D C Q E G H I L K M F P S T W Y V
2 | Hydrophobicity 0.62 -2.53 -0.78 -0.9 0.29 -0.85 -0.74 0.48 -0.4 1.38 1.06 -1.5 0.64 1.19 0.12 -0.18 -0.05 0.81 0.26 1.08
3 | Hydrophilicity -0.5 3 0.2 3 -1 0.2 3 0 -0.5 -1.8 -1.8 3 -1.3 -2.5 0 0.3 -0.4 -3.4 -2.3 -1.5
4 | SideChainMass 15 101 58 59 47 72 73 1 82 57 57 73 75 91 42 31 45 130 107 43
5 | 
--------------------------------------------------------------------------------
/data/blosum62.txt:
--------------------------------------------------------------------------------
1 | A R N D C Q E G H I L K M F P S T W Y V B Z X 2 | 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 3 | -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 4 | -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 5 | -2 -2 1 6 -3 
0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 6 | 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 7 | -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 8 | -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 9 | 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 10 | -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 11 | -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 12 | -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 13 | -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 14 | -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 15 | -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 16 | -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 17 | 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 18 | 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 19 | -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2 20 | -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 21 | 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 22 | -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 23 | -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 24 | 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -------------------------------------------------------------------------------- /data/val_data.csv: -------------------------------------------------------------------------------- 1 | Seq,Label 2 | KDLIPEEAKMTIEDLRM,0 3 | AKKDSTGICFIGERNFKDFLSNYLPA,0 4 | VRQLCKLLRGTKALTEVIPL,1 5 | SISNALNKLEESNRNLDKVNVKLT,1 6 | VNFLLHKIYGNIRYS,1 7 | FITGLIGGLMKALGK,1 8 | GIMSSLMKKLAAHIAK,1 9 | VWVNGLPLRYL,0 10 | LCSATMKMNVQ,0 11 | ELKVGDVIAASLF,0 12 | GSNKGDRLTKTFEGFRNQLDKVQFIRK,0 13 | PRKYIHMGRGRVQLKRIENKI,0 14 | KKKKVVAATYVLVF,1 15 | RAASIVDLMQKDKECAVILTLAG,0 16 | MKDVMDLLEDADFSINVISKSGTTT,0 17 | GVVVRVGRVVVRWVRRRR,1 18 | KEEPDAPVEKVIKLALKRLMYEYIKGT,0 19 | RNLQDRCKDLEQDLGTLK,0 20 | GSDTSLIDTCGNTPNVSGAN,1 21 | AASKGVIHKNQAANKKSALARTLNKL,1 22 | DGSVKREDIFYTS,0 23 | CGETCLFIPCIFSVVGCSCSSKVCYRN,1 24 | SISCGETCTTFNCWIPNCKCNHHDKVCYWN,1 25 | LPRDRHAVTALFISLDPALVDVNV,0 26 | VIGAILGAFFLTQINSG,0 27 | VTAIGVITLMNFETVIGLE,0 28 | SFRLSLSSLTFT,0 29 | HARIKPTFRRLKWKYKGKFW,1 30 | YQKEKLPFNYIPD,0 31 | KSHKAIEEVFSMK,0 32 | VEAANRLNIKTLILDAPNSP,0 33 | IWTRVNSEITDTLMK,0 34 | KKRKGHKMATHKRKKRLRKNRHKKK,1 35 | VAPIAKYLATALAKWALKQGFAKLKS,1 36 | RWCVYAYVRVRGVLVRYRRCS,1 37 | WIQPKTKVIPFFFFF,1 38 | FAGAAFKALSKLL,1 39 | YLAPEMILNRSYASQWTTGGLGI,0 40 | RRRREELKQSWQRMRKLALKR,1 41 | HWQKESVQIQVDDW,0 42 | KLPLLPWLPLK,1 43 | ITEVKQAFLSGHM,0 44 | AFLGWIGAIVSTALPQWR,1 45 | IADGGGVYNMGNLVAVTPKRHIEIHKGGK,1 46 | SLLSLIPCMMSHLQDVSS,0 47 | QAAASVGQVALVN,0 48 | AAHEDAVFSLSFHPLGHILCSGSKDFTARF,0 49 | DFGQIKGKLQK,0 50 | LAVSTMTQMQEPN,0 51 | PPPFEMAEFETII,0 52 | YVRKTARLRDKADLLVNEINLYASTE,0 53 | FLLKIVALLKKKLL,1 54 | LKKLCARLKKLCARLKKLCARLKKLCAR,1 55 | SSLVLIAAMKV,0 56 | ASVPPGFTPFRVAPEIV,1 57 | ISRACLIMKVRRKK,1 58 | GKLICTTTVPWNASW,1 59 | HVMLALATVLSIAGAGTGATAI,1 60 | GKGRWLERIGKAGGIIIGGALDHL,1 61 | PNTMHQQFAERFTAKL,0 62 | GILKGILGMGKNLVCGLSGLC,1 63 | FHEPFRAKLIPEYSEIRTAAKKNG,0 64 | SVLSGSNPNVTAPTDRPSTANV,0 65 | FLGVVFKLASKVFPAVFGKV,1 66 | GTRCGETCFVLPCWSAKFGCYCQKGFCYRN,1 67 | VVPLISALREK,0 68 | NYSRLRNEHTMDERLAKEIMDAANDTGA,0 69 | FRKLFRVYSNFLRGKLKL,1 70 | EKESELREILNYGHTLGHAI,0 71 | FLPAIAGILSQLF,1 72 | AGGLGDALYGLAK,0 73 | KNESHSATRSALIQKLM,0 74 | NQKKSEKDQKDNNMS,0 75 | NSSSNSLVALPNNKQFVFF,0 76 | LNLYTILLSFVALFLA,0 77 | 
SELLLVGLVALLVLGPERL,0 78 | SFLSKFKDIALDVPRMRARVY,1 79 | KWFKIQLQIKKWKNKK,1 80 | ATFAPSVIGERGAVF,0 81 | RGILIHNTIFGEQVF,1 82 | YKRNLPRRGLSGYSMLAIG,0 83 | LQRAARLGVVTT,0 84 | GFLNTAMNTVTNLAGTLMDKAKCKIRGC,1 85 | ILPLKLPLLPWRR,1 86 | CFGGSLGIQYEDKINVGGGLSGDHC,1 87 | PSLLDEVTALVEWPVPLVCSFEERFLEVPQ,0 88 | IKLLARIALKIKIL,1 89 | ANRETKLGKAGYVTNRGRQK,1 90 | SDEEGNITEEANPNGSLLNIAAIA,0 91 | VPREILFWLTVVSVFLLI,0 92 | IIETPIVANFREGLNVLQYF,0 93 | LTLYLVIALASVSSLFLLSVVLFVG,0 94 | RTREDGDIGSKSVAELVDEI,0 95 | IPLLRKGKKIIWW,1 96 | AYDRAYKTDPTSAF,0 97 | YLLHDKIMTSKSSP,0 98 | ENMSGPPFERPSKSL,0 99 | HWSHGQNQWPLSCPQYVYGS,1 100 | QGRVTDHRINLTLYKLDEVMEG,0 101 | LFKCYKPDSRGFRVCE,1 102 | RPRRRATTRRRITTGTRRRR,1 103 | VAYLILWERKLIG,0 104 | PQITLRKKRRQRRRPPQVSFNFATLNF,1 105 | GILDPIKAFAKAAG,1 106 | RPILIRVRRIRVI,1 107 | GLMDTIKGVAKNVAASLLEKLKCKVTGC,1 108 | ASSLLLVGWKKWSKKWKHWIPQCKKFGKK,1 109 | LSYAFVTLEGQTISGRKG,0 110 | LIASLCERYLTSEDELSKALQKTN,0 111 | LERRIENLNKKMEDGFLDVWTYNAELLVL,0 112 | DVPKTTATVDTYESLLSDSNSNQSRD,0 113 | AILAYESGVIEL,0 114 | PQGTGIEPVVTCSNG,0 115 | VGVFPELSGSEFESFMAVAEKMRA,0 116 | ILPILAPLIGGLL,1 117 | GVPKMVKAYLDKKVK,0 118 | IIMLRETEREARPLDRRR,1 119 | CEGKKQEASEKECEIKEDFELKYQEMH,0 120 | GLKDMIKNLAKEAAVKLAGAVINKFSPQPQ,1 121 | WFIQHEWGKDQGWHCHVLLH,0 122 | FIGPVLKIAAGILPTAICKIFKKC,1 123 | FDFRCDRVRVLINRIGGVVTKTPTVG,1 124 | SLPQRLDELAGDDPWADYAGTRQ,0 125 | GGVGKIIEYFIGGGVGRYG,1 126 | QWLKGKDRGMPPGPDMTLELACGQA,0 127 | GFLGILFHGVHHGRKKALQMNSERRS,1 128 | TVGVLVKWILNVVAKFA,1 129 | FPELSQDLISNLLEK,1 130 | NILSGIANGINRVLSWFG,1 131 | CYYWSKYLWGYV,0 132 | ACWWAGAKQEF,1 133 | ITTTMPEFNEADEDKEMA,0 134 | VRLEFKLQQTSCRKRDWKKPECK,1 135 | VLYNSTFFSTFKCYGVSATK,1 136 | LLRIPQAIMDMIAGAHWG,1 137 | FGIEFTIIIEQVND,0 138 | GAGSQEERMQGQMEGQDFSHEERFLSMVRE,1 139 | VPTHLATDVELKEIQGMMDASEGTNYTCCK,1 140 | IRLKILPFVFFKAVKIFDRYCSKRIV,0 141 | SDYSRKRDPPQKYEEE,1 142 | KKWWRRVLKGLSSGPALSNV,1 143 | RARHMALIPYIVDRKKVVDSPFKQH,1 144 | YKPRRKRAAIHVMLALATVLSI,1 145 | GLWKGKLGLWKGKLGLWKGKL,1 146 | LKTMRKRPELMGKLLARGDWDYDLVDEL,0 147 | FWGTLAKWALKAIPAAMGMKQNK,1 148 | CWFWKWWRRRRR,1 149 | KKLLLLCLLKCKKLKLK,1 150 | PIQVRGRLDCGMAFAGDE,0 151 | MQFITDLIKKAVDFFKGLFGNK,1 152 | TKPKGTKPKGTKPKGTKPKG,1 153 | SALVGCWTKSYPPNPCFGRG,1 154 | PAPPPAPYVSPDEYSSLSLSQELV,0 155 | DYTRDNMAIVS,0 156 | FLGSIVGALASALPSLISKIRN,1 157 | HTASDAAAAAALTAANAAAAAAASMA,1 158 | MIRHVFKETIGVTLDEKFPVMLY,0 159 | FVPAILCSILKTC,1 160 | YTTKNIIGTIRAQAH,1 161 | GLRRLLGRLLRRLGRLLLR,1 162 | GFWGKLFKLGLHGIGLLHLHL,1 163 | STSCQSVSTDGSIEIDGL,0 164 | INWKKIKSLIKAAMS,1 165 | EQLNKEINQLKEKIESTKNEQLRLLKIL,0 166 | DDRFFFYWLMCIWCTFVMSH,1 167 | GLFRALLRLLRSLWRLLLRA,1 168 | KISKIGPENPYNTPVFAIKK,1 169 | QLHAEVEIHVEL,0 170 | FLSAIASMLGKFL,1 171 | AGDKRYTFSQTE,0 172 | RTMADRKRSKGMIPSMSLRYLRFLT,0 173 | PLTGGALTGQVV,0 174 | MSLLERKLLMHFLRV,1 175 | VRLRIRVAVIRA,1 176 | VKLMLEQCLTFQPTYASMA,0 177 | GEFAQFEQEAQSLEQELSQLEQQLESL,1 178 | VSSKYLSKVKVKAGK,1 179 | YIVKNPPKAVSTK,0 180 | PALEKHHGVWI,0 181 | GAAAAGKVVGGKGGK,1 182 | PSSILYNGPFLMK,0 183 | RLQLQQLQHMQ,0 184 | AFVRTKLKDVVS,0 185 | YVTDDMVGHKLGEFSPTRNFKQHTANR,1 186 | ECNDSIKAYGVASHRHTPEIEQTLT,0 187 | FRFKIKFRLKFRFKARFKFRAKFRA,1 188 | IFQNAPTDPTQDFSTQVAK,0 189 | GEEMTLSSEREAAKAAFL,0 190 | SFGISIDSKKLEV,0 191 | EIPGEPGKYFYVWLDAPI,0 192 | RLPWRWPRRPWRR,1 193 | EAIERLKESEK,0 194 | AEIAKAHVMAPPYVIKPVADGSSVGVF,0 195 | YKQLLGVFYKNAPLREQKVWDADAL,0 196 | LWKIWKKIWRVWWNWR,1 197 | AVESRPAGVIGTSQ,0 198 | SFPLGYYLIPFTGIVGLLVLAMGTV,0 199 | TLKQFHVKLKTPGVDAPSWL,0 200 | GSDIRVPLPPLTEERRKDLTKIVRGE,0 201 | KMKEASTNNEP,0 202 | ITKVITKLLNRLTKILSK,1 203 | 
TTESIVIWGKTPKFKLPIQK,1 204 | DCPNGPWVWVPAFCQAVGWG,1 205 | RGCYKICGETCLFIPCLTSVFGCSCKN,1 206 | KMGNITVDSSSGC,0 207 | EQVLTHLDLNGIASKVEIAGPGFI,0 208 | FAIAAQRIGQTIIAVD,0 209 | LIKHILHRLGGGFHFHLHF,1 210 | SQNLINIGSENSKKRGPNNLEWQ,0 211 | PVPLISVASSLE,0 212 | NPGLLETSEGCRQILGQLQPSLQT,1 213 | LRFDVLQDPQVPTRF,0 214 | ARTEWTADVFD,0 215 | GVGSFIHKVVSAIKNVA,1 216 | IMTKRGYIPGKG,0 217 | LLPDKSTFELMILI,0 218 | AQFKQYHSITDYHGMMFDLG,0 219 | LDVGCGGGILSEAMAQAGADVTG,0 220 | HLVPFYGKVSIGYLPCN,0 221 | KKLALLALKKWLLALKKLALLALKK,1 222 | AMPWRPATGLLPIKPTHIKPLCGDD,1 223 | EKDAVRLFITETFTSRLSTRRP,1 224 | NALSMPRNKCNRALMCFG,1 225 | MGALFKAALKAAGGGAAGGATYGGLKHFFG,1 226 | SLPFQALFSKLQQLAIIA,0 227 | MNHALEEIGCEVLES,0 228 | PNPKVFFDMTIGGQSAGRIVMEEYA,1 229 | RGSKRNADGLGMLVAQAA,0 230 | EPAVKPAAAATRSGVVGVLATTGTLQS,0 231 | NSLIPGLLDFL,0 232 | KGKKGVIIAILLFAIIYKKK,1 233 | GRPAGGGRRRRTGGLRRAAAPDRDYLHRPS,0 234 | TLDTIKQYYVLCNSRDEKFQAL,0 235 | AVAGEKLWLLPHLLKMLLTPTP,1 236 | VRWIGNIPQYLKGLLGGILGIGL,0 237 | GLLKKWLKKWKEFKRIVGY,1 238 | FLVPPKQPDADHYRGFGNLWSFED,0 239 | LFLLQEALRRKRER,0 240 | ENREVPPGFTALIKTLRKCKII,1 241 | GILDAIKAFANALG,1 242 | YLLKSHMDHPSQDV,0 243 | KQDLPNGTDSTGEPPLKKV,0 244 | KKWKKFIKKIGIGAVLTTPGAKK,1 245 | EIVARAGRLTATVKVSQVDG,0 246 | KALKKLLAKWLAAAKALL,1 247 | SASDNEEDSDEMRIGVYV,0 248 | VHRSRAEQRGRAL,0 249 | WASGAASFAAYA,1 250 | MGPPQRLYMSRIAEHLVRIG,0 251 | QLPICGETCVLGGCYTPNCRCQYPICVR,1 252 | VDAVKEIKETYGLKICACLGLLKPG,0 253 | WFQVKTRVRTKVQFW,1 254 | LLLTAIREGLVSSSH,0 255 | IVSSIAISEQGEMLN,0 256 | EVLDSVIMSGAIIGQGAKI,0 257 | ALAALITPKKIKPPL,1 258 | QDNYWVTQGLNILSG,1 259 | GLLSLSLGKLL,1 260 | NVNVPDVSLNYIKGF,0 261 | SAKFKLAVALGQFKEKLLLA,1 262 | GLLGSIGNAIGAFIANKLKPK,1 263 | MMQAKGWRIGAENSGHV,0 264 | SPYDATVTTKLRAAGIPILGKTNMDEFA,0 265 | RTKADIERLGELHPEE,0 266 | GVVVRIGRVVVRGVRR,1 267 | HVYTPVLASSELYKTSGHWDHYKDDMFPTM,0 268 | QVAKICNNMLLGILMAGTAEALALGVKNGL,0 269 | GFLDIIEKIAKSW,1 270 | DISNRDIMEAFAIEDPFHSVIVGFNV,0 271 | ANRLLEAYKMLLKFLGNLR,1 272 | AVPLIYKRPGVYVTKRPKGK,1 273 | SRSEKAGLQFPVGRIGRMLKK,1 274 | FLGKKPSPNFELSKQELYVRV,0 275 | GNPLKLFLPSTWVHFFKFLR,1 276 | INPDFSFVENQS,0 277 | RLARIVKIRVAR,1 278 | GWASKIGTQLGKMAKVGLKEFVQS,1 279 | PLCDESDDPDNVWY,0 280 | RFCWKVCYKGICFKKCK,1 281 | RDMHVIYLANRTY,0 282 | AAARIARVLHNDPASGVMRHAD,0 283 | PFLWMGYELHPDKWTVQPIV,1 284 | CYGRNESIASTPPGED,0 285 | ATAFALCRDQRLPVRVFSIIKHGAFKRVV,0 286 | ITIPPIVKDTLKKFFKGGIAGVMGKSQ,1 287 | FLPVLGKVIKLVGGLL,1 288 | YRLSIENLLTLDGIKEK,0 289 | KIKGAIKWKGAIKIKGAI,1 290 | FLPLIGSLLSKVLEK,1 291 | AQLLPNRGIDDILEDQVEPDDPLP,0 292 | TPIESHQVEKRKSNT,1 293 | FLPILAGLAAKIVPKLFCLATKKC,1 294 | RRGLLRVIRTVILALDIL,1 295 | ISLIIRGEDHVSNTPKQILIQQALGSN,0 296 | FLGALAKIISGIF,1 297 | ADELVFYDITASAHDRVVDKSWV,0 298 | EKRILGPVLGLVSDTLDDVLGILG,1 299 | EGQGVISLQAGTNK,1 300 | RGWRRWGRKWAHGWKKYG,1 301 | NTILHVTDLTGAETIAKV,0 302 | FALGAVTKVLYKLFCLITRKC,1 303 | RWRLLLLLRWR,1 304 | SVSGLGTGGMETKLQA,0 305 | PGGPGGAGGAGGFPGGAGG,1 306 | RTYYTEFAKALPKDTVILTAGCAKYR,0 307 | SIPSLGLNNSTNPHVPAGSQL,0 308 | GNACWELYCLEHGI,0 309 | HLRRINKLLTRIGLYRHAFG,1 310 | SIMETRPLSKTKHFRLIEVIEKAIG,1 311 | RRFRFFFRFRR,1 312 | GSNTGFNFKTLDKE,1 313 | PGSTHLPETPRLLPLPPV,0 314 | KDIMDAGKLVTDELVIALVK,0 315 | PCVKLTPLCVTLNCT,1 316 | KEILKIVKENFDFRPGMIIINL,0 317 | KEPSKIAEKLRFLRADRDRLSSRTRKLAA,0 318 | KWKSFIKKLTSAAKKVVTTAKPLALIS,1 319 | QKDYAPVVPTCKPSGQPKKA,0 320 | KNKLKKHKKLQ,1 321 | GKKLKLYFAQVQTPE,0 322 | GMQMRIAMFSLIYKKT,0 323 | RFRAQQAADLGIIDIGRTRRPKAI,0 324 | GIGRIGKVHAANLIKIPKG,1 325 | KWFRVYRGIYRRR,1 326 | QPVTSPAHIHPHLAWLYRTED,0 327 | HSIIGRKLVVHKKAK,1 328 | RIRLLQRFNKR,1 329 | 
FIDLIHRHNLKRPLILLN,0 330 | LESLASSAVRTANKARAKL,1 331 | KIFDAEILLNGKRKGLG,1 332 | WWRELLKKLAFTAAGHLGSVLAAKQSGW,1 333 | KLWLLWLKWLL,1 334 | YAERLCTCSIKAEV,1 335 | FIPLIGSVLSQLL,1 336 | IRILSCNPFCSGGIAYPTTRLKRPSLLQ,0 337 | FIGPIISALASLFGG,1 338 | LLHSLQLPEACSEIKHEAFKNHVFNVDNAK,0 339 | FLSLIPAAISAVSALANHF,1 340 | KAMVTGLMAVPKRRVSHS,0 341 | WADGNVYIGRWKSGKMNGSGVMQW,0 342 | GVEKDKRALEI,0 343 | DIPRFNKFFHGMLERGIYL,0 344 | KRKILILILIKRK,1 345 | AAEAIETSDEVKQQFISEFI,0 346 | FKCWRWQWRWKKLGAKVFKRLEKLFSKI,1 347 | SEAMVFVQTEDTPEGTGLLLRPVEE,0 348 | LEMLLKMTYLSQIEALLFVAGEEGLSL,0 349 | GCKKFRRFKLKCKQKLWLWCG,1 350 | FLASHVAMEQLSKLGSKIATKL,1 351 | SIWAGEKVTGATVFRIVRAM,0 352 | RKKRRQRRRGGGGLHITDMAWKR,1 353 | QWSRIFYSGHRKGPGIALVT,0 354 | CFSSTPSSSPVSIPPYISCDRSNFGAVAS,0 355 | SDHAIGKAQTLPAMFGVDLAIEALLSI,0 356 | LGGDNYGTFSGSNGNNFQHGSN,1 357 | TSHHSESIITRDI,0 358 | KVGSLQYLALAALIT,1 359 | NGELIELGVKHKLVNKSGAWFSYE,0 360 | GYFGWDDLMDLAITIRT,0 361 | LDCFTILIEEGTE,0 362 | GRVSDHRINLTLYK,0 363 | TKKEKRILGPILGLVSNALGGLLG,1 364 | GSGILILIKRK,1 365 | DFKNRIDLRHLP,0 366 | KKAAAFAAAAAFAAWAAFAAAKKKK,1 367 | GGILVAKLVTTLGPALISWIKRKRQQ,1 368 | LKLLLLKLLKLKLWK,1 369 | SHRNYVSGLVAAHLQWHLER,0 370 | CLAPLAKVIHDNFGIVEGLMTTV,0 371 | YRMWRWRWRWR,1 372 | VGIGTPIFSYGGGAGHVPEYF,1 373 | VTEFKHHLEYHIRKHKNQKP,0 374 | VNWKKVLGKVVKVVK,1 375 | FAIRARRSGNHPFSSADVGR,0 376 | AMRMTYNRPCLYATKRTKEM,1 377 | IIKKIIKKIIKKI,1 378 | QLQKWEDWVRWIGNIPQYLKG,1 379 | MVSAAKVALKVGWGLVKKYYTKVMQFIG,1 380 | KKKKKKKKKGGGLLALLALLA,1 381 | AYELAEELKDIMRPLFQKHQDDII,0 382 | GLASFLGKALKAGLKIGSHLLGGAPQQ,1 383 | SYLSDPQSVEES,0 384 | NLYLIIDTMHELSSLMSST,0 385 | KWLCRIWSWISDVLDDFE,1 386 | FPFSLIPHAIGGLISAIK,1 387 | VSSPDYPERNRENVMED,0 388 | IYLLGKNKRMSNYVPRLKKVYTE,0 389 | ESLKARSLKKSLKLKKLL,1 390 | GVPKDFVVPVRELVL,0 391 | CALKLTKAKRLVRKIGF,1 392 | FKKALHLFKPIKKFLKWK,1 393 | IKHVVAADYLDKDTYGYVGDVTHINKRVIE,0 394 | ANAAVHVAMAGK,0 395 | LGGIVSAVKKIVDFLG,1 396 | INWKKIAEIGKQVLSAL,1 397 | GMRTVLNILNFVLGG,0 398 | GFLNGLLGAGEKKGRGINGVG,1 399 | ALLNHFYPPRRISPGVHPT,0 400 | AGVNDRVQLAQLG,0 401 | WRWAKWGLKLWKWKKIY,1 402 | GVLDILKGAAKDLAGHVATKVINKI,1 403 | KRIGWRWRRWPRLRCK,1 404 | FPELQQDLIARLLGK,1 405 | GWKDWAKKAGGWLKKKGPGMAKAALKAAMQ,1 406 | RVKRVWKLVIRLVKALYKLYRAIKKK,1 407 | SIIFDSLYDQQHTYEVKR,0 408 | SLYNVLHEQSSVVIDHGQAVRFALD,0 409 | LGQPIDGLGAIEA,0 410 | AKKVFKRLPKLFSKIWNWKHHHHHH,1 411 | NSVNHTSGAVPTPSVMGSATASST,0 412 | LSSGRPDGFIHVQGHLQEVD,1 413 | ISLASDIENADSRSLARSILDRVD,0 414 | TSLGGWKLIQQKMDK,1 415 | GKFFKLMLNQGINLAPSKFEAW,0 416 | KFGYRYDSLELEGRSISRIDELIQQR,1 417 | FMKKIVKIGKIARYSHKAKK,1 418 | SRHNNEEKAQAIKFATIDSNIQLSI,0 419 | KWKSFIKNLTKVLKKVVTTALPALIS,1 420 | GQLNKFIKKAQRKFHEKFAK,1 421 | WFKFFKKFFKKWK,1 422 | RRFPFFPFKFPLI,1 423 | SKRKLGRHVRALV,0 424 | IATASVKLACGEEIKAEAAN,0 425 | RKRAARLLKRLV,1 426 | FIGTLIPLALGALTKLFK,1 427 | DYMILPTVTYYTISI,1 428 | ISLVSELDNLNLLDCLKS,0 429 | APYFRCEKGTDSIPLCRKCETRVLAWKI,0 430 | WQNDYSRDYEVKNHMECQNRSDKYIW,0 431 | CLGVGSCNNFAGCGYAIVCFW,1 432 | KWKLFKKIEKVGQGIGAVLKVLTTGL,1 433 | AQNGVLYGTPTKHEITFA,0 434 | KDIEIHALDAFDRTALITLPA,0 435 | DTHIIHHAGHNAHRDNPAAV,0 436 | DALMREIGRQARAAARR,0 437 | EGGLKHIEDAIEKLGKRHD,0 438 | LAPVGELFVCGGGA,0 439 | VRRFPAAAPFLRR,1 440 | DMKDMPVLSFRN,0 441 | YIAVDECNDTDEDESMMDIDSADSGNPLAA,0 442 | LNMVKLEAEDCMEEISSDFP,0 443 | RSQIVLQSLDGKSR,0 444 | TCNKNGIPFDPEKPFVTSGLRL,0 445 | KGPLGRAMFRKLKVYAGTE,0 446 | KFFKRLLKSVRRAVKKFRK,1 447 | CADCSAPDPGWAS,0 448 | KKALLALALHHLAHLALHLALALKKA,1 449 | IETFLKQLRSAANKIVGL,1 450 | MCLEKDTLGLFLREG,0 451 | FFPMLADLVSKIF,1 452 | EMVFYIWKPGGMA,0 453 | SCANPFLYAIFTKNFRRDFFI,0 454 | 
ELALIKQVAYLVAMGN,0 455 | KINNRKILTGIAEII,0 456 | FLPGILKVAANVVPGVICAITKKC,1 457 | GGLDRGFTFEPLTDLLK,0 458 | ILGIITSLLKSLGKK,1 459 | PSVQGAAAQLTADVKK,1 460 | LLEVYEVLSSNLNPTKDSGPV,0 461 | EDEEEYREIKES,0 462 | KWKSFIKNLEKVLKKGPILANLVSIV,1 463 | SGSNQTLCFHCKGREAELYC,0 464 | GTKWLTEWIPLTAEC,1 465 | LPVPQGIACDTAEEAVS,0 466 | ENQYIGYVAYPLDLFE,0 467 | KRFLPSCVRSIQNLDDALPTPEEF,1 468 | SKTKRRSLLKRLGDGIRGFWNGFRGRK,1 469 | SEVLVAASTVEKAI,0 470 | YRWWRWARRWYRWWRWARRW,1 471 | CLRKLRKRLLRC,1 472 | KLKLCKKLKLLKLKLCLK,1 473 | RVLGANTGQSHNGVFTTR,0 474 | KWGKKWGKKWGK,1 475 | LALLKVLLRKIKKAL,1 476 | FVIIGSHVMLGGGSMVSQDVPPYV,0 477 | FWPSFNSAIAEPG,0 478 | RVHGSELPPSATAKRQELAGKS,0 479 | RINSAKDDAAGLQIAKKLFKKILKKL,1 480 | WVWVPAFCQAVGWGDPITHW,1 481 | GLPVCGETCTLGTCYTQGCTCSWPICKRN,1 482 | GWLPTFGKILRKAMQLGPKLIQPI,1 483 | RWRWRKRRWRKGLIP,1 484 | LHMGHVRNYTIGD,0 485 | ALAKMQERGLPYISVLTDPT,0 486 | PNRESQFYKILRKILSKIV,1 487 | PGLRMNIIANPGIP,0 488 | SGARFHANYFRPGGVAKDLPAGLTDRI,0 489 | GLWEKVKEKANELVSGIVEGVK,1 490 | DFIRQYVDKIL,0 491 | RRAKVLSLLNGIT,0 492 | KWLRRPWRRWR,1 493 | YHKLKLLLIVTKIVELLGKK,1 494 | SDLLEQNRLLQDQLRV,0 495 | LDFEQPIAELEAKIDSLTAVSRQ,0 496 | LKFIAPVIAKGLD,0 497 | LDALEAAMDADVV,0 498 | IGNIPQYLKGLLGGILGIGL,1 499 | STMRFHKLATVETAEPEDKAII,0 500 | GFLGPLLKLGLKGAAKLLPQLLPSRQQ,1 501 | SGLAYGIHVVITT,0 502 | IVPFLLGMVPKLVCLITKKC,1 503 | KKKKVVAATYVLKK,1 504 | GSLCGDTCFVLGCNDSSCSCNYPICVKD,1 505 | TDLDNLWLDVLL,0 506 | FIKGLKRLCAVIVPSVICAVDKLPPG,1 507 | NDELILTFGADDVVCTRIYVREMPFALGQ,0 508 | VTDSQYALGIIQAQPDQSES,1 509 | PAQPFRIKKRQGPFERP,1 510 | LRRLQQERVLEDTPAIRGNVEKVA,0 511 | NKHHTYKYSTM,0 512 | LLLTGQSAIVGGLEVAGRVVETVTGTVG,0 513 | GEFLKCGESCVQGECYTPGCSCDWPICKKN,1 514 | PEDQELESLSAIEAELEKVAHQLEELRRG,1 515 | GPVGLLSSPGSLPPVGGAP,1 516 | FAKLFAKLAKKFAL,1 517 | HFLGGTLVNLAKKIL,1 518 | RRWARRLFFAYRR,1 519 | WLRAFRRLVRRLARGLRR,1 520 | GLKALKKVFKGIHKAIKLINKHVQ,1 521 | ARSANVGGEVL,0 522 | LVKKLLKLAMGFG,1 523 | DCTRWIIGINGRICRD,1 524 | NFLGTLINLAKKIM,1 525 | ALYKKIIKKLLESAKKLG,1 526 | TINYPFEMGHRGP,0 527 | RQYMRQIEQALRYGYRISRR,1 528 | SSPPYWTSAFLTFQHAIESS,0 529 | LYHFLHWFQRF,1 530 | EIEYGNPGVGTDR,1 531 | GFKLGRKLVKVFKWII,1 532 | LATLLFTMCLNFCGVLAGDD,1 533 | DVLQHLGGRFASLSTIPENLLLNMNRLGL,0 534 | ILRWKWRWWRWRR,1 535 | VRRILTYILVNAKEPTLPKGI,0 536 | TLVEVRKSSTGSTTTISLNQERNSLDHTAI,0 537 | KIAVSDSCPDCFTTQRECIYINES,0 538 | KKKAAFAKAWAKAFKAA,1 539 | LFAKFDDFRPM,0 540 | GLASTLGSFLGKFAKGGAQAFLQPK,1 541 | GPELNHAFGLCHHGNYLFWT,0 542 | ILQSLKVDYIDESEV,0 543 | KSMILKLSSQNKF,0 544 | TAVDKLNTESQEMGRAIYEAEA,0 545 | QFAIPTIDYIRILPE,0 546 | AGPGSLGPVNLPIPV,0 547 | LKRCSKHLVSKQNDL,0 548 | TKGASEVAPRRIHAP,0 549 | FCTMIPIPRCY,1 550 | KLKKLCCLLLLKKLKKL,1 551 | GLWSKIKTAGKSVAKAAAKAAVKAVTNAV,1 552 | VPSRTYKGQFFALPQSPQLF,0 553 | ENQNWHLKHFCCFDCDNILA,0 554 | AAQLLWGPLSDRYGRKPVLLIGL,0 555 | RWRLLLLLKRH,1 556 | KLFKRHLKWKII,1 557 | CVWEYRRTVNDEYAGPRL,0 558 | LLPIVDNLLDGLL,1 559 | TKKFPNRRKTAFCSNWNQKIKTV,1 560 | FRAGSMDNGDGIAVGWLGHP,0 561 | SGSFVEVIAISKE,0 562 | SGKKFVNLWMHAEHLVVDQK,0 563 | IAGNAFPDTHEE,0 564 | TAYFLLKAAGRW,1 565 | IENPELHQHFG,1 566 | GRAAALISGTNNV,0 567 | VIIRYANDVIGLGKWVGNRVKN,0 568 | AVPAVRKTNETLD,1 569 | GTIILADEFSPDNCRLWDKN,0 570 | TRDTNDLISSRTAAPSMV,1 571 | KKKKVVTSTYVLVEA,1 572 | YDVVSVELANERERR,0 573 | YVLAKRKRAIFI,1 574 | GRFRRLGRKFKKLFKKYGP,1 575 | IKKILSKIKKLLK,1 576 | AAGHQADMAWMQDPRRQAIELLLPGV,0 577 | VLNACSDICYLEEGKQLHS,0 578 | LVIGSEHMSSIL,0 579 | KETSFSVVRAGGS,0 580 | EMSQNGSDRDPWSKPGQSNDQQPGNSSNN,0 581 | WAVNAGGEAHVDVHGI,0 582 | AKELTQKSLPEIGELFGGRDH,0 583 | YQEPVLGPVRGPFPIIV,1 584 | MIGHKFGEFAFTRKATIHKKKTK,1 585 
| RKMMLTHLVVRPGL,0 586 | ILKAHKALETITLTKDVAHFKPIIEKQ,0 587 | KLIPILSKTIPAIKNLFYKI,1 588 | DSDITSQTIGEMVME,0 589 | KKKKPLFGLFFGLF,1 590 | RYDGALFPKAAGFALGVE,0 591 | DEQWCDLLELNSVDSV,0 592 | RWKMRRIKIQR,1 593 | GMATKAGTALGKVAKAVIGAAL,1 594 | KSLIGAERPSLDEHERVWRGFINL,0 595 | IRTAKAKGLSYTRIVIVHALRNALIP,0 596 | NVLPEGWRYDPKTAALLVK,0 597 | RKRPAAYPFVLSTSSEKDKG,0 598 | ADELLARTDQVVA,0 599 | PDKWTVQPIVLPEKDSWTVN,1 600 | IHFKWRRWKFHI,1 601 | RDMAAETMDPRASTE,0 602 | FSVPLDEDFRKYTAFTIPSI,1 603 | HRMANMMMNWSPTAALV,1 604 | LLFREAGLPGSVSYEKVEV,0 605 | FRRFRKWFRRFLKLF,1 606 | AGKEKIFQRLKKTIQEGKKIAKRAW,1 607 | FIGLLISAGKAIHDLIRRRH,1 608 | AAEKTLKKLEFEMK,0 609 | FEGYTVNKDLVDK,0 610 | FNLDNGGPTPGLN,0 611 | ECRSTSYAGAVVNDL,1 612 | GFKEVLKADLGSLVKGIAAHVAN,1 613 | HTLRTQLIECE,0 614 | DPVWARLNLAWAGFFAAMGVLNLYVAY,0 615 | ELVNQIIEQLIKKEKVYLAW,1 616 | KIRFTRTVSRLLKAALAST,1 617 | LLKLFFPFLETGE,1 618 | RLMKCYKPNSRGFQLCE,1 619 | YGTCYTRLSHFS,0 620 | YVWWFKPQFTFPVAM,0 621 | KRNGFRKFMRRLKKFFAGGGSSIAHIKLH,1 622 | GIGKHVGKALKGLKGLLKGLGES,1 623 | KRIRKRIKKWLR,1 624 | VKSMLPMQVILLEPGKLGKTGDVV,0 625 | AYAGIPLTHRDHAQSVRFVTAHGKGGTQDL,0 626 | KFKSKIMSELVSN,0 627 | DFVVVEVKDWKIIPLENIIA,0 628 | GGVLGGAGGTVAYGATCWWS,1 629 | DVKIDGQAVTVKGPKGELSL,0 630 | RSRGSESTRARQAAVVSEAGRDDDQESG,0 631 | SISTEQARTLLPL,0 632 | AKRLLKVLKRF,1 633 | AENSSASIATAGIGGPPQ,0 634 | GIDEVGKGAIFG,0 635 | YTIAKQYYELGQQIKPFVINT,0 636 | KKLKKFKNKLQ,1 637 | AAKAWLKLWAKAA,1 638 | FILPLIASFLSKFL,1 639 | ILRYVYYVYRRK,1 640 | PLELFERWLKQACEARLPD,0 641 | QELELDYAFEQEK,0 642 | GFVDFLKKVAGTIANVVT,1 643 | GWKDWLNKAKDFIKEKGPEILRAAANAAIN,1 644 | LRIVKLILKWLR,1 645 | LGTDSLKNLTLRDLS,0 646 | LGRVDIHVWDAVYIRGR,1 647 | FIKWKFRPWKKRT,1 648 | KVEIVAINDPFIDL,1 649 | RNDANASSEFYMN,0 650 | VKKMCEKCRVIRRRGR,0 651 | AGHNKVGSLQYLALA,1 652 | NSLPLYTTASMGNPALNSLA,0 653 | FLPLIASLAGNVVPKIFCKITKRC,1 654 | WCFAVCYRGRCRRKCRR,1 655 | LKMFKLPVHQLSMI,0 656 | NGGVGGLGGAGAAGGNGGAGGMLLGDGGA,0 657 | GWFKKTFHKVSHAVKSGIHAGQRGCSALGF,1 658 | VLPVRDGDKDYL,0 659 | GSAQPYKQLHKVVNWDPYG,1 660 | FTLILYVTDIV,0 661 | GWGESVIIGVAGAGQEIATRPFQL,0 662 | GFGSFLGKALKAALKIGANVLGGAPQQ,1 663 | DDDVDVLVQSIYHLSPKMDMQLLT,0 664 | GSDQTTTMSGGEGEPGEGEGDGEDVSSD,0 665 | RERLKIAQETAFNEKAH,0 666 | AADFSSGAPITGQLGPDAISEATV,0 667 | CFQDLQERRDSLLYEIDGMV,0 668 | GLELAASATQRRCKVTVIELA,0 669 | SLGIVPCAISWLFRLIDE,0 670 | KLKKLLKKWLKLLKKLLK,1 671 | SVDMVMKGLKIWPL,1 672 | AKGVALGGGCELLLYSSYIVANQ,0 673 | VLNLTDNTYLKLNDEVIM,0 674 | WNSLKIANLAV,1 675 | PAWFKARRWAWRMLKKAA,1 676 | WKWWQWQRNWRKVWG,1 677 | LVINSNGNLIAPRGYF,0 678 | FLKALFKVALKVL,1 679 | LVVMRATQLQKNLTCEVW,0 680 | SDCFKKILKDPGKPQAKGNFWTVDVSRI,0 681 | DAPTIEISFRYNGTQQKFSI,0 682 | QRNLILSHCCGVGEPLSENIVRLI,0 683 | HLPLPLLQPLMQQVPQPI,1 684 | RPWAGNGSVHRYTVLSPRLKTQ,1 685 | AALFLGGYGVF,0 686 | QTFKAANFDFFKVDA,0 687 | KALAKALAKLWKALAKAA,1 688 | MTPWFLYLIRTADNKLY,0 689 | MSTNPKPQRKTKRNTNRRPQ,1 690 | LKLKAIAALAKKKW,1 691 | YQVVKQPDYLLVPEEVMEYK,1 692 | FLPLVTMLLGKLF,1 693 | PDVYQPLRSATPTSGFS,0 694 | LSRVASIQSEPGQQN,0 695 | RWRWRWWRWRWRWCTKSIPPIC,1 696 | SILSGNFGVGKKIVCGLSGLC,1 697 | GLLGFVGSLLGGLGI,1 698 | IKVTDNGRGIPVDIQEKMGRP,0 699 | WWSYVRRWRSR,1 700 | TPLSLGIETLGGVMTKM,0 701 | QLSGKMSKEDIEGTNILDEPVHGI,0 702 | VQIIETRPLSALKRWRIVKIIERAK,1 703 | EEMKKSDEEIKKYIEEIKKVEEESKYDEE,1 704 | NCVKMLCTHTGTGQAITVTP,1 705 | RHDEVLAAGGLH,0 706 | PVDCTMYSNKMYNCSLQNGF,1 707 | KYELPVISPIDD,0 708 | ANVGFVIQLQARLRGFLVRQKFA,1 709 | DMGFDANSCAV,0 710 | PRRLAENVKIKID,0 711 | ALCSSERTTLCGPDSMLLAL,0 712 | GRPKGKVNADYSL,0 713 | VHGKSHIIGKMPGDRWQKFA,0 714 | RWRWRWFSGGFIKHFIHRF,1 715 | DMADTDKMMKKS,0 716 | 
GSHLYSHLSSAH,0 717 | AARIILRPRFR,1 718 | AWRERNPQARISAAHEALEINEIRSRV,0 719 | GEFLFFDWRQDPAFELNQPRYKGSQILVAN,0 720 | NEAPVRSGPAKPAAKPPRS,0 721 | HEFTTLLAENRRNGITVDDSVWAKIQALMI,0 722 | LGCINVHGSILPKYRG,0 723 | GWGRFFKKWWRVGKRVGK,1 724 | RARTHNSHTSATHSHGHRRS,0 725 | GILDSLKNFAKDAAGILLKKASCKLSGQC,1 726 | NQRQTRSKRKAIGDTLE,0 727 | ITSISLCTPGCPTNVFVCISKRCK,1 728 | VNALKDRAEAYLIEE,0 729 | WPKWWKWKRRWGRKKAKKRRG,1 730 | RKMADEWGLKLISIRDLIAY,0 731 | GNGNLLGGLLRPVLGVVKGLTGGLGKK,1 732 | ELSIRTRGGRLLHN,0 733 | DEEGKLNKSLLDVQG,0 734 | GIHDILKAGKPS,1 735 | PKISRAWKKRENDRRQKHFNGVARPA,0 736 | LLGAALSALSSVIPSVISWFQKG,1 737 | KNEKKNDKINDTINDKINHKIN,0 738 | FIHHIIGWISHGVRAIHRAIHG,1 739 | TPVDPAEIIARDSLCPLPSNGFSVKLFP,0 740 | ADHSPLVGMNFIRQIRQL,0 741 | FIFLPIFRRPVS,1 742 | TEEAELELAENREILKEPVH,1 743 | QICRIIVVRVCRPICRITVIRVCS,1 744 | LKLLLLLKCCKKKK,1 745 | VFLGAIAQALTSLLGKL,1 746 | PSTQENKTENTGEAS,0 747 | LKLLKKLLKLLKKLGGGK,1 748 | FLSLIPKAISAISALINHF,1 749 | FFFIRRIARLLRRIF,1 750 | FSKLLKNQDLMVNHDRACFFRPTLV,0 751 | PMAKKHAGERDKKLAAKKKTDKKRALRRLM,0 752 | VWKRWVVRKWKWVKKKVKKK,1 753 | RGKAKCCKRGRKCCRRKK,1 754 | MPRRRRIRRRQK,1 755 | ITFLDIHPVEIARQLTLIDYELF,0 756 | AATAGKMHPIFLEIIIAP,0 757 | WKIKWKIKWKIK,1 758 | GKILKYLLYLLRKYANLIIR,1 759 | CLGIGSCNDFAGCGYAVVCFW,1 760 | AYCTFPASVLAPVPTHSTVSE,0 761 | SYEIKINRHFKTLKKNLKKK,1 762 | WYQLEKEPIVGAETFYVDGA,1 763 | YDAREVFGDEKAIWW,0 764 | AVAEVVKDLLAKAKKINTSDEVA,0 765 | FALAKLAKKAKAKLKKALKAL,1 766 | KQPVIVSYRTHYPA,0 767 | YHFEIYKLVLK,0 768 | DFISNLDENGEILEEKDESFYE,0 769 | KRKLTLVFGVMAGVIGTKKR,1 770 | KFKLKSLKIIEYKQ,1 771 | MTYYWRMKMPET,0 772 | NSVGSKAWHTTND,0 773 | KGGKFLNFLKKAAKVGAKVGMAALG,1 774 | GKMFPGLGKEFGLGKFPELMGER,1 775 | VLGFNGQQVTADNQDD,0 776 | ASYLSAGDSPSQFANTALH,0 777 | PFHMLGVAGVFGGSLFSAMH,0 778 | PLCLKSKYQSYRHQAEFSMS,0 779 | DETVEIVPTPVVI,0 780 | GVLDIFKDAAKQILAHAAEKQI,1 781 | PDFIDGYINLAAA,0 782 | IRPAKLRWFKKIK,1 783 | ATVRFSEDADGVLPVKTFLEYL,0 784 | ACFLTRLGTYVC,1 785 | TPSSFPQHVGHILCQLGEKH,0 786 | FLLFPLMCKIQGKC,1 787 | LVKIPRMILMYIHS,0 788 | GTSLSGGQQQRLCIARAIAVS,0 789 | MAGRSGDSDEELLKTVRLIKFLYQSNPPPS,1 790 | HHPSEVDVTIQAAR,0 791 | FFLPPCAHKGTC,1 792 | VNKLKQQAIDHVNNAPDVHS,0 793 | LLKKGCKATCLCEG,0 794 | SEVRISVIIPAEY,0 795 | GRWPVKTIHTDNGSNFTGAT,1 796 | GRKLWRKWLKRWLP,1 797 | FAKGVGKVGKKAL,1 798 | AIFGLSHTHNTKVGNDFVRGVSGG,0 799 | GITDLRGMLKRLKKMK,1 800 | NDCLPDPCHSRGRCYDLVNDFYC,0 801 | SHHRRRRSHCH,1 802 | LGGGLPAGFVGPLSRMAYQA,0 803 | CGKSTILRCFNR,0 804 | HVTTTFAPPPPR,1 805 | KWLDAFYKDVAKELEKAF,1 806 | TAYFLLALAGRW,1 807 | NLENTVKETIKYLKSLFSHAFEVVKT,1 808 | GLFDIVKKVVGAIGSL,1 809 | LFSEPYKTNKG,0 810 | GRRPRPRPRPFFFFF,1 811 | TKKAVPGKKEEKTTKTV,0 812 | IKTQIKADASILQVSAAS,0 813 | EQNPPVRVYDTSGVYTD,0 814 | KWRRKLKKLRPKKKRKV,1 815 | TVGKNEAGKAINGLERE,0 816 | QQKPKGNTSSILVTEQEITYAE,0 817 | RSITRPVLVRRRWRVRPVF,1 818 | AKNPTLCKLVGGIESVTLSDTQAIL,0 819 | HISAELNGTAPNPTATNPSGKKGVPK,0 820 | GSATLCSALYVGDLCGSV,1 821 | KKLIKVWAKGFKKAKKLFKGIG,1 822 | ECDIYSYNPDLDSDPFGEDGSL,0 823 | FSQSRLDYLQNQQAQAAAQI,0 824 | CCFLRIQNDSIIRLGDLQPLSQRVSTDWQ,1 825 | SSPQTPSDWPTAP,0 826 | KIAKVALAKLGIGAVLKVLTTGL,1 827 | NPQAGRIKTVTSFDLPALRFLKL,1 828 | ASVLARAAFEITTQHLFEAAE,0 829 | EHKSFIITGNGDVVQPEQDLI,0 830 | VTISIVTSVLGIGPYLDAN,0 831 | GMGVGSDVEEEEE,0 832 | GAARKSIRLHRLYTWKATIYTR,1 833 | LRKCEEESQAQMAKITGG,0 834 | FLAAARIAKRVAKKARKLAKRAARKRK,1 835 | SGSLSTFFRLFNRSFTQALGK,1 836 | KQIMTQFFNFARSPAVKD,1 837 | KKLLGWGLLKK,1 838 | KWKLFKKLPLIGRVL,1 839 | LRPAILVRVKGKGL,1 840 | KWKLFKKKGTGAVLTVLTTGLPALIS,1 841 | AGTSYDRIRLLYLDEMGGMAIPLPK,0 842 | ILKCSHTFNLLDARGAIS,0 843 | LEALANETTWFGPGSRIVVTTENK,0 
844 | VRGLAPKKSLWPFGGPFKSPFN,1 845 | SPKKRESLFEEITFSWKTYTVVRTPFDID,0 846 | GLFDAIGNLLGGLGL,1 847 | TLWSNILLNMPIYI,0 848 | DGLSATPSIGAWLAKNLPKGSFVGVDP,0 849 | KSLRCFWKDKTSTRTRC,0 850 | AGYLLPKINLKPLAKLPKKIL,1 851 | TFILTRGIGQSFIADDV,0 852 | AGRQTIAKYLRREIRKRGRKWVIAW,1 853 | SQRQATKDAGVIS,1 854 | GLMSVLGHAVGNVLGGLFKPKS,1 855 | ARAEKAGIPATRLGT,0 856 | KEELKIIMCALDCKTIKELK,0 857 | GIGTKFLGGLKTAVKGALKELAFTYVN,1 858 | KRIRQRIKKWSR,1 859 | ETRDYLVVATTRPETMLGDTAV,0 860 | HRRSVAHQQQASLHVKTNQLPSPNTVRQQL,1 861 | GANALKKYFTILKKFFKLAW,1 862 | GSWIKRLNSWLRK,1 863 | AAKGLIHKNKAARHKANLTAQINKLA,1 864 | NDMVEYFGEQLSGFA,0 865 | HNSSKQWSHWLWHNGIRI,1 866 | MGKKIGKWIITGAAGWAGWEIGEGIWK,1 867 | GLIRPKYSIIRWL,1 868 | ITSISLCTPGCATSLFRTCLTRSCKGC,1 869 | GAFTDLLKGVAKQAGIKILGIAQCKLAKTC,1 870 | GTGTQLRSLVMVFGHGFRKRDTLQYPEE,0 871 | EEVPVTVFCPTCNTEVTLPN,0 872 | LIRKSNHSFLVQAGNVQLRV,0 873 | GRHFQQWLGSYSEALGGSI,0 874 | MDQPPCFARPGQSFP,0 875 | PPCRGIFCRRVGSSSAIARPGKTLSTFITV,1 876 | FVMSMYALLRSSKNSPSEEEI,0 877 | YDREKALEVAR,0 878 | AEPRDLVAIERTREFKGHYHVLGGLI,0 879 | GFKQFVHSMGKFGKAFVGEIINPK,1 880 | GLWKSLLKNVGKAAGKAALNAVTDMVNQA,1 881 | AHHMYSMPPYPY,0 882 | VTEDDNLPANAI,0 883 | KPGSQVREVVGRCP,0 884 | EVKPISKTKRWIISEVLSSTVNPEKFGD,1 885 | WRQKGTGRARAGTIRSP,0 886 | IQPGDKMAGRHGNKGVVSKIVPIEDM,0 887 | RASQEKVRVSSGRTCRPWK,0 888 | FLGEWLFVTRKAKRKHKHAA,1 889 | LNTYFCEKVVAKEDSEKTCEVYCPDIPLP,0 890 | GVDASAKPEQTKAQILR,0 891 | AVNDSRWHAVTLRRNFK,0 892 | CKLLLKWLLKLLKC,1 893 | KLFLHLEKKTPVHSRSSSPIP,0 894 | FQRYFHRYARFLAKIWKG,1 895 | GRGLQRFKSEFPKTAEKPALVVAGGV,0 896 | APFRMWYMYHKLKDMEPKPMA,1 897 | LAWVDWIRGWDAQL,0 898 | MSDVFMEMVKASATVDIVQKKLV,0 899 | FLAKLGQEKLMAEASS,0 900 | AWRWKAFRNCWRVRSSSL,1 901 | GLFAVIKKVAAVIKKL,1 902 | EEYKAKIIADIPENEALSLYRQGD,0 903 | GVFRVLRKVTRVVLKVIGKVLKWI,1 904 | FLPVIAGVAAKFLPKIFCAITKKC,1 905 | TIQNFRNHEELSLDFDSR,0 906 | PRFWEAWLRLME,1 907 | ENLKGSKTMPVALPSCSYFISIPYTPARK,0 908 | ETTADSNETTTTSGPP,0 909 | FAVSVLCGIGFTMSIFISS,0 910 | PWYHVVMEDDNGLPI,0 911 | FFPTIAGLTKLFCAITKKC,1 912 | AALEAKICHQIEYYFGDF,1 913 | LEGTPVKIEFKTSENPFEGRKSQV,0 914 | RERSKGSKYLYVG,1 915 | SFKWHPRCGEAYSMWT,1 916 | SIRDEESIKVCNEK,0 917 | NQHLRGAEIERLAKGAT,0 918 | DFVDHYNGWIWG,0 919 | GAKLAKKQVRALGKFFSF,1 920 | VRRVRKWVRRVLKLV,1 921 | DFTEEVEEGGSATGFV,0 922 | GKSNKPALTLIQARILKHKT,1 923 | DSTEQNMNIDALI,0 924 | LKAAAAAAKLAAKAAKAALKAAAAAAKL,1 925 | IDHTLLKPEASEE,0 926 | SMMNENIIKEKAKVTTINRM,0 927 | GFEDSYRNYSGE,0 928 | ETVITATIIGASILLAFAALGTAIG,0 929 | AEIAKNKQLQEFINTMKPSS,0 930 | GASCGETCFTGICFTAGCSCNPWPTCTRN,1 931 | GMASKAGSVLGKITKIALGAL,1 932 | IQRNVQKLKDTVKKLGESGEIKAIG,1 933 | GWRLIKKILRVFKGL,1 934 | FNRGDVISLPNWTNFRW,0 935 | LDSLKCGINLRSI,0 936 | GLFGVLAKVASHVVPAIAEHFQA,1 937 | ALVTELENRKQRQQWYAQ,0 938 | IAADMSNLIENLDTREL,0 939 | KYFVQIRSELPESIKEPDRS,0 940 | RVQRVPATETQG,0 941 | GGSLDGKPPRWNIDEVIK,0 942 | ASEQGASMTAMDNATRNAGELINKLTI,0 943 | VNWKKILGKIKKVVK,1 944 | VNPIILGVLPKFVCLITKKC,1 945 | LITLLFTDLGVLTPSAVS,0 946 | LGGGTFDISILRLTRGVF,0 947 | PARFAYPPNLFVVDGGAPQV,0 948 | KIKWILKYWKWS,1 949 | GLLSGILGAGKNIVCGLSGLLKLESEII,1 950 | VGKKLRDLMSWV,0 951 | INWSSIFESVKNLV,1 952 | IEDIAVNSKYQGQGLGKLLIPRT,1 953 | NAMLRLFKTNVLTSQFKRSSYLIDL,0 954 | RKRIHIGPGRAFYTT,1 955 | GTRTDYEADKIK,0 956 | NQLFAQLLAQSVHHG,0 957 | ILPKKKPKKPKRR,1 958 | RRIVQRIRDFLR,1 959 | AHISGKMRKNYIRILTG,1 960 | KALAALAKKILQQCRRPLYCRRR,1 961 | FPVTWPTKWLKG,1 962 | CTRSIPPQCRCTDRT,0 963 | LGSSDRDTVVELSEWGVPCV,1 964 | IAGPVLRFKDLV,0 965 | ALRGCKTKSIPPKRCPGKK,1 966 | EISPEEVQDPVVKALVQR,1 967 | PMKKKLAARILAKIVAPVW,1 968 | VLSFDLSDPEKEKVG,0 969 | KLLQTFKQIFR,1 970 | 
LTDKGLANRDQVV,0 971 | DALVMIDHAYI,0 972 | GTAWRWHYRARS,1 973 | GQILLDDQDVTQVPAEQRHV,0 974 | INWKGIAAMAKKLA,1 975 | LKHEDVIFLEVDTEDCEQL,0 976 | KLLKLLKKLLKALG,1 977 | AHWGVLAGIKYFSMVGNW,1 978 | EAKWDDSRLLWLV,0 979 | ARLRDLAHGGQV,0 980 | IKTKNREYTIDENK,0 981 | ALHQDILSRVVPLNQS,0 982 | CVHWQTNTARTSCIGP,1 983 | WAIRWGADTVMDLSTGRNIHN,0 984 | WEIHHINKFAQAYSSYSRVIGGTVFVA,1 985 | GILGKLWKGVKSIF,1 986 | AKGLRCVCTRGFCRLL,1 987 | GLWDTIKQAGKKIFLSVLDKIRCKVAGGG,1 988 | INWKKMAATALKMI,1 989 | GVLSNVIGYLKKLGTGALNAVLKQ,1 990 | KWVISMGACANSGG,0 991 | FIHHIIGGLFSAGKAIHRLIRRRRR,1 992 | IDRRVIQGGNSIGGAI,0 993 | AQRRNLLKYIKRTNHARYVVLIEKLGIKDR,1 994 | FVPWFSKFLPRIL,1 995 | LKLKLKEGNKKLKLKL,1 996 | QQDYGTGWFDF,1 997 | STGAIALLARLL,0 998 | KWAKKWKWFAKAAWKWYKK,1 999 | RRFKLLSHSLLVTLASHL,1 1000 | APPLPRAPPVPP,1 1001 | SRSMWSGMWRRKLKKLRNALKKKLKGEK,1 1002 | DKIHRFSDKSSHQIFIEPEGL,0 1003 | KPNAYKGKLPIGLW,1 1004 | ALLKLAPRLLAGIF,1 1005 | ILPWKWKFFPWRR,1 1006 | GWFDVVKHIAKRF,1 1007 | LLKKLLRAASKALSLL,1 1008 | SHHRCCRSHRCRR,1 1009 | PDKEILNIVKENFDFRPGMI,0 1010 | KFLKLKWLKSWKNFK,1 1011 | AEKRPVEDGEKHSGRKRRRRNYDEHDE,0 1012 | ICEHLKRDGLEDYIRSNTGL,0 1013 | FPISTLLKWWKG,1 1014 | KSLLPAGVKTVAGEF,0 1015 | SAVEFTATGSRDKTIKVWDR,0 1016 | DSHAKRHHGTKRKFHEKHHSHRGT,1 1017 | RGGKIAGKIAKIAGKIAKIAGKIA,1 1018 | RIILGQCIKREAEAAIEQIFRTKYNDSH,1 1019 | VKSWIRKLVHR,1 1020 | QVRWWGRYWRRKWATCR,1 1021 | RYGRRPYGVGLLIAGYDDMGPHVFQTCPS,0 1022 | LRCMCIKWWSGKHPK,1 1023 | GSIADANDAAQFAELETLG,0 1024 | IIGPVLGMVGSALGGLLKKIG,1 1025 | GFASVLGKALKLGANLL,1 1026 | KIKCDNPGFIELAEILEGGNVVV,0 1027 | ARCENFADSYRQPPISSSQT,1 1028 | IYNPGDTVESHEGWSLAIEE,0 1029 | LFAPEIHSNPGDTGYCPT,0 1030 | IKVIFQLALAVL,0 1031 | SQAFTFGLIYSTISVISHYL,0 1032 | PAIKGHDEHDKEIE,0 1033 | NNWWYWWDTLVN,1 1034 | KTTSSIEFARLQFTY,1 1035 | DAGGMNVYIRNTSTVLARRGVAVEI,0 1036 | ENNPGVKFTGYQ,0 1037 | LGDLAQSEDLARYVL,0 1038 | NRVFGALKGAL,1 1039 | VQRWLIVWRIRK,1 1040 | LGKEVALITDGRFSGGTHGFVVGHITPE,0 1041 | KVAARCPTMGPAVLT,0 1042 | GLFLDTLKKFAKAGMEAVINPK,1 1043 | DWHLGQGVSIEWRKK,1 1044 | GLLDLLKLLLKAAG,1 1045 | VIMDVSMQGIDCYEVAVVI,0 1046 | TLDTNEVFLEKLRQRIQ,0 1047 | KEKRRVYTQTLLMVSVL,0 1048 | VVLSIVNRVRQGYSPL,0 1049 | WRKWRKRWWWRKWRKRWW,1 1050 | VVRLNMLKTHYRSP,0 1051 | GGTLNPGLAPAPVHKF,1 1052 | ALTSSWITSRQIESV,0 1053 | LNLKGIFKKVASLLT,1 1054 | TWLKKRRWKKAKPP,1 1055 | IAYAQMGDPNASIPTP,0 1056 | SNSLFLYLKWRVKKVVLMIIVVSLI,0 1057 | LSLLPFVASNSI,1 1058 | GLFDKLKSLVSDF,1 1059 | GMLNEFKAFAVKGNVVDMAVGIIIGA,0 1060 | GPIVHALSQSYILTPLCDFSLTQ,0 1061 | ASELGPFSDPRQFPSISSLTE,0 1062 | KRKILILIGSG,1 1063 | LNWGAILKHIIK,1 1064 | KSSTSSARSGVLSARFTCNSSS,0 1065 | SFSQIGQYNEEVK,0 1066 | ALWKEVLKNAGKAALNEINNLVQ,1 1067 | SWFASTGGRDSKIDVWSLVP,1 1068 | KGDDRYAIYFSQTGLPEDKVRIAYHSAN,0 1069 | MGTPEERKETMAALERA,0 1070 | GVCRCLCRRGVCRCLCRR,1 1071 | ITSGYASFDYEDAEYQASDLVKLDILLNG,0 1072 | GGAIVGGVGYGATCWW,1 1073 | DCDIRVLGFEAGKGIIQCSH,0 1074 | KNHHSREMSSFII,0 1075 | KADPDAVYETEI,0 1076 | QTLLQELPIPP,1 1077 | MDRFSLKWKKKRKIRRRRRRRRRRR,1 1078 | TLPEWDDRRVVNS,1 1079 | RRLQALDRAAYNV,0 1080 | RQIKIRRMKWR,1 1081 | IIKAVIQARGVIKQVKPDV,0 1082 | KYYSRVRGGRSAVLSSLDK,1 1083 | FLPALAGIAGLLGKIF,1 1084 | FGSLFSLGSKLLPTVFKLFSRKKQ,1 1085 | GLWRFWFGDFLT,1 1086 | AHFGFADDARVLL,0 1087 | IFGAIWKGISSLL,1 1088 | FAIAIKAIKKAIKKIKKAIKKAI,1 1089 | AVMAEALPEAGRYFCHSCTAEITPR,0 1090 | GELVVDGKALVFNSATDALE,0 1091 | FIGALLRPALKLLAGK,1 1092 | FFFAFFFLNKSDVSSTSALRNRDSMA,0 1093 | RARFEELNMDLFRR,1 1094 | KLLPGLKNNYATIA,0 1095 | LRRWLRWLLRRMR,1 1096 | GLLRRLRDFLKKIGEKFKKIGY,1 1097 | KAFAKLAARKA,1 1098 | GLLDILKGAAKDLIATGLNALRCKLTKC,1 1099 | ILGKTYLPRKFKT,0 1100 | 
ATNPIEVPKALLSNEVDRLRVQ,0 1101 | LLFVFCVVNLINY,0 1102 | CNITQSPIYLF,0 1103 | YDLSDSNCLPANRDKRYYVI,1 1104 | YKRWKKWRSKAKKIL,1 1105 | TIAPGDGLMVVPLF,0 1106 | RVISLPAGFSPFR,1 1107 | GFGSLLGKALKIGTNLL,1 1108 | KLIPIASKTCPAGKNLCYKI,1 1109 | FWGALAKGALKLIGPGSLFSSFSKKD,1 1110 | LGLFPYRVQIMGGIVLHNGDV,0 1111 | GIRCPKSWKCKAFKQRVLKRLLAMLRQHAF,1 1112 | TPGIMTPMNILEEFTNFA,0 1113 | VNGWCRETIFNLKLPMKK,0 1114 | KKSFFKKLTSVASSVLS,1 1115 | KALVMIGEPMSRLDVERFAPV,0 1116 | IPEDCGEDEFEETLQEA,0 1117 | WWKWWKRLRRLFLLV,1 1118 | FMGGLIKAATKIVPAAYCAITKKC,1 1119 | KFGVIGFTRSLADPLYYTQN,0 1120 | LKRFLKWFKRF,1 1121 | AVAVVGQATQIAKKKLFKKILKYLKDEL,1 1122 | LVAGRDFTADGTI,0 1123 | RGKAKCCKGTKCCKKP,1 1124 | GKSETALELVKRGHRLVADDSVEIRQTGD,0 1125 | ILAGCAGVGAVGSVVFPH,1 1126 | RFDDASHRRCEL,0 1127 | VTQYKAGKASLF,0 1128 | CMSGNSGASESISIEE,0 1129 | GEGIPIVKPGSRKVKMIP,0 1130 | LKIPGFVKDTLKKVAKGIFSAVAGAMTPS,1 1131 | FKVTWKTKWWKG,1 1132 | DQLLERFEPGELNEEFDIDYSKEFGA,0 1133 | IEPAKNVTVIGHKAELVREVLDGQSAFTMQ,0 1134 | EGIVRAEIDLQESAEVRE,0 1135 | TIDFGVRNINQSNLVYDTER,1 1136 | RSLRSCNKKTTNS,0 1137 | RKKRLKLLKRLV,1 1138 | INIKDILAKLVKVLGHV,1 1139 | DGQVYGAKFVASHPIQ,0 1140 | YLKNFPSIINWDYIESRFVEM,0 1141 | GLLDVVGNVLHSLGL,1 1142 | HFKIRKRFVKKLV,1 1143 | RRRFVVQQDTISPRLEVDERFLPNSVQEQI,1 1144 | ADSGEGDFLAEGGGVRKLIK,1 1145 | PASIGNVSVGFDVLG,0 1146 | GARRKLLKSFQ,0 1147 | FLPIVTGLLSSLL,1 1148 | ANFCALILAYSNRTVGELGDVR,0 1149 | VAVKQGQYLGVSFHPELTDD,0 1150 | KLLDIVKKVVGALG,1 1151 | TEQADGQGASNSASS,0 1152 | RQWKTGPAGMSGISLGIPDG,0 1153 | GPGYCPVKVPESHDL,0 1154 | QRQCVEYALKARPL,0 1155 | VRLRVKTRVRLK,1 1156 | PEHKKKIPRFPDRIAVVT,0 1157 | GVSTRELSMMVYEA,0 1158 | IEAMPTSPIFVGN,0 1159 | CLRIGMRGRELMGGIGKTM,1 1160 | EIEGVSKMAFRH,0 1161 | FWRRYKKVKKYRRWF,1 1162 | KLAQKVKRVLR,1 1163 | DIGAMLVSMMRL,0 1164 | KRRLALFRLFRLALKLVLKK,1 1165 | VDMDLSVRALNCLKAADVETLGD,0 1166 | AMSEAAPVAVKEGVSKEEAEALKKELVEA,0 1167 | AVVFILFGLIGLICF,0 1168 | YIPKDTCVFINQWQINHD,0 1169 | GLGPNPCRKKCYKRDFLGRCRLNFTCMFG,1 1170 | AIMDTIKDTAKTVAVGLLNKLKCKITGC,1 1171 | ILPIIGKILSTIFGK,1 1172 | LKKLLKLLKKLAKLAG,1 1173 | RWKIFKKVVKKA,1 1174 | ADLEVVAATYAAAA,1 1175 | RIKRLTRESFRLRQHELPAMDFVVVA,0 1176 | GHTSMIRRDPLGVV,0 1177 | ASSATTSPFFKPPA,0 1178 | EGLKAYRWADGSIVSFRP,0 1179 | AEEMSAVFHPA,0 1180 | FLNKGLGIWYERKGRRRRTPKKTK,1 1181 | TQQAFQKFLAAVTSALGKQYH,1 1182 | PKSSWSDHEASSGV,1 1183 | FLSTALKVAANVVPTLFCKITKKC,1 1184 | FLPIVGRLISGLL,1 1185 | SWEDDIAGQRTFP,0 1186 | KLKHVGLHVGKLK,1 1187 | FSATKVRWILDHVEG,0 1188 | ASFVSNTNWQAYSGEA,0 1189 | VVTLTDTTNQKTELQAIYLA,1 1190 | KTEAWGLRSLTYRMNKNRKAH,0 1191 | GKREILVPTGDD,0 1192 | EHTIAVIPGSFDPITYGHLDIIERS,0 1193 | AGFGVREDMPIGLTVTLRGER,0 1194 | GLRLQTGTPQESLPTYTQTL,0 1195 | GKLLVKKIVSLVRSA,1 1196 | GVLLYATCSVLPE,0 1197 | YQLLIRAIYKNI,1 1198 | ISMELWQEAFKAVED,0 1199 | CSGSMSGNPIDSARRALEIIIRS,0 1200 | AQRKKKFVLDVDPKYAKQ,0 1201 | RRRQRRKKRGGGDTRLNTVWMW,1 1202 | AGIGKIGDFIKKAIAKYKN,1 1203 | RRWPLKPWKKPLI,1 1204 | RLLGNCLTVVMAAKLGTAFSPEIQCAWQK,1 1205 | ILYALRTGGKLYCSDADLSPI,0 1206 | QETFSDLWKLLP,1 1207 | EEMVRYARTDAHYLLYIADSLT,0 1208 | NADAATPAGSDFALQESQGYMIPKELETR,0 1209 | ATKAVCVLKGDGPVQGIINF,1 1210 | EEPHKAASAEGKK,1 1211 | DVHKDLWKTEQGTVIGLLNANPMKP,0 1212 | HGEPYPAVTSEQAKKADAILLGAVGGPK,0 1213 | AAGGWADRLFP,0 1214 | GFCWNVAVYRNGVRVAHRRCN,1 1215 | AVAALKFEAGN,1 1216 | RCVRWWKRVCK,1 1217 | LRLIAQRYQFSYGEVIP,0 1218 | FLPIVKKLLRKLF,1 1219 | FFSMIPKIAGGIASLVKNLG,1 1220 | FLIGKAIKRKFCLRSVWNA,1 1221 | LTAARVAALCDRRKGLGADG,0 1222 | ILPWKWPWWPWWKKPWRR,1 1223 | WETLPRRIRGGRLWILAI,1 1224 | VQWRIRVAVIRK,1 1225 | FDIGFSELLLVFVIGLIVLGPQRLPVA,0 1226 | 
ITSISLCTPGCKTGALMGCTIVCGTDFDGR,1 1227 | DEAMFFDEDYVTALEHGLPPT,0 1228 | LAAARETVLGLRSGKGMV,0 1229 | FLGGLLFGIFKHLGKK,1 1230 | LKRLYKRLAKLIKRLYRYLKKPVR,1 1231 | GMSMFGGLGCLP,0 1232 | KNKEYTIDDEEG,0 1233 | FHRKKGRGKHK,1 1234 | ASEELLLRFQATSSGPILREEFE,1 1235 | IFGAIAGLLKNIF,1 1236 | FLLKLGLGKKKLL,1 1237 | INLKAIAALAKKLLG,1 1238 | AQAAHQAAHAAHQF,1 1239 | PAPHALLEPVLGL,0 1240 | AYISTLDKEKLRKLTT,0 1241 | MESSRALAECSPL,0 1242 | YFLKMWSSVSETLIFIFLGV,0 1243 | QMRRKVELFTYMRFD,1 1244 | SGAVDVIVVDSVAALTPKAE,0 1245 | RRHCIKKCMKSRKHNERMIRIRRK,1 1246 | ESELDAEQKRGAE,0 1247 | VLNGKITRDSSV,0 1248 | DLNPDFFAEVVI,0 1249 | VLLVTLTRLHQRGVIYRKWRHFSGRKYR,1 1250 | YHAPADPRHLLEGPLPAS,0 1251 | KLAMSMSFNFVFYFMVNFDD,1 1252 | GIMIPTHLKLL,0 1253 | YFADLVKVGQAEAAARAGML,0 1254 | SVRQLDTTVTAGRPL,0 1255 | WVKAAAKAAAKVW,1 1256 | RRIPRPILLPWRPPRPIPRPQPQPIPRWL,1 1257 | FFPVIGRILNGIL,1 1258 | VSVLTVSGYIFSLLIGGSVVIEQIFALPG,0 1259 | KQVKETLARLTTKGK,0 1260 | DVAPGGSLASVY,0 1261 | RWMVWRHWFHRLRLPYNPGKNKQNQQWP,1 1262 | GQNFIFDSSLCDKIIRASNI,0 1263 | KSKKYFIILLALA,0 1264 | AGFPVVDIKVELIDGAYH,0 1265 | TFTPDTFLWQI,0 1266 | IDKLVGEVDKALRALSPDTKP,0 1267 | RLALRLALRALRAALRLA,1 1268 | KKLFKKILKYLKFLHSAKKDEL,1 1269 | SKQYCTEQNATLVKTASQ,0 1270 | STGAAKAVGKVLPELQGK,1 1271 | IVRVAVALRRIR,1 1272 | AKAWGIPPHVIPQIVPVRIRPLCGNV,1 1273 | FAKLLFKALKKAL,1 1274 | GWAKLITKAIKKI,1 1275 | PMARNKKLLKKLRLKIAFK,1 1276 | FCPRRYKQIGTGLPGTKCK,1 1277 | DGFFNRINLLVASQAIKDRVA,0 1278 | GLWQFIKDKFKDAATGLVTGIQS,1 1279 | FLSKDLNFEYRQCIV,0 1280 | FDEVMTGFRVAYNCGQGYYGVT,0 1281 | KRRLILRILRLAIRILVKKR,1 1282 | AQVKCNGLQINML,0 1283 | SELQEVFYLPMDAP,0 1284 | KCRRRKVHGPMIRIRKKNL,1 1285 | ILPWDWPWWPWDD,1 1286 | HQKIVKGDEYALRTLENQIFHIDNL,1 1287 | ETRAVILSHLQRGGSPSI,0 1288 | QRHKRRLEQLSWLTPIRMP,0 1289 | NTEYDFKWIIIIVL,0 1290 | TRDVNGVKHFIDHEINSIQNFMSEDMKSM,0 1291 | VKPVVKPVVKPVVKPVVKPVVKPVVKPV,1 1292 | NVGLHFICRDVWLR,0 1293 | EVAIKQLQEAD,0 1294 | KQWRIRVCVIRA,1 1295 | ETLDLVLETFRKSLRGQKMLPLLSQ,0 1296 | YVTKGQSVQQVYYSIGALAKSVY,0 1297 | RLGRLVSLHTLG,1 1298 | RAIRRAIRGAPRAILRAIL,1 1299 | ETFAKKALKALEKLLKKG,1 1300 | WILAIPRRIRGGRLWETL,1 1301 | WAVVPVNEGWEIASINVGL,0 1302 | RSGGIFEKKVLD,0 1303 | FLPAVLKVAAHILPTAICAISRRC,1 1304 | WKLKWKLKWKLK,1 1305 | SPPNQPSIMTFDYAKTNK,1 1306 | FLGALFKAVSKLL,1 1307 | FLPLLASLVGGLL,1 1308 | FKAWRWAWRMKKLAAPS,1 1309 | IKIPSFFRNILKKVGKEAVSLIAGALKQS,1 1310 | KIGGIGTVPVGRVETGVIKPG,1 1311 | WDVDGNKYIDY,0 1312 | VAKLLAKALKKLL,1 1313 | LAAKVEAFLAL,0 1314 | TSRCYVYRLKVVCS,1 1315 | GWLLLEYIPVIAAL,1 1316 | AKDEMEECASHLPYEA,1 1317 | QKSTEEAAMKNVSVDNDL,0 1318 | GPASRSVEMLKEMIKSGMNVARLNF,0 1319 | TWAGVEAIIRILQQL,1 1320 | ENDSKDKISYRTIIFDT,0 1321 | ELEAAINALEDG,0 1322 | FARPVAKKPTFLRLRGLFEYEIQ,0 1323 | EKLAGNDEVISAIALYS,0 1324 | LLGDLLGQTSKLVNDLTDTVGSIV,1 1325 | WHWTWLPKKKRKV,1 1326 | RVCSAIPWPICH,1 1327 | IAKAILPLALKALKNLIK,1 1328 | GKLKLRPIEDEAL,0 1329 | ILENTDLSTDKAE,0 1330 | GLKKKFKELAGNATKKS,1 1331 | ASPATPTVAQFVIQGSTICLVC,1 1332 | TVIEVASKMCSKMRLLKGLCKSITKRFLRR,1 1333 | IILGILLFIDICV,0 1334 | FLPFLAKILTGVL,1 1335 | GLGSLLGKAFKIGLKTVGKMMGGAPREQ,1 1336 | EIKKKFFYMKMVSDINALKY,0 1337 | VRTMIEVSPNGYMQPIEPMLYESTG,0 1338 | DVSEIKQKILGSGDVQ,1 1339 | KKKKFVKKVAKKVKKVAKKVAKVAVAV,1 1340 | APVPFSCTRGCLTHLV,1 1341 | AAARMGAQTLLLTH,0 1342 | ETLTNTFVAFQDGPQPPENVV,0 1343 | AWWRRTVAKVRK,1 1344 | TDRDFIGRKALE,0 1345 | FCNRVKKARKLGKRLRFVGI,0 1346 | AVRRHINRLFFRLILDE,0 1347 | LLRWNLQLGNAVVFRSAKAEHIASNFDVF,0 1348 | KLKTWPKNYWRKVWSKKNWRKFVKKFKHW,1 1349 | FLSLIPHAINAVSAIAKHN,1 1350 | SSAIPVASMEFA,0 1351 | LSCPQYVYGSVSVTCVWGSV,1 1352 | LKQFVHFIHRF,1 1353 | FILDALSRYKA,0 1354 | 
FTFAQIHGFTNARDILELVTRPLRRNHS,0 1355 | SMLNPKYTFDTFVIGSGNRFAHAAS,0 1356 | SKCAQWQRRMKKVRGPSVTCVKKTSRFECI,1 1357 | PHGTQCLAMGWGRVGAHPPP,1 1358 | GVLDILTGAGKDLLAHALSKLSEKV,1 1359 | TTSIRRRYQVSLIRRHRGKR,1 1360 | QDNYWVKQGLNKLSK,1 1361 | KRGVTARASHKK,0 1362 | KPPKPDPLKITK,0 1363 | IRAGMPLSDGIIAPSD,0 1364 | DRAASAQAQLGLREGAAIAHGM,0 1365 | KKLALALAKKWLALAKKLALALAKK,1 1366 | KAARFLTCQMLKGLKFLHGA,0 1367 | MSQVSLASIHFWIAL,0 1368 | QHQQQSPRSTKQQQPLVSILDFVGF,1 1369 | AARRILRWIFR,1 1370 | VLDGWLFCRQCQKVLKFLHKNTSNLS,0 1371 | TDREIETLFTVINKLRKE,0 1372 | VRPLQDRPVRVD,0 1373 | FIKHFIHRFSGGRWRRLLKKLHHLLH,1 1374 | WQCLTLTHRGFVLLTITVLR,1 1375 | AAAAAQAAQAMP,0 1376 | RRSFRRSLRWL,1 1377 | LYKRYILRKFKFLKKKLKKK,1 1378 | MRGIVKVNAVKAPGFGDR,0 1379 | LFEKKYVKPLYKHFAWLTEHLGNQTIPGIP,0 1380 | HKLEPVFDSPRMSRR,0 1381 | DKGRRRSKFVLHRRQCAN,1 1382 | VDEFTGRIMPDRRWNEGLHQAVEAKE,0 1383 | FTYDVDFQESEVKWAS,0 1384 | KLLKQWPIGKLLKKLLKKLLK,1 1385 | TVVLATVVAAKNPKPGQDFFPLTV,0 1386 | SVLDGVDAIAVACYSPTPLVD,0 1387 | RKARAREPITSKLVANMLEVAG,0 1388 | VNVGGFVSQYFWIEDHEMEV,0 1389 | RKKRLRVVRRLV,1 1390 | KLALKLALKALKAALKLA,1 1391 | TVRIQETRPLSKTKRWRVVGRVN,1 1392 | GWKSVFRKAKKVGKTVGGLALDHYLG,1 1393 | LNRSSRVLLAGHLDT,0 1394 | CTSDLPSSWGYMNCNCTNSSSS,1 1395 | HSLENIFKSENKGYKSNES,0 1396 | VISYVLVPEEKLTSHR,0 1397 | GWFDIIKKIASEL,1 1398 | KNPLVRSSYESKIYSF,0 1399 | IEELVKQRNNIRKKHQWVQADKVREKLA,0 1400 | VRLIVAVRIWRR,1 1401 | GICRCLCRRGVCRCICVL,1 1402 | ASLTALKRSARAEVD,1 1403 | GDDAIYLLHGLSHFV,0 1404 | HTSKALLDMLKRLGK,1 1405 | AILDPGWVIPSKVEQLI,0 1406 | SVTANGKGPEADRDA,0 1407 | ATAWRMPPNGIPPIVAVRIRPLCGTV,1 1408 | YEELRAATESI,0 1409 | FFSKKYNGKLVLRLEDTDAKRVLPEAY,0 1410 | QRAQAHPAEAATA,0 1411 | ALWKDILKNAGKAALEINQLVNQGEL,1 1412 | IIGLVSKGTCVLVKTVCKKVLKQG,1 1413 | VGVKVDVKELVNT,0 1414 | ILPEKEPEEPERR,1 1415 | CSIELSDIPLSVDFNTMID,1 1416 | LKEQFKNKTIVFNQS,1 1417 | SFGGVNNSFAIQAGREIRVM,0 1418 | AAVLLPVLLAAP,1 1419 | KKWKIVVIKWKK,1 1420 | CGYRHGKANCGKG,1 1421 | PLLMARLAALL,0 1422 | DPPDPDRFYGMM,1 1423 | FLGLLGSVLGSVLPSIFK,1 1424 | HTRLYTCSAEFDFIQYATIEKLSS,0 1425 | LLESCRDSWLDGIKY,0 1426 | VIVTGCLGAKEDQIREVHPKVLEI,0 1427 | FSEAIKKIIDFLGEGLFDIIKKIAESF,1 1428 | CDEVSLAGFGYDLSQPRTPLHYF,0 1429 | GVFYPWRFRLLCLLRRWLPRPRAWFIR,1 1430 | RRRRRYRYWRRGLTIQGRPKSLPLNTGD,1 1431 | ESKSVVDSGLMSGKIERTTLKLSDK,0 1432 | VFIGQSTRIYDRETGE,0 1433 | VDIANQLSLIPEPSEVEQALASIDPDDLTP,0 1434 | SAYRLKRFTGFL,0 1435 | GEGRRIVVVASRFNEGVTVPLAEGAVS,0 1436 | KRLFKELKKSLRKY,1 1437 | EVGRILDAWGVKGWVKILPHS,0 1438 | AAHCIALRKGYK,1 1439 | ATGQETAYFLLKLAGKA,1 1440 | DQVMIVNLSGR,0 1441 | FCYWGIGIPFGYVL,0 1442 | ILGWKWGWWGWRR,1 1443 | MTKDIVETVGGLVKWILDTVKKFA,1 1444 | EKVDLALGREDLR,0 1445 | GMWSKLLGHLLR,1 1446 | VWGIKQLQARILAVERYLKDQQLLGIWG,1 1447 | ADLAIRVVDAKEGQSLNRHYRGKDYATN,0 1448 | FLGLIFHGLVHAGKLIHGLIHRNRG,1 1449 | FLGMIPKLIKKLIKAFK,1 1450 | TGRKIIVDTYGGWAPHGGGAFSGKDP,0 1451 | HRILMAIRQMMT,1 1452 | GQLFTFSPRHHWTTQDCN,1 1453 | ILGTILKLLSKL,1 1454 | RQELWPKGETSGHVQWVKSIRY,0 1455 | VLYIPGSKERALEKA,0 1456 | LQKYYWRVRGGRWAVLS,1 1457 | GQGALLLHDPGAARNPFYLLAPEWALY,0 1458 | DPTVINQAAGQVSTSKNALNG,0 1459 | PTQYSMDHFSGMHNWYACSHARPT,0 1460 | KTKKTKKTKKTKKTKKTKKTK,1 1461 | VELKKVVDFERSLLSFLRDQHQDLLDEI,0 1462 | ELIKQRIAQFNVVSEAHNEGTI,0 1463 | ITQPSSDSSARME,0 1464 | INMKASAAVAKKLL,1 1465 | STLHLVLALRGG,1 1466 | FLPVIAGLLSKLF,1 1467 | YPGPLDLRRPMRTVPGP,0 1468 | EGHKSALRFNPLFIHASVGLGKTHL,0 1469 | TIWKGVPKFLRRVDTALKNI,1 1470 | KGLSGPESRWWVV,1 1471 | RTPQSRSFTNENNDNLRSVS,0 1472 | FKAFNDAIQKAKPATAKGVYITNLSITTTQ,0 1473 | MQTSSTPEFDGWR,0 1474 | FLKGIVGKLGKLF,1 1475 | GWFDVVKHIASAV,1 1476 | ALKAALLAILKIVRVIKK,1 1477 | 
RLIDISNILQSRTCA,1 1478 | VRKRLASEMSSNPDYNNS,0 1479 | CGYRHGRLNCGRG,1 1480 | YAKLFHSYIDLQAAYIYSLPREVMEV,0 1481 | GLFDIVKKLVSDF,1 1482 | GRWRWWWRWRI,1 1483 | LKLMGIVKKVLGAL,1 1484 | RIKDFLRNGRKCCRRKK,1 1485 | DIFSRVAWQLRTRSGVNIYAWMPVLSWDL,0 1486 | RRGWARRLAFAFAFGRR,1 1487 | AAAEIYEEFLAAFEGSDGNKVKTFVRGG,0 1488 | KVCYRAIPCGESCVWIPCISAAIGCSCKN,1 1489 | GFLDKLKKGASDFANALVNSIKGT,1 1490 | YCPSIEDKIVK,0 1491 | HLPEQAFYMVGGIDEAIE,0 1492 | GSTSFHLIYNKWFAVKRRRKR,1 1493 | GFLDVLKGVGKAALGAVTHLINQGEQ,1 1494 | YPVKLKVYPLKVKL,1 1495 | FAKLLAKALKLKL,1 1496 | GSQVVATEPIHIMEDITKPRRDSNITAW,0 1497 | FLPILGKLLSGIL,1 1498 | LNDNNIENYSKNVLTQP,0 1499 | EWGRRMMGWGRGRRMMRRWW,1 1500 | SILPTIVSFLSKFL,1 1501 | ILGKILKGIKKLF,1 1502 | YDGSRPMSTVF,0 1503 | IERQIRSGDSVQSMK,0 1504 | WKKIKKIIKKIKKI,1 1505 | LKLKDILGKIKVILSHLNK,1 1506 | INWKKIAEVGGKILSSL,1 1507 | LIRELNSKRPLLPRNWQPSA,0 1508 | CGYKYGCMVKVDR,1 1509 | PRLMSVGLIAQQLHW,0 1510 | SEPWFDPAGFLLAERDGQLLGSVWT,0 1511 | LTNQNLPPIHTL,0 1512 | TSFAWAFVLALLAG,0 1513 | GFRDVLKGAAKAFVKTVAGHIANI,1 1514 | TNIAYLQQNGVKIWDDWADENGEL,0 1515 | SGSLKSTLTGNKVFNCISYSPLCK,0 1516 | LRWLGTKVGILK,1 1517 | VIVGGGGVGKS,0 1518 | HQRIEIKDTKEALDKIEEE,0 1519 | LKKWWKTSKGLLGGLLGKVTSVIK,1 1520 | DRISFVLKVYLRSLQCKMLG,0 1521 | CWTKSIPPKPCKFFKRFFKSFRRAFKKF,1 1522 | TLPCLWPWWPWSI,1 1523 | GSDIKIPITSNCIKGFIFDTFSAGAETSST,0 1524 | KAAKKAWKAWKKAAKAAWKKAA,1 1525 | MNGFILGKFFDVAKKILDTIFQK,1 1526 | DLPGHSGDIWHGYLPDA,0 1527 | WRFWRFWRFWRFWRF,1 1528 | RRLRLLLRLRR,1 1529 | RKLPDAPGMHTWGGGGRCAVLSCLPKEQI,1 1530 | FLLAHINRLIADHQ,0 1531 | TTFAAVAGVVYV,0 1532 | RHRHRHRHRHRH,1 1533 | SIGISYWLNAVFLWVY,0 1534 | AHCNISRAKWNDTLR,1 1535 | EERKKLGEEIKKEAEEAKKQIEETKKNDEE,1 1536 | KLWKLWKKWLK,1 1537 | CRVYNNGLPTGLYRWC,1 1538 | LKLKKLCKCLLKKKLL,1 1539 | LIALLPLWIQWTLNVPWA,0 1540 | IRQLQTWASGHQF,0 1541 | DDEDIDPCEPSSGGLANPTR,0 1542 | VADKRPHILHEKKSIPY,1 1543 | AKLLENEAAFSH,0 1544 | NIHFNLIEPRKDIDVLMVAP,0 1545 | DEPRGLLMLAELSSKGSFAH,0 1546 | RIRFPWPWRWPWWRRVRG,1 1547 | CGRCLQRACCKYCRLKCRLILFVIF,1 1548 | YCSFNVTPKFKRWQLYFRGRMWCP,1 1549 | PQRGEGGRAGNLLREEQEI,1 1550 | VILTRFRFLNRIVEPLLKKA,1 1551 | IGLTNAIFVSLNITVILSSL,0 1552 | FSESSEFSESSED,0 1553 | GNQGRGNPVRSPLGFGSYTM,1 1554 | SAGPAVAAAVVEEQTEFN,0 1555 | RLRLRLWRLRLRLCTKSIPPIC,1 1556 | TFTNDPEEVVASILPPKQEAFEEDAEPS,0 1557 | TVKALVKWILKTVAKFW,1 1558 | CGSVFLVGQLFTFSPRHH,1 1559 | TMVALGQHEEEYFSGPGPKAVLTKFR,0 1560 | PPDVHTPPHALWRLHLSLRVCLVRMWIH,1 1561 | EPIILHGAGRTDAGVHATNMVAHFDTNA,0 1562 | QILSDLGIRKMRLLSSSSHLYHSLSG,0 1563 | LALLPFVNRES,1 1564 | GKEFKRIVGRIYRLCCR,1 1565 | AASYACLHAACA,1 1566 | KYFSNVEETDSKTEQSTDIVKGDVKTCK,0 1567 | EKTLARTAAKTALKK,1 1568 | TECLNPKDYDKPIYEVICEK,0 1569 | NFEDHLEVYIQ,0 1570 | INLKILARLAKKIL,1 1571 | FVKLKKILNIILSIFKK,1 1572 | GGAILGGVAYAATCWW,1 1573 | KGLSGPAVPWWVV,1 1574 | KTHDFLNSICNFVES,0 1575 | FRIRVRVKWKLFKKI,1 1576 | VVNINSPDTLPIIYERGL,0 1577 | KTKLFKKFAKKLAKKLKKLAKKL,1 1578 | IPCGESCVWIPCISGMFGCSCKDKVCYS,1 1579 | GPKSPDHSEEPMSYD,0 1580 | GALSLLSLLGKLL,1 1581 | LPARIMSAIPMLGVVD,0 1582 | AILRKATRRLVQLIVSG,1 1583 | RFLVCWKQKIWGKARPSMCTRRARF,1 1584 | EFIVMVGPSGCGKS,0 1585 | ADRVLYRQAVRTAL,0 1586 | MTGRLISWWWSL,1 1587 | EKAAAKSAAAKTLARR,1 1588 | RRRRRRRRRGGGKWKKLLKKPLKLLKK,1 1589 | FFGHLFKLATKIIPSLFQRKKE,1 1590 | AQANPVEGVALLVE,0 1591 | GTPLSFDRLLATVFGI,0 1592 | KKKFIYIVLALIKGAIIKKG,1 1593 | VLKQDQIWKDTT,0 1594 | LENAGMLRALDLS,0 1595 | NKNNSSQAATMVFTKCEKEPLDLIASL,0 1596 | GYLPMVLAMAAIG,0 1597 | KKKPKPPYLPKPKPPPFFPPKLPPKI,1 1598 | ESLAKALSKEALKALK,1 1599 | KKYRYHLKPFCKKADPC,1 1600 | IVNHLVKLFDKGLNSIVNLR,1 1601 | NKNYKLLVSMAPLSNEIRPISSCTPQHIGP,0 1602 | 
KRRILIRILKLIIKLILKKR,1 1603 | RCVCTRGFCRCICLLGIC,1 1604 | GWWRRTAAKVRK,1 1605 | HVLTPRIREARPVNTGVKV,0 1606 | IFSAIAGLLSNLL,1 1607 | FLSLLPSIVSGAVSLAKKLG,1 1608 | DSSLFDINNDFPQLTS,0 1609 | FAQSFIGRVFLFLMIVLPLW,0 1610 | FLGGLMKIIPAAFCAVTKKC,1 1611 | LDAAKACNVDEMCQRLRT,0 1612 | RKKRRQRRRLNLKALLAVAKKIL,1 1613 | FLPKMSTKLRVPYRRGTKDYH,1 1614 | FKVKAKVKAKVKAKVKAKKKK,1 1615 | VLVGAKDALNSGLAVLPEGVDFSKVMDNLS,0 1616 | CDVIALLCHLNTPSFNTTHYRESWY,1 1617 | KEKRILGPVISKIGGVLGGLLKNLG,1 1618 | RQGMTWEQYARAVSAGEAD,0 1619 | GLFDIVKKVLKLLK,1 1620 | LLVTRQDDSRTAR,0 1621 | LDLQPLPISAVTNKEFQTVFAESGFKVF,0 1622 | IYYVDEKAPEFSMQGLTA,0 1623 | TAPAEIGGWRLPAGTVVNTS,0 1624 | KQRNRWEWRPDFKSKKVKISLPC,1 1625 | VGIGTPISPTGGGAGHVPGTP,1 1626 | QIFAKLFIFVDDNGE,0 1627 | RYGYSQVQVGLCYLPFGVGSILSRWT,0 1628 | EGKIFPLISQA,0 1629 | WEFEGPEWMTAQELK,0 1630 | VLDRILGPEPEPEPEPEPEPEPELGSGLEP,0 1631 | MASTGLELLGMTLAVLGW,1 1632 | TARYIRPNLLQDRLN,0 1633 | KAAQALEKIVAEMERRYDL,0 1634 | SADLVKKIWDNPAL,1 1635 | GTCEKCDTCGA,0 1636 | FPRRWQWRRPF,1 1637 | LKKLKQLLGKLSEFAAAFVA,1 1638 | NGSAPDIPMAGFPYHASEGYIAR,0 1639 | KRLLKYLKHNDLEAYRNLVKTLNLRG,1 1640 | DLICPILKGGKTGLF,0 1641 | TLKILDVSANPLHCACGAAFVD,0 1642 | LQVPQDDWGAYPT,0 1643 | RIAGRIARIAGRIARIAGRIA,1 1644 | HPTPTQVNVKATAGK,0 1645 | MKSLGSVCSGGRYD,0 1646 | DDRGAAMFAVLFNAFSSLLE,1 1647 | YWKKWKKLRRIFMLV,1 1648 | LKNELWQYFAAVLDTKAT,0 1649 | TFLILVNYYKYKNSKEFKTSIASLICI,0 1650 | PKLLKTFLSKWKKIG,1 1651 | QDARESGKLVDATHGRRTRAVI,0 1652 | LLQWFTTRGLFFTAVSFS,0 1653 | DPSLVFSRKFYIAQW,0 1654 | RWFRIQLQIRRWRNRR,1 1655 | KWKKKKKKPKFL,1 1656 | -------------------------------------------------------------------------------- /data_feature.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import torch 4 | import pandas as pd 5 | from Bio import SeqIO 6 | import torch.utils.data 7 | from sklearn.model_selection import KFold, ShuffleSplit 8 | import re 9 | import vocab 10 | from sklearn.model_selection import train_test_split 11 | # from transformers import T5Tokenizer,XLNetTokenizer 12 | def AAI_embedding(seq,max_len=200): 13 | f=open('data/AAindex.txt') 14 | text=f.read() 15 | f.close() 16 | text=text.split('\n') 17 | while '' in text: 18 | text.remove('') 19 | cha=text[0].split('\t') 20 | while '' in cha: 21 | cha.remove('') 22 | cha=cha[1:] 23 | index=[] 24 | for i in range(1,len(text)): 25 | temp=text[i].split('\t') 26 | while '' in temp: 27 | temp.remove('') 28 | temp=temp[1:] 29 | for j in range(len(temp)): 30 | temp[j]=float(temp[j]) 31 | index.append(temp) 32 | index=np.array(index) 33 | AAI_dict={} 34 | for j in range(len(cha)): 35 | AAI_dict[cha[j]]=index[:,j] 36 | AAI_dict['X']=np.zeros(531) 37 | all_embeddings=[] 38 | for each_seq in seq: 39 | temp_embeddings=[] 40 | for each_char in each_seq: 41 | temp_embeddings.append(AAI_dict[each_char]) 42 | if max_len>len(each_seq): 43 | zero_padding=np.zeros((max_len-len(each_seq),531)) 44 | data_pad=np.vstack((temp_embeddings,zero_padding)) 45 | elif max_len==len(each_seq): 46 | data_pad=temp_embeddings 47 | else: 48 | data_pad=temp_embeddings[:max_len] 49 | all_embeddings.append(data_pad) 50 | all_embeddings=np.array(all_embeddings) 51 | return torch.from_numpy(all_embeddings).float() 52 | 53 | 54 | def PAAC_embedding(seq,max_len=200): 55 | f=open('data/PAAC.txt') 56 | text=f.read() 57 | f.close() 58 | text=text.split('\n') 59 | while '' in text: 60 | text.remove('') 61 | cha=text[0].split('\t') 62 | while '' in cha: 63 | cha.remove('') 64 | cha=cha[1:] 65 | index=[] 66 | for i in 
range(1,len(text)): 67 | temp=text[i].split('\t') 68 | while '' in temp: 69 | temp.remove('') 70 | temp=temp[1:] 71 | for j in range(len(temp)): 72 | temp[j]=float(temp[j]) 73 | index.append(temp) 74 | index=np.array(index) 75 | AAI_dict={} 76 | for j in range(len(cha)): 77 | AAI_dict[cha[j]]=index[:,j] 78 | AAI_dict['X']=np.zeros(3) 79 | all_embeddings=[] 80 | for each_seq in seq: 81 | temp_embeddings=[] 82 | for each_char in each_seq: 83 | temp_embeddings.append(AAI_dict[each_char]) 84 | if max_len>len(each_seq): 85 | zero_padding=np.zeros((max_len-len(each_seq),3)) 86 | data_pad=np.vstack((temp_embeddings,zero_padding)) 87 | elif max_len==len(each_seq): 88 | data_pad=temp_embeddings 89 | else: 90 | data_pad=temp_embeddings[:max_len] 91 | all_embeddings.append(data_pad) 92 | all_embeddings=np.array(all_embeddings) 93 | return torch.from_numpy(all_embeddings).float() 94 | 95 | def PC6_embedding(seq,max_len=200): 96 | f=open('data/6-pc') 97 | text=f.read() 98 | f.close() 99 | text=text.split('\n') 100 | while '' in text: 101 | text.remove('') 102 | text=text[1:] 103 | AAI_dict={} 104 | for each_line in text: 105 | temp=each_line.split(' ') 106 | while '' in temp: 107 | temp.remove('') 108 | for i in range(1,len(temp)): 109 | temp[i]=float(temp[i]) 110 | AAI_dict[temp[0]]=temp[1:] 111 | AAI_dict['X']=np.zeros(6) 112 | all_embeddings=[] 113 | for each_seq in seq: 114 | temp_embeddings=[] 115 | for each_char in each_seq: 116 | temp_embeddings.append(AAI_dict[each_char]) 117 | if max_len>len(each_seq): 118 | zero_padding=np.zeros((max_len-len(each_seq),6)) 119 | data_pad=np.vstack((temp_embeddings,zero_padding)) 120 | elif max_len==len(each_seq): 121 | data_pad=temp_embeddings 122 | else: 123 | data_pad=temp_embeddings[:max_len] 124 | all_embeddings.append(data_pad) 125 | all_embeddings=np.array(all_embeddings) 126 | return torch.from_numpy(all_embeddings).float() 127 | 128 | 129 | 130 | def BLOSUM62_embedding(seq,max_len=200): 131 | f=open('data/blosum62.txt') 132 | text=f.read() 133 | f.close() 134 | text=text.split('\n') 135 | while '' in text: 136 | text.remove('') 137 | cha=text[0].split(' ') 138 | while '' in cha: 139 | cha.remove('') 140 | index=[] 141 | for i in range(1,len(text)): 142 | temp=text[i].split(' ') 143 | while '' in temp: 144 | temp.remove('') 145 | for j in range(len(temp)): 146 | temp[j]=float(temp[j]) 147 | index.append(temp) 148 | index=np.array(index) 149 | BLOSUM62_dict={} 150 | for j in range(len(cha)): 151 | BLOSUM62_dict[cha[j]]=index[:,j] 152 | all_embeddings=[] 153 | for each_seq in seq: 154 | temp_embeddings=[] 155 | for each_char in each_seq: 156 | temp_embeddings.append(BLOSUM62_dict[each_char]) 157 | if max_len>len(each_seq): 158 | zero_padding=np.zeros((max_len-len(each_seq),23)) 159 | data_pad=np.vstack((temp_embeddings,zero_padding)) 160 | elif max_len==len(each_seq): 161 | data_pad=temp_embeddings 162 | else: 163 | data_pad=temp_embeddings[:max_len] 164 | all_embeddings.append(data_pad) 165 | all_embeddings=np.array(all_embeddings) 166 | return torch.from_numpy(all_embeddings).float() 167 | 168 | 169 | def onehot_embedding(seq,max_len=200): 170 | char_list='ARNDCQEGHILKMFPSTWYVX' 171 | char_dict={} 172 | for i in range(len(char_list)): 173 | char_dict[char_list[i]]=i 174 | all_embeddings=[] 175 | for each_seq in seq: 176 | temp_embeddings=[] 177 | for each_char in each_seq: 178 | codings=np.zeros(21) 179 | if each_char in char_dict.keys(): 180 | codings[char_dict[each_char]]=1 181 | else: 182 | codings[20]=1 183 | temp_embeddings.append(codings) 184 | if 
max_len>len(each_seq): 185 | zero_padding=np.zeros((max_len-len(each_seq),21)) 186 | data_pad=np.vstack((temp_embeddings,zero_padding)) 187 | elif max_len==len(each_seq): 188 | data_pad=temp_embeddings 189 | else: 190 | data_pad=temp_embeddings[:max_len] 191 | 192 | all_embeddings.append(data_pad) 193 | all_embeddings=np.array(all_embeddings) 194 | return torch.from_numpy(all_embeddings).float() 195 | 196 | 197 | 198 | 199 | def index_encoding(sequences,max_len=200): 200 | ''' 201 | Modified from https://github.com/openvax/mhcflurry/blob/master/mhcflurry/amino_acid.py#L110-L130 202 | 203 | Parameters 204 | ---------- 205 | sequences: list of equal-length sequences 206 | 207 | Returns 208 | ------- 209 | np.array with shape (#sequences, length of sequences) 210 | ''' 211 | seq_list=[] 212 | for s in sequences: 213 | temp=list(s) 214 | while len(temp)TESTFLE_00001 2 | QGLFFLGAKLFYLLTLFL 3 | >TESTFLE_00002 4 | MVCVSRSLLKVVLLLLFL 5 | >TESTFLE_00003 6 | GWLCIIASALCFSFILFT 7 | >TESTFLE_00004 8 | MGRPLGLLPSWVFPLERL 9 | >TESTFLE_00005 10 | YSCRGGKAFILFRPFPTV 11 | >TESTFLE_00006 12 | MFSVVAGLFRVCLHRAWS 13 | >TESTFLE_00007 14 | KAKASFKVPGVALCILAL 15 | >TESTFLE_00008 16 | KAAFRLKVAFIPLLAVLW 17 | >TESTFLE_00009 18 | MAFLFKVAFRILTGTLWP 19 | >TESTFLE_00010 20 | MFAGLFRCFASFIFGLAG 21 | >TESTFLE_00011 22 | MKQLPYRSLLLSLLLVQI 23 | >TESTFLE_00012 24 | MNVKLLLLVAALSTLLII 25 | >TESTFLE_00013 26 | MTISCTHASVAIWGRKWV 27 | >TESTFLE_00014 28 | MLLARNQSVPVLVQWILL 29 | >TESTFLE_00015 30 | MRAFFLAAESPLFAKTSF 31 | >TESTFLE_00016 32 | MFLCAFCSAFFGTGVFVR 33 | >TESTFLE_00017 34 | MEKKVLQILGFALLAAVL 35 | >TESTFLE_00018 36 | MNLVWRLLSVFKFARRSA 37 | >TESTFLE_00019 38 | MPAFGGKPCLHWLPYFLL 39 | >TESTFLE_00020 40 | MMCTKLSGWQTQISLILG 41 | >TESTFLE_00021 42 | MKLLLALLLGAARGLPLL 43 | >TESTFLE_00022 44 | MRTVFALVKFCVAAVAAV 45 | >TESTFLE_00023 46 | WAIPLFLLESPVRKSTFF 47 | >TESTFLE_00024 48 | MSSLLFWLPLTKAKLQVA 49 | >TESTFLE_00025 50 | MLSIKAPRLKQGLIISVL 51 | >TESTFLE_00026 52 | MAKSWPNGGLVRITKQLI 53 | >TESTFLE_00027 54 | YAGFICKPLPSFILTLLS 55 | >TESTFLE_00028 56 | MRALLCKLLVTGGQSLSV 57 | >TESTFLE_00029 58 | MALFFKFAIFAALGAMFV 59 | >TESTFLE_00030 60 | WNTLSWLLCLLSWVRFVC 61 | >TESTFLE_00031 62 | GGVLCLAWCFLVVPKMLV 63 | >TESTFLE_00032 64 | MKTSKLKALELLLTPKCL 65 | >TESTFLE_00033 66 | MCVLGRLALLVKIKGVSG 67 | >TESTFLE_00034 68 | MKTIVAMLLVLLILVIAV 69 | >TESTFLE_00035 70 | MRICLTAAKKHVLQFLLL 71 | >TESTFLE_00037 72 | MAQMLSRLNIRRFSCYAK 73 | >TESTFLE_00038 74 | FIVVAKQGLFSLVAASFC 75 | >TESTFLE_00039 76 | MFQLLLGGNGISWIKLAA 77 | >TESTFLE_00040 78 | MGVAACTLKSCYFLPPRK 79 | >TESTFLE_00041 80 | MAIFILLQKHFPIGAQLF 81 | >TESTFLE_00042 82 | MNAGKTCKKWSKSFSFKA 83 | >TESTFLE_00043 84 | MLSKKFIQLQKLAINGTV 85 | >TESTFLE_00044 86 | AKLLLAGLLLIALASAVR 87 | >TESTFLE_00045 88 | MRLLALFLILIHVVAALF 89 | >TESTFLE_00046 90 | MKTLLPFFLNPPLFGILK 91 | >TESTFLE_00047 92 | MRARPITITLWLKYIAYI 93 | >TESTFLE_00048 94 | MTFAYLSKMRFWAKATSS 95 | >TESTFLE_00049 96 | MRKLGLVTMVILLRVQLA 97 | >TESTFLE_00050 98 | MRKLGLQVFLVFRRVLCH 99 | >TESTFLE_00051 100 | FWCFCRLSIRISFRAEFL 101 | >TESTFLE_00052 102 | QSALLKRALLRWLCVVLK 103 | >TESTFLE_00053 104 | GAAVVLVATLLLLLLALL 105 | >TESTFLE_00054 106 | MCAVSVAPALCLSFVALC 107 | >TESTFLE_00055 108 | MCCPGAGVTICPQAVARL 109 | >TESTFLE_00056 110 | MCVSVWLSLVCAIKLLLT 111 | >TESTFLE_00057 112 | MVRRLCLAWLVNFAGSFG 113 | >TESTFLE_00059 114 | MSSLIKTCTGLVLLLAAF 115 | >TESTFLE_00060 116 | MKKMTLLFAVKKVKALRL 117 | >TESTFLE_00062 118 | MYFFLGKSCYGLFFRACI 119 | >TESTFLE_00063 120 | MDLRFKAKCKELAPVAIP 121 | >TESTFLE_00064 122 | MKKIVKVPFALLLLKKSF 
123 | >TESTFLE_00065 124 | MFLKLSLGFLLALIACLL 125 | >TESTFLE_00066 126 | MSCPIRVAISLLKVLQAV 127 | >TESTFLE_00067 128 | KRCLFAFFPNWFVVPVKL 129 | >TESTFLE_00068 130 | MAVRLALLSIRRAFLSLL 131 | >TESTFLE_00069 132 | MWRQQSGCFLKLLAGFIS 133 | >TESTFLE_00070 134 | MAKIAAQAVLVAIKLLVI 135 | >TESTFLE_00071 136 | MKIHRFARNSKYNCFAAL 137 | >TESTFLE_00072 138 | MWSPQTTFSKAVAVTKVI 139 | >TESTFLE_00073 140 | MLKRLVALWLLVRWLFLL 141 | >TESTFLE_00074 142 | MKLKLYCVVARGNLIVFV 143 | >TESTFLE_00075 144 | KMVKLWFFCLFVLLLLFF 145 | >TESTFLE_00076 146 | PSCECCPILTFIACLTQA 147 | >TESTFLE_00077 148 | MKVFRKLLLFGAAKFILF 149 | >TESTFLE_00078 150 | MKLIVTLLALALPAFLLF 151 | >TESTFLE_00079 152 | MMPCKYCRCLLKVVAVKS 153 | >TESTFLE_00080 154 | MFALAGKQTLLAAILKYL 155 | >TESTFLE_00081 156 | MECTLVARFLSKLSFKQF 157 | >TESTFLE_00082 158 | MFWAFLSLKDGVLGKHSS 159 | >TESTFLE_00083 160 | MLLVLFEKSLLNFALKLK 161 | >TESTFLE_00084 162 | TFPKTFFTKVVIHKSCAD 163 | >TESTFLE_00085 164 | KRRAGLKLLVTLLILITM 165 | >TESTFLE_00086 166 | MCARICSMAFFTKPTNSG 167 | >TESTFLE_00087 168 | MKQCILNLFFAQLVVRLL 169 | >TESTFLE_00088 170 | MALLIRSAAAFITRTLPK 171 | >TESTFLE_00089 172 | QTTCAGVKCATGLAAVTC 173 | >TESTFLE_00090 174 | MKVNKPIVTRIFGVKIFI 175 | >TESTFLE_00091 176 | MSVGFVILLALAALIFCP 177 | >TESTFLE_00092 178 | MAKQAVGPYCDTFLLKIM 179 | >TESTFLE_00093 180 | MEAVFIALLLLLFVCKVS 181 | >TESTFLE_00095 182 | MKLCPLSIFVLALAASAL 183 | >TESTFLE_00096 184 | MFLAWIKPQGANFPREKL 185 | >TESTFLE_00097 186 | MTWCCLCRAINFGSSAIF 187 | >TESTFLE_00098 188 | MDAKLTFFFVLLGAASLV 189 | >TESTFLE_00099 190 | MKSSFLFGASIGLIAALA 191 | >TESTFLE_00100 192 | SASLVSLLTKLISPKLKS 193 | >TESTFLE_00101 194 | MPVAQCPKGYVEKMYAGQ 195 | >TESTFLE_00102 196 | MAAKWAGCNMVAIALSAL 197 | >TESTFLE_00103 198 | MVIKTGWFARAFLLKILT 199 | >TESTFLE_00104 200 | MLRSALIFLLRTFSKKFA 201 | >TESTFLE_00105 202 | MACCGRLFLLCVRLILLF 203 | >TESTFLE_00106 204 | MKGIGRIARFSKLKGASL 205 | >TESTFLE_00107 206 | AFIARNAKGTLTINAKGK 207 | >TESTFLE_00108 208 | MFSQLPVAFVKMAAVGKK 209 | >TESTFLE_00109 210 | MALFVRFCDAGSKANHCG 211 | >TESTFLE_00110 212 | MKQAFSCAFKILGRPADP 213 | >TESTFLE_00111 214 | MLFLRRLFLLGLCMFHCY 215 | >TESTFLE_00112 216 | MCVVRSLQTLSVQVLLSV 217 | >TESTFLE_00114 218 | MKLHFCILSFGELRFVKC 219 | >TESTFLE_00115 220 | MLLQGVAIGLGKALKLVW 221 | >TESTFLE_00116 222 | MASFLAATLWSKTPLKAF 223 | >TESTFLE_00117 224 | MQFRMLICVKVLWFDKLR 225 | >TESTFLE_00118 226 | MRFGIAWFAVCWTAVNSV 227 | >TESTFLE_00119 228 | MRMLSWVFRVQLLVLKVD 229 | >TESTFLE_00120 230 | MSFPVTPKVAFTLVAVPL 231 | >TESTFLE_00121 232 | MGRSLIKGTIIKLSALAA 233 | >TESTFLE_00122 234 | MMPRLLANLGLRLTKRAV 235 | >TESTFLE_00123 236 | MTSWKLFVARFYLAGTAS 237 | >TESTFLE_00124 238 | MKLMTFWLGLAAVIGFSV 239 | >TESTFLE_00125 240 | GTITTFAKASCGIQCGMA 241 | >TESTFLE_00126 242 | MFVCKALAYTVAKVVEHL 243 | >TESTFLE_00127 244 | MRFICLKVKILLLSIVQA 245 | >TESTFLE_00128 246 | MAGWLLTCLLLCLYIPVV 247 | >TESTFLE_00129 248 | AKKIVVSATLLLVALRLW 249 | >TESTFLE_00131 250 | MKRLLLLVVIGIKGLLVC 251 | >TESTFLE_00132 252 | MRACAACSVGEARPSGTG 253 | >TESTFLE_00133 254 | MKKPLVLSNLMPLFSTVV 255 | >TESTFLE_00134 256 | MKAIICLLFIATYWIAHF 257 | >TESTFLE_00135 258 | MATLLLLRLKRLRNLLIA 259 | >TESTFLE_00136 260 | MMLPFVIKKQALLLGLLS 261 | >TESTFLE_00137 262 | MKTVGKALSLVKLLLLTA 263 | >TESTFLE_00138 264 | MFLPIKAMLYAAAISRAF 265 | >TESTFLE_00139 266 | MKGASSKFLSRLALLLGT 267 | 268 | -------------------------------------------------------------------------------- /gan_diff.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | import 
torch.autograd as autograd 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import numpy as np 7 | import pdb 8 | import math 9 | import torch.nn.init as init 10 | from math import ceil 11 | import numpy as np 12 | import sys 13 | import torch.optim as optim 14 | import pandas as pd 15 | import ECD_Attention 16 | import re 17 | import pickle 18 | 19 | CUDA = torch.cuda.is_available() 20 | loss_g = [] 21 | loss_d = [] 22 | MAX_SEQ_LEN = 18 23 | data = pd.read_csv('data/AMPdb_data.csv', skiprows=1, usecols=range(3), header=None, names=['ID', 'seq', 'len']) 24 | all_sequences = np.asarray(data['seq']) 25 | CHARACTER_DICT = { 26 | 'A': 1, 'C': 2, 'E': 3, 'D': 4, 'F': 5, 'I': 6, 'H': 7, 27 | 'K': 8, 'M': 9, 'L': 10, 'N': 11, 'Q': 12, 'P': 13, 'S': 14, 28 | 'R': 15, 'T': 16, 'W': 17, 'V': 18, 'Y': 19, 'G': 20, 'O': 21, 'U': 22, 'Z': 23, 'X': 24} 29 | INDEX_DICT = { 30 | 1: 'A', 2: 'C', 3: 'E', 4: 'D', 5: 'F', 6: 'I', 7: 'H', 31 | 8: 'K', 9: 'M', 10: 'L', 11: 'N', 12: 'Q', 13: 'P', 14: 'S', 32 | 15: 'R', 16: 'T', 17: 'W', 18: 'V', 19: 'Y', 20: 'G', 21: 'O', 22: 'U', 23: 'Z', 24: 'X'} 33 | 34 | 35 | def sequence_to_vector(sequence): 36 | default = np.asarray([25] * (MAX_SEQ_LEN)) 37 | for i, character in enumerate(sequence[:MAX_SEQ_LEN]): 38 | default[i] = CHARACTER_DICT[character] 39 | return default.astype(int) 40 | 41 | 42 | def vector_to_sequence(vector): 43 | return ''.join([INDEX_DICT.get(item, '0') for item in vector]) 44 | 45 | 46 | all_data = [] 47 | for i in range(len(all_sequences)): 48 | all_data.append(sequence_to_vector(all_sequences[i])) 49 | 50 | 51 | class Generator(nn.Module): 52 | def __init__(self, embedding_dim, hidden_dim, vocab_size, max_seq_len, oracle_init=False): 53 | super(Generator, self).__init__() 54 | self.hidden_dim = hidden_dim 55 | self.embedding_dim = embedding_dim 56 | self.max_seq_len = max_seq_len 57 | self.vocab_size = vocab_size 58 | self.embeddings = nn.Embedding(vocab_size, embedding_dim) 59 | self.diff = ECD_Attention.DIFFormer(embedding_dim, hidden_dim, out_channels=128, use_graph=False) 60 | self.diff2out = nn.Linear(hidden_dim, vocab_size) 61 | 62 | if oracle_init: 63 | for p in self.parameters(): 64 | nn.init.normal_(p, 0, 1) 65 | 66 | def init_hidden(self, batch_size=1): 67 | h = autograd.Variable(torch.zeros(1, batch_size, self.hidden_dim)) 68 | 69 | if self.gpu: 70 | return h.cuda() 71 | else: 72 | return h 73 | 74 | def forward(self, inp, hidden): 75 | emb = self.embeddings(inp) 76 | emb = emb.view(1, -1, self.embedding_dim) # 1.16.3 77 | emb1 = emb.view(-1, self.embedding_dim) 78 | z = self.diff(emb1, edge_index=None) 79 | out = self.diff2out(z.view(-1, self.hidden_dim)) # 16,26 80 | out = F.log_softmax(out, dim=1) # 16.26 81 | return out, hidden 82 | 83 | def sample(self, num_samples, start_letter=0): 84 | 85 | samples = torch.zeros(num_samples, self.max_seq_len).type(torch.LongTensor) 86 | 87 | h = self.init_hidden(num_samples) # 1,16,128 88 | inp = autograd.Variable( 89 | torch.LongTensor([start_letter] * num_samples)) # [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) 90 | 91 | if self.gpu: 92 | samples = samples.cuda() 93 | inp = inp.cuda() 94 | 95 | for i in range(self.max_seq_len): 96 | out, h = self.forward(inp, h) 97 | # print("out1:{}".format(out.size()))#[16, 26]) 98 | # print("h2:{}".format(h.size()))#[1, 16, 128]) 99 | out = torch.multinomial(torch.exp(out), 1) 100 | # print("out2:{}".format(out.size()))#[16, 1]) 101 | samples[:, i] = out.view(-1).data 102 | # print("samples:{}".format(samples.size()))#[16, 18]) 103 
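# Sampling is autoregressive: torch.exp converts the log-softmax output back to
# probabilities, torch.multinomial draws one residue index per sequence, and the drawn
# index is fed back in as the next step's input on the line below.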
| 104 | inp = out.view(-1) 105 | # print("inp:{}".format(inp.size()))#16 106 | 107 | return samples 108 | 109 | def batchNLLLoss(self, inp, target): 110 | loss_fn = nn.NLLLoss() # 111 | batch_size, seq_len = inp.size() 112 | inp = inp.permute(1, 0) 113 | target = target.permute(1, 0) 114 | h = self.init_hidden(batch_size) 115 | 116 | loss = 0 117 | 118 | for i in range(seq_len): 119 | out, h = self.forward(inp[i], h) 120 | loss += loss_fn(out, target[i]) 121 | 122 | return loss # per batch 123 | 124 | def batchPGLoss(self, inp, target, reward): 125 | batch_size, seq_len = inp.size() 126 | inp = inp.permute(1, 0) # seq_len x batch_size 127 | target = target.permute(1, 0) # seq_len x batch_size 128 | h = self.init_hidden(batch_size) 129 | 130 | loss = 0 131 | for i in range(seq_len): 132 | out, h = self.forward(inp[i], h) 133 | for j in range(batch_size): 134 | loss += -out[j][target.data[i][j]] * reward[j] 135 | 136 | return loss / batch_size 137 | 138 | 139 | class Discriminator(nn.Module): 140 | 141 | def __init__(self, embedding_dim, hidden_dim, vocab_size, max_seq_len, gpu=True, dropout=0.2): 142 | super(Discriminator, self).__init__() 143 | self.hidden_dim = hidden_dim 144 | self.embedding_dim = embedding_dim 145 | self.max_seq_len = max_seq_len 146 | self.gpu = gpu 147 | 148 | self.embeddings = nn.Embedding(vocab_size, embedding_dim) 149 | self.gru = nn.GRU(embedding_dim, hidden_dim, num_layers=2, bidirectional=True, dropout=dropout) 150 | self.gru2hidden = nn.Linear(2 * 2 * hidden_dim, hidden_dim) 151 | self.dropout_linear = nn.Dropout(p=dropout) 152 | self.hidden2out = nn.Linear(hidden_dim, 1) 153 | 154 | def init_hidden(self, batch_size): 155 | h = autograd.Variable(torch.zeros(2 * 2 * 1, batch_size, self.hidden_dim)) 156 | 157 | if self.gpu: 158 | return h.cuda() 159 | else: 160 | return h 161 | 162 | def forward(self, input, hidden): 163 | 164 | emb = self.embeddings(input) 165 | emb = emb.permute(1, 0, 2) 166 | _, hidden = self.gru(emb, hidden) 167 | hidden = hidden.permute(1, 0, 2).contiguous() 168 | out = self.gru2hidden(hidden.view(-1, 4 * self.hidden_dim)) 169 | out = torch.tanh(out) 170 | out = self.dropout_linear(out) 171 | out = self.hidden2out(out) 172 | out = torch.sigmoid(out) 173 | return out 174 | 175 | def batchClassify(self, inp): 176 | h = self.init_hidden(inp.size()[0]) 177 | out = self.forward(inp, h) 178 | return out.view(-1) 179 | 180 | def batchBCELoss(self, inp, target): 181 | loss_fn = nn.BCELoss() 182 | h = self.init_hidden(inp.size()[0]) 183 | out = self.forward(inp, h) 184 | return loss_fn(out, target) 185 | 186 | 187 | def prepare_generator_batch(samples, start_letter=0, gpu=True): 188 | batch_size, seq_len = samples.size() 189 | inp = torch.zeros(batch_size, seq_len) 190 | target = samples 191 | inp[:, 0] = start_letter 192 | inp[:, 1:] = target[:, :seq_len - 1] 193 | inp = inp.type(torch.LongTensor) 194 | target = target.type(torch.LongTensor) 195 | if gpu: 196 | inp = inp.cuda() 197 | target = target.cuda() 198 | return inp, target 199 | 200 | 201 | def prepare_discriminator_data(pos_samples, neg_samples, gpu=True): 202 | inp = torch.cat((pos_samples, neg_samples), 0).type(torch.LongTensor) 203 | target = torch.ones(pos_samples.size()[0] + neg_samples.size()[0]) 204 | target[pos_samples.size()[0]:] = 0 205 | 206 | perm = torch.randperm(target.size()[0]) 207 | target = target[perm] 208 | inp = inp[perm] 209 | 210 | if gpu: 211 | inp = inp.cuda() 212 | target = target.cuda() 213 | 214 | return inp, target 215 | 216 | 217 | def batchwise_sample(gen, 
num_samples, batch_size): 218 | samples = [] 219 | for i in range(int(ceil(num_samples / float(batch_size)))): 220 | samples.append(gen.sample(batch_size)) 221 | 222 | return torch.cat(samples, 0)[:num_samples] 223 | 224 | 225 | def batchwise_oracle_nll(gen, oracle, num_samples, batch_size, max_seq_len, start_letter=0, gpu=True): 226 | s = batchwise_sample(gen, num_samples, batch_size) 227 | oracle_nll = 0 228 | for i in range(0, num_samples, batch_size): 229 | inp, target = prepare_generator_batch(s[i:i + batch_size], start_letter, gpu) 230 | oracle_loss = oracle.batchNLLLoss(inp, target) / max_seq_len 231 | oracle_nll += oracle_loss.data.item() 232 | 233 | return oracle_nll / (num_samples / batch_size) 234 | 235 | 236 | def train_generator_MLE(gen, gen_opt, oracle, real_data_samples, epochs): 237 | for epoch in range(epochs): 238 | print('epoch %d : ' % (epoch + 1), end='') 239 | sys.stdout.flush() 240 | total_loss = 0 241 | 242 | for i in range(0, POS_NEG_SAMPLES, BATCH_SIZE): 243 | inp, target = prepare_generator_batch(real_data_samples[i:i + BATCH_SIZE], start_letter=START_LETTER, 244 | gpu=CUDA) 245 | gen_opt.zero_grad() 246 | loss = gen.batchNLLLoss(inp, target) 247 | loss.backward() 248 | gen_opt.step() 249 | 250 | total_loss += loss.data.item() 251 | 252 | if (i / BATCH_SIZE) % ceil( 253 | ceil(POS_NEG_SAMPLES / float(BATCH_SIZE)) / 10.) == 0: 254 | print('.', end='') 255 | sys.stdout.flush() 256 | 257 | # each loss in a batch is loss per sample 258 | total_loss = total_loss / ceil(POS_NEG_SAMPLES / float(BATCH_SIZE)) / MAX_SEQ_LEN 259 | 260 | # sample from generator and compute oracle NLL 261 | oracle_loss = batchwise_oracle_nll(gen, oracle, POS_NEG_SAMPLES, BATCH_SIZE, MAX_SEQ_LEN, 262 | start_letter=START_LETTER, gpu=CUDA) 263 | loss_g.append(oracle_loss) 264 | print(' average_train_NLL = %.4f, ' % total_loss) 265 | 266 | 267 | def train_generator_PG(gen, gen_opt, oracle, dis, num_batches): 268 | for batch in range(num_batches): 269 | s = gen.sample(BATCH_SIZE * 2) 270 | inp, target = prepare_generator_batch(s, start_letter=START_LETTER, gpu=CUDA) 271 | rewards = dis.batchClassify(target) 272 | 273 | gen_opt.zero_grad() 274 | pg_loss = gen.batchPGLoss(inp, target, rewards) 275 | pg_loss.backward() 276 | gen_opt.step() 277 | 278 | 279 | def train_discriminator(discriminator, dis_opt, real_data_samples, generator, oracle, d_steps, epochs): 280 | indice = random.sample(range(len(real_data_samples)), 100) 281 | indice = torch.tensor(indice) 282 | pos_val = real_data_samples[indice] 283 | neg_val = generator.sample(100) 284 | val_inp, val_target = prepare_discriminator_data(pos_val, neg_val, gpu=CUDA) 285 | 286 | for d_step in range(d_steps): 287 | s = batchwise_sample(generator, POS_NEG_SAMPLES, BATCH_SIZE) 288 | dis_inp, dis_target = prepare_discriminator_data(real_data_samples, s, gpu=CUDA) 289 | for epoch in range(epochs): 290 | print('d-step %d epoch %d : ' % (d_step + 1, epoch + 1), end='') 291 | sys.stdout.flush() 292 | total_loss = 0 293 | total_acc = 0 294 | 295 | for i in range(0, 2 * POS_NEG_SAMPLES, BATCH_SIZE): 296 | 297 | inp, target = dis_inp[i:i + BATCH_SIZE], dis_target[i:i + BATCH_SIZE] 298 | dis_opt.zero_grad() 299 | out = discriminator.batchClassify(inp) 300 | loss_fn = nn.BCELoss() 301 | loss = loss_fn(out, target) 302 | loss.backward() 303 | dis_opt.step() 304 | 305 | total_loss += loss.data.item() 306 | total_acc += torch.sum((out > 0.5) == (target > 0.5)).data.item() 307 | 308 | if (i / BATCH_SIZE) % ceil(ceil(2 * POS_NEG_SAMPLES / float( 309 | BATCH_SIZE)) / 
10.) == 0: 310 | print('.', end='') 311 | sys.stdout.flush() 312 | 313 | total_loss /= ceil(2 * POS_NEG_SAMPLES / float(BATCH_SIZE)) 314 | total_acc /= float(2 * POS_NEG_SAMPLES) 315 | 316 | val_pred = discriminator.batchClassify(val_inp) 317 | print(' average_loss = %.4f, train_acc = %.4f, val_acc = %.4f' % ( 318 | total_loss, total_acc, torch.sum((val_pred > 0.5) == (val_target > 0.5)).data.item() / 200.)) 319 | 320 | loss_d.append(total_loss) 321 | 322 | 323 | VOCAB_SIZE = 26 324 | MAX_SEQ_LEN = 18 325 | START_LETTER = 0 326 | POS_NEG_SAMPLES = len(all_data) 327 | BATCH_SIZE = 16 328 | ADV_TRAIN_EPOCHS = 100 329 | MLE_TRAIN_EPOCHS = 50 330 | GEN_EMBEDDING_DIM = 3 331 | GEN_HIDDEN_DIM = 128 332 | NUM_PG_BATCHES = 1 333 | GEN_lr = 0.00005 334 | dis_lr = 0.00005 335 | DIS_EMBEDDING_DIM = 3 336 | DIS_HIDDEN_DIM = 128 337 | D_STEPS = 30 338 | D_EPOCHS = 10 339 | ADV_D_EPOCHS = 5 340 | ADV_D_STEPS = 1 341 | 342 | gen_model = 'weight/gen_500.pth' 343 | dis_model = 'weight/dis_500.pth' 344 | 345 | if __name__ == '__main__': 346 | oracle = Generator(GEN_EMBEDDING_DIM, GEN_HIDDEN_DIM, VOCAB_SIZE, MAX_SEQ_LEN, oracle_init=True) 347 | gen = Generator(GEN_EMBEDDING_DIM, GEN_HIDDEN_DIM, VOCAB_SIZE, MAX_SEQ_LEN, ) 348 | dis = Discriminator(DIS_EMBEDDING_DIM, DIS_HIDDEN_DIM, VOCAB_SIZE, MAX_SEQ_LEN, gpu=CUDA) 349 | 350 | if CUDA: 351 | oracle = oracle.cuda() 352 | gen = gen.cuda() 353 | dis = dis.cuda() 354 | 355 | oracle_samples = torch.Tensor(all_data).type(torch.LongTensor) 356 | oracle_samples = oracle_samples.cuda() 357 | else: 358 | oracle_samples = torch.IntTensor(all_data).type(torch.LongTensor) 359 | 360 | print('Starting Generator MLE Training...') 361 | gen_optimizer = optim.Adam(gen.parameters(), lr=GEN_lr) 362 | train_generator_MLE(gen, gen_optimizer, oracle, oracle_samples, MLE_TRAIN_EPOCHS) 363 | print('Finished Generator MLE Training...') 364 | 365 | print('\nStarting Discriminator Training...') 366 | dis_optimizer = optim.Adagrad(dis.parameters()) 367 | train_discriminator(dis, dis_optimizer, oracle_samples, gen, oracle, D_STEPS, D_EPOCHS) 368 | 369 | print('\nStarting Adversarial Training...') 370 | for epoch in range(ADV_TRAIN_EPOCHS): 371 | print('\n--------\nEPOCH %d\n--------' % (epoch + 1)) 372 | print('\nAdversarial Training Generator : ', end='') 373 | sys.stdout.flush() 374 | # train_generator_PG(gen, gen_optimizer, oracle, dis, NUM_PG_BATCHES) 375 | print('\nAdversarial Training Discriminator : ') 376 | train_discriminator(dis, dis_optimizer, oracle_samples, gen, oracle, ADV_D_STEPS, ADV_D_EPOCHS) 377 | 378 | torch.save(gen.state_dict(), gen_model) 379 | torch.save(dis.state_dict(), dis_model) 380 | -------------------------------------------------------------------------------- /gan_generate.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | import torch.autograd as autograd 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import numpy as np 7 | import pdb 8 | import math 9 | import torch.nn.init as init 10 | from math import ceil 11 | import numpy as np 12 | import sys 13 | import torch.optim as optim 14 | import pandas as pd 15 | import re 16 | import pickle 17 | import ECD_Attention 18 | import numpy as np 19 | import matplotlib.pyplot as plt 20 | import tqdm 21 | MAX_SEQ_LEN = 18 22 | BATCH_SIZE = 512 23 | data = pd.read_csv('data/AMPdb_data.csv', skiprows=1, usecols=range(3), header=None, names=['ID', 'seq', 'len']) 24 | all_sequences = np.asarray(data['seq']) 25 | 26 | 
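The dictionaries below map each residue to a fixed integer index, and sequence_to_vector/vector_to_sequence convert peptides to and from length-18 index vectors padded with 25. A minimal, self-contained sketch of the same round-trip (illustration only; the helper names encode/decode and the demo peptide are hypothetical, not part of the repository):

    aa_order = "ACEDFIHKMLNQPSRTWVYGOUZX"   # residue order mirroring CHARACTER_DICT below ('O' in place of the '0' placeholder)
    aa_to_idx = {aa: i + 1 for i, aa in enumerate(aa_order)}
    idx_to_aa = {i: aa for aa, i in aa_to_idx.items()}
    PAD_IDX, SEQ_LEN = 25, 18               # padding index and fixed length used by the GAN

    def encode(seq):
        vec = [PAD_IDX] * SEQ_LEN
        for i, ch in enumerate(seq[:SEQ_LEN]):
            vec[i] = aa_to_idx[ch]
        return vec

    def decode(vec):
        return ''.join(idx_to_aa.get(i, '') for i in vec)   # padding indices are dropped

    assert decode(encode("GIGKFLHSAKKF")) == "GIGKFLHSAKKF"  # round trip on a short demo peptide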
CHARACTER_DICT = { 27 | 'A': 1, 'C': 2, 'E': 3, 'D': 4, 'F': 5, 'I': 6, 'H': 7, 28 | 'K': 8, 'M': 9, 'L': 10, 'N': 11, 'Q': 12, 'P': 13, 'S': 14, 29 | 'R': 15, 'T': 16, 'W': 17, 'V': 18, 'Y': 19, 'G': 20, '0': 21, 'U': 22, 'Z': 23, 'X': 24} 30 | INDEX_DICT = { 31 | 1: 'A', 2: 'C', 3: 'E', 4: 'D', 5: 'F', 6: 'I', 7: 'H', 32 | 8: 'K', 9: 'M', 10: 'L', 11: 'N', 12: 'Q', 13: 'P', 14: 'S', 33 | 15: 'R', 16: 'T', 17: 'W', 18: 'V', 19: 'Y', 20: 'G', 21: '0', 22: 'U', 23: 'Z', 24: 'X'} 34 | 35 | 36 | def sequence_to_vector(sequence): 37 | default = np.asarray([25] * (MAX_SEQ_LEN)) 38 | for i, character in enumerate(sequence[:MAX_SEQ_LEN]): 39 | default[i] = CHARACTER_DICT[character] 40 | return default.astype(int) 41 | 42 | 43 | def vector_to_sequence(vector): 44 | return ''.join([INDEX_DICT.get(item, '0') for item in vector]) 45 | 46 | 47 | all_data = [] 48 | for i in range(len(all_sequences)): 49 | all_data.append(sequence_to_vector(all_sequences[i])) 50 | 51 | 52 | class Generator(nn.Module): 53 | 54 | def __init__(self, embedding_dim, hidden_dim, vocab_size, max_seq_len, gpu=False, oracle_init=False): 55 | super(Generator, self).__init__() 56 | self.hidden_dim = hidden_dim 57 | self.embedding_dim = embedding_dim 58 | self.max_seq_len = max_seq_len 59 | self.vocab_size = vocab_size 60 | self.gpu = gpu 61 | 62 | self.embeddings = nn.Embedding(vocab_size, embedding_dim) 63 | self.gru = nn.GRU(embedding_dim, hidden_dim) 64 | self.diff = ECD_Attention.DIFFormer(embedding_dim, hidden_dim, out_channels=128, use_graph=False) # DIFFormer encoder from the imported ECD_Attention module, as in gan_diff.py 65 | self.gru2out = nn.Linear(hidden_dim, vocab_size) 66 | 67 | if oracle_init: 68 | for p in self.parameters(): 69 | nn.init.normal_(p, 0, 1) 70 | 71 | def init_hidden(self, batch_size=1): 72 | h = autograd.Variable(torch.zeros(1, batch_size, self.hidden_dim)) 73 | 74 | if self.gpu: 75 | return h.cuda() 76 | else: 77 | return h 78 | 79 | def forward(self, inp, hidden): 80 | emb = self.embeddings(inp) 81 | emb = emb.view(1, -1, self.embedding_dim) 82 | emb1 = emb.view(-1, self.embedding_dim) 83 | z = self.diff(emb1, edge_index=None) 84 | # print("z={}".format(z.size())) # 16,128 85 | # out, hidden = self.gru(emb, hidden) # 1,16,128 # 1,1,128 86 | out = self.gru2out(z.view(-1, self.hidden_dim)) # 16,26 87 | out = F.log_softmax(out, dim=1) # 16,26 88 | # out, hidden = self.gru(emb, hidden) 89 | # out = self.gru2out(out.view(-1, self.hidden_dim)) 90 | # out = F.log_softmax(out, dim=1) 91 | return out, hidden 92 | 93 | def sample(self, num_samples, start_letter=0): 94 | samples = torch.zeros(num_samples, self.max_seq_len).type(torch.LongTensor) 95 | samples_p = torch.zeros(num_samples, self.max_seq_len).type(torch.FloatTensor) 96 | 97 | h = self.init_hidden(num_samples) # (1,100,128) 98 | inp = autograd.Variable(torch.LongTensor([start_letter] * num_samples)) 99 | 100 | if self.gpu: 101 | samples = samples.cuda() 102 | inp = inp.cuda() 103 | 104 | for i in range(self.max_seq_len): 105 | out, h = self.forward(inp, h) 106 | out_p, _ = torch.max(torch.exp(out), dim=1) 107 | out = torch.multinomial(torch.exp(out), 1) 108 | samples_p[:, i] = out_p 109 | samples[:, i] = out.view(-1).data 110 | inp = out.view(-1) 111 | return samples, samples_p 112 | 113 | def batchNLLLoss(self, inp, target): 114 | loss_fn = nn.NLLLoss() 115 | batch_size, seq_len = inp.size() 116 | inp = inp.permute(1, 0) 117 | target = target.permute(1, 0) 118 | h = self.init_hidden(batch_size) 119 | 120 | loss = 0 121 | 122 | for i in range(seq_len): 123 | out, h = self.forward(inp[i], h) 124 | loss += loss_fn(out, target[i]) 
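# batchNLLLoss teacher-forces the generator one step at a time: inp[i] holds the previous
# ground-truth token, target[i] the token to predict, and the per-step NLLLoss values are
# accumulated, so the value returned below is the total negative log-likelihood of the batch.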
125 | 126 | return loss # per batch 127 | 128 | def batchPGLoss(self, inp, target, reward): 129 | batch_size, seq_len = inp.size() 130 | inp = inp.permute(1, 0) 131 | target = target.permute(1, 0) 132 | h = self.init_hidden(batch_size) 133 | 134 | loss = 0 135 | for i in range(seq_len): 136 | out, h = self.forward(inp[i], h) 137 | for j in range(batch_size): 138 | loss += -out[j][target.data[i][j]] * reward[j] 139 | 140 | return loss / batch_size 141 | 142 | 143 | class Discriminator(nn.Module): 144 | 145 | def __init__(self, embedding_dim, hidden_dim, vocab_size, max_seq_len, gpu=False, dropout=0.2): 146 | super(Discriminator, self).__init__() 147 | self.hidden_dim = hidden_dim 148 | self.embedding_dim = embedding_dim 149 | self.max_seq_len = max_seq_len 150 | self.gpu = gpu 151 | 152 | self.embeddings = nn.Embedding(vocab_size, embedding_dim) 153 | self.gru = nn.GRU(embedding_dim, hidden_dim, num_layers=2, bidirectional=True, dropout=dropout) 154 | self.gru2hidden = nn.Linear(2 * 2 * hidden_dim, hidden_dim) 155 | self.dropout_linear = nn.Dropout(p=dropout) 156 | self.hidden2out = nn.Linear(hidden_dim, 1) 157 | 158 | def init_hidden(self, batch_size): 159 | h = autograd.Variable(torch.zeros(2 * 2 * 1, batch_size, self.hidden_dim)) 160 | 161 | if self.gpu: 162 | return h.cuda() 163 | else: 164 | return h 165 | 166 | def forward(self, input, hidden): 167 | emb = self.embeddings(input) 168 | emb = emb.permute(1, 0, 2) 169 | _, hidden = self.gru(emb, hidden) 170 | hidden = hidden.permute(1, 0, 2).contiguous() 171 | out = self.gru2hidden(hidden.view(-1, 4 * self.hidden_dim)) 172 | out = torch.tanh(out) 173 | out = self.dropout_linear(out) 174 | out = self.hidden2out(out) 175 | out = torch.sigmoid(out) 176 | return out 177 | 178 | def batchClassify(self, inp): 179 | h = self.init_hidden(inp.size()[0]) 180 | out = self.forward(inp, h) 181 | return out.view(-1) 182 | 183 | def batchBCELoss(self, inp, target): 184 | loss_fn = nn.BCELoss() 185 | h = self.init_hidden(inp.size()[0]) 186 | out = self.forward(inp, h) 187 | return loss_fn(out, target) 188 | 189 | 190 | def prepare_generator_batch(samples, start_letter=0, gpu=False): 191 | batch_size, seq_len = samples.size() 192 | inp = torch.zeros(batch_size, seq_len) 193 | target = samples 194 | inp[:, 0] = start_letter 195 | inp[:, 1:] = target[:, :seq_len - 1] 196 | 197 | inp = inp.type(torch.LongTensor) 198 | target = target.type(torch.LongTensor) 199 | 200 | if gpu: 201 | inp = inp.cuda() 202 | target = target.cuda() 203 | 204 | return inp, target 205 | 206 | 207 | def prepare_discriminator_data(pos_samples, neg_samples, gpu=False): 208 | inp = torch.cat((pos_samples, neg_samples), 0).type(torch.LongTensor) 209 | target = torch.ones(pos_samples.size()[0] + neg_samples.size()[0]) 210 | target[pos_samples.size()[0]:] = 0 211 | perm = torch.randperm(target.size()[0]) 212 | target = target[perm] 213 | inp = inp[perm] 214 | if gpu: 215 | inp = inp.cuda() 216 | target = target.cuda() 217 | 218 | return inp, target 219 | 220 | 221 | def batchwise_sample(gen, num_samples, batch_size): 222 | samples = [] 223 | for i in range(int(ceil(num_samples / float(batch_size)))): 224 | samples.append(gen.sample(batch_size)) 225 | 226 | return torch.cat(samples, 0)[:num_samples] 227 | 228 | 229 | def batchwise_oracle_nll(gen, oracle, num_samples, batch_size, max_seq_len, start_letter=0, gpu=False): 230 | s = batchwise_sample(gen, num_samples, batch_size) 231 | oracle_nll = 0 232 | for i in range(0, num_samples, batch_size): 233 | inp, target = 
prepare_generator_batch(s[i:i + batch_size], start_letter, gpu) 234 | oracle_loss = oracle.batchNLLLoss(inp, target) / max_seq_len 235 | oracle_nll += oracle_loss.data.item() 236 | 237 | return oracle_nll / (num_samples / batch_size) 238 | 239 | 240 | def train_generator_MLE(gen, gen_opt, oracle, real_data_samples, epochs): 241 | for epoch in range(epochs): 242 | print('epoch %d : ' % (epoch + 1), end='') 243 | sys.stdout.flush() 244 | total_loss = 0 245 | 246 | for i in range(0, POS_NEG_SAMPLES, BATCH_SIZE): 247 | inp, target = prepare_generator_batch(real_data_samples[i:i + BATCH_SIZE], start_letter=START_LETTER, 248 | gpu=CUDA) 249 | gen_opt.zero_grad() 250 | loss = gen.batchNLLLoss(inp, target) 251 | loss.backward() 252 | gen_opt.step() 253 | 254 | total_loss += loss.data.item() 255 | 256 | if (i / BATCH_SIZE) % ceil( 257 | ceil(POS_NEG_SAMPLES / float(BATCH_SIZE)) / 10.) == 0: 258 | print('.', end='') 259 | sys.stdout.flush() 260 | 261 | total_loss = total_loss / ceil(POS_NEG_SAMPLES / float(BATCH_SIZE)) / MAX_SEQ_LEN 262 | 263 | print(' average_train_NLL = %.4f' % (total_loss)) 264 | 265 | 266 | def train_generator_PG(gen, gen_opt, oracle, dis, num_batches): 267 | for batch in range(num_batches): 268 | s = gen.sample(BATCH_SIZE * 2) 269 | inp, target = prepare_generator_batch(s, start_letter=START_LETTER, gpu=CUDA) 270 | rewards = dis.batchClassify(target) 271 | 272 | gen_opt.zero_grad() 273 | pg_loss = gen.batchPGLoss(inp, target, rewards) 274 | pg_loss.backward() 275 | gen_opt.step() 276 | 277 | 278 | def train_discriminator(discriminator, dis_opt, real_data_samples, generator, oracle, d_steps, epochs): 279 | indice = random.sample(range(len(real_data_samples)), 100) 280 | indice = torch.tensor(indice) 281 | pos_val = real_data_samples[indice] 282 | neg_val = generator.sample(100) 283 | val_inp, val_target = prepare_discriminator_data(pos_val, neg_val, gpu=CUDA) 284 | 285 | for d_step in range(d_steps): 286 | s = batchwise_sample(generator, POS_NEG_SAMPLES, BATCH_SIZE) 287 | dis_inp, dis_target = prepare_discriminator_data(real_data_samples, s, gpu=CUDA) 288 | for epoch in range(epochs): 289 | print('d-step %d epoch %d : ' % (d_step + 1, epoch + 1), end='') 290 | sys.stdout.flush() 291 | total_loss = 0 292 | total_acc = 0 293 | 294 | for i in range(0, 2 * POS_NEG_SAMPLES, BATCH_SIZE): 295 | inp, target = dis_inp[i:i + BATCH_SIZE], dis_target[i:i + BATCH_SIZE] 296 | dis_opt.zero_grad() 297 | out = discriminator.batchClassify(inp) 298 | loss_fn = nn.BCELoss() 299 | loss = loss_fn(out, target) 300 | loss.backward() 301 | dis_opt.step() 302 | 303 | total_loss += loss.data.item() 304 | total_acc += torch.sum((out > 0.5) == (target > 0.5)).data.item() 305 | 306 | if (i / BATCH_SIZE) % ceil(ceil(2 * POS_NEG_SAMPLES / float( 307 | BATCH_SIZE)) / 10.) 
== 0: 308 | print('.', end='') 309 | sys.stdout.flush() 310 | 311 | total_loss /= ceil(2 * POS_NEG_SAMPLES / float(BATCH_SIZE)) 312 | total_acc /= float(2 * POS_NEG_SAMPLES) 313 | 314 | val_pred = discriminator.batchClassify(val_inp) 315 | print(' average_loss = %.4f, train_acc = %.4f, val_acc = %.4f' % ( 316 | total_loss, total_acc, torch.sum((val_pred > 0.5) == (val_target > 0.5)).data.item() / 200.)) 317 | 318 | loss_d.append(total_loss) 319 | 320 | 321 | CUDA = torch.cuda.is_available() 322 | 323 | VOCAB_SIZE = 26 324 | MAX_SEQ_LEN = 18 325 | START_LETTER = 0 326 | POS_NEG_SAMPLES = len(all_data) 327 | GEN_EMBEDDING_DIM = 3 328 | GEN_HIDDEN_DIM = 128 329 | DIS_EMBEDDING_DIM = 3 330 | DIS_HIDDEN_DIM = 128 331 | num_outputs = 20000 332 | 333 | if __name__ == '__main__': 334 | def euclidean_distance(a, b): 335 | return np.linalg.norm(a - b) 336 | 337 | oracle = Generator(GEN_EMBEDDING_DIM, GEN_HIDDEN_DIM, VOCAB_SIZE, MAX_SEQ_LEN, gpu=CUDA, oracle_init=True) 338 | 339 | gen = Generator(GEN_EMBEDDING_DIM, GEN_HIDDEN_DIM, VOCAB_SIZE, MAX_SEQ_LEN, gpu=CUDA) 340 | dis = Discriminator(DIS_EMBEDDING_DIM, DIS_HIDDEN_DIM, VOCAB_SIZE, MAX_SEQ_LEN, gpu=CUDA) 341 | 342 | loss_g = [] 343 | loss_d = [] 344 | 345 | if CUDA: 346 | oracle = oracle.cuda() 347 | gen = gen.cuda() 348 | dis = dis.cuda() 349 | 350 | oracle_samples = torch.Tensor(all_data).type(torch.LongTensor) 351 | 352 | oracle_samples = oracle_samples.cuda() 353 | 354 | else: 355 | oracle_samples = torch.IntTensor(all_data).type(torch.LongTensor) 356 | 357 | gen.load_state_dict(torch.load(r'weight/gen_500.pth', map_location=torch.device('cpu'))) 358 | dis.load_state_dict(torch.load(r'weight/dis_500.pth', map_location=torch.device('cpu'))) 359 | 360 | gen.eval() 361 | dis.eval() 362 | 363 | a, b = gen.sample(num_outputs) 364 | a = a.tolist() 365 | b = b.tolist() 366 | 367 | f = open('outputs.txt', 'w+') 368 | f_seq = open("seq.txt", 'w+') 369 | print('\nGeneration Start') 370 | 371 | 372 | 373 | 374 | 375 | 376 | for i in range(num_outputs): 377 | seq = (vector_to_sequence(a[i])) 378 | percent = (b[i]) 379 | percent = np.array(percent) 380 | percent = np.round(percent, 4) 381 | percent = list(percent) 382 | ALP = sum(percent) / len(percent) 383 | 384 | seq = re.sub('[X]+$', '', seq) 385 | check_x = re.search('[0]', seq) 386 | f.write("%.2f" % ALP + ">" + str(i) + ">" + seq + ">" + str(percent) + '\n') 387 | f_seq.write(seq+'\n') 388 | print('\nGeneration Finished') 389 | -------------------------------------------------------------------------------- /gan_update.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.optim as optim 3 | import pandas as pd 4 | import numpy as np 5 | import gan_diff # 导入原始模型的定义,确保与之前训练的模型相匹配 6 | import tqdm 7 | import sys 8 | 9 | CUDA = torch.cuda.is_available() 10 | AX_SEQ_LEN = 18 11 | BATCH_SIZE = 512 12 | data = pd.read_csv('selected_data.csv') 13 | all_sequences = np.asarray(data['sequence']) 14 | all_data = [] 15 | for i in range(len(all_sequences)): 16 | all_data.append(gan_diff.sequence_to_vector(all_sequences[i])) 17 | 18 | VOCAB_SIZE = 26 19 | OCAB_SIZE = 26 20 | MAX_SEQ_LEN = 18 21 | START_LETTER = 0 22 | POS_NEG_SAMPLES = len(all_data) 23 | BATCH_SIZE = 16 24 | ADV_TRAIN_EPOCHS = 1 # 原始100 25 | MLE_TRAIN_EPOCHS = 1 # 原始50 26 | GEN_EMBEDDING_DIM = 3 27 | GEN_HIDDEN_DIM = 128 28 | NUM_PG_BATCHES = 1 29 | GEN_lr = 0.00005 30 | dis_lr = 0.00005 31 | DIS_EMBEDDING_DIM = 3 32 | DIS_HIDDEN_DIM = 128 33 | D_STEPS = 1 # 原始30 34 | D_EPOCHS = 1 # 
原始10 35 | ADV_D_EPOCHS = 5 36 | ADV_D_STEPS = 1 37 | 38 | gen_model = 'gen_500.pth' 39 | dis_model = 'dis_500.pth' 40 | 41 | if __name__ == '__main__': 42 | oracle = gan_diff.Generator(GEN_EMBEDDING_DIM, GEN_HIDDEN_DIM, VOCAB_SIZE, MAX_SEQ_LEN, gpu=CUDA, oracle_init=True) 43 | gen = gan_diff.Generator(GEN_EMBEDDING_DIM, GEN_HIDDEN_DIM, VOCAB_SIZE, MAX_SEQ_LEN, gpu=CUDA).cuda() 44 | dis = gan_diff.Discriminator(DIS_EMBEDDING_DIM, DIS_HIDDEN_DIM, VOCAB_SIZE, MAX_SEQ_LEN, gpu=CUDA).cuda() 45 | 46 | # loss_g = [] 47 | # loss_d = [] 48 | 49 | if CUDA: 50 | oracle = oracle.cuda() 51 | gen = gen.cuda() 52 | dis = dis.cuda() 53 | 54 | oracle_samples = torch.Tensor(all_data).type(torch.LongTensor) 55 | oracle_samples = oracle_samples.cuda() 56 | else: 57 | oracle_samples = torch.IntTensor(all_data).type(torch.LongTensor) 58 | gen.load_state_dict(torch.load(r'models/gen_500.pth', map_location=torch.device('cpu'))) 59 | dis.load_state_dict(torch.load(r'models/dis_500.pth', map_location=torch.device('cpu'))) 60 | 61 | print('Starting Generator MLE Training...') 62 | gen_optimizer = optim.Adam(gen.parameters(), lr=GEN_lr) 63 | gan_diff.train_generator_MLE(gen, gen_optimizer, oracle, oracle_samples, MLE_TRAIN_EPOCHS) 64 | print('Finished Generator MLE Training...') 65 | 66 | print('\nStarting Discriminator Training...') 67 | dis_optimizer = optim.Adam(dis.parameters(), lr=dis_lr) # adagrad 68 | gan_diff.train_discriminator(dis, dis_optimizer, oracle_samples, gen, oracle, D_STEPS, D_EPOCHS) 69 | 70 | print('\nStarting Adversarial Training...') 71 | for epoch in range(ADV_TRAIN_EPOCHS): 72 | print('\n--------\nEPOCH %d\n--------' % (epoch + 1)) 73 | print('\nAdversarial Training Generator : ', end='') 74 | sys.stdout.flush() 75 | gan_diff.train_generator_PG(gen, gen_optimizer, oracle, dis, NUM_PG_BATCHES) 76 | print('\nAdversarial Training Discriminator : ') 77 | gan_diff.train_discriminator(dis, dis_optimizer, oracle_samples, gen, oracle, ADV_D_STEPS, ADV_D_EPOCHS) 78 | 79 | torch.save(gen.state_dict(), './models/' + gen_model) 80 | torch.save(dis.state_dict(), './models/' + dis_model) 81 | 82 | print('\n Update training completed successfully.Model saved.') 83 | -------------------------------------------------------------------------------- /generate_pos.py: -------------------------------------------------------------------------------- 1 | # 打开pos.txt文件并读取数据 2 | with open("pos.txt", "r") as file: 3 | data = file.readlines() 4 | 5 | # 创建一个输出文件 6 | output_file = open("AMPpos.fasta", "w") 7 | 8 | # 初始化计数器 9 | counter = 1 10 | 11 | # 遍历数据并将其写入文件 12 | for line in data: 13 | sequence = line.strip() # 移除行尾的换行符 14 | parts = sequence.split() 15 | sequence = parts[0] 16 | output_file.write(f">NO_{counter:06}\n{sequence}\n") 17 | counter += 1 18 | 19 | # 关闭文件 20 | output_file.close() 21 | -------------------------------------------------------------------------------- /model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wrab12/diff-amp/3ec897e415a3a4a0333dc7cefc8af8afc2122825/model.png -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | 2 | import math 3 | from math import sqrt 4 | import collections 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | 9 | 10 | class FocalLoss(nn.Module): 11 | def __init__(self, alpha=1, gamma=2, logits=False, reduce=True): 12 | super(FocalLoss, 
self).__init__() 13 | self.alpha = alpha 14 | self.gamma = gamma 15 | self.logits = logits 16 | self.reduce = reduce 17 | 18 | def forward(self, inputs, targets): 19 | if self.logits: 20 | BCE_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduce=False) 21 | else: 22 | BCE_loss = F.binary_cross_entropy(inputs, targets, reduce=False) 23 | pt = torch.exp(-BCE_loss) 24 | F_loss = self.alpha * (1-pt)**self.gamma * BCE_loss 25 | 26 | if self.reduce: 27 | return torch.mean(F_loss) 28 | else: 29 | return F_loss 30 | 31 | class PositionalEmbedding(nn.Module): 32 | ''' 33 | Modified from Annotated Transformer 34 | http://nlp.seas.harvard.edu/2018/04/03/attention.html 35 | ''' 36 | def __init__(self, d_model, max_len=1024): 37 | super(PositionalEmbedding, self).__init__() 38 | pe = torch.zeros((max_len, d_model), requires_grad=False).float() 39 | position = torch.arange(0, max_len).float().unsqueeze(1) 40 | div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)) 41 | pe[:, 0::2] = torch.sin(position * div_term) 42 | pe[:, 1::2] = torch.cos(position * div_term) 43 | pe = pe.unsqueeze(0) 44 | self.register_buffer('pe', pe) 45 | 46 | def forward(self, x): 47 | return self.pe[:, :x.size(1)] 48 | 49 | 50 | class InputPositionEmbedding(nn.Module): 51 | def __init__(self, vocab_size=None, embed_dim=None, dropout=0.1, 52 | init_weight=None, seq_len=None): 53 | super(InputPositionEmbedding, self).__init__() 54 | self.embed = nn.Embedding(vocab_size, embed_dim) 55 | self.dropout = nn.Dropout(dropout) 56 | self.position_embed = PositionalEmbedding(embed_dim, max_len=seq_len) 57 | self.reproject = nn.Identity() 58 | if init_weight is not None: 59 | self.embed = nn.Embedding.from_pretrained(init_weight) 60 | self.reproject = nn.Linear(init_weight.size(1), embed_dim) 61 | 62 | def forward(self, inputs): 63 | # print(inputs.size()) 64 | x = self.embed(inputs) 65 | # print(x.size()) 66 | x = x + self.position_embed(inputs) 67 | # print(x) 68 | x = self.reproject(x) 69 | x = self.dropout(x) 70 | return x 71 | 72 | 73 | 74 | 75 | 76 | 77 | class MultiHeadSelfAttention(nn.Module): 78 | dim_in: int # input dimension 79 | dim_k: int # key and query dimension 80 | dim_v: int # value dimension 81 | num_heads: int # number of heads, for each head, dim_* = dim_* // num_heads 82 | 83 | def __init__(self, dim_in, dim_k, dim_v, num_heads=8): 84 | super(MultiHeadSelfAttention, self).__init__() 85 | assert dim_k % num_heads == 0 and dim_v % num_heads == 0, "dim_k and dim_v must be multiple of num_heads" 86 | self.dim_in = dim_in 87 | self.dim_k = dim_k 88 | self.dim_v = dim_v 89 | self.num_heads = num_heads 90 | self.linear_q = nn.Linear(dim_in, dim_k, bias=False) 91 | self.linear_k = nn.Linear(dim_in, dim_k, bias=False) 92 | self.linear_v = nn.Linear(dim_in, dim_v, bias=False) 93 | self._norm_fact = 1 / sqrt(dim_k // num_heads) 94 | 95 | def forward(self, x): 96 | # x: tensor of shape (batch, n, dim_in) 97 | batch, n, dim_in = x.shape 98 | assert dim_in == self.dim_in 99 | 100 | nh = self.num_heads 101 | dk = self.dim_k // nh # dim_k of each head 102 | dv = self.dim_v // nh # dim_v of each head 103 | 104 | q = self.linear_q(x).reshape(batch, n, nh, dk).transpose(1, 2) # (batch, nh, n, dk) 105 | k = self.linear_k(x).reshape(batch, n, nh, dk).transpose(1, 2) # (batch, nh, n, dk) 106 | v = self.linear_v(x).reshape(batch, n, nh, dv).transpose(1, 2) # (batch, nh, n, dv) 107 | 108 | dist = torch.matmul(q, k.transpose(2, 3)) * self._norm_fact # batch, nh, n, n 109 | dist = 
torch.softmax(dist, dim=-1) # batch, nh, n, n 110 | 111 | att = torch.matmul(dist, v) # batch, nh, n, dv 112 | att = att.transpose(1, 2).reshape(batch, n, self.dim_v) # batch, n, dim_v 113 | return att 114 | 115 | 116 | 117 | class AggregateLayer(nn.Module): 118 | def __init__(self, d_model=None, dropout=0.1): 119 | super(AggregateLayer, self).__init__() 120 | self.attn = nn.Sequential(collections.OrderedDict([ 121 | ('layernorm', nn.LayerNorm(d_model)), 122 | ('fc', nn.Linear(d_model, 1, bias=False)), 123 | ('dropout', nn.Dropout(dropout)), 124 | ('softmax', nn.Softmax(dim=1)) 125 | ])) 126 | 127 | def forward(self, context): 128 | ''' 129 | Parameters 130 | ---------- 131 | context: token embedding from encoder (Transformer/LSTM) 132 | (batch_size, seq_len, embed_dim) 133 | ''' 134 | 135 | weight = self.attn(context) 136 | # (batch_size, seq_len, embed_dim).T * (batch_size, seq_len, 1) * -> 137 | # (batch_size, embed_dim, 1) 138 | output = torch.bmm(context.transpose(1, 2), weight) 139 | output = output.squeeze(2) 140 | return output 141 | 142 | 143 | 144 | class GlobalPredictor(nn.Module): 145 | def __init__(self, d_model=None, d_h=None, d_out=None, dropout=0.5): 146 | super(GlobalPredictor, self).__init__() 147 | self.predict_layer = nn.Sequential(collections.OrderedDict([ 148 | ('batchnorm', nn.BatchNorm1d(d_model)), 149 | ('fc1', nn.Linear(d_model, d_h)), 150 | ('tanh', nn.Tanh()), 151 | ('dropout', nn.Dropout(dropout)), 152 | ('fc2', nn.Linear(d_h, d_out)), 153 | ('sigmoid', nn.Sigmoid()) 154 | ])) 155 | 156 | def forward(self, x): 157 | x = self.predict_layer(x) 158 | return x 159 | 160 | 161 | # class SequenceLSTM(nn.Module): 162 | # """Container module with an encoder, a recurrent module, and a decoder.""" 163 | 164 | # def __init__(self, d_input=None, d_embed=20, d_model=128, 165 | # vocab_size=None, seq_len=None, 166 | # dropout=0.1, lstm_dropout=0, 167 | # nlayers=1, bidirectional=False, 168 | # proj_loc_config=None): 169 | class SequenceLSTM(nn.Module): 170 | """Container module with an encoder, a recurrent module, and a decoder.""" 171 | 172 | def __init__(self, d_input=None, d_embed=20, d_model=128, 173 | vocab_size=None, seq_len=None, 174 | dropout=0.1, lstm_dropout=0, 175 | nlayers=1, bidirectional=False,d_another_input=531,d_another_embed=128): 176 | super(SequenceLSTM, self).__init__() 177 | 178 | self.embed = InputPositionEmbedding(vocab_size=vocab_size, 179 | seq_len=seq_len, embed_dim=d_embed) 180 | 181 | self.lstm = nn.LSTM(input_size=d_input, 182 | hidden_size=d_model//2 if bidirectional else d_model, 183 | num_layers=nlayers, dropout=lstm_dropout, 184 | bidirectional=bidirectional) 185 | self.drop = nn.Dropout(dropout) 186 | self.proj_loc_layer = nn.Linear(d_another_input, d_another_embed) 187 | 188 | 189 | def forward(self, x, loc_feat=None): 190 | # print(x) 191 | x = self.embed(x) 192 | # print(x) 193 | if loc_feat is not None: 194 | loc_feat = self.proj_loc_layer(loc_feat) 195 | x = torch.cat([x, loc_feat], dim=2) 196 | # print(x) 197 | x = x.transpose(0, 1).contiguous() 198 | # print(x.size()) 199 | x, _ = self.lstm(x) 200 | # print(x.size()) 201 | x = x.transpose(0, 1).contiguous() 202 | # print(x.size()) 203 | x = self.drop(x) 204 | return x 205 | 206 | 207 | class SequenceCNNLSTM(nn.Module): 208 | """Container module with an encoder, a recurrent module, and a decoder.""" 209 | 210 | def __init__(self, d_input=None, d_model=128, 211 | vocab_size=None, seq_len=None, 212 | dropout=0.1, lstm_dropout=0, 213 | nlayers=1, 
bidirectional=False,d_another_h=[64,32],d_output=1): 214 | super(SequenceCNNLSTM, self).__init__() 215 | 216 | self.lstm = nn.LSTM(input_size=d_input, 217 | hidden_size=d_model//2 if bidirectional else d_model, 218 | num_layers=nlayers, dropout=lstm_dropout, 219 | bidirectional=bidirectional) 220 | self.conv1d = nn.Conv1d(in_channels=d_model,out_channels=d_another_h[0],kernel_size=3) 221 | 222 | self.relu=nn.ReLU() 223 | self.pooling=nn.MaxPool1d(kernel_size=seq_len-d_another_h[0]+1) 224 | 225 | self.drop = nn.Dropout(dropout) 226 | self.fc = nn.Linear(d_another_h[0], d_output) 227 | self.sigmoid=nn.Sigmoid() 228 | 229 | 230 | def forward(self, x): 231 | # print(x) 232 | x = x.transpose(0, 1).contiguous() 233 | x, _ = self.lstm(x) 234 | x = x.transpose(0, 1).contiguous() 235 | x = self.drop(x) 236 | x = x.permute(0,2,1) 237 | x = self.conv1d(x) 238 | x = self.relu(x) 239 | x = self.pooling(x) 240 | x = x.view(-1, x.size(1)) 241 | x = self.fc(x) 242 | x = self.sigmoid(x) 243 | return x 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | class SequenceCNN(nn.Module): 252 | """Container module with an encoder, a recurrent module, and a decoder.""" 253 | 254 | def __init__(self, d_input=None, 255 | vocab_size=None, seq_len=None, 256 | dropout=0.1,d_another_h=[64,32],d_output=1): 257 | super(SequenceCNN, self).__init__() 258 | 259 | 260 | self.conv1d = nn.Conv1d(in_channels=d_input,out_channels=d_another_h[0],kernel_size=3) 261 | 262 | self.relu=nn.ReLU() 263 | self.pooling=nn.MaxPool1d(kernel_size=seq_len-d_another_h[0]+1) 264 | 265 | self.drop = nn.Dropout(dropout) 266 | self.fc = nn.Linear(d_another_h[0], d_output) 267 | self.sigmoid=nn.Sigmoid() 268 | 269 | 270 | def forward(self, x): 271 | # print(x) 272 | x = x.permute(0,2,1) 273 | x = self.conv1d(x) 274 | x = self.relu(x) 275 | x = self.pooling(x) 276 | x = x.view(-1, x.size(1)) 277 | x = self.fc(x) 278 | x = self.sigmoid(x) 279 | return x 280 | 281 | 282 | class SequenceMultiCNN(nn.Module): 283 | """Container module with an encoder, a recurrent module, and a decoder.""" 284 | 285 | def __init__(self, d_input=None, 286 | vocab_size=None, seq_len=None, 287 | dropout=0.1,d_another_h=64,k_cnn=[2,3,4,5,6],d_output=1): 288 | super(SequenceMultiCNN, self).__init__() 289 | 290 | 291 | 292 | self.convs = nn.ModuleList([ 293 | nn.Sequential(nn.Conv1d(in_channels=d_input, 294 | out_channels=d_another_h, 295 | kernel_size=h), 296 | # nn.BatchNorm1d(num_features=config.feature_size), 297 | nn.ReLU(), 298 | nn.MaxPool1d(kernel_size=seq_len-h+1)) 299 | for h in k_cnn 300 | ]) 301 | 302 | self.drop = nn.Dropout(dropout) 303 | self.fc = nn.Linear(d_another_h*len(k_cnn), d_output) 304 | self.fcblock=nn.Sequential( 305 | nn.BatchNorm1d(d_another_h*len(k_cnn)), 306 | nn.LeakyReLU(), 307 | nn.Linear(d_another_h*len(k_cnn),128), 308 | nn.BatchNorm1d(128), 309 | nn.LeakyReLU(), 310 | nn.Linear(128,64), 311 | nn.BatchNorm1d(64), 312 | nn.LeakyReLU(), 313 | nn.Linear(64,1) 314 | ) 315 | self.sigmoid=nn.Sigmoid() 316 | 317 | 318 | 319 | def forward(self, x): 320 | # print(x) 321 | x = x.permute(0,2,1) 322 | out = [conv(x) for conv in self.convs] 323 | x = torch.cat(out, dim=1) 324 | 325 | x = x.view(-1, x.size(1)) 326 | x = self.fc(x) 327 | x = self.sigmoid(x) 328 | return x 329 | 330 | 331 | class SequenceMultiCNN_1(nn.Module): 332 | """Container module with an encoder, a recurrent module, and a decoder.""" 333 | 334 | def __init__(self, d_input=None, 335 | vocab_size=None, seq_len=None, 336 | dropout=0.1,d_another_h=64,k_cnn=[2,3,4,5,6],d_output=1): 337 | 
super(SequenceMultiCNN_1, self).__init__() 338 | 339 | 340 | 341 | self.convs = nn.ModuleList([ 342 | nn.Sequential(nn.Conv1d(in_channels=d_input, 343 | out_channels=d_another_h, 344 | kernel_size=h), 345 | # nn.BatchNorm1d(num_features=config.feature_size), 346 | nn.ReLU(), 347 | nn.MaxPool1d(kernel_size=seq_len-h+1)) 348 | for h in k_cnn 349 | ]) 350 | 351 | self.drop = nn.Dropout(dropout) 352 | self.fc = nn.Linear(d_another_h*len(k_cnn), d_output) 353 | self.fcblock=nn.Sequential( 354 | nn.BatchNorm1d(d_another_h*len(k_cnn)), 355 | nn.LeakyReLU(), 356 | nn.Linear(d_another_h*len(k_cnn),128), 357 | nn.BatchNorm1d(128), 358 | nn.LeakyReLU(), 359 | nn.Linear(128,64), 360 | nn.BatchNorm1d(64), 361 | nn.LeakyReLU(), 362 | nn.Linear(64,1) 363 | ) 364 | self.sigmoid=nn.Sigmoid() 365 | 366 | 367 | 368 | def forward(self, AAI_feat,onehot_feat,BLOSUM62_feat,PAAC_feat): 369 | # print(x) 370 | x=torch.cat([AAI_feat,onehot_feat,BLOSUM62_feat,PAAC_feat],dim=2) 371 | # print(x.size()) 372 | x = x.permute(0,2,1) 373 | out = [conv(x) for conv in self.convs] 374 | x = torch.cat(out, dim=1) 375 | 376 | x = x.view(-1, x.size(1)) 377 | x = self.fcblock(x) 378 | x = self.sigmoid(x) 379 | return x 380 | 381 | 382 | class SequenceMultiCNNAGG(nn.Module): 383 | """Container module with an encoder, a recurrent module, and a decoder.""" 384 | 385 | def __init__(self, d_input=None, 386 | vocab_size=None, seq_len=None, 387 | dropout=0.1,d_another_h=64,k_cnn=[2,3,4,5,6],d_output=1): 388 | super(SequenceMultiCNNAGG, self).__init__() 389 | 390 | self.conv_1=nn.Conv1d(in_channels=d_input, out_channels=d_another_h, kernel_size=k_cnn[0]) 391 | self.conv_2=nn.Conv1d(in_channels=d_input, 392 | out_channels=d_another_h, 393 | kernel_size=k_cnn[1]) 394 | self.conv_3=nn.Conv1d(in_channels=d_input, 395 | out_channels=d_another_h, 396 | kernel_size=k_cnn[2]) 397 | self.conv_4=nn.Conv1d(in_channels=d_input, 398 | out_channels=d_another_h, 399 | kernel_size=k_cnn[3]) 400 | self.conv_5=nn.Conv1d(in_channels=d_input, 401 | out_channels=d_another_h, 402 | kernel_size=k_cnn[4]) 403 | self.relu_1=nn.ReLU() 404 | self.relu_2=nn.ReLU() 405 | self.relu_3=nn.ReLU() 406 | self.relu_4=nn.ReLU() 407 | self.relu_5=nn.ReLU() 408 | self.agg_1=AggregateLayer(d_another_h) 409 | self.agg_2=AggregateLayer(d_another_h) 410 | self.agg_3=AggregateLayer(d_another_h) 411 | self.agg_4=AggregateLayer(d_another_h) 412 | self.agg_5=AggregateLayer(d_another_h) 413 | 414 | 415 | self.drop = nn.Dropout(dropout) 416 | self.fc = nn.Linear(d_another_h*len(k_cnn), d_output) 417 | self.sigmoid=nn.Sigmoid() 418 | 419 | 420 | 421 | def forward(self, x): 422 | # print(x) 423 | x = x.permute(0,2,1) 424 | out_1= self.conv_1(x) 425 | out_2= self.conv_2(x) 426 | out_3= self.conv_3(x) 427 | out_4= self.conv_4(x) 428 | out_5= self.conv_5(x) 429 | out_1=out_1.permute(0,2,1) 430 | out_2=out_2.permute(0,2,1) 431 | out_3=out_3.permute(0,2,1) 432 | out_4=out_4.permute(0,2,1) 433 | out_5=out_5.permute(0,2,1) 434 | # print(out_1.size()) 435 | out_1=self.agg_1(out_1) 436 | # print(out_1.size()) 437 | out_2=self.agg_1(out_2) 438 | out_3=self.agg_1(out_3) 439 | out_4=self.agg_1(out_4) 440 | out_5=self.agg_1(out_5) 441 | out = torch.cat([out_1,out_2,out_3,out_4,out_5], dim=1) 442 | # print(out.size()) 443 | # x = x.view(-1, x.size(1)) 444 | out = self.fc(out) 445 | out = self.sigmoid(out) 446 | return out 447 | 448 | 449 | 450 | class SequenceMultiTypeMultiCNN(nn.Module): 451 | """Container module with an encoder, a recurrent module, and a decoder.""" 452 | 453 | def __init__(self, 
d_input=[531,21,23,3], 454 | vocab_size=None, seq_len=None, 455 | dropout=0.1,d_another_h=64,k_cnn=[2,3,4,5,6],d_output=1): 456 | super(SequenceMultiTypeMultiCNN, self).__init__() 457 | 458 | 459 | 460 | self.convs_1 = nn.ModuleList([ 461 | nn.Sequential(nn.Conv1d(in_channels=d_input[0], 462 | out_channels=d_another_h, 463 | kernel_size=h), 464 | nn.BatchNorm1d(num_features=d_another_h), 465 | nn.ReLU(), 466 | nn.MaxPool1d(kernel_size=seq_len-h+1)) 467 | for h in k_cnn 468 | ]) 469 | self.convs_2 = nn.ModuleList([ 470 | nn.Sequential(nn.Conv1d(in_channels=d_input[1], 471 | out_channels=d_another_h, 472 | kernel_size=h), 473 | nn.BatchNorm1d(num_features=d_another_h), 474 | nn.ReLU(), 475 | nn.MaxPool1d(kernel_size=seq_len-h+1)) 476 | for h in k_cnn 477 | ]) 478 | self.convs_3 = nn.ModuleList([ 479 | nn.Sequential(nn.Conv1d(in_channels=d_input[2], 480 | out_channels=d_another_h, 481 | kernel_size=h), 482 | nn.BatchNorm1d(num_features=d_another_h), 483 | nn.ReLU(), 484 | nn.MaxPool1d(kernel_size=seq_len-h+1)) 485 | for h in k_cnn 486 | ]) 487 | # self.convs_4 = nn.ModuleList([ 488 | # nn.Sequential(nn.Conv1d(in_channels=d_input[3], 489 | # out_channels=d_another_h, 490 | # kernel_size=h), 491 | # nn.BatchNorm1d(num_features=d_another_h), 492 | # nn.ReLU(), 493 | # nn.MaxPool1d(kernel_size=seq_len-h+1)) 494 | # for h in k_cnn 495 | # ]) 496 | self.maxpool_1=nn.MaxPool1d(kernel_size=5) 497 | self.maxpool_2=nn.MaxPool1d(kernel_size=5) 498 | self.maxpool_3=nn.MaxPool1d(kernel_size=5) 499 | # self.maxpool_4=nn.MaxPool1d(kernel_size=5) 500 | self.drop = nn.Dropout(dropout) 501 | self.batchnorm1d=nn.BatchNorm1d(num_features=d_another_h*3) 502 | self.fc = nn.Linear(d_another_h*3, d_output) 503 | self.sigmoid=nn.Sigmoid() 504 | 505 | 506 | 507 | def forward(self, AAI_feat,onehot_feat,BLOSUM62_feat): 508 | # print(x) 509 | AAI_feat = AAI_feat.permute(0,2,1) 510 | out_1 = [conv(AAI_feat) for conv in self.convs_1] 511 | onehot_feat = onehot_feat.permute(0,2,1) 512 | out_2 = [conv(onehot_feat) for conv in self.convs_2] 513 | BLOSUM62_feat = BLOSUM62_feat.permute(0,2,1) 514 | out_3 = [conv(BLOSUM62_feat) for conv in self.convs_3] 515 | 516 | out_1 = torch.cat(out_1, dim=2) 517 | # print(out_1.size()) 518 | out_1=self.maxpool_1(out_1) 519 | # print(out_1.size()) 520 | out_2 = torch.cat(out_2, dim=2) 521 | out_2=self.maxpool_2(out_2) 522 | out_3 = torch.cat(out_3, dim=2) 523 | out_3=self.maxpool_3(out_3) 524 | 525 | x=torch.cat([out_1,out_2,out_3],dim=1) 526 | x = x.view(-1, x.size(1)) 527 | x = self.batchnorm1d(x) 528 | x = self.fc(x) 529 | x = self.sigmoid(x) 530 | return x 531 | 532 | 533 | class SequenceMultiTypeMultiCNN_1(nn.Module): 534 | """Container module with an encoder, a recurrent module, and a decoder.""" 535 | 536 | def __init__(self, d_input=[531,21,23,3], 537 | vocab_size=None, seq_len=None, 538 | dropout=0.1,d_another_h=64,k_cnn=[2,3,4,5,6],d_output=1): 539 | super(SequenceMultiTypeMultiCNN_1, self).__init__() 540 | 541 | self.batchnorm_4=nn.BatchNorm1d(num_features=d_input[3]) 542 | self.convs_1 = nn.ModuleList([ 543 | nn.Sequential(nn.Conv1d(in_channels=d_input[0], 544 | out_channels=d_another_h, 545 | kernel_size=h), 546 | nn.BatchNorm1d(num_features=d_another_h), 547 | nn.ReLU(), 548 | nn.MaxPool1d(kernel_size=seq_len-h+1)) 549 | for h in k_cnn 550 | ]) 551 | self.convs_2 = nn.ModuleList([ 552 | nn.Sequential(nn.Conv1d(in_channels=d_input[1], 553 | out_channels=d_another_h, 554 | kernel_size=h), 555 | nn.BatchNorm1d(num_features=d_another_h), 556 | nn.ReLU(), 557 | 
nn.MaxPool1d(kernel_size=seq_len-h+1)) 558 | for h in k_cnn 559 | ]) 560 | self.convs_3 = nn.ModuleList([ 561 | nn.Sequential(nn.Conv1d(in_channels=d_input[2], 562 | out_channels=d_another_h, 563 | kernel_size=h), 564 | nn.BatchNorm1d(num_features=d_another_h), 565 | nn.ReLU(), 566 | nn.MaxPool1d(kernel_size=seq_len-h+1)) 567 | for h in k_cnn 568 | ]) 569 | self.convs_4 = nn.ModuleList([ 570 | nn.Sequential(nn.Conv1d(in_channels=d_input[3], 571 | out_channels=d_another_h, 572 | kernel_size=h), 573 | nn.BatchNorm1d(num_features=d_another_h), 574 | nn.ReLU(), 575 | nn.MaxPool1d(kernel_size=seq_len-h+1)) 576 | for h in k_cnn 577 | ]) 578 | self.maxpool_1=nn.MaxPool1d(kernel_size=len(k_cnn)) 579 | self.maxpool_2=nn.MaxPool1d(kernel_size=len(k_cnn)) 580 | self.maxpool_3=nn.MaxPool1d(kernel_size=len(k_cnn)) 581 | self.maxpool_4=nn.MaxPool1d(kernel_size=len(k_cnn)) 582 | # self.maxpool_1=nn.AvgPool1d(kernel_size=5) 583 | # self.maxpool_2=nn.AvgPool1d(kernel_size=5) 584 | # self.maxpool_3=nn.AvgPool1d(kernel_size=5) 585 | # self.maxpool_4=nn.AvgPool1d(kernel_size=5) 586 | self.drop = nn.Dropout(dropout) 587 | 588 | self.fc_1 = nn.Linear(d_another_h*len(k_cnn), d_output) 589 | self.fc_2 = nn.Linear(d_another_h*len(k_cnn), d_output) 590 | self.fc_3 = nn.Linear(d_another_h*len(k_cnn), d_output) 591 | self.fc_4 = nn.Linear(d_another_h*len(k_cnn), d_output) 592 | self.fc = nn.Linear(4*d_another_h, d_output) 593 | self.sigmoid=nn.Sigmoid() 594 | 595 | 596 | 597 | def forward(self, AAI_feat,onehot_feat,BLOSUM62_feat,PAAC_feat): 598 | # print(x) 599 | AAI_feat = AAI_feat.permute(0,2,1) 600 | out_1 = [conv(AAI_feat) for conv in self.convs_1] 601 | onehot_feat = onehot_feat.permute(0,2,1) 602 | out_2 = [conv(onehot_feat) for conv in self.convs_2] 603 | BLOSUM62_feat = BLOSUM62_feat.permute(0,2,1) 604 | out_3 = [conv(BLOSUM62_feat) for conv in self.convs_3] 605 | 606 | PAAC_feat = PAAC_feat.permute(0,2,1) 607 | PAAC_feat = self.batchnorm_4(PAAC_feat) 608 | out_4 = [conv(PAAC_feat) for conv in self.convs_4] 609 | out_1 = torch.cat(out_1, dim=2) 610 | # print(out_1.size()) 611 | out_1=self.maxpool_1(out_1) 612 | # print(out_1.size()) 613 | out_2 = torch.cat(out_2, dim=2) 614 | out_2=self.maxpool_2(out_2) 615 | out_3 = torch.cat(out_3, dim=2) 616 | out_3=self.maxpool_3(out_3) 617 | out_4 = torch.cat(out_4, dim=2) 618 | out_4=self.maxpool_4(out_4) 619 | # print(out_4.size()) 620 | x=torch.cat([out_1,out_2,out_3,out_4],dim=1) 621 | x = x.view(-1, x.size(1)) 622 | # out=torch.cat([out_1,out_2,out_3,out_4], dim=1) 623 | x = self.fc(x) 624 | x = self.sigmoid(x) 625 | return x 626 | 627 | 628 | 629 | 630 | class SequenceMultiTypeMultiCNN_2(nn.Module): 631 | """Container module with an encoder, a recurrent module, and a decoder.""" 632 | 633 | def __init__(self, d_input=[531,21,23,3], 634 | vocab_size=None, seq_len=None, 635 | dropout=0.1,d_another_h=64,k_cnn=[2,3,4,5,6],d_output=1): 636 | super(SequenceMultiTypeMultiCNN_2, self).__init__() 637 | 638 | self.batchnorm_4=nn.BatchNorm1d(num_features=d_input[3]) 639 | self.convs_1 = nn.ModuleList([ 640 | nn.Sequential(nn.Conv1d(in_channels=d_input[0], 641 | out_channels=d_another_h, 642 | kernel_size=h), 643 | nn.BatchNorm1d(num_features=d_another_h), 644 | nn.ReLU(), 645 | nn.MaxPool1d(kernel_size=seq_len-h+1)) 646 | for h in k_cnn 647 | ]) 648 | self.convs_2 = nn.ModuleList([ 649 | nn.Sequential(nn.Conv1d(in_channels=d_input[1], 650 | out_channels=d_another_h, 651 | kernel_size=h), 652 | nn.BatchNorm1d(num_features=d_another_h), 653 | nn.ReLU(), 654 | 
nn.MaxPool1d(kernel_size=seq_len-h+1)) 655 | for h in k_cnn 656 | ]) 657 | self.convs_3 = nn.ModuleList([ 658 | nn.Sequential(nn.Conv1d(in_channels=d_input[2], 659 | out_channels=d_another_h, 660 | kernel_size=h), 661 | nn.BatchNorm1d(num_features=d_another_h), 662 | nn.ReLU(), 663 | nn.MaxPool1d(kernel_size=seq_len-h+1)) 664 | for h in k_cnn 665 | ]) 666 | self.convs_4 = nn.ModuleList([ 667 | nn.Sequential(nn.Conv1d(in_channels=d_input[3], 668 | out_channels=d_another_h, 669 | kernel_size=h), 670 | nn.BatchNorm1d(num_features=d_another_h), 671 | nn.ReLU(), 672 | nn.MaxPool1d(kernel_size=seq_len-h+1)) 673 | for h in k_cnn 674 | ]) 675 | self.maxpool_1=nn.MaxPool1d(kernel_size=len(k_cnn)) 676 | self.maxpool_2=nn.MaxPool1d(kernel_size=len(k_cnn)) 677 | self.maxpool_3=nn.MaxPool1d(kernel_size=len(k_cnn)) 678 | self.maxpool_4=nn.MaxPool1d(kernel_size=len(k_cnn)) 679 | # self.maxpool_1=nn.AvgPool1d(kernel_size=5) 680 | # self.maxpool_2=nn.AvgPool1d(kernel_size=5) 681 | # self.maxpool_3=nn.AvgPool1d(kernel_size=5) 682 | # self.maxpool_4=nn.AvgPool1d(kernel_size=5) 683 | self.drop = nn.Dropout(dropout) 684 | 685 | self.fc_1 = nn.Linear(d_another_h*len(k_cnn), d_output) 686 | self.fc_2 = nn.Linear(d_another_h*len(k_cnn), d_output) 687 | self.fc_3 = nn.Linear(d_another_h*len(k_cnn), d_output) 688 | self.fc_4 = nn.Linear(d_another_h*len(k_cnn), d_output) 689 | self.fc = nn.Linear(4*d_another_h, d_output) 690 | self.sigmoid=nn.Sigmoid() 691 | 692 | 693 | 694 | def forward(self, AAI_feat,onehot_feat,BLOSUM62_feat,PAAC_feat): 695 | # print(x) 696 | AAI_feat = AAI_feat.permute(0,2,1) 697 | out_1 = [conv(AAI_feat) for conv in self.convs_1] 698 | onehot_feat = onehot_feat.permute(0,2,1) 699 | out_2 = [conv(onehot_feat) for conv in self.convs_2] 700 | BLOSUM62_feat = BLOSUM62_feat.permute(0,2,1) 701 | out_3 = [conv(BLOSUM62_feat) for conv in self.convs_3] 702 | 703 | PAAC_feat = PAAC_feat.permute(0,2,1) 704 | PAAC_feat = self.batchnorm_4(PAAC_feat) 705 | out_4 = [conv(PAAC_feat) for conv in self.convs_4] 706 | out_1 = torch.cat(out_1, dim=2) 707 | # print(out_1.size()) 708 | out_1=self.maxpool_1(out_1) 709 | # print(out_1.size()) 710 | out_2 = torch.cat(out_2, dim=2) 711 | out_2=self.maxpool_2(out_2) 712 | out_3 = torch.cat(out_3, dim=2) 713 | out_3=self.maxpool_3(out_3) 714 | out_4 = torch.cat(out_4, dim=2) 715 | out_4=self.maxpool_4(out_4) 716 | # print(out_4.size()) 717 | x=torch.cat([out_1,out_2,out_3,out_4],dim=1) 718 | x = x.view(-1, x.size(1)) 719 | # out=torch.cat([out_1,out_2,out_3,out_4], dim=1) 720 | x = self.fc(x) 721 | x = self.sigmoid(x) 722 | return x,out_1,out_2,out_3,out_4 723 | 724 | 725 | 726 | 727 | 728 | class SequenceMultiCNNLSTM(nn.Module): 729 | """Container module with an encoder, a recurrent module, and a decoder.""" 730 | 731 | def __init__(self, d_input=None, d_model=128, 732 | vocab_size=None, seq_len=None, 733 | dropout=0.1, lstm_dropout=0, 734 | nlayers=1, bidirectional=False,d_another_h=64,k_cnn=[2,3,4,5,6],d_output=1): 735 | super(SequenceMultiCNNLSTM, self).__init__() 736 | 737 | self.lstm = nn.LSTM(input_size=d_input, 738 | hidden_size=d_model//2 if bidirectional else d_model, 739 | num_layers=nlayers, dropout=lstm_dropout, 740 | bidirectional=bidirectional) 741 | 742 | self.convs = nn.ModuleList([ 743 | nn.Sequential(nn.Conv1d(in_channels=d_model, 744 | out_channels=d_another_h, 745 | kernel_size=h), 746 | # nn.BatchNorm1d(num_features=config.feature_size), 747 | nn.ReLU(), 748 | nn.MaxPool1d(kernel_size=seq_len-h+1)) 749 | for h in k_cnn 750 | ]) 751 | 752 | self.drop = 
nn.Dropout(dropout) 753 | self.fc = nn.Linear(d_another_h*len(k_cnn), d_output) 754 | self.sigmoid=nn.Sigmoid() 755 | 756 | 757 | 758 | def forward(self, x): 759 | # print(x) 760 | x = x.transpose(0, 1).contiguous() 761 | x, _ = self.lstm(x) 762 | x = x.transpose(0, 1).contiguous() 763 | x = self.drop(x) 764 | x = x.permute(0,2,1) 765 | out = [conv(x) for conv in self.convs] 766 | x = torch.cat(out, dim=1) 767 | 768 | x = x.view(-1, x.size(1)) 769 | x = self.fc(x) 770 | x = self.sigmoid(x) 771 | return x 772 | 773 | 774 | 775 | 776 | class TranformerModel(nn.Module): 777 | def __init__(self, vocab_size=24,hidden_dim=25,d_embed=512,max_length=500): 778 | super(TranformerModel,self).__init__() 779 | 780 | # self.embedding = nn.Embedding(vocab_size, d_embed, padding_idx=0) 781 | self.embed = InputPositionEmbedding(vocab_size=vocab_size, 782 | seq_len=max_length, embed_dim=d_embed) 783 | self.encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8) 784 | self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=1) 785 | 786 | self.gru = nn.GRU(d_embed, hidden_dim, num_layers=2, 787 | bidirectional=True, dropout=0.2) 788 | 789 | 790 | self.block1=nn.Sequential(nn.Linear(d_embed*max_length,1024), 791 | nn.BatchNorm1d(1024), 792 | nn.LeakyReLU(), 793 | nn.Linear(1024,256), 794 | ) 795 | 796 | self.block2=nn.Sequential( 797 | nn.BatchNorm1d(256), 798 | nn.LeakyReLU(), 799 | nn.Linear(256,128), 800 | nn.BatchNorm1d(128), 801 | nn.LeakyReLU(), 802 | nn.Linear(128,64), 803 | nn.BatchNorm1d(64), 804 | nn.LeakyReLU(), 805 | nn.Linear(64,1) 806 | ) 807 | self.sigmoid=nn.Sigmoid() 808 | def forward(self, x): 809 | x=self.embed(x) 810 | output=self.transformer_encoder(x).permute(1, 0, 2) 811 | # print(output.size()) 812 | # output,hn=self.gru(output) 813 | # print(output.size()) 814 | # print(hn.size()) 815 | output=output.permute(1,0,2) 816 | # hn=hn.permute(1,0,2) 817 | 818 | output=output.reshape(output.shape[0],-1) 819 | # hn=hn.reshape(output.shape[0],-1) 820 | # print(output.size()) 821 | # print(hn.size()) 822 | # output=torch.cat([output,hn],1) 823 | # print(output.size()) 824 | output=self.block1(output) 825 | output=self.block2(output) 826 | output=self.sigmoid(output) 827 | # print(output.size()) 828 | return output 829 | 830 | 831 | def __init__(self, config): 832 | super().__init__(config) 833 | self.bert = ProteinBertModel(config) 834 | self.embedding = nn.Embedding(21, 512, padding_idx=0) 835 | # self.embed = InputPositionEmbedding(vocab_size=21, 836 | # seq_len=300, embed_dim=512) 837 | self.encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8) 838 | self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=1) 839 | 840 | self.gru = nn.GRU(512, 25, num_layers=2, 841 | bidirectional=True, dropout=0.2) 842 | 843 | 844 | self.block1=nn.Sequential(nn.Linear(50*300+25*4,1024), 845 | nn.BatchNorm1d(1024), 846 | nn.LeakyReLU(), 847 | nn.Linear(1024,256), 848 | ) 849 | 850 | self.block2=nn.Sequential( 851 | nn.BatchNorm1d(256+768), 852 | nn.LeakyReLU(), 853 | nn.Linear(256,128), 854 | nn.BatchNorm1d(128), 855 | nn.LeakyReLU(), 856 | nn.Linear(128,64), 857 | nn.BatchNorm1d(64), 858 | nn.LeakyReLU(), 859 | nn.Linear(64,1) 860 | ) 861 | self.sigmoid=nn.Sigmoid() 862 | def forward(self, x, input_ids=None, input_mask=None,): 863 | outputs = self.bert(input_ids, input_mask=input_mask) 864 | 865 | sequence_output, pooled_output = outputs[:2] 866 | average = torch.mean(sequence_output, dim=1) 867 | x=self.embedding(x) 868 | 
output=self.transformer_encoder(x).permute(1, 0, 2) 869 | print(output.size()) 870 | output,hn=self.gru(output) 871 | print(output.size()) 872 | print(hn.size()) 873 | output=output.permute(1,0,2) 874 | hn=hn.permute(1,0,2) 875 | 876 | output=output.reshape(output.shape[0],-1) 877 | hn=hn.reshape(output.shape[0],-1) 878 | # print(output.size()) 879 | # print(hn.size()) 880 | output=torch.cat([output,hn],1) 881 | # print(output.size()) 882 | output=self.block1(output) 883 | output=torch.cat([output,average],1) 884 | output=self.block2(output) 885 | output=self.sigmoid(output) 886 | # print(output.size()) 887 | return output 888 | 889 | 890 | 891 | class TranformerModelNOGRU(nn.Module): 892 | def __init__(self, vocab_size=24,hidden_dim=25,d_embed=512,max_length=500): 893 | super(TranformerModelNOGRU,self).__init__() 894 | 895 | # self.embedding = nn.Embedding(vocab_size, d_embed, padding_idx=0) 896 | self.embed = InputPositionEmbedding(vocab_size=vocab_size, 897 | seq_len=max_length, embed_dim=d_embed) 898 | self.encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8) 899 | self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=1) 900 | 901 | self.gru = nn.GRU(d_embed, hidden_dim, num_layers=2, 902 | bidirectional=True, dropout=0.2) 903 | 904 | 905 | self.block1=nn.Sequential(nn.Linear(d_embed*max_length,1024), 906 | nn.BatchNorm1d(1024), 907 | nn.LeakyReLU(), 908 | nn.Linear(1024,256), 909 | ) 910 | 911 | self.block2=nn.Sequential( 912 | nn.BatchNorm1d(256), 913 | nn.LeakyReLU(), 914 | nn.Linear(256,128), 915 | nn.BatchNorm1d(128), 916 | nn.LeakyReLU(), 917 | nn.Linear(128,64), 918 | nn.BatchNorm1d(64), 919 | nn.LeakyReLU(), 920 | nn.Linear(64,1) 921 | ) 922 | self.sigmoid=nn.Sigmoid() 923 | def forward(self, x): 924 | x=self.embed(x) 925 | output=self.transformer_encoder(x).permute(1, 0, 2) 926 | # print(output.size()) 927 | # output,hn=self.gru(output) 928 | # print(output.size()) 929 | # print(hn.size()) 930 | output=output.permute(1,0,2) 931 | # hn=hn.permute(1,0,2) 932 | 933 | output=output.reshape(output.shape[0],-1) 934 | # hn=hn.reshape(output.shape[0],-1) 935 | # print(output.size()) 936 | # print(hn.size()) 937 | # output=torch.cat([output,hn],1) 938 | # print(output.size()) 939 | output=self.block1(output) 940 | output=self.block2(output) 941 | output=self.sigmoid(output) 942 | # print(output.size()) 943 | return output 944 | 945 | 946 | # class LSTMPredictor(nn.Module): 947 | # def __init__(self, d_embed=20, d_model=128, d_h=128, d_out=1, 948 | # vocab_size=None, seq_len=None, 949 | # dropout=0.1, lstm_dropout=0, nlayers=1, bidirectional=False, 950 | # use_loc_feat=True, use_glob_feat=True, 951 | # proj_loc_config=None, proj_glob_config=None): 952 | class LSTMPredictor(nn.Module): 953 | def __init__(self, d_embed=20, d_model=128, d_h=128, d_out=1, 954 | vocab_size=None, seq_len=None,use_loc_feat=True, 955 | dropout=0.1, lstm_dropout=0, nlayers=1, bidirectional=False, 956 | d_another_input=531,d_another_embed=128): 957 | super(LSTMPredictor, self).__init__() 958 | # self.seq_lstm = SequenceLSTM( 959 | # d_input=d_embed + (proj_loc_config['d_out'] if use_loc_feat else 0), 960 | # d_embed=d_embed, d_model=d_model, 961 | # vocab_size=vocab_size, seq_len=seq_len, 962 | # dropout=dropout, lstm_dropout=lstm_dropout, 963 | # nlayers=nlayers, bidirectional=bidirectional, 964 | # proj_loc_config=proj_loc_config) 965 | self.seq_lstm = SequenceLSTM( 966 | d_input=d_embed + (d_another_embed if use_loc_feat else 0), 967 | d_embed=d_embed, d_model=d_model, 968 
| vocab_size=vocab_size, seq_len=seq_len, 969 | dropout=dropout, lstm_dropout=lstm_dropout, 970 | nlayers=nlayers, bidirectional=bidirectional, 971 | d_another_input=d_another_input,d_another_embed=d_another_embed) 972 | 973 | # self.proj_glob_layer = proj_glob_config['layer']( 974 | # proj_glob_config['d_in'], proj_glob_config['d_out'] 975 | # ) 976 | # self.aggragator = AggregateLayer( 977 | # d_model = d_model + (proj_glob_config['d_out'] if use_glob_feat else 0)) 978 | self.aggragator = AggregateLayer( 979 | d_model = d_model) 980 | # self.predictor = GlobalPredictor( 981 | # d_model = d_model + (proj_glob_config['d_out'] if use_glob_feat else 0), 982 | # d_h=d_h, d_out=d_out) 983 | self.predictor = GlobalPredictor( 984 | d_model = d_model, 985 | d_h=d_h, d_out=d_out) 986 | 987 | def forward(self, x, glob_feat=None, loc_feat=None): 988 | x = self.seq_lstm(x, loc_feat=loc_feat) 989 | # print(x.shape) 990 | # if glob_feat is not None: 991 | # glob_feat = self.proj_glob_layer(glob_feat) 992 | # print(glob_feat.shape) 993 | # x = torch.cat([x, glob_feat], dim=2) 994 | # print(x.shape) 995 | x = self.aggragator(x) 996 | # print(x) 997 | output = self.predictor(x) 998 | # print(output.shape) 999 | return output 1000 | 1001 | 1002 | if __name__ == "__main__": 1003 | # model = LSTMPredictor( 1004 | # d_model=128, d_h=128, nlayers=1, 1005 | # vocab_size=21, seq_len=500,bidirectional=True, 1006 | # d_another_input=531,d_another_embed=128) 1007 | 1008 | model = SequenceMultiCNN_1(d_input=578, 1009 | vocab_size=21, seq_len=500, 1010 | dropout=0.1,d_another_h=64,d_output=1) 1011 | 1012 | # model = SequenceMultiCNNLSTM(d_input=531, d_model=128, 1013 | # vocab_size=21, seq_len=500, 1014 | # dropout=0.1, d_another_h=64,k_cnn=[2,3,4,5,6],d_output=1) 1015 | model = SequenceMultiTypeMultiCNN_1(d_input=[531,21,23,3], 1016 | vocab_size=21, seq_len=500, 1017 | dropout=0.1, d_another_h=64,k_cnn=[2,3,4,5,6],d_output=1) 1018 | x = torch.randint(0, 21, (128, 300)) 1019 | ids = torch.randint(0, 21, (128, 300)) 1020 | masks=torch.ones(128, 300) 1021 | # glob_feat = torch.rand((128, 500, 768)) 1022 | AAI_feat = torch.rand((128, 500, 531)) 1023 | onehot_feat = torch.rand((128, 500, 21)) 1024 | blosum62_feat = torch.rand((128, 500, 23)) 1025 | PAAC_feat = torch.rand((128, 500, 3)) 1026 | # y = model(x, glob_feat=glob_feat, loc_feat=loc_feat) 1027 | # print(y.size()) 1028 | 1029 | 1030 | # model=InputPositionEmbedding(vocab_size=21, 1031 | # seq_len=500, embed_dim=128) 1032 | # # model = LSTMPredictor( 1033 | # # d_model=128, d_h=128, nlayers=1, 1034 | # # vocab_size=21, seq_len=500, 1035 | # # proj_glob_config = {'layer':nn.Linear, 'd_in':768, 'd_out':128}, 1036 | # # proj_loc_config = {'layer':nn.Linear, 'd_in':500, 'd_out':128}, 1037 | # # ) 1038 | # x = torch.randint(0, 21, (128, 500)) 1039 | # glob_feat = torch.rand((128, 500, 768)) 1040 | # loc_feat = torch.rand((128, 500, 500)) 1041 | # y = model(x,loc_feat=loc_feat) 1042 | y = model(AAI_feat,onehot_feat,blosum62_feat,PAAC_feat) 1043 | # y = model(AAI_feat) 1044 | # y=model(x,ids,masks) 1045 | -------------------------------------------------------------------------------- /models/readme.txt: -------------------------------------------------------------------------------- 1 | Please download the corresponding file from the cloud storage and place it in this folder -------------------------------------------------------------------------------- /predict.py: -------------------------------------------------------------------------------- 1 | from Bio import 
SeqIO 2 | import os 3 | import torch 4 | import torch.nn as nn 5 | import torch.optim as optim 6 | import torch.nn.functional as F 7 | from tqdm import tqdm 8 | import numpy as np 9 | import scipy.stats 10 | import pathlib 11 | import copy 12 | import time 13 | import vocab 14 | from model import SequenceMultiTypeMultiCNN_1 15 | from data_feature import Dataset 16 | import pandas as pd 17 | import argparse 18 | from tqdm import tqdm 19 | 20 | device = torch.device("cuda:0") 21 | 22 | 23 | def return_y(data_iter, net): 24 | y_pred = [] 25 | 26 | all_seq = [] 27 | for batch in data_iter: 28 | all_seq += batch['sequence'] 29 | 30 | AAI_feat = batch['seq_enc_AAI'].to(device) 31 | onehot_feat = batch['seq_enc_onehot'].to(device) 32 | BLOSUM62_feat = batch['seq_enc_BLOSUM62'].to(device) 33 | PAAC_feat = batch['seq_enc_PAAC'].to(device) 34 | # bert_feat=batch['seq_enc_bert'].to(device) 35 | # bert_mask=batch['seq_enc_mask'].to(device) 36 | outputs = net(AAI_feat, onehot_feat, BLOSUM62_feat, PAAC_feat) 37 | # print(outputs.size())#32,1 38 | # outputs = model(x) 39 | y_pred.extend(outputs.cpu().numpy()) 40 | 41 | return y_pred, all_seq 42 | 43 | 44 | def testing(batch_size, patience, n_epochs, testfasta, seq_len, cdhit_value, cv_number, save_file, model_file): 45 | model = SequenceMultiTypeMultiCNN_1(d_input=[531, 21, 23, 3], vocab_size=21, seq_len=seq_len, 46 | dropout=0.1, d_another_h=128, k_cnn=[2, 3, 4, 5, 6], d_output=1).to(device) 47 | 48 | dataset = Dataset(fasta=testfasta) 49 | test_loader = dataset.get_dataloader(batch_size=batch_size, max_length=seq_len) 50 | 51 | model.load_state_dict(torch.load(model_file, map_location=torch.device('cpu'))['state_dict']) 52 | model.eval() 53 | with torch.no_grad(): 54 | new_y_pred, all_seq = return_y(test_loader, model) 55 | 56 | final_y_pred = copy.deepcopy(new_y_pred) 57 | 58 | final_y_pred = np.array(final_y_pred).T[0].tolist() 59 | 60 | pred_dict = {'seq': all_seq, 'predictions': final_y_pred} 61 | pred_df = pd.DataFrame(pred_dict) 62 | pred_df.to_csv(save_file, index=None) 63 | 64 | 65 | all_function_names = ['antibacterial', 'antigram-positive', 'antigram-negative', 'antifungal', 'antiviral', \ 66 | 'anti_mammalian_cells', 'antihiv', 'antibiofilm', 'anticancer', 'antimrsa', 'antiparasitic', \ 67 | 'hemolytic', 'chemotactic', 'antitb', 'anurandefense', 'cytotoxic', \ 68 | 'endotoxin', 'insecticidal', 'antimalarial', 'anticandida', 'antiplasmodial', 'antiprotozoal'] 69 | 70 | 71 | # os.environ['CUDA_LAUNCH_BLOCKING'] = 1 72 | 73 | 74 | def predict(test_file): 75 | fas_id = [] 76 | fas_seq = [] 77 | for seq_record in SeqIO.parse(test_file, "fasta"): 78 | fas_seq.append(str(seq_record.seq).upper()) 79 | fas_id.append(str(seq_record.id)) 80 | 81 | seq_len = 200 82 | batch_size = 32 83 | cdhit_value = 40 84 | vocab_size = len(vocab.AMINO_ACIDS) 85 | 86 | epochs = 300 87 | temp_save_AMP_filename = '%s ' % (time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())) 88 | for cv_number in tqdm(range(10)): 89 | testing(testfasta=fas_seq, 90 | model_file=f'models/AMP_1st/textcnn_cdhit_40_{cv_number}.pth.tar', 91 | save_file=f'tmp_save/{temp_save_AMP_filename}_{cv_number}.csv', 92 | batch_size=batch_size, patience=10, n_epochs=epochs, seq_len=seq_len, cdhit_value=cdhit_value 93 | , cv_number=cv_number) 94 | 95 | pred_prob = [] 96 | for cv_number in tqdm(range(10)): 97 | df = pd.read_csv(f'tmp_save/{temp_save_AMP_filename}_{cv_number}.csv') 98 | data = df.values.tolist() 99 | temp = [] 100 | for i in tqdm(range(len(data))): 101 | temp.append(data[i][1]) 102 | 
pred_prob.append(temp) 103 | pred_prob = np.average(pred_prob, 0) 104 | pred_AMP_label = [] 105 | for i in tqdm(range(len(pred_prob))): 106 | if pred_prob[i] > 0.5: 107 | pred_AMP_label.append('Yes') 108 | else: 109 | pred_AMP_label.append('No') 110 | 111 | for function_name in all_function_names: 112 | temp_dir_list = os.listdir('tmp_save') 113 | if function_name not in temp_dir_list: 114 | os.mkdir('tmp_save/' + function_name) 115 | for cv_number in tqdm(range(10)): 116 | testing(testfasta=fas_seq, 117 | model_file=f'models/AMP_2nd/{function_name}/textcnn_cdhit_100_0.pth.tar', 118 | save_file=f'tmp_save/{function_name}/{temp_save_AMP_filename}_{cv_number}.csv', 119 | batch_size=batch_size, patience=10, n_epochs=epochs, seq_len=seq_len, cdhit_value=cdhit_value 120 | , cv_number=cv_number) 121 | 122 | all_function_pred_label = [] 123 | for function_name in all_function_names: 124 | 125 | function_threshold_df = pd.read_csv(f'models/AMP_2nd_threashold/{function_name}_yd_threshold.csv', index_col=0) 126 | function_thresholds = function_threshold_df.values[:, 0] 127 | 128 | each_function_data = [] 129 | 130 | for cv_number in tqdm(range(10)): 131 | df = pd.read_csv(f'tmp_save/{function_name}/{temp_save_AMP_filename}_{cv_number}.csv') 132 | data = df.values.tolist() 133 | temp = [] 134 | for i in tqdm(range(len(data))): 135 | 136 | if data[i][1] > function_thresholds[cv_number]: 137 | temp.append(1) 138 | else: 139 | temp.append(0) 140 | each_function_data.append(temp) 141 | each_function_data = np.average(each_function_data, 0) 142 | pred_each_function_label = [] 143 | for i in tqdm(range(len(each_function_data))): 144 | if each_function_data[i] > 0.5: 145 | pred_each_function_label.append('Yes') 146 | else: 147 | pred_each_function_label.append('No') 148 | 149 | all_function_pred_label.append(pred_each_function_label) 150 | 151 | all_function_cols = ['antibacterial', 'anti-Gram-positive', 'anti-Gram-negative', 'antifungal', 'antiviral', \ 152 | 'anti-mammalian-cells', 'anti-HIV', 'antibiofilm', 'anticancer', 'anti-MRSA', 'antiparasitic', \ 153 | 'hemolytic', 'chemotactic', 'anti-TB', 'anurandefense', 'cytotoxic', \ 154 | 'endotoxin', 'insecticidal', 'antimalarial', 'anticandida', 'antiplasmodial', 'antiprotozoal'] 155 | 156 | pred_contents_dict = {'name': fas_id, 'sequence': fas_seq, 'AMP': pred_AMP_label} 157 | for i in tqdm(range(len(all_function_cols))): 158 | pred_contents_dict[all_function_cols[i]] = all_function_pred_label[i] 159 | 160 | pred_contents_df = pd.DataFrame(pred_contents_dict) 161 | 162 | for function_name in all_function_names: 163 | for cv_number in tqdm(range(10)): 164 | os.remove(f'tmp_save/{function_name}/{temp_save_AMP_filename}_{cv_number}.csv') 165 | for cv_number in tqdm(range(10)): 166 | os.remove(f'tmp_save/{temp_save_AMP_filename}_{cv_number}.csv') 167 | 168 | return pred_contents_df 169 | # master.insert_one({'Test Report': res_val}) 170 | 171 | 172 | if __name__ == '__main__': 173 | parser = argparse.ArgumentParser(description='proposed model') 174 | 175 | parser.add_argument('-output_file_name', default='prediction_output', type=str) 176 | 177 | parser.add_argument('-test_fasta_file', default='AMPpos.fasta', type=str) 178 | args = parser.parse_args() 179 | 180 | output_file_name = args.output_file_name 181 | test_file = args.test_fasta_file 182 | flag = 0 183 | 184 | 185 | if flag == 0: 186 | pred_df = predict(test_file) 187 | pred_df.to_csv(output_file_name + '.csv') 188 | -------------------------------------------------------------------------------- 
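
Usage note for predict.py: it wraps a two-stage ensemble (ten first-stage AMP classifiers averaged at a 0.5 cut-off, then per-activity classifiers compared against the per-fold thresholds stored under models/AMP_2nd_threashold and majority-voted). The snippet below is a minimal sketch of how the script could be invoked and post-processed from Python, assuming the pretrained checkpoints have already been downloaded into models/ as described in models/readme.txt; it mirrors the call made in run.sh.

# Sketch: run the two-stage AMP predictor on a FASTA file and keep the
# sequences that the ensemble labels as antibacterial AMPs.
# Assumes the checkpoints under models/ have been downloaded beforehand.
import subprocess
import pandas as pd

subprocess.run(
    ["python", "predict.py",
     "-test_fasta_file", "examples/samples.fasta",
     "-output_file_name", "prediction_results"],
    check=True,
)

df = pd.read_csv("prediction_results.csv", index_col=0)
# 'AMP' holds the first-stage verdict; the remaining columns are per-activity labels.
candidates = df[(df["AMP"] == "Yes") & (df["antibacterial"] == "Yes")]
print(candidates[["name", "sequence"]])
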
/requirements.txt: -------------------------------------------------------------------------------- 1 | Bio==1.5.9 2 | biopython==1.81 3 | gradio==3.47.1 4 | numpy==1.23.5 5 | pandas==1.5.3 6 | scikit_learn==1.2.2 7 | scipy==1.10.1 8 | torch 9 | torch_geometric 10 | tqdm==4.64.1 11 | transformers==4.24.0 -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | num_runs=2 2 | set -e 3 | echo "Running gan.py" 4 | python gan_diff.py 5 | 6 | echo "Running AMP_Classification.py" 7 | python AMP_Classification.py 8 | 9 | echo "Running gan_diff.py" 10 | python gan_diff.py 11 | 12 | 13 | for ((i = 1; i <= $num_runs; i++)); do 14 | echo "Iteration $i of $num_runs:" 15 | # 生成AMP序列 16 | echo "Running gan_generate.py" 17 | python gan_generate.py 18 | 19 | echo "Running AMP-Classification_Prediction.py" 20 | python AMP_Classification_Prediction.py 21 | 22 | echo "Running generate_pos.py" 23 | python generate_pos.py 24 | 25 | echo "Running predict.py" 26 | python predict.py -test_fasta_file examples/samples.fasta -output_file_name prediction_results 27 | 28 | echo "Running attribute_selection.py" 29 | python attribute_selection.py -i prediction_results.csv -o selected_data.csv -c AMP antibacterial -v Yes No 30 | 31 | echo "Running gan_update.py" 32 | python gan_update.py 33 | 34 | done 35 | 36 | echo "All programs have been executed." 37 | echo "enter to quit " 38 | sleep 500 39 | -------------------------------------------------------------------------------- /tmp_save/anti_mammalian_cells/readme.txt: -------------------------------------------------------------------------------- 1 | In placeholder to prevent automatic deletion by GitHub, please retain -------------------------------------------------------------------------------- /tmp_save/antibacterial/readme.txt: -------------------------------------------------------------------------------- 1 | In placeholder to prevent automatic deletion by GitHub, please retain -------------------------------------------------------------------------------- /tmp_save/antibiofilm/readme.txt: -------------------------------------------------------------------------------- 1 | In placeholder to prevent automatic deletion by GitHub, please retain -------------------------------------------------------------------------------- /tmp_save/anticancer/readme.txt: -------------------------------------------------------------------------------- 1 | In placeholder to prevent automatic deletion by GitHub, please retain -------------------------------------------------------------------------------- /tmp_save/anticandida/readme.txt: -------------------------------------------------------------------------------- 1 | In placeholder to prevent automatic deletion by GitHub, please retain -------------------------------------------------------------------------------- /tmp_save/antifungal/readme.txt: -------------------------------------------------------------------------------- 1 | In placeholder to prevent automatic deletion by GitHub, please retain -------------------------------------------------------------------------------- /tmp_save/antigram-negative/readme.txt: -------------------------------------------------------------------------------- 1 | In placeholder to prevent automatic deletion by GitHub, please retain -------------------------------------------------------------------------------- /tmp_save/antigram-positive/readme.txt: 
-------------------------------------------------------------------------------- 1 | In placeholder to prevent automatic deletion by GitHub, please retain -------------------------------------------------------------------------------- /tmp_save/antihiv/readme.txt: -------------------------------------------------------------------------------- 1 | In placeholder to prevent automatic deletion by GitHub, please retain -------------------------------------------------------------------------------- /tmp_save/antimalarial/readme.txt: -------------------------------------------------------------------------------- 1 | In placeholder to prevent automatic deletion by GitHub, please retain -------------------------------------------------------------------------------- /tmp_save/antimrsa/readme.txt: -------------------------------------------------------------------------------- 1 | In placeholder to prevent automatic deletion by GitHub, please retain -------------------------------------------------------------------------------- /tmp_save/antiparasitic/readme.txt: -------------------------------------------------------------------------------- 1 | In placeholder to prevent automatic deletion by GitHub, please retain -------------------------------------------------------------------------------- /tmp_save/antiplasmodial/readme.txt: -------------------------------------------------------------------------------- 1 | In placeholder to prevent automatic deletion by GitHub, please retain -------------------------------------------------------------------------------- /tmp_save/antiprotozoal/readme.txt: -------------------------------------------------------------------------------- 1 | In placeholder to prevent automatic deletion by GitHub, please retain -------------------------------------------------------------------------------- /tmp_save/antitb/readme.txt: -------------------------------------------------------------------------------- 1 | In placeholder to prevent automatic deletion by GitHub, please retain -------------------------------------------------------------------------------- /tmp_save/antiviral/readme.txt: -------------------------------------------------------------------------------- 1 | In placeholder to prevent automatic deletion by GitHub, please retain -------------------------------------------------------------------------------- /tmp_save/anurandefense/readme.txt: -------------------------------------------------------------------------------- 1 | In placeholder to prevent automatic deletion by GitHub, please retain -------------------------------------------------------------------------------- /tmp_save/chemotactic/readme.txt: -------------------------------------------------------------------------------- 1 | In placeholder to prevent automatic deletion by GitHub, please retain -------------------------------------------------------------------------------- /tmp_save/cytotoxic/readme.txt: -------------------------------------------------------------------------------- 1 | In placeholder to prevent automatic deletion by GitHub, please retain -------------------------------------------------------------------------------- /tools.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Dec 8 23:51:40 2021 4 | 5 | @author: joy 6 | """ 7 | import numpy as np 8 | import torch 9 | class EarlyStopping: 10 | """Early stops the training if validation loss doesn't improve after a given 
patience.""" 11 | def __init__(self, patience=7, verbose=False, delta=0, save_model_name='checkpoint.pt'): 12 | """ 13 | Args: 14 | patience (int): How long to wait after last time validation loss improved. 15 | 上次验证集损失值改善后等待几个epoch 16 | Default: 7 17 | verbose (bool): If True, prints a message for each validation loss improvement. 18 | 如果是True,为每个验证集损失值改善打印一条信息 19 | Default: False 20 | delta (float): Minimum change in the monitored quantity to qualify as an improvement. 21 | 监测数量的最小变化,以符合改进的要求 22 | Default: 0 23 | """ 24 | self.patience = patience 25 | self.verbose = verbose 26 | self.counter = 0 27 | self.best_score = None 28 | self.early_stop = False 29 | self.val_loss_min = np.Inf 30 | self.delta = delta 31 | self.save_model_name=save_model_name 32 | 33 | def __call__(self, val_loss, model): 34 | 35 | score = -val_loss 36 | 37 | if self.best_score is None: 38 | self.best_score = score 39 | self.save_checkpoint(val_loss, model) 40 | elif score < self.best_score + self.delta: 41 | self.counter += 1 42 | print(f'EarlyStopping counter: {self.counter} out of {self.patience}') 43 | if self.counter >= self.patience: 44 | self.early_stop = True 45 | else: 46 | self.best_score = score 47 | self.save_checkpoint(val_loss, model) 48 | self.counter = 0 49 | 50 | def save_checkpoint(self, val_loss, model): 51 | ''' 52 | Saves model when validation loss decrease. 53 | 验证损失减少时保存模型。 54 | ''' 55 | if self.verbose: 56 | print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}). Saving model ...') 57 | # torch.save(model.state_dict(), 'checkpoint.pt') # 这里会存储迄今最优模型的参数 58 | # torch.save(model, self.save_model_name) 59 | torch.save(model.state_dict(), self.save_model_name) # 这里会存储迄今最优的模型 60 | self.val_loss_min = val_loss -------------------------------------------------------------------------------- /vocab.py: -------------------------------------------------------------------------------- 1 | import collections 2 | from io import StringIO 3 | import pandas as pd 4 | 5 | COMMON_AMINO_ACIDS_INDEX = collections.OrderedDict( 6 | {'A': 0, 'C': 1, 'D': 2, 'E': 3, 'F': 4, 7 | 'G': 5, 'H': 6, 'I': 7, 'K': 8, 'L': 9, 8 | 'M': 10, 'N': 11, 'P': 12, 'Q': 13, 'R': 14, 9 | 'S': 15, 'T': 16, 'V': 17, 'W': 18, 'Y': 19, '-': 20}) 10 | AMINO_ACIDS = list(COMMON_AMINO_ACIDS_INDEX.keys()) 11 | 12 | AMINO_ACID_INDEX = collections.OrderedDict( 13 | {'A': 0, 'C': 1, 'D': 2, 'E': 3, 'F': 4, 14 | 'G': 5, 'H': 6, 'I': 7, 'K': 8, 'L': 9, 15 | 'M': 10, 'N': 11, 'P': 12, 'Q': 13, 'R': 14, 16 | 'S': 15, 'T': 16, 'V': 17, 'W': 18, 'Y': 19, 17 | 'X': 20, 'Z': 20, 'B': 20, 'J': 20, '-': 20}) 18 | 19 | ''' 20 | CCMPred index of amino acid 21 | https://github.com/soedinglab/CCMpred/blob/2b2f9a0747a5e53035c33636d430f2f11dc186dd/src/sequence.c 22 | ''' 23 | CCMPRED_AMINO_ACID_INDEX = collections.OrderedDict( 24 | {'A': 0, 'R': 1, 'N': 2, 'D': 3, 'C': 4, 25 | 'Q': 5, 'E': 6, 'G': 7, 'H': 8, 'I': 9, 26 | 'L': 10, 'K': 11, 'M': 12, 'F': 13, 'P': 14, 27 | 'S': 15, 'T': 16, 'W': 17, 'Y': 18, 'V': 19, '-': 20}) 28 | CCMPRED_AMINO_ACIDS = list(CCMPRED_AMINO_ACID_INDEX.keys()) 29 | 30 | BLOSUM62_MATRIX = pd.read_csv(StringIO(""" 31 | A R N D C Q E G H I L K M F P S T W Y V - 32 | A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 0 33 | R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 0 34 | N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 0 35 | D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 0 36 | C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 0 37 | Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 
0 -1 -2 -1 -2 0 38 | E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 0 39 | G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 0 40 | H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 41 | I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 0 42 | L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 0 43 | K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 44 | M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 0 45 | F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 0 46 | P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 0 47 | S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 48 | T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 0 49 | W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 0 50 | Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 0 51 | V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 0 52 | - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 53 | """), sep='\s+').loc[AMINO_ACIDS, AMINO_ACIDS] 54 | 55 | ENCODING_DATA_FRAMES = { 56 | "BLOSUM62": BLOSUM62_MATRIX, 57 | "one-hot": pd.DataFrame([ 58 | [1 if i == j else 0 for i in range(len(AMINO_ACIDS))] 59 | for j in range(len(AMINO_ACIDS)) 60 | ], index=AMINO_ACIDS, columns=AMINO_ACIDS) 61 | } 62 | -------------------------------------------------------------------------------- /weight/readme.txt: -------------------------------------------------------------------------------- 1 | Please download the corresponding file from the cloud storage and place it in this folder --------------------------------------------------------------------------------
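
As a closing illustration of how the lookup tables in vocab.py are consumed, the sketch below maps each residue of a padded peptide to its BLOSUM62 row. It is only a minimal sketch of the idea; the actual feature construction used by predict.py lives in data_feature.py (not shown above), and the padding length here is chosen arbitrarily.

# Sketch: encode a peptide with the BLOSUM62 table defined in vocab.py.
# data_feature.py implements the real feature pipeline; this only shows the lookup idea.
import numpy as np
import vocab

def blosum62_encode(seq: str, max_len: int = 30) -> np.ndarray:
    padded = (seq.upper() + "-" * max_len)[:max_len]  # pad/trim with the gap symbol '-'
    table = vocab.ENCODING_DATA_FRAMES["BLOSUM62"]
    rows = [table.loc[aa if aa in vocab.COMMON_AMINO_ACIDS_INDEX else "-"].values
            for aa in padded]
    return np.stack(rows)  # shape: (max_len, 21)

print(blosum62_encode("GIGKFLHSAKKFGKAFVGEIMNS").shape)  # -> (30, 21)
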