├── ADCNet.png ├── ESM-2.py ├── README.md ├── class.py ├── classification_weights ├── ADC_9.h5 └── Explanation ├── data.xlsx ├── dataset.py ├── files ├── Antigen_1280.pkl ├── Heavy_1280.pkl ├── Light_1280.pkl ├── data.xlsx └── x ├── inference.py ├── medium3_weights ├── Explanation └── bert_weightsMedium_20.h5 ├── model.py ├── py37.yaml ├── t_data.xlsx └── utils.py /ADCNet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idrugLab/ADCNet/40ea7ec87cc820893f400d90710f70e6bafc1209/ADCNet.png -------------------------------------------------------------------------------- /ESM-2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import esm 3 | import pandas as pd 4 | import pickle 5 | 6 | # Load ESM-2 model 7 | # model, alphabet = esm.pretrained.esm2_t6_8M_UR50D() 8 | model, alphabet = esm.pretrained.esm2_t33_650M_UR50D() 9 | # model, alphabet = esm.pretrained.esm2_t36_3B_UR50D() 10 | batch_converter = alphabet.get_batch_converter() 11 | model.eval() # disables dropout for deterministic results 12 | 13 | # Prepare data: pair each ADC ID with its sequence (here the antigen; switch the column for heavy/light chains) 14 | 15 | df = pd.read_excel(r'data.xlsx') 16 | 17 | protein = df['Antigen Sequence(64)'].tolist() 18 | adc_ids = df['ADC ID'].tolist() 19 | 20 | datas = [] 21 | for i in range(len(protein)): 22 | datas.append((adc_ids[i], protein[i])) 23 | 24 | sequence_representations = [] 25 | 26 | for data in datas: 27 | # print(data) 28 | try: 29 | batch_labels, batch_strs, batch_tokens = batch_converter([data]) 30 | except Exception as e: 31 | print(data) 32 | batch_lens = (batch_tokens != alphabet.padding_idx).sum(1) 33 | 34 | # Extract per-residue representations (on CPU) 35 | with torch.no_grad(): 36 | results = model(batch_tokens, repr_layers=[33], return_contacts=True) 37 | 38 | token_representations = results["representations"][33] 39 | 40 | # Generate per-sequence representations via averaging 41 | # NOTE: token 0 is always a beginning-of-sequence token, so the first residue is token 1. 42 | 43 | for i, tokens_len in enumerate(batch_lens): 44 | sequence_representations.append(token_representations[i, 1 : tokens_len - 1].mean(0)) 45 | esm_embedding = {} 46 | for i in range(len(sequence_representations)): 47 | esm_embedding[datas[i][0]] = sequence_representations[i] 48 | print(sequence_representations[0].shape) 49 | print(len(esm_embedding)) 50 | 51 | file_path = 'Antigen.pkl' 52 | with open(file_path, 'wb') as file: 53 | pickle.dump(esm_embedding, file) 54 | 55 | # Load the saved embedding dictionary back as a sanity check 56 | with open(file_path, 'rb') as file: 57 | loaded_dict = pickle.load(file) 58 | 59 | # Look at the unsupervised self-attention map contact predictions 60 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ADCNet 2 | A semi-supervised learning framework for antibody-drug conjugate (ADC) property prediction. 3 | ![image](https://github.com/idrugLab/ADCNet/blob/main/ADCNet.png) 4 | # Description of the documents 5 | py37.yaml lists the package versions of the installed environment. The files folder contains the antibody heavy-chain, light-chain, and antigen embeddings, together with the ADC dataset.
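For orientation, each of the pickled embedding files is a dictionary mapping ADC IDs to 1280-dimensional ESM-2 embeddings. A minimal sketch of how to inspect one of them (file names as shipped in files/; loading requires torch, since the values were pickled as PyTorch tensors):
```python
import pickle

# Inspect one of the pre-computed ESM-2 embedding dictionaries in files/
# (adjust the path if you regenerated the embeddings yourself with ESM-2.py).
with open('files/Heavy_1280.pkl', 'rb') as f:
    heavy = pickle.load(f)

adc_id, emb = next(iter(heavy.items()))
print(len(heavy))          # number of ADCs
print(adc_id, emb.shape)   # each value is a 1280-dimensional embedding
```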
6 | # Required packages 7 | ## Example of ESM-2 environment installation: 8 | ```ruby 9 | conda create -n esm-2 python==3.9 10 | pip install fair-esm # latest release, OR: 11 | pip install git+https://github.com/facebookresearch/esm.git # bleeding edge, current repo main branch 12 | ``` 13 | 14 | ## Example of ADCNet environment installation: 15 | ```ruby 16 | conda create -n ADCNet python==3.7 17 | pip install tensorflow==2.3 18 | pip install rdkit 19 | pip install numpy 20 | pip install pandas 21 | conda install -c openbabel openbabel 22 | pip install matplotlib 23 | pip install hyperopt 24 | pip install scikit-learn 25 | pip install torch 26 | pip install openpyxl 27 | ``` 28 | 29 | ## Example of obtaining embeddings for antibodies or antigens 30 | ```ruby 31 | conda activate esm-2 32 | python ESM-2.py 33 | ``` 34 | After the run completes, you will find a .pkl file in the current directory. It is a dictionary whose keys are ADC IDs (if there is no ADC ID column, add a column of numerical values to the original data and name it "ADC ID") and whose values are 1280-dimensional tensors. 35 | 36 | ## Example of training ADCNet 37 | First, run ESM-2.py to obtain embeddings for the heavy chain, light chain, and antigen of each antibody. The code saves these embeddings into three .pkl files. 38 | Second, ensure that each data entry contains the DAR value. 39 | Finally, create a folder named "medium3_weights" and place the file "bert_weightsMedium_20.h5" from this repository into that folder. 40 | 41 | ```ruby 42 | conda activate ADCNet 43 | python class.py 44 | ``` 45 | ## Example of using ADCNet for inference 46 | First, run ESM-2.py to obtain embeddings for the heavy chain, light chain, and antigen of each antibody. The code saves these embeddings into three .pkl files. 47 | Second, ensure that each data entry contains the DAR value. 48 | Finally, create a folder named "classification_weights" and place the file "ADC_9.h5" from this repository into that folder. 49 | If you want to reproduce the results of this article, run class.py directly. If you want to search for hyperparameters again, uncomment the code in the hyperparameter search section (lines 254-279) and comment out the original hyperparameters (lines 281-285). 50 | ```ruby 51 | conda activate ADCNet 52 | python inference.py 53 | ``` 54 | ## Using ADCNet for predictions 55 | ```ruby 56 | You can visit https://ADCNet.idruglab.cn to make predictions online.
57 | ``` 58 | 59 | 60 | -------------------------------------------------------------------------------- /class.py: -------------------------------------------------------------------------------- 1 | import sklearn 2 | import tensorflow as tf 3 | import tensorflow.keras as keras 4 | import numpy as np 5 | from dataset import Graph_Classification_Dataset 6 | import os 7 | import pandas as pd 8 | from model import PredictModel, BertModel 9 | from sklearn.metrics import roc_auc_score,confusion_matrix,precision_recall_curve,auc 10 | from hyperopt import fmin, tpe, hp 11 | from utils import get_task_names 12 | from tensorflow.python.client import device_lib 13 | from sklearn.preprocessing import StandardScaler 14 | import pickle 15 | import math 16 | import csv 17 | 18 | os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true" 19 | keras.backend.clear_session() 20 | os.environ['TF_DETERMINISTIC_OPS'] = '1' 21 | 22 | def count_parameters(model): 23 | total_params = 0 24 | for variable in model.trainable_variables: 25 | shape = variable.shape 26 | params = 1 27 | for dim in shape: 28 | params *= dim 29 | total_params += params 30 | return total_params 31 | 32 | def cover_dict(path): 33 | file_path = path 34 | with open(file_path, 'rb') as file: 35 | data = pickle.load(file) 36 | tensor_dict = {key: tf.constant(value) for key, value in data.items()} 37 | new_data = {i: value for i, (key, value) in enumerate(tensor_dict.items())} 38 | return new_data 39 | 40 | def score(y_test, y_pred): 41 | auc_roc_score = roc_auc_score(y_test, y_pred) 42 | prec, recall, _ = precision_recall_curve(y_test, y_pred) 43 | prauc = auc(recall, prec) 44 | y_pred_print = [round(y, 0) for y in y_pred] 45 | tn, fp, fn, tp = confusion_matrix(y_test, y_pred_print).ravel() 46 | se = tp / (tp + fn) 47 | sp = tn / (tn + fp) 48 | acc = (tp + tn) / (tp + fn + tn + fp) 49 | mcc = (tp * tn - fn * fp) / math.sqrt((tp + fn) * (tp + fp) * (tn + fn) * (tn + fp)) 50 | P = tp / (tp + fp) 51 | F1 = (P * se * 2) / (P + se) 52 | BA = (se + sp) / 2 53 | PPV = tp / (tp + fp) 54 | NPV = tn / (fn + tn) 55 | return tp, tn, fn, fp, se, sp, mcc, acc, auc_roc_score, F1, BA, prauc, PPV, NPV 56 | 57 | def DAR_feature(file_path, column_name): 58 | df = pd.read_excel(file_path) 59 | column_data = df[column_name].values.reshape(-1, 1) 60 | scaler = StandardScaler() 61 | column_data_standardized = scaler.fit_transform(column_data) 62 | column_data_normalized = tf.keras.utils.normalize(column_data_standardized, axis=0).flatten() 63 | data_dict = {index: tf.constant(value, dtype=tf.float32) for index, value in zip(df.index, column_data_normalized)} 64 | return data_dict 65 | 66 | def process_list(input_list): 67 | input_list.append(np.mean(input_list)) 68 | mean_value = np.mean(input_list[:-1]) 69 | std_value = np.std(input_list[:-1], ddof=0) 70 | mean_range = f'{mean_value:.4f} ± {std_value:.4f}' 71 | input_list[-1] = mean_range 72 | print(input_list) 73 | return input_list 74 | 75 | def extract_tensors(index, heavy_dict, light_dict, antigen_dict, dar_dict): 76 | heavy_tensor_list = [] 77 | light_tensor_list = [] 78 | antigen_tensor_list = [] 79 | DAR_tensor_list = [] 80 | 81 | for i in index.numpy(): 82 | heavy_tensor_list.append(heavy_dict[i[0]]) 83 | light_tensor_list.append(light_dict[i[0]]) 84 | antigen_tensor_list.append(antigen_dict[i[0]]) 85 | DAR_tensor_list.append(dar_dict[i[0]]) 86 | t1 = np.vstack(heavy_tensor_list) 87 | t2 = np.vstack(light_tensor_list) 88 | t3 = np.vstack(antigen_tensor_list) 89 | t4 = np.vstack(DAR_tensor_list) 90 | 91 | return t1, 
t2, t3, t4 92 | 93 | Heavy_dict = cover_dict('Heavy_1280.pkl') 94 | Light_dict = cover_dict('Light_1280.pkl') 95 | Antigen_dict = cover_dict('Antigen_1280.pkl') 96 | DAR_dict = DAR_feature('data.xlsx', 'DAR_val') 97 | 98 | def main(seed, args): 99 | 100 | task = 'ADC' 101 | idx = ['index'] 102 | label = ['label(100nm)'] 103 | 104 | arch = {'name': 'Medium', 'path': 'medium3_weights'} 105 | pretraining = True 106 | pretraining_str = 'pretraining' if pretraining else '' 107 | trained_epoch = 20 108 | num_layers = 6 109 | d_model = 256 110 | addH = True 111 | dff = d_model * 2 112 | vocab_size = 18 113 | 114 | num_heads = args['num_heads'] 115 | dense_dropout = args['dense_dropout'] 116 | learning_rate = args['learning_rate'] 117 | batch_size = args['batch_size'] 118 | seed = seed 119 | np.random.seed(seed=seed) 120 | tf.random.set_seed(seed=seed) 121 | train_dataset, test_dataset, val_dataset = Graph_Classification_Dataset('data.xlsx', 122 | smiles_field1='Payload Isosmiles', 123 | smiles_field2='Linker Isosmiles', 124 | label_field=label, 125 | index_field=idx, 126 | seed=seed, 127 | batch_size=batch_size, 128 | a = len(label), 129 | addH=addH).get_data() 130 | 131 | x1, adjoin_matrix1, y, x2, adjoin_matrix2, index = next(iter(train_dataset.take(1))) 132 | 133 | seq1 = tf.cast(tf.math.equal(x1, 0), tf.float32) 134 | seq2 = tf.cast(tf.math.equal(x2, 0), tf.float32) 135 | mask1 = seq1[:, tf.newaxis, tf.newaxis, :] 136 | mask2 = seq2[:, tf.newaxis, tf.newaxis, :] 137 | t1, t2, t3, t4 = extract_tensors(index, Heavy_dict, Light_dict, Antigen_dict, DAR_dict) 138 | model = PredictModel(num_layers=num_layers, 139 | d_model=d_model, 140 | dff=dff, 141 | num_heads=num_heads, 142 | vocab_size=vocab_size, 143 | a=len(label), 144 | dense_dropout = dense_dropout) 145 | 146 | if pretraining: 147 | temp = BertModel(num_layers=num_layers, d_model=d_model, 148 | dff=dff, num_heads=num_heads, vocab_size=vocab_size) 149 | 150 | pred = temp(x1, mask=mask1, training=True, adjoin_matrix=adjoin_matrix1) 151 | temp.load_weights( 152 | arch['path']+'/bert_weights{}_{}.h5'.format(arch['name'], trained_epoch)) 153 | temp.encoder.save_weights( 154 | arch['path']+'/bert_weights_encoder{}_{}.h5'.format(arch['name'], trained_epoch)) 155 | del temp 156 | 157 | pred = model(x1=x1, mask1=mask1, training=True, adjoin_matrix1=adjoin_matrix1, x2=x2,mask2=mask2, adjoin_matrix2=adjoin_matrix2, t1=t1,t2=t2,t3=t3,t4=t4) 158 | 159 | model.encoder.load_weights( 160 | arch['path']+'/bert_weights_encoder{}_{}.h5'.format(arch['name'], trained_epoch)) 161 | print('load_wieghts') 162 | 163 | total_params = count_parameters(model) 164 | 165 | print('*'*100) 166 | print("Total Parameters:", total_params) 167 | print('*'*100) 168 | 169 | optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate) 170 | 171 | auc = -10 172 | stopping_monitor = 0 173 | for epoch in range(200): 174 | loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=True) 175 | for x1, adjoin_matrix1, y, x2, adjoin_matrix2, index in train_dataset: 176 | t1, t2, t3, t4 = extract_tensors(index, Heavy_dict, Light_dict, Antigen_dict, DAR_dict) 177 | with tf.GradientTape() as tape: 178 | seq1 = tf.cast(tf.math.equal(x1, 0), tf.float32) 179 | mask1 = seq1[:, tf.newaxis, tf.newaxis, :] 180 | seq2 = tf.cast(tf.math.equal(x2, 0), tf.float32) 181 | mask2 = seq2[:, tf.newaxis, tf.newaxis, :] 182 | preds = model(x1=x1, mask1=mask1,training=True,adjoin_matrix1=adjoin_matrix1, x2=x2, mask2=mask2, adjoin_matrix2=adjoin_matrix2,t1=t1,t2=t2,t3=t3,t4=t4) 183 | loss = 
loss_object(y,preds) 184 | grads = tape.gradient(loss, model.trainable_variables) 185 | optimizer.apply_gradients(zip(grads, model.trainable_variables)) 186 | print('epoch: ', epoch, 'loss: {:.4f}'.format(loss.numpy().item())) 187 | 188 | y_true = [] 189 | y_preds = [] 190 | for x1, adjoin_matrix1, y, x2, adjoin_matrix2, index in val_dataset: 191 | t1, t2, t3, t4 = extract_tensors(index, Heavy_dict, Light_dict, Antigen_dict, DAR_dict) 192 | seq1 = tf.cast(tf.math.equal(x1, 0), tf.float32) 193 | mask1 = seq1[:, tf.newaxis, tf.newaxis, :] 194 | seq2 = tf.cast(tf.math.equal(x2, 0), tf.float32) 195 | mask2 = seq2[:, tf.newaxis, tf.newaxis, :] 196 | preds = model(x1=x1, mask1=mask1,training=False,adjoin_matrix1=adjoin_matrix1, x2=x2, mask2=mask2, adjoin_matrix2=adjoin_matrix2,t1=t1,t2=t2,t3=t3,t4=t4) 197 | y_label = y 198 | y_pred = preds 199 | y_true.append(y_label) 200 | y_preds.append(y_pred) 201 | y_true = np.concatenate(y_true,axis=0).reshape(-1) 202 | y_preds = np.concatenate(y_preds,axis=0).reshape(-1) 203 | y_preds = tf.sigmoid(y_preds).numpy() 204 | auc_new = roc_auc_score(y_true,y_preds) 205 | 206 | print('val auc:{:.4f}'.format(auc_new)) 207 | if auc_new> auc: 208 | auc = auc_new 209 | stopping_monitor = 0 210 | np.save('{}/{}{}{}{}{}'.format(arch['path'], task, seed, arch['name'], trained_epoch, trained_epoch, pretraining_str), 211 | [y_true, y_preds]) 212 | model.save_weights('classification_weights/{}_{}.h5'.format(task, seed)) 213 | print('save model weights') 214 | else: 215 | stopping_monitor += 1 216 | print('best val auc: {:.4f}'.format(auc)) 217 | if stopping_monitor > 0: 218 | print('stopping_monitor:', stopping_monitor) 219 | if stopping_monitor > 30: 220 | break 221 | 222 | y_true = [] 223 | y_preds = [] 224 | model.load_weights('classification_weights/{}_{}.h5'.format(task, seed)) 225 | for x1, adjoin_matrix1, y, x2, adjoin_matrix2, index in test_dataset: 226 | t1, t2, t3, t4 = extract_tensors(index, Heavy_dict, Light_dict, Antigen_dict, DAR_dict) 227 | seq1 = tf.cast(tf.math.equal(x1, 0), tf.float32) 228 | mask1 = seq1[:, tf.newaxis, tf.newaxis, :] 229 | seq2 = tf.cast(tf.math.equal(x2, 0), tf.float32) 230 | mask2 = seq2[:, tf.newaxis, tf.newaxis, :] 231 | preds = model(x1=x1, mask1=mask1,training=False,adjoin_matrix1=adjoin_matrix1, x2=x2, mask2=mask2, adjoin_matrix2=adjoin_matrix2,t1=t1,t2=t2,t3=t3,t4=t4) 232 | y_label = y 233 | y_pred = preds 234 | y_true.append(y_label) 235 | y_preds.append(y_pred) 236 | 237 | y_true = np.concatenate(y_true, axis=0).reshape(-1) 238 | y_preds = np.concatenate(y_preds, axis=0).reshape(-1) 239 | y_preds = tf.sigmoid(y_preds).numpy() 240 | test_auc = roc_auc_score(y_true, y_preds) 241 | 242 | tp, tn, fn, fp, se, sp, mcc, acc, auc_roc_score, F1, BA, prauc, PPV, NPV = score(y_true, y_preds) 243 | print('test auc:{:.4f}'.format(test_auc)) 244 | 245 | return test_auc,tp, tn, fn, fp, se, sp, mcc, acc, auc_roc_score, F1, BA, prauc, PPV, NPV 246 | 247 | space = {"dense_dropout": hp.quniform("dense_dropout", 0, 0.5, 0.05), 248 | "learning_rate": hp.loguniform("learning_rate", np.log(3e-5), np.log(15e-5)), 249 | "batch_size": hp.choice("batch_size", [16,32,48,64]), 250 | "num_heads": hp.choice("num_heads", [4,8]), 251 | } 252 | 253 | # Hyperparametric search 254 | # def hy_main(args): 255 | # test_auc_list = [] 256 | # x = 0 257 | # for seed in [2, 8, 9]: 258 | # print(seed) 259 | # test_auc,tp, tn, fn, fp, se, sp, mcc, acc, auc_roc_score, F1, BA, prauc, PPV, NPV = main(seed, args) 260 | # test_auc_list.append(test_auc) 261 | # x+= test_auc 262 
| # test_auc_list.append(np.mean(test_auc_list)) 263 | # print(test_auc_list) 264 | # print(args["dense_dropout"]) 265 | # print(args["learning_rate"]) 266 | # print(args["batch_size"]) 267 | # print(args["num_heads"]) 268 | # return -x/3 269 | 270 | # best = fmin(hy_main, space, algo = tpe.suggest, max_evals= 30) 271 | # print(best) 272 | 273 | # best_dict = {} 274 | # a = [16,32,48,64] 275 | # b = [4, 8] 276 | # best_dict["dense_dropout"] = best["dense_dropout"] 277 | # best_dict["learning_rate"] = best["learning_rate"] 278 | # best_dict["batch_size"] = a[best["batch_size"]] 279 | # best_dict["num_heads"] = b[best["num_heads"]] 280 | 281 | best_dict = {} 282 | best_dict["dense_dropout"] = 0.30000000000000004 283 | best_dict["learning_rate"] = 5.5847758199523973e-05 284 | best_dict["batch_size"] = 32 285 | best_dict["num_heads"] = 8 286 | print(best_dict) 287 | 288 | if __name__ == '__main__': 289 | test_auc_list = [] 290 | tp_l, tn_l, fn_l, fp_l, se_l, sp_l, mcc_l, acc_l, auc_roc_score_l, F1_l, BA_l, prauc_l, PPV_l, NPV_l = [],[],[],[],[],[],[],[],[],[],[],[],[],[] 291 | lists_to_process = [tp_l, tn_l, fn_l, fp_l, se_l, sp_l, mcc_l, acc_l, auc_roc_score_l, F1_l, BA_l, prauc_l, PPV_l, NPV_l] 292 | for seed in [2,8,9]: 293 | print(seed) 294 | test_auc,tp, tn, fn, fp, se, sp, mcc, acc, auc_roc_score, F1, BA, prauc, PPV, NPV = main(seed, best_dict) 295 | test_auc_list.append(test_auc) 296 | tp_l.append(tp) 297 | tn_l.append(tn) 298 | fn_l.append(fn) 299 | fp_l.append(fp) 300 | se_l.append(se) 301 | sp_l.append(sp) 302 | mcc_l.append(mcc) 303 | acc_l.append(acc) 304 | auc_roc_score_l.append(auc_roc_score) 305 | F1_l.append(F1) 306 | BA_l.append(BA) 307 | prauc_l.append(prauc) 308 | PPV_l.append(PPV) 309 | NPV_l.append(NPV) 310 | test_auc_list.append(np.mean(test_auc_list)) 311 | tp_l.append(np.mean(tp_l)) 312 | tn_l.append(np.mean(tn_l)) 313 | fn_l.append(np.mean(fn_l)) 314 | fp_l.append(np.mean(fp_l)) 315 | se_l.append(np.mean(se_l)) 316 | sp_l.append(np.mean(sp_l)) 317 | mcc_l.append(np.mean(mcc_l)) 318 | acc_l.append(np.mean(acc_l)) 319 | auc_roc_score_l.append(np.mean(auc_roc_score_l)) 320 | F1_l.append(np.mean(F1_l)) 321 | BA_l.append(np.mean(BA_l)) 322 | prauc_l.append(np.mean(prauc_l)) 323 | PPV_l.append(np.mean(PPV_l)) 324 | NPV_l.append(np.mean(NPV_l)) 325 | 326 | for i in range(len(lists_to_process)): 327 | lists_to_process[i] = process_list(lists_to_process[i]) 328 | filename = 'ADCNet_output.csv' 329 | column_names = ['tp', 'tn', 'fn', 'fp', 'se', 'sp', 'mcc', 'acc', 'auc', 'F1', 'BA', 'prauc','PPV', 'NPV'] 330 | rows = zip(tp_l, tn_l, fn_l, fp_l, se_l, sp_l, mcc_l, acc_l, auc_roc_score_l, F1_l, BA_l, prauc_l, PPV_l, NPV_l) 331 | with open(filename, mode='w', newline='') as file: 332 | writer = csv.writer(file) 333 | writer.writerow(column_names) 334 | writer.writerows(rows) 335 | print(f'CSV file {filename} was successfully written') 336 | 337 | 338 | -------------------------------------------------------------------------------- /classification_weights/ADC_9.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idrugLab/ADCNet/40ea7ec87cc820893f400d90710f70e6bafc1209/classification_weights/ADC_9.h5 -------------------------------------------------------------------------------- /classification_weights/Explanation: -------------------------------------------------------------------------------- 1 | This is a weight file, which is the model's weights fine-tuned using the ADC dataset. 
You can use it to predict the activity of unknown ADCs. 2 | -------------------------------------------------------------------------------- /data.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idrugLab/ADCNet/40ea7ec87cc820893f400d90710f70e6bafc1209/data.xlsx -------------------------------------------------------------------------------- /dataset.py: -------------------------------------------------------------------------------- 1 | from macpath import split 2 | from operator import concat 3 | import re 4 | from cProfile import label 5 | from cgi import test 6 | from tkinter import Label 7 | import pandas as pd 8 | import numpy as np 9 | import tensorflow as tf 10 | from utils import smiles2adjoin, molecular_fg 11 | from rdkit import Chem 12 | from random import Random 13 | from collections import defaultdict 14 | from rdkit.Chem.Scaffolds import MurckoScaffold 15 | from itertools import compress 16 | 17 | str2num = {'<pad>':0 ,'H': 1, 'C': 2, 'N': 3, 'O': 4, 'S': 5, 'F': 6, 'Cl': 7, 'Br': 8, 'P': 9, 18 | 'I': 10,'Na': 11,'B':12,'Se':13,'Si':14,'<unk>':15,'<mask>':16,'<global>':17} 19 | num2str = {i:j for j,i in str2num.items()} 20 | 21 | class Graph_Classification_Dataset(object): # Graph classification task data set processing 22 | def __init__(self,path,smiles_field1='Smiles1',smiles_field2='Smiles2',label_field=label, index_field=label, max_len=500,seed=1,batch_size=16,a=1,addH=True): 23 | if path.endswith('.txt') or path.endswith('.tsv'): 24 | self.df = pd.read_csv(path,sep='\t',encoding='latin1') 25 | elif path.endswith('.xlsx'): 26 | self.df = pd.read_excel(path) 27 | else: 28 | self.df = pd.read_csv(path, encoding='latin1') 29 | self.smiles_field1 = smiles_field1 30 | self.smiles_field2 = smiles_field2 31 | self.label_field = label_field 32 | self.index_field = index_field 33 | self.vocab = str2num 34 | self.devocab = num2str 35 | self.df = self.df[self.df[smiles_field1].str.len() <= max_len] 36 | self.df = self.df[[True if Chem.MolFromSmiles(smi) is not None else False for smi in self.df[smiles_field1]]] 37 | self.seed = seed 38 | self.batch_size = batch_size 39 | self.a = a 40 | self.addH = addH 41 | 42 | def get_data(self): 43 | '''Randomized Split Dataset''' 44 | data = self.df 45 | data = data.fillna(666) 46 | train_idx = [] 47 | idx = data.sample(frac=0.8).index 48 | train_idx.extend(idx) 49 | train_data = data[data.index.isin(train_idx)] 50 | data = data[~data.index.isin(train_idx)] 51 | test_idx = [] 52 | idx = data[~data.index.isin(train_data)].sample(frac=0.5).index 53 | test_idx.extend(idx) 54 | test_data = data[data.index.isin(test_idx)] 55 | val_data = data[~data.index.isin(train_idx+test_idx)] 56 | df_train_data = pd.DataFrame(train_data) 57 | df_test_data = pd.DataFrame(test_data) 58 | df_val_data = pd.DataFrame(val_data) 59 | 60 | self.dataset1 = tf.data.Dataset.from_tensor_slices( 61 | (df_train_data[self.smiles_field1], df_train_data[self.label_field], df_train_data[self.smiles_field2], df_train_data[self.index_field])) 62 | self.dataset1 = self.dataset1.map(self.tf_numerical_smiles).cache().padded_batch(batch_size=self.batch_size, padded_shapes=( 63 | tf.TensorShape([None]), tf.TensorShape([None, None]), tf.TensorShape([self.a]),tf.TensorShape([None]), tf.TensorShape([None, None]), tf.TensorShape([1]))).shuffle(1000).prefetch(50) 64 | 65 | self.dataset2 = tf.data.Dataset.from_tensor_slices((df_test_data[self.smiles_field1], df_test_data[self.label_field], df_test_data[self.smiles_field2],
df_test_data[self.index_field])) 66 | self.dataset2 = self.dataset2.map(self.tf_numerical_smiles).padded_batch(512, padded_shapes=( 67 | tf.TensorShape([None]), tf.TensorShape([None, None]), tf.TensorShape([self.a]),tf.TensorShape([None]), tf.TensorShape([None, None]), tf.TensorShape([1]))).cache().prefetch(100) 68 | 69 | self.dataset3 = tf.data.Dataset.from_tensor_slices((df_val_data[self.smiles_field1], df_val_data[self.label_field], df_val_data[self.smiles_field2], df_val_data[self.index_field])) 70 | self.dataset3 = self.dataset3.map(self.tf_numerical_smiles).padded_batch(512, padded_shapes=( 71 | tf.TensorShape([None]), tf.TensorShape([None, None]), tf.TensorShape([self.a]),tf.TensorShape([None]), tf.TensorShape([None, None]), tf.TensorShape([1]))).cache().prefetch(100) 72 | 73 | return self.dataset1, self.dataset2, self.dataset3 74 | 75 | def numerical_smiles(self, smiles, label): 76 | smiles = smiles.numpy().decode() 77 | atoms_list, adjoin_matrix = smiles2adjoin(smiles,explicit_hydrogens=self.addH) 78 | atoms_list = ['<global>'] + atoms_list 79 | nums_list = [str2num.get(i,str2num['<unk>']) for i in atoms_list] 80 | temp = np.ones((len(nums_list),len(nums_list))) 81 | temp[1:, 1:] = adjoin_matrix 82 | adjoin_matrix = (1-temp)*(-1e9) 83 | x = np.array(nums_list).astype('int64') 84 | y = np.array(label).astype('int64') 85 | return x, adjoin_matrix, y 86 | 87 | def tf_numerical_smiles(self, smiles1, label, smiles2, index): 88 | x1,adjoin_matrix1,y= tf.py_function(self.numerical_smiles, [smiles1,label], [tf.int64, tf.float32 ,tf.int64]) 89 | x1.set_shape([None]) 90 | adjoin_matrix1.set_shape([None,None]) 91 | y.set_shape([None]) 92 | x2,adjoin_matrix2,index = tf.py_function(self.numerical_smiles, [smiles2,index], [tf.int64, tf.float32 ,tf.int64]) 93 | x2.set_shape([None]) 94 | adjoin_matrix2.set_shape([None,None]) 95 | index.set_shape([None]) 96 | return x1, adjoin_matrix1, y, x2,adjoin_matrix2, index 97 | 98 | class Inference_Dataset(object): 99 | def __init__(self,sml_list,max_len=500,addH=True): 100 | self.vocab = str2num 101 | self.devocab = num2str 102 | self.sml_list = [i for i in sml_list if len(i) < max_len] 103 | self.addH = addH 104 | 105 | def get_data(self): 106 | self.dataset = tf.data.Dataset.from_tensor_slices((self.sml_list,)) 107 | self.dataset = self.dataset.map(self.tf_numerical_smiles).padded_batch(64, padded_shapes=( 108 | tf.TensorShape([None]), tf.TensorShape([None, None]), tf.TensorShape([1]), tf.TensorShape([None]))).cache().prefetch(20) 109 | return self.dataset 110 | 111 | def numerical_smiles(self, smiles): 112 | smiles = smiles.numpy().decode() 113 | atoms_list, adjoin_matrix = smiles2adjoin(smiles,explicit_hydrogens=self.addH) 114 | 115 | # prepend the global supernode token used by the graph encoder 116 | atoms_list = ['<global>'] + atoms_list 117 | nums_list = [str2num.get(i,str2num['<unk>']) for i in atoms_list] 118 | temp = np.ones((len(nums_list),len(nums_list))) 119 | temp[1:,1:] = adjoin_matrix 120 | adjoin_matrix = (1-temp)*(-1e9) 121 | x = np.array(nums_list).astype('int64') 122 | return x, adjoin_matrix,[smiles], atoms_list 123 | 124 | def tf_numerical_smiles(self, smiles): 125 | x,adjoin_matrix,smiles,atom_list = tf.py_function(self.numerical_smiles, [smiles], [tf.int64, tf.float32,tf.string, tf.string]) 126 | x.set_shape([None]) 127 | adjoin_matrix.set_shape([None,None]) 128 | smiles.set_shape([1]) 129 | atom_list.set_shape([None]) 130 | return x, adjoin_matrix,smiles,atom_list 131 | 132 | class Inference_Dataset(object): 133 | def __init__(self,sml_list,max_len=500,addH=True): 134 | self.vocab = str2num 135 | self.devocab = num2str 136 | self.sml_list = [i for i in sml_list if len(i) < max_len] 137 | self.addH = addH 138 | 139 | def get_data(self): 140 | self.dataset = tf.data.Dataset.from_tensor_slices((self.sml_list,)) 141 | self.dataset = self.dataset.map(self.tf_numerical_smiles).padded_batch(64, padded_shapes=( 142 | tf.TensorShape([None]), tf.TensorShape([None, None]), tf.TensorShape([1]), tf.TensorShape([None]))).cache().prefetch(20) 143 | return self.dataset 144 | 145 | def numerical_smiles(self, smiles): 146 | smiles = smiles.numpy().decode() 147 | atoms_list, adjoin_matrix = smiles2adjoin(smiles,explicit_hydrogens=self.addH) 148 | 149 | # prepend the global supernode token used by the graph encoder 150 | atoms_list = ['<global>'] + atoms_list 151 | nums_list = [str2num.get(i,str2num['<unk>']) for i in atoms_list] 152 | temp = np.ones((len(nums_list),len(nums_list))) 153 | temp[1:,1:] = adjoin_matrix 154 | adjoin_matrix = (1-temp)*(-1e9) 155 | x = np.array(nums_list).astype('int64') 156 | return x, adjoin_matrix,[smiles], atoms_list 157 | 158 | def tf_numerical_smiles(self, smiles): 159 | x,adjoin_matrix,smiles,atom_list = tf.py_function(self.numerical_smiles, [smiles], [tf.int64, tf.float32,tf.string, tf.string]) 160 | x.set_shape([None]) 161 | 
adjoin_matrix.set_shape([None,None]) 162 | smiles.set_shape([1]) 163 | atom_list.set_shape([None]) 164 | return x, adjoin_matrix,smiles,atom_list 165 | -------------------------------------------------------------------------------- /files/Antigen_1280.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idrugLab/ADCNet/40ea7ec87cc820893f400d90710f70e6bafc1209/files/Antigen_1280.pkl -------------------------------------------------------------------------------- /files/Heavy_1280.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idrugLab/ADCNet/40ea7ec87cc820893f400d90710f70e6bafc1209/files/Heavy_1280.pkl -------------------------------------------------------------------------------- /files/Light_1280.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idrugLab/ADCNet/40ea7ec87cc820893f400d90710f70e6bafc1209/files/Light_1280.pkl -------------------------------------------------------------------------------- /files/data.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idrugLab/ADCNet/40ea7ec87cc820893f400d90710f70e6bafc1209/files/data.xlsx -------------------------------------------------------------------------------- /files/x: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /inference.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import pandas as pd 4 | import matplotlib.pyplot as plt 5 | from dataset import Graph_Classification_Dataset,Inference_Dataset 6 | from sklearn.metrics import roc_auc_score,confusion_matrix,precision_recall_curve,auc 7 | from rdkit.Chem import Draw 8 | import os 9 | import tensorflow.keras as keras 10 | from model import PredictModel 11 | import torch 12 | import pickle 13 | from sklearn.preprocessing import StandardScaler 14 | import math 15 | 16 | def cover_dict(path): 17 | file_path = path 18 | with open(file_path, 'rb') as file: 19 | data = pickle.load(file) 20 | tensor_dict = {key: tf.constant(value) for key, value in data.items()} 21 | new_data = {i: value for i, (key, value) in enumerate(tensor_dict.items())} 22 | return new_data 23 | 24 | def score(y_test, y_pred): 25 | auc_roc_score = roc_auc_score(y_test, y_pred) 26 | prec, recall, _ = precision_recall_curve(y_test, y_pred) 27 | prauc = auc(recall, prec) 28 | y_pred_print = [round(y, 0) for y in y_pred] 29 | tn, fp, fn, tp = confusion_matrix(y_test, y_pred_print).ravel() 30 | se = tp / (tp + fn) 31 | sp = tn / (tn + fp) # 也是R 32 | acc = (tp + tn) / (tp + fn + tn + fp) 33 | mcc = (tp * tn - fn * fp) / math.sqrt((tp + fn) * (tp + fp) * (tn + fn) * (tn + fp)) 34 | P = tp / (tp + fp) 35 | F1 = (P * se * 2) / (P + se) 36 | BA = (se + sp) / 2 37 | PPV = tp / (tp + fp) 38 | NPV = tn / (fn + tn) 39 | return tp, tn, fn, fp, se, sp, mcc, acc, auc_roc_score, F1, BA, prauc, PPV, NPV 40 | 41 | def DAR_feature(file_path, column_name): 42 | df = pd.read_excel(file_path) 43 | column_data = df[column_name].values.reshape(-1, 1) 44 | mean_value = 3.86845977 45 | variance_value = 1.569108443 46 | std_deviation = variance_value**0.5 47 | column_data_standardized = (column_data - mean_value) / std_deviation 48 | normalized_data = 
(column_data_standardized - 0.8) / (12 - 0.8) 49 | data_dict = {index: tf.constant(value, dtype=tf.float32) for index, value in zip(df.index, normalized_data.flatten())} 50 | return data_dict 51 | 52 | Heavy_dict = cover_dict('Heavy.pkl') 53 | Light_dict = cover_dict('Light.pkl') 54 | Antigen_dict = cover_dict('Antigen.pkl') 55 | DAR_dict = DAR_feature('data.xlsx', 'DAR') 56 | 57 | medium = {'name':'Medium','num_layers': 6, 'num_heads': 8, 'd_model': 256,'path':'medium_weights','addH':True} 58 | arch = medium 59 | trained_epoch = 20 60 | num_layers = arch['num_layers'] 61 | num_heads = arch['num_heads'] 62 | d_model = arch['d_model'] 63 | addH = arch['addH'] 64 | dff = d_model * 2 65 | vocab_size = 18 66 | dense_dropout = 0.1 67 | seed = 1 68 | df = pd.read_excel('data.xlsx') 69 | np.random.seed(seed=seed) 70 | tf.random.set_seed(seed=seed) 71 | sml_list1 = df['Payload Isosmiles'].tolist() 72 | sml_list2 = df['Linker Isosmiles'].tolist() 73 | 74 | ans = [] 75 | y_preds = [] 76 | res = [] 77 | n = len(sml_list1) 78 | for i in range(n): 79 | x1 = [sml_list1[i]] 80 | x2 = [sml_list2[i]] 81 | t1 = Heavy_dict[i] 82 | t2 = Light_dict[i] 83 | t3 = Antigen_dict[i] 84 | t4 = DAR_dict[i].numpy() 85 | t1 = tf.expand_dims(t1, axis=0) 86 | t2 = tf.expand_dims(t2, axis=0) 87 | t3 = tf.expand_dims(t3, axis=0) 88 | t4 = tf.constant(t4, shape=(1, 1)) 89 | 90 | inference_dataset1 = Inference_Dataset(x1,addH=addH).get_data() 91 | inference_dataset2 = Inference_Dataset(x2,addH=addH).get_data() 92 | 93 | x1, adjoin_matrix1, smiles1 ,atom_list1 = next(iter(inference_dataset1.take(1))) 94 | x2, adjoin_matrix2, smiles2 ,atom_list2 = next(iter(inference_dataset2.take(1))) 95 | 96 | seq1 = tf.cast(tf.math.equal(x1, 0), tf.float32) 97 | seq2 = tf.cast(tf.math.equal(x2, 0), tf.float32) 98 | 99 | mask1 = seq1[:, tf.newaxis, tf.newaxis, :] 100 | mask2 = seq2[:, tf.newaxis, tf.newaxis, :] 101 | 102 | model = PredictModel(num_layers=num_layers, 103 | d_model=d_model, 104 | dff=dff, 105 | num_heads=num_heads, 106 | vocab_size=vocab_size, 107 | a=1, 108 | dense_dropout = dense_dropout) 109 | 110 | pred = model(x1=x1, mask1=mask1, training=False, adjoin_matrix1=adjoin_matrix1, x2=x2,mask2=mask2, adjoin_matrix2=adjoin_matrix2, t1=t1,t2=t2,t3=t3,t4=t4) 111 | model.load_weights('classification_weights/ADC_9.h5') 112 | 113 | x = model(x1=x1, mask1=mask1,training=False,adjoin_matrix1=adjoin_matrix1, x2=x2, mask2=mask2, adjoin_matrix2=adjoin_matrix2,t1=t1,t2=t2,t3=t3,t4=t4) 114 | y_preds.append(x) 115 | 116 | 117 | y_preds = tf.sigmoid(y_preds) 118 | y_preds = tf.reshape(y_preds,(-1,)) 119 | y_hat = tf.where(y_preds < 0.5, 0, 1) 120 | for i in y_preds.numpy(): 121 | ans.append(i) 122 | for i in y_hat.numpy(): 123 | res.append(i) 124 | print(ans) 125 | print(res) 126 | -------------------------------------------------------------------------------- /medium3_weights/Explanation: -------------------------------------------------------------------------------- 1 | This is a weight file for training the ADCNet model, originating from FG-BERT (https://github.com/idrugLab/FG-BERT). 2 | Using these weights obtained after fine-tuning on labeled ADC data, the model can be employed for predicting ADCs. 
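To make the role of this file concrete, here is a minimal, self-contained sketch (dummy one-molecule inputs, file names as in this repository) of the transfer step that class.py performs: the pretrained FG-BERT weights are loaded into a BertModel, the encoder weights are extracted, and they initialise the encoder of PredictModel before fine-tuning on ADC data.
```python
import tensorflow as tf
from model import BertModel, PredictModel

# Dummy one-molecule batch just to build the model variables.
x_dummy = tf.constant([[17, 2, 2, 4]], dtype=tf.int64)          # token indices < vocab_size (18)
adjoin_dummy = tf.zeros((1, 4, 4), dtype=tf.float32)            # (batch, atoms, atoms) additive mask
mask_dummy = tf.cast(tf.math.equal(x_dummy, 0), tf.float32)[:, tf.newaxis, tf.newaxis, :]

# Load the pretrained FG-BERT weights and export only the encoder part.
temp = BertModel(num_layers=6, d_model=256, dff=512, num_heads=8, vocab_size=18)
temp(x_dummy, adjoin_matrix=adjoin_dummy, mask=mask_dummy, training=False)   # build variables
temp.load_weights('medium3_weights/bert_weightsMedium_20.h5')
temp.encoder.save_weights('medium3_weights/bert_weights_encoderMedium_20.h5')

# Initialise the fine-tuning model's encoder from the exported weights.
t_ab = tf.zeros((1, 1280))    # placeholder heavy/light/antigen ESM-2 embeddings
t_dar = tf.zeros((1, 1))      # placeholder normalised DAR value
model = PredictModel(num_layers=6, d_model=256, dff=512, num_heads=8, vocab_size=18, a=1)
model(x1=x_dummy, mask1=mask_dummy, adjoin_matrix1=adjoin_dummy,
      x2=x_dummy, mask2=mask_dummy, adjoin_matrix2=adjoin_dummy,
      t1=t_ab, t2=t_ab, t3=t_ab, t4=t_dar, training=False)                   # build variables
model.encoder.load_weights('medium3_weights/bert_weights_encoderMedium_20.h5')
```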
3 | -------------------------------------------------------------------------------- /medium3_weights/bert_weightsMedium_20.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idrugLab/ADCNet/40ea7ec87cc820893f400d90710f70e6bafc1209/medium3_weights/bert_weightsMedium_20.h5 -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | from cProfile import label 2 | import tensorflow as tf 3 | import time 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | 7 | 8 | 9 | def gelu(x): 10 | return 0.5 * x * (1.0 + tf.math.erf(x / tf.sqrt(2.))) 11 | 12 | 13 | def scaled_dot_product_attention(q, k, v, mask,adjoin_matrix): 14 | 15 | matmul_qk = tf.matmul(q, k, transpose_b=True) 16 | 17 | 18 | dk = tf.cast(tf.shape(k)[-1], tf.float32) 19 | scaled_attention_logits = matmul_qk / tf.math.sqrt(dk) 20 | 21 | 22 | if mask is not None: 23 | scaled_attention_logits += (mask * -1e9) 24 | if adjoin_matrix is not None: 25 | scaled_attention_logits += adjoin_matrix 26 | 27 | attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1) 28 | 29 | output = tf.matmul(attention_weights, v) 30 | 31 | return output, attention_weights 32 | 33 | 34 | class MultiHeadAttention(tf.keras.layers.Layer): 35 | def __init__(self, d_model, num_heads): 36 | super(MultiHeadAttention, self).__init__() 37 | self.num_heads = num_heads 38 | self.d_model = d_model 39 | 40 | assert d_model % self.num_heads == 0 41 | 42 | self.depth = d_model // self.num_heads 43 | 44 | self.wq = tf.keras.layers.Dense(d_model) 45 | self.wk = tf.keras.layers.Dense(d_model) 46 | self.wv = tf.keras.layers.Dense(d_model) 47 | 48 | self.dense = tf.keras.layers.Dense(d_model) 49 | 50 | def split_heads(self, x, batch_size): 51 | """Split the last dimension into (num_heads, depth). 
52 | Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth) 53 | """ 54 | x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth)) 55 | return tf.transpose(x, perm=[0, 2, 1, 3]) 56 | 57 | def call(self, v, k, q, mask,adjoin_matrix): 58 | batch_size = tf.shape(q)[0] 59 | 60 | q = self.wq(q) 61 | k = self.wk(k) 62 | v = self.wv(v) 63 | 64 | q = self.split_heads(q, batch_size) 65 | k = self.split_heads(k, batch_size) 66 | v = self.split_heads(v, batch_size) 67 | 68 | scaled_attention, attention_weights = scaled_dot_product_attention( 69 | q, k, v, mask,adjoin_matrix) 70 | 71 | scaled_attention = tf.transpose(scaled_attention, 72 | perm=[0, 2, 1, 3]) 73 | 74 | concat_attention = tf.reshape(scaled_attention, 75 | (batch_size, -1, self.d_model)) 76 | 77 | output = self.dense(concat_attention) 78 | 79 | return output, attention_weights 80 | 81 | def point_wise_feed_forward_network(d_model, dff): 82 | return tf.keras.Sequential([ 83 | tf.keras.layers.Dense(dff, activation=gelu), 84 | tf.keras.layers.Dense(d_model) 85 | ]) 86 | 87 | 88 | class EncoderLayer(tf.keras.layers.Layer): 89 | def __init__(self, d_model, num_heads, dff, rate=0.1): 90 | super(EncoderLayer, self).__init__() 91 | 92 | self.mha = MultiHeadAttention(d_model, num_heads) 93 | self.ffn = point_wise_feed_forward_network(d_model, dff) 94 | 95 | self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6) 96 | self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6) 97 | 98 | self.dropout1 = tf.keras.layers.Dropout(rate) 99 | self.dropout2 = tf.keras.layers.Dropout(rate) 100 | 101 | def call(self, x, training, mask,adjoin_matrix): 102 | attn_output, attention_weights = self.mha(x, x, x, mask,adjoin_matrix) 103 | attn_output = self.dropout1(attn_output, training=training) 104 | out1 = self.layernorm1(x + attn_output) 105 | 106 | ffn_output = self.ffn(out1) 107 | ffn_output = self.dropout2(ffn_output, training=training) 108 | out2 = self.layernorm2(out1 + ffn_output) 109 | 110 | return out2,attention_weights 111 | 112 | 113 | 114 | class Encoder(tf.keras.Model): 115 | def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, 116 | maximum_position_encoding, rate=0.1): 117 | super(Encoder, self).__init__() 118 | 119 | self.d_model = d_model 120 | self.num_layers = num_layers 121 | 122 | self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model) 123 | 124 | self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) 125 | for _ in range(num_layers)] 126 | 127 | self.dropout = tf.keras.layers.Dropout(rate) 128 | 129 | def call(self, x, training, mask,adjoin_matrix): 130 | seq_len = tf.shape(x)[1] 131 | adjoin_matrix = adjoin_matrix[:,tf.newaxis,:,:] 132 | x = self.embedding(x) 133 | x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32)) 134 | 135 | x = self.dropout(x, training=training) 136 | 137 | for i in range(self.num_layers): 138 | x,attention_weights = self.enc_layers[i](x, training, mask,adjoin_matrix) 139 | return x 140 | 141 | class Encoder_test(tf.keras.Model): 142 | def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, 143 | maximum_position_encoding, rate=0.1): 144 | super(Encoder_test, self).__init__() 145 | 146 | self.d_model = d_model 147 | self.num_layers = num_layers 148 | 149 | self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model) 150 | 151 | self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) 152 | for _ in range(num_layers)] 153 | 154 | self.dropout = tf.keras.layers.Dropout(rate) 155 | 156 | 
def call(self, x, training, mask,adjoin_matrix): 157 | seq_len = tf.shape(x)[1] 158 | adjoin_matrix = adjoin_matrix[:,tf.newaxis,:,:] 159 | x = self.embedding(x) 160 | x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32)) 161 | 162 | x = self.dropout(x, training=training) 163 | attention_weights_list = [] 164 | xs = [] 165 | 166 | for i in range(self.num_layers): 167 | x,attention_weights = self.enc_layers[i](x, training, mask,adjoin_matrix) 168 | attention_weights_list.append(attention_weights) 169 | xs.append(x) 170 | 171 | return x,attention_weights_list,xs 172 | 173 | class BertModel_test(tf.keras.Model): 174 | def __init__(self,num_layers = 6,d_model = 256,dff = 512,num_heads = 8,vocab_size = 18,dropout_rate = 0.1): 175 | super(BertModel_test, self).__init__() 176 | self.encoder = Encoder_test(num_layers=num_layers,d_model=d_model, 177 | num_heads=num_heads,dff=dff,input_vocab_size=vocab_size,maximum_position_encoding=200,rate=dropout_rate) 178 | self.fc1 = tf.keras.layers.Dense(d_model, activation=gelu) 179 | self.layernorm = tf.keras.layers.LayerNormalization(-1) 180 | self.fc2 = tf.keras.layers.Dense(vocab_size) 181 | def call(self,x,adjoin_matrix,mask,training=False): 182 | x,att,xs = self.encoder(x,training=training,mask=mask,adjoin_matrix=adjoin_matrix) 183 | x = self.fc1(x) 184 | x = self.layernorm(x) 185 | x = self.fc2(x) 186 | return x,att,xs 187 | 188 | 189 | class BertModel(tf.keras.Model): 190 | def __init__(self,num_layers = 6,d_model = 256,dff = 512,num_heads = 8,vocab_size = 18,dropout_rate = 0.1): 191 | super(BertModel, self).__init__() 192 | self.encoder = Encoder(num_layers=num_layers,d_model=d_model, 193 | num_heads=num_heads,dff=dff,input_vocab_size=vocab_size,maximum_position_encoding=200,rate=dropout_rate) 194 | self.fc1 = tf.keras.layers.Dense(d_model, activation=gelu) 195 | self.layernorm = tf.keras.layers.LayerNormalization(-1) 196 | self.fc2 = tf.keras.layers.Dense(vocab_size) 197 | 198 | def call(self,x,adjoin_matrix,mask,training=False): 199 | x = self.encoder(x,training=training,mask=mask,adjoin_matrix=adjoin_matrix) 200 | x = self.fc1(x) 201 | x = self.layernorm(x) 202 | x = self.fc2(x) 203 | return x 204 | 205 | 206 | class PredictModel(tf.keras.Model): 207 | def __init__(self,num_layers = 6,d_model = 256,dff = 512,num_heads = 8,vocab_size =18, a=2, dropout_rate = 0.1,dense_dropout=0.1): 208 | super(PredictModel, self).__init__() 209 | self.encoder = Encoder(num_layers=num_layers,d_model=d_model, 210 | num_heads=num_heads,dff=dff,input_vocab_size=vocab_size,maximum_position_encoding=200,rate=dropout_rate) 211 | self.fc1 = tf.keras.layers.Dense(256,activation=tf.keras.layers.LeakyReLU(0.1)) 212 | self.dropout1 = tf.keras.layers.Dropout(dense_dropout) 213 | self.fc2 = tf.keras.layers.Dense(a) 214 | 215 | def call(self,x1,adjoin_matrix1,mask1,x2,adjoin_matrix2,mask2,t1,t2,t3,t4,training=False): 216 | x1 = self.encoder(x1,training=training,mask=mask1,adjoin_matrix=adjoin_matrix1) 217 | x1 = x1[:,0,:] 218 | x2 = self.encoder(x2,training=False,mask=mask2,adjoin_matrix=adjoin_matrix2) 219 | x2 = x2[:,0,:] 220 | x = tf.concat([x1,x2], axis=1) 221 | x = tf.concat([x, t1], axis=1) 222 | x = tf.concat([x, t2], axis=1) 223 | x = tf.concat([x, t3], axis=1) 224 | x = tf.concat([x, t4], axis=1) 225 | x = self.fc1(x) 226 | x = self.dropout1(x,training=training) 227 | x = self.fc2(x) 228 | return x 229 | -------------------------------------------------------------------------------- /py37.yaml: 
-------------------------------------------------------------------------------- 1 | name: ADCNet 2 | channels: 3 | - rdkit 4 | - openbabel 5 | - anaconda 6 | - defaults 7 | dependencies: 8 | - _libgcc_mutex=0.1=main 9 | - _openmp_mutex=5.1=1_gnu 10 | - blas=1.0=openblas 11 | - bottleneck=1.3.5=py37h7deecbd_0 12 | - brotli=1.0.9=h5eee18b_7 13 | - brotli-bin=1.0.9=h5eee18b_7 14 | - bzip2=1.0.8=h7b6447c_0 15 | - ca-certificates=2023.08.22=h06a4308_0 16 | - cairo=1.14.12=h8948797_3 17 | - certifi=2022.12.7=py37h06a4308_0 18 | - cudatoolkit=10.1.243=h6bb024c_0 19 | - cudnn=7.6.5=cuda10.1_0 20 | - cycler=0.11.0=pyhd3eb1b0_0 21 | - daal4py=2021.5.0=py37h78b71dc_0 22 | - dal=2021.5.1=h06a4308_803 23 | - dbus=1.13.18=hb2f20db_0 24 | - expat=2.4.4=h295c915_0 25 | - fontconfig=2.13.1=h6c09931_0 26 | - freetype=2.11.0=h70c0345_0 27 | - giflib=5.2.1=h7b6447c_0 28 | - glib=2.69.1=h4ff587b_1 29 | - gst-plugins-base=1.14.0=hbbd80ab_1 30 | - gstreamer=1.14.1=h5eee18b_1 31 | - icu=58.2=he6710b0_3 32 | - intel-openmp=2021.4.0=h06a4308_3561 33 | - joblib=1.1.0=pyhd3eb1b0_0 34 | - jpeg=9e=h7f8727e_0 35 | - lcms2=2.12=h3be6417_0 36 | - ld_impl_linux-64=2.38=h1181459_1 37 | - libboost=1.67.0=h46d08c1_4 38 | - libbrotlicommon=1.0.9=h5eee18b_7 39 | - libbrotlidec=1.0.9=h5eee18b_7 40 | - libbrotlienc=1.0.9=h5eee18b_7 41 | - libedit=3.1.20210910=h7f8727e_0 42 | - libffi=3.3=he6710b0_2 43 | - libgcc-ng=11.2.0=h1234567_1 44 | - libgfortran-ng=7.5.0=ha8ba4b0_17 45 | - libgfortran4=7.5.0=ha8ba4b0_17 46 | - libgomp=11.2.0=h1234567_1 47 | - libopenblas=0.3.18=hf726d26_0 48 | - libpng=1.6.37=hbc83047_0 49 | - libstdcxx-ng=11.2.0=h1234567_1 50 | - libtiff=4.2.0=h2818925_1 51 | - libuuid=1.0.3=h7f8727e_2 52 | - libwebp=1.2.2=h55f646e_0 53 | - libwebp-base=1.2.2=h7f8727e_0 54 | - libxcb=1.15=h7f8727e_0 55 | - libxml2=2.9.14=h74e7548_0 56 | - lz4-c=1.9.3=h295c915_1 57 | - matplotlib-base=3.4.3=py37hbbc1b5f_0 58 | - mkl=2021.4.0=h06a4308_640 59 | - mkl-service=2.4.0=py37h7f8727e_0 60 | - mpi=1.0=mpich 61 | - mpich=3.3.2=hc856adb_0 62 | - munkres=1.1.4=py_0 63 | - ncurses=6.3=h5eee18b_3 64 | - numpy-base=1.18.5=py37h2f8d375_0 65 | - openbabel=2.4.1=py37_6 66 | - openssl=1.1.1w=h7f8727e_0 67 | - packaging=21.3=pyhd3eb1b0_0 68 | - pcre=8.45=h295c915_0 69 | - pillow=9.2.0=py37hace64e9_1 70 | - pip=22.1.2=py37h06a4308_0 71 | - pixman=0.40.0=h7f8727e_1 72 | - py-boost=1.67.0=py37h04863e7_4 73 | - pyparsing=3.0.4=pyhd3eb1b0_0 74 | - pyqt=5.9.2=py37h05f1152_2 75 | - python=3.7.13=h12debd9_0 76 | - python-dateutil=2.8.2=pyhd3eb1b0_0 77 | - pytz=2022.1=py37h06a4308_0 78 | - qt=5.9.7=h5867ecd_1 79 | - rdkit=2020.03.2.0=py37hc20afe1_1 80 | - readline=8.2=h5eee18b_0 81 | - scikit-learn=1.0.2=py37h51133e4_1 82 | - scikit-learn-intelex=2021.5.0=py37h06a4308_0 83 | - setuptools=61.2.0=py37h06a4308_0 84 | - sip=4.19.8=py37hf484d3e_0 85 | - six=1.16.0=pyhd3eb1b0_1 86 | - sqlite=3.39.3=h5082296_0 87 | - tbb=2021.5.0=hd09550d_0 88 | - threadpoolctl=2.2.0=pyh0d69192_0 89 | - tk=8.6.12=h1ccaba5_0 90 | - tornado=6.1=py37h27cfd23_0 91 | - typing_extensions=4.3.0=py37h06a4308_0 92 | - wheel=0.37.1=pyhd3eb1b0_0 93 | - xz=5.2.5=h7f8727e_1 94 | - zlib=1.2.12=h7f8727e_2 95 | - zstd=1.5.2=ha4553b6_0 96 | - pip: 97 | - absl-py==1.2.0 98 | - astunparse==1.6.3 99 | - cachetools==5.2.0 100 | - charset-normalizer==2.1.0 101 | - cloudpickle==2.1.0 102 | - et-xmlfile==1.1.0 103 | - fonttools==4.34.4 104 | - future==0.18.2 105 | - gast==0.3.3 106 | - google-auth==2.10.0 107 | - google-auth-oauthlib==0.4.6 108 | - google-pasta==0.2.0 109 | - grpcio==1.47.0 110 | - 
h5py==2.10.0 111 | - hyperopt==0.2.7 112 | - idna==3.3 113 | - importlib-metadata==4.12.0 114 | - keras-preprocessing==1.1.2 115 | - kiwisolver==1.4.4 116 | - markdown==3.4.1 117 | - markupsafe==2.1.1 118 | - matplotlib==3.5.2 119 | - networkx==2.6.3 120 | - numpy==1.18.5 121 | - nvidia-cublas-cu11==11.10.3.66 122 | - nvidia-cuda-nvrtc-cu11==11.7.99 123 | - nvidia-cuda-runtime-cu11==11.7.99 124 | - nvidia-cudnn-cu11==8.5.0.96 125 | - oauthlib==3.2.0 126 | - openpyxl==3.1.2 127 | - opt-einsum==3.3.0 128 | - pandas==1.3.5 129 | - protobuf==3.19.4 130 | - py4j==0.10.9.7 131 | - pyasn1==0.4.8 132 | - pyasn1-modules==0.2.8 133 | - requests==2.28.1 134 | - requests-oauthlib==1.3.1 135 | - rsa==4.9 136 | - scipy==1.4.1 137 | - tensorboard==2.9.1 138 | - tensorboard-data-server==0.6.1 139 | - tensorboard-plugin-wit==1.8.1 140 | - tensorflow==2.3.0 141 | - tensorflow-estimator==2.3.0 142 | - tensorflow-gpu==2.3.0 143 | - termcolor==1.1.0 144 | - torch==1.13.1 145 | - tqdm==4.64.0 146 | - urllib3==1.26.11 147 | - werkzeug==2.2.1 148 | - wrapt==1.14.1 149 | - zipp==3.8.1 150 | prefix: /share/home/yyzh/anaconda3/envs/ADCNet 151 | -------------------------------------------------------------------------------- /t_data.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idrugLab/ADCNet/40ea7ec87cc820893f400d90710f70e6bafc1209/t_data.xlsx -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | from rdkit import Chem 2 | from rdkit.Chem import rdmolfiles, rdmolops 3 | import numpy as np 4 | import openbabel as ob 5 | import os 6 | import csv 7 | from rdkit import RDConfig 8 | from rdkit.Chem import FragmentCatalog 9 | 10 | def fg_list(): # 47 FGs list 11 | fName=os.path.join(RDConfig.RDDataDir,'FunctionalGroups.txt') 12 | fparams = FragmentCatalog.FragCatParams(1,6,fName) 13 | fg_list = [] 14 | for i in range(fparams.GetNumFuncGroups()): 15 | fg_list.append(fparams.GetFuncGroup(i)) 16 | fg_list.pop(27) 17 | 18 | x = [Chem.MolToSmiles(_) for _ in fg_list]+['*C=C','*F','*Cl','*Br','*I','[Na+]','*P','*P=O','*[Se]','*[Si]'] 19 | y = set(x) 20 | return list(y) 21 | 22 | def obsmitosmile(smi): 23 | conv = ob.OBConversion() 24 | conv.SetInAndOutFormats("smi", "can") 25 | conv.SetOptions("K", conv.OUTOPTIONS) 26 | mol = ob.OBMol() 27 | conv.ReadString(mol, smi) 28 | smile = conv.WriteString(mol) 29 | smile = smile.replace('\t\n', '') 30 | return smile 31 | 32 | def molecular_fg(smiles): # Getting functional groups (including rings) in molecules 33 | 34 | mol = Chem.MolFromSmiles(smiles) 35 | if mol is None: 36 | print('error') 37 | mol = Chem.MolFromSmiles(obsmitosmile(smiles)) 38 | assert mol is not None, smiles + ' is not valid ' 39 | a = fg_list() 40 | 41 | 42 | ssr = Chem.GetSymmSSSR(mol) 43 | num_ring = len(ssr) 44 | ring_dict = {} 45 | for i in range(num_ring): 46 | ring_dict[i+1] = list(ssr[i]) 47 | 48 | f_g_list = [] 49 | for i in ring_dict.values(): 50 | f_g_list.append(i) 51 | 52 | for i in a: 53 | patt = Chem.MolFromSmarts(i) 54 | flag = mol.HasSubstructMatch(patt) 55 | if flag: 56 | atomids = mol.GetSubstructMatches(patt) 57 | for atomid in atomids: 58 | f_g_list.append(list(atomid)) 59 | return f_g_list 60 | 61 | 62 | def smiles2adjoin(smiles,explicit_hydrogens=True,canonical_atom_order=False): # Converting molecules in SMILES format to atom lists and adjacency matrices 63 | 64 | mol = Chem.MolFromSmiles(smiles) 
65 | if mol is None: 66 | print('error') 67 | mol = Chem.MolFromSmiles(obsmitosmile(smiles)) 68 | assert mol is not None, smiles + ' is not valid ' 69 | 70 | if explicit_hydrogens: 71 | mol = Chem.AddHs(mol) 72 | else: 73 | mol = Chem.RemoveHs(mol) 74 | 75 | if canonical_atom_order: 76 | new_order = rdmolfiles.CanonicalRankAtoms(mol) 77 | mol = rdmolops.RenumberAtoms(mol, new_order) 78 | num_atoms = mol.GetNumAtoms() 79 | 80 | atoms_list = [] 81 | for i in range(num_atoms): 82 | atom = mol.GetAtomWithIdx(i) 83 | atoms_list.append(atom.GetSymbol()) 84 | 85 | 86 | adjoin_matrix = np.eye(num_atoms) 87 | num_bonds = mol.GetNumBonds() 88 | 89 | for i in range(num_bonds): 90 | bond = mol.GetBondWithIdx(i) 91 | u = bond.GetBeginAtomIdx() 92 | v = bond.GetEndAtomIdx() 93 | adjoin_matrix[u,v] = 1.0 94 | adjoin_matrix[v,u] = 1.0 95 | 96 | return atoms_list,adjoin_matrix 97 | 98 | 99 | def get_header(path): 100 | with open(path) as f: 101 | header = next(csv.reader(f)) 102 | 103 | return header 104 | 105 | 106 | def get_task_names(path, use_compound_names=False): 107 | index = 2 if use_compound_names else 1 108 | task_names = get_header(path)[index:] 109 | 110 | return task_names --------------------------------------------------------------------------------
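To illustrate the helpers above, a small usage sketch follows (run from the repository root inside the ADCNet environment from the README, since utils.py imports OpenBabel at module load):
```python
from utils import smiles2adjoin, molecular_fg

# Atom list and adjacency matrix for ethanol, with explicit hydrogens.
atoms, adj = smiles2adjoin('CCO', explicit_hydrogens=True)
print(atoms)        # ['C', 'C', 'O', 'H', 'H', 'H', 'H', 'H', 'H']
print(adj.shape)    # (9, 9) adjacency matrix with self-loops on the diagonal

# Functional groups (including rings) of phenol, as lists of atom indices.
print(molecular_fg('c1ccccc1O'))
```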