├── ADCNet.png ├── ESM-2.py ├── README.md ├── class.py ├── classification_weights ├── ADC_9.h5 └── Explanation ├── data.xlsx ├── dataset.py ├── files ├── Antigen_1280.pkl ├── Heavy_1280.pkl ├── Light_1280.pkl ├── data.xlsx └── x ├── inference.py ├── medium3_weights ├── Explanation └── bert_weightsMedium_20.h5 ├── model.py ├── py37.yaml ├── t_data.xlsx └── utils.py /ADCNet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idrugLab/ADCNet/40ea7ec87cc820893f400d90710f70e6bafc1209/ADCNet.png -------------------------------------------------------------------------------- /ESM-2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import esm 3 | import pandas as pd 4 | import pickle 5 | 6 | # Load ESM-2 model 7 | # model, alphabet = esm.pretrained.esm2_t6_8M_UR50D() 8 | model, alphabet = esm.pretrained.esm2_t33_650M_UR50D() 9 | # model, alphabet = esm.pretrained.esm2_t36_3B_UR50D() 10 | batch_converter = alphabet.get_batch_converter() 11 | model.eval() # disables dropout for deterministic results 12 | 13 | # Prepare data: pair each ADC ID with its sequence (here the antigen; switch the column for heavy/light chains) 14 | 15 | df = pd.read_excel(r'data.xlsx') 16 | 17 | protein = df['Antigen Sequence(64)'].tolist() 18 | adc_ids = df['ADC ID'].tolist() 19 | 20 | datas = [] 21 | for i in range(len(protein)): 22 | datas.append((adc_ids[i], protein[i])) 23 | 24 | sequence_representations = [] 25 | 26 | for data in datas: 27 | # print(data) 28 | try: 29 | batch_labels, batch_strs, batch_tokens = batch_converter([data]) 30 | except Exception as e: 31 | print(data) 32 | batch_lens = (batch_tokens != alphabet.padding_idx).sum(1) 33 | 34 | # Extract per-residue representations (on CPU) 35 | with torch.no_grad(): 36 | results = model(batch_tokens, repr_layers=[33], return_contacts=True) 37 | 38 | token_representations = results["representations"][33] 39 | 40 | # Generate per-sequence representations via averaging 41 | # NOTE: token 0 is always a beginning-of-sequence token, so the first residue is token 1. 42 | 43 | for i, tokens_len in enumerate(batch_lens): 44 | sequence_representations.append(token_representations[i, 1 : tokens_len - 1].mean(0)) 45 | esm_embedding = {} 46 | for i in range(len(sequence_representations)): 47 | esm_embedding[datas[i][0]] = sequence_representations[i] 48 | print(sequence_representations[0].shape) 49 | print(len(esm_embedding)) 50 | 51 | file_path = 'Antigen.pkl' 52 | with open(file_path, 'wb') as file: 53 | pickle.dump(esm_embedding, file) 54 | 55 | # Load the saved embedding dictionary back as a sanity check 56 | with open(file_path, 'rb') as file: 57 | loaded_dict = pickle.load(file) 58 | 59 | # Look at the unsupervised self-attention map contact predictions 60 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ADCNet 2 | A semi-supervised learning framework for antibody-drug conjugate (ADC) property prediction. 3 | ![image](https://github.com/idrugLab/ADCNet/blob/main/ADCNet.png) 4 | # Description of the documents 5 | py37.yaml lists the package versions of the installed environment. The files folder contains the antibody heavy-chain, light-chain, and antigen embeddings, together with the ADC dataset.
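For orientation, each of the pickled embedding files is a dictionary mapping ADC IDs to 1280-dimensional ESM-2 embeddings. A minimal sketch of how to inspect one of them (file names as shipped in files/; loading requires torch, since the values were pickled as PyTorch tensors):
```python
import pickle

# Inspect one of the pre-computed ESM-2 embedding dictionaries in files/
# (adjust the path if you regenerated the embeddings yourself with ESM-2.py).
with open('files/Heavy_1280.pkl', 'rb') as f:
    heavy = pickle.load(f)

adc_id, emb = next(iter(heavy.items()))
print(len(heavy))          # number of ADCs
print(adc_id, emb.shape)   # each value is a 1280-dimensional embedding
```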
6 | # Required packages 7 | ## Example of ESM-2 environment installation: 8 | ```ruby 9 | conda create -n esm-2 python==3.9 10 | pip install fair-esm # latest release, OR: 11 | pip install git+https://github.com/facebookresearch/esm.git # bleeding edge, current repo main branch 12 | ``` 13 | 14 | ## Example of ADCNet environment installation: 15 | ```ruby 16 | conda create -n ADCNet python==3.7 17 | pip install tensorflow==2.3 18 | pip install rdkit 19 | pip install numpy 20 | pip install pandas 21 | conda install -c openbabel openbabel 22 | pip install matplotlib 23 | pip install hyperopt 24 | pip install scikit-learn 25 | pip install torch 26 | pip install openpyxl 27 | ``` 28 | 29 | ## Example of obtaining embeddings for antibodies or antigens 30 | ```ruby 31 | conda activate esm-2 32 | python ESM-2.py 33 | ``` 34 | After the run completes, you will find a .pkl file in the current directory. It is a dictionary whose keys are ADC IDs (if there is no ADC ID column, add a column of numerical values to the original data and name it "ADC ID") and whose values are 1280-dimensional tensors. 35 | 36 | ## Example of training ADCNet 37 | First, run ESM-2.py to obtain embeddings for the heavy chain, light chain, and antigen of each antibody. The code saves these embeddings into three .pkl files. 38 | Second, ensure that each data entry contains the DAR value. 39 | Finally, create a folder named "medium3_weights" and place the file "bert_weightsMedium_20.h5" from this repository into that folder. 40 | 41 | ```ruby 42 | conda activate ADCNet 43 | python class.py 44 | ``` 45 | ## Example of using ADCNet for inference 46 | First, run ESM-2.py to obtain embeddings for the heavy chain, light chain, and antigen of each antibody. The code saves these embeddings into three .pkl files. 47 | Second, ensure that each data entry contains the DAR value. 48 | Finally, create a folder named "classification_weights" and place the file "ADC_9.h5" from this repository into that folder. 49 | If you want to reproduce the results of this article, run class.py directly. If you want to search for hyperparameters again, uncomment the code in the hyperparameter search section (lines 254-279) and comment out the original hyperparameters (lines 281-285). 50 | ```ruby 51 | conda activate ADCNet 52 | python inference.py 53 | ``` 54 | ## Using ADCNet for predictions 55 | ```ruby 56 | You can visit https://ADCNet.idruglab.cn to make predictions online.
57 | ``` 58 | 59 | 60 | -------------------------------------------------------------------------------- /class.py: -------------------------------------------------------------------------------- 1 | import sklearn 2 | import tensorflow as tf 3 | import tensorflow.keras as keras 4 | import numpy as np 5 | from dataset import Graph_Classification_Dataset 6 | import os 7 | import pandas as pd 8 | from model import PredictModel, BertModel 9 | from sklearn.metrics import roc_auc_score,confusion_matrix,precision_recall_curve,auc 10 | from hyperopt import fmin, tpe, hp 11 | from utils import get_task_names 12 | from tensorflow.python.client import device_lib 13 | from sklearn.preprocessing import StandardScaler 14 | import pickle 15 | import math 16 | import csv 17 | 18 | os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true" 19 | keras.backend.clear_session() 20 | os.environ['TF_DETERMINISTIC_OPS'] = '1' 21 | 22 | def count_parameters(model): 23 | total_params = 0 24 | for variable in model.trainable_variables: 25 | shape = variable.shape 26 | params = 1 27 | for dim in shape: 28 | params *= dim 29 | total_params += params 30 | return total_params 31 | 32 | def cover_dict(path): 33 | file_path = path 34 | with open(file_path, 'rb') as file: 35 | data = pickle.load(file) 36 | tensor_dict = {key: tf.constant(value) for key, value in data.items()} 37 | new_data = {i: value for i, (key, value) in enumerate(tensor_dict.items())} 38 | return new_data 39 | 40 | def score(y_test, y_pred): 41 | auc_roc_score = roc_auc_score(y_test, y_pred) 42 | prec, recall, _ = precision_recall_curve(y_test, y_pred) 43 | prauc = auc(recall, prec) 44 | y_pred_print = [round(y, 0) for y in y_pred] 45 | tn, fp, fn, tp = confusion_matrix(y_test, y_pred_print).ravel() 46 | se = tp / (tp + fn) 47 | sp = tn / (tn + fp) 48 | acc = (tp + tn) / (tp + fn + tn + fp) 49 | mcc = (tp * tn - fn * fp) / math.sqrt((tp + fn) * (tp + fp) * (tn + fn) * (tn + fp)) 50 | P = tp / (tp + fp) 51 | F1 = (P * se * 2) / (P + se) 52 | BA = (se + sp) / 2 53 | PPV = tp / (tp + fp) 54 | NPV = tn / (fn + tn) 55 | return tp, tn, fn, fp, se, sp, mcc, acc, auc_roc_score, F1, BA, prauc, PPV, NPV 56 | 57 | def DAR_feature(file_path, column_name): 58 | df = pd.read_excel(file_path) 59 | column_data = df[column_name].values.reshape(-1, 1) 60 | scaler = StandardScaler() 61 | column_data_standardized = scaler.fit_transform(column_data) 62 | column_data_normalized = tf.keras.utils.normalize(column_data_standardized, axis=0).flatten() 63 | data_dict = {index: tf.constant(value, dtype=tf.float32) for index, value in zip(df.index, column_data_normalized)} 64 | return data_dict 65 | 66 | def process_list(input_list): 67 | input_list.append(np.mean(input_list)) 68 | mean_value = np.mean(input_list[:-1]) 69 | std_value = np.std(input_list[:-1], ddof=0) 70 | mean_range = f'{mean_value:.4f} ± {std_value:.4f}' 71 | input_list[-1] = mean_range 72 | print(input_list) 73 | return input_list 74 | 75 | def extract_tensors(index, heavy_dict, light_dict, antigen_dict, dar_dict): 76 | heavy_tensor_list = [] 77 | light_tensor_list = [] 78 | antigen_tensor_list = [] 79 | DAR_tensor_list = [] 80 | 81 | for i in index.numpy(): 82 | heavy_tensor_list.append(heavy_dict[i[0]]) 83 | light_tensor_list.append(light_dict[i[0]]) 84 | antigen_tensor_list.append(antigen_dict[i[0]]) 85 | DAR_tensor_list.append(dar_dict[i[0]]) 86 | t1 = np.vstack(heavy_tensor_list) 87 | t2 = np.vstack(light_tensor_list) 88 | t3 = np.vstack(antigen_tensor_list) 89 | t4 = np.vstack(DAR_tensor_list) 90 | 91 | return t1, 
t2, t3, t4 92 | 93 | Heavy_dict = cover_dict('Heavy_1280.pkl') 94 | Light_dict = cover_dict('Light_1280.pkl') 95 | Antigen_dict = cover_dict('Antigen_1280.pkl') 96 | DAR_dict = DAR_feature('data.xlsx', 'DAR_val') 97 | 98 | def main(seed, args): 99 | 100 | task = 'ADC' 101 | idx = ['index'] 102 | label = ['label(100nm)'] 103 | 104 | arch = {'name': 'Medium', 'path': 'medium3_weights'} 105 | pretraining = True 106 | pretraining_str = 'pretraining' if pretraining else '' 107 | trained_epoch = 20 108 | num_layers = 6 109 | d_model = 256 110 | addH = True 111 | dff = d_model * 2 112 | vocab_size = 18 113 | 114 | num_heads = args['num_heads'] 115 | dense_dropout = args['dense_dropout'] 116 | learning_rate = args['learning_rate'] 117 | batch_size = args['batch_size'] 118 | seed = seed 119 | np.random.seed(seed=seed) 120 | tf.random.set_seed(seed=seed) 121 | train_dataset, test_dataset, val_dataset = Graph_Classification_Dataset('data.xlsx', 122 | smiles_field1='Payload Isosmiles', 123 | smiles_field2='Linker Isosmiles', 124 | label_field=label, 125 | index_field=idx, 126 | seed=seed, 127 | batch_size=batch_size, 128 | a = len(label), 129 | addH=addH).get_data() 130 | 131 | x1, adjoin_matrix1, y, x2, adjoin_matrix2, index = next(iter(train_dataset.take(1))) 132 | 133 | seq1 = tf.cast(tf.math.equal(x1, 0), tf.float32) 134 | seq2 = tf.cast(tf.math.equal(x2, 0), tf.float32) 135 | mask1 = seq1[:, tf.newaxis, tf.newaxis, :] 136 | mask2 = seq2[:, tf.newaxis, tf.newaxis, :] 137 | t1, t2, t3, t4 = extract_tensors(index, Heavy_dict, Light_dict, Antigen_dict, DAR_dict) 138 | model = PredictModel(num_layers=num_layers, 139 | d_model=d_model, 140 | dff=dff, 141 | num_heads=num_heads, 142 | vocab_size=vocab_size, 143 | a=len(label), 144 | dense_dropout = dense_dropout) 145 | 146 | if pretraining: 147 | temp = BertModel(num_layers=num_layers, d_model=d_model, 148 | dff=dff, num_heads=num_heads, vocab_size=vocab_size) 149 | 150 | pred = temp(x1, mask=mask1, training=True, adjoin_matrix=adjoin_matrix1) 151 | temp.load_weights( 152 | arch['path']+'/bert_weights{}_{}.h5'.format(arch['name'], trained_epoch)) 153 | temp.encoder.save_weights( 154 | arch['path']+'/bert_weights_encoder{}_{}.h5'.format(arch['name'], trained_epoch)) 155 | del temp 156 | 157 | pred = model(x1=x1, mask1=mask1, training=True, adjoin_matrix1=adjoin_matrix1, x2=x2,mask2=mask2, adjoin_matrix2=adjoin_matrix2, t1=t1,t2=t2,t3=t3,t4=t4) 158 | 159 | model.encoder.load_weights( 160 | arch['path']+'/bert_weights_encoder{}_{}.h5'.format(arch['name'], trained_epoch)) 161 | print('load_wieghts') 162 | 163 | total_params = count_parameters(model) 164 | 165 | print('*'*100) 166 | print("Total Parameters:", total_params) 167 | print('*'*100) 168 | 169 | optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate) 170 | 171 | auc = -10 172 | stopping_monitor = 0 173 | for epoch in range(200): 174 | loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=True) 175 | for x1, adjoin_matrix1, y, x2, adjoin_matrix2, index in train_dataset: 176 | t1, t2, t3, t4 = extract_tensors(index, Heavy_dict, Light_dict, Antigen_dict, DAR_dict) 177 | with tf.GradientTape() as tape: 178 | seq1 = tf.cast(tf.math.equal(x1, 0), tf.float32) 179 | mask1 = seq1[:, tf.newaxis, tf.newaxis, :] 180 | seq2 = tf.cast(tf.math.equal(x2, 0), tf.float32) 181 | mask2 = seq2[:, tf.newaxis, tf.newaxis, :] 182 | preds = model(x1=x1, mask1=mask1,training=True,adjoin_matrix1=adjoin_matrix1, x2=x2, mask2=mask2, adjoin_matrix2=adjoin_matrix2,t1=t1,t2=t2,t3=t3,t4=t4) 183 | loss = 
loss_object(y,preds) 184 | grads = tape.gradient(loss, model.trainable_variables) 185 | optimizer.apply_gradients(zip(grads, model.trainable_variables)) 186 | print('epoch: ', epoch, 'loss: {:.4f}'.format(loss.numpy().item())) 187 | 188 | y_true = [] 189 | y_preds = [] 190 | for x1, adjoin_matrix1, y, x2, adjoin_matrix2, index in val_dataset: 191 | t1, t2, t3, t4 = extract_tensors(index, Heavy_dict, Light_dict, Antigen_dict, DAR_dict) 192 | seq1 = tf.cast(tf.math.equal(x1, 0), tf.float32) 193 | mask1 = seq1[:, tf.newaxis, tf.newaxis, :] 194 | seq2 = tf.cast(tf.math.equal(x2, 0), tf.float32) 195 | mask2 = seq2[:, tf.newaxis, tf.newaxis, :] 196 | preds = model(x1=x1, mask1=mask1,training=False,adjoin_matrix1=adjoin_matrix1, x2=x2, mask2=mask2, adjoin_matrix2=adjoin_matrix2,t1=t1,t2=t2,t3=t3,t4=t4) 197 | y_label = y 198 | y_pred = preds 199 | y_true.append(y_label) 200 | y_preds.append(y_pred) 201 | y_true = np.concatenate(y_true,axis=0).reshape(-1) 202 | y_preds = np.concatenate(y_preds,axis=0).reshape(-1) 203 | y_preds = tf.sigmoid(y_preds).numpy() 204 | auc_new = roc_auc_score(y_true,y_preds) 205 | 206 | print('val auc:{:.4f}'.format(auc_new)) 207 | if auc_new> auc: 208 | auc = auc_new 209 | stopping_monitor = 0 210 | np.save('{}/{}{}{}{}{}'.format(arch['path'], task, seed, arch['name'], trained_epoch, trained_epoch, pretraining_str), 211 | [y_true, y_preds]) 212 | model.save_weights('classification_weights/{}_{}.h5'.format(task, seed)) 213 | print('save model weights') 214 | else: 215 | stopping_monitor += 1 216 | print('best val auc: {:.4f}'.format(auc)) 217 | if stopping_monitor > 0: 218 | print('stopping_monitor:', stopping_monitor) 219 | if stopping_monitor > 30: 220 | break 221 | 222 | y_true = [] 223 | y_preds = [] 224 | model.load_weights('classification_weights/{}_{}.h5'.format(task, seed)) 225 | for x1, adjoin_matrix1, y, x2, adjoin_matrix2, index in test_dataset: 226 | t1, t2, t3, t4 = extract_tensors(index, Heavy_dict, Light_dict, Antigen_dict, DAR_dict) 227 | seq1 = tf.cast(tf.math.equal(x1, 0), tf.float32) 228 | mask1 = seq1[:, tf.newaxis, tf.newaxis, :] 229 | seq2 = tf.cast(tf.math.equal(x2, 0), tf.float32) 230 | mask2 = seq2[:, tf.newaxis, tf.newaxis, :] 231 | preds = model(x1=x1, mask1=mask1,training=False,adjoin_matrix1=adjoin_matrix1, x2=x2, mask2=mask2, adjoin_matrix2=adjoin_matrix2,t1=t1,t2=t2,t3=t3,t4=t4) 232 | y_label = y 233 | y_pred = preds 234 | y_true.append(y_label) 235 | y_preds.append(y_pred) 236 | 237 | y_true = np.concatenate(y_true, axis=0).reshape(-1) 238 | y_preds = np.concatenate(y_preds, axis=0).reshape(-1) 239 | y_preds = tf.sigmoid(y_preds).numpy() 240 | test_auc = roc_auc_score(y_true, y_preds) 241 | 242 | tp, tn, fn, fp, se, sp, mcc, acc, auc_roc_score, F1, BA, prauc, PPV, NPV = score(y_true, y_preds) 243 | print('test auc:{:.4f}'.format(test_auc)) 244 | 245 | return test_auc,tp, tn, fn, fp, se, sp, mcc, acc, auc_roc_score, F1, BA, prauc, PPV, NPV 246 | 247 | space = {"dense_dropout": hp.quniform("dense_dropout", 0, 0.5, 0.05), 248 | "learning_rate": hp.loguniform("learning_rate", np.log(3e-5), np.log(15e-5)), 249 | "batch_size": hp.choice("batch_size", [16,32,48,64]), 250 | "num_heads": hp.choice("num_heads", [4,8]), 251 | } 252 | 253 | # Hyperparametric search 254 | # def hy_main(args): 255 | # test_auc_list = [] 256 | # x = 0 257 | # for seed in [2, 8, 9]: 258 | # print(seed) 259 | # test_auc,tp, tn, fn, fp, se, sp, mcc, acc, auc_roc_score, F1, BA, prauc, PPV, NPV = main(seed, args) 260 | # test_auc_list.append(test_auc) 261 | # x+= test_auc 262 
| # test_auc_list.append(np.mean(test_auc_list)) 263 | # print(test_auc_list) 264 | # print(args["dense_dropout"]) 265 | # print(args["learning_rate"]) 266 | # print(args["batch_size"]) 267 | # print(args["num_heads"]) 268 | # return -x/3 269 | 270 | # best = fmin(hy_main, space, algo = tpe.suggest, max_evals= 30) 271 | # print(best) 272 | 273 | # best_dict = {} 274 | # a = [16,32,48,64] 275 | # b = [4, 8] 276 | # best_dict["dense_dropout"] = best["dense_dropout"] 277 | # best_dict["learning_rate"] = best["learning_rate"] 278 | # best_dict["batch_size"] = a[best["batch_size"]] 279 | # best_dict["num_heads"] = b[best["num_heads"]] 280 | 281 | best_dict = {} 282 | best_dict["dense_dropout"] = 0.30000000000000004 283 | best_dict["learning_rate"] = 5.5847758199523973e-05 284 | best_dict["batch_size"] = 32 285 | best_dict["num_heads"] = 8 286 | print(best_dict) 287 | 288 | if __name__ == '__main__': 289 | test_auc_list = [] 290 | tp_l, tn_l, fn_l, fp_l, se_l, sp_l, mcc_l, acc_l, auc_roc_score_l, F1_l, BA_l, prauc_l, PPV_l, NPV_l = [],[],[],[],[],[],[],[],[],[],[],[],[],[] 291 | lists_to_process = [tp_l, tn_l, fn_l, fp_l, se_l, sp_l, mcc_l, acc_l, auc_roc_score_l, F1_l, BA_l, prauc_l, PPV_l, NPV_l] 292 | for seed in [2,8,9]: 293 | print(seed) 294 | test_auc,tp, tn, fn, fp, se, sp, mcc, acc, auc_roc_score, F1, BA, prauc, PPV, NPV = main(seed, best_dict) 295 | test_auc_list.append(test_auc) 296 | tp_l.append(tp) 297 | tn_l.append(tn) 298 | fn_l.append(fn) 299 | fp_l.append(fp) 300 | se_l.append(se) 301 | sp_l.append(sp) 302 | mcc_l.append(mcc) 303 | acc_l.append(acc) 304 | auc_roc_score_l.append(auc_roc_score) 305 | F1_l.append(F1) 306 | BA_l.append(BA) 307 | prauc_l.append(prauc) 308 | PPV_l.append(PPV) 309 | NPV_l.append(NPV) 310 | test_auc_list.append(np.mean(test_auc_list)) 311 | tp_l.append(np.mean(tp_l)) 312 | tn_l.append(np.mean(tn_l)) 313 | fn_l.append(np.mean(fn_l)) 314 | fp_l.append(np.mean(fp_l)) 315 | se_l.append(np.mean(se_l)) 316 | sp_l.append(np.mean(sp_l)) 317 | mcc_l.append(np.mean(mcc_l)) 318 | acc_l.append(np.mean(acc_l)) 319 | auc_roc_score_l.append(np.mean(auc_roc_score_l)) 320 | F1_l.append(np.mean(F1_l)) 321 | BA_l.append(np.mean(BA_l)) 322 | prauc_l.append(np.mean(prauc_l)) 323 | PPV_l.append(np.mean(PPV_l)) 324 | NPV_l.append(np.mean(NPV_l)) 325 | 326 | for i in range(len(lists_to_process)): 327 | lists_to_process[i] = process_list(lists_to_process[i]) 328 | filename = 'ADCNet_output.csv' 329 | column_names = ['tp', 'tn', 'fn', 'fp', 'se', 'sp', 'mcc', 'acc', 'auc', 'F1', 'BA', 'prauc','PPV', 'NPV'] 330 | rows = zip(tp_l, tn_l, fn_l, fp_l, se_l, sp_l, mcc_l, acc_l, auc_roc_score_l, F1_l, BA_l, prauc_l, PPV_l, NPV_l) 331 | with open(filename, mode='w', newline='') as file: 332 | writer = csv.writer(file) 333 | writer.writerow(column_names) 334 | writer.writerows(rows) 335 | print(f'CSV file {filename} was successfully written') 336 | 337 | 338 | -------------------------------------------------------------------------------- /classification_weights/ADC_9.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idrugLab/ADCNet/40ea7ec87cc820893f400d90710f70e6bafc1209/classification_weights/ADC_9.h5 -------------------------------------------------------------------------------- /classification_weights/Explanation: -------------------------------------------------------------------------------- 1 | This is a weight file, which is the model's weights fine-tuned using the ADC dataset. 
You can use it to predict the activity of unknown ADCs. 2 | -------------------------------------------------------------------------------- /data.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idrugLab/ADCNet/40ea7ec87cc820893f400d90710f70e6bafc1209/data.xlsx -------------------------------------------------------------------------------- /dataset.py: -------------------------------------------------------------------------------- 1 | from macpath import split 2 | from operator import concat 3 | import re 4 | from cProfile import label 5 | from cgi import test 6 | from tkinter import Label 7 | import pandas as pd 8 | import numpy as np 9 | import tensorflow as tf 10 | from utils import smiles2adjoin, molecular_fg 11 | from rdkit import Chem 12 | from random import Random 13 | from collections import defaultdict 14 | from rdkit.Chem.Scaffolds import MurckoScaffold 15 | from itertools import compress 16 | 17 | str2num = {'<pad>':0 ,'H': 1, 'C': 2, 'N': 3, 'O': 4, 'S': 5, 'F': 6, 'Cl': 7, 'Br': 8, 'P': 9, 18 | 'I': 10,'Na': 11,'B':12,'Se':13,'Si':14,'<unk>':15,'<mask>':16,'<global>':17} 19 | num2str = {i:j for j,i in str2num.items()} 20 | 21 | class Graph_Classification_Dataset(object): # Graph classification task data set processing 22 | def __init__(self,path,smiles_field1='Smiles1',smiles_field2='Smiles2',label_field=label, index_field=label, max_len=500,seed=1,batch_size=16,a=1,addH=True): 23 | if path.endswith('.txt') or path.endswith('.tsv'): 24 | self.df = pd.read_csv(path,sep='\t',encoding='latin1') 25 | elif path.endswith('.xlsx'): 26 | self.df = pd.read_excel(path) 27 | else: 28 | self.df = pd.read_csv(path, encoding='latin1') 29 | self.smiles_field1 = smiles_field1 30 | self.smiles_field2 = smiles_field2 31 | self.label_field = label_field 32 | self.index_field = index_field 33 | self.vocab = str2num 34 | self.devocab = num2str 35 | self.df = self.df[self.df[smiles_field1].str.len() <= max_len] 36 | self.df = self.df[[True if Chem.MolFromSmiles(smi) is not None else False for smi in self.df[smiles_field1]]] 37 | self.seed = seed 38 | self.batch_size = batch_size 39 | self.a = a 40 | self.addH = addH 41 | 42 | def get_data(self): 43 | '''Randomized Split Dataset''' 44 | data = self.df 45 | data = data.fillna(666) 46 | train_idx = [] 47 | idx = data.sample(frac=0.8).index 48 | train_idx.extend(idx) 49 | train_data = data[data.index.isin(train_idx)] 50 | data = data[~data.index.isin(train_idx)] 51 | test_idx = [] 52 | idx = data[~data.index.isin(train_data)].sample(frac=0.5).index 53 | test_idx.extend(idx) 54 | test_data = data[data.index.isin(test_idx)] 55 | val_data = data[~data.index.isin(train_idx+test_idx)] 56 | df_train_data = pd.DataFrame(train_data) 57 | df_test_data = pd.DataFrame(test_data) 58 | df_val_data = pd.DataFrame(val_data) 59 | 60 | self.dataset1 = tf.data.Dataset.from_tensor_slices( 61 | (df_train_data[self.smiles_field1], df_train_data[self.label_field], df_train_data[self.smiles_field2], df_train_data[self.index_field])) 62 | self.dataset1 = self.dataset1.map(self.tf_numerical_smiles).cache().padded_batch(batch_size=self.batch_size, padded_shapes=( 63 | tf.TensorShape([None]), tf.TensorShape([None, None]), tf.TensorShape([self.a]),tf.TensorShape([None]), tf.TensorShape([None, None]), tf.TensorShape([1]))).shuffle(1000).prefetch(50) 64 | 65 | self.dataset2 = tf.data.Dataset.from_tensor_slices((df_test_data[self.smiles_field1], df_test_data[self.label_field], df_test_data[self.smiles_field2],
df_test_data[self.index_field])) 66 | self.dataset2 = self.dataset2.map(self.tf_numerical_smiles).padded_batch(512, padded_shapes=( 67 | tf.TensorShape([None]), tf.TensorShape([None, None]), tf.TensorShape([self.a]),tf.TensorShape([None]), tf.TensorShape([None, None]), tf.TensorShape([1]))).cache().prefetch(100) 68 | 69 | self.dataset3 = tf.data.Dataset.from_tensor_slices((df_val_data[self.smiles_field1], df_val_data[self.label_field], df_val_data[self.smiles_field2], df_val_data[self.index_field])) 70 | self.dataset3 = self.dataset3.map(self.tf_numerical_smiles).padded_batch(512, padded_shapes=( 71 | tf.TensorShape([None]), tf.TensorShape([None, None]), tf.TensorShape([self.a]),tf.TensorShape([None]), tf.TensorShape([None, None]), tf.TensorShape([1]))).cache().prefetch(100) 72 | 73 | return self.dataset1, self.dataset2, self.dataset3 74 | 75 | def numerical_smiles(self, smiles, label): 76 | smiles = smiles.numpy().decode() 77 | atoms_list, adjoin_matrix = smiles2adjoin(smiles,explicit_hydrogens=self.addH) 78 | atoms_list = ['<global>'] + atoms_list 79 | nums_list = [str2num.get(i,str2num['<unk>']) for i in atoms_list] 80 | temp = np.ones((len(nums_list),len(nums_list))) 81 | temp[1:, 1:] = adjoin_matrix 82 | adjoin_matrix = (1-temp)*(-1e9) 83 | x = np.array(nums_list).astype('int64') 84 | y = np.array(label).astype('int64') 85 | return x, adjoin_matrix, y 86 | 87 | def tf_numerical_smiles(self, smiles1, label, smiles2, index): 88 | x1,adjoin_matrix1,y= tf.py_function(self.numerical_smiles, [smiles1,label], [tf.int64, tf.float32 ,tf.int64]) 89 | x1.set_shape([None]) 90 | adjoin_matrix1.set_shape([None,None]) 91 | y.set_shape([None]) 92 | x2,adjoin_matrix2,index = tf.py_function(self.numerical_smiles, [smiles2,index], [tf.int64, tf.float32 ,tf.int64]) 93 | x2.set_shape([None]) 94 | adjoin_matrix2.set_shape([None,None]) 95 | index.set_shape([None]) 96 | return x1, adjoin_matrix1, y, x2,adjoin_matrix2, index 97 | 98 | class Inference_Dataset(object): 99 | def __init__(self,sml_list,max_len=500,addH=True): 100 | self.vocab = str2num 101 | self.devocab = num2str 102 | self.sml_list = [i for i in sml_list if len(i) < max_len] 103 | self.addH = addH 104 | 105 | def get_data(self): 106 | self.dataset = tf.data.Dataset.from_tensor_slices((self.sml_list,)) 107 | self.dataset = self.dataset.map(self.tf_numerical_smiles).padded_batch(64, padded_shapes=( 108 | tf.TensorShape([None]), tf.TensorShape([None, None]), tf.TensorShape([1]), tf.TensorShape([None]))).cache().prefetch(20) 109 | return self.dataset 110 | 111 | def numerical_smiles(self, smiles): 112 | smiles = smiles.numpy().decode() 113 | atoms_list, adjoin_matrix = smiles2adjoin(smiles,explicit_hydrogens=self.addH) 114 | 115 | # prepend the global supernode token used by the graph encoder 116 | atoms_list = ['<global>'] + atoms_list 117 | nums_list = [str2num.get(i,str2num['<unk>']) for i in atoms_list] 118 | temp = np.ones((len(nums_list),len(nums_list))) 119 | temp[1:,1:] = adjoin_matrix 120 | adjoin_matrix = (1-temp)*(-1e9) 121 | x = np.array(nums_list).astype('int64') 122 | return x, adjoin_matrix,[smiles], atoms_list 123 | 124 | def tf_numerical_smiles(self, smiles): 125 | x,adjoin_matrix,smiles,atom_list = tf.py_function(self.numerical_smiles, [smiles], [tf.int64, tf.float32,tf.string, tf.string]) 126 | x.set_shape([None]) 127 | adjoin_matrix.set_shape([None,None]) 128 | smiles.set_shape([1]) 129 | atom_list.set_shape([None]) 130 | return x, adjoin_matrix,smiles,atom_list 131 | 132 | class Inference_Dataset(object): 133 | def __init__(self,sml_list,max_len=500,addH=True): 134 | self.vocab = str2num 135 | self.devocab = num2str 136 | self.sml_list = [i for i in sml_list if len(i) < max_len] 137 | self.addH = addH 138 | 139 | def get_data(self): 140 | self.dataset = tf.data.Dataset.from_tensor_slices((self.sml_list,)) 141 | self.dataset = self.dataset.map(self.tf_numerical_smiles).padded_batch(64, padded_shapes=( 142 | tf.TensorShape([None]), tf.TensorShape([None, None]), tf.TensorShape([1]), tf.TensorShape([None]))).cache().prefetch(20) 143 | return self.dataset 144 | 145 | def numerical_smiles(self, smiles): 146 | smiles = smiles.numpy().decode() 147 | atoms_list, adjoin_matrix = smiles2adjoin(smiles,explicit_hydrogens=self.addH) 148 | 149 | # prepend the global supernode token used by the graph encoder 150 | atoms_list = ['<global>'] + atoms_list 151 | nums_list = [str2num.get(i,str2num['<unk>']) for i in atoms_list] 152 | temp = np.ones((len(nums_list),len(nums_list))) 153 | temp[1:,1:] = adjoin_matrix 154 | adjoin_matrix = (1-temp)*(-1e9) 155 | x = np.array(nums_list).astype('int64') 156 | return x, adjoin_matrix,[smiles], atoms_list 157 | 158 | def tf_numerical_smiles(self, smiles): 159 | x,adjoin_matrix,smiles,atom_list = tf.py_function(self.numerical_smiles, [smiles], [tf.int64, tf.float32,tf.string, tf.string]) 160 | x.set_shape([None]) 161 | 
adjoin_matrix.set_shape([None,None]) 162 | smiles.set_shape([1]) 163 | atom_list.set_shape([None]) 164 | return x, adjoin_matrix,smiles,atom_list 165 | -------------------------------------------------------------------------------- /files/Antigen_1280.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idrugLab/ADCNet/40ea7ec87cc820893f400d90710f70e6bafc1209/files/Antigen_1280.pkl -------------------------------------------------------------------------------- /files/Heavy_1280.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idrugLab/ADCNet/40ea7ec87cc820893f400d90710f70e6bafc1209/files/Heavy_1280.pkl -------------------------------------------------------------------------------- /files/Light_1280.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idrugLab/ADCNet/40ea7ec87cc820893f400d90710f70e6bafc1209/files/Light_1280.pkl -------------------------------------------------------------------------------- /files/data.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idrugLab/ADCNet/40ea7ec87cc820893f400d90710f70e6bafc1209/files/data.xlsx -------------------------------------------------------------------------------- /files/x: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /inference.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import pandas as pd 4 | import matplotlib.pyplot as plt 5 | from dataset import Graph_Classification_Dataset,Inference_Dataset 6 | from sklearn.metrics import roc_auc_score,confusion_matrix,precision_recall_curve,auc 7 | from rdkit.Chem import Draw 8 | import os 9 | import tensorflow.keras as keras 10 | from model import PredictModel 11 | import torch 12 | import pickle 13 | from sklearn.preprocessing import StandardScaler 14 | import math 15 | 16 | def cover_dict(path): 17 | file_path = path 18 | with open(file_path, 'rb') as file: 19 | data = pickle.load(file) 20 | tensor_dict = {key: tf.constant(value) for key, value in data.items()} 21 | new_data = {i: value for i, (key, value) in enumerate(tensor_dict.items())} 22 | return new_data 23 | 24 | def score(y_test, y_pred): 25 | auc_roc_score = roc_auc_score(y_test, y_pred) 26 | prec, recall, _ = precision_recall_curve(y_test, y_pred) 27 | prauc = auc(recall, prec) 28 | y_pred_print = [round(y, 0) for y in y_pred] 29 | tn, fp, fn, tp = confusion_matrix(y_test, y_pred_print).ravel() 30 | se = tp / (tp + fn) 31 | sp = tn / (tn + fp) # 也是R 32 | acc = (tp + tn) / (tp + fn + tn + fp) 33 | mcc = (tp * tn - fn * fp) / math.sqrt((tp + fn) * (tp + fp) * (tn + fn) * (tn + fp)) 34 | P = tp / (tp + fp) 35 | F1 = (P * se * 2) / (P + se) 36 | BA = (se + sp) / 2 37 | PPV = tp / (tp + fp) 38 | NPV = tn / (fn + tn) 39 | return tp, tn, fn, fp, se, sp, mcc, acc, auc_roc_score, F1, BA, prauc, PPV, NPV 40 | 41 | def DAR_feature(file_path, column_name): 42 | df = pd.read_excel(file_path) 43 | column_data = df[column_name].values.reshape(-1, 1) 44 | mean_value = 3.86845977 45 | variance_value = 1.569108443 46 | std_deviation = variance_value**0.5 47 | column_data_standardized = (column_data - mean_value) / std_deviation 48 | normalized_data = 
(column_data_standardized - 0.8) / (12 - 0.8) 49 | data_dict = {index: tf.constant(value, dtype=tf.float32) for index, value in zip(df.index, normalized_data.flatten())} 50 | return data_dict 51 | 52 | Heavy_dict = cover_dict('Heavy.pkl') 53 | Light_dict = cover_dict('Light.pkl') 54 | Antigen_dict = cover_dict('Antigen.pkl') 55 | DAR_dict = DAR_feature('data.xlsx', 'DAR') 56 | 57 | medium = {'name':'Medium','num_layers': 6, 'num_heads': 8, 'd_model': 256,'path':'medium_weights','addH':True} 58 | arch = medium 59 | trained_epoch = 20 60 | num_layers = arch['num_layers'] 61 | num_heads = arch['num_heads'] 62 | d_model = arch['d_model'] 63 | addH = arch['addH'] 64 | dff = d_model * 2 65 | vocab_size = 18 66 | dense_dropout = 0.1 67 | seed = 1 68 | df = pd.read_excel('data.xlsx') 69 | np.random.seed(seed=seed) 70 | tf.random.set_seed(seed=seed) 71 | sml_list1 = df['Payload Isosmiles'].tolist() 72 | sml_list2 = df['Linker Isosmiles'].tolist() 73 | 74 | ans = [] 75 | y_preds = [] 76 | res = [] 77 | n = len(sml_list1) 78 | for i in range(n): 79 | x1 = [sml_list1[i]] 80 | x2 = [sml_list2[i]] 81 | t1 = Heavy_dict[i] 82 | t2 = Light_dict[i] 83 | t3 = Antigen_dict[i] 84 | t4 = DAR_dict[i].numpy() 85 | t1 = tf.expand_dims(t1, axis=0) 86 | t2 = tf.expand_dims(t2, axis=0) 87 | t3 = tf.expand_dims(t3, axis=0) 88 | t4 = tf.constant(t4, shape=(1, 1)) 89 | 90 | inference_dataset1 = Inference_Dataset(x1,addH=addH).get_data() 91 | inference_dataset2 = Inference_Dataset(x2,addH=addH).get_data() 92 | 93 | x1, adjoin_matrix1, smiles1 ,atom_list1 = next(iter(inference_dataset1.take(1))) 94 | x2, adjoin_matrix2, smiles2 ,atom_list2 = next(iter(inference_dataset2.take(1))) 95 | 96 | seq1 = tf.cast(tf.math.equal(x1, 0), tf.float32) 97 | seq2 = tf.cast(tf.math.equal(x2, 0), tf.float32) 98 | 99 | mask1 = seq1[:, tf.newaxis, tf.newaxis, :] 100 | mask2 = seq2[:, tf.newaxis, tf.newaxis, :] 101 | 102 | model = PredictModel(num_layers=num_layers, 103 | d_model=d_model, 104 | dff=dff, 105 | num_heads=num_heads, 106 | vocab_size=vocab_size, 107 | a=1, 108 | dense_dropout = dense_dropout) 109 | 110 | pred = model(x1=x1, mask1=mask1, training=False, adjoin_matrix1=adjoin_matrix1, x2=x2,mask2=mask2, adjoin_matrix2=adjoin_matrix2, t1=t1,t2=t2,t3=t3,t4=t4) 111 | model.load_weights('classification_weights/ADC_9.h5') 112 | 113 | x = model(x1=x1, mask1=mask1,training=False,adjoin_matrix1=adjoin_matrix1, x2=x2, mask2=mask2, adjoin_matrix2=adjoin_matrix2,t1=t1,t2=t2,t3=t3,t4=t4) 114 | y_preds.append(x) 115 | 116 | 117 | y_preds = tf.sigmoid(y_preds) 118 | y_preds = tf.reshape(y_preds,(-1,)) 119 | y_hat = tf.where(y_preds < 0.5, 0, 1) 120 | for i in y_preds.numpy(): 121 | ans.append(i) 122 | for i in y_hat.numpy(): 123 | res.append(i) 124 | print(ans) 125 | print(res) 126 | -------------------------------------------------------------------------------- /medium3_weights/Explanation: -------------------------------------------------------------------------------- 1 | This is a weight file for training the ADCNet model, originating from FG-BERT (https://github.com/idrugLab/FG-BERT). 2 | Using these weights obtained after fine-tuning on labeled ADC data, the model can be employed for predicting ADCs. 
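To make the role of this file concrete, here is a minimal, self-contained sketch (dummy one-molecule inputs, file names as in this repository) of the transfer step that class.py performs: the pretrained FG-BERT weights are loaded into a BertModel, the encoder weights are extracted, and they initialise the encoder of PredictModel before fine-tuning on ADC data.
```python
import tensorflow as tf
from model import BertModel, PredictModel

# Dummy one-molecule batch just to build the model variables.
x_dummy = tf.constant([[17, 2, 2, 4]], dtype=tf.int64)          # token indices < vocab_size (18)
adjoin_dummy = tf.zeros((1, 4, 4), dtype=tf.float32)            # (batch, atoms, atoms) additive mask
mask_dummy = tf.cast(tf.math.equal(x_dummy, 0), tf.float32)[:, tf.newaxis, tf.newaxis, :]

# Load the pretrained FG-BERT weights and export only the encoder part.
temp = BertModel(num_layers=6, d_model=256, dff=512, num_heads=8, vocab_size=18)
temp(x_dummy, adjoin_matrix=adjoin_dummy, mask=mask_dummy, training=False)   # build variables
temp.load_weights('medium3_weights/bert_weightsMedium_20.h5')
temp.encoder.save_weights('medium3_weights/bert_weights_encoderMedium_20.h5')

# Initialise the fine-tuning model's encoder from the exported weights.
t_ab = tf.zeros((1, 1280))    # placeholder heavy/light/antigen ESM-2 embeddings
t_dar = tf.zeros((1, 1))      # placeholder normalised DAR value
model = PredictModel(num_layers=6, d_model=256, dff=512, num_heads=8, vocab_size=18, a=1)
model(x1=x_dummy, mask1=mask_dummy, adjoin_matrix1=adjoin_dummy,
      x2=x_dummy, mask2=mask_dummy, adjoin_matrix2=adjoin_dummy,
      t1=t_ab, t2=t_ab, t3=t_ab, t4=t_dar, training=False)                   # build variables
model.encoder.load_weights('medium3_weights/bert_weights_encoderMedium_20.h5')
```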
3 | -------------------------------------------------------------------------------- /medium3_weights/bert_weightsMedium_20.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idrugLab/ADCNet/40ea7ec87cc820893f400d90710f70e6bafc1209/medium3_weights/bert_weightsMedium_20.h5 -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | from cProfile import label 2 | import tensorflow as tf 3 | import time 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | 7 | 8 | 9 | def gelu(x): 10 | return 0.5 * x * (1.0 + tf.math.erf(x / tf.sqrt(2.))) 11 | 12 | 13 | def scaled_dot_product_attention(q, k, v, mask,adjoin_matrix): 14 | 15 | matmul_qk = tf.matmul(q, k, transpose_b=True) 16 | 17 | 18 | dk = tf.cast(tf.shape(k)[-1], tf.float32) 19 | scaled_attention_logits = matmul_qk / tf.math.sqrt(dk) 20 | 21 | 22 | if mask is not None: 23 | scaled_attention_logits += (mask * -1e9) 24 | if adjoin_matrix is not None: 25 | scaled_attention_logits += adjoin_matrix 26 | 27 | attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1) 28 | 29 | output = tf.matmul(attention_weights, v) 30 | 31 | return output, attention_weights 32 | 33 | 34 | class MultiHeadAttention(tf.keras.layers.Layer): 35 | def __init__(self, d_model, num_heads): 36 | super(MultiHeadAttention, self).__init__() 37 | self.num_heads = num_heads 38 | self.d_model = d_model 39 | 40 | assert d_model % self.num_heads == 0 41 | 42 | self.depth = d_model // self.num_heads 43 | 44 | self.wq = tf.keras.layers.Dense(d_model) 45 | self.wk = tf.keras.layers.Dense(d_model) 46 | self.wv = tf.keras.layers.Dense(d_model) 47 | 48 | self.dense = tf.keras.layers.Dense(d_model) 49 | 50 | def split_heads(self, x, batch_size): 51 | """Split the last dimension into (num_heads, depth). 
52 | Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth) 53 | """ 54 | x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth)) 55 | return tf.transpose(x, perm=[0, 2, 1, 3]) 56 | 57 | def call(self, v, k, q, mask,adjoin_matrix): 58 | batch_size = tf.shape(q)[0] 59 | 60 | q = self.wq(q) 61 | k = self.wk(k) 62 | v = self.wv(v) 63 | 64 | q = self.split_heads(q, batch_size) 65 | k = self.split_heads(k, batch_size) 66 | v = self.split_heads(v, batch_size) 67 | 68 | scaled_attention, attention_weights = scaled_dot_product_attention( 69 | q, k, v, mask,adjoin_matrix) 70 | 71 | scaled_attention = tf.transpose(scaled_attention, 72 | perm=[0, 2, 1, 3]) 73 | 74 | concat_attention = tf.reshape(scaled_attention, 75 | (batch_size, -1, self.d_model)) 76 | 77 | output = self.dense(concat_attention) 78 | 79 | return output, attention_weights 80 | 81 | def point_wise_feed_forward_network(d_model, dff): 82 | return tf.keras.Sequential([ 83 | tf.keras.layers.Dense(dff, activation=gelu), 84 | tf.keras.layers.Dense(d_model) 85 | ]) 86 | 87 | 88 | class EncoderLayer(tf.keras.layers.Layer): 89 | def __init__(self, d_model, num_heads, dff, rate=0.1): 90 | super(EncoderLayer, self).__init__() 91 | 92 | self.mha = MultiHeadAttention(d_model, num_heads) 93 | self.ffn = point_wise_feed_forward_network(d_model, dff) 94 | 95 | self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6) 96 | self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6) 97 | 98 | self.dropout1 = tf.keras.layers.Dropout(rate) 99 | self.dropout2 = tf.keras.layers.Dropout(rate) 100 | 101 | def call(self, x, training, mask,adjoin_matrix): 102 | attn_output, attention_weights = self.mha(x, x, x, mask,adjoin_matrix) 103 | attn_output = self.dropout1(attn_output, training=training) 104 | out1 = self.layernorm1(x + attn_output) 105 | 106 | ffn_output = self.ffn(out1) 107 | ffn_output = self.dropout2(ffn_output, training=training) 108 | out2 = self.layernorm2(out1 + ffn_output) 109 | 110 | return out2,attention_weights 111 | 112 | 113 | 114 | class Encoder(tf.keras.Model): 115 | def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, 116 | maximum_position_encoding, rate=0.1): 117 | super(Encoder, self).__init__() 118 | 119 | self.d_model = d_model 120 | self.num_layers = num_layers 121 | 122 | self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model) 123 | 124 | self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) 125 | for _ in range(num_layers)] 126 | 127 | self.dropout = tf.keras.layers.Dropout(rate) 128 | 129 | def call(self, x, training, mask,adjoin_matrix): 130 | seq_len = tf.shape(x)[1] 131 | adjoin_matrix = adjoin_matrix[:,tf.newaxis,:,:] 132 | x = self.embedding(x) 133 | x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32)) 134 | 135 | x = self.dropout(x, training=training) 136 | 137 | for i in range(self.num_layers): 138 | x,attention_weights = self.enc_layers[i](x, training, mask,adjoin_matrix) 139 | return x 140 | 141 | class Encoder_test(tf.keras.Model): 142 | def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, 143 | maximum_position_encoding, rate=0.1): 144 | super(Encoder_test, self).__init__() 145 | 146 | self.d_model = d_model 147 | self.num_layers = num_layers 148 | 149 | self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model) 150 | 151 | self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) 152 | for _ in range(num_layers)] 153 | 154 | self.dropout = tf.keras.layers.Dropout(rate) 155 | 156 | 
def call(self, x, training, mask,adjoin_matrix): 157 | seq_len = tf.shape(x)[1] 158 | adjoin_matrix = adjoin_matrix[:,tf.newaxis,:,:] 159 | x = self.embedding(x) 160 | x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32)) 161 | 162 | x = self.dropout(x, training=training) 163 | attention_weights_list = [] 164 | xs = [] 165 | 166 | for i in range(self.num_layers): 167 | x,attention_weights = self.enc_layers[i](x, training, mask,adjoin_matrix) 168 | attention_weights_list.append(attention_weights) 169 | xs.append(x) 170 | 171 | return x,attention_weights_list,xs 172 | 173 | class BertModel_test(tf.keras.Model): 174 | def __init__(self,num_layers = 6,d_model = 256,dff = 512,num_heads = 8,vocab_size = 18,dropout_rate = 0.1): 175 | super(BertModel_test, self).__init__() 176 | self.encoder = Encoder_test(num_layers=num_layers,d_model=d_model, 177 | num_heads=num_heads,dff=dff,input_vocab_size=vocab_size,maximum_position_encoding=200,rate=dropout_rate) 178 | self.fc1 = tf.keras.layers.Dense(d_model, activation=gelu) 179 | self.layernorm = tf.keras.layers.LayerNormalization(-1) 180 | self.fc2 = tf.keras.layers.Dense(vocab_size) 181 | def call(self,x,adjoin_matrix,mask,training=False): 182 | x,att,xs = self.encoder(x,training=training,mask=mask,adjoin_matrix=adjoin_matrix) 183 | x = self.fc1(x) 184 | x = self.layernorm(x) 185 | x = self.fc2(x) 186 | return x,att,xs 187 | 188 | 189 | class BertModel(tf.keras.Model): 190 | def __init__(self,num_layers = 6,d_model = 256,dff = 512,num_heads = 8,vocab_size = 18,dropout_rate = 0.1): 191 | super(BertModel, self).__init__() 192 | self.encoder = Encoder(num_layers=num_layers,d_model=d_model, 193 | num_heads=num_heads,dff=dff,input_vocab_size=vocab_size,maximum_position_encoding=200,rate=dropout_rate) 194 | self.fc1 = tf.keras.layers.Dense(d_model, activation=gelu) 195 | self.layernorm = tf.keras.layers.LayerNormalization(-1) 196 | self.fc2 = tf.keras.layers.Dense(vocab_size) 197 | 198 | def call(self,x,adjoin_matrix,mask,training=False): 199 | x = self.encoder(x,training=training,mask=mask,adjoin_matrix=adjoin_matrix) 200 | x = self.fc1(x) 201 | x = self.layernorm(x) 202 | x = self.fc2(x) 203 | return x 204 | 205 | 206 | class PredictModel(tf.keras.Model): 207 | def __init__(self,num_layers = 6,d_model = 256,dff = 512,num_heads = 8,vocab_size =18, a=2, dropout_rate = 0.1,dense_dropout=0.1): 208 | super(PredictModel, self).__init__() 209 | self.encoder = Encoder(num_layers=num_layers,d_model=d_model, 210 | num_heads=num_heads,dff=dff,input_vocab_size=vocab_size,maximum_position_encoding=200,rate=dropout_rate) 211 | self.fc1 = tf.keras.layers.Dense(256,activation=tf.keras.layers.LeakyReLU(0.1)) 212 | self.dropout1 = tf.keras.layers.Dropout(dense_dropout) 213 | self.fc2 = tf.keras.layers.Dense(a) 214 | 215 | def call(self,x1,adjoin_matrix1,mask1,x2,adjoin_matrix2,mask2,t1,t2,t3,t4,training=False): 216 | x1 = self.encoder(x1,training=training,mask=mask1,adjoin_matrix=adjoin_matrix1) 217 | x1 = x1[:,0,:] 218 | x2 = self.encoder(x2,training=False,mask=mask2,adjoin_matrix=adjoin_matrix2) 219 | x2 = x2[:,0,:] 220 | x = tf.concat([x1,x2], axis=1) 221 | x = tf.concat([x, t1], axis=1) 222 | x = tf.concat([x, t2], axis=1) 223 | x = tf.concat([x, t3], axis=1) 224 | x = tf.concat([x, t4], axis=1) 225 | x = self.fc1(x) 226 | x = self.dropout1(x,training=training) 227 | x = self.fc2(x) 228 | return x 229 | -------------------------------------------------------------------------------- /py37.yaml: 
-------------------------------------------------------------------------------- 1 | name: ADCNet 2 | channels: 3 | - rdkit 4 | - openbabel 5 | - anaconda 6 | - defaults 7 | dependencies: 8 | - _libgcc_mutex=0.1=main 9 | - _openmp_mutex=5.1=1_gnu 10 | - blas=1.0=openblas 11 | - bottleneck=1.3.5=py37h7deecbd_0 12 | - brotli=1.0.9=h5eee18b_7 13 | - brotli-bin=1.0.9=h5eee18b_7 14 | - bzip2=1.0.8=h7b6447c_0 15 | - ca-certificates=2023.08.22=h06a4308_0 16 | - cairo=1.14.12=h8948797_3 17 | - certifi=2022.12.7=py37h06a4308_0 18 | - cudatoolkit=10.1.243=h6bb024c_0 19 | - cudnn=7.6.5=cuda10.1_0 20 | - cycler=0.11.0=pyhd3eb1b0_0 21 | - daal4py=2021.5.0=py37h78b71dc_0 22 | - dal=2021.5.1=h06a4308_803 23 | - dbus=1.13.18=hb2f20db_0 24 | - expat=2.4.4=h295c915_0 25 | - fontconfig=2.13.1=h6c09931_0 26 | - freetype=2.11.0=h70c0345_0 27 | - giflib=5.2.1=h7b6447c_0 28 | - glib=2.69.1=h4ff587b_1 29 | - gst-plugins-base=1.14.0=hbbd80ab_1 30 | - gstreamer=1.14.1=h5eee18b_1 31 | - icu=58.2=he6710b0_3 32 | - intel-openmp=2021.4.0=h06a4308_3561 33 | - joblib=1.1.0=pyhd3eb1b0_0 34 | - jpeg=9e=h7f8727e_0 35 | - lcms2=2.12=h3be6417_0 36 | - ld_impl_linux-64=2.38=h1181459_1 37 | - libboost=1.67.0=h46d08c1_4 38 | - libbrotlicommon=1.0.9=h5eee18b_7 39 | - libbrotlidec=1.0.9=h5eee18b_7 40 | - libbrotlienc=1.0.9=h5eee18b_7 41 | - libedit=3.1.20210910=h7f8727e_0 42 | - libffi=3.3=he6710b0_2 43 | - libgcc-ng=11.2.0=h1234567_1 44 | - libgfortran-ng=7.5.0=ha8ba4b0_17 45 | - libgfortran4=7.5.0=ha8ba4b0_17 46 | - libgomp=11.2.0=h1234567_1 47 | - libopenblas=0.3.18=hf726d26_0 48 | - libpng=1.6.37=hbc83047_0 49 | - libstdcxx-ng=11.2.0=h1234567_1 50 | - libtiff=4.2.0=h2818925_1 51 | - libuuid=1.0.3=h7f8727e_2 52 | - libwebp=1.2.2=h55f646e_0 53 | - libwebp-base=1.2.2=h7f8727e_0 54 | - libxcb=1.15=h7f8727e_0 55 | - libxml2=2.9.14=h74e7548_0 56 | - lz4-c=1.9.3=h295c915_1 57 | - matplotlib-base=3.4.3=py37hbbc1b5f_0 58 | - mkl=2021.4.0=h06a4308_640 59 | - mkl-service=2.4.0=py37h7f8727e_0 60 | - mpi=1.0=mpich 61 | - mpich=3.3.2=hc856adb_0 62 | - munkres=1.1.4=py_0 63 | - ncurses=6.3=h5eee18b_3 64 | - numpy-base=1.18.5=py37h2f8d375_0 65 | - openbabel=2.4.1=py37_6 66 | - openssl=1.1.1w=h7f8727e_0 67 | - packaging=21.3=pyhd3eb1b0_0 68 | - pcre=8.45=h295c915_0 69 | - pillow=9.2.0=py37hace64e9_1 70 | - pip=22.1.2=py37h06a4308_0 71 | - pixman=0.40.0=h7f8727e_1 72 | - py-boost=1.67.0=py37h04863e7_4 73 | - pyparsing=3.0.4=pyhd3eb1b0_0 74 | - pyqt=5.9.2=py37h05f1152_2 75 | - python=3.7.13=h12debd9_0 76 | - python-dateutil=2.8.2=pyhd3eb1b0_0 77 | - pytz=2022.1=py37h06a4308_0 78 | - qt=5.9.7=h5867ecd_1 79 | - rdkit=2020.03.2.0=py37hc20afe1_1 80 | - readline=8.2=h5eee18b_0 81 | - scikit-learn=1.0.2=py37h51133e4_1 82 | - scikit-learn-intelex=2021.5.0=py37h06a4308_0 83 | - setuptools=61.2.0=py37h06a4308_0 84 | - sip=4.19.8=py37hf484d3e_0 85 | - six=1.16.0=pyhd3eb1b0_1 86 | - sqlite=3.39.3=h5082296_0 87 | - tbb=2021.5.0=hd09550d_0 88 | - threadpoolctl=2.2.0=pyh0d69192_0 89 | - tk=8.6.12=h1ccaba5_0 90 | - tornado=6.1=py37h27cfd23_0 91 | - typing_extensions=4.3.0=py37h06a4308_0 92 | - wheel=0.37.1=pyhd3eb1b0_0 93 | - xz=5.2.5=h7f8727e_1 94 | - zlib=1.2.12=h7f8727e_2 95 | - zstd=1.5.2=ha4553b6_0 96 | - pip: 97 | - absl-py==1.2.0 98 | - astunparse==1.6.3 99 | - cachetools==5.2.0 100 | - charset-normalizer==2.1.0 101 | - cloudpickle==2.1.0 102 | - et-xmlfile==1.1.0 103 | - fonttools==4.34.4 104 | - future==0.18.2 105 | - gast==0.3.3 106 | - google-auth==2.10.0 107 | - google-auth-oauthlib==0.4.6 108 | - google-pasta==0.2.0 109 | - grpcio==1.47.0 110 | - 
h5py==2.10.0 111 | - hyperopt==0.2.7 112 | - idna==3.3 113 | - importlib-metadata==4.12.0 114 | - keras-preprocessing==1.1.2 115 | - kiwisolver==1.4.4 116 | - markdown==3.4.1 117 | - markupsafe==2.1.1 118 | - matplotlib==3.5.2 119 | - networkx==2.6.3 120 | - numpy==1.18.5 121 | - nvidia-cublas-cu11==11.10.3.66 122 | - nvidia-cuda-nvrtc-cu11==11.7.99 123 | - nvidia-cuda-runtime-cu11==11.7.99 124 | - nvidia-cudnn-cu11==8.5.0.96 125 | - oauthlib==3.2.0 126 | - openpyxl==3.1.2 127 | - opt-einsum==3.3.0 128 | - pandas==1.3.5 129 | - protobuf==3.19.4 130 | - py4j==0.10.9.7 131 | - pyasn1==0.4.8 132 | - pyasn1-modules==0.2.8 133 | - requests==2.28.1 134 | - requests-oauthlib==1.3.1 135 | - rsa==4.9 136 | - scipy==1.4.1 137 | - tensorboard==2.9.1 138 | - tensorboard-data-server==0.6.1 139 | - tensorboard-plugin-wit==1.8.1 140 | - tensorflow==2.3.0 141 | - tensorflow-estimator==2.3.0 142 | - tensorflow-gpu==2.3.0 143 | - termcolor==1.1.0 144 | - torch==1.13.1 145 | - tqdm==4.64.0 146 | - urllib3==1.26.11 147 | - werkzeug==2.2.1 148 | - wrapt==1.14.1 149 | - zipp==3.8.1 150 | prefix: /share/home/yyzh/anaconda3/envs/ADCNet 151 | -------------------------------------------------------------------------------- /t_data.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idrugLab/ADCNet/40ea7ec87cc820893f400d90710f70e6bafc1209/t_data.xlsx -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | from rdkit import Chem 2 | from rdkit.Chem import rdmolfiles, rdmolops 3 | import numpy as np 4 | import openbabel as ob 5 | import os 6 | import csv 7 | from rdkit import RDConfig 8 | from rdkit.Chem import FragmentCatalog 9 | 10 | def fg_list(): # 47 FGs list 11 | fName=os.path.join(RDConfig.RDDataDir,'FunctionalGroups.txt') 12 | fparams = FragmentCatalog.FragCatParams(1,6,fName) 13 | fg_list = [] 14 | for i in range(fparams.GetNumFuncGroups()): 15 | fg_list.append(fparams.GetFuncGroup(i)) 16 | fg_list.pop(27) 17 | 18 | x = [Chem.MolToSmiles(_) for _ in fg_list]+['*C=C','*F','*Cl','*Br','*I','[Na+]','*P','*P=O','*[Se]','*[Si]'] 19 | y = set(x) 20 | return list(y) 21 | 22 | def obsmitosmile(smi): 23 | conv = ob.OBConversion() 24 | conv.SetInAndOutFormats("smi", "can") 25 | conv.SetOptions("K", conv.OUTOPTIONS) 26 | mol = ob.OBMol() 27 | conv.ReadString(mol, smi) 28 | smile = conv.WriteString(mol) 29 | smile = smile.replace('\t\n', '') 30 | return smile 31 | 32 | def molecular_fg(smiles): # Getting functional groups (including rings) in molecules 33 | 34 | mol = Chem.MolFromSmiles(smiles) 35 | if mol is None: 36 | print('error') 37 | mol = Chem.MolFromSmiles(obsmitosmile(smiles)) 38 | assert mol is not None, smiles + ' is not valid ' 39 | a = fg_list() 40 | 41 | 42 | ssr = Chem.GetSymmSSSR(mol) 43 | num_ring = len(ssr) 44 | ring_dict = {} 45 | for i in range(num_ring): 46 | ring_dict[i+1] = list(ssr[i]) 47 | 48 | f_g_list = [] 49 | for i in ring_dict.values(): 50 | f_g_list.append(i) 51 | 52 | for i in a: 53 | patt = Chem.MolFromSmarts(i) 54 | flag = mol.HasSubstructMatch(patt) 55 | if flag: 56 | atomids = mol.GetSubstructMatches(patt) 57 | for atomid in atomids: 58 | f_g_list.append(list(atomid)) 59 | return f_g_list 60 | 61 | 62 | def smiles2adjoin(smiles,explicit_hydrogens=True,canonical_atom_order=False): # Converting molecules in SMILES format to atom lists and adjacency matrices 63 | 64 | mol = Chem.MolFromSmiles(smiles) 
65 | if mol is None: 66 | print('error') 67 | mol = Chem.MolFromSmiles(obsmitosmile(smiles)) 68 | assert mol is not None, smiles + ' is not valid ' 69 | 70 | if explicit_hydrogens: 71 | mol = Chem.AddHs(mol) 72 | else: 73 | mol = Chem.RemoveHs(mol) 74 | 75 | if canonical_atom_order: 76 | new_order = rdmolfiles.CanonicalRankAtoms(mol) 77 | mol = rdmolops.RenumberAtoms(mol, new_order) 78 | num_atoms = mol.GetNumAtoms() 79 | 80 | atoms_list = [] 81 | for i in range(num_atoms): 82 | atom = mol.GetAtomWithIdx(i) 83 | atoms_list.append(atom.GetSymbol()) 84 | 85 | 86 | adjoin_matrix = np.eye(num_atoms) 87 | num_bonds = mol.GetNumBonds() 88 | 89 | for i in range(num_bonds): 90 | bond = mol.GetBondWithIdx(i) 91 | u = bond.GetBeginAtomIdx() 92 | v = bond.GetEndAtomIdx() 93 | adjoin_matrix[u,v] = 1.0 94 | adjoin_matrix[v,u] = 1.0 95 | 96 | return atoms_list,adjoin_matrix 97 | 98 | 99 | def get_header(path): 100 | with open(path) as f: 101 | header = next(csv.reader(f)) 102 | 103 | return header 104 | 105 | 106 | def get_task_names(path, use_compound_names=False): 107 | index = 2 if use_compound_names else 1 108 | task_names = get_header(path)[index:] 109 | 110 | return task_names --------------------------------------------------------------------------------
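To illustrate the helpers above, a small usage sketch follows (run from the repository root inside the ADCNet environment from the README, since utils.py imports OpenBabel at module load):
```python
from utils import smiles2adjoin, molecular_fg

# Atom list and adjacency matrix for ethanol, with explicit hydrogens.
atoms, adj = smiles2adjoin('CCO', explicit_hydrogens=True)
print(atoms)        # ['C', 'C', 'O', 'H', 'H', 'H', 'H', 'H', 'H']
print(adj.shape)    # (9, 9) adjacency matrix with self-loops on the diagonal

# Functional groups (including rings) of phenol, as lists of atom indices.
print(molecular_fg('c1ccccc1O'))
```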