├── MIMOSA.png
├── result
    ├── jnk.pkl
    ├── qed.pkl
    ├── jnkgsk.pkl
    ├── qed_f_t.txt
    ├── jnk_f_t.txt
    └── jnkgsk_f_t.txt
├── src
    ├── __pycache__
    │   ├── dpp.cpython-37.pyc
    │   ├── module.cpython-37.pyc
    │   ├── utils.cpython-37.pyc
    │   ├── chemutils.cpython-37.pyc
    │   ├── gnn_layer.cpython-37.pyc
    │   └── inference_utils.cpython-37.pyc
    ├── utils.py
    ├── download.py
    ├── clean.py
    ├── dpp.py
    ├── vocabulary.py
    ├── train.py
    ├── evaluate.py
    ├── gnn_layer.py
    ├── module.py
    ├── run.py
    ├── inference_utils.py
    └── chemutils.py
├── save_model
    └── GNN_epoch_0_validloss_1.61160.ckpt
├── data
    ├── vocabulary.txt
    └── substructure.txt
├── conda.yml
├── mimosa.yml
└── README.md


/MIMOSA.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/futianfan/MIMOSA/HEAD/MIMOSA.png


--------------------------------------------------------------------------------
/result/jnk.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/futianfan/MIMOSA/HEAD/result/jnk.pkl


--------------------------------------------------------------------------------
/result/qed.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/futianfan/MIMOSA/HEAD/result/qed.pkl


--------------------------------------------------------------------------------
/result/jnkgsk.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/futianfan/MIMOSA/HEAD/result/jnkgsk.pkl


--------------------------------------------------------------------------------
/src/__pycache__/dpp.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/futianfan/MIMOSA/HEAD/src/__pycache__/dpp.cpython-37.pyc


--------------------------------------------------------------------------------
/src/__pycache__/module.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/futianfan/MIMOSA/HEAD/src/__pycache__/module.cpython-37.pyc


--------------------------------------------------------------------------------
/src/__pycache__/utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/futianfan/MIMOSA/HEAD/src/__pycache__/utils.cpython-37.pyc


--------------------------------------------------------------------------------
/src/__pycache__/chemutils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/futianfan/MIMOSA/HEAD/src/__pycache__/chemutils.cpython-37.pyc


--------------------------------------------------------------------------------
/src/__pycache__/gnn_layer.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/futianfan/MIMOSA/HEAD/src/__pycache__/gnn_layer.cpython-37.pyc


--------------------------------------------------------------------------------
/save_model/GNN_epoch_0_validloss_1.61160.ckpt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/futianfan/MIMOSA/HEAD/save_model/GNN_epoch_0_validloss_1.61160.ckpt


--------------------------------------------------------------------------------
/src/__pycache__/inference_utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/futianfan/MIMOSA/HEAD/src/__pycache__/inference_utils.cpython-37.pyc


--------------------------------------------------------------------------------
/result/qed_f_t.txt:
--------------------------------------------------------------------------------
1 | 0.495	0.0
2 | 0.514	0.025
3 | 0.612	0.064
4 | 0.731	0.065
5 | 0.815	0.041
6 | 0.856	0.033
7 | 0.889	0.022
8 | 0.912	0.018
9 | 0.921	0.019


--------------------------------------------------------------------------------
/result/jnk_f_t.txt:
--------------------------------------------------------------------------------
 1 | 0.0	0.0
 2 | 0.008	0.011
 3 | 0.021	0.023
 4 | 0.044	0.034
 5 | 0.060	0.035
 6 | 0.070	0.042
 7 | 0.085	0.051
 8 | 0.101	0.057
 9 | 0.115	0.057
10 | 0.122	0.059
11 | 0.136	0.047


--------------------------------------------------------------------------------
/result/jnkgsk_f_t.txt:
--------------------------------------------------------------------------------
 1 | 0.015	0.0
 2 | 0.022	0.010
 3 | 0.038	0.031
 4 | 0.065	0.040
 5 | 0.086	0.043
 6 | 0.103	0.039
 7 | 0.131	0.039
 8 | 0.144	0.046
 9 | 0.151	0.037
10 | 0.170	0.038
11 | 0.182	0.036


--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
 1 | import torch 
 2 | 
 3 | 
 4 | 
 5 | class Molecule_Dataset(torch.utils.data.Dataset):
 6 | 	def __init__(self, smiles_lst):
 7 | 		self.smiles_lst = smiles_lst
 8 | 
 9 | 	def __len__(self):
10 | 		return len(self.smiles_lst)
11 | 
12 | 	def __getitem__(self, idx):
13 | 		return self.smiles_lst[idx]
14 | 
15 | 
16 | 
17 | 
18 | 


--------------------------------------------------------------------------------
/src/download.py:
--------------------------------------------------------------------------------
# Construct the PyTDC 'ZINC' molecule-generation dataset object.
# NOTE(review): presumably constructing MolGen downloads/caches the raw data
# as a side effect -- confirm against the PyTDC documentation.
from tdc.generation import MolGen
data = MolGen(name = 'ZINC')
# Disabled: export a random subset of the SMILES to a text file.
# `data_size` is not defined anywhere in this script, so it has to be set
# before this code can be re-enabled.
# from random import shuffle 
# smiles_lst = data.get_data()['smiles'].to_list()
# shuffle(smiles_lst)
# smiles_lst = smiles_lst[:data_size]
# with open("data/zinc_" + str(data_size) + ".txt", 'w') as fout:
# 	for smiles in smiles_lst:
# 		fout.write(smiles + '\n')


# Usage (run from the repository root):
"""
python src/download.py
"""
15 | 
16 | 
17 | 


--------------------------------------------------------------------------------
/data/vocabulary.txt:
--------------------------------------------------------------------------------
 1 | C	1158545
 2 | O	500212
 3 | N	280451
 4 | C1=CC=CC=C1	257945
 5 | F	79430
 6 | S	51103
 7 | Cl	42872
 8 | C1=CC=NC=C1	27852
 9 | C1CCCCC1	20256
10 | C1=CNN=C1	18920
11 | C1=CSC=C1	17515
12 | C1CCNCC1	15912
13 | C1CC1	15462
14 | C1CCCC1	14328
15 | Br	12722
16 | C1=CSC=N1	12617
17 | C1COCCN1	11924
18 | C1CNCCN1	11701
19 | C1=COC=C1	11274
20 | C1CCCN1	9739
21 | C1=CN=CN=C1	7964
22 | C1CC[NH+]CC1	7948
23 | C1CCNC1	7634
24 | C1CCCNC1	7277
25 | C1=CCCC=C1	6243
26 | C1=NN=CN1	5748
27 | C1CNCC1	5513
28 | C1CCOC1	5310
29 | C1=CNC=N1	5201
30 | C1=NOC=N1	5141
31 | 


--------------------------------------------------------------------------------
/src/clean.py:
--------------------------------------------------------------------------------
 1 | from tqdm import tqdm 
 2 | import os
 3 | # from chemutils import vocabulary, smiles2word 
 4 | from chemutils import is_valid, logp_modifier 
 5 | smiles_database = "data/zinc.tab"
 6 | clean_smiles_database = "data/zinc_clean.txt"
 7 | 
 8 | 
 9 | with open(smiles_database, 'r') as fin:
10 | 	lines = fin.readlines()[1:]
11 | smiles_lst = [i.strip().strip('"') for i in lines]
12 | 
13 | clean_smiles_lst = []
14 | for smiles in tqdm(smiles_lst):
15 | 	if is_valid(smiles):
16 | 		clean_smiles_lst.append(smiles)
17 | clean_smiles_set = set(clean_smiles_lst)
18 | with open(clean_smiles_database, 'w') as fout:
19 | 	for smiles in clean_smiles_set:
20 | 		fout.write(smiles + '\n')
21 | 
22 | 
23 | 


--------------------------------------------------------------------------------
/src/dpp.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import math
 3 | np.random.seed(1)
 4 | 
 5 | class DPPModel(object):
 6 |     def __init__(self, smiles_lst, sim_matrix, f_scores, top_k, lamb):
 7 |         self.smiles_lst = smiles_lst 
 8 |         self.sim_matrix = sim_matrix # (n,n)
 9 |         self.lamb = lamb
10 |         self.f_scores = np.exp(f_scores) * self.lamb # (n,) 
11 |         self.max_iter = top_k 
12 |         self.n = len(smiles_lst)
13 |         self.kernel_matrix = self.f_scores.reshape((self.n, 1)) \
14 |                              * sim_matrix * self.f_scores.reshape((1, self.n))
15 |         self.log_det_V = np.sum(f_scores) * self.lamb 
16 |         self.log_det_S = np.log(np.linalg.det(np.mat(self.kernel_matrix)))
17 | 
18 |     def dpp(self): 
19 |         c = np.zeros((self.max_iter, self.n))
20 |         d = np.copy(np.diag(self.kernel_matrix))  ### diagonal
21 |         j = np.argmax(d)
22 |         Yg = [j]
23 |         _iter = 0
24 |         Z = list(range(self.n))
25 |         while len(Yg) < self.max_iter:
26 |             Z_Y = set(Z).difference(set(Yg))
27 |             for i in Z_Y:
28 |                 if _iter == 0:
29 |                     ei = self.kernel_matrix[j, i] / np.sqrt(d[j])
30 |                 else:
31 |                     ei = (self.kernel_matrix[j, i] - np.dot(c[:_iter, j], c[:_iter, i])) / np.sqrt(d[j])
32 |                 c[_iter, i] = ei
33 |                 d[i] = d[i] - ei * ei
34 |             d[j] = 0
35 |             j = np.argmax(d)
36 |             Yg.append(j)
37 |             _iter += 1
38 | 
39 |         return [self.smiles_lst[i] for i in Yg], self.log_det_V, self.log_det_S 
40 | 
41 | 
42 | 
if __name__ == "__main__":
    # Smoke test on random data: 100 items with unit-norm 5-d embeddings.
    # The similarity matrix is a 100x100 Gram matrix of 5-d vectors, so its
    # rank is at most 5.
    rank_score = np.random.random(size=(100))
    item_embedding = np.random.randn(100, 5)
    item_embedding = item_embedding / np.linalg.norm(item_embedding, axis=1, keepdims=True)
    sim_matrix = np.dot(item_embedding, item_embedding.T)

    # `lamb` is a required constructor argument; omitting it raised TypeError.
    # 1.0 leaves exp(f_scores) unscaled.
    model = DPPModel(smiles_lst=list(range(100)), sim_matrix=sim_matrix,
                     f_scores=rank_score, top_k=10, lamb=1.0)
    Yg = model.dpp()
    print(Yg)
52 | 
53 | 
54 | 
55 | 
56 | 


--------------------------------------------------------------------------------
/src/vocabulary.py:
--------------------------------------------------------------------------------
 1 | # from chemutils import smiles2word
 2 | 
 3 | import os
 4 | from collections import defaultdict 
 5 | from tqdm import tqdm 
 6 | from rdkit import Chem, DataStructs
 7 | 
 8 | 
 9 | def smiles2mol(smiles):
10 |     mol = Chem.MolFromSmiles(smiles)
11 |     if mol is None: 
12 |         return None
13 |     Chem.Kekulize(mol)
14 |     return mol 
15 | 
## input: smiles, output: word lst;
def smiles2word(smiles):
    """Split a molecule into substructure "words".

    The result is the kekulized fragment SMILES of every ring reported by
    ``Chem.GetSymmSSSR``, followed by the element symbol of every atom that
    is not in a ring. Returns ``None`` when ``smiles`` cannot be parsed.
    """
    mol = smiles2mol(smiles)
    if mol is None:
        return None

    ring_words = [
        Chem.MolFragmentToSmiles(mol, list(ring), kekuleSmiles=True)
        for ring in Chem.GetSymmSSSR(mol)
    ]
    chain_words = [atom.GetSymbol() for atom in mol.GetAtoms() if not atom.IsInRing()]
    return ring_words + chain_words
30 | 
31 | 
32 | 
all_vocabulary_file = "data/substructure.txt"   # cache: every substructure with its count
rawdata_file = "data/zinc.tab"                  # raw SMILES table (first row is a header)
select_vocabulary_file = "data/vocabulary.txt"  # output: frequent substructures only

if not os.path.exists(all_vocabulary_file):
    # First run: count substructures over the raw database and cache the counts.
    with open(rawdata_file) as fin:
        lines = fin.readlines()[1:]
    smiles_lst = [line.strip().strip('"') for line in lines]
    word2cnt = defaultdict(int)
    for smiles in tqdm(smiles_lst):
        word_lst = smiles2word(smiles)
        if word_lst is None:
            # smiles2word returns None for SMILES that RDKit cannot parse;
            # iterating over None would abort the whole run.
            continue
        for word in word_lst:
            word2cnt[word] += 1
    # Most frequent first; ties keep first-seen order (stable sort).
    word_cnt_lst = sorted(word2cnt.items(), key=lambda x: x[1], reverse=True)

    with open(all_vocabulary_file, 'w') as fout:
        for word, cnt in word_cnt_lst:
            fout.write(word + '\t' + str(cnt) + '\n')
else:
    # Later runs: reuse the cached "<word>\t<count>" table.
    word_cnt_lst = []
    with open(all_vocabulary_file, 'r') as fin:
        for line in fin:
            if not line.strip():
                continue  # tolerate blank lines in the cache file
            word, cnt = line.split('\t')[:2]
            word_cnt_lst.append((word, int(cnt)))


# Keep only substructures seen more than 5000 times.
word_cnt_lst = [(word, cnt) for word, cnt in word_cnt_lst if cnt > 5000]
print(len(word_cnt_lst))

with open(select_vocabulary_file, 'w') as fout:
    for word, cnt in word_cnt_lst:
        fout.write(word + '\t' + str(cnt) + '\n')
64 | 
65 | 
66 | 
67 | 


--------------------------------------------------------------------------------
/conda.yml:
--------------------------------------------------------------------------------
 1 | name: mimosa
 2 | channels:
 3 |   - rdkit
 4 |   - soumith
 5 |   - defaults
 6 | dependencies:
 7 |   - _libgcc_mutex=0.1=main
 8 |   - blas=1.0=mkl
 9 |   - bzip2=1.0.8=h7b6447c_0
10 |   - ca-certificates=2021.1.19=h06a4308_0
11 |   - cairo=1.14.12=h8948797_3
12 |   - certifi=2020.12.5=py37h06a4308_0
13 |   - fontconfig=2.13.0=h9420a91_0
14 |   - freetype=2.10.4=h5ab3b9f_0
15 |   - glib=2.66.1=h92f7085_0
16 |   - icu=58.2=he6710b0_3
17 |   - intel-openmp=2020.2=254
18 |   - jpeg=9b=h024ee3a_2
19 |   - lcms2=2.11=h396b838_0
20 |   - ld_impl_linux-64=2.33.1=h53a641e_7
21 |   - libboost=1.73.0=h3ff78a5_11
22 |   - libedit=3.1.20191231=h14c3975_1
23 |   - libffi=3.3=he6710b0_2
24 |   - libgcc-ng=9.1.0=hdf63c60_0
25 |   - libpng=1.6.37=hbc83047_0
26 |   - libstdcxx-ng=9.1.0=hdf63c60_0
27 |   - libtiff=4.1.0=h2733197_1
28 |   - libuuid=1.0.3=h1bed415_2
29 |   - libxcb=1.14=h7b6447c_0
30 |   - libxml2=2.9.10=hb55368b_3
31 |   - lz4-c=1.9.3=h2531618_0
32 |   - mkl=2020.2=256
33 |   - mkl-service=2.3.0=py37he8ac12f_0
34 |   - mkl_fft=1.2.0=py37h23d657b_0
35 |   - mkl_random=1.1.1=py37h0573a6f_0
36 |   - ncurses=6.2=he6710b0_1
37 |   - numpy=1.19.2=py37h54aff64_0
38 |   - numpy-base=1.19.2=py37hfa32c7d_0
39 |   - olefile=0.46=py_0
40 |   - openssl=1.1.1i=h27cfd23_0
41 |   - pandas=1.2.1=py37ha9443f7_0
42 |   - pcre=8.44=he6710b0_0
43 |   - pillow=8.1.0=py37he98fc37_0
44 |   - pip=20.3.3=py37h06a4308_0
45 |   - pixman=0.40.0=h7b6447c_0
46 |   - py-boost=1.73.0=py37ha9443f7_11
47 |   - python=3.7.9=h7579374_0
48 |   - python-dateutil=2.8.1=py_0
49 |   - pytz=2020.5=pyhd3eb1b0_0
50 |   - rdkit=2020.09.1.0=py37hd50e099_1
51 |   - readline=8.0=h7b6447c_0
52 |   - setuptools=52.0.0=py37h06a4308_0
53 |   - six=1.15.0=pyhd3eb1b0_0
54 |   - sqlite=3.33.0=h62c20be_0
55 |   - tk=8.6.10=hbc83047_0
56 |   - wheel=0.36.2=pyhd3eb1b0_0
57 |   - xz=5.2.5=h7b6447c_0
58 |   - zlib=1.2.11=h7b6447c_3
59 |   - zstd=1.4.5=h9ceee32_0
60 |   - pip:
61 |     - chardet==4.0.0
62 |     - cycler==0.10.0
63 |     - decorator==4.4.2
64 |     - fuzzywuzzy==0.18.0
65 |     - idna==2.10
66 |     - joblib==1.0.0
67 |     - kiwisolver==1.3.1
68 |     - matplotlib==3.3.4
69 |     - networkx==2.5
70 |     - pyparsing==2.4.7
71 |     - pytdc==0.1.5
72 |     - requests==2.25.1
73 |     - scikit-learn==0.23.2
74 |     - scipy==1.6.0
75 |     - threadpoolctl==2.1.0
76 |     - torch==1.7.1
77 |     - torchvision==0.8.2
78 |     - tqdm==4.56.0
79 |     - typing-extensions==3.7.4.3
80 |     - urllib3==1.26.3
81 | 
82 | 


--------------------------------------------------------------------------------
/mimosa.yml:
--------------------------------------------------------------------------------
 1 | name: mimosa
 2 | channels:
 3 |   - rdkit
 4 |   - pytorch
 5 |   - anaconda
 6 |   - defaults
 7 | dependencies:
 8 |   - blas=1.0=mkl
 9 |   - bzip2=1.0.8=h1de35cc_0
10 |   - ca-certificates=2020.1.1=0
11 |   - cairo=1.14.12=hc4e6be7_4
12 |   - certifi=2020.4.5.1=py37_0
13 |   - cffi=1.14.0=py37hb5b8e2f_0
14 |   - fontconfig=2.13.0=h5d5b041_1
15 |   - freetype=2.9.1=hb4e5f40_0
16 |   - gettext=0.19.8.1=h15daf44_3
17 |   - glib=2.63.1=hd977a24_0
18 |   - icu=58.2=h0a44026_3
19 |   - intel-openmp=2019.4=233
20 |   - joblib=0.14.1=py_0
21 |   - jpeg=9b=he5867d9_2
22 |   - libboost=1.67.0=hebc422b_4
23 |   - libcxx=4.0.1=hcfea43d_1
24 |   - libcxxabi=4.0.1=hcfea43d_1
25 |   - libedit=3.1.20181209=hb402a30_0
26 |   - libffi=3.2.1=h0a44026_6
27 |   - libgfortran=3.0.1=h93005f0_2
28 |   - libiconv=1.16=h1de35cc_0
29 |   - libpng=1.6.37=ha441bb4_0
30 |   - libtiff=4.1.0=hcb84e12_0
31 |   - libxml2=2.9.9=hf6e021a_1
32 |   - llvm-openmp=4.0.1=hcfea43d_1
33 |   - mkl=2019.4=233
34 |   - mkl-service=2.3.0=py37hfbe908c_0
35 |   - mkl_fft=1.0.15=py37h5e564d8_0
36 |   - mkl_random=1.1.0=py37ha771720_0
37 |   - ncurses=6.2=h0a44026_1
38 |   - ninja=1.9.0=py37h04f5b5a_0
39 |   - numpy=1.18.1=py37h7241aed_0
40 |   - numpy-base=1.18.1=py37h6575580_1
41 |   - olefile=0.46=py37_0
42 |   - openssl=1.1.1g=h1de35cc_0
43 |   - pandas=1.0.3=py37h6c726b0_0
44 |   - pcre=8.43=h0a44026_0
45 |   - pillow=7.0.0=py37h4655f20_0
46 |   - pip=20.0.2=py37_1
47 |   - pixman=0.38.0=h1de35cc_0
48 |   - py-boost=1.67.0=py37h6440ff4_4
49 |   - pycparser=2.20=py_0
50 |   - python=3.7.7=hc70fcce_0_cpython
51 |   - python-dateutil=2.8.1=py_0
52 |   - pytorch=1.0.1=py3.7_2
53 |   - pytz=2019.3=py_0
54 |   - rdkit=2020.03.1.0=py37h65625ec_1
55 |   - readline=8.0=h1de35cc_0
56 |   - scikit-learn=0.21.3=py37h27c97d8_0
57 |   - scipy=1.4.1=py37h9fa6033_0
58 |   - setuptools=46.1.3=py37_0
59 |   - six=1.14.0=py37_0
60 |   - sqlite=3.31.1=h5c1f38d_1
61 |   - tk=8.6.8=ha441bb4_0
62 |   - tqdm=4.45.0=py_0
63 |   - wheel=0.34.2=py37_0
64 |   - xz=5.2.5=h1de35cc_0
65 |   - zlib=1.2.11=h1de35cc_3
66 |   - zstd=1.3.7=h5bba6e5_0
67 |   - pip:
68 |     - decorator==4.4.2
69 |     - isodate==0.6.0
70 |     - molvs==0.1.1
71 |     - networkx==2.4
72 |     - plyfile==0.7.2
73 |     - protobuf==3.11.3
74 |     - pyparsing==2.4.7
75 |     - rdflib==5.0.0
76 |     - tensorboardx==2.0
77 |     - torch-cluster==1.2.4
78 |     - torch-geometric==1.0.3
79 |     - torch-scatter==1.1.2
80 |     - torch-sparse==0.2.4
81 |     - torch-spline-conv==1.0.6
82 | 
83 | 


--------------------------------------------------------------------------------
/src/train.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import torch
 3 | import torch.nn as nn
 4 | import torch.nn.functional as F
 5 | import numpy as np 
 6 | from tqdm import tqdm 
 7 | from matplotlib import pyplot as plt
 8 | import pickle 
 9 | from random import shuffle 
10 | torch.manual_seed(4) 
11 | np.random.seed(2)
12 | from module import GCN 
13 | from chemutils import smiles2graph, vocabulary, smiles2feature  
14 | from utils import Molecule_Dataset 
15 | 
16 | 
device = 'cpu'
data_file = "data/zinc_clean.txt"

# One SMILES per line. Stripping before shuffling gives the same result as
# stripping afterwards: the permutation depends only on the list length.
with open(data_file, 'r') as fin:
    lines = [line.strip() for line in fin.readlines()]
shuffle(lines)

# 90% of the shuffled lines for training, the rest for validation.
N = int(len(lines) * 0.9)
train_data, valid_data = lines[:N], lines[N:]

training_set = Molecule_Dataset(train_data)
valid_set = Molecule_Dataset(valid_data)

# Keyword arguments shared by both DataLoaders.
params = dict(batch_size=1, shuffle=True, num_workers=1)
35 | # exit() 
36 | 
37 | 
def collate_fn(batch_lst):
    """Identity collate function: return the sampled items as the same list.

    Handed to ``DataLoader`` as ``collate_fn`` so batches arrive as a plain
    list of the dataset's items.
    """
    return batch_lst
40 | 
train_generator = torch.utils.data.DataLoader(training_set, collate_fn = collate_fn, **params)
valid_generator = torch.utils.data.DataLoader(valid_set, collate_fn = collate_fn, **params)

gnn = GCN(nfeat = 50, nhid = 100, num_layer = 3).to(device)
print('GNN is built!')

cost_lst = []          # training loss of every step
valid_loss_lst = []    # mean validation loss at every checkpoint
epoch = 5
every_k_iters = 5000   # validate and checkpoint every this many training steps
save_folder = "save_model/GNN_epoch_"


def _smiles_to_tensors(smiles):
    """Featurize one SMILES into the arguments of ``gnn.learn`` / ``gnn.infer``."""
    ### smiles2feature: only mask leaf node
    node_mat, adjacency_matrix, idx, label = smiles2feature(smiles)
    node_mat = torch.FloatTensor(node_mat).to(device)
    adjacency_matrix = torch.FloatTensor(adjacency_matrix).to(device)
    label = torch.LongTensor([label]).view(-1).to(device)
    return node_mat, adjacency_matrix, idx, label


for ep in tqdm(range(epoch)):
    for i, batch in tqdm(enumerate(train_generator)):
        ### 1. training (batch_size is 1, so the batch holds a single SMILES)
        node_mat, adjacency_matrix, idx, label = _smiles_to_tensors(batch[0])
        # learn() returns (cost, prediction); only the cost is recorded.
        cost, _ = gnn.learn(node_mat, adjacency_matrix, idx, label)
        cost_lst.append(cost)

        #### 2. validation
        if i % every_k_iters == 0:
            gnn.eval()
            valid_loss, valid_num = 0, 0
            for valid_batch in valid_generator:
                node_mat, adjacency_matrix, idx, label = _smiles_to_tensors(valid_batch[0])
                valid_cost, _ = gnn.infer(node_mat, adjacency_matrix, idx, label)
                valid_loss += valid_cost
                valid_num += 1
            # max(..., 1) avoids a ZeroDivisionError on an empty validation set.
            valid_loss = valid_loss / max(valid_num, 1)
            valid_loss_lst.append(valid_loss)
            file_name = save_folder + str(ep) + "_validloss_" + str(valid_loss)[:7] + ".ckpt"
            torch.save(gnn, file_name)
            gnn.train()
84 | 
85 | 
86 | 
87 | 
88 | 


--------------------------------------------------------------------------------
/src/evaluate.py:
--------------------------------------------------------------------------------
 1 | import os, sys 
 2 | import numpy as np 
 3 | from time import time
 4 | from tqdm import tqdm 
 5 | from matplotlib import pyplot as plt
 6 | import pickle 
 7 | from random import shuffle 
 8 | import torch
 9 | import torch.nn as nn
10 | import torch.nn.functional as F
11 | from tdc import Oracle
12 | torch.manual_seed(1)
13 | np.random.seed(2)
14 | from tdc import Evaluator
15 | 
16 | from chemutils import * 
17 | ## 2. data and oracle 
18 | # qed = Oracle(name = 'qed')
19 | # logp = Oracle(name = 'logp')
20 | # jnk = Oracle(name = 'JNK3')
21 | # gsk = Oracle(name = 'GSK3B')
22 | # def foracle(smiles):
23 | # 	return logp(smiles)
24 | 
# The oracle name selects which result/<oracle_name>.pkl file is evaluated.
if len(sys.argv) < 2:
    sys.exit("usage: python src/evaluate.py <oracle_name>")
oracle_name = sys.argv[1]
# 'jnkgsk', 'qedsajnkgsk', 'qed', 'jnk', 'gsk'


diversity = Evaluator(name = 'Diversity')
novelty = Evaluator(name = 'Novelty')


# Reference set for the novelty metric: the first 1000 cleaned ZINC SMILES.
file = "data/zinc_clean.txt"
with open(file, 'r') as fin:
    lines = fin.readlines()
# Slice before parsing so only the 1000 lines that are kept get processed.
train_smiles_lst = [line.strip().split()[0] for line in lines[:1000]]
37 | 
38 | 
39 | ## 5. run 
if __name__ == "__main__":

    pkl_file = "result/"+oracle_name+".pkl"
    # NOTE: unpickling can execute arbitrary code -- only load result files
    # produced by this project. `with` closes the handle (it used to leak).
    with open(pkl_file, 'rb') as fin:
        idx_2_smiles2f, trace_dict = pickle.load(fin)

    # Merge the {smiles: score} dictionaries of all entries into one.
    topk = 100
    whole_smiles2f = dict()
    for idx, (smiles2f, current_set) in tqdm(idx_2_smiles2f.items()):
        whole_smiles2f.update(smiles2f)

    # Rank every molecule by score and keep the top-k.
    smiles_f_lst = sorted(whole_smiles2f.items(), key=lambda x: x[1], reverse=True)
    top_smiles_f = smiles_f_lst[:topk]
    best_smiles_lst = [smiles for smiles, f in top_smiles_f]
    best_f_lst = [f for smiles, f in top_smiles_f]
    avg, std = np.mean(best_f_lst), np.std(best_f_lst)
    print('average of top-'+str(topk), str(avg)[:5], str(std)[:5])

    #### evaluate novelty
    t1 = time()
    nov = novelty(best_smiles_lst, train_smiles_lst)
    t2 = time()
    print("novelty", nov, "takes", str(int(t2-t1)), 'seconds')

    ### evaluate diversity
    t1 = time()
    div = diversity(best_smiles_lst)
    t2 = time()
    print("diversity", div, 'takes', str(int(t2-t1)), 'seconds')
75 | 
76 | 
77 | 	# ### evaluate mean of property 
78 | 	# for oracle_name in oracle_lst:
79 | 	# 	oracle = Oracle(name = oracle_name)
80 | 	# 	scores = oracle(best_smiles_lst)
81 | 	# 	avg = np.mean(scores)
82 | 	# 	std = np.std(scores)
83 | 	# 	print(oracle_name, str(avg)[:7], str(std)[:7])
84 | 
85 | 	# for ii,smiles in enumerate(best_smiles_lst[:20]):
86 | 	# 	print(smiles, str(gsk(smiles)))
87 | 	# 	draw_smiles(smiles, "figure/best_"+oracle_name+"_"+str(ii)+'.png')
88 | 
89 | 
90 | 
91 | 
92 | 
93 | 


--------------------------------------------------------------------------------
/src/gnn_layer.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import torch
  3 | import numpy as np
  4 | import torch.nn as nn
  5 | import torch.nn.functional as F
  6 | from torch.nn.parameter import Parameter
  7 | from torch.nn.modules.module import Module
  8 | torch.manual_seed(3) 
  9 | np.random.seed(1)
 10 | 
 11 | class GraphConvolution(Module):
 12 |     """
 13 |     Simple GCN layer, similar to https://arxiv.org/abs/1609.02907
 14 |     """
 15 | 
 16 |     def __init__(self, in_features, out_features, bias=True, init='xavier'):
 17 |         super(GraphConvolution, self).__init__()
 18 |         self.in_features = in_features
 19 |         self.out_features = out_features
 20 |         self.weight = Parameter(torch.FloatTensor(in_features, out_features))
 21 |         if bias:
 22 |             self.bias = Parameter(torch.FloatTensor(out_features))
 23 |         else:
 24 |             self.register_parameter('bias', None)
 25 |         if init == 'uniform':
 26 |             print("| Uniform Initialization")
 27 |             self.reset_parameters_uniform()
 28 |         elif init == 'xavier':
 29 |             print("| Xavier Initialization")
 30 |             self.reset_parameters_xavier()
 31 |         elif init == 'kaiming':
 32 |             print("| Kaiming Initialization")
 33 |             self.reset_parameters_kaiming()
 34 |         else:
 35 |             raise NotImplementedError
 36 | 
 37 |     def reset_parameters_uniform(self):
 38 |         stdv = 1. / math.sqrt(self.weight.size(1))
 39 |         self.weight.data.uniform_(-stdv, stdv)
 40 |         if self.bias is not None:
 41 |             self.bias.data.uniform_(-stdv, stdv)
 42 | 
 43 |     def reset_parameters_xavier(self):
 44 |         nn.init.xavier_normal_(self.weight.data, gain=0.02) # Implement Xavier Uniform
 45 |         if self.bias is not None:
 46 |             nn.init.constant_(self.bias.data, 0.0)
 47 | 
 48 |     def reset_parameters_kaiming(self):
 49 |         nn.init.kaiming_normal_(self.weight.data, a=0, mode='fan_in')
 50 |         if self.bias is not None:
 51 |             nn.init.constant_(self.bias.data, 0.0)
 52 | 
 53 |     def forward(self, input, adj):
 54 |         support = torch.mm(input, self.weight)
 55 |         # print("adj", adj.dtype, "support", support.dtype)
 56 |         output = torch.spmm(adj, support)
 57 |         if self.bias is not None:
 58 |             return output + self.bias
 59 |         else:
 60 |             return output
 61 | 
 62 |     def __repr__(self):
 63 |         return self.__class__.__name__ + ' (' \
 64 |                + str(self.in_features) + ' -> ' \
 65 |                + str(self.out_features) + ')'
 66 | 
 67 | 
 68 | class GraphAttention(nn.Module):
 69 |     """
 70 |     Simple GAT layer, similar to https://arxiv.org/abs/1710.10903
 71 |     """
 72 | 
 73 |     def __init__(self, in_features, out_features, dropout, alpha, concat=True):
 74 |         super(GraphAttention, self).__init__()
 75 |         self.dropout = dropout
 76 |         self.in_features = in_features
 77 |         self.out_features = out_features
 78 |         self.alpha = alpha
 79 |         self.concat = concat
 80 | 
 81 |         self.W = nn.Parameter(nn.init.xavier_normal_(torch.Tensor(in_features, out_features).type(torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor), gain=np.sqrt(2.0)), requires_grad=True)
 82 |         self.a1 = nn.Parameter(nn.init.xavier_normal_(torch.Tensor(out_features, 1).type(torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor), gain=np.sqrt(2.0)), requires_grad=True)
 83 |         self.a2 = nn.Parameter(nn.init.xavier_normal_(torch.Tensor(out_features, 1).type(torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor), gain=np.sqrt(2.0)), requires_grad=True)
 84 | 
 85 |         self.leakyrelu = nn.LeakyReLU(self.alpha)
 86 | 
 87 |     def forward(self, input, adj):
 88 |         h = torch.mm(input, self.W)
 89 |         N = h.size()[0]
 90 | 
 91 |         f_1 = torch.matmul(h, self.a1)
 92 |         f_2 = torch.matmul(h, self.a2)
 93 |         e = self.leakyrelu(f_1 + f_2.transpose(0,1))
 94 | 
 95 |         zero_vec = -9e15*torch.ones_like(e)
 96 |         attention = torch.where(adj > 0, e, zero_vec)
 97 |         attention = F.softmax(attention, dim=1)
 98 |         attention = F.dropout(attention, self.dropout, training=self.training)
 99 |         h_prime = torch.matmul(attention, h)
100 | 
101 |         if self.concat:
102 |             return F.elu(h_prime)
103 |         else:
104 |             return h_prime
105 | 
106 |     def __repr__(self):
107 |         return self.__class__.__name__ + ' (' + str(self.in_features) + ' -> ' + str(self.out_features) + ')'
108 | 
109 | 
110 | 
111 | 
112 | 
113 | 
114 | 
115 | 
116 | 


--------------------------------------------------------------------------------
/src/module.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.nn.functional as F
  4 | from copy import deepcopy 
  5 | from torch.autograd import Variable
  6 | from torch.utils import data
  7 | from torch.utils.data import SequentialSampler
  8 | import matplotlib.pyplot as plt
  9 | import numpy as np 
 10 | sigmoid = torch.nn.Sigmoid() 
 11 | from tqdm import tqdm 
 12 | 
 13 | from gnn_layer import GraphConvolution, GraphAttention
 14 | from chemutils import smiles2graph, vocabulary 
 15 | 
# Fix the torch and NumPy random seeds when this module is imported.
torch.manual_seed(4) 
np.random.seed(1)

# Disabled alternatives kept from the original source:
# def sigmoid(x):
#     return 1/(1+np.exp(-x))
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Module-wide default device; GCN.__init__ copies it into ``self.device``.
device = 'cpu'
 23 | 
 24 | class GCN(nn.Module):
 25 |     def __init__(self, nfeat, nhid, num_layer):
 26 |         super(GCN, self).__init__()
 27 |         self.gc1 = GraphConvolution(in_features = nfeat, out_features = nhid)
 28 |         self.gcs = [GraphConvolution(in_features = nhid, out_features = nhid) for i in range(num_layer)]
 29 |         # self.dropout = dropout
 30 |         from chemutils import vocabulary 
 31 |         self.vocabulary_size = len(vocabulary) 
 32 |         self.out_fc = nn.Linear(nhid, self.vocabulary_size)
 33 |         self.nfeat = nfeat 
 34 |         self.nhid = nhid 
 35 |         self.num_layer = num_layer 
 36 |         # self.embedding = nn.Embedding(self.vocabulary_size, nfeat)
 37 |         self.embedding = nn.Linear(self.vocabulary_size + 1, nfeat)
 38 |         self.criteria = torch.nn.CrossEntropyLoss() 
 39 |         self.opt = torch.optim.Adam(self.parameters(), lr=1e-3, betas=(0.9, 0.99))
 40 |         self.device = device 
 41 |         self = self.to(device) 
 42 | 
 43 |     def switch_device(self, device):
 44 |         self.device = device 
 45 |         self = self.to(device)
 46 | 
 47 |     def forward(self, node_mat, adj, idx):
 48 |         ''' N: # substructure  &  d: vocabulary size
 49 |         Input: 
 50 |             node_mat:  [N,d]     row sum is 1.
 51 |             adj:       [N,N]    
 52 |             idx:     integer 
 53 | 
 54 |         Output:
 55 |             scalar   prediction before sigmoid           [-inf, inf]
 56 |         '''
 57 |         node_mat, adj = node_mat.to(self.device), adj.to(self.device)
 58 |         x = self.embedding(node_mat)
 59 |         x = F.relu(self.gc1(x,adj))
 60 |         for gc in self.gcs:
 61 |             x = F.relu(gc(x,adj))
 62 |         x = x[idx].view(1,-1)
 63 |         logits = self.out_fc(x)
 64 |         return logits 
 65 |         ## without sigmoid 
 66 | 
 67 |     def smiles2embed(self, smiles):
 68 |         idx_lst, node_mat, substructure_lst, atomidx_2substridx, adj, leaf_extend_idx_pair = smiles2graph(smiles)
 69 |         idx_vec = torch.LongTensor(idx_lst).to(device)
 70 |         node_mat = torch.FloatTensor(node_mat).to(device)
 71 |         adj = torch.FloatTensor(adj).to(device)
 72 |         weight = torch.ones_like(idx_vec).to(device)
 73 |         
 74 |         ### forward 
 75 |         node_mat, adj, weight = node_mat.to(self.device), adj.to(self.device), weight.to(self.device)
 76 |         x = self.embedding(node_mat) ## bug 
 77 |         x = F.relu(self.gc1(x,adj))
 78 |         for gc in self.gcs:
 79 |             x = F.relu(gc(x,adj))
 80 |         return torch.mean(x, 0)
 81 | 
 82 | 
 83 |     def smiles2pred(self, smiles):
 84 |         idx_lst, node_mat, substructure_lst, atomidx_2substridx, adj, leaf_extend_idx_pair = smiles2graph(smiles)
 85 |         idx_vec = torch.LongTensor(idx_lst).to(device)
 86 |         node_mat = torch.FloatTensor(node_mat).to(device)
 87 |         adj = torch.FloatTensor(adj).to(device)
 88 |         weight = torch.ones_like(idx_vec).to(device)
 89 |         logits = self.forward(node_mat, adj, weight)
 90 |         pred = torch.sigmoid(logits) 
 91 |         return pred.item() 
 92 | 
 93 |     def learn(self, node_mat, adj, idx, label):
 94 |         pred_y = self.forward(node_mat, adj, idx)
 95 |         pred_y = pred_y.view(1,-1)
 96 |         # print(pred_y, pred_y.shape, label, label.shape) 
 97 |         cost = self.criteria(pred_y, label) 
 98 |         self.opt.zero_grad() 
 99 |         cost.backward() 
100 |         self.opt.step() 
101 |         return cost.data.numpy(), pred_y.data.numpy() 
102 | 
103 |     def infer(self, node_mat, adj, idx, target):
104 |         pred_y = self.forward(node_mat, adj, idx)
105 |         pred_y = pred_y.view(1,-1)
106 |         cost = self.criteria(pred_y, target)
107 |         return cost.data.numpy(), pred_y.data.numpy() 
108 | 
109 | 
if __name__ == "__main__":
    # When run as a script, just construct a small GCN instance.
    gnn = GCN(nfeat = 50, nhid = 100, num_layer = 2)
112 | 
113 | 
114 | 
115 | 
116 | 
117 | 
118 | 
119 | 
120 | 
121 | 
122 | 
123 | 
124 | 
125 | 


--------------------------------------------------------------------------------
/src/run.py:
--------------------------------------------------------------------------------
  1 | import os, pickle, torch, random
  2 | import numpy as np 
  3 | import argparse
  4 | from time import time
  5 | from tqdm import tqdm 
  6 | from matplotlib import pyplot as plt
  7 | from random import shuffle 
  8 | import torch.nn as nn
  9 | import torch.nn.functional as F
 10 | from tdc import Oracle
 11 | torch.manual_seed(1)
 12 | np.random.seed(2)
 13 | random.seed(1)
 14 | from chemutils import * 
 15 | from inference_utils import * 
 16 | 
 17 | 
 18 | def optimization(start_smiles_lst, gnn, oracle, oracle_num, oracle_name, generations, population_size, lamb, topk, epsilon, result_pkl):
 19 | 	smiles2score = dict() ### oracle_num
 20 | 	def oracle_new(smiles):
 21 | 		if smiles not in smiles2score:
 22 | 			value = oracle(smiles) 
 23 | 			smiles2score[smiles] = value 
 24 | 		return smiles2score[smiles] 
 25 | 	trace_dict = dict() 
 26 | 	existing_set = set(start_smiles_lst)  
 27 | 	current_set = set(start_smiles_lst)
 28 | 	average_f = np.mean([oracle_new(smiles) for smiles in current_set])
 29 | 	f_lst = [(average_f, 0.0)]
 30 | 	idx_2_smiles2f = {}
 31 | 	smiles2f_new = {smiles:oracle_new(smiles) for smiles in start_smiles_lst} 
 32 | 	idx_2_smiles2f[-1] = smiles2f_new, current_set 
 33 | 	for i_gen in tqdm(range(generations)):
 34 | 		next_set = set()
 35 | 		for smiles in current_set:
 36 | 			smiles_set = optimize_single_molecule_one_iterate(smiles, gnn)
 37 | 
 38 | 			for smi in smiles_set:
 39 | 				if smi not in trace_dict:
 40 | 					trace_dict[smi] = smiles ### ancestor -> offspring 
 41 | 			next_set = next_set.union(smiles_set)
 42 | 		# next_set = next_set.difference(existing_set)   ### if allow repeat molecule  
 43 | 		smiles_score_lst = oracle_screening(next_set, oracle_new)  ###  sorted smiles_score_lst 
 44 | 		print(smiles_score_lst[:5], "Oracle num", len(smiles2score))
 45 | 
 46 | 		# current_set = [i[0] for i in smiles_score_lst[:population_size]]  # Option I: top-k 
 47 | 		current_set,_,_ = dpp(smiles_score_lst = smiles_score_lst, num_return = population_size, lamb = lamb) 	# Option II: DPP
 48 | 		existing_set = existing_set.union(next_set)
 49 | 
 50 | 		# save 
 51 | 		smiles2f_new = {smiles:score for smiles,score in smiles_score_lst} 
 52 | 		idx_2_smiles2f[i_gen] = smiles2f_new, current_set 
 53 | 		pickle.dump((idx_2_smiles2f, trace_dict), open(result_pkl, 'wb'))
 54 | 
 55 | 		#### compute f-score
 56 | 		score_lst = [smiles2f_new[smiles] for smiles in current_set] 
 57 | 		average_f = np.mean(score_lst)
 58 | 		std_f = np.std(score_lst)
 59 | 		f_lst.append((average_f, std_f))
 60 | 		str_f_lst = [str(i[0])[:5]+'\t'+str(i[1])[:5] for i in f_lst]
 61 | 		with open("result/" + oracle_name + "_f_t.txt", 'w') as fout:
 62 | 			fout.write('\n'.join(str_f_lst))
 63 | 		if len(smiles2score) > oracle_num: 
 64 | 			break 
 65 | 
 66 | def main():
 67 | 	parser = argparse.ArgumentParser()
 68 | 	parser.add_argument('--oracle_num', type=int, default=1500)
 69 | 	parser.add_argument('--oracle_name', type=str, default="qed", choices=['jnkgsk', 'qedsajnkgsk', 'qed', 'jnk', 'gsk'])	
 70 | 	parser.add_argument('--generations', type=int, default=50)	
 71 | 	parser.add_argument('--population_size', type=int, default=20)	
 72 | 	args = parser.parse_args()
 73 | 
 74 | 	oracle_num = args.oracle_num 
 75 | 	oracle_name = args.oracle_name 
 76 | 	generations = args.generations 
 77 | 	population_size = args.population_size
 78 | 
 79 | 	start_smiles_lst = ['C1(N)=NC=CC=N1']  ## 'C1=CC=CC=C1NC2=NC=CC=N2'
 80 | 	qed = Oracle('qed')
 81 | 	sa = Oracle('sa')
 82 | 	jnk = Oracle('JNK3')
 83 | 	gsk = Oracle('GSK3B')
 84 | 	logp = Oracle('logp')
 85 | 	mu = 2.230044
 86 | 	sigma = 0.6526308
 87 | 	def normalize_sa(smiles):
 88 | 		sa_score = sa(smiles)
 89 | 		mod_score = np.maximum(sa_score, mu)
 90 | 		return np.exp(-0.5 * np.power((mod_score - mu) / sigma, 2.)) 
 91 | 
 92 | 
 93 | 	if oracle_name == 'jnkgsk':
 94 | 		def oracle(smiles):
 95 | 			return np.mean((jnk(smiles), gsk(smiles)))
 96 | 	elif oracle_name == 'qedsajnkgsk':
 97 | 		def oracle(smiles):
 98 | 			return np.mean((qed(smiles), normalize_sa(smiles), jnk(smiles), gsk(smiles))) 
 99 | 	elif oracle_name == 'qed':
100 | 		def oracle(smiles):
101 | 			return qed(smiles) 
102 | 	elif oracle_name == 'jnk':
103 | 		def oracle(smiles):
104 | 			return jnk(smiles)
105 | 	elif oracle_name == 'gsk':
106 | 		def oracle(smiles):
107 | 			return gsk(smiles) 
108 | 	elif oracle_name == 'logp':
109 | 		def oracle(smiles):
110 | 			return logp(smiles)
111 | 
112 | 	# device = 'cuda' if torch.cuda.is_available() else 'cpu'
113 | 	device = 'cpu' ## cpu is better 
114 | 	model_ckpt = "save_model/GNN_epoch_0_validloss_1.61160.ckpt"
115 | 	gnn = torch.load(model_ckpt)
116 | 	gnn.switch_device(device)
117 | 
118 | 	result_pkl = "result/" + oracle_name + ".pkl"
119 | 	optimization(start_smiles_lst, gnn, oracle, oracle_num, oracle_name,
120 | 						generations = generations, 
121 | 						population_size = population_size, 
122 | 						lamb=2, 
123 | 						topk = 5, 
124 | 						epsilon = 0.7, 
125 | 						result_pkl = result_pkl) 
126 | 
127 | 	
128 | 
if __name__ == "__main__":
	# Run the optimisation only when executed as a script, not on import.
	main() 
131 | 
132 | 
133 | 
134 | 
135 | 
136 | 
137 | 
138 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # 💊 MIMOSA: Multi-constraint Molecule Sampling for Molecule Optimization
  2 | 
  3 | [![License](https://img.shields.io/badge/License-BSD_2--Clause-orange.svg)](https://opensource.org/licenses/BSD-2-Clause)
  4 | [![Python 3.7+](https://img.shields.io/badge/python-3.7+-blue.svg)](https://www.python.org/downloads/release/python-370/)
  5 | [![GitHub Repo stars](https://img.shields.io/github/stars/futianfan/MIMOSA)](https://github.com/futianfan/MIMOSA/stargazers)
  6 | [![GitHub Repo stars](https://img.shields.io/github/forks/futianfan/MIMOSA)](https://github.com/futianfan/MIMOSA/network/members)
  7 | 
  8 | 
  9 | 
 10 | This repository hosts MIMOSA: Multi-constraint Molecule Sampling for Molecule Optimization (AAAI 2021; Tianfan Fu, Cao Xiao, Xinhao Li, Lucas Glass, Jimeng Sun), which uses a pretrained graph neural network (GNN) and MCMC sampling for molecule optimization. 
 11 | 
 12 | ![pipeline](MIMOSA.png)
 13 | 
 14 | 
 15 | ## Table Of Contents
 16 | 
 17 | - [Installation](#installation) 
 18 | - [Data](#data)
 19 | - [Pretraining](#pretrain)
 20 | - [Run](#run) 
 21 | - [Contact](#contact) 
 22 | 
 23 | 
 24 | <a name="installation"></a>
 25 | ## ⚙️ 1. Installation 
 26 | 
 27 | To install locally, we recommend using `pip` and `conda`. See `conda.yml` for the package dependencies. 
 28 | ```bash
 29 | conda create -n mimosa python=3.7 
 30 | conda activate mimosa
 31 | pip install torch 
 32 | pip install PyTDC 
 33 | conda install -c rdkit rdkit 
 34 | ```
 35 | 
 36 | Activate conda environment. 
 37 | ```bash
 38 | conda activate mimosa
 39 | ```
 40 | 
 41 | Create the output directories:
 42 | ```bash
 43 | mkdir -p save_model result 
 44 | ```
 45 | 
 46 | <a name="data"></a>
 47 | ## 📊 2. Data
 48 | In our setup, we restrict the number of oracle calls. In realistic discovery settings, the oracle acquisition cost is usually not negligible. 
 49 | 
 50 | ### Raw Data 
 51 | We use the [`ZINC`](https://tdcommons.ai/generation_tasks/molgen/) database, which contains around 250K drug-like molecules. It can be downloaded with the following command: 
 52 | ```bash
 53 | python src/download.py
 54 | ```
 55 | - output
 56 |   - `data/zinc.tab`: all the smiles in ZINC, around 250K. 
 57 | 
 58 | ### Oracle
 59 | An oracle is a property evaluator: a function that takes a molecular structure as input and returns a property score. 
 60 | We consider the following oracles: 
 61 | * `JNK3`: biological activity to JNK3, ranging from 0 to 1.
 62 | * `GSK3B`: biological activity to GSK3B, ranging from 0 to 1. 
 63 | * `QED`: Quantitative Estimate of Drug-likeness, ranging from 0 to 1. 
 64 | * `SA`: Synthetic Accessibility, we normalize SA to (0,1). 
 65 | * `LogP`: solubility and synthetic accessibility of a compound. It ranges from negative infinity to positive infinity. 
 66 | 
 67 | For all the property scores above, higher is more desirable. 
 68 | 
 69 | ### Optimization Task 
 70 | There are two kinds of optimization tasks: single-objective and multi-objective optimization. 
 71 | Multi-objective optimization contains `jnkgsk` (JNK3 + GSK3B), `qedsajnkgsk` (QED + SA + JNK3 + GSK3B). 
 72 | 
 73 | 
 74 | ### Generate Vocabulary 
 75 | In this project, the basic unit is `substructure`, which can be atoms or single rings. 
 76 | The vocabulary is the set of frequent `substructures`. 
 77 | ```bash 
 78 | python src/vocabulary.py
 79 | ```
 80 | - input
 81 |   - `data/zinc.tab`: all the smiles in ZINC, around 250K. 
 82 | - output
 83 |   - `data/substructure.txt`: including all the substructures in ZINC. 
 84 |   - `data/vocabulary.txt`: vocabulary, frequent substructures. 
 85 | 
 86 | ### data cleaning  
 87 | We remove the molecules that contain a substructure that is not in the vocabulary. 
 88 | 
 89 | ```bash 
 90 | python src/clean.py 
 91 | ```
 92 | 
 93 | - input 
 94 |   - `data/vocabulary.txt`: vocabulary 
 95 |   - `data/zinc.tab`: all the smiles in ZINC
 96 | - output
 97 |   - `data/zinc_clean.txt`
 98 | 
 99 | 
100 | 
101 | 
102 | 
103 | <a name="pretrain"></a>
104 | ## Pre-train graph neural network (GNN)
105 | ```bash 
106 | python src/train.py 
107 | ```
108 | - input 
109 |   - `data/zinc_clean.txt`
110 | - output 
111 |   - `save_model/GNN.ckpt`: trained GNN model. 
112 | - log
113 |   - `gnn_loss.pkl`: the valid loss. 
114 | 
115 | <a name="run"></a>
116 | ## 🤖 Run 
117 | 
118 | ### de novo molecule design 
119 | 
120 | ```bash
121 | python src/run.py
122 | ```
123 | - input 
124 |   - `save_model/GNN_epoch_0_validloss_1.61160.ckpt`: pretrained GNN model (the checkpoint path is hard-coded as `model_ckpt` in `src/run.py`). 
125 | - output 
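126 |   - `result/{$prop}.pkl`: set of generated molecules (the per-generation mean and standard deviation of the score are also written to `result/{$prop}_f_t.txt`). 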
127 | 
128 | For example, 
129 | ```bash 
130 | python src/run.py --oracle_name jnkgsk --oracle_num 1500 --generations 50 --population_size 20
131 | ```
132 | 
133 | ### evaluate 
134 | 
135 | ```bash
136 | python src/evaluate.py $prop 
137 | ```
138 | - input 
139 |   - `result/{$prop}.pkl`
140 | - output 
141 |   - `diversity`, `novelty`, `average property` of top-100 molecules with highest property. 
142 | 
143 | For example, 
144 | ```bash 
145 | python src/evaluate.py jnkgsk 
146 | ```
147 | 
148 | <a name="contact"></a>
149 | ## 📞 Contact 
150 | Please contact futianfan@gmail.com for help or submit an issue. 
151 | 
152 | 
153 | ## Cite Us
154 | If you found this package useful, please cite our paper:
155 | ```
156 | @inproceedings{fu2021mimosa,
157 |   title={MIMOSA: Multi-constraint Molecule Sampling for Molecule Optimization},
158 |   author={Fu, Tianfan and Xiao, Cao and Li, Xinhao and Glass, Lucas M and Sun, Jimeng},
159 |   booktitle={Proceedings of the AAAI Conference on Artificial Intelligence},
160 |   volume={35},
161 |   number={1},
162 |   pages={125--133},
163 |   year={2021}
164 | }
165 | ```
166 | 
167 | 
168 | 
169 | 
170 | 
171 | 
172 | 


--------------------------------------------------------------------------------
/src/inference_utils.py:
--------------------------------------------------------------------------------
  1 | 
  2 | ### 1. import
  3 | import numpy as np 
  4 | from tqdm import tqdm 
  5 | from matplotlib import pyplot as plt
  6 | import pickle 
  7 | from random import shuffle 
  8 | import torch
  9 | import torch.nn as nn
 10 | import torch.nn.functional as F
 11 | from tdc import Oracle
 12 | torch.manual_seed(1)
 13 | np.random.seed(2)
 14 | import random 
 15 | random.seed(1)
 16 | from chemutils import * 
 17 | '''
 18 | optimize_single_molecule_one_iterate
 19 | gnn_prediction_of_single_smiles
 20 | oracle_screening
 21 | gnn_screening
 22 | optimize_single_molecule_all_generations
 23 | similarity_matrix(smiles_lst)
 24 | '''
 25 | from dpp import DPPModel
 26 | 
 27 | 
 28 | 
 29 | def gnn_prediction_of_single_smiles(smiles, gnn):
 30 | 	if not is_valid(smiles):
 31 | 		return 0
 32 | 	return gnn.smiles2pred(smiles)
 33 | 	# idx_lst, node_mat, substructure_lst, atomidx_2substridx, adjacency_matrix, leaf_extend_idx_pair = smiles2graph(smiles)
 34 | 	# idx_vec = torch.LongTensor(idx_lst)
 35 | 	# node_mat = torch.FloatTensor(node_mat)
 36 | 	# adjacency_matrix = torch.FloatTensor(adjacency_matrix)
 37 | 	# weight = torch.ones_like(idx_vec)
 38 | 	# logits = gnn(node_mat = node_mat, adj = adjacency_matrix, weight = weight)
 39 | 	# logits = logits.item() 
 40 | 	# print("gnn prediction", logits)
 41 | 	# return logits 
 42 | 
 43 | 
 44 | def oracle_screening(smiles_set, oracle):
 45 | 	smiles_score_lst = []
 46 | 	for smiles in smiles_set:
 47 | 		score = oracle(smiles)
 48 | 		smiles_score_lst.append((smiles, score))
 49 | 	smiles_score_lst.sort(key=lambda x:x[1], reverse=True)
 50 | 	return smiles_score_lst 
 51 | 
 52 | def dpp(smiles_score_lst, num_return, lamb):
 53 | 	smiles_lst = [i[0] for i in smiles_score_lst]
 54 | 	if len(smiles_lst) <= num_return:
 55 | 		return smiles_lst, None, None 
 56 | 	score_arr = np.array([i[1] for i in smiles_score_lst])
 57 | 	sim_mat = similarity_matrix(smiles_lst)
 58 | 	dpp_model = DPPModel(smiles_lst = smiles_lst, sim_matrix = sim_mat, f_scores = score_arr, top_k = num_return, lamb = lamb)
 59 | 	smiles_lst, log_det_V, log_det_S = dpp_model.dpp()
 60 | 	return smiles_lst, log_det_V, log_det_S 
 61 | 
 62 | 
 63 | def gnn_screening(smiles_set, gnn):
 64 | 	smiles_score_lst = []
 65 | 	for smiles in smiles_set:
 66 | 		score = gnn_prediction_of_single_smiles(smiles, gnn)
 67 | 		smiles_score_lst.append((smiles, score))
 68 | 	smiles_score_lst.sort(key=lambda x:x[1], reverse=True)
 69 | 	return smiles_score_lst
 70 | 	# smiles_lst = [i[0] for i in smiles_score_lst]
 71 | 	# return smiles_lst
 72 | 
def optimize_single_node(smiles):
	"""Unfinished helper for inputs with a single substructure.

	It only asserts that ``substr_num(smiles)`` equals 1, loads the vocabulary
	and defines a list of atoms; none of these are used afterwards and the
	function implicitly returns ``None``.
	"""
	assert substr_num(smiles)==1 
	vocabulary = load_vocabulary()  # loaded but not used below
	atoms = ['N', 'C']  # not used below
 77 | 
 78 | # bondtype_list = [rdkit.Chem.rdchem.BondType.SINGLE, rdkit.Chem.rdchem.BondType.DOUBLE] ### chemutils 
 79 | 
 80 | def optimize_single_molecule_one_iterate(smiles, gnn):
 81 | 	target_ = torch.LongTensor([0]).view(-1)
 82 | 	if smiles == None:
 83 | 		return set() 
 84 | 	if not is_valid(smiles):
 85 | 		return set()
 86 | 	origin_mol = Chem.rdchem.RWMol(Chem.MolFromSmiles(smiles))
 87 | 	new_smiles_set = set() 
 88 | 	jj=-100
 89 | 
 90 | 	origin_idx_lst, origin_node_mat, origin_substructure_lst, \
 91 | 	origin_atomidx_2substridx, origin_adjacency_matrix, leaf_extend_idx_pair = smiles2graph(smiles)
 92 | 
 93 | 	feature_lst = smiles2expandfeature(smiles)
 94 | 	for node_mat, adj_mat, mask_idx in feature_lst:
 95 | 		node_mat = torch.FloatTensor(node_mat) 
 96 | 		adj_mat = torch.FloatTensor(adj_mat)
 97 | 		N = adj_mat.shape[0]
 98 | 		for jj in range(N):
 99 | 			if adj_mat[jj,N-1]==1:
100 | 				break 
101 | 
102 | 		_, prediction = gnn.infer(node_mat, adj_mat, mask_idx, target_)
103 | 		top_idxs = prediction.reshape(-1).argsort().tolist()[::-1][:3]
104 | 		top_words = [vocabulary[ii] for ii in top_idxs]
105 | 		for substru_idx, word in zip(top_idxs, top_words):
106 | 			leaf_atom_idx_lst = origin_substructure_lst[jj]
107 | 
108 | 			if type(leaf_atom_idx_lst)==int:  ### int: single atom;   else: list of integer
109 | 				leaf_atom_idx_lst = [leaf_atom_idx_lst]
110 | 			for leaf_atom_idx in leaf_atom_idx_lst:
111 | 				for new_bond in bondtype_list:
112 | 					if ith_substructure_is_atom(substru_idx):
113 | 						new_smiles = add_atom_at_position(editmol = origin_mol, position_idx = leaf_atom_idx, 
114 |                                                           new_atom = word, new_bond = new_bond)
115 | 						new_smiles_set.add(new_smiles)
116 | 					else:
117 | 						new_smiles_batch = add_fragment_at_position(editmol = origin_mol, position_idx = leaf_atom_idx, 
118 |                                                                     fragment = word , new_bond = new_bond)
119 | 						new_smiles_set = new_smiles_set.union(new_smiles_batch)
120 | 
121 | 	new_smiles_set = set([new_smiles for new_smiles in new_smiles_set if new_smiles != None])
122 | 	return new_smiles_set
123 | 
124 | 
125 | 
126 | 
def optimize_single_molecule_all_generations(input_smiles, gnn, oracle, generations, population_size, lamb):
	"""Optimise one molecule over several generations.

	Every generation proposes edits for each molecule of the current
	population, scores all proposals with ``oracle`` and keeps
	``population_size`` of them via ``dpp``.

	Args:
		input_smiles: starting molecule (canonicalised first).
		gnn: model used to propose edits.
		oracle: callable mapping a SMILES string to a score.
		generations: number of rounds.
		population_size: molecules kept after each round.
		lamb: passed through to ``dpp``.

	Returns:
		``(best_mol_score_list, input_score, traceback_dict)`` where the first
		item lists every scored ``(smiles, score)`` pair sorted best-first and
		the last maps each proposed molecule to the molecule it was derived from.
	"""
	### This module only defines optimize_single_molecule_one_iterate; use the
	### "_v2" variant if one is available in the namespace, else fall back to it.
	propose = globals().get('optimize_single_molecule_one_iterate_v2', optimize_single_molecule_one_iterate)
	traceback_dict = dict()
	input_smiles = canonical(input_smiles)
	input_score = oracle(input_smiles)
	best_mol_score_list = []
	current_smiles_lst = [input_smiles]
	for it in tqdm(range(generations)):
		new_smiles_set = set()
		#### optimize each single smiles
		for smiles in current_smiles_lst:
			proposal_smiles_set = propose(smiles, gnn)
			proposal_smiles_set = proposal_smiles_set.difference({input_smiles})
			for new_smiles in proposal_smiles_set:
				### remember only the first parent of each proposal
				traceback_dict.setdefault(new_smiles, smiles)
			new_smiles_set.update(proposal_smiles_set)

		### scoring new smiles with the oracle only (oracle calls are unbounded);
		### pre-filtering with gnn_screening would cap them at
		### generations * population_size * 3 + 1
		mol_score_list = oracle_screening(new_smiles_set, oracle)
		for smiles, score in mol_score_list:
			if score > 0.50:
				print('example', smiles, score)

		### save results
		best_mol_score_list.extend(mol_score_list)

		### dpp returns (smiles_lst, log_det_V, log_det_S); only the list is the population
		smiles_lst, _, _ = dpp(mol_score_list, num_return = population_size, lamb = lamb)

		### for next generation
		current_smiles_lst = list(smiles_lst)

	### endfor

	best_mol_score_list.sort(key=lambda x:x[1], reverse=True)
	return best_mol_score_list, input_score, traceback_dict
186 | 
187 | 
188 | 
def calculate_results(input_smiles, input_score, best_mol_score_list, result_file=None):
	"""Append a one-line summary of an optimisation run to ``result_file``.

	Args:
		input_smiles: the starting molecule.
		input_score: oracle score of the starting molecule.
		best_mol_score_list: ``(smiles, score)`` pairs sorted best-first.
		result_file: path of the text file to append to.  When omitted, a
			module-level ``result_file`` variable is used if one exists
			(the previous implicit behaviour).

	Returns:
		``None`` if ``best_mol_score_list`` is empty, otherwise
		``(input_score, best_score)``.

	Raises:
		ValueError: if no result file can be determined.
	"""
	if result_file is None:
		result_file = globals().get('result_file')
	if result_file is None:
		raise ValueError("result_file must be provided")
	if not best_mol_score_list:
		with open(result_file, 'a') as fout:
			fout.write("fail to optimize" + input_smiles + '\n')
		return None
	output_scores = [i[1] for i in best_mol_score_list]
	smiles_lst = [i[0] for i in best_mol_score_list]
	### columns: input score, best score, mean of top-3 scores, input smiles, top-3 smiles
	with open(result_file, 'a') as fout:
		fout.write(str(input_score) + '\t' + str(output_scores[0]) + '\t' + str(np.mean(output_scores[:3]))
				 + '\t' + input_smiles + '\t' + ' '.join(smiles_lst[:3]) + '\n')
	return input_score, output_scores[0]

def inference_single_molecule(input_smiles, gnn, result_file, generations, population_size, oracle=None, lamb=2):
	"""Optimise one molecule and record the outcome in ``result_file``.

	Args:
		input_smiles: the starting molecule.
		gnn: model used to propose edits.
		result_file: text file the summary line is appended to.
		generations, population_size, lamb: forwarded to
			``optimize_single_molecule_all_generations``.
		oracle: callable mapping a SMILES string to a score.  When omitted, a
			module-level ``oracle`` variable is used if one exists.

	Returns:
		Whatever ``calculate_results`` returns.

	Raises:
		ValueError: if no oracle can be determined.
	"""
	if oracle is None:
		oracle = globals().get('oracle')
	if oracle is None:
		raise ValueError("oracle must be provided")
	best_mol_score_list, input_score, traceback_dict = optimize_single_molecule_all_generations(
		input_smiles, gnn, oracle, generations, population_size, lamb)
	return calculate_results(input_smiles, input_score, best_mol_score_list, result_file=result_file)




def inference_molecule_set(input_smiles_lst, gnn, result_file, generations, population_size, oracle=None, lamb=2):
	"""Run ``inference_single_molecule`` on every valid SMILES in the list.

	Invalid SMILES are skipped.  Returns one entry per processed molecule:
	``(input_score, output_score)``, or ``None`` when optimisation produced
	nothing.
	"""
	score_lst = []
	for input_smiles in tqdm(input_smiles_lst):
		if not is_valid(input_smiles):
			continue
		result = inference_single_molecule(input_smiles, gnn, result_file, generations, population_size,
										   oracle=oracle, lamb=lamb)
		if result is None:
			score_lst.append(None)
		else:
			input_score, output_score = result
			score_lst.append((input_score, output_score))
	return score_lst
220 | 
221 | 
222 | 
223 | 
224 | 
225 | 
226 | 
227 | 
228 | 


--------------------------------------------------------------------------------
/data/substructure.txt:
--------------------------------------------------------------------------------
   1 | C	1158545
   2 | O	500212
   3 | N	280451
   4 | C1=CC=CC=C1	257945
   5 | F	79430
   6 | S	51103
   7 | Cl	42872
   8 | C1=CC=NC=C1	27852
   9 | C1CCCCC1	20256
  10 | C1=CNN=C1	18920
  11 | C1=CSC=C1	17515
  12 | C1CCNCC1	15912
  13 | C1CC1	15462
  14 | C1CCCC1	14328
  15 | Br	12722
  16 | C1=CSC=N1	12617
  17 | C1COCCN1	11924
  18 | C1CNCCN1	11701
  19 | C1=COC=C1	11274
  20 | C1CCCN1	9739
  21 | C1=CN=CN=C1	7964
  22 | C1CC[NH+]CC1	7948
  23 | C1CCNC1	7634
  24 | C1CCCNC1	7277
  25 | C1=CCCC=C1	6243
  26 | C1=NN=CN1	5748
  27 | C1CNCC1	5513
  28 | C1CCOC1	5310
  29 | C1=CNC=N1	5201
  30 | C1=NOC=N1	5141
  31 | C1=CON=C1	4917
  32 | C1C[NH+]CCN1	4893
  33 | C1CC[NH+]C1	4027
  34 | C1=CCCCC1	3985
  35 | C1=NNC=N1	3958
  36 | C1COCC1	3829
  37 | C1=CCNC=C1	3752
  38 | C1=C[NH]N=C1	3575
  39 | C1=CNC=C1	3521
  40 | C1CCC1	2924
  41 | C1CCOCC1	2906
  42 | C1CCC[NH+]C1	2784
  43 | C1CNC=N1	2772
  44 | C1CCCCN1	2771
  45 | C1CSC=N1	2756
  46 | C1COCO1	2643
  47 | C1CSCC1	2614
  48 | C1=CN=CNC1	2566
  49 | C1CNCN1	2548
  50 | C1CNCCC1	2540
  51 | C1CSCN1	2360
  52 | C1=C[NH]CC1	2336
  53 | C1=CCCC1	2217
  54 | C1COCCC1	2178
  55 | C1COCCO1	2140
  56 | C1=NN=NN1	2117
  57 | C1=NN=CS1	2079
  58 | C1=NN=CO1	1994
  59 | C1=CCNCC1	1991
  60 | C1=COC=N1	1936
  61 | C1=CC=[NH+]C=C1	1876
  62 | C1=C[NH+]=CN1	1716
  63 | C1=CN=CC=N1	1706
  64 | C1=CNCN=C1	1630
  65 | C1=N[NH]C=N1	1598
  66 | C1C[NH+]CC1	1477
  67 | C1=CNN=N1	1444
  68 | C1=CCOCC1	1434
  69 | C1=COCC1	1406
  70 | C1=C[NH]C=C1	1389
  71 | C1CSCCN1	1385
  72 | C1=CC=NCC1	1375
  73 | C1C=CCCC1	1364
  74 | C1=CCNN=C1	1329
  75 | C1=CNCC1	1317
  76 | C1=CN=C[NH]C1	1268
  77 | C1=CNCNC1	1265
  78 | C1C[NH]C=N1	1230
  79 | C1CN=CNC1	1145
  80 | C1CCCCCC1	1145
  81 | C1COC=N1	1139
  82 | C1CCCNCC1	1102
  83 | C1=CSCN1	993
  84 | C1C=CCN1	955
  85 | C1CC[NH2+]CC1	940
  86 | C1=CN=NC=C1	917
  87 | I	888
  88 | C1CNCC[NH+]C1	834
  89 | C1=CC=[NH+]C1	816
  90 | C1=C[NH]C=N1	766
  91 | C1C=NNC1	759
  92 | C1=CSCC1	746
  93 | C1=NNCN1	735
  94 | C1CCCCCN1	717
  95 | C1=CC[NH]C=C1	694
  96 | C1CCCO1	692
  97 | C1CNCNC1	672
  98 | C1C[NH+]CCC1	654
  99 | C1CCSCC1	650
 100 | C1C=C[NH]C1	625
 101 | C1NCCN1	589
 102 | C1CNCCNC1	575
 103 | C1CCNCN1	566
 104 | C1=NNCC1	563
 105 | C1CNC1	538
 106 | C1=CN=NC1	536
 107 | C1NCCS1	523
 108 | C1C=CSC1	505
 109 | C1=CC[NH]CC1	500
 110 | C1NCCCN1	494
 111 | C1CNCS1	492
 112 | C1CCC=CN1	489
 113 | C1C[NH2+]CCN1	488
 114 | C1C=N[NH]C1	486
 115 | C1CC[NH2+]C1	482
 116 | C1COCN1	481
 117 | C1CCCS1	470
 118 | C1CCNCCN1	459
 119 | C1=NNCS1	459
 120 | C1=C[NH+]=C[NH]1	444
 121 | C1=CC=[N+]C=C1	443
 122 | C1CCNN=C1	430
 123 | C1CCC[NH+]CC1	430
 124 | C1=N[NH]CN1	428
 125 | C1=COCO1	426
 126 | C1C[NH2+]CC1	415
 127 | C1CCC[NH+]1	398
 128 | C1=CC[NH+]CC1	397
 129 | C1CCC[NH2+]C1	388
 130 | C1NC=CCN1	388
 131 | C1C=COCC1	386
 132 | C1=CNCCC1	381
 133 | C1=COCCO1	379
 134 | C1COCCCO1	378
 135 | C1C=NC=NC1	377
 136 | C1NCCO1	370
 137 | C1CCCSC1	356
 138 | C1C=COC1	354
 139 | C1CCCOC1	350
 140 | C1=CNC[NH]C1	349
 141 | C1CN=CC=N1	326
 142 | C1=NNN=N1	321
 143 | C1CN=C[NH]C1	320
 144 | C1CCSC1	320
 145 | C1=CNC=CC1	318
 146 | C1=CSN=N1	311
 147 | C1=COC=CC1	310
 148 | C1CNN=N1	309
 149 | C1=NSC=N1	308
 150 | C1=[NH+]CCN1	307
 151 | C1=COCCC1	296
 152 | C1C=CNCC1	286
 153 | C1C=CC=CC1	278
 154 | C1=CCOC=C1	272
 155 | C1=NON=C1	262
 156 | C1CC=NC=N1	251
 157 | C1C=CN=CC1	250
 158 | C1=NC=CNC1	249
 159 | C1=CC=CCC1	249
 160 | C1=CNNC1	244
 161 | C1=CSN=C1	242
 162 | C1CNCN=C1	230
 163 | C1CNC[NH]1	223
 164 | C1=NC=NC=N1	216
 165 | C1=NOCC1	215
 166 | C1C=NOC1	213
 167 | C1=CCN=C1	208
 168 | C1COCC[NH+]1	203
 169 | C1C=CNC1	202
 170 | C1=CCCCCC1	198
 171 | C1=[NH+]C[NH+]=C1	197
 172 | C1COC1	195
 173 | C1=NN=CC1	193
 174 | C1C[NH2+]CCC1	192
 175 | C1CCCCO1	190
 176 | C1CSCCC1	190
 177 | C1CCCCCCC1	189
 178 | C1=NC=NN=C1	188
 179 | C1CCOC=C1	184
 180 | C1C[NH]C[NH]1	181
 181 | C1=NSN=C1	181
 182 | C1CSCC[NH+]1	181
 183 | C1CCCCNC1	181
 184 | C1CN=CN=C1	178
 185 | C1=NCCNC1	176
 186 | C1CNCCCN1	176
 187 | C1CC[NH+]CCN1	174
 188 | C1=C[NH]CNC1	172
 189 | C1=NCCN=C1	171
 190 | C1=CN=C[NH+]=C1	171
 191 | C1CCSN1	169
 192 | C1C=CCS1	168
 193 | C1COCCNC1	164
 194 | C1=NNN=C1	162
 195 | C1C=CCC1	154
 196 | C1C=C[NH]CC1	153
 197 | C1C[NH]CNC1	152
 198 | C1=C[NH+]=CN=C1	151
 199 | C1COCC[NH2+]1	146
 200 | C1CCC=NN1	145
 201 | C1C=CNC=C1	142
 202 | C1=NN=N[N-]1	141
 203 | C1CSCCS1	140
 204 | C1=CNCN1	139
 205 | C1=NCCS1	139
 206 | C1COCOC1	138
 207 | C1=[N+]CCC1	137
 208 | C1C[NH+]C1	137
 209 | C1=CNCC=N1	136
 210 | C1=CCOC1	135
 211 | C1C=COC=C1	135
 212 | C1CCOCCN1	135
 213 | C1COC=C1	131
 214 | C1=CN=CC1	131
 215 | C1CO1	130
 216 | C1CNN=C1	130
 217 | C1=NNCO1	130
 218 | C1CNSC1	129
 219 | C1=NCNCC1	129
 220 | C1=CCSCC1	123
 221 | C1=CC=[NH+]CC1	123
 222 | C1CSCCO1	121
 223 | C1=C[NH]C[NH]C1	117
 224 | C1CNCCSC1	116
 225 | C1CCN1	116
 226 | C1CCCC[NH+]C1	109
 227 | C1C[NH2+]C1	109
 228 | C1CC=CN1	108
 229 | C1=NSCC1	108
 230 | C1CNCCO1	108
 231 | C1NN=CS1	106
 232 | C1=CC=NN=C1	105
 233 | C1CN=NNC1	104
 234 | C1=[NH+]CCS1	102
 235 | C1CC=CC=[NH+]1	102
 236 | C1CCC[NH2+]CC1	100
 237 | C1=NC=CCC1	100
 238 | P	99
 239 | C1=N[NH]C[NH]1	99
 240 | C1COCCCC1	98
 241 | C1C[NH+]=CN1	97
 242 | C1=CSC[NH]1	97
 243 | C1C=C[NH]C=C1	96
 244 | C1CC[NH]N=C1	94
 245 | C1=NCCN1	92
 246 | C1CCNCCC1	92
 247 | C1COCNC1	92
 248 | C1=CNC=NC1	92
 249 | C1=CN=CCC1	91
 250 | C1CCC[NH2+]1	91
 251 | C1=COCOC1	90
 252 | C1CCCNCCC1	89
 253 | C1=CC[NH2+]CC1	89
 254 | C1=COCCCO1	88
 255 | C1CNNC1	87
 256 | C1C=CCO1	87
 257 | C1=C[NH]C=NC1	87
 258 | C1OCCO1	85
 259 | C1=C[NH]CN1	82
 260 | C1CCSCCN1	82
 261 | C1CCSCN1	82
 262 | C1COC=CN1	82
 263 | C1=NC=NC1	80
 264 | C1C=CNCN1	78
 265 | C1=N[N-]C=N1	76
 266 | C1NCCCO1	76
 267 | C1=CN=[NH+]C=C1	75
 268 | C1CNN=CN1	74
 269 | C1COCCCN1	74
 270 | C1=C[NH]NC1	73
 271 | C1=NOCN1	73
 272 | C1C=CCCN1	72
 273 | C1CNC=C1	72
 274 | C1C=CNN=C1	71
 275 | C1=NSCCN1	71
 276 | C1C=NCNC1	71
 277 | C1=C[NH+]=CNC1	70
 278 | C1C=CCC=C1	70
 279 | C1=CNCN=N1	70
 280 | C1CSCCCN1	70
 281 | C1=CNCCNC1	68
 282 | C1NCCCS1	68
 283 | C1=CSC=[N+]1	68
 284 | C1CC=NCN1	67
 285 | C1CNCC[NH2+]C1	67
 286 | C1=CC[NH]N=C1	67
 287 | C1=CN[NH]C1	65
 288 | C1=CNCCN1	64
 289 | C1NCNN1	64
 290 | C1N=CCN1	63
 291 | C1CC=NN=C1	62
 292 | C1C[NH+]CCCN1	62
 293 | C1=NNCNC1	61
 294 | C1CCNC=N1	59
 295 | C1NNCCS1	59
 296 | C1COC[NH]1	57
 297 | C1CNCC=C1	57
 298 | C1=NN=CNC1	55
 299 | C1=CN[NH+]=C1	55
 300 | C1CNCCCC1	55
 301 | C1CN=CN1	55
 302 | C1=C[NH]CN=C1	53
 303 | C1C=CC=CN1	52
 304 | C1=CSCCC1	52
 305 | C1COCC[NH+]C1	51
 306 | C1C[NH+]=C[NH]1	49
 307 | C1CC[NH]C1	48
 308 | C1NCN=[NH+]1	48
 309 | C1=CCNC1	48
 310 | C1CCOCO1	47
 311 | C1=COCNC1	45
 312 | C1CNCO1	45
 313 | C1=CCCCCCC1	44
 314 | C1CC=CC=N1	44
 315 | C1CNC=CN1	43
 316 | C1=CON=[N+]1	43
 317 | C1CSCS1	43
 318 | C1NN=NN1	42
 319 | C1CCNN1	42
 320 | C1=NCNC1	42
 321 | C1CCOCNC1	41
 322 | C1CC[N+]CC1	40
 323 | C1N=NCN1	40
 324 | C1C=NCS1	39
 325 | C1CC=NN1	39
 326 | C1N=NCS1	39
 327 | C1=CCC1	39
 328 | C1C=NCCC1	39
 329 | C1CCC[NH+]CCC1	39
 330 | C1=CCCNC1	39
 331 | C1=N[NH]N=C1	38
 332 | C1CCNC=C1	37
 333 | C1C[N+]=CN1	36
 334 | C1NCNCN1	36
 335 | C1CCSNC1	36
 336 | C1=C[NH]C[NH]1	36
 337 | C1=NCN=C1	36
 338 | C1C[N+]CCN1	36
 339 | C1C=NCN1	35
 340 | C1CCC=CO1	35
 341 | C1CC=CCN1	34
 342 | C1=CN=NNC1	34
 343 | C1=[N+]CCN1	34
 344 | C1=CONC1	34
 345 | C1C=NSC1	32
 346 | C1C[N-]C=N1	32
 347 | C1CNCCOC1	32
 348 | C1=COCCCC1	32
 349 | C1=CN=[NH+]C1	31
 350 | C1=NN=C[NH]1	31
 351 | C1CSNCN1	31
 352 | C1C[NH+]CCNC1	31
 353 | C1N=CNN1	30
 354 | C1=NC=CCSC1	30
 355 | C1C=NC=CC1	30
 356 | C1=NCSC1	29
 357 | C1CSC[NH]1	29
 358 | C1CCOCN1	28
 359 | C1C[NH+]CCSC1	28
 360 | C1=NCNCN1	28
 361 | C1=NNCSC1	28
 362 | C1C=CN=N1	28
 363 | C1CNCOC1	27
 364 | C1CSCC[NH2+]1	27
 365 | C1CCCC=N1	27
 366 | C1=NCCC1	27
 367 | C1=C[NH+]=CCN1	26
 368 | C1=NNC[NH]1	26
 369 | C1CNCN=N1	26
 370 | C1CC[NH+]CCC1	26
 371 | C1=CNC[NH]1	26
 372 | C1=NCCCN1	26
 373 | C1CCCCCCN1	26
 374 | C1CCC=N1	26
 375 | C1=C[N-]C=N1	26
 376 | C1=C[N+]=CN1	25
 377 | C1CCN=CN1	25
 378 | C1CSCC=CN1	25
 379 | C1C[NH]N=N1	24
 380 | C1CNOC1	24
 381 | C1=CSCCO1	24
 382 | C1C=CSN1	23
 383 | C1=CCSC1	23
 384 | C1C[NH2+]CS1	23
 385 | C1=CCCNCC1	23
 386 | C1CNCC[NH]1	22
 387 | C1CSC=[N+]1	22
 388 | C1=CC=[N+]CC1	22
 389 | C1NNCS1	22
 390 | C1CC=COC1	22
 391 | C1CCON1	21
 392 | C1CCCC=C1	21
 393 | C1NCNCS1	21
 394 | C1CN1	21
 395 | C1=NCNC=N1	21
 396 | C1C[NH]CC[NH]1	21
 397 | C1C=NC=[NH+]C1	21
 398 | C1NC=CS1	21
 399 | C1=[NH+]CCCN1	21
 400 | C1CNC=NN1	21
 401 | C1=NCC[NH]C1	20
 402 | C1C[NH]CN1	20
 403 | C1=NCNN=C1	20
 404 | C1CCCNN1	20
 405 | C1C=CCCCN1	20
 406 | C1=[NH+]CNCN1	20
 407 | C1C=NC=N1	19
 408 | C1CSCSC1	19
 409 | C1=CNCSC1	19
 410 | C1C[NH]NC1	19
 411 | C1=[NH+]CCC1	19
 412 | C1CN=COC1	19
 413 | C1CC[NH]C=C1	19
 414 | C1CNC[NH]C1	19
 415 | C1CNSNC1	18
 416 | C1=CCC=CC1	18
 417 | C1CCC=CC1	18
 418 | C1=C[N+]=CN=C1	18
 419 | C1C=NNC=N1	18
 420 | C1CSN1	17
 421 | C1CC=CC=C1	17
 422 | C1=[NH+]CCCCC1	17
 423 | C1=NCNC=C1	17
 424 | C1COC[NH+]C1	16
 425 | C1CSCO1	16
 426 | C1=CSCCN1	16
 427 | C1C[NH+]=CC=[NH+]1	16
 428 | C1C[NH2+]CCCN1	15
 429 | C1NCCSN1	15
 430 | C1CC=[NH+]N=C1	15
 431 | C1=NN=CSC1	15
 432 | C1=[NH+]CCCC1	15
 433 | C1CC[N+]C1	15
 434 | C1=CN=NC=N1	14
 435 | C1CCN[NH]C1	14
 436 | C1=NC[NH2+]CC1	14
 437 | C1C=CNCCN1	14
 438 | C1N=CCS1	14
 439 | C1CN[NH]C1	14
 440 | C1=COCCNC1	13
 441 | C1=NN=C[NH]C1	13
 442 | C1C[NH+]=CC=C1	13
 443 | C1[N+]=CCS1	13
 444 | C1CCC[N+]C1	13
 445 | C1CSN=N1	13
 446 | C1=C[NH]CC=N1	13
 447 | C1=NC[NH]N=C1	13
 448 | C1CSNCC1	12
 449 | C1=C[N-]C=C1	12
 450 | C1=CNSN=C1	12
 451 | C1=CCC[NH2+]CC1	12
 452 | C1CCCCOC1	12
 453 | C1CSCCCC1	11
 454 | C1COCC[N+]1	11
 455 | C1=CC=COC1	11
 456 | C1C[NH+]CCOC1	11
 457 | C1=N[NH]C[NH]C1	11
 458 | C1=CNOC1	11
 459 | C1COCCC[NH2+]1	11
 460 | C1COC=CC1	11
 461 | C1CC=CCC1	11
 462 | C1=NCCNCC1	11
 463 | C1C=NCC1	11
 464 | C1C=N[NH]C=N1	11
 465 | C1C=C[NH+]=CC1	11
 466 | C1=C[NH]N=N1	11
 467 | C1C=CCCO1	11
 468 | C1C[NH2+]CCNC1	11
 469 | C1CCSC=N1	11
 470 | C1CON=C1	10
 471 | C1=CCSC=C1	10
 472 | C1C=CC=N1	10
 473 | C1CC=CN=N1	10
 474 | C1NN=CCS1	10
 475 | C1=N[NH+]=CSC1	10
 476 | C1=COCN1	10
 477 | C1CCN=CCN1	10
 478 | C1=NCCO1	10
 479 | C1=NCCCC1	10
 480 | C1CN=NC1	10
 481 | C1CN=NCN1	10
 482 | C1=COC[NH]1	10
 483 | C1CNC[NH+]C1	10
 484 | C1=NCNN1	10
 485 | C1CN=C[NH+]=N1	10
 486 | C1=NC=C[NH]C1	10
 487 | C1CCNNC1	10
 488 | C1C[NH+]CC[NH2+]1	9
 489 | C1C=CSCC1	9
 490 | C1CCC=CCN1	9
 491 | C1N=CC=CN1	9
 492 | C1=NCCSC1	9
 493 | C1CN=CCCN1	9
 494 | C1NC=NS1	9
 495 | C1CC[NH2+]CCC1	9
 496 | C1=[N+]CCCS1	9
 497 | C1=C[NH+]CCC1	9
 498 | C1CN=C[N+]C1	9
 499 | C1=NCCCS1	9
 500 | C1=NCCOC1	9
 501 | C1=CC[NH2+]C1	9
 502 | C1=CC=CNC1	9
 503 | C1=CNSCC1	9
 504 | C1CCONC1	9
 505 | C1=CSCNC1	9
 506 | C1=CC=[O+]C=C1	9
 507 | C1=[N+]CCCCC1	9
 508 | C1N=[NH+]CS1	9
 509 | C1COPOC1	9
 510 | C1=CC[N+]CC1	9
 511 | C1=[N+]CCCN1	9
 512 | C1CC=NC=C1	9
 513 | C1C[NH+]CN1	8
 514 | C1=COCCN1	8
 515 | C1=NC=NCN1	8
 516 | C1CSNC=C1	8
 517 | C1=CC=CCC=C1	8
 518 | C1CSC=CN1	8
 519 | C1=NCN=N1	8
 520 | C1=CN=NCC1	8
 521 | C1C=CON1	8
 522 | C1CCCCCCCCCCC1	8
 523 | C1CONC1	8
 524 | C1CN=NN1	8
 525 | C1CCCCS1	8
 526 | C1=CCC=C1	8
 527 | C1C=CN=CN1	8
 528 | C1=COCC[NH+]C1	8
 529 | C1CCOCCCN1	8
 530 | C1CNCCS1	8
 531 | C1CCNO1	8
 532 | C1CC[NH+]CN1	8
 533 | C1CNCC=N1	7
 534 | C1CC=CO1	7
 535 | C1=CSNCC1	7
 536 | C1=C[NH]CN=N1	7
 537 | C1=[N+]CC[N+]=C1	7
 538 | C1CCC=C1	7
 539 | C1NCON1	7
 540 | C1=C[NH][NH+]=C1	7
 541 | C1C[N+]CC[N+]1	7
 542 | C1C=C[N+]=CC1	7
 543 | C1=CCNNC1	7
 544 | C1N=NN=N1	7
 545 | C1SCCS1	7
 546 | C1N=NNN1	7
 547 | C1=NO[N+]=C1	7
 548 | C1[NH+]=CC=[NH+]1	7
 549 | C1CCCSCC1	7
 550 | C1=[N+]CNC1	7
 551 | C1=COC[NH+]C1	7
 552 | C1C[N+]CCC1	7
 553 | C1=CNC[NH+]C1	6
 554 | C1=CSNC1	6
 555 | C1C[NH+]CCO1	6
 556 | C1CCNC=CN1	6
 557 | C1=[NH+]CON1	6
 558 | C1CCCNCN1	6
 559 | C1NC[N+]CN1	6
 560 | C1CNCC[NH2+]1	6
 561 | C1C=NNCN1	6
 562 | C1=CCN[NH]C1	6
 563 | C1C=CC=C1	6
 564 | C1=[N+]CCCC1	6
 565 | C1=NCCOCC1	6
 566 | C1C[NH2+]CCO1	6
 567 | C1CCCC[NH2+]C1	6
 568 | C1NC=NN1	6
 569 | C1=NC[NH]C=N1	6
 570 | C1CSSC1	6
 571 | C1=NC=CCS1	6
 572 | C1=CN=COC1	6
 573 | C1CC[NH2+]CCN1	6
 574 | C1=C[N+]CCN1	6
 575 | C1=CCC[NH+]CC1	6
 576 | C1=[NH+]CCCS1	6
 577 | C1CCCNC=N1	6
 578 | C1C[NH+]=CC=N1	6
 579 | C1CC=CNC1	5
 580 | C1=CCCOC=C1	5
 581 | C1=[NH+]CNN1	5
 582 | C1=C[N+]=CC=[N+]1	5
 583 | C1C[N+]CC1	5
 584 | C1=CC[NH+]C1	5
 585 | C1N=NCO1	5
 586 | C1=C[NH+]=NC1	5
 587 | C1=C[NH][NH]C1	5
 588 | C1CCCCNCC1	5
 589 | C1CN=NC=N1	5
 590 | C1=NNCCC1	5
 591 | C1C[NH+]CCCC1	5
 592 | C1CC[NH+]1	5
 593 | C1C=NC=C1	5
 594 | C1=C[NH+]=NN1	5
 595 | C1=CCOCCC1	5
 596 | C1CNC=CC1	5
 597 | C1C=[N+]CCN1	5
 598 | C1CC[NH]CC1	5
 599 | C1C[NH]C[NH]C1	5
 600 | C1C=CN=C1	5
 601 | C1CC=CC1	5
 602 | C1CC=[NH+]C=N1	5
 603 | C1=NC=[NH+]CN1	5
 604 | C1CNCC[NH+]1	5
 605 | C1C=CSCN1	5
 606 | C1=N[N-]N=C1	5
 607 | C1N=CCCN1	5
 608 | C1CCN=CC1	5
 609 | C1=N[NH]C=[NH+]1	5
 610 | C1=CSC=CC1	5
 611 | C1=CNNCC1	4
 612 | C1=CNCCCN1	4
 613 | C1=NCOC1	4
 614 | C1CCC=CCC1	4
 615 | C1C[NH]N=CN1	4
 616 | C1NNC[NH+]1	4
 617 | C1C[NH+]1	4
 618 | C1=CNCCO1	4
 619 | C1CSCCSC1	4
 620 | C1C[NH2+]CCS1	4
 621 | C1=CCCSC1	4
 622 | C1CSCC[NH2+]C1	4
 623 | C1=NNSCC1	4
 624 | C1=CNCCN=C1	4
 625 | C1=CN=C[N+]=C1	4
 626 | C1C=NCN=C1	4
 627 | C1=NCC[N+]1	4
 628 | C1C=CNC=[N+]1	4
 629 | C1C=CCNCC1	4
 630 | C1C[NH]CC=N1	4
 631 | C1CCCCCCCCC1	4
 632 | C1CCC[NH+]=N1	4
 633 | C1NNCO1	4
 634 | C1[N-]C=NN1	4
 635 | C1C[NH2+]CCSC1	4
 636 | C1=NC=CCN1	4
 637 | C1=[NH+]CN=N1	4
 638 | C1=CSCO1	4
 639 | C1CCCOCC1	4
 640 | C1CCOCCC1	4
 641 | C1=NCCSCC1	4
 642 | C1NCCNN1	4
 643 | C1=CC[NH+]CCC1	4
 644 | C1=CCC[NH+]C1	4
 645 | C1CN=CC=CN1	4
 646 | C1CC=CCCN1	4
 647 | C1CN=CN=N1	4
 648 | C1NCC=CO1	4
 649 | C1=CC=[NH+]N=C1	4
 650 | C1=C[NH+]=C[NH]C1	4
 651 | C1=NCCCCC1	4
 652 | C1CNCCC[NH2+]C1	4
 653 | C1=NCCSN1	4
 654 | C1=C[N-]CC1	4
 655 | C1=NN[N+]=C1	4
 656 | C1=NCCC=NN1	4
 657 | C1COCC[NH2+]C1	4
 658 | C1CNSCC1	4
 659 | C1C[NH+]CO1	3
 660 | C1=N[NH]CNC1	3
 661 | C1COCCOCCOCCOCCO1	3
 662 | C1CCCSN1	3
 663 | C1=NN=NC1	3
 664 | C1CN=CCC1	3
 665 | C1CSC1	3
 666 | C1CNS[NH+]C1	3
 667 | C1NC[NH+]CN1	3
 668 | C1=NOC[N-]1	3
 669 | C1=NNCCS1	3
 670 | C1CNN=C[NH]1	3
 671 | C1CN=[NH+]C=N1	3
 672 | C1C=CC1	3
 673 | C1=COCCC[NH+]C1	3
 674 | C1C=CCNN1	3
 675 | C1=[NH+]CCCCN1	3
 676 | C1=[NH+]CCO1	3
 677 | C1C[NH+]=CSC1	3
 678 | C1=NC=NNC1	3
 679 | C1CN=C[N+]=C1	3
 680 | C1=CCSN1	3
 681 | C1=CSC=[NH+]1	3
 682 | C1=CCCCOC1	3
 683 | C1=CN=C[N-]C1	3
 684 | C1C=CC[NH+]C1	3
 685 | C1=NSCC[N-]1	3
 686 | C1=NNC[NH]C1	3
 687 | C1CC=[N+]CC1	3
 688 | C1C=CNN1	3
 689 | C1=CO[NH]C1	3
 690 | C1=NC=NCC1	3
 691 | C1C=NC[NH]C1	3
 692 | C1=NC[NH+]CN1	3
 693 | C1CC=NCC1	3
 694 | C1COC[NH2+]1	3
 695 | C1C[NH+]CC[NH+]1	3
 696 | C1COCCOCCOCCN1	3
 697 | C1CS1	3
 698 | C1=COCCCNC1	3
 699 | C1=NN=CN=N1	3
 700 | C1C=CNNC1	3
 701 | C1=NC[NH+]C1	3
 702 | C1=CCNCCC1	3
 703 | C1C=CCNC1	3
 704 | C1CN=C[NH+]=C1	3
 705 | C1N=C[NH]CN1	3
 706 | C1NNCNN1	3
 707 | C1C=NCCN1	3
 708 | C1C=CC=[NH+]1	3
 709 | C1CS[N-]CN1	3
 710 | C1=NCN=CO1	3
 711 | C1=[NH+]CCC=C1	3
 712 | C1=CNC=[N+]C1	3
 713 | C1CNSN1	3
 714 | C1=NNNN1	3
 715 | C1CCNCO1	3
 716 | C1CCCC=CN1	3
 717 | C1CSCNN1	3
 718 | C1=CNCCCC1	3
 719 | C1COC[NH+]1	3
 720 | C1=NNCNN1	3
 721 | C1=NSC=CN1	2
 722 | C1=CNCC[NH2+]C1	2
 723 | C1C[NH2+]CN1	2
 724 | C1CCSS1	2
 725 | C1C=CCCCC1	2
 726 | C1=[N+]CCNC1	2
 727 | C1CCCC[NH+]1	2
 728 | C1=N[NH]CS1	2
 729 | C1CNCC=CN1	2
 730 | C1N=CNCN1	2
 731 | C1=NSSC1	2
 732 | C1CNNCN1	2
 733 | C1=CCCOC1	2
 734 | C1CCCCCCCC1	2
 735 | C1COCCOCCOCCO1	2
 736 | C1CCCCC=C1	2
 737 | C1CCCNCCCN1	2
 738 | C1=NCCOC=C1	2
 739 | C1=CSSC1	2
 740 | C1CCOC=NN1	2
 741 | C1CCS1	2
 742 | C1C=CCOC1	2
 743 | C1=CNCCSC1	2
 744 | C1=NC=[NH+]C=N1	2
 745 | C1NN1	2
 746 | C1=NCCC=[NH+]1	2
 747 | C1CCC=[NH+]1	2
 748 | C1C[N+]=CC=N1	2
 749 | C1NCSS1	2
 750 | C1=CN=CSC1	2
 751 | C1SCSCS1	2
 752 | C1C=NNP1	2
 753 | C1C=COCO1	2
 754 | C1=CNNN1	2
 755 | C1CN=CCSC1	2
 756 | C1=CNCCC=C1	2
 757 | C1CC[NH][NH]C1	2
 758 | C1C=NN=C1	2
 759 | C1=C/CCCCCC/1	2
 760 | C1CC[NH+]CSC1	2
 761 | C1CNNC=N1	2
 762 | C1COCC[NH+]CCOCCOCC[NH+]CCO1	2
 763 | C1NC=CC=NN1	2
 764 | C1CCCC[NH+]CC1	2
 765 | C1CC=[N+]C=C1	2
 766 | C1=[NH+]CCSC[CH-]1	2
 767 | C1C[NH]COC1	2
 768 | C1=N[N-]N=N1	2
 769 | C1CC[N-]C=N1	2
 770 | C1N=C[NH+]=C[NH]1	2
 771 | C1CCN=N1	2
 772 | C1=CNC=[NH+]C1	2
 773 | C1NC=[NH+]CN1	2
 774 | C1NCCC=[NH+]1	2
 775 | C1=CN=N[NH]C1	2
 776 | C1N=N1	2
 777 | C1N=CS1	2
 778 | C1=CN=C[N+]C1	2
 779 | C1N[NH]CS1	2
 780 | C1COCCC[NH+]1	2
 781 | C1=[N+]CCS1	2
 782 | C1=NCCCSC1	2
 783 | C1C=NCO1	2
 784 | C1=CS[N+]=C1	2
 785 | C1=NCCC=C1	2
 786 | C1=CSOC1	2
 787 | C1=CC=[NH+]CN1	2
 788 | C1NCC[NH+]1	2
 789 | C1COCON1	2
 790 | C1=NN[N-]N1	2
 791 | C1CSCCNC1	2
 792 | C1CNSCCN1	2
 793 | C1=CN=NN=C1	2
 794 | C1=CCC[NH2+]C1	2
 795 | C1C=CC=CC=C1	2
 796 | C1COCS1	2
 797 | C1COCSN1	2
 798 | C1C[NH+]CS1	2
 799 | C1CCCC=[NH+]1	2
 800 | C1CSC=CCN1	2
 801 | C1=NC=C[N+]=C1	2
 802 | C1=CNPN=C1	2
 803 | C1CN[NH2+]N1	2
 804 | C1=[N+]NCC1	2
 805 | C1CCN[N+]CC1	2
 806 | C1CSCOC1	2
 807 | C1C[NH2+]CSC1	2
 808 | C1COPO1	2
 809 | C1=[NH+]CCNC1	2
 810 | C1CNNCC1	2
 811 | C1N=CN=CN1	2
 812 | C1CCOCOC1	2
 813 | C1CCCCCO1	2
 814 | C1N=[NH+]CO1	2
 815 | C1NCC[N+]1	2
 816 | C1=CC=CC1	2
 817 | C1C=CCN=C1	2
 818 | C1=NOCCN1	2
 819 | C1=CC[N+]C=C1	2
 820 | C1C=NCCN=C1	2
 821 | C1=CC1	2
 822 | C1CCOCC[NH2+]C1	2
 823 | C1CC[N+]=[N+]CC1	2
 824 | C1CSC=C1	2
 825 | C1COCPCO1	2
 826 | C1NOSN1	2
 827 | C1C[NH+]C[NH+]1	1
 828 | C1=CSNCN1	1
 829 | C1=NCCC=[N+]1	1
 830 | C1C=NC[NH+]C1	1
 831 | C1C=COCCO1	1
 832 | C1CNC[NH2+]C1	1
 833 | C1CSCNCCC1	1
 834 | C1CC=CCCO1	1
 835 | C1C[NH+]COC1	1
 836 | C1[NH+]CCO1	1
 837 | C1=COC=CN1	1
 838 | C1NCCCCCN1	1
 839 | C1C=CSNC1	1
 840 | C1C=C[NH+]C=C1	1
 841 | C1=NCN=C[N+]1	1
 842 | C1CSCCOC1	1
 843 | C1=C[N+]=CNC1	1
 844 | C1=CCC=CCC1	1
 845 | C1=CC=NCN1	1
 846 | C1=CC=CCCC1	1
 847 | C1=CCCCC/C=C/1	1
 848 | C1=[NH+]NCN1	1
 849 | C1CC=CS1	1
 850 | C1=CC=COC=C1	1
 851 | C1=NNCCP1	1
 852 | C1=NC=CCNC1	1
 853 | C1=NC=CSC1	1
 854 | C1COCCOCCOCCOCCOCCOCCOCCO1	1
 855 | C1NCNO1	1
 856 | C1C[N+]CN1	1
 857 | C1CCCN[P+]N1	1
 858 | C1CCC/C=C/C=CCOCCCC1	1
 859 | C1N=C[N-]N1	1
 860 | C1C=CNCN=C1	1
 861 | C1CC[S+]C1	1
 862 | C1=CSC=[N+]C1	1
 863 | C1=CC=[NH+]C=NN1	1
 864 | C1C=[NH+]C=N1	1
 865 | C1NCN1	1
 866 | C1OCCCO1	1
 867 | C1C[NH+]CCC[NH2+]C1	1
 868 | C1SC=CS1	1
 869 | C1C=CNO1	1
 870 | C1C=CC=NN1	1
 871 | C1CC=CCC=C1	1
 872 | C1CCC[NH]CC1	1
 873 | C1=CNCNN1	1
 874 | C1C[N+]=CC=C1	1
 875 | C1C=[NH+]CC1	1
 876 | C1CCNCCCCC1	1
 877 | C1N=CSS1	1
 878 | C1N=N[NH]N1	1
 879 | C1CSCNCCN1	1
 880 | C1=CNCCS1	1
 881 | C1=CCCSCC1	1
 882 | C1CNC[NH+]CN1	1
 883 | C1C=CNSC1	1
 884 | C1C[N+]=CC=[N+]1	1
 885 | C1=C\CCCC/C=C/CC/1	1
 886 | C1CCCC[N+]C1	1
 887 | C1CCNCCOC1	1
 888 | C1=NSCN1	1
 889 | C1=CC=C[NH]C1	1
 890 | C1[NH]C=[NH+]CN1	1
 891 | C1C=[N+]CC1	1
 892 | C1COCCOCC[NH+]CCOCCOCC[NH+]1	1
 893 | C1=NCSN1	1
 894 | C1=[NH+]CCC[NH2+]C1	1
 895 | C1C=[NH+]C=C[NH]1	1
 896 | C1=CC=CCCC=CC=CCC1	1
 897 | C1C=CCCCC=CC=CCC1	1
 898 | C1C=CCCCCC=CCCC1	1
 899 | C1=CC=CCCCC=CCCC1	1
 900 | C1=NNC=[NH+]1	1
 901 | C1=C/CCCCCC\C=C/CC\1	1
 902 | C1=NCN=CN1	1
 903 | C1CCSCCC1	1
 904 | C1=CNCC[NH]1	1
 905 | C1C[NH+]=CN=N1	1
 906 | C1CCPN1	1
 907 | C1N=CC=N1	1
 908 | C1C=CNSN1	1
 909 | C1[N+]CCO1	1
 910 | C1COCC=C1	1
 911 | C1=NCC=C[N-]1	1
 912 | C1CCC[N+]1	1
 913 | C1=C[NH2+]C=CC1	1
 914 | C1CCC=CC=C1	1
 915 | C1C=CCC=N1	1
 916 | C1C=NNS1	1
 917 | C1=CNN=CN1	1
 918 | C1=C[NH]CSC1	1
 919 | C1CNCCNN1	1
 920 | C1CSOC1	1
 921 | C1C/C=C\C=CCN1	1
 922 | C1CN=CC1	1
 923 | C1=COCCOCCOCCOCCO1	1
 924 | C1=CC=NC1	1
 925 | C1CCCSCNC1	1
 926 | C1=NN=C[N-]1	1
 927 | C1COCCOCCOCCSCCOCCO1	1
 928 | C1=CN[N+]=C1	1
 929 | C1CC\C=C/CCC1	1
 930 | C1CN[N+]=C1	1
 931 | C1CNC[NH+]1	1
 932 | C1CCCCSC1	1
 933 | C1C=[NH+]C=NC1	1
 934 | C1C[NH+]CCNN1	1
 935 | C1=C[N+]=C[NH]1	1
 936 | C1=CC=[NH+]CCN1	1
 937 | C1C=N[N+]=C1	1
 938 | C1COC[N-]1	1
 939 | C1=C[NH+]CC1	1
 940 | C1=CNC[NH2+]C1	1
 941 | C1=C[S+]=CS1	1
 942 | C1=NNCSCC1	1
 943 | C1=C[N-]CN=C1	1
 944 | C1=NN=N[NH]1	1
 945 | C1CCCCNCCC1	1
 946 | C1=C[NH2+]CCN=C1	1
 947 | C1=C[NH2+]NC1	1
 948 | C1CC[N+]N1	1
 949 | C1=C/CCCC/C=C/CC/1	1
 950 | C1C=NCC=C1	1
 951 | C1=NCC=C1	1
 952 | C1=CSC=CS1	1
 953 | C1C[NH2+]CCOC1	1
 954 | C1NNCCNN1	1
 955 | C1CN=CO1	1
 956 | C1=C[NH2+]CCC1	1
 957 | C1=COC[N+]1	1
 958 | C1=CSCC=C1	1
 959 | C1CN=[NH+]C1	1
 960 | C1CC=CCCC1	1
 961 | C1NN=NS1	1
 962 | C1=N\CC/N=C\CC/1	1
 963 | C1NCCC=[N+]1	1
 964 | C1=CCC=CC=C1	1
 965 | C1CCSCC[NH2+]1	1
 966 | C1CC[N+]C=N1	1
 967 | C1NCC=[N+]1	1
 968 | C1C[NH]C=C[NH]1	1
 969 | C1CC[NH+]=N1	1
 970 | C1=NC=[O+]C=N1	1
 971 | C1=NCC=CN1	1
 972 | C1CN=CSC1	1
 973 | C1NNC=[NH+]1	1
 974 | C1=C[NH+]=CSC1	1
 975 | C1C=CNC=CN1	1
 976 | C1C[NH+]=NC1	1
 977 | C1CSSCCSS1	1
 978 | C1CCCOCCCCO1	1
 979 | C1=CNN=CC1	1
 980 | C1C=NSN=C1	1
 981 | C1C[NH+]CC[NH+]C1	1
 982 | C1=CC[NH]C1	1
 983 | C1C=NCCCN1	1
 984 | C1CNC=NC1	1
 985 | C1CSCCSCCS1	1
 986 | C1C=NNCS1	1
 987 | C1NC=CC=[NH+]1	1
 988 | C1C=CC[N+]CC1	1
 989 | C1CSC=NN1	1
 990 | C1=[NH+]NCCN1	1
 991 | C1=CN=P[NH+]=C1	1
 992 | C1COPN1	1
 993 | C1=NNSC1	1
 994 | C1C[NH+]=NCN1	1
 995 | C1N=CSN1	1
 996 | C1=NNC=[N+]1	1
 997 | C1CCNC=NN1	1
 998 | C1CSNS1	1
 999 | C1COSO1	1
1000 | C1=CC=CCSCC=NCCSC1	1
1001 | C1C=CCCSCC=NCCSC1	1
1002 | N1NNN1	1
1003 | C1CSSCCNN1	1
1004 | C1=C[NH]COC1	1
1005 | C1=CN=SCC1	1
1006 | C1CCC=NCC1	1
1007 | C1=CC[NH2+]CCC1	1
1008 | C1CCNNCC1	1
1009 | C1=CCNC=CC1	1
1010 | C1=CCCOCC1	1
1011 | C1=[NH+]CCNCC1	1
1012 | C1CN[PH]O1	1
1013 | C1CO[PH]O1	1
1014 | C1C=[N+]CCCC1	1
1015 | C1CCOCCCCCCOC1	1
1016 | C1\C=C/CNCCC1	1
1017 | C1=CSCNN1	1
1018 | C1=NNC[NH2+]C1	1
1019 | C1CC[NH]NC1	1
1020 | C1C=CC=NC1	1
1021 | C1CSNN1	1
1022 | C1=N[NH+]=CNC1	1
1023 | C1=NC=CC1	1
1024 | C1=NCSCC1	1
1025 | C1C[NH+]CC[N+]1	1
1026 | C1=[N+]C[N+]=C1	1
1027 | C1CSCN[NH]1	1
1028 | C1COC=CCN1	1
1029 | C1=C[N+]=CC=N1	1
1030 | C1=NCCCCO1	1
1031 | C1=CNC[NH+]=C1	1
1032 | C1COCC[NH2+]CCOCCNCCCCCN1	1
1033 | C1CC[N+]NC1	1
1034 | C1=CNCC[NH+]=C1	1
1035 | C1CSCCSCCCSCCSC1	1
1036 | C1NN[NH2+]N1	1
1037 | C1=NNCN=N1	1
1038 | C1NC[NH+]CS1	1
1039 | C1=[NH+]C=CCN1	1
1040 | C1CCOPN1	1
1041 | C1=CSC[N-]1	1
1042 | C1CCC=CS1	1
1043 | C1C=[N+]C=CN1	1
1044 | C1=CC=COCC1	1
1045 | C1OCCS1	1
1046 | C1C=CC[N+]1	1
1047 | C1=NC[NH+]=C1	1
1048 | C1=NC=N[N+]=C1	1
1049 | C1CNC[N+]=C1	1
1050 | C1=NCC[N+]=C1	1
1051 | C1=NCCCO1	1
1052 | C1CNC=[NH+]1	1
1053 | C1C[NH+]SN1	1
1054 | C1NCOCN1	1
1055 | C1CC=CC=CN1	1
1056 | C1CC[N+]C=CN1	1
1057 | C1=NCCN=CC1	1
1058 | C1COCNN1	1
1059 | C1CNCNN1	1
1060 | C1=CN=[N+]C=C1	1
1061 | C1CCC=C[N-]1	1
1062 | C1CN=NC=C1	1
1063 | C1=[N+]C=[NH+]CC1	1
1064 | C1CNC[N+]C1	1
1065 | C1=C[N+]=NC=N1	1
1066 | C1=[NH+]C[NH][NH]1	1
1067 | C1C=CC=CCN1	1
1068 | C1COCCOCCOCC[NH+]1	1
1069 | C1=NC=[NH+]C1	1
1070 | C1=C[NH+]=COC1	1
1071 | C1C[NH2+]CC[NH2+]1	1
1072 | C1=CCCSCCC1	1
1073 | C1NOCNO1	1
1074 | C1=C\CNC/C=C\CNC/1	1
1075 | C1COCCOCC[NH2+]CCOCCOCCN1	1
1076 | C1=C[N+]CN=C1	1
1077 | C1=NCOCS1	1
1078 | C1CNCC[N-]1	1
1079 | C1=NCC=[NH+]C1	1
1080 | C1C=COCCN1	1
1081 | C1COCCOCCOCCOCC[NH2+]CCO1	1
1082 | C1CCC=NC=N1	1
1083 | C1C[N-]SCCSO1	1
1084 | C1C=CNC=N1	1
1085 | C1=C[N+]=CCC1	1
1086 | C1CCNCCCN1	1
1087 | C1CC=NNCN1	1
1088 | C1NNN=[NH+]1	1
1089 | C1=NC=CC[N-]1	1
1090 | C1C[O+]=CC=N1	1
1091 | C1[N-]CCS1	1
1092 | C1COC=[NH+]1	1
1093 | C1=CC=NCC[NH2+]1	1
1094 | C1=C[NH2+]CCCN1	1
1095 | C1C=NCC[NH]C1	1
1096 | C1=NCCP1	1
1097 | C1CC[NH+]NC1	1
1098 | C1N[NH+]1	1
1099 | C1NNC=CS1	1
1100 | C1=CC[NH+]=C1	1
1101 | C1=CCCC=CC1	1
1102 | C1C=NNCC1	1
1103 | C1C[N-]NC1	1
1104 | C1CC/C=C/CCCCCCCCO1	1
1105 | C1NCN[NH]1	1
1106 | C1CCCCOCCCCCC1	1
1107 | C1N=[NH+]CN1	1
1108 | C1CN=CC=C1	1
1109 | C1=NC=CPN1	1
1110 | C1=CCC\C=C/CCCC1	1
1111 | C1=NCSS1	1
1112 | C1COCCOCCOCCOCCOCCO1	1
1113 | C1=CSC1	1
1114 | C1NSC=[N+]1	1
1115 | C1SCCCS1	1
1116 | C1=NN=COC1	1
1117 | C1=NCC=CO1	1
1118 | C1C[N-]CC1	1
1119 | C1[NH]CN[NH]1	1
1120 | C1C=C[NH+]=N1	1
1121 | C1C[NH+]CCNCC[NH+]CCNCC[NH+]CCN1	1
1122 | C1N=CCC=N1	1
1123 | C1C=CCNCN1	1
1124 | C1=C[N-]N=C1	1
1125 | C1CCC[NH+]CCC=CC1	1
1126 | C1C[NH]C=[N+]C1	1
1127 | C1=CNPC=C1	1
1128 | C1=NCNPN1	1
1129 | C1=CSC=NC1	1
1130 | C1C[NH+]=CCCN1	1
1131 | C1=CSCC=[NH+]1	1
1132 | C1COCCOCCOCCOCCN1	1
1133 | C1CCNS1	1
1134 | C1=CC[N+]C1	1
1135 | C1C[NH2+]CC[NH+]1	1
1136 | C1=CCCC[NH2+]C1	1
1137 | C1CSNC1	1
1138 | C1=C[NH+]CCN1	1
1139 | C1CN=[NH+]CC1	1
1140 | C1=CC=[O+]C1	1
1141 | C1N=NN[N+]1	1
1142 | C1CCOPO1	1
1143 | C1=NCCNN1	1
1144 | C1COCOO1	1
1145 | C1CCOOCC1	1
1146 | C1CCC[N+]C=N1	1
1147 | C1C[NH+]=COC1	1
1148 | C1NC=CSCCO1	1
1149 | 


--------------------------------------------------------------------------------
/src/chemutils.py:
--------------------------------------------------------------------------------
   1 | import rdkit
   2 | from rdkit import Chem, DataStructs
   3 | from rdkit.Chem import AllChem
   4 | from rdkit.Chem import Draw
   5 | from functools import reduce 
   6 | from tqdm import tqdm 
   7 | from copy import deepcopy 
   8 | import numpy as np 
   9 | import torch 
  10 | from torch.autograd import Variable
  11 | torch.manual_seed(4) 
  12 | np.random.seed(1)
  13 | import random 
  14 | random.seed(1)
  15 | 
'''
    1. vocabulary: find frequent words (atom and ring)
    2. graph2tree
    3. generate smiles set
    4. chemical utility
        Tanimoto similarity
        canonicalize smiles
        is valid
    5. score modifier
        logp_modifier [-inf, inf] -> [0,1]

        qed_logp_jnk_gsk_fusion
            qed, logp, jnk, gsk  -> [0,1]


'''
  32 | def sigmoid(float_x):
  33 |     return 1.0 / (1 + np.exp(-float_x))
  34 | 
  35 | from scipy.stats import gmean
  36 | 
  37 | def logp_modifier(logp_score):
  38 |     return max(0.0,min(1.0,1/14*(logp_score+10))) 
  39 | '''
  40 | [-inf, inf] -> [0,1]
  41 | '''
  42 | 
  43 | def docking_modifier(docking_score):
  44 |     '''
  45 |         [-12,-4]  -> [0,1]
  46 |         -12  ----->  1
  47 |         -4   ----->  0 
  48 |     '''
  49 |     docking_score = 1/(12-4)*(-docking_score - 4)
  50 |     docking_score = max(docking_score, 0.0)
  51 |     docking_score = min(docking_score, 1.0) 
  52 |     return docking_score 
  53 | 
  54 | def qed_logp_fusion(qed_score, logp_score, jnk_score, gsk_score):
  55 |     logp_score = logp_modifier(logp_score)
  56 |     gmean_score = gmean([qed_score, logp_score])
  57 |     modified_score = min(1.0,gmean_score)
  58 |     return modified_score
  59 | 
  60 | def logp_jnk_gsk_fusion(logp_score, jnk_score, gsk_score):
  61 |     logp_score = logp_modifier(logp_score)
  62 |     return np.mean([logp_score, jnk_score, gsk_score])
  63 | 
  64 | 
  65 | def qed_logp_jnk_gsk_fusion(qed_score, logp_score, jnk_score, gsk_score):
  66 |     logp_score = logp_modifier(logp_score)
  67 |     gmean_score = gmean([qed_score, logp_score, jnk_score, gsk_score])
  68 |     modified_score = min(1.0,gmean_score)
  69 |     return modified_score
  70 | 
  71 | def qed_logp_jnk_gsk_fusion2(qed_score, logp_score, jnk_score, gsk_score):
  72 |     logp_score = logp_modifier(logp_score)
  73 |     return  np.mean([qed_score, logp_score, jnk_score, gsk_score])
  74 | 
  75 | def qed_logp_fusion(qed_score, logp_score):
  76 |     logp_score = logp_modifier(logp_score)
  77 |     gmean_score = gmean([qed_score, logp_score])
  78 |     modified_score = min(1.0, gmean_score)
  79 |     return modified_score 
  80 | 
  81 | def jnk_gsk_fusion(jnk_score, gsk_score):
  82 |     gmean_score = gmean([jnk_score, gsk_score])
  83 |     modified_score = min(1.0,gmean_score)
  84 |     return modified_score
  85 | 
  86 | 
  87 | def load_vocabulary():
  88 | 	datafile = "data/vocabulary.txt"
  89 | 	with open(datafile, 'r') as fin:
  90 | 		lines = fin.readlines()
  91 | 	vocabulary = [line.split()[0] for line in lines]
  92 | 	return vocabulary 
  93 | 
# Module-level vocabulary: list of substructure strings read from the
# vocabulary file by load_vocabulary() when this module is imported.
vocabulary = load_vocabulary()
# The two RDKit bond types (single, double) collected in one list.
bondtype_list = [rdkit.Chem.rdchem.BondType.SINGLE, rdkit.Chem.rdchem.BondType.DOUBLE]
  96 | 
  97 | 
  98 | def ith_substructure_is_atom(i):
  99 |     substructure = vocabulary[i]
 100 |     return True if len(substructure)==1 else False
 101 | 
 102 | def word2idx(word):
 103 |     return vocabulary.index(word)
 104 | 
 105 | 
 106 | # def smiles2fingerprint(smiles):
 107 | #     mol = Chem.MolFromSmiles(smiles)
 108 | #     fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048, useChirality=False)
 109 | #     return np.array(fp)
 110 | #     ### shape: (2048,)
 111 | 
 112 | def smiles2fingerprint(smiles):
 113 |     mol = Chem.MolFromSmiles(smiles)
 114 |     fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024, useChirality=False)
 115 |     return np.array(fp)
 116 |     ### shape: (1024,)
 117 | 
 118 | 
 119 | ## similarity of two SMILES 
 120 | def similarity(a, b):
 121 |     if a is None or b is None: 
 122 |         return 0.0
 123 |     amol = Chem.MolFromSmiles(a)
 124 |     bmol = Chem.MolFromSmiles(b)
 125 |     if amol is None or bmol is None:
 126 |         return 0.0
 127 |     fp1 = AllChem.GetMorganFingerprintAsBitVect(amol, 2, nBits=2048, useChirality=False)
 128 |     fp2 = AllChem.GetMorganFingerprintAsBitVect(bmol, 2, nBits=2048, useChirality=False)
 129 |     return DataStructs.TanimotoSimilarity(fp1, fp2) 
 130 | 
 131 | 
 132 | def similarity_matrix(smiles_lst):
 133 |     n = len(smiles_lst)
 134 |     sim_matrix = np.eye(n)
 135 |     mol_lst = [Chem.MolFromSmiles(smiles) for smiles in smiles_lst]
 136 |     fingerprint_lst = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048, useChirality=False) for mol in mol_lst]
 137 |     for i in range(n):
 138 |         fp1 = fingerprint_lst[i]
 139 |         for j in range(i+1,n):
 140 |             fp2 = fingerprint_lst[j]
 141 |             sim = DataStructs.TanimotoSimilarity(fp1, fp2)
 142 |             sim_matrix[i,j] = sim_matrix[j,i] = sim
 143 |     return sim_matrix 
 144 | 
 145 | 
 146 | def canonical(smiles):
 147 |     try:
 148 |         mol = Chem.MolFromSmiles(smiles)
 149 |     except:
 150 |         return None 
 151 |     if mol is not None:
 152 |         return Chem.MolToSmiles(mol, isomericSmiles=True) ### todo double check
 153 |     else:
 154 |         return None
 155 | 
 156 | 
 157 | def smiles2mol(smiles):
 158 |     try:
 159 |         mol = Chem.MolFromSmiles(smiles)
 160 |     except:
 161 |         return None 
 162 |     if mol is None: 
 163 |         return None
 164 |     Chem.Kekulize(mol)
 165 |     return mol 
 166 | 
 167 | ## input: smiles, output: word lst;  
 168 | def smiles2word(smiles):
 169 |     mol = smiles2mol(smiles)
 170 |     if mol is None:
 171 |         return None 
 172 |     word_lst = []
 173 | 
 174 |     cliques = [list(x) for x in Chem.GetSymmSSSR(mol)]
 175 |     cliques_smiles = []
 176 |     for clique in cliques:
 177 |         clique_smiles = Chem.MolFragmentToSmiles(mol, clique, kekuleSmiles=True)
 178 |         cliques_smiles.append(clique_smiles)
 179 |     atom_not_in_rings_list = [atom.GetSymbol() for atom in mol.GetAtoms() if not atom.IsInRing()]
 180 |     return cliques_smiles + atom_not_in_rings_list 
 181 | 
 182 | ## is_valid_smiles 
 183 | def is_valid(smiles):
 184 |     word_lst = smiles2word(smiles)
 185 |     word_set = set(word_lst)
 186 |     return word_set.issubset(vocabulary)     
 187 | 
 188 | 
 189 | def is_valid_mol(mol):
 190 |     try:
 191 |         smiles = Chem.MolToSmiles(mol)
 192 |     except:
 193 |         return False 
 194 |     if smiles.strip() == '':
 195 |         return False 
 196 |     mol = Chem.MolFromSmiles(smiles)
 197 |     if mol is None or mol.GetNumAtoms() == 0:
 198 |         return False 
 199 |     return True 
 200 | 
 201 | def substr_num(smiles):
 202 |     mol = smiles2mol(smiles)
 203 |     clique_lst = [list(x) for x in Chem.GetSymmSSSR(mol)]
 204 |     return len(clique_lst)
 205 | 
 206 | 
 207 | def smiles2substrs(smiles):
 208 |     if not is_valid(smiles):
 209 |         return None 
 210 |     mol = smiles2mol(smiles)
 211 |     if mol is None:
 212 |         return None
 213 |     idx_lst = []
 214 | 
 215 |     clique_lst = [list(x) for x in Chem.GetSymmSSSR(mol)]
 216 |     # print(clique_lst)  ## [[4, 23, 22, 7, 6, 5], [8, 7, 22, 10, 9], [16, 17, 18, 19, 20, 15]]
 217 |     for clique in clique_lst:
 218 |         clique_smiles = Chem.MolFragmentToSmiles(mol, clique, kekuleSmiles=True)
 219 |         # print("clique_smiles", clique_smiles)  ## C1=CC=CC=C1, C1=COCC1, C1=CC=CC=C1 
 220 |         idx_lst.append(word2idx(clique_smiles))
 221 |     atom_symbol_not_in_rings_list = [atom.GetSymbol() for atom in mol.GetAtoms() if not atom.IsInRing()]
 222 |     atom_idx_not_in_rings_list = [atom.GetIdx() for atom in mol.GetAtoms() if not atom.IsInRing()]
 223 |     # print(atom_idx_not_in_rings_list)  ## [0, 1, 2, 3, 11, 12, 13, 14, 21]  nonring atom's index in molecule
 224 |     for atom in atom_symbol_not_in_rings_list:
 225 |         idx_lst.append(word2idx(atom))
 226 | 
 227 |     return idx_lst 
 228 | 
 229 | 
 230 | 
 231 | def smiles2graph(smiles):
 232 |     '''     N is # of substructures in the molecule 
 233 | 
 234 |     Output:
 235 |         1.
 236 |             idx_lst                 [N]      list of substructure's index
 237 |             node_mat                [N,d]
 238 |         2. 
 239 |             substructure_lst 
 240 |             atomidx_2substridx     dict 
 241 |         3. 
 242 |             adjacency_matrix        [N,N]    0/1   np.zeros((4,4))  
 243 |         4. 
 244 |             leaf_extend_idx_pair    [(x1,y1), (x2,y2), ...]
 245 |     '''
 246 | 
 247 |     ### 0. smiles -> mol 
 248 |     if not is_valid(smiles):
 249 |         return None 
 250 |     mol = smiles2mol(smiles)
 251 |     if mol is None:
 252 |         return None
 253 | 
 254 |     ### 1. idx_lst & node_mat 
 255 |     idx_lst = []
 256 |     clique_lst = [list(x) for x in Chem.GetSymmSSSR(mol)]
 257 |     # print(clique_lst)  ## [[4, 23, 22, 7, 6, 5], [8, 7, 22, 10, 9], [16, 17, 18, 19, 20, 15]]
 258 |     for clique in clique_lst:
 259 |         clique_smiles = Chem.MolFragmentToSmiles(mol, clique, kekuleSmiles=True)
 260 |         # print("clique_smiles", clique_smiles)  ## C1=CC=CC=C1, C1=COCC1, C1=CC=CC=C1 
 261 |         idx_lst.append(word2idx(clique_smiles))
 262 | 
 263 |     atom_symbol_not_in_rings_list = [atom.GetSymbol() for atom in mol.GetAtoms() if not atom.IsInRing()]
 264 |     atom_idx_not_in_rings_list = [atom.GetIdx() for atom in mol.GetAtoms() if not atom.IsInRing()]
 265 |     # print(atom_idx_not_in_rings_list)  ## [0, 1, 2, 3, 11, 12, 13, 14, 21]  nonring atom's index in molecule
 266 |     for atom in atom_symbol_not_in_rings_list:
 267 |         idx_lst.append(word2idx(atom))
 268 |     # print(idx_lst) ## [3, 68, 3, 0, 0, 0, 0, 0, 0, 1, 2, 4]  
 269 |     d = len(vocabulary)
 270 |     N = len(idx_lst)
 271 |     node_mat = np.zeros((N, d))
 272 |     for i,v in enumerate(idx_lst):
 273 |         node_mat[i,v]=1
 274 | 
 275 | 
 276 |     ### 2. substructure_lst & atomidx_2substridx     
 277 |     ###    map from atom index to substructure index 
 278 |     atomidx_2substridx = dict()
 279 |     substructure_lst = clique_lst + atom_idx_not_in_rings_list   
 280 |     ### [[4, 23, 22, 7, 6, 5], [8, 7, 22, 10, 9], [16, 17, 18, 19, 20, 15], 0, 1, 2, 3, 11, 12, 13, 14, 21] 
 281 |     ### 4:0  23:0, 22:0, ...   8:1, 7:1, 22:1, ... 16:2, 17:2, 18:2, ... 0:3, 1:4, 
 282 |     for idx, substructure in enumerate(substructure_lst):
 283 |     	if type(substructure)==list:
 284 |     		for atom in substructure:
 285 |     			atomidx_2substridx[atom] = idx 
 286 |     	else:
 287 |     		atomidx_2substridx[substructure] = idx 
 288 | 
 289 | 
 290 |     ### 3. adjacency_matrix 
 291 |     adjacency_matrix = np.zeros((N,N),dtype=np.int32)
 292 | 
 293 |     ####### 3.1 atom-atom bonds and atom-ring bonds
 294 |     for bond in mol.GetBonds():
 295 |     	if not bond.IsInRing():
 296 |     		a1 = bond.GetBeginAtom().GetIdx()
 297 |     		a2 = bond.GetEndAtom().GetIdx()
 298 |     		idx1 = atomidx_2substridx[a1] 
 299 |     		idx2 = atomidx_2substridx[a2]
 300 |     		adjacency_matrix[idx1,idx2] = adjacency_matrix[idx2,idx1] = 1 
 301 |     ####### 3.2 ring-ring connection 
 302 |     for i1,c1 in enumerate(clique_lst):
 303 |     	for i2,c2 in enumerate(clique_lst):
 304 |     		if i1>=i2:
 305 |     			continue 
 306 |     		if len(set(c1).intersection(set(c2))) > 0:
 307 |     			adjacency_matrix[i1,i2] = adjacency_matrix[i2,i1] = 1
 308 |     assert np.sum(adjacency_matrix)>=2*(N-1)
 309 | 
 310 |     leaf_idx_lst = list(np.where(np.sum(adjacency_matrix,1)==1)[0])
 311 |     M = len(leaf_idx_lst)
 312 |     extend_idx_lst = list(range(N,N+M))
 313 |     leaf_extend_idx_pair = list(zip(leaf_idx_lst, extend_idx_lst))
 314 |     ####### [(3, 12), (5, 13), (6, 14), (9, 15), (11, 16)]
 315 | 
 316 |     return idx_lst, node_mat, substructure_lst, atomidx_2substridx, adjacency_matrix, leaf_extend_idx_pair 
 317 | 
 318 | 
 319 | def smiles2feature(smiles):
 320 |     """
 321 |         (1) molecule2tree
 322 |         (2) mask leaf node 
 323 |     """
 324 |     ### 0. smiles -> mol 
 325 |     if not is_valid(smiles):
 326 |         return None 
 327 |     mol = smiles2mol(smiles)
 328 |     if mol is None:
 329 |         return None
 330 | 
 331 |     ### 1. idx_lst  
 332 |     idx_lst = []
 333 |     clique_lst = [list(x) for x in Chem.GetSymmSSSR(mol)]
 334 |     # print(clique_lst)  ## [[4, 23, 22, 7, 6, 5], [8, 7, 22, 10, 9], [16, 17, 18, 19, 20, 15]]
 335 |     for clique in clique_lst:
 336 |         clique_smiles = Chem.MolFragmentToSmiles(mol, clique, kekuleSmiles=True)
 337 |         # print("clique_smiles", clique_smiles)  ## C1=CC=CC=C1, C1=COCC1, C1=CC=CC=C1 
 338 |         idx_lst.append(word2idx(clique_smiles))
 339 | 
 340 |     atom_symbol_not_in_rings_list = [atom.GetSymbol() for atom in mol.GetAtoms() if not atom.IsInRing()]
 341 |     atom_idx_not_in_rings_list = [atom.GetIdx() for atom in mol.GetAtoms() if not atom.IsInRing()]
 342 |     # print(atom_idx_not_in_rings_list)  ## [0, 1, 2, 3, 11, 12, 13, 14, 21]  nonring atom's index in molecule
 343 |     for atom in atom_symbol_not_in_rings_list:
 344 |         idx_lst.append(word2idx(atom))
 345 |     # print(idx_lst) ## [3, 68, 3, 0, 0, 0, 0, 0, 0, 1, 2, 4]  
 346 |     d = len(vocabulary)
 347 |     N = len(idx_lst)
 348 | 
 349 |     ### 2. substructure_lst & atomidx_2substridx     
 350 |     ###    map from atom index to substructure index 
 351 |     atomidx_2substridx = dict()
 352 |     substructure_lst = clique_lst + atom_idx_not_in_rings_list   
 353 |     ### [[4, 23, 22, 7, 6, 5], [8, 7, 22, 10, 9], [16, 17, 18, 19, 20, 15], 0, 1, 2, 3, 11, 12, 13, 14, 21] 
 354 |     ### 4:0  23:0, 22:0, ...   8:1, 7:1, 22:1, ... 16:2, 17:2, 18:2, ... 0:3, 1:4, 
 355 |     for idx, substructure in enumerate(substructure_lst):
 356 |         if type(substructure)==list:
 357 |             for atom in substructure:
 358 |                 atomidx_2substridx[atom] = idx 
 359 |         else:
 360 |             atomidx_2substridx[substructure] = idx 
 361 | 
 362 |     ### 3. adjacency_matrix 
 363 |     adjacency_matrix = np.zeros((N,N),dtype=np.int32)
 364 |     ####### 3.1 atom-atom bonds and atom-ring bonds
 365 |     for bond in mol.GetBonds():
 366 |         if not bond.IsInRing():
 367 |             a1 = bond.GetBeginAtom().GetIdx()
 368 |             a2 = bond.GetEndAtom().GetIdx()
 369 |             idx1 = atomidx_2substridx[a1] 
 370 |             idx2 = atomidx_2substridx[a2]
 371 |             adjacency_matrix[idx1,idx2] = adjacency_matrix[idx2,idx1] = 1 
 372 |     ####### 3.2 ring-ring connection 
 373 |     for i1,c1 in enumerate(clique_lst):
 374 |         for i2,c2 in enumerate(clique_lst):
 375 |             if i1>=i2:
 376 |                 continue 
 377 |             if len(set(c1).intersection(set(c2))) > 0:
 378 |                 adjacency_matrix[i1,i2] = adjacency_matrix[i2,i1] = 1
 379 |     assert np.sum(adjacency_matrix)>=2*(N-1)
 380 | 
 381 |     # print(adjacency_matrix, smiles)
 382 |     leaf_idx_lst = list(np.where(np.sum(adjacency_matrix,1)==1)[0])
 383 |     mask_idx = random.choice(leaf_idx_lst)
 384 |     label = idx_lst[mask_idx]
 385 | 
 386 |     node_mat = np.zeros((N, d + 1))
 387 |     for i,v in enumerate(idx_lst):
 388 |         if i==mask_idx:
 389 |             node_mat[i,d] = 1 
 390 |         else:
 391 |             node_mat[i,v] = 1
 392 | 
 393 |     return node_mat, adjacency_matrix, mask_idx, label 
 394 | 
 395 | 
 396 | 
 397 | def smiles2expandfeature(smiles):
 398 |     """
 399 |         (1) molecule2tree
 400 |         (2) mask leaf node 
 401 |     """
 402 |     ### 0. smiles -> mol 
 403 |     if not is_valid(smiles):
 404 |         return None 
 405 |     mol = smiles2mol(smiles)
 406 |     if mol is None:
 407 |         return None
 408 | 
 409 |     ### 1. idx_lst
 410 |     idx_lst = []
 411 |     clique_lst = [list(x) for x in Chem.GetSymmSSSR(mol)]
 412 |     # print(clique_lst)  ## [[4, 23, 22, 7, 6, 5], [8, 7, 22, 10, 9], [16, 17, 18, 19, 20, 15]]
 413 |     for clique in clique_lst:
 414 |         clique_smiles = Chem.MolFragmentToSmiles(mol, clique, kekuleSmiles=True)
 415 |         # print("clique_smiles", clique_smiles)  ## C1=CC=CC=C1, C1=COCC1, C1=CC=CC=C1 
 416 |         idx_lst.append(word2idx(clique_smiles))
 417 | 
 418 |     atom_symbol_not_in_rings_list = [atom.GetSymbol() for atom in mol.GetAtoms() if not atom.IsInRing()]
 419 |     atom_idx_not_in_rings_list = [atom.GetIdx() for atom in mol.GetAtoms() if not atom.IsInRing()]
 420 |     # print(atom_idx_not_in_rings_list)  ## [0, 1, 2, 3, 11, 12, 13, 14, 21]  nonring atom's index in molecule
 421 |     for atom in atom_symbol_not_in_rings_list:
 422 |         idx_lst.append(word2idx(atom))
 423 |     # print(idx_lst) ## [3, 68, 3, 0, 0, 0, 0, 0, 0, 1, 2, 4]  
 424 |     d = len(vocabulary)
 425 |     N = len(idx_lst)
 426 | 
 427 |     ### 2. substructure_lst & atomidx_2substridx     
 428 |     ###    map from atom index to substructure index 
 429 |     atomidx_2substridx = dict()
 430 |     substructure_lst = clique_lst + atom_idx_not_in_rings_list   
 431 |     ### [[4, 23, 22, 7, 6, 5], [8, 7, 22, 10, 9], [16, 17, 18, 19, 20, 15], 0, 1, 2, 3, 11, 12, 13, 14, 21] 
 432 |     ### 4:0  23:0, 22:0, ...   8:1, 7:1, 22:1, ... 16:2, 17:2, 18:2, ... 0:3, 1:4, 
 433 |     for idx, substructure in enumerate(substructure_lst):
 434 |         if type(substructure)==list:
 435 |             for atom in substructure:
 436 |                 atomidx_2substridx[atom] = idx 
 437 |         else:
 438 |             atomidx_2substridx[substructure] = idx 
 439 | 
 440 |     ### 3. adjacency_matrix 
 441 |     adjacency_matrix = np.zeros((N+1,N+1),dtype=np.int32)
 442 |     ####### 3.1 atom-atom bonds and atom-ring bonds
 443 |     for bond in mol.GetBonds():
 444 |         if not bond.IsInRing():
 445 |             a1 = bond.GetBeginAtom().GetIdx()
 446 |             a2 = bond.GetEndAtom().GetIdx()
 447 |             idx1 = atomidx_2substridx[a1] 
 448 |             idx2 = atomidx_2substridx[a2]
 449 |             adjacency_matrix[idx1,idx2] = adjacency_matrix[idx2,idx1] = 1 
 450 |     ####### 3.2 ring-ring connection 
 451 |     for i1,c1 in enumerate(clique_lst):
 452 |         for i2,c2 in enumerate(clique_lst):
 453 |             if i1>=i2:
 454 |                 continue 
 455 |             if len(set(c1).intersection(set(c2))) > 0:
 456 |                 adjacency_matrix[i1,i2] = adjacency_matrix[i2,i1] = 1
 457 |     # assert np.sum(adjacency_matrix)>=2*(N-1)
 458 | 
 459 |     # print(adjacency_matrix, smiles)
 460 |     leaf_idx_lst = list(np.where(np.sum(adjacency_matrix,1)==1)[0])
 461 |     mask_idx = random.choice(leaf_idx_lst)
 462 |     label = idx_lst[mask_idx]
 463 | 
 464 | 
 465 |     node_mat = np.zeros((N + 1, d + 1))
 466 |     for i,v in enumerate(idx_lst):
 467 |         node_mat[i,v] = 1
 468 | 
 469 |     feature_lst = []
 470 |     for idx in range(N):
 471 |         new_node_mat = deepcopy(node_mat)
 472 |         new_adj_mat = deepcopy(adjacency_matrix)
 473 |         new_node_mat[-1,d] = 1 
 474 |         new_adj_mat[idx,N] = 1 
 475 |         new_adj_mat[N,idx] = 1 
 476 |         feature_lst.append((new_node_mat, new_adj_mat, N))
 477 | 
 478 | 
 479 |     return feature_lst 
 480 | 
 481 | 
 482 | 
 483 | 
 484 | 
 485 | 
 486 | 
 487 | def copy_atom(atom):
 488 |     new_atom = Chem.Atom(atom.GetSymbol())
 489 |     new_atom.SetFormalCharge(atom.GetFormalCharge())
 490 |     new_atom.SetAtomMapNum(atom.GetAtomMapNum())
 491 |     return new_atom
 492 | 
 493 | def add_atom_at_position(editmol, position_idx, new_atom, new_bond):
 494 |     '''
 495 |         position_idx:   index of edited atom in editmol
 496 |         new_atom: 'C', 'N', 'O', ... 
 497 |         new_bond: SINGLE, DOUBLE  
 498 |     '''
 499 |     ######  1 edit mol 
 500 |     new_atom = Chem.rdchem.Atom(new_atom)
 501 |     rwmol = deepcopy(editmol)
 502 |     new_atom_idx = rwmol.AddAtom(new_atom)
 503 |     rwmol.AddBond(position_idx, new_atom_idx, order = new_bond)
 504 |     ######  2 check valid of new mol 
 505 |     if not is_valid_mol(rwmol):
 506 |         return None  
 507 |     try:
 508 |         rwmol.UpdatePropertyCache()
 509 |     except:
 510 |         return None
 511 |     smiles = Chem.MolToSmiles(rwmol)
 512 |     assert '.' not in smiles
 513 |     return canonical(smiles)
 514 | 
 515 | 
 516 | def add_fragment_at_position(editmol, position_idx, fragment, new_bond):
 517 |     '''
 518 |         position_idx:  index of edited atom in editmol
 519 |         fragment: e.g., "C1=CC=CC=C1", "C1=CC=NC=C1", ... 
 520 |         new_bond: {SINGLE, DOUBLE}  
 521 | 
 522 |         Return:  
 523 |             list of SMILES
 524 |     '''  
 525 |     new_smiles_set = set()
 526 |     fragment_mol = Chem.MolFromSmiles(fragment)
 527 |     current_atom = editmol.GetAtomWithIdx(position_idx)
 528 |     neighbor_atom_set = set()  ## index of neighbor of current atom in new_mol  
 529 | 
 530 | 
 531 |     ## (A) add a bond between atom and ring 
 532 |     #### 1. initialize empty new_mol
 533 |     new_mol = Chem.RWMol(Chem.MolFromSmiles(''))
 534 | 
 535 |     #### 2. add editmol into new_mol
 536 |     old_idx2new_idx = dict()
 537 |     for atom in editmol.GetAtoms():
 538 |         old_idx = atom.GetIdx()
 539 |         new_atom = copy_atom(atom)
 540 |         new_idx = new_mol.AddAtom(new_atom)
 541 |         old_idx2new_idx[old_idx] = new_idx 
 542 |         assert old_idx == new_idx
 543 |     for bond in editmol.GetBonds():
 544 |         a1 = bond.GetBeginAtom()
 545 |         a2 = bond.GetEndAtom()
 546 |         i1 = a1.GetIdx()
 547 |         i2 = a2.GetIdx()
 548 |         i1_new = old_idx2new_idx[i1]
 549 |         i2_new = old_idx2new_idx[i2]
 550 |         bt = bond.GetBondType()
 551 |         new_mol.AddBond(i1_new, i2_new, bt)
 552 |         ### collect the neighbor atoms of current atom, both are in ring. 
 553 |         if (i1==position_idx or i2==position_idx) and (a1.IsInRing() and a2.IsInRing()):
 554 |             neighbor_atom_set.add(i1_new)
 555 |             neighbor_atom_set.add(i2_new)
 556 |     if neighbor_atom_set != set():
 557 |         neighbor_atom_set.remove(old_idx2new_idx[position_idx])
 558 | 
 559 |     #### 3. combine two components 
 560 |     #### 3.1 add fragment into new_mol
 561 |     new_atom_idx_lst = []
 562 |     old_idx2new_idx2 = dict()  ### fragment idx -> new mol idx 
 563 |     for atom in fragment_mol.GetAtoms():
 564 |         old_atom_idx = atom.GetIdx()
 565 |         new_atom = copy_atom(atom)
 566 |         new_atom_idx = new_mol.AddAtom(new_atom)
 567 |         new_atom_idx_lst.append(new_atom_idx)
 568 |         old_idx2new_idx2[old_atom_idx] = new_atom_idx 
 569 |     for bond in fragment_mol.GetBonds():
 570 |         a1 = bond.GetBeginAtom().GetIdx()
 571 |         a2 = bond.GetEndAtom().GetIdx()
 572 |         i1 = old_idx2new_idx2[a1]
 573 |         i2 = old_idx2new_idx2[a2]
 574 |         bt = bond.GetBondType()
 575 |         new_mol.AddBond(i1, i2, bt)
 576 | 
 577 |     #### 3.2 enumerate possible binding atoms and generate new smiles 
 578 |     for i in new_atom_idx_lst:  ### enumeration 
 579 |         copy_mol = deepcopy(new_mol)
 580 |         copy_mol.AddBond(old_idx2new_idx[position_idx], i, new_bond)
 581 |         if is_valid_mol(copy_mol):
 582 |             try:
 583 |                 copy_mol.UpdatePropertyCache()
 584 |                 new_smiles = Chem.MolToSmiles(copy_mol)
 585 |                 new_smiles = canonical(new_smiles)
 586 |                 if new_smiles is not None:
 587 |                     assert '.' not in new_smiles
 588 |                     new_smiles_set.add(new_smiles) 
 589 |             except:
 590 |                 pass  
 591 | 
 592 | 
 593 |     # if not current_atom.IsInRing() or new_bond != rdkit.Chem.rdchem.BondType.SINGLE:
 594 |     if not current_atom.IsInRing():
 595 |         return new_smiles_set
 596 | 
 597 | 
 598 |     # print(new_smiles_set)
 599 |     ## (B) share bond between rings 
 600 |     #### 1. initialize empty new_mol
 601 |     new_mol = Chem.RWMol(Chem.MolFromSmiles(''))
 602 | 
 603 |     #### 2. add editmol into new_mol
 604 |     old_idx2new_idx = dict()
 605 |     for atom in editmol.GetAtoms():
 606 |         old_idx = atom.GetIdx() 
 607 |         new_atom = copy_atom(atom)
 608 |         new_idx = new_mol.AddAtom(new_atom)
 609 |         old_idx2new_idx[old_idx] = new_idx 
 610 |         assert old_idx == new_idx 
 611 |     for bond in editmol.GetBonds():
 612 |         a1 = bond.GetBeginAtom().GetIdx()
 613 |         a2 = bond.GetEndAtom().GetIdx()
 614 |         i1 = old_idx2new_idx[a1]
 615 |         i2 = old_idx2new_idx[a2]
 616 |         bt = bond.GetBondType()
 617 |         new_mol.AddBond(i1, i2, bt) 
 618 | 
 619 |     # print(Chem.MolToSmiles(new_mol))
 620 |     #### 3. fragment mol  
 621 |     ####### 3.1 find 2 common atoms and 1 bond  
 622 |     current_atom = editmol.GetAtomWithIdx(old_idx2new_idx[position_idx])
 623 |     current_atom_symbol = current_atom.GetSymbol()
 624 | 
 625 |     atom_lst = list(fragment_mol.GetAtoms())
 626 |     for neighbor_atom in neighbor_atom_set:
 627 |         neighbor_atom_symbol = editmol.GetAtomWithIdx(neighbor_atom).GetSymbol()
 628 |         bondtype_edit = new_mol.GetBondBetweenAtoms(neighbor_atom, old_idx2new_idx[position_idx]).GetBondType()
 629 |         for i,v in enumerate(atom_lst):
 630 |             v_idx = v.GetIdx()
 631 |             ### v1 is neighbor of v 
 632 |             for v1 in [atom_lst[i-1], atom_lst[i+1-len(atom_lst)]]: 
 633 |                 v1_idx = v1.GetIdx()
 634 |                 bondtype_frag = fragment_mol.GetBondBetweenAtoms(v_idx, v1_idx).GetBondType()
 635 |                 # print("current:", current_atom_symbol, "neighbor:", neighbor_atom_symbol, bondtype_edit)
 636 |                 # print(v.GetSymbol(), v1.GetSymbol(), bondtype_frag)
 637 |                 if v.GetSymbol()==current_atom_symbol and v1.GetSymbol()==neighbor_atom_symbol and bondtype_edit==bondtype_frag: 
 638 |                     ####### 3.1 find 2 common atoms and 1 bond  
 639 |                     # print("2 common atoms and 1 bond ")
 640 |                     ############################################
 641 |                     ####### 3.2 add other atoms and bonds 
 642 |                     new_mol2 = deepcopy(new_mol)
 643 |                     old_idx2new_idx2 = dict()
 644 |                     old_idx2new_idx2[v_idx] = current_atom.GetIdx()
 645 |                     old_idx2new_idx2[v1_idx] = neighbor_atom
 646 |                     for atom in fragment_mol.GetAtoms():
 647 |                         old_idx = atom.GetIdx()
 648 |                         if not (old_idx==v_idx or old_idx==v1_idx):
 649 |                             new_atom = copy_atom(atom)
 650 |                             new_idx = new_mol2.AddAtom(new_atom)
 651 |                             old_idx2new_idx2[old_idx] = new_idx 
 652 |                     for bond in fragment_mol.GetBonds():
 653 |                         a1 = bond.GetBeginAtom()
 654 |                         a2 = bond.GetEndAtom()
 655 |                         i1 = a1.GetIdx()
 656 |                         i2 = a2.GetIdx()
 657 |                         i1_new = old_idx2new_idx2[i1]
 658 |                         i2_new = old_idx2new_idx2[i2]
 659 |                         bt = bond.GetBondType()
 660 |                         if not (set([i1,i2]) == set([v1.GetIdx(), v.GetIdx()])):
 661 |                             new_mol2.AddBond(i1_new, i2_new, bt)
 662 |                     ####### 3.2 add other atoms and bonds 
 663 |                     ####### 3.3 check validity and canonicalize
 664 |                     if not is_valid_mol(new_mol2):
 665 |                         continue 
 666 |                     try:
 667 |                         new_mol2.UpdatePropertyCache()
 668 |                         # print("success")
 669 |                     except:
 670 |                         continue 
 671 |                     new_smiles = Chem.MolToSmiles(new_mol2)
 672 |                     new_smiles = canonical(new_smiles)
 673 |                     if new_smiles is not None:
 674 |                         assert '.' not in new_smiles
 675 |                         new_smiles_set.add(new_smiles)
 676 |                     # print(new_smiles)
 677 |     # print(new_smiles_set)
 678 |     return new_smiles_set
 679 | 
 680 | 
 681 | 
 682 | def delete_substructure_at_idx(editmol, atom_idx_lst):
 683 |     edit_smiles = Chem.MolToSmiles(editmol)
 684 |     #### 1. initialize with empty mol
 685 |     new_mol = Chem.RWMol(Chem.MolFromSmiles(''))
 686 | 
 687 |     #### 2. add editmol into new_mol
 688 |     old_idx2new_idx = dict()
 689 |     for atom in editmol.GetAtoms():
 690 |         old_idx = atom.GetIdx()
 691 |         if old_idx in atom_idx_lst: 
 692 |             continue 
 693 |         new_atom = copy_atom(atom)
 694 |         new_idx = new_mol.AddAtom(new_atom)
 695 |         old_idx2new_idx[old_idx] = new_idx 
 696 |     for bond in editmol.GetBonds():
 697 |         a1 = bond.GetBeginAtom().GetIdx()
 698 |         a2 = bond.GetEndAtom().GetIdx()
 699 |         if a1 in atom_idx_lst or a2 in atom_idx_lst:
 700 |             continue 
 701 |         a1_new = old_idx2new_idx[a1]
 702 |         a2_new = old_idx2new_idx[a2]
 703 |         bt = bond.GetBondType()
 704 |         new_mol.AddBond(a1_new, a2_new, bt) 
 705 | 
 706 |     if not is_valid_mol(new_mol):
 707 |         return None
 708 |     try:
 709 |         new_mol.UpdatePropertyCache()
 710 |     except:
 711 |         return None 
 712 |     return new_mol, old_idx2new_idx 
 713 | 
 714 | 
 715 | 
 716 | 
 717 | 
 718 | 
 719 | def differentiable_graph2smiles_lgp(origin_smiles, differentiable_graph, 
 720 |                                 leaf_extend_idx_pair, leaf_nonleaf_lst, 
 721 |                                 max_num_offspring = 100, topk = 3):
 722 |     '''
 723 |         origin_smiles:
 724 |             origin_idx_lst              [N]      0,1,...,d-1 
 725 |             origin_node_mat             [N,d]
 726 |             origin_substructure_lst     
 727 |             origin_atomidx_2substridx   
 728 |             origin_adjacency_matrix     [N,N]    0/1
 729 |         differentiable_graph:   returned results 
 730 |             node_indicator              [N+M,d]
 731 |             adjacency_weight            [N+M,N+M]
 732 |         N is # of substructures in the molecule
 733 |         M is # of leaf node, also number of extended node. 
 734 |     main utility
 735 |         add_atom_at_position 
 736 |         add_fragment_at_position 
 737 |         delete_substructure_at_idx 
 738 |         REPLACE = delete + add 
 739 |     Output:
 740 |         new_smiles_set
 741 |     '''
 742 |     new_smiles_set = set()
 743 |     #### 1. data preparation 
 744 |     origin_mol = Chem.rdchem.RWMol(Chem.MolFromSmiles(origin_smiles))
 745 |     origin_idx_lst, origin_node_mat, origin_substructure_lst, \
 746 |     origin_atomidx_2substridx, origin_adjacency_matrix, leaf_extend_idx_pair = smiles2graph(origin_smiles)
 747 |     node_indicator, adjacency_weight = differentiable_graph 
 748 |     N = len(origin_idx_lst)
 749 |     M = len(leaf_extend_idx_pair) 
 750 |     d = len(vocabulary)
 751 | 
 752 |     ####### 2.3 add   todo: use adjacency_weight to further narrow scope
 753 |     for leaf_idx, extend_idx in leaf_extend_idx_pair:
 754 |         leaf_atom_idx_lst = origin_substructure_lst[leaf_idx]
 755 |         if type(leaf_atom_idx_lst)==int:  ### int: single atom;   else: list of integer
 756 |             leaf_atom_idx_lst = [leaf_atom_idx_lst]
 757 |         for leaf_atom_idx in leaf_atom_idx_lst:
 758 |             added_substructure_lst = list(np.argsort(-node_indicator[extend_idx]))[:topk]
 759 |             for substructure_idx in added_substructure_lst:
 760 |                 new_substructure = vocabulary[substructure_idx]
 761 |                 for new_bond in bondtype_list:
 762 |                     if ith_substructure_is_atom(substructure_idx):
 763 |                         new_smiles = add_atom_at_position(editmol = origin_mol, position_idx = leaf_atom_idx, 
 764 |                                                           new_atom = new_substructure, new_bond = new_bond)
 765 |                         new_smiles_set.add(new_smiles)
 766 |                     else:
 767 |                         new_smiles_batch = add_fragment_at_position(editmol = origin_mol, position_idx = leaf_atom_idx, 
 768 |                                                                     fragment = new_substructure , new_bond = new_bond)
 769 |                         new_smiles_set = new_smiles_set.union(new_smiles_batch)
 770 | 
 771 |     return new_smiles_set.difference(set([None]))  
 772 | 
 773 | 
 774 | 
 775 | 
 776 | 
 777 | def differentiable_graph2smiles_v0(origin_smiles, differentiable_graph, 
 778 |                                 leaf_extend_idx_pair, leaf_nonleaf_lst, 
 779 |                                 max_num_offspring = 100, topk = 3):
 780 |     '''
 781 |         origin_smiles:
 782 |             origin_idx_lst              [N]      0,1,...,d-1 
 783 |             origin_node_mat             [N,d]
 784 |             origin_substructure_lst     
 785 |             origin_atomidx_2substridx   
 786 |             origin_adjacency_matrix     [N,N]    0/1
 787 |         differentiable_graph:   returned results 
 788 |             node_indicator              [N+M,d]
 789 |             adjacency_weight            [N+M,N+M]
 790 |         N is # of substructures in the molecule
 791 |         M is # of leaf node, also number of extended node. 
 792 |     main utility
 793 |         add_atom_at_position 
 794 |         add_fragment_at_position 
 795 |         delete_substructure_at_idx 
 796 |         REPLACE = delete + add 
 797 |     Output:
 798 |         new_smiles_set
 799 |     '''
 800 |     new_smiles_set = set()
 801 |     #### 1. data preparation 
 802 |     origin_mol = Chem.rdchem.RWMol(Chem.MolFromSmiles(origin_smiles))
 803 |     origin_idx_lst, origin_node_mat, origin_substructure_lst, \
 804 |     origin_atomidx_2substridx, origin_adjacency_matrix, leaf_extend_idx_pair = smiles2graph(origin_smiles)
 805 |     node_indicator, adjacency_weight = differentiable_graph 
 806 |     N = len(origin_idx_lst)
 807 |     M = len(leaf_extend_idx_pair) 
 808 |     d = len(vocabulary)
 809 | 
 810 |     #### 2. edit the original molecule  
 811 |     ####### 2.1 delete & 2.2 replace 
 812 |     for leaf_idx, _ in leaf_extend_idx_pair:
 813 |         leaf_atom_idx_lst = origin_substructure_lst[leaf_idx]
 814 |         if type(leaf_atom_idx_lst)==int:  ### single atom
 815 |             new_leaf_atom_idx_lst = [leaf_atom_idx_lst]
 816 |         else:  #### ring     
 817 |             ### consider the case that ring1 and ring2 share 2 atoms and 1 bond. 
 818 |             new_leaf_atom_idx_lst = []
 819 |             remaining_atoms_idx_lst = []
 820 |             for i,v in enumerate(origin_substructure_lst):
 821 |                 if i==leaf_idx:
 822 |                     continue 
 823 |                 if type(v)==int:
 824 |                     remaining_atoms_idx_lst.append(v)
 825 |                 else: #### list 
 826 |                     remaining_atoms_idx_lst.extend(v)
 827 |             new_leaf_atom_idx_lst = [leaf_atom_idx for leaf_atom_idx in leaf_atom_idx_lst if leaf_atom_idx not in remaining_atoms_idx_lst]
 828 |         ### leaf_atom_idx_lst v.s. new_leaf_atom_idx_lst 
 829 |         ### consider the case that ring1 and ring2 share 2 atoms and 1 bond. 
 830 |         result = delete_substructure_at_idx(editmol = origin_mol, atom_idx_lst = new_leaf_atom_idx_lst) 
 831 |         if result is None: 
 832 |             continue
 833 |         delete_mol, old_idx2new_idx = result
 834 |         delete_smiles = Chem.MolToSmiles(delete_mol)
 835 |         if delete_smiles is None or '.' in delete_smiles:
 836 |             continue
 837 |         delete_smiles = canonical(delete_smiles)
 838 |         new_smiles_set.add(delete_smiles)  #### 2.1 delete done
 839 |         ####  2.2 replace  a & b 
 840 |         ######### (a) get neighbor substr
 841 |         neighbor_substructures_idx = [idx for idx,value in enumerate(origin_adjacency_matrix[leaf_idx]) if value==1]
 842 |         assert len(neighbor_substructures_idx)==1 
 843 |         neighbor_substructures_idx = neighbor_substructures_idx[0]
 844 |         neighbor_atom_idx_lst = origin_substructure_lst[neighbor_substructures_idx]
 845 |         if type(neighbor_atom_idx_lst)==int:
 846 |             neighbor_atom_idx_lst = [neighbor_atom_idx_lst] 
 847 |         ######### (b) add new substructure  todo, enumerate several possibility 
 848 |         added_substructure_lst = list(np.argsort(-node_indicator[leaf_idx]))[:topk]  ### topk 
 849 |         for substructure_idx in added_substructure_lst: 
 850 |             new_substructure = vocabulary[substructure_idx]
 851 |             for new_bond in bondtype_list:
 852 |                 for leaf_atom_idx in neighbor_atom_idx_lst:
 853 |                     new_leaf_atom_idx = old_idx2new_idx[leaf_atom_idx] 
 854 |                     if ith_substructure_is_atom(substructure_idx):
 855 |                         new_smiles = add_atom_at_position(editmol = delete_mol, position_idx = new_leaf_atom_idx, 
 856 |                                                           new_atom = new_substructure, new_bond = new_bond)
 857 |                         new_smiles_set.add(new_smiles)
 858 |                     else:
 859 |                         new_smiles_batch = add_fragment_at_position(editmol = delete_mol, position_idx = new_leaf_atom_idx, 
 860 |                                                                     fragment = new_substructure, new_bond = new_bond)
 861 |                         new_smiles_set = new_smiles_set.union(new_smiles_batch)
 862 | 
 863 | 
 864 | 
 865 |     ####### 2.3 add   todo: use adjacency_weight to further narrow scope
 866 |     for leaf_idx, extend_idx in leaf_extend_idx_pair:
 867 |         expand_prob = (adjacency_weight[leaf_idx][extend_idx] + adjacency_weight[extend_idx][leaf_idx])/2  ### [-inf, inf]
 868 |         # print("expand prob", expand_prob)
 869 |         if expand_prob < -3:
 870 |             continue 
 871 |         leaf_atom_idx_lst = origin_substructure_lst[leaf_idx]
 872 |         if type(leaf_atom_idx_lst)==int:  ### int: single atom;   else: list of integer
 873 |             leaf_atom_idx_lst = [leaf_atom_idx_lst]
 874 |         for leaf_atom_idx in leaf_atom_idx_lst:
 875 |             added_substructure_lst = list(np.argsort(-node_indicator[extend_idx]))[:topk]
 876 |             for substructure_idx in added_substructure_lst:
 877 |                 new_substructure = vocabulary[substructure_idx]
 878 |                 for new_bond in bondtype_list:
 879 |                     if ith_substructure_is_atom(substructure_idx):
 880 |                         new_smiles = add_atom_at_position(editmol = origin_mol, position_idx = leaf_atom_idx, 
 881 |                                                           new_atom = new_substructure, new_bond = new_bond)
 882 |                         new_smiles_set.add(new_smiles)
 883 |                     else:
 884 |                         new_smiles_batch = add_fragment_at_position(editmol = origin_mol, position_idx = leaf_atom_idx, 
 885 |                                                                     fragment = new_substructure , new_bond = new_bond)
 886 |                         new_smiles_set = new_smiles_set.union(new_smiles_batch)
 887 | 
 888 | 
 889 | 
 890 |     return new_smiles_set.difference(set([None]))  
 891 | 
 892 | 
 893 | 
 894 | 
 895 | def differentiable_graph2smiles(origin_smiles, differentiable_graph, 
 896 |                                 leaf_extend_idx_pair, leaf_nonleaf_lst, 
 897 |                                 max_num_offspring = 100, topk = 3):
 898 |     '''
 899 |         origin_smiles:
 900 |             origin_idx_lst              [N]      0,1,...,d-1 
 901 |             origin_node_mat             [N,d]
 902 |             origin_substructure_lst     
 903 |             origin_atomidx_2substridx   
 904 |             origin_adjacency_matrix     [N,N]    0/1
 905 | 
 906 |         differentiable_graph:   returned results 
 907 |             node_indicator              [N+M,d]
 908 |             adjacency_weight            [N+M,N+M]
 909 | 
 910 |         N is # of substructures in the molecule
 911 |         M is # of leaf node, also number of extended node. 
 912 | 
 913 | 
 914 |     main utility
 915 |         add_atom_at_position 
 916 |         add_fragment_at_position 
 917 |         delete_substructure_at_idx 
 918 |         REPLACE = delete + add 
 919 | 
 920 |     Output:
 921 |         new_smiles_set
 922 |     '''
 923 |     leaf2nonleaf = {leaf:nonleaf for leaf,nonleaf in leaf_nonleaf_lst}
 924 |     leaf2extend = {leaf:extend for leaf,extend in leaf_extend_idx_pair}
 925 |     new_smiles_set = set()
 926 |     #### 1. data preparation 
 927 |     origin_mol = Chem.rdchem.RWMol(Chem.MolFromSmiles(origin_smiles))
 928 |     origin_idx_lst, origin_node_mat, origin_substructure_lst, \
 929 |     origin_atomidx_2substridx, origin_adjacency_matrix, leaf_extend_idx_pair = smiles2graph(origin_smiles)
 930 |     node_indicator, adjacency_weight = differentiable_graph 
 931 |     N = len(origin_idx_lst)
 932 |     M = len(leaf_extend_idx_pair) 
 933 |     d = len(vocabulary)
 934 | 
 935 | 
 936 |     #### 2. edit the original molecule  
 937 |     ####### 2.1 delete & 2.2 replace 
 938 |     for leaf_idx, extend_idx in leaf_extend_idx_pair:
 939 |         leaf_atom_idx_lst = origin_substructure_lst[leaf_idx]
 940 |         if type(leaf_atom_idx_lst)==int:  ### single atom
 941 |             new_leaf_atom_idx_lst = [leaf_atom_idx_lst]
 942 |         else:  #### ring     
 943 |             ### consider the case that ring1 and ring2 share 2 atoms and 1 bond. 
 944 |             new_leaf_atom_idx_lst = []
 945 |             remaining_atoms_idx_lst = []
 946 |             for i,v in enumerate(origin_substructure_lst):
 947 |                 if i==leaf_idx:
 948 |                     continue 
 949 |                 if type(v)==int:
 950 |                     remaining_atoms_idx_lst.append(v)
 951 |                 else: #### list 
 952 |                     remaining_atoms_idx_lst.extend(v)
 953 |             new_leaf_atom_idx_lst = [leaf_atom_idx for leaf_atom_idx in leaf_atom_idx_lst if leaf_atom_idx not in remaining_atoms_idx_lst]
 954 |         ### leaf_atom_idx_lst v.s. new_leaf_atom_idx_lst 
 955 |         ### consider the case that ring1 and ring2 share 2 atoms and 1 bond. 
 956 |         result = delete_substructure_at_idx(editmol = origin_mol, atom_idx_lst = new_leaf_atom_idx_lst) 
 957 |         if result is None: 
 958 |             continue
 959 |         delete_mol, old_idx2new_idx = result
 960 |         delete_smiles = Chem.MolToSmiles(delete_mol)
 961 |         if delete_smiles is None or '.' in delete_smiles:
 962 |             continue
 963 |         delete_smiles = canonical(delete_smiles)
 964 |         nonleaf_idx = leaf2nonleaf[leaf_idx]
 965 |         shrink_prob = (adjacency_weight[leaf_idx,nonleaf_idx] + adjacency_weight[nonleaf_idx,leaf_idx])/2
 966 |         if shrink_prob > -3: ### sigmoid(-3)=0.1
 967 |             new_smiles_set.add(delete_smiles)
 968 |         #### 2.1 delete done
 969 |         ####  2.2 replace  a & b 
 970 |         ######### (a) get neighbor substr
 971 |         neighbor_substructures_idx = [idx for idx,value in enumerate(origin_adjacency_matrix[leaf_idx]) if value==1]
 972 |         assert len(neighbor_substructures_idx)==1 
 973 |         neighbor_substructures_idx = neighbor_substructures_idx[0]
 974 |         neighbor_atom_idx_lst = origin_substructure_lst[neighbor_substructures_idx]
 975 |         if type(neighbor_atom_idx_lst)==int:
 976 |             neighbor_atom_idx_lst = [neighbor_atom_idx_lst] 
 977 |         ######### (b) add new substructure  todo, enumerate several possibility 
 978 |         added_substructure_lst = list(np.argsort(-node_indicator[leaf_idx]))[:topk]
 979 |         for substructure_idx in added_substructure_lst: 
 980 |             new_substructure = vocabulary[substructure_idx]
 981 |             for new_bond in bondtype_list:
 982 |                 for leaf_atom_idx in neighbor_atom_idx_lst:
 983 |                     new_leaf_atom_idx = old_idx2new_idx[leaf_atom_idx] 
 984 |                     if ith_substructure_is_atom(substructure_idx):
 985 |                         new_smiles = add_atom_at_position(editmol = delete_mol, position_idx = new_leaf_atom_idx, 
 986 |                                                           new_atom = new_substructure, new_bond = new_bond)
 987 |                         new_smiles_set.add(new_smiles)
 988 |                     else:
 989 |                         new_smiles_batch = add_fragment_at_position(editmol = delete_mol, position_idx = new_leaf_atom_idx, 
 990 |                                                                     fragment = new_substructure, new_bond = new_bond)
 991 |                         new_smiles_set = new_smiles_set.union(new_smiles_batch)
 992 |     expand_prob = (adjacency_weight[leaf_idx,extend_idx] + adjacency_weight[extend_idx,leaf_idx])/2
 993 |     if expand_prob < -3:
 994 |         return new_smiles_set.difference(set([None]))
 995 | 
 996 | 
 997 |     ####### 2.3 add   todo: use adjacency_weight to further narrow scope
 998 |     for leaf_idx, extend_idx in leaf_extend_idx_pair:
 999 |         expand_prob = (adjacency_weight[leaf_idx][extend_idx] + adjacency_weight[extend_idx][leaf_idx])/2  ### [-inf, inf]
1000 |         # print("expand prob", expand_prob)
1001 |         if expand_prob < -3:
1002 |             continue 
1003 |         leaf_atom_idx_lst = origin_substructure_lst[leaf_idx]
1004 |         if type(leaf_atom_idx_lst)==int:  ### int: single atom;   else: list of integer
1005 |             leaf_atom_idx_lst = [leaf_atom_idx_lst]
1006 |         for leaf_atom_idx in leaf_atom_idx_lst:
1007 |             added_substructure_lst = list(np.argsort(-node_indicator[extend_idx]))[:topk]
1008 |             for substructure_idx in added_substructure_lst:
1009 |                 new_substructure = vocabulary[substructure_idx]
1010 |                 for new_bond in bondtype_list:
1011 |                     if ith_substructure_is_atom(substructure_idx):
1012 |                         new_smiles = add_atom_at_position(editmol = origin_mol, position_idx = leaf_atom_idx, 
1013 |                                                           new_atom = new_substructure, new_bond = new_bond)
1014 |                         new_smiles_set.add(new_smiles)
1015 |                     else:
1016 |                         new_smiles_batch = add_fragment_at_position(editmol = origin_mol, position_idx = leaf_atom_idx, 
1017 |                                                                     fragment = new_substructure , new_bond = new_bond)
1018 |                         new_smiles_set = new_smiles_set.union(new_smiles_batch)
1019 | 
1020 |     return new_smiles_set.difference(set([None]))  
1021 | 
1022 | 
1023 | 
1024 | 
def differentiable_graph2smiles_sample(origin_smiles, differentiable_graph,
                                leaf_extend_idx_pair, leaf_nonleaf_lst,
                                topk, epsilon):
    '''
        Propose edited versions of ``origin_smiles`` by deleting, replacing and
        extending its leaf substructures, choosing the new substructures with an
        epsilon-greedy rule over ``node_indicator``.

        origin_smiles:
            SMILES string; parsed with RDKit and decomposed with ``smiles2graph``, which yields
            origin_idx_lst              [N]      0,1,...,d-1
            origin_node_mat             [N,d]
            origin_substructure_lst     per substructure: an int (single atom) or a list of atom indices
            origin_atomidx_2substridx
            origin_adjacency_matrix     [N,N]    0/1
            leaf_extend_idx_pair        (leaf_idx, extend_idx) pairs

        differentiable_graph:   pair (node_indicator, adjacency_weight)
            node_indicator              [N+M,d]     row i scores every vocabulary entry
            adjacency_weight            [N+M,N+M]   indexed as adjacency_weight[i,j]

        N is # of substructures in the molecule
        M is # of leaf node, also number of extended node.

        leaf_extend_idx_pair:
            Ignored: it is replaced by the pairs returned from ``smiles2graph``.
            Kept only so existing callers keep working.
        leaf_nonleaf_lst:
            iterable of (leaf_idx, nonleaf_idx) pairs.
        topk:
            maximum number of vocabulary entries tried per edit position.
        epsilon:
            probability of taking the greedy top-k entries; otherwise entries are
            drawn with ``random.choices`` weighted by the ``node_indicator`` row.

    main utility
        add_atom_at_position
        add_fragment_at_position
        delete_substructure_at_idx
        REPLACE = delete + add

    Output:
        new_smiles_set      set of SMILES strings, with any ``None`` entry removed.

    Raises:
        KeyError            if a leaf is missing from ``leaf_nonleaf_lst``.
        AssertionError      if a leaf does not have exactly one neighbor in
                            ``origin_adjacency_matrix``.
    '''
    leaf2nonleaf = {leaf: nonleaf for leaf, nonleaf in leaf_nonleaf_lst}
    new_smiles_set = set()

    #### 1. data preparation
    origin_mol = Chem.rdchem.RWMol(Chem.MolFromSmiles(origin_smiles))
    _, _, origin_substructure_lst, _, origin_adjacency_matrix, leaf_extend_idx_pair = smiles2graph(origin_smiles)
    leaf_extend_idx_pair = list(leaf_extend_idx_pair)
    node_indicator, adjacency_weight = differentiable_graph
    if not leaf_extend_idx_pair:
        ### no leaf to edit: nothing can be proposed (the gate below would
        ### otherwise read loop variables that were never bound).
        return new_smiles_set
    vocabulary_idx_lst = list(range(len(vocabulary)))

    def as_atom_idx_lst(substructure):
        ### int: single atom;   else: list of integer
        return [substructure] if isinstance(substructure, int) else substructure

    def pick_substructures(weights):
        ### epsilon-greedy choice of at most topk vocabulary indices
        if random.random() < epsilon:
            return list(np.argsort(-weights))[:topk]  ### topk (greedy)
        sampled = random.choices(population=vocabulary_idx_lst, weights=weights, k=topk + 3)
        return list(set(sampled))[:topk]  ### avoid repetition

    def attach(editmol, position_idx, substructure_idx, new_bond):
        ### add one vocabulary entry at position_idx and record the resulting SMILES
        new_substructure = vocabulary[substructure_idx]
        if ith_substructure_is_atom(substructure_idx):
            new_smiles_set.add(add_atom_at_position(editmol = editmol, position_idx = position_idx,
                                                    new_atom = new_substructure, new_bond = new_bond))
        else:
            new_smiles_set.update(add_fragment_at_position(editmol = editmol, position_idx = position_idx,
                                                           fragment = new_substructure, new_bond = new_bond))

    #### 2. edit the original molecule
    ####### 2.1 delete & 2.2 replace
    for leaf_idx, extend_idx in leaf_extend_idx_pair:
        leaf_substructure = origin_substructure_lst[leaf_idx]
        if isinstance(leaf_substructure, int):  ### single atom
            new_leaf_atom_idx_lst = [leaf_substructure]
        else:  #### ring
            ### consider the case that ring1 and ring2 share 2 atoms and 1 bond:
            ### only atoms that belong to no other substructure are deleted.
            remaining_atoms_idx_set = set()
            for i, other in enumerate(origin_substructure_lst):
                if i != leaf_idx:
                    remaining_atoms_idx_set.update(as_atom_idx_lst(other))
            new_leaf_atom_idx_lst = [atom_idx for atom_idx in leaf_substructure
                                     if atom_idx not in remaining_atoms_idx_set]
        result = delete_substructure_at_idx(editmol = origin_mol, atom_idx_lst = new_leaf_atom_idx_lst)
        if result is None:
            continue
        delete_mol, old_idx2new_idx = result
        delete_smiles = Chem.MolToSmiles(delete_mol)
        if delete_smiles is None or '.' in delete_smiles:  ### '.' means the molecule fell apart
            continue
        delete_smiles = canonical(delete_smiles)
        nonleaf_idx = leaf2nonleaf[leaf_idx]
        shrink_prob = (adjacency_weight[leaf_idx,nonleaf_idx] + adjacency_weight[nonleaf_idx,leaf_idx])/2
        if shrink_prob > -3: ### sigmoid(-3)=0.1
            new_smiles_set.add(delete_smiles)
        #### 2.1 delete done
        ####  2.2 replace  a & b
        ######### (a) get neighbor substr
        neighbor_substructures_idx = [idx for idx, value in enumerate(origin_adjacency_matrix[leaf_idx]) if value==1]
        assert len(neighbor_substructures_idx)==1
        neighbor_atom_idx_lst = as_atom_idx_lst(origin_substructure_lst[neighbor_substructures_idx[0]])
        ######### (b) add new substructure  todo, enumerate several possibility
        for substructure_idx in pick_substructures(node_indicator[leaf_idx]):
            for new_bond in bondtype_list:
                for neighbor_atom_idx in neighbor_atom_idx_lst:
                    attach(delete_mol, old_idx2new_idx[neighbor_atom_idx], substructure_idx, new_bond)

    ### NOTE(review): this gate looks only at the LAST leaf/extend pair and, when
    ### its weight is low, skips step 2.3 for every leaf. Kept as-is to preserve
    ### the existing output; confirm whether a per-leaf check alone is intended.
    last_leaf_idx, last_extend_idx = leaf_extend_idx_pair[-1]
    expand_prob = (adjacency_weight[last_leaf_idx,last_extend_idx] + adjacency_weight[last_extend_idx,last_leaf_idx])/2
    if expand_prob < -3:
        return new_smiles_set - {None}

    ####### 2.3 add   todo: use adjacency_weight to further narrow scope
    for leaf_idx, extend_idx in leaf_extend_idx_pair:
        expand_prob = (adjacency_weight[leaf_idx,extend_idx] + adjacency_weight[extend_idx,leaf_idx])/2  ### [-inf, inf]
        if expand_prob < -3:
            continue
        for leaf_atom_idx in as_atom_idx_lst(origin_substructure_lst[leaf_idx]):
            for substructure_idx in pick_substructures(node_indicator[extend_idx]):
                for new_bond in bondtype_list:
                    attach(origin_mol, leaf_atom_idx, substructure_idx, new_bond)

    return new_smiles_set - {None}
1161 | 
1162 | 
1163 | 
def differentiable_graph2smiles_sample_v2(origin_smiles, differentiable_graph,
                                leaf_extend_idx_pair, leaf_nonleaf_lst,
                                topk, epsilon):
    '''
        Propose edited versions of ``origin_smiles`` by deleting, replacing and
        extending its leaf substructures. Compared with
        ``differentiable_graph2smiles_sample`` the shrink / expand decisions are
        drawn at random from sigmoid-transformed edge weights, and substructures
        are chosen from a softmax over the (penalised) ``node_indicator`` row.

        origin_smiles:
            SMILES string; parsed with RDKit and decomposed with ``smiles2graph``, which yields
            origin_idx_lst              [N]      0,1,...,d-1
            origin_node_mat             [N,d]
            origin_substructure_lst     per substructure: an int (single atom) or a list of atom indices
            origin_atomidx_2substridx
            origin_adjacency_matrix     [N,N]    0/1
            leaf_extend_idx_pair        (leaf_idx, extend_idx) pairs

        differentiable_graph:   pair (node_indicator, adjacency_weight), both np.array
            node_indicator              [N+M,d]     scores before softmax
            adjacency_weight            [N+M,N+M]   indexed as adjacency_weight[i,j]
            Neither array is modified by this function.

        N is # of substructures in the molecule
        M is # of leaf node, also number of extended node.

        leaf_extend_idx_pair:
            Ignored: it is replaced by the pairs returned from ``smiles2graph``.
            Kept only so existing callers keep working.
        leaf_nonleaf_lst:
            iterable of (leaf_idx, nonleaf_idx) pairs.
        topk:
            maximum number of vocabulary entries tried per edit position.
        epsilon:
            probability of taking the greedy top-k entries; otherwise entries are
            drawn with ``random.choices`` weighted by the softmax probabilities.

    main utility
        add_atom_at_position
        add_fragment_at_position
        delete_substructure_at_idx
        REPLACE = delete + add

    Output:
        new_smiles_set      set of SMILES strings, with any ``None`` entry removed.

    Raises:
        KeyError            if a leaf is missing from ``leaf_nonleaf_lst``.
        AssertionError      if a leaf does not have exactly one neighbor in
                            ``origin_adjacency_matrix``.
    '''
    leaf2nonleaf = {leaf: nonleaf for leaf, nonleaf in leaf_nonleaf_lst}
    new_smiles_set = set()

    #### 1. data preparation
    origin_mol = Chem.rdchem.RWMol(Chem.MolFromSmiles(origin_smiles))
    _, _, origin_substructure_lst, _, origin_adjacency_matrix, leaf_extend_idx_pair = smiles2graph(origin_smiles)
    leaf_extend_idx_pair = list(leaf_extend_idx_pair)
    node_indicator, adjacency_weight = differentiable_graph  #### both are np.array
    if not leaf_extend_idx_pair:
        ### no leaf to edit: nothing can be proposed (the gate below would
        ### otherwise read loop variables that were never bound).
        return new_smiles_set
    vocabulary_idx_lst = list(range(len(vocabulary)))

    ### NOTE(review): entries from index 12 onwards get their score lowered by 5
    ### before the softmax; presumably this separates two kinds of vocabulary
    ### entries (e.g. atoms vs. rings) -- confirm against the vocabulary file.
    penalty_start, penalty = 12, 5

    def as_atom_idx_lst(substructure):
        ### int: single atom;   else: list of integer
        return [substructure] if isinstance(substructure, int) else substructure

    def substructure_probs(row_idx):
        ### softmax over a penalised COPY of the row. Indexing a row of a numpy
        ### array gives a view, so an in-place ``-=`` on it would silently
        ### change the caller's node_indicator (and pile up on repeated use).
        logits = np.array(node_indicator[row_idx], dtype=float)
        logits[penalty_start:] -= penalty
        logits -= logits.max()  ### softmax is shift-invariant; avoids overflow in exp
        probs = np.exp(logits)
        return probs / np.sum(probs)

    def pick_substructures(probs):
        ### epsilon-greedy choice of at most topk vocabulary indices
        if random.random() < epsilon:
            return list(np.argsort(-probs))[:topk]  ### topk (greedy)
        sampled = random.choices(population=vocabulary_idx_lst, weights=probs, k=topk + 3)
        return list(set(sampled))[:topk]  ### avoid repetition

    def attach(editmol, position_idx, substructure_idx, new_bond):
        ### add one vocabulary entry at position_idx and record the resulting SMILES
        new_substructure = vocabulary[substructure_idx]
        if ith_substructure_is_atom(substructure_idx):
            new_smiles_set.add(add_atom_at_position(editmol = editmol, position_idx = position_idx,
                                                    new_atom = new_substructure, new_bond = new_bond))
        else:
            new_smiles_set.update(add_fragment_at_position(editmol = editmol, position_idx = position_idx,
                                                           fragment = new_substructure, new_bond = new_bond))

    #### 2. edit the original molecule
    ####### 2.1 delete & 2.2 replace
    for leaf_idx, extend_idx in leaf_extend_idx_pair:
        leaf_substructure = origin_substructure_lst[leaf_idx]
        if isinstance(leaf_substructure, int):  ### single atom
            new_leaf_atom_idx_lst = [leaf_substructure]
        else:  #### ring
            ### consider the case that ring1 and ring2 share 2 atoms and 1 bond:
            ### only atoms that belong to no other substructure are deleted.
            remaining_atoms_idx_set = set()
            for i, other in enumerate(origin_substructure_lst):
                if i != leaf_idx:
                    remaining_atoms_idx_set.update(as_atom_idx_lst(other))
            new_leaf_atom_idx_lst = [atom_idx for atom_idx in leaf_substructure
                                     if atom_idx not in remaining_atoms_idx_set]
        result = delete_substructure_at_idx(editmol = origin_mol, atom_idx_lst = new_leaf_atom_idx_lst)
        if result is None:
            continue
        delete_mol, old_idx2new_idx = result
        delete_smiles = Chem.MolToSmiles(delete_mol)
        if delete_smiles is None or '.' in delete_smiles:  ### '.' means the molecule fell apart
            continue
        delete_smiles = canonical(delete_smiles)
        nonleaf_idx = leaf2nonleaf[leaf_idx]
        u = random.random()
        ### NOTE(review): this is the SUM of two sigmoids (range 0..2), not their
        ### mean, so the deletion is always kept once the sum reaches 1. Left
        ### unchanged because the intent is unclear -- confirm.
        shrink_prob = sigmoid(adjacency_weight[leaf_idx,nonleaf_idx]) + sigmoid(adjacency_weight[nonleaf_idx,leaf_idx])
        if u < shrink_prob:
            new_smiles_set.add(delete_smiles)
        #### 2.1 delete done
        ####  2.2 replace  a & b
        ######### (a) get neighbor substr
        neighbor_substructures_idx = [idx for idx, value in enumerate(origin_adjacency_matrix[leaf_idx]) if value==1]
        assert len(neighbor_substructures_idx)==1
        neighbor_atom_idx_lst = as_atom_idx_lst(origin_substructure_lst[neighbor_substructures_idx[0]])
        ######### (b) add new substructure  todo, enumerate several possibility
        for substructure_idx in pick_substructures(substructure_probs(leaf_idx)):
            for new_bond in bondtype_list:
                for neighbor_atom_idx in neighbor_atom_idx_lst:
                    attach(delete_mol, old_idx2new_idx[neighbor_atom_idx], substructure_idx, new_bond)

    ### NOTE(review): this gate looks only at the LAST leaf/extend pair and, when
    ### the draw fails, skips step 2.3 for every leaf. Kept to preserve the
    ### existing flow; confirm whether a per-leaf check alone is intended.
    last_leaf_idx, last_extend_idx = leaf_extend_idx_pair[-1]
    ### mean of the two directed edge probabilities (the previous expression
    ### halved only the second sigmoid because of operator precedence).
    expand_prob = (sigmoid(adjacency_weight[last_leaf_idx,last_extend_idx])
                   + sigmoid(adjacency_weight[last_extend_idx,last_leaf_idx]))/2
    u = random.random()
    if u > expand_prob:
        return new_smiles_set - {None}

    ####### 2.3 add   todo: use adjacency_weight to further narrow scope
    for leaf_idx, extend_idx in leaf_extend_idx_pair:
        expand_prob = (adjacency_weight[leaf_idx,extend_idx] + adjacency_weight[extend_idx,leaf_idx])/2  ### [-inf, inf]
        if expand_prob < -3:
            continue
        ### the probabilities depend on the extend node only, so they are
        ### computed once per leaf rather than once per leaf atom.
        extend_probs = substructure_probs(extend_idx)
        for leaf_atom_idx in as_atom_idx_lst(origin_substructure_lst[leaf_idx]):
            for substructure_idx in pick_substructures(extend_probs):
                for new_bond in bondtype_list:
                    attach(origin_mol, leaf_atom_idx, substructure_idx, new_bond)

    return new_smiles_set - {None}
1313 | 
1314 | 
def differentiable_graph_to_smiles_purely_randomwalk(origin_smiles, differentiable_graph, 
                                             leaf_extend_idx_pair, leaf_nonleaf_lst, 
                                             topk = 3, epsilon = 0.7,):
    '''
        Random-walk variant of the graph-to-SMILES editors: for every leaf of
        ``origin_smiles`` one of shrink / expand / unchange is drawn at random,
        and new substructures are drawn uniformly from ``vocabulary``.

        origin_smiles:          SMILES string; parsed with RDKit and decomposed with ``smiles2graph``.
        differentiable_graph:   pair (node_indicator, adjacency_weight); unpacked but not read.
        leaf_extend_idx_pair:   iterable of (leaf, extend) pairs; replaced by the pairs
                                returned from ``smiles2graph`` before the edit loop.
        leaf_nonleaf_lst:       iterable of (leaf, nonleaf) pairs; every processed leaf
                                must appear in it.
        topk:                   number of vocabulary entries drawn per edit position.
        epsilon:                accepted but not used by this variant.

    Per leaf
        shrink      (draw < 0.7 and substr_num(origin_smiles) > 1):  keep the molecule with the leaf deleted
        expand      (otherwise, second draw < 0.3):  add random substructures on the atoms of EVERY leaf
        unchange    (otherwise):  delete the leaf, then attach random substructures to its neighbor

    Output:
        set of SMILES strings, with any ``None`` entry removed.
    '''
    leaf2nonleaf = dict((leaf, nonleaf) for leaf, nonleaf in leaf_nonleaf_lst)
    ### not read again below; built from the caller's pairs before they are replaced
    leaf2extend = dict((leaf, extend) for leaf, extend in leaf_extend_idx_pair)
    candidate_smiles = set()

    #### 1. data preparation
    editable_mol = Chem.rdchem.RWMol(Chem.MolFromSmiles(origin_smiles))
    (_idx_lst, _node_mat, substructure_lst,
     _atomidx_2substridx, adjacency_matrix, leaf_extend_idx_pair) = smiles2graph(origin_smiles)
    _node_indicator, _adjacency_weight = differentiable_graph

    def atoms_of(substructure_idx):
        ### int: single atom;   else: list of integer
        atoms = substructure_lst[substructure_idx]
        if type(atoms) is int:
            return [atoms]
        return atoms

    def draw_substructure_indices():
        ### topk uniform draws (with possible repeats) over the vocabulary indices
        return [random.choice(range(len(vocabulary))) for _ in range(topk)]

    def attach(mol, position_idx, substructure_idx, substructure, bond):
        ### add one vocabulary entry at position_idx and record the resulting SMILES
        if ith_substructure_is_atom(substructure_idx):
            candidate_smiles.add(add_atom_at_position(editmol = mol, position_idx = position_idx,
                                                      new_atom = substructure, new_bond = bond))
        else:
            candidate_smiles.update(add_fragment_at_position(editmol = mol, position_idx = position_idx,
                                                             fragment = substructure, new_bond = bond))

    #### 2. edit the original molecule
    for leaf_idx, _extend_idx in leaf_extend_idx_pair:
        ### topology decision; the second draw only happens when shrink is rejected
        shrink = random.random() < 0.7 and substr_num(origin_smiles) > 1
        expand = (not shrink) and random.random() < 0.3

        if expand:
            ####### 2.3 add: walks over all leaves, not only the current one
            for grow_leaf_idx, _grow_extend_idx in leaf_extend_idx_pair:
                for grow_atom_idx in atoms_of(grow_leaf_idx):
                    for substructure_idx in draw_substructure_indices():
                        substructure = vocabulary[substructure_idx]
                        for bond in bondtype_list:
                            attach(editable_mol, grow_atom_idx, substructure_idx, substructure, bond)
            continue

        ####### 2.1 delete & 2.2 replace  (shrink or unchange)
        leaf_atoms = substructure_lst[leaf_idx]
        if type(leaf_atoms) is int:  ### single atom
            removable_atoms = [leaf_atoms]
        else:  #### ring
            ### ring1 and ring2 may share 2 atoms and 1 bond: atoms that also
            ### belong to another substructure must stay in the molecule.
            atoms_used_elsewhere = set()
            for other_idx in range(len(substructure_lst)):
                if other_idx != leaf_idx:
                    atoms_used_elsewhere.update(atoms_of(other_idx))
            removable_atoms = [atom for atom in leaf_atoms if atom not in atoms_used_elsewhere]

        deletion = delete_substructure_at_idx(editmol = editable_mol, atom_idx_lst = removable_atoms)
        if deletion is None:
            continue
        pruned_mol, old_idx2new_idx = deletion
        pruned_smiles = Chem.MolToSmiles(pruned_mol)
        if pruned_smiles is None or '.' in pruned_smiles:
            continue
        pruned_smiles = canonical(pruned_smiles)
        _nonleaf_idx = leaf2nonleaf[leaf_idx]  ### value unused; lookup kept (KeyError for unknown leaf)

        if shrink:
            candidate_smiles.add(pruned_smiles)
            continue
        #### 2.1 delete done

        ####  2.2 replace
        ######### (a) the single substructure adjacent to the deleted leaf
        neighbor_idx_lst = [idx for idx, flag in enumerate(adjacency_matrix[leaf_idx]) if flag == 1]
        assert len(neighbor_idx_lst)==1
        neighbor_atoms = atoms_of(neighbor_idx_lst[0])
        ######### (b) attach random substructures on each neighbor atom
        for substructure_idx in draw_substructure_indices():
            substructure = vocabulary[substructure_idx]
            for bond in bondtype_list:
                for neighbor_atom in neighbor_atoms:
                    attach(pruned_mol, old_idx2new_idx[neighbor_atom], substructure_idx, substructure, bond)

    return candidate_smiles - {None}
1425 | 
1426 | 
1427 | 
1428 | 
1429 | def differentiable_graph2smiles_plus_random(origin_smiles, differentiable_graph, 
1430 |                                              leaf_extend_idx_pair, leaf_nonleaf_lst, 
1431 |                                              max_num_offspring = 100, topk = 3, epsilon = 0.7,
1432 |                                              random_topology = False, random_substr = False):
1433 |     '''
1434 |         origin_smiles:
1435 |             origin_idx_lst              [N]      0,1,...,d-1 
1436 |             origin_node_mat             [N,d]
1437 |             origin_substructure_lst     
1438 |             origin_atomidx_2substridx   
1439 |             origin_adjacency_matrix     [N,N]    0/1
1440 | 
1441 |         differentiable_graph:   returned results 
1442 |             node_indicator              [N+M,d]
1443 |             adjacency_weight            [N+M,N+M]
1444 | 
1445 |         N is # of substructures in the molecule
1446 |         M is # of leaf node, also number of extended node. 
1447 | 
1448 | 
1449 |     main utility
1450 |         add_atom_at_position 
1451 |         add_fragment_at_position 
1452 |         delete_substructure_at_idx 
1453 |         REPLACE = delete + add 
1454 | 
1455 |     Output:
1456 |         new_smiles_set
1457 |     '''
1458 |     leaf2nonleaf = {leaf:nonleaf for leaf,nonleaf in leaf_nonleaf_lst}
1459 |     leaf2extend = {leaf:extend for leaf,extend in leaf_extend_idx_pair}
1460 |     new_smiles_set = set()
1461 |     #### 1. data preparation 
1462 |     origin_mol = Chem.rdchem.RWMol(Chem.MolFromSmiles(origin_smiles))
1463 |     origin_idx_lst, origin_node_mat, origin_substructure_lst, \
1464 |     origin_atomidx_2substridx, origin_adjacency_matrix, leaf_extend_idx_pair = smiles2graph(origin_smiles)
1465 |     node_indicator, adjacency_weight = differentiable_graph 
1466 |     N = len(origin_idx_lst)
1467 |     M = len(leaf_extend_idx_pair) 
1468 |     d = len(vocabulary)
1469 | 
1470 |     u_topology = random.random() 
1471 |     ### shrink, unchange, expand prob = 0.2, 0.3, 0.5 
1472 |     shrink, unchange, expand = False, False, False 
1473 |     for leaf_idx, extend_idx in leaf_extend_idx_pair:
1474 |         u_topology = random.random()
1475 |         #### 1. topology 
1476 |         if random_topology:
1477 |             # if u_topology < 0.1:
1478 |             #     shrink = True
1479 |             # elif 0.4 > u_topology >= 0.2:
1480 |             #     unchange = True
1481 |             if u_topology < 0.2:
1482 |                 unchange = True 
1483 |             else:
1484 |                 expand = True 
1485 |         else: ## dmg topology 
1486 |             nonleaf_idx = leaf2nonleaf[leaf_idx]
1487 |             shrink_prob = sigmoid((adjacency_weight[leaf_idx,nonleaf_idx] + adjacency_weight[nonleaf_idx,leaf_idx])/2)
1488 |             # if u_topology < shrink_prob:
1489 |             if False:
1490 |                 shrink = True 
1491 |             else:
1492 |                 u_topology2 = random.random() 
1493 |                 expand_prob = (adjacency_weight[leaf_idx,extend_idx] + adjacency_weight[extend_idx,leaf_idx])/2
1494 |                 if u_topology2 < expand_prob:
1495 |                     expand_prob = True
1496 |                 else:
1497 |                     unchange = True 
1498 | 
1499 |         if shrink or unchange: 
1500 | 
1501 |             leaf_atom_idx_lst = origin_substructure_lst[leaf_idx]
1502 |             if type(leaf_atom_idx_lst)==int:  ### single atom
1503 |                 new_leaf_atom_idx_lst = [leaf_atom_idx_lst]
1504 |             else:  #### ring     
1505 |                 ### consider the case that ring1 and ring2 share 2 atoms and 1 bond. 
1506 |                 new_leaf_atom_idx_lst = []
1507 |                 remaining_atoms_idx_lst = []
1508 |                 for i,v in enumerate(origin_substructure_lst):
1509 |                     if i==leaf_idx:
1510 |                         continue 
1511 |                     if type(v)==int:
1512 |                         remaining_atoms_idx_lst.append(v)
1513 |                     else: #### list 
1514 |                         remaining_atoms_idx_lst.extend(v)
1515 |                 new_leaf_atom_idx_lst = [leaf_atom_idx for leaf_atom_idx in leaf_atom_idx_lst if leaf_atom_idx not in remaining_atoms_idx_lst]
1516 |             ### leaf_atom_idx_lst v.s. new_leaf_atom_idx_lst 
1517 |             ### consider the case that ring1 and ring2 share 2 atoms and 1 bond. 
1518 |             result = delete_substructure_at_idx(editmol = origin_mol, atom_idx_lst = new_leaf_atom_idx_lst) 
1519 |             if result is None: 
1520 |                 continue
1521 |             delete_mol, old_idx2new_idx = result
1522 |             delete_smiles = Chem.MolToSmiles(delete_mol)
1523 |             if delete_smiles is None or '.' in delete_smiles:
1524 |                 continue
1525 |             delete_smiles = canonical(delete_smiles)
1526 |             if shrink:
1527 |                 new_smiles_set.add(delete_smiles)
1528 |             if unchange:
1529 |                 ######### (a) get neighbor substr
1530 |                 neighbor_substructures_idx = [idx for idx,value in enumerate(origin_adjacency_matrix[leaf_idx]) if value==1]
1531 |                 assert len(neighbor_substructures_idx)==1 
1532 |                 neighbor_substructures_idx = neighbor_substructures_idx[0]
1533 |                 neighbor_atom_idx_lst = origin_substructure_lst[neighbor_substructures_idx]
1534 |                 if type(neighbor_atom_idx_lst)==int:
1535 |                     neighbor_atom_idx_lst = [neighbor_atom_idx_lst] 
1536 |                 ######### (b) add new substructure  todo, enumerate several possibility 
1537 |                 if random_substr: ## random sample 
1538 |                     added_substructure_lst = random.choices(list(range(len(vocabulary))), k=topk)
1539 |                 else: ## dmg sampling
1540 |                     u = random.random()
1541 |                     if u < epsilon:
1542 |                         added_substructure_lst = list(np.argsort(-node_indicator[leaf_idx]))[:topk]  ### topk (greedy)
1543 |                     else:
1544 |                         added_substructure_lst = random.choices(population=list(range(len(vocabulary))), weights = node_indicator[leaf_idx], k=topk + 3)
1545 |                         added_substructure_lst = list(set(added_substructure_lst))[:topk]  ### avoid repetition
1546 |                 for substructure_idx in added_substructure_lst: 
1547 |                     new_substructure = vocabulary[substructure_idx]
1548 |                     for new_bond in bondtype_list:
1549 |                         for leaf_atom_idx in neighbor_atom_idx_lst:
1550 |                             new_leaf_atom_idx = old_idx2new_idx[leaf_atom_idx] 
1551 |                             if ith_substructure_is_atom(substructure_idx):
1552 |                                 new_smiles = add_atom_at_position(editmol = delete_mol, position_idx = new_leaf_atom_idx, 
1553 |                                                                   new_atom = new_substructure, new_bond = new_bond)
1554 |                                 new_smiles_set.add(new_smiles)
1555 |                             else:
1556 |                                 new_smiles_batch = add_fragment_at_position(editmol = delete_mol, position_idx = new_leaf_atom_idx, 
1557 |                                                                             fragment = new_substructure, new_bond = new_bond)
1558 |                                 new_smiles_set = new_smiles_set.union(new_smiles_batch)
1559 |         else:  ## expand 
1560 | 
1561 |             leaf_atom_idx_lst = origin_substructure_lst[leaf_idx]
1562 |             if type(leaf_atom_idx_lst)==int:  ### int: single atom;   else: list of integer
1563 |                 leaf_atom_idx_lst = [leaf_atom_idx_lst]
1564 |             if random_substr:
1565 |                 added_substructure_lst = random.choices(list(range(len(vocabulary))), k=topk)
1566 |             else:    
1567 |                 for leaf_atom_idx in leaf_atom_idx_lst:
1568 |                     u = random.random() 
1569 |                     if u < epsilon:
1570 |                         added_substructure_lst = list(np.argsort(-node_indicator[extend_idx]))[:topk] 
1571 |                     else:
1572 |                         added_substructure_lst = random.choices(population=list(range(len(vocabulary))), weights = node_indicator[extend_idx], k=topk + 3)
1573 |                         added_substructure_lst = list(set(added_substructure_lst))[:topk]  ### avoid repetition
1574 |                     for substructure_idx in added_substructure_lst:
1575 |                         new_substructure = vocabulary[substructure_idx]
1576 |                         for new_bond in bondtype_list:
1577 |                             if ith_substructure_is_atom(substructure_idx):
1578 |                                 new_smiles = add_atom_at_position(editmol = origin_mol, position_idx = leaf_atom_idx, 
1579 |                                                                   new_atom = new_substructure, new_bond = new_bond)
1580 |                                 new_smiles_set.add(new_smiles)
1581 |                             else:
1582 |                                 new_smiles_batch = add_fragment_at_position(editmol = origin_mol, position_idx = leaf_atom_idx, 
1583 |                                                                             fragment = new_substructure , new_bond = new_bond)
1584 |                                 new_smiles_set = new_smiles_set.union(new_smiles_batch)
1585 | 
1586 | 
1587 | 
1588 |     return new_smiles_set.difference(set([None]))  
1589 | 
1590 | 
def draw_smiles(smiles, figfile_name):
    """Render the molecule described by a SMILES string to an image file.

    Args:
        smiles: SMILES string of the molecule to draw.
        figfile_name: path of the image file to write.

    Raises:
        ValueError: if RDKit cannot parse ``smiles`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit reports an unparsable SMILES by returning None instead of
        # raising; fail here with a readable message rather than deep inside
        # the drawing call.
        raise ValueError("invalid SMILES string: {!r}".format(smiles))
    # Image is rendered at a fixed 300x180 pixel canvas.
    Draw.MolToImageFile(mol, figfile_name, size=(300, 180))
1595 | 
1596 | 
1597 | 
1598 | 
1599 | 
if __name__ == "__main__":
    # Manual smoke test: render one hard-coded molecule to an image file.
    import os

    # s = 'FC1NCC(-C1=CC=CC(Br)=C1)C1'
    s = 'C1=CC=CC=C1NC2=NC=CC(F)=N2'
    figfile = "figure/tmp.png"
    # The output directory may not exist yet; create it up front so writing
    # the image does not fail with a missing-directory error.
    os.makedirs(os.path.dirname(figfile), exist_ok=True)
    draw_smiles(s, figfile)
1605 |     # rawdata_file = "raw_data/zinc.tab"
1606 |     # with open(rawdata_file) as fin:
1607 |     #     lines = fin.readlines()[1:]
1608 |     #     smiles_lst = [line.strip().strip('"') for line in lines]
1609 | 
1610 |     # from random import shuffle 
1611 |     # # shuffle(smiles_lst)
1612 |     # fragment_lst = ['C1NCC1', 'C1CNCCN1', 'C1=CC=CC=C1', 'C1CNNC1']
1613 | 
1614 | 
1615 |     # smiles = smiles_lst[0]
1616 |     # differentiable_graph = smiles2differentiable_graph(smiles)  
1617 |     # ### optimize differentiable_graph using GNN   
1618 |     # smiles_set = differentiable_graph2smiles(origin_smiles = smiles, differentiable_graph = differentiable_graph, max_num_offspring = 100)
1619 |     # print(len(smiles_set))
1620 | 
1621 |     # s = "CCc1ccc(Nc2nc(-c3ccccc3)cs2)cc1"
1622 |     # s = 'Oc1ccc(Nc2nc(-c3ccc(Cl)cc3)cs2)cc1'
1623 |     # draw_smiles(s, "figure/tmp.png")
1624 |     # from tdc import Oracle 
1625 |     # qed = Oracle('qed')
1626 |     # logp = Oracle('logp')
1627 |     # jnk = Oracle('jnk3')
1628 |     # gsk = Oracle('gsk3b')
1629 |     # print(qed(s), logp(s), jnk(s), gsk(s))
1630 | 
1631 | 
1632 |     # smiles_lst = ['NO', 'ONO', 'CNO', 'CS']
1633 |     # print(similarity_matrix(smiles_lst))
1634 | 
1635 | 
1636 | 
1637 |     ##### test over zinc 
1638 |     # for smiles in tqdm(smiles_lst):
1639 |     #     mol = Chem.MolFromSmiles(smiles)
1640 |     #     print(smiles)
1641 |     #     new_smiles_lst = []
1642 |     #     for idx in range(mol.GetNumAtoms()):
1643 |     #         for fragment in fragment_lst:
1644 |     #             smiles_set = add_fragment_at_position(editmol = mol, position_idx = idx, fragment = fragment, new_bond = bondtype_list[0])
1645 |     #             new_smiles_lst.extend(list(smiles_set))
1646 |     #         new_smiles_lst = list(set(new_smiles_lst))
1647 |     #     print("length of smiles set is", len(new_smiles_lst))
1648 | 
1649 | 
1650 | 
1651 |     ### single test
1652 |     # smiles = 'CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1'
1653 |     # draw_smiles(smiles, "figure/origin.png")
1654 |     # fragment = 'C1CCNCN1'
1655 |     # mol = Chem.MolFromSmiles(smiles)
1656 |     # for idx in range(mol.GetNumAtoms()):
1657 |     #     smiles_set = add_fragment_at_position(editmol = mol, position_idx = idx, fragment = fragment, new_bond = bondtype_list[0])
1658 |     #     print("length of smiles set is", len(smiles_set), smiles_set)
1659 |     #     for i,smiles in enumerate(smiles_set):
1660 |     #         name = "figure/" + str(idx) + '_' + str(i) + '.png'
1661 |     #         draw_smiles(smiles, name)
1662 | 
1663 | 
1664 | 
1665 | 
1666 | '''
1667 | 
1668 | "CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1",
1669 | "C[C@@H]1CC(Nc2cncc(-c3nncn3C)c2)C[C@@H](C)C1",
1670 | "N#Cc1ccc(-c2ccc(O[C@@H](C(=O)N3CCCC3)c3ccccc3)cc2)cc1",
1671 | "CCOC(=O)[C@@H]1CCCN(C(=O)c2nc(-c3ccc(C)cc3)n3c2CCCCC3)C1",
1672 | "N#CC1=C(SCC(=O)Nc2cccc(Cl)c2)N=C([O-])[C@H](C#N)C12CCCCC2",
1673 | "CC[NH+](CC)[C@](C)(CC)[C@H](O)c1cscc1Br"
1674 | 
1675 | CCc1ccc(Nc2nc(-c3ccccc3)cs2)cc1
1676 | 
1677 | 
1678 | 
1679 | rawdata_file = "raw_data/zinc.tab"
1680 | with open(rawdata_file) as fin:
1681 | 	lines = fin.readlines()[1:]
1682 | 	smiles_lst = [line.strip().strip('"') for line in lines]
1683 | 
1684 | 
1685 | 
1686 | test case:
1687 |     
1688 |     smiles         fragment 
1689 |     C1CCCC1         C1NCC1
1690 |     C1=CC=CC=C1    C1CNCCN1  
1691 |     C1=CC=CC=C1    C1CCNCN1
1692 |     CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1   
1693 | '''
1694 | 
1695 | 
1696 | 


--------------------------------------------------------------------------------