├── .gitignore ├── LICENSE ├── README.md ├── execute_fragmentation.py └── fragmenter.py /.gitignore: -------------------------------------------------------------------------------- 1 | # ignore all logs 2 | *.log 3 | __pycache__ 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Simon 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## fragmentation_algorithm_paper 2 | This is the immutable version of the algorithm for the following paper: 3 | - [**Flexible Heuristic Algorithm for Automatic Molecule Fragmentation: Application to the UNIFAC Group Contribution Model**](https://doi.org/10.1186/s13321-019-0382-3) 4 | 5 | # ⚠️If you are interested in a new version⚠️ 6 | - [New version of the fragmentation algorithm](https://github.com/simonmb/fragmentation_algorithm) 7 | - Why is there a newer version? 8 | - The fragmentation algorithm originally published with [the paper](https://doi.org/10.1186/s13321-019-0382-3) tried to find smaller functional groups that are contained within other larger functional groups to sort them automatically in an intelligent way. This turned out to be quite difficult to implement as the capabilities of RDKit to match SMARTS with SMARTS are limited. This led me to write workarounds that broke in subsequent RDKit versions. 9 | - The newer version does not try to automatically sort the groups you are searching for but relies on the user to provide this order. This allows the algorithm to be used with recent RDKit versions without problems. 
10 | -------------------------------------------------------------------------------- /execute_fragmentation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' Code to execute and evaluate fragmenting molecules into molecular subgroups 3 | 4 | MIT License 5 | 6 | Copyright (C) 2019, Simon Mueller 7 | 8 | Permission is hereby granted, free of charge, to any person obtaining a copy 9 | of this software and associated documentation files (the "Software"), to deal 10 | in the Software without restriction, including without limitation the rights 11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | copies of the Software, and to permit persons to whom the Software is 13 | furnished to do so, subject to the following conditions: 14 | 15 | The above copyright notice and this permission notice shall be included in all 16 | copies or substantial portions of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 21 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | SOFTWARE.''' 25 | 26 | # tested with Python 3.6.8 and RDKit version 2017.09.3 27 | 28 | import operator 29 | from fragmenter import fragmenter 30 | from rdkit import Chem 31 | 32 | def info_to_CSV(inchikey, SMILES, pubchem_id, fragmentation): 33 | 34 | fragmentation_array = [] 35 | for group_number, amount in fragmentation.items(): 36 | fragmentation_array.append(str(group_number) + ":" + str(amount)) 37 | 38 | return inchikey + "," + SMILES + "," + pubchem_id + "," + "|".join(fragmentation_array) 39 | 40 | def CSV_to_info(CSV_line, has_fragmentation = False): 41 | CSV_line = CSV_line.replace('\n', '') 42 | array = CSV_line.split(',') 43 | 44 | fragmentation = {} 45 | 46 | if has_fragmentation: 47 | fragmentation_array = array[3].split('|') 48 | for match_str in fragmentation_array: 49 | array2 = match_str.split(':') 50 | group_number = int(array2[0]) 51 | amount = int(array2[1]) 52 | 53 | fragmentation[group_number] = amount 54 | 55 | return array[0], array[1], array[2], fragmentation 56 | 57 | 58 | def function_to_choose_fragmentation(fragmentations): 59 | fragmentations_descriptors = {} 60 | i = 0 61 | for fragmentation in fragmentations: 62 | fragmentations_descriptors[i] = [len(fragmentation)] 63 | i += 1 64 | 65 | sorted_fragmentations_dict = sorted(fragmentations_descriptors.items(), key=operator.itemgetter(1)) 66 | 67 | return fragmentations[sorted_fragmentations_dict[0][0]] 68 | 69 | def is_fragmentation_equal_to_other_fragmentation(fragmentation, other_fragmentation): 70 | 71 | for group_number, amount in fragmentation.items(): 72 | if group_number in other_fragmentation: 73 | if fragmentation[group_number] != other_fragmentation[group_number]: 74 | return False 75 | return True 76 | 77 | 
def log_structure_results(f, pubchem_id, SMILES, inchikey, success, fragmentation, fragmentation_reference_DB, status = ''):
    ''' Append a human-readable report for one structure to the open log file f.

    f                          -- writable text file object
    pubchem_id, SMILES,
    inchikey                   -- identifiers of the structure (strings)
    success                    -- whether the algorithm fragmented the structure
    fragmentation              -- {group_number: amount} found by the algorithm;
                                  only written when success is truthy
    fragmentation_reference_DB -- {group_number: amount} from the reference
                                  database; only written when non-empty
    status                     -- optional extra line written below the
                                  success flag

    Group names are looked up in the module-level UNIFAC_SMARTS table, which
    is indexed by group_number - 1.
    '''

    def write_group_table(groups):
        # one row per group, ordered by group number: name, number, amount
        for group_number in sorted(groups):
            f.write((UNIFAC_SMARTS[group_number - 1][0]).ljust(12, ' ') + '\t' + str(group_number).ljust(8, ' ') + str(groups[group_number]).ljust(8, ' ') + '\n')

    f.write('https://pubchem.ncbi.nlm.nih.gov/compound/' + pubchem_id + '#section=2D-Structure\n')
    f.write(SMILES + '\n')
    f.write(inchikey + '\n')
    f.write('\n' + 'Fragmentation was successfull: ' + str(success) + '\n')

    if status != '':
        f.write(status + '\n')

    if success:
        f.write('Fragmentation from the algorithm:\n')
        write_group_table(fragmentation)
        f.write('\n')

    if len(fragmentation_reference_DB) > 0:
        f.write('Fragmentation from the reference database:\n')
        write_group_table(fragmentation_reference_DB)

    f.write('\n\n')


# (group name, SMARTS) pairs; the group number used everywhere else is the
# 1-based position in this list. A SMARTS entry may be a list of alternative
# patterns, and an empty string marks a group without a pattern.
UNIFAC_SMARTS = [
    ("CH3", "[CH3;X4]"),
    ("CH2", "[CH2;X4]"),
    ("CH", "[CH1;X4]"),
    ("C", ["[CH0;X4]", "[CH0;X3]"]),
    ("CH2=CH", "[CH2]=[CH]"),
    ("CH=CH", "[CH]=[CH]"),
    ("CH2=C", ["[CH2]=[C]", "[CH2]=[c]"]),
    ("CH=C", ["[CH]=[CH0]", "[CH]=[cH0]"]),
    ("ACH", "[cH]"),
    ("AC", "[cH0]"),
    ("ACCH3", "[c][CH3;X4]"),
    ("ACCH2", "[c][CH2;X4]"),
    ("ACCH", "[c][CH;X4]"),
    ('OH', "[OH]"),
    ('CH3OH', "[CH3][OH]"),
    ('H2O', "[OH2]"),
    ('ACOH', "[c][OH]"),
    ("CH3CO", "[CH3][CH0]=O"),
    ("CH2CO", "[CH2][CH0]=O"),
    ("CH=O", "[CH]=O"),
    ("CH3COO", "[CH3]C(=O)[OH0]"),
    ("CH2COO", "[CH2]C(=O)[OH0]"),
    ("HCOO", "[CH](=O)[OH0]"),
    ("CH3O", "[CH3][OH0]"),
    ("CH2O", "[CH2][OH0]"),
    ("CHO", "[CH][OH0]"),
    ("THF", "[CH2;R][OH0]"),
    ("CH3NH2", "[CH3][NH2]"),
    ("CH2NH2", "[CH2][NH2]"),
    ("CHNH2", "[CH][NH2]"),
    ("CH3NH", "[CH3][NH]"),
    ("CH2NH", "[CH2][NH]"),
    ("CHNH", "[CH][NH]"),
    ("CH3N", ["[CH3][N]", "[CH3][n]"]),
    ("CH2N", "[CH2][N]"),
    ("ACNH2", "[c][NH2]"),
    ("C5H5N", "n1[cH][cH][cH][cH][cH]1"),
    ("C5H4N", ["n1[c][cH][cH][cH][cH]1",
               "n1[cH][c][cH][cH][cH]1",
               "n1[cH][cH][c][cH][cH]1"]),
    ('C5H3N', ["n1[c][c][cH][cH][cH]1",
               "n1[c][cH][c][cH][cH]1",
               "n1[c][cH][cH][c][cH]1",
               "n1[c][cH][cH][cH][c]1",
               "n1[cH][c][c][cH][cH]1",
               "n1[cH][c][cH][c][cH]1"]),
    ("CH3CN", "[CH3]C#N"),
    ("CH2CN", "[CH2]C#N"),
    ("COOH", "C(=O)[OH]"),
    ("HCOOH", "[CH](=O)[OH]"),
    ("CH2Cl", "[CH2]Cl"),
    ("CHCl", "[CH]Cl"),
    ("CCl", "[CH0]Cl"),
    ("CH2Cl2", "[CH2](Cl)Cl"),
    ("CHCl2", "[CH](Cl)Cl"),
    ("CCl2", "C(Cl)Cl"),
    ("CHCl3", "[CH](Cl)(Cl)Cl"),
    ("CCl3", "C(Cl)(Cl)(Cl)"),
    ("CCl4", "C(Cl)(Cl)(Cl)(Cl)"),
    ("ACCl", "[c]Cl"),
    ("CH3NO2", "[CH3][N+](=O)[O-]"),
    ("CH2NO2", "[CH2][N+](=O)[O-]"),
    ("CHNO2", "[CH][N+](=O)[O-]"),
    ("ACNO2", "[c][N+](=O)[O-]"),
    ("CS2", "C(=S)=S"),
    ("CH3SH", "[CH3][SH]"),
    ("CH2SH", "[CH2][SH]"),
    ("Furfural", "O=[CH]c1[cH][cH][cH]o1"),
    ("DOH", "[OH][CH2][CH2][OH]"),
    ("I", "[IH0]"),
    ("Br", "[BrH0]"),
    ("CH#C", "[CH]#C"),
    ("C#C", "C#C"),
    ("DMSO", "[CH3]S(=O)[CH3]"),
    ("ACRY", "[CH2]=[CH1][C]#N"),
    ("Cl(C=C)", "[$(Cl[C]=[C])]"),
    ("C=C", "[CH0]=[CH0]"),
    ("ACF", "[c]F"),
    ("DMF", "[CH](=O)N([CH3])[CH3]"),
    ("HCON(CH2)2", ["[CH](=O)N([CH2])[CH2]", "[CH](=O)N([CH2])[CH3]"]),
    ("CF3", "C(F)(F)F"),
    ("CF2", "C(F)F"),
    ("CF", "[C]F"),
    ("COO", ["[CH0](=O)[OH0]", "[cH0](=O)[oH0]"]),
    ("SiH3", "[SiH3]"),
    ("SiH2", "[SiH2]"),
    ("SiH", "[SiH]"),
    ("Si", "[Si]"),
    ("SiH2O", "[SiH2][OH0]"),
    ("SiHO", "[SiH][OH0]"),
    ("SiO", "[Si][OH0]"),
    ("NMP", "[CH3]N1[CH2][CH2][CH2]C(=O)1"),
    ("CCl3F", "C(Cl)(Cl)(Cl)F"),
    ("CCl2F", "C(Cl)(Cl)F"),
    ("HCCl2F", "[CH](Cl)(Cl)F"),
    ("HCClF", "[CH](Cl)F"),
    ("CClF2", "C(Cl)(F)F"),
    ("HCClF2", "[CH](Cl)(F)F"),
    ("CClF3", "C(Cl)(F)(F)F"),
    ("CCl2F2", "C(Cl)(Cl)(F)F"),
    ("CONH2", "C(=O)[NH2]"),
    ("CONHCH3", "C(=O)[NH][CH3]"),
    ("CONHCH2", "C(=O)[NH][CH2]"),
    ("CON(CH3)2", "C(=O)N([CH3])[CH3]"),
    ("CONCH3CH2", "C(=O)N([CH3])[CH2]"),
    ("CON(CH2)2", "C(=O)N([CH2])[CH2]"),
    ("C2H5O2", "[OH0;!$(OC=O);!R][CH2;!R][CH2;!R][OH]"),
    ("C2H4O2", ["[OH0;!$(OC=O);!R][CH;!R][CH2;!R][OH]",
                "[OH0;!$(OC=O);!R][CH2;!R][CH;!R][OH]"]),
    ("CH3S", "[CH3]S"),
    ("CH2S", "[CH2]S"),
    ("CHS", "[CH]S"),
    ("MORPH", "[CH2]1[CH2][NH][CH2][CH2]O1"),
    ("C4H4S", "[cH]1[cH][s;X2][cH][cH]1"),
    ('C4H3S', ["[c]1[cH][s;X2][cH][cH]1",
               "[cH]1[c][s;X2][cH][cH]1"]),
    ('C4H2S', ["[c]1[c][s;X2][cH][cH]1",
               "[c]1[cH][s;X2][cH][c]1",
               "[cH]1[c][s;X2][c][cH]1",
               "[cH]1[c][s;X2][cH][c]1"]),
    ("NCO", "N=C=O"),
    ("H2COCH", ""), # "[CH2]1[CH]O1"
    ("HCOCH", ""), # "[CH]1[CH]O1"
    ("COCH", ""), # "C1[CH]O1"
    ("H2COCH2", ""), # "[CH2]1[CH2]O1"
    ("OCOCO", ""), # "C(=O)OC(=O)"),
    ("(CH3O)2CO", ""), # "[CH3]OC(=O)O[CH3]"
    ("(CH2O)2CO", ""), # "[CH2]OC(=O)O[CH2]"
    ("CH3OCH2OCO", ""),# "[CH3]OC(=O)O[CH2]"
    ("(CH2)2SU", "[CH2]S(=O)(=O)[CH2]"),
    ("CH2CHSU", "[CH2]S(=O)(=O)[CH]")]

# get the fragmentation scheme in the format necessary:
# {group_number (1-based): SMARTS or list of SMARTS}
fragmentation_scheme = {i+1: j[1] for i, j in enumerate(UNIFAC_SMARTS)}

# sort the fragmentation scheme according to the descriptors
# (groups without a SMARTS get no descriptor and are therefore not part of
# the sorted order; for a list of SMARTS only the first one is analyzed)
pattern_descriptors = {}
for group_number, SMARTS in fragmentation_scheme.items():
    if isinstance(SMARTS, list):
        SMARTS = SMARTS[0]

    if SMARTS != "":
        pattern = fragmenter.get_mol_with_properties_from_SMARTS(SMARTS)

        # the descriptors are compared element by element, so the position
        # in this list is the sort priority
        pattern_descriptors[group_number] = [
            pattern.GetUnsignedProp('n_available_bonds') == 0,
            (pattern.GetBoolProp('is_simple_atom_on_c') or pattern.GetBoolProp('is_simple_atom')),
            pattern.GetUnsignedProp('n_atoms_defining_SMARTS'),
            pattern.GetUnsignedProp('n_available_bonds') == 1,
            fragmenter.get_heavy_atom_count(pattern) - pattern.GetUnsignedProp('n_carbons'),
            pattern.GetBoolProp('has_atoms_in_ring'),
            pattern.GetUnsignedProp('n_triple_bonds'),
            pattern.GetUnsignedProp('n_double_bonds')]

# descending order: the largest descriptor list comes first
sorted_pattern_descriptors = sorted(pattern_descriptors.items(), key=operator.itemgetter(1), reverse=True)
sorted_group_numbers = [i[0] for i in sorted_pattern_descriptors]


# first step: fragment reference database and compare with the results
with open('reference_DB.csv') as f:
    reference_DB = [CSV_to_info(line, True) for line in f]

# total amount of every group over the whole reference database
reference_DB_fragmentation_stats = {}

simple_fragmenter_fragmented = []
simple_fragmenter_fragmented_and_equal_to_reference_DB = []
complete_fragmenter_fragmented = []
complete_fragmenter_fragmented_and_equal_to_reference_DB = []

right_size_for_complete_fragmenter = []

simple_fragmenter = fragmenter(fragmentation_scheme, 'simple')
complete_fragmenter = fragmenter(fragmentation_scheme, 'complete', 20, function_to_choose_fragmentation)

# without sorting the patterns
print('####################################################################')
print('Fragmenting the reference database without the patterns sorted (simple and complete algorithm)')

i_structure = 0
# the with statement closes both logs even if a structure raises
with open('reference_DB_simple_fragmentation_without_pattern_sorting_results.log','w+') as f_simple, \
     open('reference_DB_complete_fragmentation_without_pattern_sorting_results.log','w+') as f_complete:
    for inchikey, SMILES, pubchem_id, fragmentation_reference_DB in reference_DB:

        i_structure = i_structure + 1
        if i_structure % 2000 == 0:
            # progress in percent
            print('{:2.1f} .'.format((100.0 * i_structure) / len(reference_DB)), end=" ")

        for group_number, amount in fragmentation_reference_DB.items():
            reference_DB_fragmentation_stats[group_number] = reference_DB_fragmentation_stats.get(group_number, 0) + amount

        fragmentation, success = simple_fragmenter.fragment(SMILES)
        if success:
            simple_fragmenter_fragmented.append(inchikey)
            if is_fragmentation_equal_to_other_fragmentation(fragmentation, fragmentation_reference_DB):
                simple_fragmenter_fragmented_and_equal_to_reference_DB.append(inchikey)

        log_structure_results(f_simple, pubchem_id, SMILES, inchikey, success, fragmentation, fragmentation_reference_DB)

        # size of the largest component of a multi-component ('.'-separated) SMILES
        n_heavy_atoms = 0
        for sub_SMILES in SMILES.split("."):
            n_heavy_atoms = max(n_heavy_atoms, fragmenter.get_heavy_atom_count(Chem.MolFromSmiles(sub_SMILES)))

        if n_heavy_atoms <= 20:
            right_size_for_complete_fragmenter.append(inchikey)
            fragmentation, success = complete_fragmenter.fragment(SMILES)
            if success:
                complete_fragmenter_fragmented.append(inchikey)
                if is_fragmentation_equal_to_other_fragmentation(fragmentation, fragmentation_reference_DB):
                    complete_fragmenter_fragmented_and_equal_to_reference_DB.append(inchikey)

            log_structure_results(f_complete, pubchem_id, SMILES, inchikey, success, fragmentation, fragmentation_reference_DB)
        else:
            # The complete algorithm was not run, so there is no result to
            # report. Previously the simple fragmenter's success flag and
            # fragmentation leaked into this log entry.
            log_structure_results(f_complete, pubchem_id, SMILES, inchikey, False, {}, fragmentation_reference_DB, 'Structure was skipped because it is larger than 20 atoms.')


print('')
print('N_structures(simple): ' + str(len(reference_DB)))
print('N_fragmented(simple): ' + str(len(simple_fragmenter_fragmented)) + "(" + str((1.0 * len(simple_fragmenter_fragmented)) / len(reference_DB)) + ")")
print('N_fragmented_and_equal(simple): ' + str(len(simple_fragmenter_fragmented_and_equal_to_reference_DB)) + "(" + str((1.0 * len(simple_fragmenter_fragmented_and_equal_to_reference_DB)) / len(reference_DB)) + ")")
print('')
print('N_structures(complete):' + str(len(right_size_for_complete_fragmenter)))
print('N_fragmented(complete): ' + str(len(complete_fragmenter_fragmented)) + "(" + str((1.0 * len(complete_fragmenter_fragmented)) / len(right_size_for_complete_fragmenter)) + ")")
print('N_fragmented_and_equal(complete): ' + str(len(complete_fragmenter_fragmented_and_equal_to_reference_DB)) + "(" + str((1.0 * len(complete_fragmenter_fragmented_and_equal_to_reference_DB)) / len(right_size_for_complete_fragmenter)) + ")")
print('')
print('')
print('')
print('')

right_size_for_complete_fragmenter2 = []

# with sorting the patterns
simple_fragmenter.fragmentation_scheme_order = sorted_group_numbers
complete_fragmenter.fragmentation_scheme_order = sorted_group_numbers

simple_fragmenter_sorted_fragmented = []
simple_fragmenter_sorted_fragmented_and_equal_to_reference_DB = []
complete_fragmenter_sorted_fragmented = []
complete_fragmenter_sorted_fragmented_and_equal_to_reference_DB = []


print('####################################################################')
print('Fragmenting the reference database with the patterns sorted (simple and complete algorithm)')
i_structure = 0
# Second pass over the reference database, now with the pattern order set on
# both fragmenters. Every structure is fragmented with the simple algorithm;
# the complete algorithm is only run when the largest component has at most
# 20 heavy atoms (presumably mirroring the 20 passed to the complete
# fragmenter's constructor -- confirm against the fragmenter class).
# The with statement closes both logs even if a structure raises.
with open('reference_DB_simple_fragmentation_with_pattern_sorting_results.log','w+') as f_simple, \
     open('reference_DB_complete_fragmentation_with_pattern_sorting_results.log','w+') as f_complete:
    for inchikey, SMILES, pubchem_id, fragmentation_reference_DB in reference_DB:

        i_structure = i_structure + 1

        if i_structure % 2000 == 0:
            # progress in percent
            print('{:2.1f} .'.format((100.0 * i_structure) / len(reference_DB)), end=" ")

        fragmentation, success = simple_fragmenter.fragment(SMILES)
        if success:
            simple_fragmenter_sorted_fragmented.append(inchikey)
            if is_fragmentation_equal_to_other_fragmentation(fragmentation, fragmentation_reference_DB):
                simple_fragmenter_sorted_fragmented_and_equal_to_reference_DB.append(inchikey)

        log_structure_results(f_simple, pubchem_id, SMILES, inchikey, success, fragmentation, fragmentation_reference_DB)

        # size of the largest component of a multi-component ('.'-separated) SMILES
        n_heavy_atoms = 0
        for sub_SMILES in SMILES.split("."):
            n_heavy_atoms = max(n_heavy_atoms, fragmenter.get_heavy_atom_count(Chem.MolFromSmiles(sub_SMILES)))

        if n_heavy_atoms <= 20:
            right_size_for_complete_fragmenter2.append(inchikey)
            fragmentation, success = complete_fragmenter.fragment(SMILES)
            if success:
                complete_fragmenter_sorted_fragmented.append(inchikey)
                if is_fragmentation_equal_to_other_fragmentation(fragmentation, fragmentation_reference_DB):
                    complete_fragmenter_sorted_fragmented_and_equal_to_reference_DB.append(inchikey)

            log_structure_results(f_complete, pubchem_id, SMILES, inchikey, success, fragmentation, fragmentation_reference_DB)
        else:
            # The complete algorithm was not run, so there is no result to
            # report. Previously the simple fragmenter's success flag and
            # fragmentation leaked into this log entry.
            log_structure_results(f_complete, pubchem_id, SMILES, inchikey, False, {}, fragmentation_reference_DB, 'Structure was skipped because it is larger than 20 atoms.')

print('')
print('N_structures(simple): ' + str(len(reference_DB)))
print('N_fragmented(simple): ' + str(len(simple_fragmenter_sorted_fragmented)) + "(" + str((1.0 * len(simple_fragmenter_sorted_fragmented)) / len(reference_DB)) + ")") 408 | print('N_fragmented_and_equal(simple): ' + str(len(simple_fragmenter_sorted_fragmented_and_equal_to_reference_DB)) + "(" + str((1.0 * len(simple_fragmenter_sorted_fragmented_and_equal_to_reference_DB)) / len(reference_DB)) + ")") 409 | print('') 410 | print('N_structures(complete):' + str(len(right_size_for_complete_fragmenter2))) 411 | print('N_fragmented(complete): ' + str(len(complete_fragmenter_sorted_fragmented)) + "(" + str((1.0 * len(complete_fragmenter_sorted_fragmented)) / len(right_size_for_complete_fragmenter2)) + ")") 412 | print('N_fragmented_and_equal(complete): ' + str(len(complete_fragmenter_sorted_fragmented_and_equal_to_reference_DB)) + "(" + str((1.0 * len(complete_fragmenter_sorted_fragmented_and_equal_to_reference_DB)) / len(right_size_for_complete_fragmenter2)) + ")") 413 | print('') 414 | print('') 415 | print('') 416 | print('') 417 | 418 | 419 | # second step: try to fragent all from the component 420 | structures_DB = [] 421 | with open('structures_DB.csv') as f: 422 | for line in f.readlines(): 423 | structures_DB.append(CSV_to_info(line)) 424 | 425 | combined_fragmenter = fragmenter(fragmentation_scheme, 'combined', 20, function_to_choose_fragmentation, 1) 426 | combined_fragmenter.fragmentation_scheme_order = sorted_group_numbers 427 | combined_fragmenter.n_max_fragmentations_to_find = 1 428 | 429 | combined_fragmenter_sorted_fragmented = [] 430 | right_size_for_combined_fragmenter = [] 431 | print('####################################################################') 432 | print('Fragmenting the structures database with the patterns sorted (combined algorithm)') 433 | i_structure = 0 434 | f_combined = open('structures_DB_combined_fragmentation_with_pattern_sorting_results.log','w+') 435 | for inchikey, SMILES, pubchem_id, empty_fragmentation in structures_DB: 436 
| 437 | i_structure = i_structure + 1 438 | if i_structure % 4000 == 0: 439 | print('{:2.1f} .'.format((100.0 * i_structure) / len(structures_DB)), end=" ") 440 | 441 | fragmentation, success = combined_fragmenter.fragment(SMILES) 442 | if success: 443 | combined_fragmenter_sorted_fragmented.append(inchikey) 444 | 445 | n_heavy_atoms = 0 446 | for sub_SMILES in SMILES.split("."): 447 | n_heavy_atoms = max(n_heavy_atoms, fragmenter.get_heavy_atom_count(Chem.MolFromSmiles(sub_SMILES))) 448 | 449 | if n_heavy_atoms <= 20: 450 | right_size_for_combined_fragmenter.append(inchikey) 451 | log_structure_results(f_combined, pubchem_id, SMILES, inchikey, success, fragmentation, {}) 452 | else: 453 | if success: 454 | log_structure_results(f_combined, pubchem_id, SMILES, inchikey, success, fragmentation, {}) 455 | else: 456 | log_structure_results(f_combined, pubchem_id, SMILES, inchikey, success, fragmentation, {}, 'Structure was skipped because it is larger than 20 atoms.') 457 | 458 | 459 | f_combined.close() 460 | 461 | print('') 462 | print('N_structures(simple): ' + str(len(structures_DB))) 463 | print('N_fragmented(simple): ' + str(len(combined_fragmenter_sorted_fragmented)) + "(" + str((1.0 * len(combined_fragmenter_sorted_fragmented)) / len(structures_DB)) + ")") 464 | print('') 465 | print('####################################################################') -------------------------------------------------------------------------------- /fragmenter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' Class for fragmenting molecules into molecular subgroups 3 | 4 | MIT License 5 | 6 | Copyright (C) 2019, Simon Mueller 7 | 8 | Permission is hereby granted, free of charge, to any person obtaining a copy 9 | of this software and associated documentation files (the "Software"), to deal 10 | in the Software without restriction, including without limitation the rights 11 | to use, copy, modify, merge, 
publish, distribute, sublicense, and/or sell 12 | copies of the Software, and to permit persons to whom the Software is 13 | furnished to do so, subject to the following conditions: 14 | 15 | The above copyright notice and this permission notice shall be included in all 16 | copies or substantial portions of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | SOFTWARE.''' 25 | 26 | 27 | 28 | class fragmenter: 29 | # tested with Python 3.6.8 and RDKit version 2017.09.3 30 | 31 | # import dependencies 32 | try: 33 | import rdkit as __rdkit 34 | from rdkit import Chem as __Chem 35 | import marshal as __marshal 36 | import regex as __regex 37 | 38 | except: 39 | raise Exception('rdkit, marshal and regex have to be installed.') 40 | 41 | 42 | if __rdkit.rdBase.rdkitVersion != '2017.09.3': 43 | raise Exception('In this code, the SMARTS are \ 44 | analyzed for their properties. Unfortunately different rdkit versions give different values. 
\ 45 | This code has only been tested on version \'2017.09.3\'.') 46 | # for more details have a look at the comments in the function get_mol_with_properties_from_SMARTS 47 | 48 | 49 | # get a molecule from a SMARTS with the properties necessary to calculate 50 | # more complex properties 51 | @classmethod 52 | def get_mol_from_SMARTS(cls, smarts): 53 | mol = cls.__Chem.MolFromSmarts(smarts) 54 | mol.UpdatePropertyCache(strict = False) 55 | cls.__Chem.GetSymmSSSR(mol) 56 | return mol 57 | 58 | # get a molecule from a SMARTS and calculate complex properties 59 | # this function could be improved by someone who knows RDkit better than me 60 | # 61 | # Unfortunately it only works corrrectly with version 2017.09.3 of RDKit, 62 | # for more details see the following issues: 63 | # https://github.com/rdkit/rdkit/issues/2448 64 | # https://github.com/rdkit/rdkit/issues/1978 65 | # 66 | # if anyone finds a better way of calculating the number of 67 | # bonds a SMARTS fragment has, please contact me or send a merge request 68 | # 69 | # if you are using conda you may install it in a new environemnt using the following command: conda install -c rdkit rdkit=2017.09.3 70 | 71 | @classmethod 72 | def get_mol_with_properties_from_SMARTS(cls, SMARTS): 73 | if cls.__rdkit.rdBase.rdkitVersion != '2017.09.3': 74 | print('#################WARNING########################') 75 | print('In this code, the SMARTS are \ 76 | analyzed programmatically for their properties. Unfortunately different rdkit versions give different values. 
\ 77 | This code has been developed with version \'2017.09.3\'.') 78 | # if you are using conda you may install it in a new environemnt using the following command: conda install -c rdkit rdkit=2017.09.3 79 | print('################################################') 80 | 81 | mol_SMARTS = fragmenter.get_mol_from_SMARTS(SMARTS) 82 | 83 | # some cleaning up 84 | # conditional SMARTS are not supported 85 | SMARTS = SMARTS.replace('-,:', '') 86 | conditional_statement_exists = SMARTS.count(',') > 0 87 | if conditional_statement_exists: 88 | print (SMARTS) 89 | raise ValueError('Algorithm can\'t handle conditional SMARTS. Please use a list of SMARTS in the fragmentation scheme to mimic conditional SMARTS.') 90 | 91 | n_atoms_defining_SMARTS = 0 92 | n_available_bonds = 0 93 | n_atoms_with_available_bonds = 0 94 | n_hydrogens_this_molecule = 0 95 | n_atoms_in_ring = 0 96 | n_carbons = 0 97 | found_atoms = [] 98 | 99 | # iterate over atoms to get their properties 100 | for atom in mol_SMARTS.GetAtoms(): 101 | SMARTS_atom = atom.GetSmarts() 102 | n_atoms_defining_SMARTS += 1 103 | n_available_bonds_this_atom = 0 104 | 105 | if atom.GetSymbol() == '*': 106 | matches = cls.__regex.finditer("\$?\(([^()]|(?R))*\)", SMARTS_atom) 107 | # if it is a recursive SMARTS 108 | if matches is not None: 109 | for match in matches: 110 | m = match.group(0) 111 | 112 | found_atoms.append(m[2:-1]) 113 | mol_SMARTS2 = fragmenter.get_mol_with_properties_from_SMARTS(m[2:-1]) 114 | n_atoms_defining_SMARTS += mol_SMARTS2.GetUnsignedProp('n_atoms_defining_SMARTS') 115 | 116 | first_atom = mol_SMARTS2.GetAtomWithIdx(0) 117 | n_hydrogens = first_atom.GetUnsignedProp('n_hydrogens') 118 | 119 | n_available_bonds_this_atom = first_atom.GetTotalValence() - n_hydrogens 120 | 121 | atom.SetUnsignedProp('n_hydrogens', n_hydrogens) 122 | atom.SetUnsignedProp('n_available_bonds', n_available_bonds_this_atom) 123 | 124 | if first_atom.GetAtomicNum() == 6: 125 | n_carbons += 1 126 | 127 | if len(found_atoms) 
== 1: 128 | n_atoms_defining_SMARTS -= 1 129 | elif len(found_atoms) > 1: 130 | raise ValueError('Algorithm can\'t handle SMARTS with 2 levels of recursion') 131 | else: 132 | atom.SetUnsignedProp('n_hydrogens', 0) 133 | atom.SetUnsignedProp('n_available_bonds', 0) 134 | else: 135 | 136 | # get number of hydrogens from SMARTS 137 | n_available_bonds_this_atom = atom.GetImplicitValence() 138 | 139 | n_hydrogens = 0 140 | match = cls.__regex.findall('AtomHCount\s+(\d+)\s*', atom.DescribeQuery()) 141 | if match: 142 | n_hydrogens = int(match[0]) 143 | 144 | n_available_bonds_this_atom -= n_hydrogens 145 | 146 | atom.SetUnsignedProp('n_hydrogens', n_hydrogens) 147 | 148 | if n_available_bonds_this_atom < 0: 149 | n_available_bonds_this_atom = 0 150 | 151 | atom.SetUnsignedProp('n_available_bonds', n_available_bonds_this_atom) 152 | 153 | if atom.GetAtomicNum() == 6: 154 | n_carbons += 1 155 | 156 | is_within_ring = False 157 | match = cls.__regex.findall('AtomInNRings\s+(-?\d+)\s+(!?=)\s*', atom.DescribeQuery()) 158 | if match: 159 | match = match[0] 160 | n_rings = int(match[0]) 161 | comparison_sign = match[1] 162 | 163 | if n_rings == -1: # if number of rings was not defined 164 | is_within_ring = comparison_sign == '=' 165 | else: # if number of rings was defined 166 | if comparison_sign == '=': 167 | is_within_ring = n_rings != 0 168 | else: 169 | is_within_ring = n_rings == 0 170 | 171 | if atom.GetIsAromatic() or is_within_ring: 172 | n_atoms_in_ring += 1 173 | 174 | n_available_bonds +=n_available_bonds_this_atom 175 | 176 | n_hydrogens_this_molecule += atom.GetUnsignedProp('n_hydrogens') 177 | 178 | if n_available_bonds_this_atom > 0: 179 | n_atoms_with_available_bonds += 1 180 | 181 | 182 | # find whether the SMARTS is simple 183 | atom_with_valence_one_on_carbon = cls.__Chem.MolFromSmarts('[*;v1][#6]') 184 | atom_with_valence_one_on_carbon.UpdatePropertyCache() 185 | 186 | atom_with_valence_one_on_excluding_carbon = 
cls.__Chem.MolFromSmarts('[$([*;v1][#6])]') 187 | atom_with_valence_one_on_excluding_carbon.UpdatePropertyCache() 188 | 189 | is_simple_atom_on_c = n_atoms_with_available_bonds == 1 and \ 190 | (mol_SMARTS.HasSubstructMatch(atom_with_valence_one_on_carbon) or \ 191 | mol_SMARTS.HasSubstructMatch(atom_with_valence_one_on_excluding_carbon)) 192 | 193 | atom_with_valence_one = cls.__Chem.MolFromSmarts('[*;v1;!#1]') 194 | atom_with_valence_one.UpdatePropertyCache() 195 | is_simple_atom = n_atoms_with_available_bonds == 1 and (mol_SMARTS.HasSubstructMatch(atom_with_valence_one)) 196 | 197 | 198 | if len(found_atoms) > 0: 199 | if len(found_atoms) == 1: 200 | sub_mol_SMARTS = cls.__Chem.MolFromSmarts(found_atoms[0]) 201 | sub_mol_SMARTS.UpdatePropertyCache() 202 | is_simple_atom_on_c = (sub_mol_SMARTS.HasSubstructMatch(atom_with_valence_one_on_carbon) or \ 203 | sub_mol_SMARTS.HasSubstructMatch(atom_with_valence_one_on_excluding_carbon)) 204 | is_simple_atom = n_atoms_defining_SMARTS == 1 and sub_mol_SMARTS.HasSubstructMatch(atom_with_valence_one) 205 | elif len(found_atoms) > 1: 206 | raise ValueError('Algorithm can\'t handle SMARTS with 2 recursive SMARTS') 207 | 208 | # set the gathered properties 209 | mol_SMARTS.SetUnsignedProp('n_hydrogens', n_hydrogens_this_molecule) 210 | mol_SMARTS.SetUnsignedProp('n_carbons', n_carbons) 211 | mol_SMARTS.SetUnsignedProp('n_available_bonds', n_available_bonds) 212 | mol_SMARTS.SetUnsignedProp('n_atoms_defining_SMARTS', n_atoms_defining_SMARTS) 213 | mol_SMARTS.SetUnsignedProp('n_atoms_with_available_bonds', n_atoms_with_available_bonds) 214 | mol_SMARTS.SetBoolProp('has_atoms_in_ring', n_atoms_in_ring > 0) 215 | mol_SMARTS.SetBoolProp('is_simple_atom', is_simple_atom) 216 | mol_SMARTS.SetBoolProp('is_simple_atom_on_c', is_simple_atom_on_c) 217 | mol_SMARTS.SetUnsignedProp('n_double_bonds', len(mol_SMARTS.GetSubstructMatches(cls.__Chem.MolFromSmarts("*=*")))) 218 | mol_SMARTS.SetUnsignedProp('n_triple_bonds', 
len(mol_SMARTS.GetSubstructMatches(cls.__Chem.MolFromSmarts("*#*")))) 219 | 220 | return mol_SMARTS 221 | 222 | # this function does a substructure match and then checks whether the match 223 | # is adjacent to previous matches and/or checks if the hydrogen number is correct 224 | @classmethod 225 | def get_substruct_matches(cls, mol_searched_for, mol_searched_in, atomIdxs_to_which_new_matches_have_to_be_adjacent, check_for_hydrogens = False): 226 | 227 | valid_matches = [] 228 | 229 | if mol_searched_in.GetNumAtoms() >= mol_searched_for.GetNumAtoms(): 230 | matches = mol_searched_in.GetSubstructMatches(mol_searched_for) 231 | 232 | if matches: 233 | for match in matches: 234 | all_hydrogens_OK = True 235 | 236 | if check_for_hydrogens: 237 | # following lines are a workaround for the fact that SMARTS 238 | # matching SMARTS is not working completely correctly as the number 239 | # of hydrogens is ignored in some cases by RDkit 240 | # 241 | # for more details see the following issues: 242 | # https://github.com/rdkit/rdkit/issues/2448 243 | # https://github.com/rdkit/rdkit/issues/1978 244 | 245 | for i in range(mol_searched_for.GetNumAtoms()): 246 | atom_mol_searched_for = mol_searched_for.GetAtomWithIdx(i) 247 | atom_mol_searched_in = mol_searched_in.GetAtomWithIdx(match[i]) 248 | 249 | # if mol_searched_in is SMARTS 250 | if atom_mol_searched_in.HasProp('n_hydrogens'): 251 | if atom_mol_searched_for.GetUnsignedProp('n_hydrogens') > atom_mol_searched_in.GetUnsignedProp('n_hydrogens'): 252 | all_hydrogens_OK = False 253 | break 254 | # if mol_searched_in is SMILES 255 | else: 256 | break 257 | 258 | if all_hydrogens_OK: 259 | add_this_match = True 260 | if len(atomIdxs_to_which_new_matches_have_to_be_adjacent) > 0: 261 | add_this_match = False 262 | 263 | for i in match: 264 | for neighbor in mol_searched_in.GetAtomWithIdx(i).GetNeighbors(): 265 | if neighbor.GetIdx() in atomIdxs_to_which_new_matches_have_to_be_adjacent: 266 | add_this_match = True 267 | break 268 
| 269 | if add_this_match: 270 | valid_matches.append(match) 271 | 272 | return valid_matches 273 | 274 | # this dunction is to avoid counting heavier versions of hydrogen as heavy atom 275 | @classmethod 276 | def get_heavy_atom_count(cls, mol): 277 | heavy_atom_count = 0 278 | for atom in mol.GetAtoms(): 279 | if atom.GetAtomicNum() != 1: 280 | heavy_atom_count += 1 281 | 282 | return heavy_atom_count 283 | 284 | def __init__(self, fragmentation_scheme = {}, algorithm = '', n_atoms_cuttoff = -1, function_to_choose_fragmentation = False, n_max_fragmentations_to_find = -1): 285 | 286 | if not type(fragmentation_scheme) is dict: 287 | raise TypeError('fragmentation_scheme must be a dctionary with integers as keys and either strings or list of strings as values.') 288 | 289 | if len(fragmentation_scheme) == 0: 290 | raise ValueError('fragmentation_scheme must be provided.') 291 | 292 | if not algorithm in ['simple', 'complete', 'combined']: 293 | raise ValueError('Algorithm must be either simple ,complete or combined.') 294 | 295 | if algorithm == 'simple': 296 | if n_max_fragmentations_to_find != -1: 297 | raise ValueError('Setting n_max_fragmentations_to_find only makes sense with complete or combined algorithm.') 298 | 299 | self.algorithm = algorithm 300 | 301 | if algorithm in ['complete', 'combined']: 302 | if n_atoms_cuttoff == -1: 303 | raise ValueError('n_atoms_cuttoff needs to be specified for complete or combined algorithms.') 304 | 305 | if function_to_choose_fragmentation == False: 306 | raise ValueError('function_to_choose_fragmentation needs to be specified for complete or combined algorithms.') 307 | 308 | if not callable(function_to_choose_fragmentation): 309 | raise TypeError('function_to_choose_fragmentation needs to be a function.') 310 | 311 | if n_max_fragmentations_to_find != -1: 312 | if n_max_fragmentations_to_find < 1: 313 | raise ValueError('n_max_fragmentations_to_find has to be 1 or higher.') 314 | 315 | self.n_max_fragmentations_to_find 
= n_max_fragmentations_to_find 316 | 317 | self.n_atoms_cuttoff = n_atoms_cuttoff 318 | 319 | self.fragmentation_scheme = fragmentation_scheme 320 | 321 | self.function_to_choose_fragmentation = function_to_choose_fragmentation 322 | 323 | # create a lookup dictionaries to faster finding a group number and the 324 | # respective pattern with its properties for a specific SMARTS 325 | self._fragmentation_scheme_group_number_lookup = {} 326 | self._fragmentation_scheme_pattern_lookup = {} 327 | self.fragmentation_scheme_order = [] 328 | for group_number, list_SMARTS in fragmentation_scheme.items(): 329 | 330 | self.fragmentation_scheme_order.append(group_number) 331 | 332 | if type(list_SMARTS) is not list: 333 | list_SMARTS = [list_SMARTS] 334 | 335 | for SMARTS in list_SMARTS: 336 | if SMARTS != '': 337 | self._fragmentation_scheme_group_number_lookup[SMARTS] = group_number 338 | 339 | mol_SMARTS = fragmenter.get_mol_with_properties_from_SMARTS(SMARTS) 340 | self._fragmentation_scheme_pattern_lookup[SMARTS] = mol_SMARTS 341 | 342 | 343 | 344 | # create a lookup dictionaries to faster finding of parent pattern 345 | # for a specific SMARTS 346 | self._parent_pattern_lookup = {} 347 | for SMARTS1, mol_SMARTS1 in self._fragmentation_scheme_pattern_lookup.items(): 348 | parent_patterns_of_SMARTS1 = [] 349 | 350 | for SMARTS2, mol_SMARTS2 in self._fragmentation_scheme_pattern_lookup.items(): 351 | if SMARTS1 != SMARTS2: 352 | if mol_SMARTS2.GetNumAtoms() >= mol_SMARTS1.GetNumAtoms(): 353 | matches = fragmenter.get_substruct_matches(mol_SMARTS1, mol_SMARTS2, set(), True) 354 | 355 | if matches: 356 | parent_patterns_of_SMARTS1.append(SMARTS2) 357 | 358 | mol_SMARTS1.SetBoolProp('has_parent_pattern', len(parent_patterns_of_SMARTS1) > 0) 359 | 360 | self._parent_pattern_lookup[SMARTS1] = parent_patterns_of_SMARTS1 361 | 362 | 363 | def fragment(self, SMILES): 364 | 365 | is_valid_SMILES = fragmenter.__Chem.MolFromSmiles(SMILES) is not None 366 | 367 | if not 
is_valid_SMILES: 368 | raise ValueError('Following SMILES is not valid: ' + SMILES) 369 | 370 | # handle mixtures 371 | if SMILES.count('.') > 0: 372 | list_SMILES = SMILES.split('.') 373 | else: 374 | list_SMILES = [SMILES] 375 | 376 | # iterate over all separated SMILES 377 | success = False 378 | fragmentation = {} 379 | for SMILES in list_SMILES: 380 | temp_fragmentation, success = self.__get_fragmentation(SMILES) 381 | 382 | for SMARTS, matches in temp_fragmentation.items(): 383 | group_number = self._fragmentation_scheme_group_number_lookup[SMARTS] 384 | 385 | if not group_number in fragmentation: 386 | fragmentation[group_number] = 0 387 | 388 | fragmentation[group_number] += len(matches) 389 | 390 | if not success: 391 | break 392 | 393 | return fragmentation, success 394 | 395 | def __get_fragmentation(self, SMILES): 396 | 397 | success = False 398 | fragmentation = {} 399 | # use simple fragmentation algorithm 400 | if self.algorithm in ['simple', 'combined']: 401 | fragmentation, success = self.__simple_fragmentation(SMILES) 402 | 403 | if success: 404 | return fragmentation, success 405 | 406 | if self.algorithm in ['complete', 'combined']: 407 | fragmentations, success = self.__complete_fragmentation(SMILES) 408 | 409 | if success: 410 | fragmentation = self.function_to_choose_fragmentation(fragmentations) 411 | 412 | return fragmentation, success 413 | 414 | def __simple_fragmentation(self, SMILES): 415 | mol_SMILES = self.__Chem.MolFromSmiles(SMILES) 416 | 417 | heavy_atom_count = fragmenter.get_heavy_atom_count(mol_SMILES) 418 | 419 | success = False 420 | fragmentation = {} 421 | 422 | fragmentation, atomIdxs_included_in_fragmentation = self.__search_non_overlapping_solution(mol_SMILES, {}, set(), set()) 423 | success = len(atomIdxs_included_in_fragmentation) == heavy_atom_count 424 | 425 | # if not success, clean up molecule and search again 426 | level = 1 427 | while not success: 428 | fragmentation_so_far , 
atomIdxs_included_in_fragmentation_so_far = fragmenter.__clean_molecule_surrounding_unmatched_atoms(mol_SMILES, fragmentation, atomIdxs_included_in_fragmentation, level) 429 | level += 1 430 | 431 | if len(atomIdxs_included_in_fragmentation_so_far) == 0: 432 | break 433 | 434 | fragmentation_so_far, atomIdxs_included_in_fragmentation_so_far = self.__search_non_overlapping_solution(mol_SMILES, fragmentation_so_far, atomIdxs_included_in_fragmentation_so_far, atomIdxs_included_in_fragmentation_so_far) 435 | 436 | success = len(atomIdxs_included_in_fragmentation_so_far) == heavy_atom_count 437 | 438 | if success: 439 | fragmentation = fragmentation_so_far 440 | 441 | return fragmentation, success 442 | 443 | 444 | def __search_non_overlapping_solution(self, mol_searched_in, fragmentation, atomIdxs_included_in_fragmentation, atomIdxs_to_which_new_matches_have_to_be_adjacent, search_for_parent_patterns = True): 445 | 446 | n_atomIdxs_included_in_fragmentation = len(atomIdxs_included_in_fragmentation) - 1 447 | 448 | while n_atomIdxs_included_in_fragmentation != len(atomIdxs_included_in_fragmentation): 449 | n_atomIdxs_included_in_fragmentation = len(atomIdxs_included_in_fragmentation) 450 | 451 | 452 | for group_number in self.fragmentation_scheme_order: 453 | list_SMARTS = self.fragmentation_scheme[group_number] 454 | if type(list_SMARTS) is not list: 455 | list_SMARTS = [list_SMARTS] 456 | 457 | for SMARTS in list_SMARTS: 458 | if SMARTS != "": 459 | fragmentation, atomIdxs_included_in_fragmentation = self.__get_next_non_overlapping_match(mol_searched_in, SMARTS, fragmentation, atomIdxs_included_in_fragmentation, atomIdxs_to_which_new_matches_have_to_be_adjacent, search_for_parent_patterns) 460 | 461 | return fragmentation, atomIdxs_included_in_fragmentation 462 | 463 | def __get_next_non_overlapping_match(self, mol_searched_in, SMARTS, fragmentation, atomIdxs_included_in_fragmentation, atomIdxs_to_which_new_matches_have_to_be_adjacent, search_for_parent_patterns): 464 
| 465 | mol_searched_for = self._fragmentation_scheme_pattern_lookup[SMARTS] 466 | 467 | if search_for_parent_patterns: 468 | if mol_searched_for.GetBoolProp('has_parent_pattern'): 469 | for parent_SMARTS in self._parent_pattern_lookup[SMARTS]: 470 | fragmentation, atomIdxs_included_in_fragmentation = self.__get_next_non_overlapping_match(mol_searched_in, parent_SMARTS, fragmentation, atomIdxs_included_in_fragmentation, atomIdxs_to_which_new_matches_have_to_be_adjacent, search_for_parent_patterns) 471 | 472 | if atomIdxs_to_which_new_matches_have_to_be_adjacent: 473 | matches = fragmenter.get_substruct_matches(mol_searched_for, mol_searched_in, atomIdxs_to_which_new_matches_have_to_be_adjacent) 474 | else: 475 | matches = fragmenter.get_substruct_matches(mol_searched_for, mol_searched_in, set()) 476 | 477 | if matches: 478 | for match in matches: 479 | all_atoms_of_new_match_are_unassigned = atomIdxs_included_in_fragmentation.isdisjoint(match) 480 | 481 | if all_atoms_of_new_match_are_unassigned: 482 | if not SMARTS in fragmentation: 483 | fragmentation[SMARTS] = [] 484 | 485 | fragmentation[SMARTS].append(match) 486 | atomIdxs_included_in_fragmentation.update(match) 487 | 488 | return fragmentation, atomIdxs_included_in_fragmentation 489 | 490 | @classmethod 491 | def __clean_molecule_surrounding_unmatched_atoms(cls, mol_searched_in, fragmentation, atomIdxs_included_in_fragmentation, level): 492 | 493 | for i in range(0, level): 494 | 495 | atoms_missing = set(range(0, fragmenter.get_heavy_atom_count(mol_searched_in))).difference(atomIdxs_included_in_fragmentation) 496 | 497 | new_fragmentation = fragmenter.__marshal.loads(fragmenter.__marshal.dumps(fragmentation)) 498 | 499 | for atomIdx in atoms_missing: 500 | for neighbor in mol_searched_in.GetAtomWithIdx(atomIdx).GetNeighbors(): 501 | for smart, atoms_found in fragmentation.items(): 502 | for atoms in atoms_found: 503 | if neighbor.GetIdx() in atoms: 504 | if smart in new_fragmentation: 505 | if 
new_fragmentation[smart].count(atoms) > 0: 506 | new_fragmentation[smart].remove(atoms) 507 | 508 | if smart in new_fragmentation: 509 | if len(new_fragmentation[smart]) == 0: 510 | new_fragmentation.pop(smart) 511 | 512 | 513 | new_atomIdxs_included_in_fragmentation = set() 514 | for i in new_fragmentation.values(): 515 | for j in i: 516 | new_atomIdxs_included_in_fragmentation.update(j) 517 | 518 | atomIdxs_included_in_fragmentation = new_atomIdxs_included_in_fragmentation 519 | fragmentation = new_fragmentation 520 | 521 | return fragmentation, atomIdxs_included_in_fragmentation 522 | 523 | 524 | def __complete_fragmentation(self, SMILES): 525 | mol_SMILES = self.__Chem.MolFromSmiles(SMILES) 526 | 527 | heavy_atom_count = fragmenter.get_heavy_atom_count(mol_SMILES) 528 | 529 | if heavy_atom_count > self.n_atoms_cuttoff: 530 | return {}, False 531 | 532 | completed_fragmentations = [] 533 | groups_leading_to_incomplete_fragmentations = [] 534 | completed_fragmentations, groups_leading_to_incomplete_fragmentations, incomplete_fragmentation_found = self.__get_next_non_overlapping_adjacent_match_recursively(mol_SMILES, heavy_atom_count, completed_fragmentations, groups_leading_to_incomplete_fragmentations, {}, set(), set(), self.n_max_fragmentations_to_find) 535 | success = len(completed_fragmentations) > 0 536 | 537 | return completed_fragmentations, success 538 | 539 | def __get_next_non_overlapping_adjacent_match_recursively(self, mol_searched_in, heavy_atom_count, completed_fragmentations, groups_leading_to_incomplete_fragmentations, fragmentation_so_far, atomIdxs_included_in_fragmentation_so_far, atomIdxs_to_which_new_matches_have_to_be_adjacent, n_max_fragmentations_to_find = -1): 540 | 541 | n_completed_fragmentations = len(completed_fragmentations) 542 | incomplete_fragmentation_found = False 543 | complete_fragmentation_found = False 544 | 545 | if len(completed_fragmentations) == n_max_fragmentations_to_find: 546 | return completed_fragmentations, 
groups_leading_to_incomplete_fragmentations, incomplete_fragmentation_found 547 | 548 | 549 | for group_number in self.fragmentation_scheme_order: 550 | list_SMARTS = self.fragmentation_scheme[group_number] 551 | 552 | if complete_fragmentation_found: 553 | break 554 | 555 | if type(list_SMARTS) is not list: 556 | list_SMARTS = [list_SMARTS] 557 | 558 | for SMARTS in list_SMARTS: 559 | if complete_fragmentation_found: 560 | break 561 | 562 | if SMARTS != "": 563 | matches = fragmenter.get_substruct_matches(self._fragmentation_scheme_pattern_lookup[SMARTS], mol_searched_in, atomIdxs_included_in_fragmentation_so_far) 564 | 565 | for match in matches: 566 | 567 | # only allow non-overlapping matches 568 | all_atoms_are_unassigned = atomIdxs_included_in_fragmentation_so_far.isdisjoint(match) 569 | if not all_atoms_are_unassigned: 570 | continue 571 | 572 | # only allow matches that do not contain groups leading to incomplete matches 573 | for groups_leading_to_incomplete_fragmentation in groups_leading_to_incomplete_fragmentations: 574 | if fragmenter.__is_fragmentation_subset_of_other_fragmentation(groups_leading_to_incomplete_fragmentation, fragmentation_so_far): 575 | return completed_fragmentations, groups_leading_to_incomplete_fragmentations, incomplete_fragmentation_found 576 | 577 | # only allow matches that will lead to new fragmentations 578 | use_this_match = True 579 | n_found_groups = len(fragmentation_so_far) 580 | 581 | for completed_fragmentation in completed_fragmentations: 582 | 583 | if not SMARTS in completed_fragmentation: 584 | continue 585 | 586 | if n_found_groups == 0: 587 | use_this_match = not fragmenter.__is_match_contained_in_fragmentation(match, SMARTS, completed_fragmentation) 588 | else: 589 | if fragmenter.__is_fragmentation_subset_of_other_fragmentation(fragmentation_so_far, completed_fragmentation): 590 | use_this_match = not fragmenter.__is_match_contained_in_fragmentation(match, SMARTS, completed_fragmentation) 591 | 592 | if not 
use_this_match: 593 | break 594 | 595 | if not use_this_match: 596 | continue 597 | 598 | # make a deepcopy here, otherwise the variables are modified down the road 599 | # marshal is used here because it works faster than copy.deepcopy 600 | this_SMARTS_fragmentation_so_far = fragmenter.__marshal.loads(fragmenter.__marshal.dumps(fragmentation_so_far)) 601 | this_SMARTS_atomIdxs_included_in_fragmentation_so_far = atomIdxs_included_in_fragmentation_so_far.copy() 602 | 603 | if not SMARTS in this_SMARTS_fragmentation_so_far: 604 | this_SMARTS_fragmentation_so_far[SMARTS] = [] 605 | 606 | this_SMARTS_fragmentation_so_far[SMARTS].append(match) 607 | this_SMARTS_atomIdxs_included_in_fragmentation_so_far.update(match) 608 | 609 | # only allow matches that do not contain groups leading to incomplete matches 610 | for groups_leading_to_incomplete_match in groups_leading_to_incomplete_fragmentations: 611 | if fragmenter.__is_fragmentation_subset_of_other_fragmentation(groups_leading_to_incomplete_match, this_SMARTS_fragmentation_so_far): 612 | use_this_match = False 613 | break 614 | 615 | if not use_this_match: 616 | continue 617 | 618 | # if the complete molecule has not been fragmented, continue to do so 619 | if len(this_SMARTS_atomIdxs_included_in_fragmentation_so_far) < heavy_atom_count: 620 | completed_fragmentations, groups_leading_to_incomplete_fragmentations, incomplete_fragmentation_found = self.__get_next_non_overlapping_adjacent_match_recursively(mol_searched_in, heavy_atom_count, completed_fragmentations, groups_leading_to_incomplete_fragmentations, this_SMARTS_fragmentation_so_far, this_SMARTS_atomIdxs_included_in_fragmentation_so_far, this_SMARTS_atomIdxs_included_in_fragmentation_so_far, n_max_fragmentations_to_find) 621 | break 622 | 623 | # if the complete molecule has been fragmented, save and return 624 | if len(this_SMARTS_atomIdxs_included_in_fragmentation_so_far) == heavy_atom_count: 625 | 
completed_fragmentations.append(this_SMARTS_fragmentation_so_far) 626 | complete_fragmentation_found = True 627 | break 628 | 629 | # if until here no new fragmentation was found check whether an incomplete fragmentation was found 630 | if n_completed_fragmentations == len(completed_fragmentations): 631 | 632 | if not incomplete_fragmentation_found: 633 | 634 | incomplete_matched_groups = {} 635 | 636 | if len(atomIdxs_included_in_fragmentation_so_far) > 0: 637 | unassignes_atom_idx = set(range(0, heavy_atom_count)).difference(atomIdxs_included_in_fragmentation_so_far) 638 | for atom_idx in unassignes_atom_idx: 639 | neighbor_atoms_idx = [i.GetIdx() for i in mol_searched_in.GetAtomWithIdx(atom_idx).GetNeighbors()] 640 | 641 | for neighbor_atom_idx in neighbor_atoms_idx: 642 | for found_smarts, found_matches in fragmentation_so_far.items(): 643 | for found_match in found_matches: 644 | if neighbor_atom_idx in found_match: 645 | if not found_smarts in incomplete_matched_groups: 646 | incomplete_matched_groups[found_smarts] = [] 647 | 648 | if found_match not in incomplete_matched_groups[found_smarts]: 649 | incomplete_matched_groups[found_smarts].append(found_match) 650 | 651 | is_subset_of_groups_already_found = False 652 | indexes_to_remove = [] 653 | ind = 0 654 | 655 | # remove groups that are parent to the currently found groups 656 | for groups_leading_to_incomplete_match in groups_leading_to_incomplete_fragmentations: 657 | is_subset_of_groups_already_found = fragmenter.__is_fragmentation_subset_of_other_fragmentation(incomplete_matched_groups, groups_leading_to_incomplete_match) 658 | if is_subset_of_groups_already_found: 659 | indexes_to_remove.append(ind) 660 | 661 | ind += 1 662 | 663 | for index in sorted(indexes_to_remove, reverse=True): 664 | del groups_leading_to_incomplete_fragmentations[index] 665 | 666 | groups_leading_to_incomplete_fragmentations.append(incomplete_matched_groups) 667 | groups_leading_to_incomplete_fragmentations = 
sorted(groups_leading_to_incomplete_fragmentations, key = len) 668 | 669 | incomplete_fragmentation_found = True 670 | 671 | return completed_fragmentations, groups_leading_to_incomplete_fragmentations, incomplete_fragmentation_found 672 | 673 | @classmethod 674 | def __is_fragmentation_subset_of_other_fragmentation(cls, fragmentation, other_fragmentation): 675 | n_found_groups = len(fragmentation) 676 | n_found_other_groups = len(other_fragmentation) 677 | 678 | if n_found_groups == 0: 679 | return False 680 | 681 | if n_found_other_groups < n_found_groups: 682 | return False 683 | 684 | n_found_SMARTS_that_are_subset = 0 685 | for found_SMARTS, found_matches in fragmentation.items(): 686 | if found_SMARTS in other_fragmentation: 687 | found_matches_set = set(frozenset(i) for i in fragmentation[found_SMARTS]) 688 | found_other_matches_set = set(frozenset(i) for i in other_fragmentation[found_SMARTS]) 689 | 690 | if found_matches_set.issubset(found_other_matches_set): 691 | n_found_SMARTS_that_are_subset += 1 692 | else: 693 | return False 694 | 695 | return n_found_SMARTS_that_are_subset == n_found_groups 696 | 697 | @classmethod 698 | def __is_match_contained_in_fragmentation(cls, match, SMARTS, fragmentation): 699 | if not SMARTS in fragmentation: 700 | return False 701 | 702 | found_matches_set = set(frozenset(i) for i in fragmentation[SMARTS]) 703 | match_set = set(match) 704 | 705 | return match_set in found_matches_set --------------------------------------------------------------------------------