#!/usr/bin/python
"""Compute Morgan (circular) fingerprints with RDKit.

This file is FingerprintCalculator.py from the RDkitTutorial repository
(files: README.md, FingerprintCalculator.py).  The README describes the
repository as a "Tutorial on the usage of Rdkit, Pandas, sklearn, machine
learning, descriptor calculation, etc.. in the context of bioactivity
predictive modeling".

The script reads molecules from a SMILES, SDF or mol2 file and writes hashed
(and optionally unhashed) Morgan fingerprints as CSV files whose names start
with the value given to --output.
"""

# Isidro Cortes Ciriano. 6/8/2013
# Institut Pasteur
# isidrolauscher@gmail.com

# Import modules
import argparse
import numpy as np
import os
import sys

# Arguments passed to the scripts
parser = argparse.ArgumentParser(
    prog='PROG',
    description='Get Morgan Fingerprints for compounds codified in either SMILES or SDF format using RDkit. Isidro Cortes Ciriano. August/September 2013')
# NOTE: `required` must be the boolean True.  The original passed the string
# 'TRUE', which only worked because any non-empty string is truthy.
parser.add_argument('--bits', required=True, type=int,
                    help="Size of the hashed Morgan Fingerprints (binary and with counts)")
# The option is mandatory, so the help text no longer advertises a default.
parser.add_argument('--rad', required=True, type=int,
                    help="Maximum radius of the substructures. A radius of two is equivalent to ECFP4 from PipelinePilot")
parser.add_argument('--f', required=True, type=str,
                    help="Format of the input file")
# Without --mols the script used to continue with the literal path 'None';
# making the option required reports the problem at parse time instead.
parser.add_argument('--mols', required=True, type=str,
                    help="File containing the molecules {.smi|.smiles|.sdf|.mol2}. If the format is smiles, each line should contain the smiles and the name separated by a comma (in this order)")
parser.add_argument('--image', action='store_true',
                    help="Write --image if you want the images of the substructures")
parser.add_argument('--unhashed', action='store_true',
                    help="Write --unhashed if you want the unhashed fingerprints")
parser.add_argument('--v', action='store_true', help="Verbose")
parser.add_argument('--extF', type=str,
                    help="Type --extF followed by the format {.smi|.smiles|.sdf} of the external file for which you want to calculate HASHED circular fingerprints")
parser.add_argument('--molsEXT', type=str, help="External file")
parser.add_argument('--unhashedEXT', action='store_true',
                    help="Write --unhashedEXT if you want the unhashed fingerprints for the external file. The substructures of the molecules in the external file will be compared to the pool of substructures contained in the molecules of the main file")
parser.add_argument('--RDkitPath', required=True, type=str,
                    help="Path to the directory where the RDkit files are")
parser.add_argument('--output', required=True, type=str,
                    help="Name of the output files")
args = vars(parser.parse_args())

# Unpack the options into the module-level names used by the rest of the script.
image = args['image']
unhashed = args['unhashed']
verbose = args['v']
formatFile = args['f']
fileMols = str(args['mols'])
nbBits = int(args['bits'])
# Despite its name this holds the value of --rad and is passed to RDKit as
# the Morgan radius.
fp_diam = int(args['rad'])
# External file.
formatFileEXT = args['extF']
fileMolsEXT = str(args['molsEXT'])
unhashedEXT = args['unhashedEXT']
# Fix: the RDKit directory comes from --RDkitPath.  The original read
# args['f'] (the input-format flag) here, so the given path was never used.
RDkitPath = args['RDkitPath']
outname = args['output']
sys.path.append(RDkitPath)

# An external format without an external file cannot be processed: stop with
# a clear message instead of trying to open a file literally called 'None'.
if formatFileEXT and args['molsEXT'] is None:
    sys.exit("If molsEXT is defined, the argument extF also needs to be defined and vice versa.\nThe calculation has stopped here.")

if verbose:
    if image:
        print("\nCalculation of Morgan Fingerprints with diameter %d hashed into a fingerprint size equal to %d.\nMolecules file: %s.\nImages for the chemical substructures will be created.\n" % (args['rad'], args['bits'], args['mols']))
    else:
        print("\nCalculation of Morgan Fingerprints with diameter %d hashed into a fingerprint size equal to %d.\nMolecules file: %s.\n NO Images for the chemical substructures will be created.\n" % (args['rad'], args['bits'], args['mols']))

#####################################
# Import Modules
#####################################
# These imports come after sys.path has been extended with RDkitPath.
import gzip
import rdkit
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
import rdkit.rdBase
from rdkit.Chem.MACCSkeys import GenMACCSKeys
from rdkit.Chem import AllChem
from rdkit import DataStructs
from rdkit.DataStructs import BitVectToText
from rdkit.Chem import Draw

#####################################
# Define Functions
#####################################
def insublist(item, list):
    """Return True if `item` equals (np.array_equal) any element of `list`."""
    return any(np.array_equal(item, l) for l in list)


def nans(shape, dtype=str):
    """Return an array of the given shape and dtype filled with empty strings."""
    a = np.empty(shape, dtype)
    a[:] = ""
    return a


def RetrieveMol2Block(fileLikeObject, delimiter="@MOLECULE"):
    """Yield one mol2 block at a time from an iterable of lines.

    A new block starts at every line that begins with `delimiter`; each
    yielded value is the concatenation of the lines of one block.
    """
    mol2 = []
    for line in fileLikeObject:
        if line.startswith(delimiter) and mol2:
            yield "".join(mol2)
            mol2 = []
        mol2.append(line)
    if mol2:
        yield "".join(mol2)


def _split_valid(candidates):
    """Split parsed molecules into (valid molecules, indexes of failures).

    Entries that are None (molecules RDKit could not parse) are left out of
    the first list and their zero-based positions are returned in the second.
    """
    valid = []
    failed = []
    for index, candidate in enumerate(candidates):
        if candidate is None:
            failed.append(index)
        else:
            valid.append(candidate)
    return valid, failed


#####################################
# Open File
#####################################
# Open the files where the fingerprints will be kept.  Mode 'w' truncates an
# existing file, so no explicit removal is needed.  Both files are closed
# once the fingerprints of the main file have been written.
fp_hash_b = outname + "_hashed_binary.csv"
f_fp_bin = open(fp_hash_b, 'w')

fp_hash_c = outname + "_hashed_counts.csv"
f_fp_counts = open(fp_hash_c, 'w')

#####################################
# Read Molecules
#####################################
if formatFile == 'smi' or formatFile == 'smiles':
    if verbose:
        print("Format of the main file = SMILES")
    suppl = Chem.SmilesMolSupplier(fileMols, smilesColumn=0, nameColumn=1, delimiter=',', titleLine=False)
    mols, molserr = _split_valid(suppl)
    nbMols = len(mols)
elif formatFile == 'mol2':
    print("molecules in mol2 format\n")
    with open(fileMols) as fi:
        molss = [rdkit.Chem.MolFromMol2Block(mol2) for mol2 in RetrieveMol2Block(fi)]
    # Fix: unparsable molecules are only recorded in molserr.  The original
    # also appended them (as None) to mols, unlike the SMILES/SDF branches.
    mols, molserr = _split_valid(molss)
    nbMols = len(mols)
    print(nbMols)
else:
    # Every other format value is treated as SDF.
    if verbose:
        print("Format of the main file = SDF")
    suppl = Chem.SDMolSupplier(fileMols)
    mols, molserr = _split_valid(suppl)
    nbMols = len(mols)

if verbose:
    if len(molserr) != 0:
        print("The following %d molecules (starting at zero) could not be processed:\n" % (len(molserr)))
        for x in molserr:
            print(x)
        print("NOTE: the indexes of the molecules start at zero. Thus the first molecule is molecule 0.")
        errfile = "incorrect_molecules_" + outname + ".csv"
        print("This information has been saved in the following file: %s\n" % (errfile))
        # Save the information about which molecules could not be processed correctly.
        np.savetxt(errfile, molserr, fmt="%d")
        del errfile
    else:
        print("All molecules in the input file were processed correctly")

###########################
# External File
##########################
if formatFileEXT:
    molserrEXT = []
    molsEXT = []
    if formatFileEXT == 'smi' or formatFileEXT == 'smiles':
        if verbose:
            print("Format of the external file = SMILES")
        supplEXT = Chem.SmilesMolSupplier(fileMolsEXT, smilesColumn=0, nameColumn=1, delimiter=',', titleLine=False)
        molsEXT, molserrEXT = _split_valid(supplEXT)
    elif formatFileEXT == 'mol2':
        # Fixes: the original tested formatFile (the MAIN file's format) and
        # stored the function object rdkit.Chem.MolFromMol2 instead of
        # parsing each block.
        with open(fileMolsEXT) as fi:
            molssEXT = [rdkit.Chem.MolFromMol2Block(mol2) for mol2 in RetrieveMol2Block(fi)]
        molsEXT, molserrEXT = _split_valid(molssEXT)
    else:
        if verbose:
            print("Format of the external file = SDF")
        supplEXT = Chem.SDMolSupplier(fileMolsEXT)
        molsEXT, molserrEXT = _split_valid(supplEXT)
    nbMolsEXT = len(molsEXT)

if verbose and formatFileEXT:
    if len(molserrEXT) != 0:
        # Fix: report the number of failures of the EXTERNAL file (the
        # original used len(molserr), the main file's failures).
        print("The following %d molecules (starting at zero) from the EXTERNAL file could not be processed:\n" % (len(molserrEXT)))
        for x in molserrEXT:
            print(x)
        print("NOTE: the indexes of the molecules start at zero. Thus the first molecule is molecule 0.")
        errfileEXT = "incorrect_molecules_EXT_" + outname + ".csv"
        print("This information has been saved in the following file: %s\n" % (errfileEXT))
        # Save the information about which molecules could not be processed correctly.
        np.savetxt(errfileEXT, molserrEXT, fmt="%d")
        del errfileEXT
    else:
        print("All molecules in the EXTERNAL file were processed correctly")

if verbose:
    print('Your molecules file has %d CORRECT molecules\n' % (len(mols)))
    if formatFileEXT:
        print('Your external file contains %d CORRECT molecules\n' % (len(molsEXT)))

# declare the vector of zeros to know which positions have appeared
position_track = [0] * nbBits

# arr[0] lists the bit numbers; arr[1][bit] collects the feature identifiers
# seen for that bit (every entry starts as ['']).
arr = [list(range(nbBits)), [[''] for _ in range(nbBits)]]

# arr2 has the same layout; arr2[1][bit] collects feature numbers as strings.
arr2 = [list(range(nbBits)), [[''] for _ in range(nbBits)]]

# Define the list of lists containing for each compound, the features that it contains.
fps_by_comp = [[[''] for _ in range(nbMols)]]

# Define the list that will contain all the submolecules
subm_all = []
# Define the list where the erroneous molecules will be saved.
err_mols = []

# Used to tally how often each feature occurs in a molecule.
from collections import Counter

# Define a progress bar
from progressbar import *
widgets = ['Progression: ', Percentage(), ' ', Bar(marker='.', left='[', right=']'), ' ', ETA(), ' ', FileTransferSpeed()]  # see docs for other options

pbar = ProgressBar(widgets=widgets, maxval=nbMols)
pbar.start()

smiles_subs_kept = []
Atoms_subs = []
nbFeatTot = 0


def _bit_by_environment(bit_info):
    """Map each (atom, radius) environment to the hashed bit that lists it.

    `bit_info` is the dictionary filled through the bitInfo argument of
    GetMorganFingerprintAsBitVect: bit -> sequence of (atom, radius) pairs.
    """
    return dict((env, bit) for bit, envs in bit_info.items() for env in envs)


# Loop over the molecules
for molecule_nb, m in enumerate(mols):
    if m is None:
        print("Erroneous input at molecule: %d" % (molecule_nb))
        err_mols.append(molecule_nb)
    else:
        if image:
            image_name = "%s_Molecule_%d.pdf" % (outname, molecule_nb + 1)
            AllChem.Compute2DCoords(m)
            Draw.MolToFile(m, image_name, size=(300, 300), wedgeBonds=True, kekulize=True)

        # Hashed fingerprint: info maps bit -> ((atom, radius), ...).
        info = {}
        fp = AllChem.GetMorganFingerprintAsBitVect(m, fp_diam, nbBits, bitInfo=info)
        fp_bits = BitVectToText(fp)
        fp_counts = list(fp_bits)
        # A dictionary lookup replaces the original search through ragged
        # numpy arrays with insublist(), which was quadratic in the number of
        # features.
        bit_of_env = _bit_by_environment(info)

        # Unhashed fingerprint: info2 maps feature id -> ((atom, radius), ...).
        info2 = {}
        AllChem.GetMorganFingerprint(m, fp_diam, bitInfo=info2)

        for feature_id, envs in info2.items():
            atom, radius = envs[0]
            bit = bit_of_env[envs[0]]

            # Number of times the feature occurs in this molecule.
            counts = len(envs)
            if counts > 1:
                fp_counts[bit] = str(counts)

            position_track[bit] = 1

            # Features of radius zero (single atoms) are not kept as
            # substructures.
            # NOTE(review): the original used `radius == 0` the first time a
            # bit was seen and `submol.GetNumAtoms() > 1` afterwards; this
            # assumes both tests select the same features -- confirm.
            if radius == 0:
                continue

            if feature_id not in arr[1][bit]:
                # First time this substructure is seen on this bit.
                env = Chem.FindAtomEnvironmentOfRadiusN(m, radius, atom)
                amap = {}
                submol = Chem.PathToSubmol(m, env, atomMap=amap)
                arr[1][bit].append(feature_id)
                is_new_feature = feature_id not in subm_all
                if image and is_new_feature:
                    image_name = "%s_Feature_%d.pdf" % (outname, nbFeatTot)
                    Draw.MolToFile(m, image_name, size=(300, 300), highlightAtoms=list(amap))
                # NOTE(review): the SMILES and atom count are recorded whether
                # or not --image was given -- confirm against the author's
                # original indentation.
                smiles_subs_kept.append(Chem.MolToSmiles(submol))
                Atoms_subs.append(submol.GetNumAtoms())
                if is_new_feature:
                    nbFeatTot += 1
                    subm_all.append(feature_id)
                arr2[1][bit].append(str(nbFeatTot))

            # Fix: record the feature once per occurrence so that the
            # unhashed counts below are real occurrence counts.  The original
            # appended the id once, or twice when a new feature landed on a
            # bit that was already on.
            fps_by_comp[0][molecule_nb].extend([feature_id] * counts)

        # Print the features in the corresponding files
        f_fp_bin.write(",".join(fp_bits) + "\n")
        f_fp_counts.write(",".join(fp_counts) + "\n")

    # Fix: advance the bar for every molecule.  The original only updated it
    # when (molecule_nb + 1) happened to divide nbMols.
    pbar.update(molecule_nb + 1)

pbar.finish()
print("\n")

f_fp_bin.close()
f_fp_counts.close()

# One line per bit: the bit number followed by the numbers of the features
# hashed into it.
fp_per_bit = outname + '_features_per_bit_hashed_fp.csv'
with open(fp_per_bit, 'w') as f:
    for bit in arr2[0]:
        fields = [str(bit)]
        for feature_nb in arr2[1][bit]:
            # Skip the '' placeholder and duplicates.  The original joined a
            # set(), so the position of the comma after the bit number
            # depended on set ordering.
            if feature_nb and feature_nb not in fields[1:]:
                fields.append(feature_nb)
        f.write(",".join(fields) + "\n")

if verbose:
    print("Total number of features : %d" % (len(subm_all)))

if unhashed:
    if verbose:
        print("Writing UNhashed fingerprints to file..")
    # Integer matrices: the original one-character string arrays could not
    # hold counts of ten or more.
    FPS_counts = np.zeros((nbMols, len(subm_all)), dtype=int)
    for i in range(nbMols):
        occurrences = Counter(fps_by_comp[0][i])
        for j, feature_id in enumerate(subm_all):
            FPS_counts[i][j] = occurrences[feature_id]
    FPS = (FPS_counts > 0).astype(int)

    fpbinary = outname + '_unhashed_binary.csv'
    fpcounts = outname + '_unhashed_counts.csv'
    np.savetxt(fpbinary, FPS, fmt='%d', delimiter=',', newline='\n')
    np.savetxt(fpcounts, FPS_counts, fmt='%d', delimiter=',', newline='\n')

###############################
# Write the smiles for the substructures
###############################
filename = outname + "_smiles_substructures.smi"
with open(filename, 'w') as f:
    # NOTE(review): the first column is headed 'Substructure_ID' but holds
    # the number of atoms of the substructure; kept as in the original.
    f.write('Substructure_ID\tSmiles\n')
    for n_atoms, smiles in zip(Atoms_subs, smiles_subs_kept):
        f.write(str(n_atoms) + '\t' + smiles + '\n')

###############################
# External Dataset
###############################
if formatFileEXT:
    # Files where the fingerprints of the external molecules will be kept.
    binaryEXT = outname + "_hashed_binary_EXT.csv"
    countsEXT = outname + "_hashed_counts_EXT.csv"

    # declare the vector of zeros to know which positions have appeared
    position_trackEXT = [0] * nbBits

    # Fix: same layout as arr (the original started arrEXT[1] with a stray
    # empty list, shifting every bit by one position).
    arrEXT = [list(range(nbBits)), [[''] for _ in range(nbBits)]]

    # Fix: one entry per EXTERNAL molecule (the original sized this with the
    # number of molecules of the main file).
    fps_by_compEXT = [[[''] for _ in range(len(molsEXT))]]

    with open(binaryEXT, 'w') as f_fp_binEXT, \
            open(countsEXT, 'w') as f_fp_countsEXT:
        # Loop over the molecules
        for molecule_nb, m in enumerate(molsEXT):
            if m is None:
                print("Erroneous input at molecule (external file): %d" % (molecule_nb))
                continue

            if image:
                image_name = "Molecule_Ext_%d.pdf" % (molecule_nb + 1)
                AllChem.Compute2DCoords(m)
                Draw.MolToFile(m, image_name, size=(300, 300), wedgeBonds=True, kekulize=True)

            infoFP = {}
            fpEXT = AllChem.GetMorganFingerprintAsBitVect(m, fp_diam, nbBits, bitInfo=infoFP)
            fp_bitsEXT = BitVectToText(fpEXT)
            fp_countsEXT = list(fp_bitsEXT)
            bit_of_env = _bit_by_environment(infoFP)

            infoEXT = {}
            AllChem.GetMorganFingerprint(m, fp_diam, bitInfo=infoEXT)

            for feature_id, envs in infoEXT.items():
                bit = bit_of_env[envs[0]]
                occurrencesEXT = len(envs)
                if occurrencesEXT > 1:
                    fp_countsEXT[bit] = str(occurrencesEXT)
                if position_trackEXT[bit] == 0:
                    position_trackEXT[bit] = 1
                    arrEXT[1][bit].append(feature_id)
                # Every feature of every molecule is recorded, once per
                # occurrence, so the unhashed matrices below can be filled.
                # NOTE(review): in the original this may have been nested
                # under the first-time-bit test above -- confirm.
                fps_by_compEXT[0][molecule_nb].extend([feature_id] * occurrencesEXT)

            # Writing the EXTERNAL fingerprints
            f_fp_binEXT.write(",".join(fp_bitsEXT) + "\n")
            f_fp_countsEXT.write(",".join(fp_countsEXT) + "\n")

    # Only meaningful when an external file was given, hence nested here.
    if unhashedEXT:
        if verbose:
            print("Writing UNhashed fingerprints for the external file..")
        # The columns are the substructures found in the MAIN file.
        FPS_countsEXT = np.zeros((len(molsEXT), len(subm_all)), dtype=int)
        for i in range(len(molsEXT)):
            occurrences = Counter(fps_by_compEXT[0][i])
            for j, feature_id in enumerate(subm_all):
                FPS_countsEXT[i][j] = occurrences[feature_id]
        FPS_EXT = (FPS_countsEXT > 0).astype(int)

        outEXTbinary = outname + "_unhashed_binary_EXT.csv"
        outEXTcounts = outname + "_unhashed_counts_EXT.csv"
        np.savetxt(outEXTbinary, FPS_EXT, fmt='%d', delimiter=',', newline='\n')
        np.savetxt(outEXTcounts, FPS_countsEXT, fmt='%d', delimiter=',', newline='\n')

if verbose:
    print("Calculation Finished. NO problems encountered")