├── README.md
└── FingerprintCalculator.py


/README.md:
--------------------------------------------------------------------------------
1 | RDkitTutorial
2 | =============
3 | 
4 | Tutorial on the usage of RDKit, pandas, scikit-learn, machine learning, descriptor calculation, etc., in the context of bioactivity predictive modeling.
5 | 


--------------------------------------------------------------------------------
/FingerprintCalculator.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | # Isidro Cortes Ciriano.	6/8/2013
  4 | # Institut Pasteur
  5 | # isidrolauscher@gmail.com
  6 | 
  7 | # Import modules
  8 | import argparse
  9 | import numpy as np
 10 | import os,sys
 11 | # Arguments passed to the scripts
 12 | parser = argparse.ArgumentParser(prog='PROG',description='Get Morgan Fingerprints for compounds codified in either SMILES or SDF format using RDkit. Isidro Cortes Ciriano. August/September 2013')
 13 | parser.add_argument('--bits', required='TRUE',type=int, help="Size of the hashed Morgan Fingerprints (binary and with counts)")
 14 | parser.add_argument('--rad', required='TRUE', type=int, help="Maximum radius of the substructures. Deafault is two, equivalent to ECFP4 from PipelinePilot")
 15 | parser.add_argument('--f', required='TRUE', type=str, help="Format of the input file")
 16 | parser.add_argument('--mols', type=str,help="File containing the molecules {.smi|.smiles|.sdf|.mol2}. If the format is smiles, each line should contain the smiles and the name separated by a comma (in this order)")
 17 | parser.add_argument('--image', action='store_true', help="Write --image if you want the images of the substructures")
 18 | parser.add_argument('--unhashed', action='store_true', help="Write --unhashed if you want the unhashed fingerprints")
 19 | parser.add_argument('--v', action='store_true', help="Verbose")
 20 | parser.add_argument('--extF',type=str, help="Type -extF followed by the format {.smi|.smiles|.sdf} of the external file for which you want to calculate HASHED circular fingerprints")
 21 | parser.add_argument('--molsEXT',type=str,help="External file")
 22 | parser.add_argument('--unhashedEXT', action='store_true', help="Write --unhashedEXT if you want the unhashed fingerprints for the external file. The substructures of the molecules in the external file will be compared to the pool of substructures contained in the molecules of the main file")
 23 | parser.add_argument('--RDkitPath', required='TRUE', type=str, help="Path to the directory where the RDkit files are")
 24 | parser.add_argument('--output', required='TRUE', type=str, help="Name of the output files")
 25 | args = vars(parser.parse_args())
 26 | 
 27 | image=args['image']
 28 | unhashed=args['unhashed']
 29 | verbose=args['v']
 30 | formatFile=args['f']
 31 | fileMols=str(args['mols'])
 32 | nbBits=int(args['bits'])
 33 | fp_diam=int(args['rad'])
 34 | # External file.
 35 | formatFileEXT=args['extF']
 36 | fileMolsEXT=str(args['molsEXT'])
 37 | unhashedEXT=args['unhashedEXT']
 38 | RDkitPath=args['f']
 39 | outname=args['output']
 40 | sys.path.append(RDkitPath)
 41 | 
 42 | 
 43 | #if (formatFileEXT and not fileMolsEXT) or (fileMolsEXT and not formatFileEXT):
 44 | #	sys.exit("If molsEXT is defined, the argument extF also needs to be defined and vice versa.\nThe calculation has stopped here.")
 45 | 
 46 | 
 47 | if verbose:
 48 | 	if image:
 49 | 		print "\nCalculation of Morgan Fingerprints with diameter %d hashed into a fingerprint size equal to %d.\nMolecules file: %s.\nImages for the chemical substructures will be created.\n" %(args['rad'],args['bits'],args['mols'])
 50 | 	else :
 51 | 		print "\nCalculation of Morgan Fingerprints with diameter %d hashed into a fingerprint size equal to %d.\nMolecules file: %s.\n NO Images for the chemical substructures will be created.\n" %(args['rad'],args['bits'],args['mols'])
 52 | 
 53 | #####################################
 54 | # Import Modules
 55 | #####################################
 56 | import gzip
 57 | import rdkit
 58 | from rdkit import Chem
 59 | from rdkit.Chem import rdMolDescriptors
 60 | import rdkit.rdBase
 61 | from rdkit.Chem.MACCSkeys import GenMACCSKeys
 62 | from rdkit.Chem import AllChem
 63 | from rdkit import DataStructs 
 64 | from rdkit.DataStructs import BitVectToText
 65 | from rdkit.Chem import Draw
 66 | 
 67 | #####################################
 68 | # Define Functions
 69 | #####################################
 70 | # To search within sublists:
 71 | def insublist(item, list):
 72 |     for l in list:
 73 |         if np.array_equal(item,l):
 74 |             return True
 75 |     return False
 76 | 
 77 | # Define a function to create matrices of empty strings
 78 | def nans(shape, dtype=str):
 79 |     a = np.empty(shape, dtype)
 80 |     a[:]=""
 81 |     return a
 82 | 
 83 | #####################################
 84 | # Open File
 85 | #####################################
 86 | # Open the files where the fingerprints will be kept:
 87 | fp_hash_b=outname+"_hashed_binary.csv"  # path of the CSV receiving the hashed binary fingerprints
 88 | if os.path.exists(fp_hash_b):
 89 | 	os.remove(fp_hash_b)  # discard the output of a previous run before re-creating it
 90 | f_fp_bin=open(fp_hash_b,'w')  # stays open while the main molecule loop writes to it; closed afterwards
 91 | 
 92 | fp_hash_c=outname+"_hashed_counts.csv"  # path of the CSV receiving the hashed fingerprints with counts
 93 | if os.path.exists(fp_hash_c):
 94 | 	os.remove(fp_hash_c)  # same clean-up as for the binary file
 95 | f_fp_counts=open(fp_hash_c,'w')  # stays open while the main molecule loop writes to it; closed afterwards
 96 | 
 97 | #####################################
 98 | # Read Molecules
 99 | #####################################
100 | ### Read Mol2 files
101 | def RetrieveMol2Block(fileLikeObject, delimiter="@<TRIPOS>MOLECULE"):
102 | 	import rdkit.Chem
103 | 	"""generator which retrieves one mol2 block at a time
104 | 	"""
105 | 	mol2 = []
106 | 	for line in fileLikeObject:
107 | 		if line.startswith(delimiter) and mol2:
108 | 			yield "".join(mol2)
109 | 			mol2 = []
110 | 		mol2.append(line)
111 | 	if mol2:
112 | 		yield "".join(mol2)
113 | 
114 | 
115 | 
116 | if formatFile == 'smi' or formatFile == 'smiles':  # SMILES input: one "smiles,name" pair per line, no title line
117 | 	if verbose:
118 | 		print "Format of the main file = SMILES"
119 | 	suppl = Chem.SmilesMolSupplier(fileMols,smilesColumn=0,nameColumn=1,delimiter=',',titleLine=False)
120 | 	mols=[]  # molecules that were read successfully
121 | 	molserr=[]  # zero-based positions (in the input file) of the molecules that came back as None
122 | 	for i,m in enumerate(suppl):
123 | 		if m is not None:
124 | 			mols.append(m)
125 | 		else:
126 | 			molserr.append(i)
127 | 	nbMols=len(mols)
128 | elif formatFile == 'mol2':  # mol2 input: split into records with RetrieveMol2Block, then parse each one
129 | 	print "molecules in mol2 format\n"  # NOTE(review): printed even without --v
130 | 	molss=[]
131 | 	with open(fileMols) as fi:
132 | 		for mol2 in RetrieveMol2Block(fi):
133 | 			rdkMolecule = rdkit.Chem.MolFromMol2Block(mol2)
134 | 			molss.append(rdkMolecule)
135 | 	molserr=[]
136 | 	mols=[]
137 | 	for i,m in enumerate(molss):
138 | 		if m is not None:
139 | 			mols.append(m)
140 | 		else:
141 | 			molserr.append(i)
142 | 			mols.append(m)  # unlike the other two formats, the None placeholder is kept in mols
143 | 	nbMols=len(mols)  # here the count therefore includes the molecules that failed to parse
144 | 	print nbMols  # NOTE(review): printed even without --v
145 | else:  # any other value of --f is read as SDF
146 | 	if verbose:
147 | 		print "Format of the main file = SDF"
148 | 	suppl = Chem.SDMolSupplier(fileMols)
149 | 	mols=[]
150 | 	molserr=[]
151 | 	for i,m in enumerate(suppl):
152 | 		if m is not None:
153 | 			mols.append(m)
154 | 		else:
155 | 			molserr.append(i)
156 | 	nbMols=len(mols)
157 | 
158 | if verbose:  # the report below, including the error file, is only produced in verbose mode
159 | 	if len(molserr) !=0:
160 | 		print "The following %d molecules (starting at zero) could not be processed:\n"%(len(molserr))
161 | 		for x in molserr: print x
162 | 		print "NOTE: the indexes of the molecules start at zero. Thus the first molecule is molecule 0."
163 | 		errfile="incorrect_molecules_"+outname+".csv"
164 | 		print "This information has been saved in the following file: %s\n"%(errfile)
165 | 		# Write the zero-based indexes of the unreadable molecules, one per line.
166 | 		np.savetxt(errfile,molserr,fmt="%d")
167 | 		del errfile
168 | 	else:
169 | 		print "All molecules in the input file were processed correctly"
170 | 
171 | ###########################
172 | # External File
173 | ##########################
174 | if formatFileEXT:
175 | 	molserrEXT=[]
176 | 	molsEXT=[]
177 | 	if formatFileEXT == 'smi' or formatFileEXT == 'smiles':
178 | 		if verbose:
179 | 			print "Format of the external file = SMILES"
180 | 		supplEXT = Chem.SmilesMolSupplier(fileMolsEXT,smilesColumn=0,nameColumn=1,delimiter=',',titleLine=False)
181 | 		for i,m in enumerate(supplEXT):
182 | 			if m is not None:
183 | 				molsEXT.append(m)
184 | 			else:
185 | 				molserrEXT.append(i)
186 | 		nbMolsEXT=len(molsEXT)
187 | 	elif formatFile == 'mol2':
188 | 		molssEXT=[]
189 | 		with open(fileMolsEXT) as fi:
190 | 			for mol2 in RetrieveMol2Block(fi):
191 | 				rdkMolecule = rdkit.Chem.MolFromMol2
192 | 				molssEXT.append(rdkMolecule)
193 | 		for i,m in enumerate(molssEXT):
194 | 			if m is not None:
195 | 				molsEXT.append(m)
196 | 			else:
197 | 				molserrEXT.append(i)
198 | 				molsEXT.append(m)  
199 | 		nbMolsEXT=len(molsEXT)
200 | 	else:
201 | 		if verbose:
202 | 			print "Format of the external file = SDF"
203 | 		supplEXT = Chem.SDMolSupplier(fileMolsEXT)
204 | 		for i,m in enumerate(supplEXT):
205 | 			if m is not None:
206 | 				molsEXT.append(m)
207 | 			else:
208 | 				molserrEXT.append(i)
209 | 		nbMolsEXT=len(molsEXT)
210 | 
211 | if verbose and formatFileEXT: 
212 | 	if len(molserrEXT) !=0:
213 | 		print "The following %d molecules (starting at zero) from the EXTERNAL file could not be processed:\n"%(len(molserr))
214 | 		for x in molserrEXT: print x
215 | 		print "NOTE: the indexes of the molecules start at zero. Thus the first molecule is molecule 0."
216 | 		errfileEXT="incorrect_molecules_EXT_"+outname+".csv"
217 | 		print "This information has been saved in the following file: %s\n"%(errfileEXT)
218 | 		# Save the information about which molecules could not be processed correctly.
219 | 		np.savetxt(errfileEXT,molserrEXT,fmt="%d")
220 | 		del errfileEXT
221 | 	else:
222 | 		print "All molecules in the EXTERNAL file were processed correctly"
223 | 
224 | 	if verbose:
225 | 		print 'Your molecules file has %d CORRECT molecules\n' % (len(mols))
226 | 		if formatFileEXT:
227 | 			print 'Your external file contains %d CORRECT molecules\n' % (len(molsEXT))
228 | 
229 | #declare the vector of zeros to know which positions have appeared
230 | position_track=[0]*nbBits  # position_track[bit] becomes 1 once the main loop has met that hashed bit
231 | 
232 | # arr[0] lists the bit indexes; arr[1][bit] collects the unhashed feature ids stored for that bit ('' is a placeholder)
233 | arr = [[],[]]
234 | for i in range(nbBits):
235 | 	arr[0].append(i)
236 | 
237 | for i in range(nbBits):
238 |         arr[1].append([''])
239 | 
240 | # arr2 has the same layout, but arr2[1][bit] collects str(nbFeatTot) values; it is written to the features-per-bit file
241 | arr2 = [[],[]]
242 | for i in range(nbBits):
243 | 	arr2[0].append(i)
244 | 
245 | for i in range(nbBits):
246 | 	arr2[1].append([''])
247 | 
248 | 
249 | # fps_by_comp[0][n] lists the unhashed feature ids kept for molecule n ('' is a placeholder first entry).
250 | fps_by_comp=[[]]
251 | for i in range(nbMols):
252 | 	fps_by_comp[0].append([''])
253 | 
254 | # Distinct unhashed feature ids met so far, in order of first appearance (columns of the unhashed matrices).
255 | subm_all=[]
256 | # Indexes of the molecules that are None when the main loop reaches them.
257 | err_mols=[]
258 | 
259 | # Define a progress bar
260 | from progressbar import *
261 | widgets = ['Progression: ', Percentage(), ' ', Bar(marker='.',left='[',right=']'),' ', ETA(), ' ', FileTransferSpeed()] #see docs for other options
262 | 
263 | pbar = ProgressBar(widgets=widgets, maxval=nbMols)
264 | pbar.start()
265 | 
266 | smiles_subs_kept =[]  # SMILES of every substructure kept by the main loop
267 | Atoms_subs=[]  # number of atoms of each kept substructure, parallel to smiles_subs_kept
268 | nbFeatTot=0  # running count of distinct features (incremented when an id enters subm_all)
269 | 
270 | #Loop over the molecules
271 | for molecule_nb,m in enumerate(mols):
272 | 	info={}; info2={}  # bitInfo dictionaries filled by the two fingerprint calls below
273 | 	if m is None:
274 | 			print "Erroneous input at molecule: %d" %(molecule_nb)
275 | 			err_mols.append(molecule_nb)
276 | 	else:
277 | 		if image:
278 | 			image_name="%s_Molecule_%d.pdf"%(outname,molecule_nb+1)
279 | 			tmp=AllChem.Compute2DCoords(m)
280 | 			Draw.MolToFile(m,image_name,size=(300,300),wedgeBonds=True,kekulize=True)
281 | #		if verbose:
282 | #			print "Molecule %d\n" % (molecule_nb)
283 | 		fp = AllChem.GetMorganFingerprintAsBitVect(m,fp_diam,nbBits,bitInfo=info) 
284 | 		AllBits=np.asarray([info.items()[i][0] for i in range(0,len(info.items()))])  # hashed bit indexes set for this molecule
285 | 		# NOTE: the original remark here reads "diameter 2 is equal to the length 4 of ECFP-4"
286 | 		fp_bits=BitVectToText(fp)
287 | 		fp_counts=list(fp_bits)  # starts as the 0/1 characters; entries with a count > 1 are overwritten below
288 | 		AtomRadBits=np.asarray([info.items()[p][1][:] for p in range(0,len(info.items()))])  # (atom, radius) entries per hashed bit
289 | 
290 | 		fp2 = AllChem.GetMorganFingerprint(m,fp_diam,bitInfo=info2)
291 | 		ids_now=np.asarray([info2.items()[i][0] for i in range(0,len(info2.items()))])  # unhashed feature ids of this molecule
292 | 		ids_nowMOD=ids_now%nbBits  # not used afterwards
293 | 		AtomRadNow=np.asarray([info2.items()[p][1][:] for p in range(0,len(info2.items()))])  # (atom, radius) entries per unhashed id
294 | 
295 | 		for i in range(0,len(info2.items())):
296 | 			radius=info2.items()[i][1][0][1]  # radius of the first occurrence of feature i
297 | 			atom=info2.items()[i][1][0][0]  # centre atom of the first occurrence of feature i
298 | 
299 | 			for k in range(0,len(AllBits)):  # NOTE(review): if nothing matches, `bit` keeps its value from an earlier iteration
300 | 				if insublist(AtomRadNow[i][0],AtomRadBits[k]): 
301 | 					bit = AllBits[k]
302 | 					break
303 | 
304 | 			counts=len(info2.items()[i][1])  # number of occurrences of feature i in this molecule
305 | 			if counts > 1: 
306 | 				fp_counts[bit]=str(counts)
307 | 
308 | 			if position_track[bit] == 0:  # first time this hashed bit is met in the whole data set
309 | 				position_track[bit]=1
310 | 				env=Chem.FindAtomEnvironmentOfRadiusN(m,radius,atom)
311 | 				amap={}
312 | 				submol=Chem.PathToSubmol(m,env,atomMap=amap)
313 | 				if radius ==0: ##if len(amap)==0: # This means that the radius is zero, so the feature is a single atom
314 | 					pass
315 | 				#	arr[1][bit].append(ids_now[i]) #submol
316 | 				#	# Draw the feature
317 | 				#	if image and ids_now[i] not in subm_all:
318 | 				#		image_name="%s_Feature_%d.pdf"%(outname,nbFeatTot)
319 | 				#		amap={}; amap[atom] = atom	
320 | 				#		Draw.MolToFile(m,image_name,size=(300,300),highlightAtoms=amap.keys())
321 | 				#	smiles_subs_kept.append(Chem.MolToSmiles(submol))
322 | 				#	Atoms_subs.append(submol.GetNumAtoms())
323 | 				#	if ids_now[i]  not in subm_all:
324 | 				#		nbFeatTot+=1 
325 | 				#		subm_all.append(ids_now[i])
326 | 				#	# For each molecule keep the substructures
327 | 				#	fps_by_comp[0][molecule_nb].append(ids_now[i])
328 | 				#	arr2[1][bit].append(str(nbFeatTot))
329 | 				else:
330 | 					arr[1][bit].append(ids_now[i] )
331 | 					if image and ids_now[i]  not in subm_all:
332 | 						image_name="%s_Feature_%d.pdf"%(outname,nbFeatTot)
333 | 						Draw.MolToFile(m,image_name,size=(300,300),highlightAtoms=amap.keys())
334 | 					smiles_subs_kept.append(Chem.MolToSmiles(submol))
335 | 					Atoms_subs.append(submol.GetNumAtoms())
336 | 					if ids_now[i] not in subm_all:
337 | 						nbFeatTot+=1
338 | 						subm_all.append(ids_now[i] )
339 | 					fps_by_comp[0][molecule_nb].append(ids_now[i] ) 
340 | 					arr2[1][bit].append(str(nbFeatTot)) 
341 | 
342 | 
343 | 			else: #The bit is already on!
344 | 				env=Chem.FindAtomEnvironmentOfRadiusN(m,radius,atom)
345 | 				amap={}
346 | 				submol=Chem.PathToSubmol(m,env,atomMap=amap)
347 | 
348 | 				if radius == 0 and ids_now[i]  not in arr[1][bit]:  ####:if len(amap)==0 and ids_now[i]  not in arr[1][bit]:
349 | 					pass
350 | 				#	arr[1][bit].append(ids_now[i] )
351 | 				#	if image and ids_now[i]  not in subm_all:
352 | 				#		image_name="%s_Feature_%d.pdf"%(outname,nbFeatTot)
353 | 				#		amap={}; amap[atom] = atom
354 | 				#		Draw.MolToFile(m,image_name,size=(300,300),highlightAtoms=amap.keys())
355 | 				#	smiles_subs_kept.append(Chem.MolToSmiles(submol))
356 | 				#	Atoms_subs.append(submol.GetNumAtoms())
357 | 				#	if ids_now[i]  not in subm_all:
358 | 				#		nbFeatTot+=1
359 | 				#		subm_all.append(ids_now[i])
360 | 				#	fps_by_comp[0][molecule_nb].append(ids_now[i])
361 | 				#	arr2[1][bit].append(str(nbFeatTot)) 
362 | 				# We keep the all the features for each compound anyway
363 | 				if radius == 123123 and ids_now[i]  in arr[1][bit]: ###if len(amap)==0 and ids_now[i]  in arr[1][bit]:  # NOTE(review): 123123 looks like a sentinel that disables this branch - confirm
364 | 					pass
365 | 					#fps_by_comp[0][molecule_nb].append(ids_now[i] )
366 | 
367 | 				if submol.GetNumAtoms() >1 and ids_now[i]  not in arr[1][bit]: #####len(amap)!=0 and ids_now[i]  not in arr[1][bit]:
368 | 					arr[1][bit].append(ids_now[i] )
369 | 					if image and ids_now[i]  not in subm_all:
370 | 						image_name="%s_Feature_%d.pdf"%(outname,nbFeatTot)
371 | 						Draw.MolToFile(m,image_name,size=(300,300),highlightAtoms=amap.keys())
372 | 					smiles_subs_kept.append(Chem.MolToSmiles(submol))
373 | 					Atoms_subs.append(submol.GetNumAtoms())
374 | 					if ids_now[i] not in subm_all:
375 | 						nbFeatTot+=1
376 | 						subm_all.append(ids_now[i] )
377 | 					fps_by_comp[0][molecule_nb].append(ids_now[i] ) 
378 | 					arr2[1][bit].append(str(nbFeatTot)) 
379 | 				# We keep the all the features for each compound anyway
380 | 				if submol.GetNumAtoms() >1 and ids_now[i]  in arr[1][bit]: ####if len(amap)!=0 and ids_now[i]  in arr[1][bit]:  # also true right after the append in the branch above
381 | 					fps_by_comp[0][molecule_nb].append(ids_now[i] )
382 | 
383 | 		 # Write one comma-separated row per molecule to the hashed binary and hashed counts files
384 | 		count=1
385 | 		for item in fp_bits:
386 | 			 if count != nbBits:
387 | 			   f_fp_bin.write("%s," % (item))
388 | 			 else:
389 | 			   f_fp_bin.write("%s" % (item))
390 | 			 count+=1
391 | 		f_fp_bin.write("\n")
392 | 		   
393 | 		count=1
394 | 		for item in fp_counts:
395 | 		   if count != nbBits:
396 | 			 f_fp_counts.write("%s," % (item))
397 | 		   else:
398 | 			 f_fp_counts.write("%s" % (item))
399 | 		   count+=1
400 | 		f_fp_counts.write("\n")
401 | 
402 | # Updating the progress bar.
403 | 	#if verbose:
404 | 	if nbMols % (1+molecule_nb) == 0:  # NOTE(review): true only when molecule_nb+1 divides nbMols - confirm the intended update frequency
405 | 		pbar.update(molecule_nb)
406 | 		print "\n"
407 | 
408 | f_fp_bin.close()
409 | f_fp_counts.close()
410 | 
411 | fp_per_bit=outname+'_features_per_bit_hashed_fp.csv'
412 | if os.path.exists(fp_per_bit):
413 | 	os.remove(fp_per_bit)
414 | 
415 | f=open(fp_per_bit,'w')
416 | for i in arr2[0]:
417 |      if arr2[1][i]==['']:
418 |      	f.write("%d\n" % (i))
419 |      else:
420 |   	f.write("%d%s\n" % (i, ','.join(list(set(arr2[1][i])))))
421 | 
422 | f.close()
423 | 
424 | 
425 | if verbose:
426 | 	print "Total number of features : %d" %(len(subm_all))
427 | 
428 | if unhashed:
429 | 	FPS=nans((nbMols,len(subm_all)))
430 | 	FPS_counts=nans((nbMols,len(subm_all)))
431 | 	if verbose:
432 | 		print "Writing UNhashed fingerprints to file.."
433 | 	for i in range(nbMols):
434 | 		for j in range(len(subm_all)):
435 | 			if subm_all[j] in fps_by_comp[0][i]:
436 | 				FPS[i][j]=1
437 | 				FPS_counts[i][j]=fps_by_comp[0][i].count(subm_all[j])
438 | 			else:
439 | 				FPS[i][j]=0
440 | 				FPS_counts[i][j]=0
441 | 
442 | 	fpbinary=outname+'_unhashed_binary.csv'
443 | 	fpcounts=outname+'_unhashed_counts.csv'
444 | 	np.savetxt(fpbinary, FPS, fmt='%1s', delimiter=',', newline='\n')
445 | 	np.savetxt(fpcounts, FPS_counts, fmt='%1s', delimiter=',', newline='\n')
446 | 
447 | 
448 | 
449 | ###############################
450 | # Write the smiles for the substructures
451 | ###############################
452 | 
453 | filename = outname+"_smiles_substructures.smi"  # tab-separated listing of the substructures kept by the main loop
454 | f = open(filename,'w')
455 | dat = 'Substructure_ID\tSmiles\n'  # header line
456 | f.write(dat)
457 | for i,m in enumerate(smiles_subs_kept):  # here m is a SMILES string, not a molecule
458 | 	dat = str(Atoms_subs[i])+'\t'+m+'\n'  # NOTE(review): first column is the atom count from Atoms_subs, although the header calls it Substructure_ID - confirm
459 | 	f.write(dat)
460 | f.close()
461 | ###############################
462 | # External Dataset
463 | ###############################
464 | if formatFileEXT:
465 | #	if verbose:
466 | #		print "\nProcessing the external file..\n"
467 | ###############################
468 | # Open File
469 | ###############################
470 | # Open the files where the fingerprints will be kept:
471 | 	binaryEXT=outname+"_hashed_binary_EXT.csv"
472 | 	countsEXT=outname+"_hashed_counts_EXT.csv"
473 | 	if os.path.exists(binaryEXT):
474 | 		os.remove(binaryEXT)
475 | 	f_fp_binEXT=open(binaryEXT,'w')
476 | 
477 | 	if os.path.exists(countsEXT):
478 | 		os.remove(countsEXT)
479 | 	f_fp_countsEXT=open(countsEXT,'w')
480 | 
481 | 
482 | #declare the vector of zeros to know which positions have appeared
483 | 	position_trackEXT=[0]*nbBits
484 | 
485 | # Define variable to keep the smiles and bit numbers
486 | 	arrEXT = [[],[[]]]
487 | 	for i in range(nbBits):
488 | 		arrEXT[0].append(i)
489 | 
490 | 	for i in range(nbBits):
491 | 			arrEXT[1].append([''])
492 | 
493 | 
494 | 
495 | # Define the list of lists containing for each compound, the features that it contains.
496 | 	fps_by_compEXT=[[]]
497 | 	for i in range(nbMols):
498 | 		fps_by_compEXT[0].append([''])
499 | 
500 | #Loop over the molecules
501 | 	for molecule_nb,m in enumerate(molsEXT):
502 | 		infoFP={}; infoEXT={}
503 | 		if m is None:
504 | 				print "Erroneous input at molecule (external file): %d" %(molecule_nb)
505 | 		else:
506 | 			if image:
507 | 				image_name="Molecule_Ext_%d.pdf"%(molecule_nb+1)
508 | 				tmp=AllChem.Compute2DCoords(m)
509 | 				Draw.MolToFile(m,image_name,size=(300,300),wedgeBonds=True,kekulize=True)
510 | 			#if verbose:
511 | 			#	print "External molecule: %d\n" % (molecule_nb)
512 | 			fpEXT = AllChem.GetMorganFingerprintAsBitVect(m,fp_diam,nbBits,bitInfo=infoFP) 
513 | 			fp_bitsEXT=BitVectToText(fpEXT)
514 | 			fp_countsEXT=list(fp_bitsEXT)
515 | 			AllBitsEXT=np.asarray([infoFP.items()[i][0] for i in range(0,len(infoFP.items()))])
516 | 			AtomRadBitsEXT=np.asarray([infoFP.items()[p][1][:] for p in range(0,len(infoFP.items()))])
517 | 
518 | 			fp2EXT = AllChem.GetMorganFingerprint(m,fp_diam,bitInfo=infoEXT)
519 | 			ids_nowEXT=np.asarray([infoEXT.items()[i][0] for i in range(0,len(infoEXT.items()))])
520 | 			ids_nowMODEXT=ids_nowEXT%nbBits
521 | 			AtomRadNowEXT=np.asarray([infoEXT.items()[p][1][:] for p in range(0,len(infoEXT.items()))])
522 | 
523 | 			for i in range(0,len(infoEXT.items())):
524 | 				radius=infoEXT.items()[i][1][0][1]
525 | 				atom=infoEXT.items()[i][1][0][0]
526 | 
527 | 			for k in range(0,len(AllBitsEXT)):
528 | 				if insublist(AtomRadNowEXT[i][0],AtomRadBitsEXT[k]): 
529 | 					bit = AllBitsEXT[k]
530 | 					break
531 | 
532 | 			countsEXT=len(infoEXT.items()[i][1])
533 | 			if countsEXT > 1: 
534 | 				fp_countsEXT[bit]=str(countsEXT)
535 | 			if position_trackEXT[bit] == 0: 
536 | 				position_trackEXT[bit]=1
537 | 			arrEXT[1][bit].append(ids_nowEXT[i]) #submol
538 | 			fps_by_compEXT[0][molecule_nb].append(ids_nowEXT[i] )
539 | 
540 | 		# Writing the EXTERNAL fingerprints
541 | 		count=1
542 | 		for item in fp_bitsEXT:
543 | 			 if count != nbBits:
544 | 			   f_fp_binEXT.write("%s," % (item))
545 | 			 else:
546 | 			   f_fp_binEXT.write("%s" % (item))
547 | 			 count+=1
548 | 		f_fp_binEXT.write("\n")
549 | 		   
550 | 		count=1
551 | 		for item in fp_countsEXT:
552 | 		   if count != nbBits:
553 | 			 f_fp_countsEXT.write("%s," % (item))
554 | 		   else:
555 | 			 f_fp_countsEXT.write("%s" % (item))
556 | 		   count+=1
557 | 		f_fp_countsEXT.write("\n")
558 | 
559 | 	f_fp_binEXT.close()
560 | 	f_fp_countsEXT.close()
561 | 
562 | 
563 | 	if unhashedEXT:
564 | 		FPS_EXT=nans((nbMols,len(subm_all)))
565 | 		FPS_countsEXT=nans((nbMols,len(subm_all)))
566 | 		if verbose:
567 | 			print "Writing UNhashed fingerprints for the external file.."
568 | 		for i in range(nbMols):
569 | 			for j in range(len(subm_all)):
570 | 				if subm_all[j] in fps_by_compEXT[0][i]:
571 | 					FPS_EXT[i][j]=1
572 | 					FPS_countsEXT[i][j]=fps_by_compEXT[0][i].count(subm_all[j])
573 | 				else:
574 | 					FPS_EXT[i][j]=0
575 | 					FPS_countsEXT[i][j]=0
576 | 
577 | 		outEXTbinary=outname+"_unhashed_binary_EXT.csv"
578 | 		outEXTcounts=outname+"_unhashed_counts_EXT.csv"
579 | #		np.save("unhashed_binary.npy",FPS_EXT)
580 | #		np.save("unhashed_counts.npy",FPS_countsEXT)
581 | 		np.savetxt(outEXTbinary, FPS_EXT, fmt='%1s', delimiter=',', newline='\n')
582 | 		np.savetxt(outEXTcounts, FPS_countsEXT, fmt='%1s', delimiter=',', newline='\n')
583 | 
584 | if verbose:
585 | 	print "Calculation Finished. NO problems encountered"
586 | 
587 | 


--------------------------------------------------------------------------------