├── example_testset.csv ├── example_trainingset.csv ├── README.md ├── pt.txt └── smiles_to_sparkcsv.py /example_testset.csv: -------------------------------------------------------------------------------- 1 | 1,CCCCCC 2 | 2,CCCCCCC#CC 3 | -------------------------------------------------------------------------------- /example_trainingset.csv: -------------------------------------------------------------------------------- 1 | 1,C,CC,CCC,CCCC,CCCCC,CCCCCCC,CCCCCCCC,CCCCCCCCC,CCCCCCCCCC,CCCCCCCCCCC,CCCCCCCCCCCC,CCCCCCCCCCCCC,CCCCCCCCCCCCCC,CCCCCCCCCCCCCCC 2 | 2,C#C,CC#C,C#CCC,CC#CC,CC#CCC,CCC#CCC,CCCCCC#C,CC#CCCCCC,CCCCCCCCC#C,CCCCCCCCCCC#C,CCCCCCCCCC#C,CCCCCCCCCCCC#C,CCCCCCCCCC#CCCC,CCCCCCCCCCCCCC#C 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # [Sorted Coulomb Matrix](https://papers.nips.cc/paper/4830-learning-invariant-representations-of-molecules-for-atomization-energy-prediction.pdf) generator 2 | The [Coulomb matrix](http://journals.aps.org/prl/abstract/10.1103/PhysRevLett.108.058301) has been developed as a descriptor for molecules, in order to learn and predict their properties using machine learning. 3 | 4 | This repository provides [a Python script](https://github.com/pythonpanda/coulomb_matrix/blob/coulomb-matrix-generator/smiles_to_sparkcsv_convertor_V1.py) that constructs a sorted Coulomb matrix from the [SMILES](https://en.wikipedia.org/wiki/Simplified_molecular-input_line-entry_system) strings of molecules. The code internally uses [openbabel](http://openbabel.org/wiki/Main_Page) to process the chemical data input in the form of SMILES. By default, the sorted Coulomb matrix is saved to a CSV output file containing LabeledPoint vectors optimized to be read by [Apache Spark](http://spark.apache.org/). Apache Spark is particularly well suited to handling big data and comes with a built-in [powerful machine learning library](http://spark.apache.org/mllib/). 
5 | 6 | An optional [scikit-learn](http://scikit-learn.org/stable/) is invoked at the end of the script to classify molecules using SVM. 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /pt.txt: -------------------------------------------------------------------------------- 1 | H 1 2 | He 2 3 | Li 3 4 | Be 4 5 | B 5 6 | C 6 7 | N 7 8 | O 8 9 | F 9 10 | Ne 10 11 | Na 11 12 | Mg 12 13 | Al 13 14 | Si 14 15 | P 15 16 | S 16 17 | Cl 17 18 | Ar 18 19 | K 19 20 | Ca 20 21 | Sc 21 22 | Ti 22 23 | V 23 24 | Cr 24 25 | Mn 25 26 | Fe 26 27 | Co 27 28 | Ni 28 29 | Cu 29 30 | Zn 30 31 | Ga 31 32 | Ge 32 33 | As 33 34 | Se 34 35 | Br 35 36 | Kr 36 37 | Rb 37 38 | Sr 38 39 | Y 39 40 | Zr 40 41 | Nb 41 42 | Mo 42 43 | Tc 43 44 | Ru 44 45 | Rh 45 46 | Pd 46 47 | Ag 47 48 | Cd 48 49 | In 49 50 | Sn 50 51 | Sb 51 52 | Te 52 53 | I 53 54 | Xe 54 55 | Cs 55 56 | Ba 56 57 | La 57 58 | Ce 58 59 | Pr 59 60 | Nd 60 61 | Pm 61 62 | Sm 62 63 | Eu 63 64 | Gd 64 65 | Tb 65 66 | Dy 66 67 | Ho 67 68 | Er 68 69 | Tm 69 70 | Yb 70 71 | Lu 71 72 | Hf 72 73 | Ta 73 74 | W 74 75 | Re 75 76 | Os 76 77 | Ir 77 78 | Pt 78 79 | Au 79 80 | Hg 80 81 | Tl 81 82 | Pb 82 83 | Bi 83 84 | Po 84 85 | At 85 86 | Rn 86 87 | Fr 87 88 | Ra 88 89 | Ac 89 90 | Th 90 91 | Pa 91 92 | U 92 93 | Np 93 94 | Pu 94 95 | Am 95 96 | Cm 96 97 | Bk 97 98 | Cf 98 99 | Es 99 100 | Fm 100 101 | Md 101 102 | No 102 103 | Lr 103 104 | Rf 104 105 | Db 105 106 | Sg 106 107 | Bh 107 108 | Hs 108 109 | Mt 109 110 | Ds 110 111 | Rg 111 112 | Uub 112 113 | Uut 113 114 | Uuq 114 115 | Uup 115 116 | Uuh 116 117 | Uus 117 118 | Uuo 118 119 | -------------------------------------------------------------------------------- /smiles_to_sparkcsv.py: -------------------------------------------------------------------------------- 1 | #!/data/ganesh/Software/anaconda/bin/python -tt 2 | # E-MAIL:- ganesh@icp.uni-stuttgart.de 3 | # import modules used here 4 | from __future__ import print_function 5 | 
import sys
import subprocess
import csv
import os
import shutil
from time import time

# Explicit namespace instead of ``from numpy import *``: the star import
# silently replaced the builtins ``sum`` and ``max`` used further down.
import numpy as np

try:
    import pybel
except ImportError:
    try:
        # Open Babel >= 3.0 ships the bindings inside the ``openbabel`` package.
        from openbabel import pybel
    except ImportError:
        pybel = None  # only largest_molecule_size() needs the bindings

try:
    from numba import jit
except ImportError:
    jit = None  # imported by the original script but not used by any function

try:
    from sklearn import svm
except ImportError:
    svm = None  # the SVM step at the end of main() is optional

# Profiling recipe:
#   kernprof -l script.py
#   python -m line_profiler script.py.lprof
# Decorate the suspected bottleneck with @profile before running kernprof.

# Prepares input for a Spark MLlib based molecule classifier.
#
# Save the training set and the test set in CSV format.  The first item of
# each line is the category number, followed by the SMILES strings of that
# category, e.g. ``1,C,CC,CCC``.
# Run the script as:  $ ./script.py 'trainingset.csv' 'testset.csv'

# Name of the "symbol atomic-number" table consulted by periodicfunc().
PERIODIC_TABLE_FILE = "pt.txt"

# Parsed periodic tables keyed by absolute path, so the file is read once
# instead of once per atom.
_PERIODIC_TABLES = {}


def extract_csv(file):
    """Read a CSV file into a list of rows.

    Args:
        file: Path of the CSV file.

    Returns:
        A list with one list of strings per non-empty CSV line: the class
        label followed by the SMILES strings of that class.  Blank lines
        are skipped because later stages index ``row[0]``.
    """
    with open(file) as handle:
        return [row for row in csv.reader(handle) if row]


def largest_molecule_size(training_input):
    """Return the atom count of the largest molecule, hydrogens included.

    Every row of the Coulomb matrix must have the same length, so the
    matrix is sized after the biggest molecule.

    Args:
        training_input: Rows as returned by extract_csv(); the first cell of
            each row is the class label and is ignored here.

    Returns:
        The largest number of atoms (after adding hydrogens) as an int.

    Raises:
        ImportError: If the Open Babel Python bindings are not installed.
        ValueError: If the rows contain no SMILES string at all.
    """
    if pybel is None:
        raise ImportError("The Open Babel Python bindings (pybel) are "
                          "required to count the atoms of a molecule")
    sizes = []
    for row in training_input:
        for smiles in row[1:]:
            molecule = pybel.readstring("smi", smiles)
            molecule.OBMol.AddHydrogens()
            sizes.append(len(molecule.atoms))
    if not sizes:
        raise ValueError("No molecules found: every CSV row needs a class "
                         "label followed by at least one SMILES string")
    return int(max(sizes))


def process_smile(row, par):
    """Convert the SMILES strings of one class to 3D ``.xyz`` files.

    Each molecule is written by the ``obabel`` command line tool to
    ``<par>/<position>_<label>_<par>.xyz`` where ``position`` is the 1-based
    position of the molecule inside the row.

    Args:
        row: One CSV row: the class label followed by SMILES strings.
        par: Output directory and file-name suffix ('train' or 'test').

    Returns:
        The list of ``.xyz`` paths written, in row order.  (The original
        function returned None; callers that ignore the result still work.)

    Raises:
        subprocess.CalledProcessError: If ``obabel`` exits with an error.
    """
    label = str(row[0])
    if not os.path.isdir(par):
        os.makedirs(par)
    written = []
    for position, smiles in enumerate(row):
        if position == 0:
            continue  # the first cell is the class label, not a molecule
        target = os.path.join(par, '%d_%s_%s.xyz' % (position, label, par))
        # An argument list (no shell) keeps SMILES characters such as
        # '(', ')', '$' or ';' from being interpreted by the shell.
        subprocess.check_output(
            ['obabel', '-:' + str(smiles), '-oxyz', '-O', target, '--gen3d'],
            stderr=subprocess.STDOUT)
        written.append(target)
    print("A total of : %d molecules of class : %s converted by OpenBabel"
          % (len(written), label))
    return written


def _periodic_table_path():
    """Locate the periodic table: working directory first, then this file's directory."""
    if os.path.isfile(PERIODIC_TABLE_FILE):
        return PERIODIC_TABLE_FILE
    beside_script = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                 PERIODIC_TABLE_FILE)
    if os.path.isfile(beside_script):
        return beside_script
    return PERIODIC_TABLE_FILE  # let open() report the missing file


def _load_periodic_table(path):
    """Parse a ``symbol number`` table into a dict, caching it per absolute path."""
    key = os.path.abspath(path)
    table = _PERIODIC_TABLES.get(key)
    if table is None:
        table = {}
        with open(key) as handle:
            for line in handle:
                fields = line.split()
                if len(fields) >= 2:
                    # The first occurrence of a symbol wins, as before.
                    table.setdefault(fields[0], int(fields[1]))
        _PERIODIC_TABLES[key] = table
    return table


def periodicfunc(element):
    """Return the atomic number of an element symbol listed in ``pt.txt``.

    Args:
        element: Element symbol, e.g. ``"C"``.

    Returns:
        The atomic number as an int.

    Raises:
        ValueError: If the symbol is not listed in the table.
    """
    table = _load_periodic_table(_periodic_table_path())
    try:
        return table[element]
    except KeyError:
        raise ValueError("Unknown element symbol %r (not listed in %s)"
                         % (element, PERIODIC_TABLE_FILE))


def coulombmat(file, dim):
    """Compute the zero-padded Coulomb matrix of the molecule in an xyz file.

    Diagonal entries are ``0.5 * Z_i ** 2.4`` and off-diagonal entries are
    ``Z_i * Z_j / |R_i - R_j|``.

    Args:
        file: Path of the xyz file (atom count, comment line, then one
            ``symbol x y z`` line per atom).
        dim: Side length of the returned matrix, i.e. the atom count of the
            largest molecule.

    Returns:
        A ``(dim, dim)`` float array; rows and columns beyond the atom count
        of this molecule are zero.

    Raises:
        ValueError: If the molecule has more atoms than ``dim``, if the file
            holds fewer atom lines than its header announces, or if an
            element symbol is unknown.
    """
    with open(file) as handle:
        natoms = int(handle.readline())
    if natoms > dim:
        raise ValueError("%s has %d atoms but the Coulomb matrix only has "
                         "room for %d" % (file, natoms, dim))
    cij = np.zeros((dim, dim))
    if natoms == 0:
        return cij

    # ndmin keeps the arrays 2-D / 1-D even for a single-atom molecule.
    coords = np.loadtxt(file, skiprows=2, usecols=[1, 2, 3], ndmin=2)
    symbols = np.loadtxt(file, skiprows=2, dtype=str, usecols=[0], ndmin=1)
    if len(symbols) < natoms or coords.shape[0] < natoms:
        raise ValueError("%s announces %d atoms but lists only %d"
                         % (file, natoms, len(symbols)))
    coords = coords[:natoms]
    charges = np.array([periodicfunc(str(symbol))
                        for symbol in symbols[:natoms]], dtype=float)

    delta = coords[:, None, :] - coords[None, :, :]
    dist = np.sqrt((delta ** 2).sum(axis=-1))
    # Placeholder so the diagonal does not divide by zero; the diagonal of
    # the result is overwritten with the isolated-atom term just below.
    np.fill_diagonal(dist, 1.0)
    block = np.outer(charges, charges) / dist  # pair-wise repulsion
    np.fill_diagonal(block, 0.5 * charges ** 2.4)
    cij[:natoms, :natoms] = block
    return cij


def matsort(xyzfile, dim, sort_columns=False):
    """Return the Coulomb matrix with rows sorted by norm, flattened.

    Rows are ordered so that ``||C[j, :]|| >= ||C[j + 1, :]||``.

    Args:
        xyzfile: Path of the xyz file of the molecule.
        dim: Side length of the Coulomb matrix.
        sort_columns: When True the same permutation is also applied to the
            columns, which keeps the matrix symmetric.  The default (False)
            reproduces the original row-only ordering.

    Returns:
        A 1-D array of ``dim * dim`` values (row-major).
    """
    matrix = coulombmat(xyzfile, dim)
    order = np.argsort((matrix ** 2).sum(axis=1))[::-1]
    matrix = matrix[order, :]
    if sort_columns:
        matrix = matrix[:, order]
    return matrix.ravel()


def _vectorise_set(rows, par, dim, out_csv):
    """Convert every molecule of *rows* and return ``(X, y)``; also writes *out_csv*."""
    xyz_by_class = [process_smile(row, par) for row in rows]
    print('\n')

    # NOTE: the label is the 0-based position of the row in its CSV file
    # (as in the original script), not the value of the first CSV cell.
    jobs = [(float(class_index), path)
            for class_index, paths in enumerate(xyz_by_class)
            for path in paths]
    features = np.empty((len(jobs), dim * dim))
    labels = np.empty(len(jobs))
    # Opened once in write mode: appending made every rerun duplicate rows.
    with open(out_csv, 'w') as sink:
        for position, (label, path) in enumerate(jobs):
            vector = matsort(path, dim)
            features[position] = vector
            labels[position] = label
            record = np.concatenate(([label], vector))
            # record[None] turns the vector into a single row so that all
            # values end up on one line of the CSV file.
            np.savetxt(sink, record[None], fmt='%.6f', delimiter=',',
                       newline='\n')
    return features, labels


# Gather our code in a main() function
def main():
    """Build sorted Coulomb matrices for both CSV sets and run the optional SVM.

    Reads the training and test CSV paths from ``sys.argv[1]`` and
    ``sys.argv[2]``, recreates the ``train`` and ``test`` directories,
    writes ``train_array.csv`` and ``test_array.csv`` (label first, then the
    flattened matrix) and, when scikit-learn is installed, trains an SVC on
    the training set and reports its accuracy on the test set.
    """
    # ---------------- Reading inputs & preparing folders ----------------
    start = time()
    if len(sys.argv) < 3:
        sys.exit("Usage: %s trainingset.csv testset.csv" % (sys.argv[0],))
    print('\nThe training data is read from : %s \n The test data is read from:%s '
          % (sys.argv[1], sys.argv[2]))

    print('\nRemoving Exisiting Training and Test xyz directories')
    for folder in ('train', 'test'):
        shutil.rmtree(folder, ignore_errors=True)

    print('\nCreating new training and test directories')
    for folder in ('train', 'test'):
        os.makedirs(folder)

    # ---------------- Pre-processing CSV inputs ----------------
    print('\nExtracting and analyzing CSV Data \n')
    training_input = extract_csv(sys.argv[1])
    test_input = extract_csv(sys.argv[2])

    no_of_class = len(training_input)
    print('\nThe training set has %d classes of molecules to train'
          % (no_of_class,))

    # Size the matrix after the largest molecule of BOTH sets; a test
    # molecule bigger than every training molecule would not fit otherwise.
    max_atom_index = largest_molecule_size(training_input + test_input)
    print('\nThe largest molecule has: %d atoms' % (max_atom_index,))

    # ---------------- Training set ----------------
    print('\nPost-processing CSV training set data to generate matrices for training set\n')
    scikit_train_Xarray, scikit_train_Yarray = _vectorise_set(
        training_input, 'train', max_atom_index, 'train_array.csv')
    print("The sorted Coloumb Matrix (vectorized) for the training set has been written to : 'train_array.csv' \n")

    # ---------------- SVC (scikit-learn, optional) ----------------
    clf = None
    if svm is None:
        print('\n scikit-learn is not installed : skipping the SVM classification \n')
    else:
        print('\n Learning from the Training set data \n')
        clf = svm.SVC()
        clf.fit(scikit_train_Xarray, scikit_train_Yarray)

    # ---------------- Test set ----------------
    print('\nPost-processing CSV test set data to generate matrices \n')
    scikit_test_Xarray, scikit_test_Yarray = _vectorise_set(
        test_input, 'test', max_atom_index, 'test_array.csv')
    print("The sorted Coloumb Matrix (vectorized) for the test set has been written to : 'test_array.csv' \n")
    print('\nNote : First element of matrix for each molecule correponds to the label point for supervised learning')

    # ---------------- Validation ----------------
    if clf is not None and len(scikit_test_Yarray):
        print('\n Validating the Test set data \n')
        prediction = clf.predict(scikit_test_Xarray)
        matches = prediction == scikit_test_Yarray
        print(matches)
        success = 100. * np.count_nonzero(matches) / float(len(scikit_test_Yarray))
        print("\n The SVM predictions are %.4f %% accurate" % (success,))

    end = time()
    print("\nTotal execution time was %.4f seconds" % (end - start,))


# Standard boilerplate to call the main() function to begin
# the program.
if __name__ == '__main__':
    main()