├── ProjectReport.pdf
├── README.md
├── graph.png
├── graphssl.py
└── load.py

/ProjectReport.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deerishi/graph-based-semi-supervised-learning/e20583ff2bbd93b92994992217f1fe1dd8c75e6b/ProjectReport.pdf
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# graph-based-semi-supervised-learning

This project explores several techniques, both scalable and non-scalable, for graph-based semi-supervised learning. Metric-learning methods such as ITML and LMNN, along with cosine, Gaussian-kernel, and Mahalanobis graph constructions, are empirically evaluated on a two-class subset of the 20 Newsgroups dataset (`alt.atheism` vs. `sci.space`) using iterative label propagation.
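The core of every experiment is the same loop: build a kNN weight matrix `W` over all documents, form its diagonal degree matrix `D`, and iterate `y ← D⁻¹Wy` while clamping the labeled entries. A minimal self-contained sketch on toy random data (illustrative only — the real pipeline in `graphssl.py` adds tf-idf features, PCA, and learned metrics):

```python
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

rng = np.random.RandomState(0)
X = rng.randn(12, 5)                      # toy feature vectors
y = np.array([1., -1., 1., -1.])          # labels for the first 4 rows
n_labeled, k = len(y), 3

W = pairwise_distances(X, metric='cosine')
for i in range(W.shape[0]):               # keep only the k nearest neighbours per row
    W[i, np.argsort(W[i])[k:]] = 0
Dinv = np.diag(1.0 / (W.sum(axis=1) + 0.01))

f = np.concatenate([y, np.zeros(len(X) - n_labeled)])
for _ in range(50):                       # propagate, then clamp the known labels
    f = Dinv.dot(W).dot(f)
    f[:n_labeled] = y
print np.where(f[n_labeled:] > 0, 1, -1)  # predictions for the unlabeled rows
```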
--------------------------------------------------------------------------------

/graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deerishi/graph-based-semi-supervised-learning/e20583ff2bbd93b92994992217f1fe1dd8c75e6b/graph.png
--------------------------------------------------------------------------------

/graphssl.py:
--------------------------------------------------------------------------------
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import StratifiedKFold
from nltk.stem.porter import PorterStemmer
from copy import copy
from sklearn.manifold import TSNE
from nltk import word_tokenize
import matplotlib.pyplot as plt
import networkx as nx
from networkx.drawing.nx_agraph import graphviz_layout
from networkx.drawing.nx_agraph import write_dot
from sklearn import svm
from metric_learn import LMNN
from metric_learn import ITML
import sys
from sklearn.decomposition import PCA
from matplotlib.backends.backend_pdf import PdfPages
#from modshogun import RealFeatures,BinaryLabels,LMNN,MulticlassLabels
from scipy.spatial.distance import pdist

pp = PdfPages('PlotPdf.pdf')

randomState = 13204
data = np.load('FullData.npy')   # preprocessed documents written by load.py
labels = np.load('Labels.npy')   # 0/1 labels written by load.py

print 'data.shape is ', data.shape
print 'labels.shape is ', labels.shape
print 'labels are ', labels


class StemmerTokenizer(object):
    # tokenize a document and Porter-stem every token, for use by TfidfVectorizer

    def __init__(self):
        self.stemmer = PorterStemmer()

    def __call__(self, doc):
        return [self.stemmer.stem(t) for t in word_tokenize(doc)]


class GraphBasedLearning:

    def __init__(self, X_train, y_train, x_test, y_test):
        self.x_train = copy(X_train)
        self.y_train = copy(y_train)
        self.x_test = copy(x_test)
        self.y_test = copy(y_test)

        self.y_train = copy(self.y_train.reshape(-1,))
        self.y_test = copy(self.y_test.reshape(-1,))

        self.data = copy(np.vstack((self.x_train, self.x_test)))
        self.labels = copy(np.hstack((self.y_train, self.y_test)))
        print 'labels are ', self.labels
        self.Vectorize()

        #print 'before PCA SVM accuracy is ', self.compareWithSvm(self.trainVectors, self.testVectors)
        print 'now computing pca'
        self.computePca()
        print 'after PCA svm accuracy is '
        self.compareWithSvm(self.trainVectorsPCA, self.testVectorsPCA)

        self.constructSimilarityMatrixITML()
        ks = [3, 5, 7, 10, 12, 15, 20, 22, 25, 27, 30, 33, 35, 37, 40, 43, 45, 47, 50, 53, 55, 57, 60, 65]
        for k in ks:
            self.constructSimilarityMatrixLMNN(k)
        self.constructSimilarityMatrixCosinePCA()
        self.constructEuclideanGaussianKernel()
        self.constructSimilarityMatrixCosine()
        self.constructCovarianceMatrix()
        self.constructEuclideanGaussianKernelNoPca()

    def constructCovarianceMatrix(self):
        # Build a Mahalanobis metric from the covariance of the PCA-reduced
        # training data, then run label propagation over a kNN graph in that metric.
        self.covarianceMatrix = np.cov(self.trainVectorsPCA.T)  # numpy expects variables in rows, hence the transpose
        self.inverseCovarianceMatrix = np.linalg.inv(self.covarianceMatrix)

        # Mahalanobis distance is parameterized by the INVERSE covariance; with
        # its Cholesky factor L, mapping x -> x.L^T makes plain euclidean
        # distance in the mapped space equal the Mahalanobis distance.
        self.L_cov = np.linalg.cholesky(self.inverseCovarianceMatrix)
        self.allDataCov = np.dot(self.allDataPCA, self.L_cov.T)
        origDistances = pairwise_distances(self.allDataCov)

        projectedDigits = TSNE(random_state=randomState).fit_transform(self.allDataCov)
        plt.figure()
        plt.scatter(projectedDigits[:, 0], projectedDigits[:, 1], c=self.labels)
        plt.title('Data projected by Covariance Matrix in Mahalanobis metric')
        plt.savefig(pp, format='pdf')
        plt.close()

        ks = [3, 5, 7, 10, 12, 15, 20, 22, 25, 27, 30, 33, 35, 37, 40, 43, 45, 47, 50, 53, 55, 57, 60, 65]
        accs = []
        for k in ks:
            # restart from the full distance matrix for every k; zeroing
            # entries in place would otherwise compound across iterations
            self.pwdis = copy(origDistances)
            self.D = np.zeros(self.pwdis.shape)
            for i in range(0, self.pwdis.shape[0]):
                l1 = self.pwdis[i].tolist()
                allnearestNeighbours = sorted(range(len(l1)), key=lambda idx: l1[idx])
                # keep only the k nearest neighbours; all other weights go to 0
                self.pwdis[i, allnearestNeighbours[k:]] = 0
                self.D[i, i] = sum(self.pwdis[i]) + 0.01  # small constant keeps D invertible

            print 'accuracy by using Covariance Matrix for Mahalanobis Distance for k= ', k, '\n'
            accs.append(self.labelPropagation())

        plt.figure()
        plt.plot(ks, accs)
        plt.title('Plot of accuracy vs k using Covariance Matrix in Mahalanobis metric')
        plt.savefig(pp, format='pdf')
        plt.close()

    def constructEuclideanGaussianKernel(self):
        # Gaussian kernel over euclidean distances in PCA space
        origDistances = pairwise_distances(self.allDataPCA)

        maccs = []
        ks = [3, 5, 7, 10, 12, 15, 20, 22, 25, 27, 30, 33, 35, 37, 40, 43, 45, 47, 50, 53, 55, 57, 60, 65]
        for k in ks:
            sigmas = [1, 1.5, 2, 2.5, 3, 3.5]
            accs = []
            for sigma in sigmas:
                # recompute the kernel from the raw distances for every sigma;
                # exponentiating self.pwdis in place would compound across iterations
                self.pwdis = np.exp(-1 * origDistances / (2 * sigma * sigma))
                self.D = np.zeros(self.pwdis.shape)
                for i in range(0, self.pwdis.shape[0]):
                    l1 = self.pwdis[i].tolist()
                    allnearestNeighbours = sorted(range(len(l1)), key=lambda idx: l1[idx])
                    # the kernel is a similarity, so the k nearest neighbours now
                    # carry the largest weights: zero everything except the last k
                    self.pwdis[i, allnearestNeighbours[:-k]] = 0
                    self.D[i, i] = sum(self.pwdis[i])

                # no metric-learning transformation here: this is the plain
                # euclidean baseline, smoothed by the Gaussian kernel
                print 'accuracy for constructEuclideanGaussianKernel with k=', k, ' and sigma =', sigma, ' is \n'
                accs.append(self.labelPropagation())
            maccs.append(np.mean(accs))

        plt.figure()
        plt.plot(ks, maccs)
        plt.title('Accuracy vs k for Euclidean Gaussian Kernel')
        plt.savefig(pp, format='pdf')
        plt.close()
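    # Illustrative helper, not called by the pipeline (the method name is
    # ours): every graph builder in this class repeats the same row-wise
    # sparsification inline; this is the equivalent standalone version.
    def knnSparsifyExample(self, weights, k, keepLargest=False):
        # Zero out all but k entries per row of a pairwise weight matrix.
        # Use keepLargest=True for similarities (e.g. a Gaussian kernel)
        # and keepLargest=False for distances.
        W = np.array(weights, copy=True)
        for i in range(W.shape[0]):
            order = np.argsort(W[i])  # ascending
            drop = order[:-k] if keepLargest else order[k:]
            W[i, drop] = 0
        return W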
    def constructEuclideanGaussianKernelNoPca(self):
        # the same Gaussian-kernel graph as above, but on the raw tf-idf vectors
        origDistances = pairwise_distances(self.allVectors)

        maccs = []
        ks = [3, 5, 7, 10, 12, 15, 20, 22, 25, 27, 30, 33, 35, 37, 40, 43, 45, 47, 50, 53, 55, 57, 60, 65]
        for k in ks:
            sigmas = [1, 1.5, 2, 2.5, 3, 3.5]
            accs = []
            for sigma in sigmas:
                # recompute the kernel from the raw distances for every sigma
                self.pwdis = np.exp(-1 * origDistances / (2 * sigma * sigma))
                self.D = np.zeros(self.pwdis.shape)
                for i in range(0, self.pwdis.shape[0]):
                    l1 = self.pwdis[i].tolist()
                    allnearestNeighbours = sorted(range(len(l1)), key=lambda idx: l1[idx])
                    # keep the k largest kernel weights per row
                    self.pwdis[i, allnearestNeighbours[:-k]] = 0
                    self.D[i, i] = sum(self.pwdis[i])

                print 'accuracy for constructEuclideanGaussianKernelNoPca with k=', k, ' and sigma =', sigma, ' is \n'
                accs.append(self.labelPropagation())
            maccs.append(np.mean(accs))

        plt.figure()
        plt.plot(ks, maccs)
        plt.title('Accuracy vs k for Euclidean Gaussian Kernel without PCA')
        plt.savefig(pp, format='pdf')
        plt.close()

    def convertToDenseMatrix(self):
        # densify the sparse tf-idf matrices so that numpy stacking and
        # linear algebra work on plain arrays
        self.trainVectors = np.asarray(self.trainVectors.todense())
        self.testVectors = np.asarray(self.testVectors.todense())
        self.allVectors = copy(np.vstack((self.trainVectors, self.testVectors)))

    def Vectorize(self):
        self.vectorizer = TfidfVectorizer(decode_error='replace', analyzer='word', stop_words='english', lowercase=True, tokenizer=StemmerTokenizer())

        # each row of the data arrays is a length-1 array holding one
        # document, so unwrap doc[0] everywhere
        self.x2 = [doc[0] for doc in self.x_train]
        self.data2 = [doc[0] for doc in self.data]
        self.xtest2 = [doc[0] for doc in self.x_test]

        # fit on the labeled training documents only, then transform everything
        self.vectorizer.fit(self.x2)
        self.trainVectors = self.vectorizer.transform(self.x2)
        print 'train vectors are ', self.trainVectors.shape

        self.testVectors = self.vectorizer.transform(self.xtest2)
        self.allVectors = self.vectorizer.transform(self.data2)
        self.convertToDenseMatrix()
        print 'allVectors are ', self.allVectors.shape

        #projectedDigits = TSNE(random_state=randomState).fit_transform(self.allVectors)
        #plt.scatter(projectedDigits[:,0],projectedDigits[:,1],c=self.labels)
        #plt.title('All Data Set projected into 2D by TSNE')
        #plt.savefig(pp,format='pdf')
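    # Illustrative helper, not called by the pipeline (the method name is
    # ours): what Vectorize() does to a toy two-document corpus.
    def vectorizeExample(self):
        toyDocs = ['The rockets launched into orbit.', 'Atheists debated the believers.']
        vec = TfidfVectorizer(decode_error='replace', analyzer='word',
                              stop_words='english', lowercase=True,
                              tokenizer=StemmerTokenizer())
        toyVectors = vec.fit_transform(toyDocs)  # sparse matrix, (2, n_stemmed_terms)
        print 'toy tf-idf matrix shape is ', toyVectors.shape
        return toyVectors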
    def constructSimilarityMatrixCosine(self):
        # a simple k-nearest-neighbour graph built from cosine distances over
        # the raw (non-PCA) tf-idf vectors; edge weights are the cosine
        # distances of the k nearest neighbours, and D is the diagonal
        # degree matrix holding each row's weight sum
        ks = [3, 5, 7, 10, 12, 15, 20, 22, 25, 27, 30, 33, 35, 37, 40, 43, 45, 47, 50, 53, 55, 57, 60, 65]
        accs = []
        for k in ks:
            self.pwdis = pairwise_distances(self.allVectors, metric='cosine')
            self.D = np.zeros(self.pwdis.shape)
            for i in range(0, self.pwdis.shape[0]):
                l1 = self.pwdis[i].tolist()
                allnearestNeighbours = sorted(range(len(l1)), key=lambda idx: l1[idx])
                # keep only the k nearest neighbours; all other weights go to 0
                self.pwdis[i, allnearestNeighbours[k:]] = 0
                self.D[i, i] = sum(self.pwdis[i])

            print 'accuracy on non pca data using cosine and k= ', k, ' is ', '\n'
            accs.append(self.labelPropagation())

        plt.figure()
        plt.plot(ks, accs)
        plt.title('Plot of accuracy vs k using cosine non PCA data')
        plt.savefig(pp, format='pdf')
        plt.close()

    def constructSimilarityMatrixCosinePCA(self):
        # the same cosine kNN graph, built on the PCA-reduced vectors
        ks = [3, 5, 7, 10, 12, 15, 20, 22, 25, 27, 30, 33, 35, 37, 40, 43, 45, 47, 50, 53, 55, 57, 60, 65]
        accs = []
        for k in ks:
            self.pwdis = pairwise_distances(self.allDataPCA, metric='cosine')
            self.D = np.zeros(self.pwdis.shape)
            for i in range(0, self.pwdis.shape[0]):
                l1 = self.pwdis[i].tolist()
                allnearestNeighbours = sorted(range(len(l1)), key=lambda idx: l1[idx])
                # keep only the k nearest neighbours; all other weights go to 0
                self.pwdis[i, allnearestNeighbours[k:]] = 0
                self.D[i, i] = sum(self.pwdis[i])

            print 'Now computing accuracy for cosine metric on PCA data'
            accs.append(self.labelPropagation())

        plt.figure()
        plt.plot(ks, accs)
        plt.title('Plot of accuracy vs k using cosine PCA data')
        plt.savefig(pp, format='pdf')
        plt.close()

    def checkAccuracy(self, predicted, goldset):
        # percentage of predictions that match the gold labels
        predicted = predicted.tolist()
        goldset = goldset.tolist()
        correct = 0
        for i in range(0, len(predicted)):
            if goldset[i] == predicted[i]:
                correct += 1

        return (float(correct) / len(predicted)) * 100

    def computePca(self):
        # project the dense tf-idf vectors onto the first 100 principal
        # components; the PCA basis is fit on the training vectors only
        pca = PCA(n_components=100)
        pca.fit(self.trainVectors)
        self.trainVectorsPCA = copy(pca.transform(self.trainVectors))
        self.testVectorsPCA = copy(pca.transform(self.testVectors))
        print '\ntrainVectorsPCA shape is ', self.trainVectorsPCA.shape
        #print 'the explained variance is ', np.cumsum(pca.explained_variance_ratio_)
        self.allDataPCA = copy(np.vstack((self.trainVectorsPCA, self.testVectorsPCA)))
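    # Illustrative helper, not called by the pipeline (the method name is
    # ours): a learned linear map L, as returned by the LMNN/ITML methods
    # below, induces the Mahalanobis metric M = L^T.L — euclidean distance
    # after applying L equals the Mahalanobis distance under M. A quick
    # numerical check on random vectors:
    def metricEquivalenceCheck(self, L):
        rng = np.random.RandomState(0)
        x = rng.randn(L.shape[1])
        y = rng.randn(L.shape[1])
        dMapped = np.linalg.norm(np.dot(L, x) - np.dot(L, y))
        diff = x - y
        dMahalanobis = np.sqrt(np.dot(diff, np.dot(np.dot(L.T, L), diff)))
        print 'mapped euclidean ', dMapped, ' equals mahalanobis ', dMahalanobis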
    def constructSimilarityMatrixLMNN(self, k):
        # learn a Large Margin Nearest Neighbor (LMNN) metric on the
        # PCA-reduced training data, transform all vectors, and build a
        # euclidean kNN graph in the learned space
        print 'now doing LMNN for k= ', k
        self.y_train = self.y_train.reshape(-1,)
        lmnn = LMNN(k=k, learn_rate=1e-7, max_iter=3000)
        lmnn.fit(self.trainVectorsPCA, self.y_train, verbose=False)
        self.L_lmnn = lmnn.transformer()
        name = 'lmnn/LMNN transformer matrix with dataset shape ' + str(self.trainVectorsPCA.shape)
        np.save(name, self.L_lmnn)
        print 'L.shape is ', self.L_lmnn.shape, '\n\n'

        # input data transformed to the learned metric space by X.L^T
        self.transformedTrainLMNN = copy(lmnn.transform(self.trainVectorsPCA))
        self.transformedTestLMNN = copy(lmnn.transform(self.testVectorsPCA))
        self.transformedAllLMNN = copy(lmnn.transform(self.allDataPCA))  # pairwise distances are computed on this

        projectedDigits = TSNE(random_state=randomState).fit_transform(self.transformedAllLMNN)
        plt.figure()
        plt.scatter(projectedDigits[:, 0], projectedDigits[:, 1], c=self.labels)
        plt.title('LMNN Transformed ALL set projected to 2 Dimensions by TSNE with k=' + str(k))
        plt.savefig(pp, format='pdf')
        plt.close()

        self.pwdis = copy(pairwise_distances(self.transformedAllLMNN, metric='euclidean'))
        self.D = np.zeros(self.pwdis.shape)
        for i in range(0, self.pwdis.shape[0]):
            l1 = self.pwdis[i].tolist()
            allnearestNeighbours = sorted(range(len(l1)), key=lambda idx: l1[idx])
            # keep only the k nearest neighbours; all other weights go to 0
            self.pwdis[i, allnearestNeighbours[k:]] = 0
            self.D[i, i] = sum(self.pwdis[i])

        print 'accuracy for LMNN for k= ', k, '\n'
        self.labelPropagation()

    def transformPairwiseDistanceToGaussian(self, sigma):
        # turn the current pairwise distance matrix into Gaussian kernel
        # similarities, in place
        self.pwdis = copy(-1 * self.pwdis / (2 * sigma * sigma))
        self.pwdis = copy(np.exp(self.pwdis))
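    # Illustrative helper, not called by the pipeline (the method name is
    # ours, and it overwrites self.pwdis): what the transform above does to
    # a tiny hand-written distance matrix.
    def gaussianTransformExample(self):
        self.pwdis = np.array([[0.0, 1.0],
                               [1.0, 0.0]])
        self.transformPairwiseDistanceToGaussian(sigma=1.0)
        # exp(-1 / (2*1*1)) ~ 0.6065, so self.pwdis is now approximately
        # [[1.0, 0.6065], [0.6065, 1.0]]; after such a transform the k
        # LARGEST weights per row correspond to the nearest neighbours
        print self.pwdis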
    def constructSimilarityMatrixITML(self):
        # learn an Information-Theoretic Metric Learning (ITML) transform from
        # pairwise constraints sampled from the training labels, then build a
        # euclidean kNN graph in the learned space
        print 'Now doing itml'
        ks = [3, 5, 7, 10, 12, 15, 20, 22, 25, 27, 30, 33, 35, 37, 40, 43, 45, 47, 50, 53, 55, 57, 60, 65]
        constraints = [100]  # [80, 100, 120, 150, 180, 200] were also tried

        for k in ks:
            for num_constraints in constraints:
                itml = ITML()
                self.y_train = self.y_train.reshape(-1,)
                C = ITML.prepare_constraints(self.y_train, self.trainVectorsPCA.shape[0], num_constraints)
                itml.fit(self.trainVectorsPCA, C, verbose=True)
                self.L_itml = copy(itml.transformer())

                name = 'itml/ITML transformer matrix with dataset shape ' + str(self.trainVectorsPCA.shape) + ' and k=' + str(k) + ' and num_constraints=' + str(num_constraints)
                np.save(name, self.L_itml)

                # input data transformed to the learned metric space by X.L^T
                self.transformedTrainITML = copy(itml.transform(self.trainVectorsPCA))
                self.transformedTestITML = copy(itml.transform(self.testVectorsPCA))
                self.transformedAllITML = copy(itml.transform(self.allDataPCA))
                # euclidean distances are now computed on the transformed data

                # visualize the transformed dataset with TSNE
                projectedDigits = TSNE(random_state=randomState).fit_transform(self.transformedAllITML)
                print 'projectedDigits is ', projectedDigits.shape
                plt.figure()
                plt.scatter(projectedDigits[:, 0], projectedDigits[:, 1], c=self.labels)
                plt.title('ITML Transformed ALL set projected to 2 Dimensions by TSNE with k=' + str(k) + ' and num_constraints=' + str(num_constraints))
                plt.savefig(pp, format='pdf')
                plt.close()

                self.pwdis = copy(pairwise_distances(self.transformedAllITML, metric='euclidean'))
                #sigmas=[1,1.5,2,2.5,3,3.5,4,4.5,5]
                #for sigma in sigmas:

                self.D = np.zeros(self.pwdis.shape)
                for i in range(0, self.pwdis.shape[0]):
                    l1 = self.pwdis[i].tolist()
                    allnearestNeighbours = sorted(range(len(l1)), key=lambda idx: l1[idx])
                    # keep only the k nearest neighbours (smallest distances);
                    # all other weights go to 0
                    self.pwdis[i, allnearestNeighbours[k:]] = 0
                    self.D[i, i] = sum(self.pwdis[i])

                print 'accuracy for ITML for k= ', k, ' and num_constraints= ', num_constraints, '\n'
                self.labelPropagation()

    def labelPropagation(self):
        # Algorithm 11.1, label propagation (Zhu and Ghahramani, 2002):
        # iterate y <- D^-1.W.y and clamp the labeled points after every step
        self.y_test = self.y_test.reshape(-1, 1)
        self.y_train = self.y_train.reshape(-1, 1)

        self.yUnlabeled = np.zeros(self.y_test.shape)
        self.y_labeled = copy(self.y_train)
        self.ypred = copy(np.vstack((self.y_labeled, self.yUnlabeled)))

        Dinv = np.linalg.inv(self.D)  # D is diagonal, so invert it once outside the loop
        for iteration in range(0, 50):
            self.ypred = np.dot(Dinv, np.dot(self.pwdis, self.ypred))
            # clamp the labeled points back to their known labels
            for i in range(0, self.y_labeled.shape[0]):
                self.ypred[i, 0] = self.y_labeled[i, 0]

        # propagation complete; threshold the unlabeled predictions to +1/-1
        numTrain = self.y_train.shape[0]
        self.predicted1 = self.ypred[numTrain:, 0]
        for i in range(self.predicted1.shape[0]):
            if self.predicted1[i] > 0:
                self.predicted1[i] = 1
            else:
                self.predicted1[i] = -1

        self.predicted1 = self.predicted1.reshape(-1,)
        self.y_test = self.y_test.reshape(-1,)
        print 'self.predicted1 is ', self.predicted1.shape
        print 'self.y_test is ', self.y_test.shape
        acc = self.checkAccuracy(self.predicted1, self.y_test)
        print 'the accuracy is ', acc
        return acc

    def compareWithSvm(self, datasetTrain, datasetTest):
        # linear SVM baseline over a grid of regularization strengths
        C = [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
        print '\n'
        print 'dataset shape is ', datasetTrain.shape
        self.y_train = self.y_train.reshape(-1,)
        for c in C:
            self.Svm = svm.LinearSVC(C=c)
            self.Svm.fit(datasetTrain, self.y_train)
            predictions = self.Svm.predict(datasetTest)
            print 'accuracy with c=', c, ' is ', self.checkAccuracy(predictions, self.y_test), '% ', '\n'


# for graph-based label propagation, replace every 0 label with -1
newLabels = []
for label in labels:
    if label == 1:
        newLabels.append(label)
    else:
        newLabels.append(-1)

newLabels = np.asarray(newLabels)
np.save('NewLabels', newLabels)

skf = StratifiedKFold(newLabels, n_folds=4, shuffle=True)
for train_index, test_index in skf:
    # the index sets are deliberately swapped: the small fold (1/4 of the
    # data) becomes the labeled training set and the large fold is treated
    # as unlabeled, matching the semi-supervised setting
    X_train, X_test = data[test_index], data[train_index]
    y_train, y_test = newLabels[test_index], newLabels[train_index]
    X_train = copy(X_train.reshape(-1, 1))
    X_test = copy(X_test.reshape(-1, 1))
    print 'y_train is ', y_train.shape
    print 'y_test is ', y_test.shape
    ob1 = GraphBasedLearning(X_train, y_train, X_test, y_test)


pp.close()
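# Usage note (ours, not in the original script): run load.py first so that
# FullData.npy and Labels.npy exist, then run this file. All plots accumulate
# into PlotPdf.pdf, and the learned LMNN/ITML transforms are saved under the
# lmnn/ and itml/ directories, which must exist beforehand.
#
#   python load.py      # fetch and preprocess the 20 newsgroups subset
#   python graphssl.py  # build the graphs, propagate labels, write plots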
--------------------------------------------------------------------------------

/load.py:
--------------------------------------------------------------------------------
from sklearn.datasets import fetch_20newsgroups
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from nltk import word_tokenize
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import os
import re
from os.path import expanduser
from sklearn.metrics.pairwise import pairwise_distances
from copy import copy

categories = ['alt.atheism', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), categories=categories)


class StemmerTokenizer(object):
    # tokenize a document and Porter-stem every token, for use by TfidfVectorizer

    def __init__(self):
        self.stemmer = PorterStemmer()

    def __call__(self, doc):
        return [self.stemmer.stem(t) for t in word_tokenize(doc)]


stop = stopwords.words('english')
stemmer = PorterStemmer()
lemmer = WordNetLemmatizer()
home = expanduser("~")

nonWords = ['[', ']', '{', '}', '/', '_', '|']


def preProcess(dataset, target):
    # lowercase, strip punctuation and digits, POS-tag, lemmatize and remove
    # stop words from every document; returns the cleaned documents together
    # with the labels of the non-empty ones
    processed = []
    target2 = target.tolist()
    labels = []
    counter = -1
    j = 1
    for line in dataset:
        print ' at j = ', j, '/', len(dataset)
        j = j + 1
        counter += 1
        line = line.lower()
        line = line.strip()
        if len(line) > 0:
            labels.append(target2[counter])
        else:
            continue

        # the '-' sits last in the character class so it is a literal hyphen
        # rather than an accidental range
        cleaned1 = re.sub(r'[,.!?&$@%/\\";:()+*^`<>#~=-]', "", line)
        cleaned2 = re.sub(r'\d', "", cleaned1)
        cleaned2 = re.sub(r"'s", "", cleaned2)
        cleaned2 = re.sub(r"'", "", cleaned2)
        cleaned2 = re.sub(r'"', "", cleaned2)

        for ch in nonWords:
            cleaned2 = cleaned2.replace(ch, '')

        words = nltk.word_tokenize(cleaned2)
        tags = nltk.pos_tag(words)
        sentence = ""
        if len(words) > 0:
            sentence += words[0] + " "
        for i in range(1, len(tags)):
            tag = tags[i]
            try:
                # pass the WordNet POS (adjective, verb, noun, satellite, adverb)
                # to the lemmatizer when the tag provides one
                if tag[1][0].lower() in ['a', 'v', 'n', 's', 'r']:
                    word = lemmer.lemmatize(tag[0], tag[1][0].lower())
                else:
                    word = lemmer.lemmatize(tag[0])
                sentence += word + " "
            except Exception:
                pass

        # remove stop words token by token; plain substring replacement would
        # also delete stop words embedded inside longer words
        cleaned2 = " ".join(w for w in sentence.split() if w not in stop)

        cleaned2 = cleaned2.strip()
        processed.append(cleaned2)

    return processed, labels


smallData = newsgroups_train.data
target = newsgroups_train.target

processedData, labels = preProcess(smallData, target)
parray = np.asarray(processedData)
np.save('FullData', parray)
labels = np.asarray(labels)
np.save('Labels', labels)

# count the distinct whitespace-separated tokens left after preprocessing
words = {}
for line in processedData:
    for w in line.split(' '):
        words[w] = 1

vectorizer = TfidfVectorizer(decode_error='replace', analyzer='word', stop_words='english', lowercase=True, tokenizer=StemmerTokenizer())
vectorizer.fit(processedData)
vectors = vectorizer.transform(processedData)
print 'vectorizer is ', vectors.shape
print 'numwords is ', len(words)
print 'number of tf-idf features is ', len(vectorizer.get_feature_names())
pwdis = pairwise_distances(vectors, metric='cosine')

print 'pairwise_distances is ', pwdis.shape, '\n'
print 'labels.shape is ', len(labels)
--------------------------------------------------------------------------------