├── ProjectReport.pdf
├── README.md
├── graph.png
├── graphssl.py
└── load.py

/ProjectReport.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deerishi/graph-based-semi-supervised-learning/e20583ff2bbd93b92994992217f1fe1dd8c75e6b/ProjectReport.pdf
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# graph-based-semi-supervised-learning

This project explores several techniques, both scalable and non-scalable, for graph-based semi-supervised learning. Metric-learning methods such as ITML and LMNN, along with cosine, Gaussian-kernel, and Mahalanobis graph constructions, are empirically evaluated on a two-class subset of the 20 Newsgroups dataset (`alt.atheism` vs. `sci.space`) using iterative label propagation.
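The core of every experiment is the same loop: build a kNN weight matrix `W` over all documents, form its diagonal degree matrix `D`, and iterate `y ← D⁻¹Wy` while clamping the labeled entries. A minimal self-contained sketch on toy random data (illustrative only — the real pipeline in `graphssl.py` adds tf-idf features, PCA, and learned metrics):

```python
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

rng = np.random.RandomState(0)
X = rng.randn(12, 5)                      # toy feature vectors
y = np.array([1., -1., 1., -1.])          # labels for the first 4 rows
n_labeled, k = len(y), 3

W = pairwise_distances(X, metric='cosine')
for i in range(W.shape[0]):               # keep only the k nearest neighbours per row
    W[i, np.argsort(W[i])[k:]] = 0
Dinv = np.diag(1.0 / (W.sum(axis=1) + 0.01))

f = np.concatenate([y, np.zeros(len(X) - n_labeled)])
for _ in range(50):                       # propagate, then clamp the known labels
    f = Dinv.dot(W).dot(f)
    f[:n_labeled] = y
print np.where(f[n_labeled:] > 0, 1, -1)  # predictions for the unlabeled rows
```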
--------------------------------------------------------------------------------

/graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deerishi/graph-based-semi-supervised-learning/e20583ff2bbd93b92994992217f1fe1dd8c75e6b/graph.png
--------------------------------------------------------------------------------

/graphssl.py:
--------------------------------------------------------------------------------
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import StratifiedKFold
from nltk.stem.porter import PorterStemmer
from copy import copy
from sklearn.manifold import TSNE
from nltk import word_tokenize
import matplotlib.pyplot as plt
import networkx as nx
from networkx.drawing.nx_agraph import graphviz_layout
from networkx.drawing.nx_agraph import write_dot
from sklearn import svm
from metric_learn import LMNN
from metric_learn import ITML
import sys
from sklearn.decomposition import PCA
from matplotlib.backends.backend_pdf import PdfPages
#from modshogun import RealFeatures,BinaryLabels,LMNN,MulticlassLabels
from scipy.spatial.distance import pdist

pp = PdfPages('PlotPdf.pdf')

randomState = 13204
data = np.load('FullData.npy')   # preprocessed documents written by load.py
labels = np.load('Labels.npy')   # 0/1 labels written by load.py

print 'data.shape is ', data.shape
print 'labels.shape is ', labels.shape
print 'labels are ', labels


class StemmerTokenizer(object):
    # tokenize a document and Porter-stem every token, for use by TfidfVectorizer

    def __init__(self):
        self.stemmer = PorterStemmer()

    def __call__(self, doc):
        return [self.stemmer.stem(t) for t in word_tokenize(doc)]


class GraphBasedLearning:

    def __init__(self, X_train, y_train, x_test, y_test):
        self.x_train = copy(X_train)
        self.y_train = copy(y_train)
        self.x_test = copy(x_test)
        self.y_test = copy(y_test)

        self.y_train = copy(self.y_train.reshape(-1,))
        self.y_test = copy(self.y_test.reshape(-1,))

        self.data = copy(np.vstack((self.x_train, self.x_test)))
        self.labels = copy(np.hstack((self.y_train, self.y_test)))
        print 'labels are ', self.labels
        self.Vectorize()

        #print 'before PCA SVM accuracy is ', self.compareWithSvm(self.trainVectors, self.testVectors)
        print 'now computing pca'
        self.computePca()
        print 'after PCA svm accuracy is '
        self.compareWithSvm(self.trainVectorsPCA, self.testVectorsPCA)

        self.constructSimilarityMatrixITML()
        ks = [3, 5, 7, 10, 12, 15, 20, 22, 25, 27, 30, 33, 35, 37, 40, 43, 45, 47, 50, 53, 55, 57, 60, 65]
        for k in ks:
            self.constructSimilarityMatrixLMNN(k)
        self.constructSimilarityMatrixCosinePCA()
        self.constructEuclideanGaussianKernel()
        self.constructSimilarityMatrixCosine()
        self.constructCovarianceMatrix()
        self.constructEuclideanGaussianKernelNoPca()

    def constructCovarianceMatrix(self):
        # Build a Mahalanobis metric from the covariance of the PCA-reduced
        # training data, then run label propagation over a kNN graph in that metric.
        self.covarianceMatrix = np.cov(self.trainVectorsPCA.T)  # numpy expects variables in rows, hence the transpose
        self.inverseCovarianceMatrix = np.linalg.inv(self.covarianceMatrix)

        # Mahalanobis distance is parameterized by the INVERSE covariance; with
        # its Cholesky factor L, mapping x -> x.L^T makes plain euclidean
        # distance in the mapped space equal the Mahalanobis distance.
        self.L_cov = np.linalg.cholesky(self.inverseCovarianceMatrix)
        self.allDataCov = np.dot(self.allDataPCA, self.L_cov.T)
        origDistances = pairwise_distances(self.allDataCov)

        projectedDigits = TSNE(random_state=randomState).fit_transform(self.allDataCov)
        plt.figure()
        plt.scatter(projectedDigits[:, 0], projectedDigits[:, 1], c=self.labels)
        plt.title('Data projected by Covariance Matrix in Mahalanobis metric')
        plt.savefig(pp, format='pdf')
        plt.close()

        ks = [3, 5, 7, 10, 12, 15, 20, 22, 25, 27, 30, 33, 35, 37, 40, 43, 45, 47, 50, 53, 55, 57, 60, 65]
        accs = []
        for k in ks:
            # restart from the full distance matrix for every k; zeroing
            # entries in place would otherwise compound across iterations
            self.pwdis = copy(origDistances)
            self.D = np.zeros(self.pwdis.shape)
            for i in range(0, self.pwdis.shape[0]):
                l1 = self.pwdis[i].tolist()
                allnearestNeighbours = sorted(range(len(l1)), key=lambda idx: l1[idx])
                # keep only the k nearest neighbours; all other weights go to 0
                self.pwdis[i, allnearestNeighbours[k:]] = 0
                self.D[i, i] = sum(self.pwdis[i]) + 0.01  # small constant keeps D invertible

            print 'accuracy by using Covariance Matrix for Mahalanobis Distance for k= ', k, '\n'
            accs.append(self.labelPropagation())

        plt.figure()
        plt.plot(ks, accs)
        plt.title('Plot of accuracy vs k using Covariance Matrix in Mahalanobis metric')
        plt.savefig(pp, format='pdf')
        plt.close()

    def constructEuclideanGaussianKernel(self):
        # Gaussian kernel over euclidean distances in PCA space
        origDistances = pairwise_distances(self.allDataPCA)

        maccs = []
        ks = [3, 5, 7, 10, 12, 15, 20, 22, 25, 27, 30, 33, 35, 37, 40, 43, 45, 47, 50, 53, 55, 57, 60, 65]
        for k in ks:
            sigmas = [1, 1.5, 2, 2.5, 3, 3.5]
            accs = []
            for sigma in sigmas:
                # recompute the kernel from the raw distances for every sigma;
                # exponentiating self.pwdis in place would compound across iterations
                self.pwdis = np.exp(-1 * origDistances / (2 * sigma * sigma))
                self.D = np.zeros(self.pwdis.shape)
                for i in range(0, self.pwdis.shape[0]):
                    l1 = self.pwdis[i].tolist()
                    allnearestNeighbours = sorted(range(len(l1)), key=lambda idx: l1[idx])
                    # the kernel is a similarity, so the k nearest neighbours now
                    # carry the largest weights: zero everything except the last k
                    self.pwdis[i, allnearestNeighbours[:-k]] = 0
                    self.D[i, i] = sum(self.pwdis[i])

                # no metric-learning transformation here: this is the plain
                # euclidean baseline, smoothed by the Gaussian kernel
                print 'accuracy for constructEuclideanGaussianKernel with k=', k, ' and sigma =', sigma, ' is \n'
                accs.append(self.labelPropagation())
            maccs.append(np.mean(accs))

        plt.figure()
        plt.plot(ks, maccs)
        plt.title('Accuracy vs k for Euclidean Gaussian Kernel')
        plt.savefig(pp, format='pdf')
        plt.close()
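    # Illustrative helper, not called by the pipeline (the method name is
    # ours): every graph builder in this class repeats the same row-wise
    # sparsification inline; this is the equivalent standalone version.
    def knnSparsifyExample(self, weights, k, keepLargest=False):
        # Zero out all but k entries per row of a pairwise weight matrix.
        # Use keepLargest=True for similarities (e.g. a Gaussian kernel)
        # and keepLargest=False for distances.
        W = np.array(weights, copy=True)
        for i in range(W.shape[0]):
            order = np.argsort(W[i])  # ascending
            drop = order[:-k] if keepLargest else order[k:]
            W[i, drop] = 0
        return W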
    def constructEuclideanGaussianKernelNoPca(self):
        # the same Gaussian-kernel graph as above, but on the raw tf-idf vectors
        origDistances = pairwise_distances(self.allVectors)

        maccs = []
        ks = [3, 5, 7, 10, 12, 15, 20, 22, 25, 27, 30, 33, 35, 37, 40, 43, 45, 47, 50, 53, 55, 57, 60, 65]
        for k in ks:
            sigmas = [1, 1.5, 2, 2.5, 3, 3.5]
            accs = []
            for sigma in sigmas:
                # recompute the kernel from the raw distances for every sigma
                self.pwdis = np.exp(-1 * origDistances / (2 * sigma * sigma))
                self.D = np.zeros(self.pwdis.shape)
                for i in range(0, self.pwdis.shape[0]):
                    l1 = self.pwdis[i].tolist()
                    allnearestNeighbours = sorted(range(len(l1)), key=lambda idx: l1[idx])
                    # keep the k largest kernel weights per row
                    self.pwdis[i, allnearestNeighbours[:-k]] = 0
                    self.D[i, i] = sum(self.pwdis[i])

                print 'accuracy for constructEuclideanGaussianKernelNoPca with k=', k, ' and sigma =', sigma, ' is \n'
                accs.append(self.labelPropagation())
            maccs.append(np.mean(accs))

        plt.figure()
        plt.plot(ks, maccs)
        plt.title('Accuracy vs k for Euclidean Gaussian Kernel without PCA')
        plt.savefig(pp, format='pdf')
        plt.close()

    def convertToDenseMatrix(self):
        # densify the sparse tf-idf matrices so that numpy stacking and
        # linear algebra work on plain arrays
        self.trainVectors = np.asarray(self.trainVectors.todense())
        self.testVectors = np.asarray(self.testVectors.todense())
        self.allVectors = copy(np.vstack((self.trainVectors, self.testVectors)))

    def Vectorize(self):
        self.vectorizer = TfidfVectorizer(decode_error='replace', analyzer='word', stop_words='english', lowercase=True, tokenizer=StemmerTokenizer())

        # each row of the data arrays is a length-1 array holding one
        # document, so unwrap doc[0] everywhere
        self.x2 = [doc[0] for doc in self.x_train]
        self.data2 = [doc[0] for doc in self.data]
        self.xtest2 = [doc[0] for doc in self.x_test]

        # fit on the labeled training documents only, then transform everything
        self.vectorizer.fit(self.x2)
        self.trainVectors = self.vectorizer.transform(self.x2)
        print 'train vectors are ', self.trainVectors.shape

        self.testVectors = self.vectorizer.transform(self.xtest2)
        self.allVectors = self.vectorizer.transform(self.data2)
        self.convertToDenseMatrix()
        print 'allVectors are ', self.allVectors.shape

        #projectedDigits = TSNE(random_state=randomState).fit_transform(self.allVectors)
        #plt.scatter(projectedDigits[:,0],projectedDigits[:,1],c=self.labels)
        #plt.title('All Data Set projected into 2D by TSNE')
        #plt.savefig(pp,format='pdf')
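    # Illustrative helper, not called by the pipeline (the method name is
    # ours): what Vectorize() does to a toy two-document corpus.
    def vectorizeExample(self):
        toyDocs = ['The rockets launched into orbit.', 'Atheists debated the believers.']
        vec = TfidfVectorizer(decode_error='replace', analyzer='word',
                              stop_words='english', lowercase=True,
                              tokenizer=StemmerTokenizer())
        toyVectors = vec.fit_transform(toyDocs)  # sparse matrix, (2, n_stemmed_terms)
        print 'toy tf-idf matrix shape is ', toyVectors.shape
        return toyVectors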
    def constructSimilarityMatrixCosine(self):
        # a simple k-nearest-neighbour graph built from cosine distances over
        # the raw (non-PCA) tf-idf vectors; edge weights are the cosine
        # distances of the k nearest neighbours, and D is the diagonal
        # degree matrix holding each row's weight sum
        ks = [3, 5, 7, 10, 12, 15, 20, 22, 25, 27, 30, 33, 35, 37, 40, 43, 45, 47, 50, 53, 55, 57, 60, 65]
        accs = []
        for k in ks:
            self.pwdis = pairwise_distances(self.allVectors, metric='cosine')
            self.D = np.zeros(self.pwdis.shape)
            for i in range(0, self.pwdis.shape[0]):
                l1 = self.pwdis[i].tolist()
                allnearestNeighbours = sorted(range(len(l1)), key=lambda idx: l1[idx])
                # keep only the k nearest neighbours; all other weights go to 0
                self.pwdis[i, allnearestNeighbours[k:]] = 0
                self.D[i, i] = sum(self.pwdis[i])

            print 'accuracy on non pca data using cosine and k= ', k, ' is ', '\n'
            accs.append(self.labelPropagation())

        plt.figure()
        plt.plot(ks, accs)
        plt.title('Plot of accuracy vs k using cosine non PCA data')
        plt.savefig(pp, format='pdf')
        plt.close()

    def constructSimilarityMatrixCosinePCA(self):
        # the same cosine kNN graph, built on the PCA-reduced vectors
        ks = [3, 5, 7, 10, 12, 15, 20, 22, 25, 27, 30, 33, 35, 37, 40, 43, 45, 47, 50, 53, 55, 57, 60, 65]
        accs = []
        for k in ks:
            self.pwdis = pairwise_distances(self.allDataPCA, metric='cosine')
            self.D = np.zeros(self.pwdis.shape)
            for i in range(0, self.pwdis.shape[0]):
                l1 = self.pwdis[i].tolist()
                allnearestNeighbours = sorted(range(len(l1)), key=lambda idx: l1[idx])
                # keep only the k nearest neighbours; all other weights go to 0
                self.pwdis[i, allnearestNeighbours[k:]] = 0
                self.D[i, i] = sum(self.pwdis[i])

            print 'Now computing accuracy for cosine metric on PCA data'
            accs.append(self.labelPropagation())

        plt.figure()
        plt.plot(ks, accs)
        plt.title('Plot of accuracy vs k using cosine PCA data')
        plt.savefig(pp, format='pdf')
        plt.close()

    def checkAccuracy(self, predicted, goldset):
        # percentage of predictions that match the gold labels
        predicted = predicted.tolist()
        goldset = goldset.tolist()
        correct = 0
        for i in range(0, len(predicted)):
            if goldset[i] == predicted[i]:
                correct += 1

        return (float(correct) / len(predicted)) * 100

    def computePca(self):
        # project the dense tf-idf vectors onto the first 100 principal
        # components; the PCA basis is fit on the training vectors only
        pca = PCA(n_components=100)
        pca.fit(self.trainVectors)
        self.trainVectorsPCA = copy(pca.transform(self.trainVectors))
        self.testVectorsPCA = copy(pca.transform(self.testVectors))
        print '\ntrainVectorsPCA shape is ', self.trainVectorsPCA.shape
        #print 'the explained variance is ', np.cumsum(pca.explained_variance_ratio_)
        self.allDataPCA = copy(np.vstack((self.trainVectorsPCA, self.testVectorsPCA)))
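    # Illustrative helper, not called by the pipeline (the method name is
    # ours): a learned linear map L, as returned by the LMNN/ITML methods
    # below, induces the Mahalanobis metric M = L^T.L — euclidean distance
    # after applying L equals the Mahalanobis distance under M. A quick
    # numerical check on random vectors:
    def metricEquivalenceCheck(self, L):
        rng = np.random.RandomState(0)
        x = rng.randn(L.shape[1])
        y = rng.randn(L.shape[1])
        dMapped = np.linalg.norm(np.dot(L, x) - np.dot(L, y))
        diff = x - y
        dMahalanobis = np.sqrt(np.dot(diff, np.dot(np.dot(L.T, L), diff)))
        print 'mapped euclidean ', dMapped, ' equals mahalanobis ', dMahalanobis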
    def constructSimilarityMatrixLMNN(self, k):
        # learn a Large Margin Nearest Neighbor (LMNN) metric on the
        # PCA-reduced training data, transform all vectors, and build a
        # euclidean kNN graph in the learned space
        print 'now doing LMNN for k= ', k
        self.y_train = self.y_train.reshape(-1,)
        lmnn = LMNN(k=k, learn_rate=1e-7, max_iter=3000)
        lmnn.fit(self.trainVectorsPCA, self.y_train, verbose=False)
        self.L_lmnn = lmnn.transformer()
        name = 'lmnn/LMNN transformer matrix with dataset shape ' + str(self.trainVectorsPCA.shape)
        np.save(name, self.L_lmnn)
        print 'L.shape is ', self.L_lmnn.shape, '\n\n'

        # input data transformed to the learned metric space by X.L^T
        self.transformedTrainLMNN = copy(lmnn.transform(self.trainVectorsPCA))
        self.transformedTestLMNN = copy(lmnn.transform(self.testVectorsPCA))
        self.transformedAllLMNN = copy(lmnn.transform(self.allDataPCA))  # pairwise distances are computed on this

        projectedDigits = TSNE(random_state=randomState).fit_transform(self.transformedAllLMNN)
        plt.figure()
        plt.scatter(projectedDigits[:, 0], projectedDigits[:, 1], c=self.labels)
        plt.title('LMNN Transformed ALL set projected to 2 Dimensions by TSNE with k=' + str(k))
        plt.savefig(pp, format='pdf')
        plt.close()

        self.pwdis = copy(pairwise_distances(self.transformedAllLMNN, metric='euclidean'))
        self.D = np.zeros(self.pwdis.shape)
        for i in range(0, self.pwdis.shape[0]):
            l1 = self.pwdis[i].tolist()
            allnearestNeighbours = sorted(range(len(l1)), key=lambda idx: l1[idx])
            # keep only the k nearest neighbours; all other weights go to 0
            self.pwdis[i, allnearestNeighbours[k:]] = 0
            self.D[i, i] = sum(self.pwdis[i])

        print 'accuracy for LMNN for k= ', k, '\n'
        self.labelPropagation()

    def transformPairwiseDistanceToGaussian(self, sigma):
        # turn the current pairwise distance matrix into Gaussian kernel
        # similarities, in place
        self.pwdis = copy(-1 * self.pwdis / (2 * sigma * sigma))
        self.pwdis = copy(np.exp(self.pwdis))
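    # Illustrative helper, not called by the pipeline (the method name is
    # ours, and it overwrites self.pwdis): what the transform above does to
    # a tiny hand-written distance matrix.
    def gaussianTransformExample(self):
        self.pwdis = np.array([[0.0, 1.0],
                               [1.0, 0.0]])
        self.transformPairwiseDistanceToGaussian(sigma=1.0)
        # exp(-1 / (2*1*1)) ~ 0.6065, so self.pwdis is now approximately
        # [[1.0, 0.6065], [0.6065, 1.0]]; after such a transform the k
        # LARGEST weights per row correspond to the nearest neighbours
        print self.pwdis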
    def constructSimilarityMatrixITML(self):
        # learn an Information-Theoretic Metric Learning (ITML) transform from
        # pairwise constraints sampled from the training labels, then build a
        # euclidean kNN graph in the learned space
        print 'Now doing itml'
        ks = [3, 5, 7, 10, 12, 15, 20, 22, 25, 27, 30, 33, 35, 37, 40, 43, 45, 47, 50, 53, 55, 57, 60, 65]
        constraints = [100]  # [80, 100, 120, 150, 180, 200] were also tried

        for k in ks:
            for num_constraints in constraints:
                itml = ITML()
                self.y_train = self.y_train.reshape(-1,)
                C = ITML.prepare_constraints(self.y_train, self.trainVectorsPCA.shape[0], num_constraints)
                itml.fit(self.trainVectorsPCA, C, verbose=True)
                self.L_itml = copy(itml.transformer())

                name = 'itml/ITML transformer matrix with dataset shape ' + str(self.trainVectorsPCA.shape) + ' and k=' + str(k) + ' and num_constraints=' + str(num_constraints)
                np.save(name, self.L_itml)

                # input data transformed to the learned metric space by X.L^T
                self.transformedTrainITML = copy(itml.transform(self.trainVectorsPCA))
                self.transformedTestITML = copy(itml.transform(self.testVectorsPCA))
                self.transformedAllITML = copy(itml.transform(self.allDataPCA))
                # euclidean distances are now computed on the transformed data

                # visualize the transformed dataset with TSNE
                projectedDigits = TSNE(random_state=randomState).fit_transform(self.transformedAllITML)
                print 'projectedDigits is ', projectedDigits.shape
                plt.figure()
                plt.scatter(projectedDigits[:, 0], projectedDigits[:, 1], c=self.labels)
                plt.title('ITML Transformed ALL set projected to 2 Dimensions by TSNE with k=' + str(k) + ' and num_constraints=' + str(num_constraints))
                plt.savefig(pp, format='pdf')
                plt.close()

                self.pwdis = copy(pairwise_distances(self.transformedAllITML, metric='euclidean'))
                #sigmas=[1,1.5,2,2.5,3,3.5,4,4.5,5]
                #for sigma in sigmas:

                self.D = np.zeros(self.pwdis.shape)
                for i in range(0, self.pwdis.shape[0]):
                    l1 = self.pwdis[i].tolist()
                    allnearestNeighbours = sorted(range(len(l1)), key=lambda idx: l1[idx])
                    # keep only the k nearest neighbours (smallest distances);
                    # all other weights go to 0
                    self.pwdis[i, allnearestNeighbours[k:]] = 0
                    self.D[i, i] = sum(self.pwdis[i])

                print 'accuracy for ITML for k= ', k, ' and num_constraints= ', num_constraints, '\n'
                self.labelPropagation()

    def labelPropagation(self):
        # Algorithm 11.1, label propagation (Zhu and Ghahramani, 2002):
        # iterate y <- D^-1.W.y and clamp the labeled points after every step
        self.y_test = self.y_test.reshape(-1, 1)
        self.y_train = self.y_train.reshape(-1, 1)

        self.yUnlabeled = np.zeros(self.y_test.shape)
        self.y_labeled = copy(self.y_train)
        self.ypred = copy(np.vstack((self.y_labeled, self.yUnlabeled)))

        Dinv = np.linalg.inv(self.D)  # D is diagonal, so invert it once outside the loop
        for iteration in range(0, 50):
            self.ypred = np.dot(Dinv, np.dot(self.pwdis, self.ypred))
            # clamp the labeled points back to their known labels
            for i in range(0, self.y_labeled.shape[0]):
                self.ypred[i, 0] = self.y_labeled[i, 0]

        # propagation complete; threshold the unlabeled predictions to +1/-1
        numTrain = self.y_train.shape[0]
        self.predicted1 = self.ypred[numTrain:, 0]
        for i in range(self.predicted1.shape[0]):
            if self.predicted1[i] > 0:
                self.predicted1[i] = 1
            else:
                self.predicted1[i] = -1

        self.predicted1 = self.predicted1.reshape(-1,)
        self.y_test = self.y_test.reshape(-1,)
        print 'self.predicted1 is ', self.predicted1.shape
        print 'self.y_test is ', self.y_test.shape
        acc = self.checkAccuracy(self.predicted1, self.y_test)
        print 'the accuracy is ', acc
        return acc

    def compareWithSvm(self, datasetTrain, datasetTest):
        # linear SVM baseline over a grid of regularization strengths
        C = [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
        print '\n'
        print 'dataset shape is ', datasetTrain.shape
        self.y_train = self.y_train.reshape(-1,)
        for c in C:
            self.Svm = svm.LinearSVC(C=c)
            self.Svm.fit(datasetTrain, self.y_train)
            predictions = self.Svm.predict(datasetTest)
            print 'accuracy with c=', c, ' is ', self.checkAccuracy(predictions, self.y_test), '% ', '\n'


# for graph-based label propagation, replace every 0 label with -1
newLabels = []
for label in labels:
    if label == 1:
        newLabels.append(label)
    else:
        newLabels.append(-1)

newLabels = np.asarray(newLabels)
np.save('NewLabels', newLabels)

skf = StratifiedKFold(newLabels, n_folds=4, shuffle=True)
for train_index, test_index in skf:
    # the index sets are deliberately swapped: the small fold (1/4 of the
    # data) becomes the labeled training set and the large fold is treated
    # as unlabeled, matching the semi-supervised setting
    X_train, X_test = data[test_index], data[train_index]
    y_train, y_test = newLabels[test_index], newLabels[train_index]
    X_train = copy(X_train.reshape(-1, 1))
    X_test = copy(X_test.reshape(-1, 1))
    print 'y_train is ', y_train.shape
    print 'y_test is ', y_test.shape
    ob1 = GraphBasedLearning(X_train, y_train, X_test, y_test)


pp.close()
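# Usage note (ours, not in the original script): run load.py first so that
# FullData.npy and Labels.npy exist, then run this file. All plots accumulate
# into PlotPdf.pdf, and the learned LMNN/ITML transforms are saved under the
# lmnn/ and itml/ directories, which must exist beforehand.
#
#   python load.py      # fetch and preprocess the 20 newsgroups subset
#   python graphssl.py  # build the graphs, propagate labels, write plots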
--------------------------------------------------------------------------------

/load.py:
--------------------------------------------------------------------------------
from sklearn.datasets import fetch_20newsgroups
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from nltk import word_tokenize
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import os
import re
from os.path import expanduser
from sklearn.metrics.pairwise import pairwise_distances
from copy import copy

categories = ['alt.atheism', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), categories=categories)


class StemmerTokenizer(object):
    # tokenize a document and Porter-stem every token, for use by TfidfVectorizer

    def __init__(self):
        self.stemmer = PorterStemmer()

    def __call__(self, doc):
        return [self.stemmer.stem(t) for t in word_tokenize(doc)]


stop = stopwords.words('english')
stemmer = PorterStemmer()
lemmer = WordNetLemmatizer()
home = expanduser("~")

nonWords = ['[', ']', '{', '}', '/', '_', '|']


def preProcess(dataset, target):
    # lowercase, strip punctuation and digits, POS-tag, lemmatize and remove
    # stop words from every document; returns the cleaned documents together
    # with the labels of the non-empty ones
    processed = []
    target2 = target.tolist()
    labels = []
    counter = -1
    j = 1
    for line in dataset:
        print ' at j = ', j, '/', len(dataset)
        j = j + 1
        counter += 1
        line = line.lower()
        line = line.strip()
        if len(line) > 0:
            labels.append(target2[counter])
        else:
            continue

        # the '-' sits last in the character class so it is a literal hyphen
        # rather than an accidental range
        cleaned1 = re.sub(r'[,.!?&$@%/\\";:()+*^`<>#~=-]', "", line)
        cleaned2 = re.sub(r'\d', "", cleaned1)
        cleaned2 = re.sub(r"'s", "", cleaned2)
        cleaned2 = re.sub(r"'", "", cleaned2)
        cleaned2 = re.sub(r'"', "", cleaned2)

        for ch in nonWords:
            cleaned2 = cleaned2.replace(ch, '')

        words = nltk.word_tokenize(cleaned2)
        tags = nltk.pos_tag(words)
        sentence = ""
        if len(words) > 0:
            sentence += words[0] + " "
        for i in range(1, len(tags)):
            tag = tags[i]
            try:
                # pass the WordNet POS (adjective, verb, noun, satellite, adverb)
                # to the lemmatizer when the tag provides one
                if tag[1][0].lower() in ['a', 'v', 'n', 's', 'r']:
                    word = lemmer.lemmatize(tag[0], tag[1][0].lower())
                else:
                    word = lemmer.lemmatize(tag[0])
                sentence += word + " "
            except Exception:
                pass

        # remove stop words token by token; plain substring replacement would
        # also delete stop words embedded inside longer words
        cleaned2 = " ".join(w for w in sentence.split() if w not in stop)

        cleaned2 = cleaned2.strip()
        processed.append(cleaned2)

    return processed, labels


smallData = newsgroups_train.data
target = newsgroups_train.target

processedData, labels = preProcess(smallData, target)
parray = np.asarray(processedData)
np.save('FullData', parray)
labels = np.asarray(labels)
np.save('Labels', labels)

# count the distinct whitespace-separated tokens left after preprocessing
words = {}
for line in processedData:
    for w in line.split(' '):
        words[w] = 1

vectorizer = TfidfVectorizer(decode_error='replace', analyzer='word', stop_words='english', lowercase=True, tokenizer=StemmerTokenizer())
vectorizer.fit(processedData)
vectors = vectorizer.transform(processedData)
print 'vectorizer is ', vectors.shape
print 'numwords is ', len(words)
print 'number of tf-idf features is ', len(vectorizer.get_feature_names())
pwdis = pairwise_distances(vectors, metric='cosine')

print 'pairwise_distances is ', pwdis.shape, '\n'
print 'labels.shape is ', len(labels)
--------------------------------------------------------------------------------