├── Extractn.py ├── GKbCl.png ├── GUI_files ├── maingui.ui └── recgui.ui ├── MinePredict.py ├── MineStackOverflow.py ├── Prediction.py ├── README.md ├── Remove_Duplicates.py ├── RunApp.py ├── StackOverflow_CSV_Preprocessing.py ├── maingui.py ├── recgui.py ├── savejob.py ├── so.png ├── stackoverflow-logo.png └── testSO.py /Extractn.py: -------------------------------------------------------------------------------- 1 | import csv 2 | fi=open("Train.csv",'rb') 3 | fo=open("Train2.csv",'wb') 4 | reader=csv.reader(fi) 5 | writer=csv.writer(fo) 6 | for i, row in enumerate(reader): 7 | if(i<1000000): 8 | writer.writerow(row) 9 | print i 10 | else: 11 | break 12 | 13 | print "Extracted 10 Lakh Docs" 14 | -------------------------------------------------------------------------------- /GKbCl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pranay360/StackOverflow-Recommendation-System/67955c81eb797d108d52557f1f05b1018a768f82/GKbCl.png -------------------------------------------------------------------------------- /GUI_files/maingui.ui: -------------------------------------------------------------------------------- 1 | 2 | 3 | MainWindow 4 | 5 | 6 | true 7 | 8 | 9 | 10 | 0 11 | 0 12 | 920 13 | 561 14 | 15 | 16 | 17 | StackOverflow Application 18 | 19 | 20 | 21 | GKbCl.pngGKbCl.png 22 | 23 | 24 | 25 | 26 | 27 | 15 28 | 10 29 | 481 30 | 501 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 75 43 | true 44 | 45 | 46 | 47 | Title 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 75 63 | true 64 | 65 | 66 | 67 | Body 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 75 83 | true 84 | 85 | 86 | 87 | Predict 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 75 98 | true 99 | 100 | 101 | 102 | Tags 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 75 118 | true 119 | 120 | 121 | 122 | Original Tags 123 | 124 | 
125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 500 137 | 150 138 | 387 139 | 218 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 75 148 | true 149 | 150 | 151 | 152 | Get Recommendations 153 | 154 | 155 | 156 | 157 | 158 | 159 | true 160 | 161 | 162 | 163 | 14 164 | 75 165 | true 166 | 167 | 168 | 169 | RECOMMENDATIONS 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 510 218 | 10 219 | 381 220 | 131 221 | 222 | 223 | 224 | 225 | 226 | 227 | stackoverflow-logo.png 228 | 229 | 230 | true 231 | 232 | 233 | 234 | 235 | 236 | 500 237 | 480 238 | 421 239 | 30 240 | 241 | 242 | 243 | 244 | 245 | 246 | Get Accuracy 247 | 248 | 249 | 250 | 251 | 252 | 253 | Get F1 Score 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | Clear 263 | 264 | 265 | 266 | 267 | 268 | 269 | Exit 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 640 281 | 460 282 | 66 283 | 17 284 | 285 | 286 | 287 | 288 | 75 289 | true 290 | true 291 | 292 | 293 | 294 | F1 Score 295 | 296 | 297 | 298 | 299 | 300 | 520 301 | 460 302 | 66 303 | 17 304 | 305 | 306 | 307 | 308 | 75 309 | true 310 | true 311 | 312 | 313 | 314 | Accuracy 315 | 316 | 317 | 318 | 319 | 320 | 321 | 0 322 | 0 323 | 920 324 | 25 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | pushButton_9 334 | clicked() 335 | plainTextEdit 336 | clear() 337 | 338 | 339 | 813 340 | 507 341 | 342 | 343 | 441 344 | 276 345 | 346 | 347 | 348 | 349 | pushButton_9 350 | clicked() 351 | lineEdit_2 352 | clear() 353 | 354 | 355 | 813 356 | 507 357 | 358 | 359 | 493 360 | 498 361 | 362 | 363 | 364 | 365 | pushButton_9 366 | clicked() 367 | lineEdit 368 | clear() 369 | 370 | 371 | 813 372 | 507 373 | 374 | 375 | 438 376 | 45 377 | 378 | 379 | 380 | 381 | pushButton_9 382 | clicked() 
383 | label_9 384 | hide() 385 | 386 | 387 | 813 388 | 507 389 | 390 | 391 | 562 392 | 208 393 | 394 | 395 | 396 | 397 | pushButton_9 398 | clicked() 399 | r1 400 | hide() 401 | 402 | 403 | 813 404 | 507 405 | 406 | 407 | 657 408 | 237 409 | 410 | 411 | 412 | 413 | pushButton_9 414 | clicked() 415 | r2 416 | hide() 417 | 418 | 419 | 813 420 | 507 421 | 422 | 423 | 699 424 | 269 425 | 426 | 427 | 428 | 429 | pushButton_9 430 | clicked() 431 | r3 432 | hide() 433 | 434 | 435 | 813 436 | 507 437 | 438 | 439 | 741 440 | 301 441 | 442 | 443 | 444 | 445 | pushButton_9 446 | clicked() 447 | r4 448 | hide() 449 | 450 | 451 | 813 452 | 507 453 | 454 | 455 | 768 456 | 333 457 | 458 | 459 | 460 | 461 | pushButton_9 462 | clicked() 463 | r5 464 | hide() 465 | 466 | 467 | 813 468 | 507 469 | 470 | 471 | 714 472 | 390 473 | 474 | 475 | 476 | 477 | pushButton_8 478 | clicked() 479 | MainWindow 480 | close() 481 | 482 | 483 | 918 484 | 507 485 | 486 | 487 | 811 488 | 594 489 | 490 | 491 | 492 | 493 | pushButton_9 494 | clicked() 495 | acl 496 | hide() 497 | 498 | 499 | 824 500 | 522 501 | 502 | 503 | 579 504 | 487 505 | 506 | 507 | 508 | 509 | pushButton_9 510 | clicked() 511 | lf1 512 | hide() 513 | 514 | 515 | 797 516 | 528 517 | 518 | 519 | 661 520 | 486 521 | 522 | 523 | 524 | 525 | pushButton_9 526 | clicked() 527 | otags 528 | clear() 529 | 530 | 531 | 752 532 | 529 533 | 534 | 535 | 264 536 | 528 537 | 538 | 539 | 540 | 541 | pushButton_9 542 | clicked() 543 | lotags 544 | hide() 545 | 546 | 547 | 778 548 | 521 549 | 550 | 551 | 54 552 | 523 553 | 554 | 555 | 556 | 557 | pushButton_9 558 | clicked() 559 | otags 560 | hide() 561 | 562 | 563 | 754 564 | 516 565 | 566 | 567 | 395 568 | 512 569 | 570 | 571 | 572 | 573 | pushButton_9 574 | clicked() 575 | otags 576 | clear() 577 | 578 | 579 | 773 580 | 520 581 | 582 | 583 | 384 584 | 512 585 | 586 | 587 | 588 | 589 | ac 590 | clicked() 591 | acl 592 | show() 593 | 594 | 595 | 528 596 | 518 597 | 598 | 599 | 538 600 | 495 601 | 
602 | 603 | 604 | 605 | pushButton_3 606 | clicked() 607 | lf1 608 | show() 609 | 610 | 611 | 679 612 | 522 613 | 614 | 615 | 665 616 | 493 617 | 618 | 619 | 620 | 621 | pushButton_4 622 | clicked() 623 | label_9 624 | show() 625 | 626 | 627 | 651 628 | 184 629 | 630 | 631 | 626 632 | 220 633 | 634 | 635 | 636 | 637 | 638 | -------------------------------------------------------------------------------- /GUI_files/recgui.ui: -------------------------------------------------------------------------------- 1 | 2 | 3 | Dialog 4 | 5 | 6 | 7 | 0 8 | 0 9 | 722 10 | 430 11 | 12 | 13 | 14 | RECOMMENDATIONS 15 | 16 | 17 | 18 | GKbCl.pngGKbCl.png 19 | 20 | 21 | 22 | 23 | 270 24 | 10 25 | 201 26 | 20 27 | 28 | 29 | 30 | 31 | 14 32 | 75 33 | true 34 | 35 | 36 | 37 | RECOMMENDATION 38 | 39 | 40 | 41 | 42 | 43 | 0 44 | 30 45 | 711 46 | 361 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 75 55 | true 56 | 57 | 58 | 59 | ID 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 75 72 | true 73 | 74 | 75 | 76 | Title 77 | 78 | 79 | 80 | 81 | 82 | 83 | true 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 75 96 | true 97 | 98 | 99 | 100 | Body 101 | 102 | 103 | 104 | 105 | 106 | 107 | true 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 75 120 | true 121 | 122 | 123 | 124 | Tags 125 | 126 | 127 | 128 | 129 | 130 | 131 | true 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 300 145 | 400 146 | 96 147 | 26 148 | 149 | 150 | 151 | Exit 152 | 153 | 154 | 155 | 156 | 157 | 158 | pushButton 159 | clicked() 160 | Dialog 161 | close() 162 | 163 | 164 | 324 165 | 418 166 | 167 | 168 | 199 169 | 471 170 | 171 | 172 | 173 | 174 | 175 | -------------------------------------------------------------------------------- /MinePredict.py: -------------------------------------------------------------------------------- 1 | import time, re, json, numpy as np 2 | from sklearn.svm import LinearSVC 3 | from nltk.corpus import stopwords 4 | from 
sklearn.pipeline import Pipeline 5 | from nltk.stem.snowball import SnowballStemmer 6 | from sklearn.multiclass import OneVsRestClassifier 7 | from sklearn.feature_extraction.text import CountVectorizer 8 | from sklearn.feature_extraction.text import TfidfTransformer 9 | 10 | s=set(stopwords.words('english')) 11 | stemmer = SnowballStemmer("english", ignore_stopwords=True) 12 | fh=open('Tags.txt','r') 13 | fh2=open('cleaned.txt','r') 14 | fh3=open('TTags.txt', 'r') 15 | fh4=open('Tcleaned.txt','r') 16 | tags={} 17 | freq=[] 18 | count=0 19 | tagrows=fh.read().split('\n')[:500000] 20 | X=fh2.read().split('\n')[:500000] 21 | Y = [[] for i in range(len(X))] 22 | 23 | for line in tagrows: 24 | for tag in line.split(): 25 | if tag in tags: 26 | tags[tag]+=1 27 | else: 28 | tags[tag]=1 29 | #34945 unique tags in 10 lakh posts 30 | 31 | for tag in sorted(tags,key=lambda tag:tags[tag], reverse=True): 32 | if tags[tag] > 800: 33 | count += 1 34 | freq.append(tag) 35 | else: 36 | break 37 | 38 | print "Training..." 39 | for x,tag in enumerate(freq): 40 | i=0 41 | for row in tagrows: 42 | if tag in row.split(): 43 | Y[i].append(tag) 44 | i=i+1 45 | 46 | 47 | classifier = Pipeline([ 48 | ('vectorizer', CountVectorizer()), 49 | ('tfidf', TfidfTransformer()), 50 | ('clf', OneVsRestClassifier(LinearSVC(class_weight='auto'), n_jobs = -2))]) 51 | classifier.fit(X,Y) 52 | print "Ready..." 53 | 54 | 55 | while True: 56 | T=[] 57 | words = fh.readline().lower().replace(' \n','') 58 | T.append(words) 59 | print '\n',classifier.predict(T),fh3.readline(),'\n' 60 | 61 | 62 | print "Exiting..." 
63 | fh.close() 64 | fh2.close() 65 | -------------------------------------------------------------------------------- /MineStackOverflow.py: -------------------------------------------------------------------------------- 1 | import stackexchange, json, HTMLParser, re, nltk 2 | from nltk.corpus import stopwords 3 | from bs4 import BeautifulSoup 4 | from collections import defaultdict 5 | 6 | s=set(stopwords.words('english')) 7 | fh = open("stackoverflow.txt","w") 8 | html_parser= HTMLParser.HTMLParser() 9 | so = stackexchange.Site(stackexchange.StackOverflow) 10 | so.be_inclusive() 11 | 12 | results=[] 13 | d=defaultdict() 14 | i=0 15 | for question in so.questions(pagesize=10): 16 | try: 17 | if(i>20): 18 | break 19 | q=str(question.title.lower()) 20 | b=str(question.body.lower()) 21 | _id=question.id 22 | #c=[] 23 | words=[] 24 | uW=[] 25 | bd='' 26 | z=str(html_parser.unescape(str(b.encode('utf-8')))) 27 | f=0 28 | while f", f) 30 | y=z.find("

",f) 31 | if x>=0 and y>=0: 32 | s1=[m.start() for m in re.finditer('', z[x+3:y])] 33 | s2=[m.start() for m in re.finditer('', z[x+3:y])] 34 | s3=[m.start() for m in re.finditer('', z[x+3:y])] 36 | r=0 37 | while r=~,;`{}|]',' ',str(q+' '+bd)) 50 | words=re.sub('\n',' ',words) 51 | words=re.sub('_','-',words) 52 | pattern = re.compile(r'(\s)\-+(\s)') # ' ---- ' -> ' ' 53 | words = pattern.sub(r'\1', words) 54 | pattern = re.compile(r'\.+(\s)') 55 | words = pattern.sub(r'\1', words) 56 | pattern = re.compile(r'(\s)\.+(\s)') 57 | words = pattern.sub(r'\1', words) 58 | words = re.sub("'",'', words) 59 | words= re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", words)# asbcd 111 sad 60 | words= re.sub(" # ",' ',words) 61 | #clean_text=filter(lambda w: not w in s,words.split()) 62 | #uniqueWords=[] 63 | #for word in clean_text: 64 | # if not (word,clean_text.count(word)) in uniqueWords: 65 | # uniqueWords.append((word,clean_text.count(word))) 66 | 67 | d[_id]=words 68 | #results.append(d) 69 | print i+1,"Post Collected-",_id 70 | i=i+1 71 | 72 | except Exception,e: 73 | print "EXCEPTION: ",str(e) 74 | pass 75 | 76 | fh.write(json.dumps(d)) 77 | fh.close() 78 | 79 | -------------------------------------------------------------------------------- /Prediction.py: -------------------------------------------------------------------------------- 1 | import time, re, json, numpy as np 2 | from sklearn.svm import LinearSVC 3 | from nltk.corpus import stopwords 4 | from sklearn.pipeline import Pipeline 5 | from nltk.stem.snowball import SnowballStemmer 6 | from sklearn.multiclass import OneVsRestClassifier 7 | from sklearn.feature_extraction.text import CountVectorizer 8 | from sklearn.feature_extraction.text import TfidfTransformer 9 | 10 | s=set(stopwords.words('english')) 11 | stemmer = SnowballStemmer("english", ignore_stopwords=True) 12 | fh=open('Tags.txt','r') 13 | fh2=open('cleaned.txt','r') 14 | tags={} 15 | freq=[] 16 | count=0 17 | tagrows=fh.read().split('\n')[:500000] 18 | 
X=fh2.read().split('\n')[:500000] 19 | Y = [[] for i in range(len(X))] 20 | 21 | for line in tagrows: 22 | for tag in line.split(): 23 | if tag in tags: 24 | tags[tag]+=1 25 | else: 26 | tags[tag]=1 27 | #34945 unique tags in 10 lakh posts 28 | 29 | for tag in sorted(tags,key=lambda tag:tags[tag], reverse=True): 30 | if tags[tag] > 800: 31 | count += 1 32 | freq.append(tag) 33 | else: 34 | break 35 | 36 | print "Training..." 37 | for x,tag in enumerate(freq): 38 | i=0 39 | for row in tagrows: 40 | if tag in row.split(): 41 | Y[i].append(tag) 42 | i=i+1 43 | 44 | 45 | classifier = Pipeline([ 46 | ('vectorizer', CountVectorizer()), 47 | ('tfidf', TfidfTransformer()), 48 | ('clf', OneVsRestClassifier(LinearSVC(class_weight='auto'), n_jobs = -2))]) 49 | classifier.fit(X,Y) 50 | print "Ready..." 51 | 52 | 53 | while True: 54 | T=[] 55 | words = (raw_input("Enter Your Question: ")).lower() 56 | words = re.sub('\n',' ',words) 57 | words = re.sub('[!@%^&*()$:"?<>=~,;`{}|]',' ',words) 58 | words = re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?]))''',' ',words) 59 | words = re.sub('_','-',words) 60 | words = words.replace('[',' ') 61 | words = words.replace(']',' ') 62 | words = words.replace('/',' ') 63 | words = words.replace('\\',' ') 64 | words = re.sub(r'(\s)\-+(\s)',r'\1', words) 65 | words = re.sub(r'\.+(\s)',r'\1', words) 66 | words = re.sub(r'\.+\.(\w)',r'\1', words) 67 | words = re.sub(r'(\s)\.+(\s)',r'\1', words) 68 | words = re.sub("'",'', words) 69 | words = re.sub(r'\s\d+[\.\-\+]+\d+|\s[\.\-\+]+\d+|\s+\d+\s+|\s\d+[\+\-]+',' ',words) 70 | words = re.sub("^\d+\s|\s\d+\s|\s\d+$"," ", words) 71 | words = re.sub(r'\s\#+\s|\s\++\s',' ',words) 72 | stemmed_words = [stemmer.stem(word) for word in words.split()] 73 | clean_text = filter(lambda w: not w in s,stemmed_words) 74 | words='' 75 | for word in clean_text: 76 | words+=word+' ' 
77 | T.append(words) 78 | print '\n',classifier.predict(T),'\n' 79 | c=input("Continue?\nPress 0 to Quit: ") 80 | if c is 0: 81 | break 82 | 83 | print "Exiting..." 84 | fh.close() 85 | fh2.close() 86 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # StackOverflow-Recommendation-System 2 | Python scripts to mine data from StackOverflow, preprocess it, predict which tags belong to a question/post using a multi-label LinearSVC, and recommend similar questions using cosine similarity. 3 | 4 | 1. Mined StackOverflow and pre-processed the data. 5 | 2. Created a supervised-learning classification model to predict tags. 6 | 3. Created a similarity model to suggest similar questions by applying various document-similarity algorithms. 7 | 4. Combined the functionality in a Qt application. 8 | 9 | ![Alt text](/so.png "Working GUI") 10 | 11 | -------------------------------------------------------------------------------- /Remove_Duplicates.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from collections import defaultdict 3 | rows = csv.DictReader(open("Train2.csv", "rb")) 4 | writer = csv.writer(open("new.csv", "wb")) 5 | newrows = defaultdict() 6 | for row in rows: 7 | x=row['Id'] 8 | newrows[x]=newrows[x]+1 9 | if newrows[row['Id']]==1: 10 | writer.writerow(row) 11 | else: 12 | print "duplicate",row['Id'] 13 | 14 | writer.writerows(newrows) 15 | -------------------------------------------------------------------------------- /RunApp.py: -------------------------------------------------------------------------------- 1 | import time, re, json, numpy as np, sys, csv 2 | from PyQt5.QtGui import * 3 | from PyQt5.Qt import * 4 | try: 5 | from PyQt4.QtCore import QString 6 | except ImportError: 7 | QString = str 8 | from maingui import Ui_MainWindow 9 | from recgui import Ui_Dialog 10 | 
from sklearn.svm import LinearSVC 11 | from nltk.corpus import stopwords 12 | from sklearn.pipeline import Pipeline 13 | from sklearn.externals import joblib 14 | from nltk.stem.snowball import SnowballStemmer 15 | from sklearn.multiclass import OneVsRestClassifier 16 | from sklearn.metrics import f1_score 17 | from sklearn.metrics import accuracy_score 18 | from sklearn.metrics.pairwise import cosine_similarity 19 | from sklearn.feature_extraction.text import CountVectorizer 20 | from sklearn.feature_extraction.text import TfidfTransformer 21 | import pandas as pd 22 | 23 | s=set(stopwords.words('english')) 24 | stemmer = SnowballStemmer('english', ignore_stopwords=True) 25 | fh=open('Tags.txt','r') 26 | fh2=open('cleaned.txt','r') 27 | _i=[] 28 | _t=[] 29 | _T=[] 30 | _b=[] 31 | count=0 32 | tagrows=fh.read().split('\n')[:500000] 33 | checktags=[] 34 | X=fh2.read().split('\n')[:500000] 35 | classifier = joblib.load('clf.txt') 36 | multibin = joblib.load('multibin.txt') 37 | vectorizer_2=CountVectorizer() 38 | 39 | class SO(QMainWindow,Ui_MainWindow): 40 | def __init__(self, parent=None): 41 | super(SO, self).__init__(parent) 42 | self.setupUi(self) 43 | self.pushButton.clicked.connect(self.predictTags) 44 | self.pushButton_4.clicked.connect(self.recommend) 45 | self.pushButton_4.hide() 46 | self.df=pd.read_csv("trainset.csv") 47 | self.r1.clicked.connect(self.newwindowr1) 48 | self.r2.clicked.connect(self.newwindowr2) 49 | self.r3.clicked.connect(self.newwindowr3) 50 | self.r4.clicked.connect(self.newwindowr4) 51 | self.r5.clicked.connect(self.newwindowr5) 52 | #self.ac.clicked.connect(self.accuracy) 53 | #self.pushButton_3.clicked.connect(self.f1score) 54 | self.r1.hide() 55 | self.r2.hide() 56 | self.r3.hide() 57 | self.r4.hide() 58 | self.r5.hide() 59 | self.lotags.hide() 60 | self.otags.hide() 61 | self.ac.hide() 62 | self.acl.hide() 63 | self.pushButton_3.hide() 64 | self.lf1.hide() 65 | self.label_9.hide() 66 | 67 | def newwindowr1(self): 68 | 
self.new=RCMD(self) 69 | self.new.display(str(self.r1.text())) 70 | def newwindowr2(self): 71 | self.new=RCMD(self) 72 | self.new.display(str(self.r2.text())) 73 | def newwindowr3(self): 74 | self.new=RCMD(self) 75 | self.new.display(str(self.r3.text())) 76 | def newwindowr4(self): 77 | self.new=RCMD(self) 78 | self.new.display(str(self.r4.text())) 79 | def newwindowr5(self): 80 | self.new=RCMD(self) 81 | self.new.display(str(self.r5.text())) 82 | ''' 83 | def f1score(self): 84 | otags=str(self.otags.text()).split() 85 | comm=set(otags)&set(self.tagarr) 86 | commlist=list(comm) 87 | if len(comm) 0: 122 | B=vectorizer_2.transform([X[i]]) 123 | cossim.append([i+1,cosine_similarity(A,B)[0][0]]) 124 | else: 125 | if len(set(self.tagarr)&set(Tags)) > 1: 126 | B=vectorizer_2.transform([X[i]]) 127 | cossim.append([i+1,cosine_similarity(A,B)[0][0]]) 128 | #QApplication.processEvents() 129 | cossim.sort(key=lambda x: x[1], reverse=True) 130 | indexes=[x[0] for x in cossim[:5]] 131 | larr=[self.r1,self.r2,self.r3,self.r4,self.r5] 132 | temp=self.df[self.df.Id.isin(indexes)].reset_index() 133 | #print row['Id'],'--',row['Title'],'--',row['Tags'] 134 | for i, row in temp.iterrows(): 135 | larr[i].setText(QString(row['Title'])) 136 | _i.insert(0,row['Id']) 137 | _t.insert(0,row['Title']) 138 | _T.insert(0,row['Tags']) 139 | _b.insert(0,row['Body']) 140 | larr[i].show() 141 | 142 | def predictTags(self): 143 | self.T=[] 144 | words = str(self.lineEdit.text())+' '+str(self.plainTextEdit.toPlainText()) 145 | words = re.sub('\n',' ',words) 146 | words = re.sub('[!@%^&*()$:"?<>=~,;`{}|]',' ',words) 147 | words = re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?]))''',' ',words) 148 | words = re.sub('_','-',words) 149 | words = words.replace('[',' ') 150 | words = words.replace(']',' ') 151 | words = words.replace('/',' ') 152 | words = 
words.replace('\\',' ') 153 | words = re.sub(r'(\s)\-+(\s)',r'\1', words) 154 | words = re.sub(r'\.+(\s)',r'\1', words) 155 | words = re.sub(r'\.+\.(\w)',r'\1', words) 156 | words = re.sub(r'(\s)\.+(\s)',r'\1', words) 157 | words = re.sub("'",'', words) 158 | words = re.sub(r'\s\d+[\.\-\+]+\d+|\s[\.\-\+]+\d+|\s+\d+\s+|\s\d+[\+\-]+',' ',words) 159 | words = re.sub("^\d+\s|\s\d+\s|\s\d+$"," ", words) 160 | words = re.sub(r'\s\#+\s|\s\++\s',' ',words) 161 | stemmed_words = [stemmer.stem(word) for word in words.split()] 162 | clean_text = filter(lambda w: not w in s,stemmed_words) 163 | words='' 164 | for word in clean_text: 165 | words+=word+' ' 166 | self.T.append(words) 167 | results=classifier.predict(self.T) 168 | results=multibin.inverse_transform(results) 169 | print '\n',results,'\n' 170 | buff='' 171 | self.tagarr=[] 172 | for result in results[0]: 173 | buff=buff+QString(result)+' ; ' 174 | self.tagarr.append(result) 175 | self.lineEdit_2.setText(buff[:len(buff)-3]) 176 | self.recommend() 177 | 178 | class RCMD(QDialog,Ui_Dialog): 179 | def __init__(self, parent=None): 180 | super(RCMD, self).__init__(parent) 181 | self.setupUi(self) 182 | def display(self,index): 183 | x=_i.index(index) 184 | self.label.setText('id: '+QString(_i[x])) 185 | self.lineEdit.setText(QString(_t[x])) 186 | self.lineEdit_2.setText(QString(_T[x])) 187 | self.plainTextEdit.setPlainText(QString(_b[x])) 188 | self.show() 189 | 190 | def main(argv): 191 | for line in tagrows: 192 | checktags.append(line.split()) 193 | app = QApplication(argv) 194 | window=SO() 195 | window.show() 196 | retval = app.exec_() 197 | sys.exit(retval) 198 | 199 | if __name__ == '__main__': 200 | main(sys.argv) 201 | -------------------------------------------------------------------------------- /StackOverflow_CSV_Preprocessing.py: -------------------------------------------------------------------------------- 1 | import json, HTMLParser, re, nltk, csv, sys 2 | from bs4 import BeautifulSoup 3 | from 
nltk.corpus import stopwords 4 | from collections import defaultdict 5 | from nltk.stem.snowball import SnowballStemmer 6 | 7 | ft=open('Tags.txt','w') 8 | fh=open('cleaned.txt','w') 9 | fe=open('error_log.txt','w') 10 | fi=open("trainset.csv", "rb") 11 | s=set(stopwords.words('english')) 12 | reader = csv.DictReader(fi) 13 | html_parser= HTMLParser.HTMLParser() 14 | i=0 15 | 16 | #no. of rows 5547983 17 | 18 | for row in reader: 19 | try: 20 | z=str(row['Body']).lower() 21 | f=0 22 | bd='' 23 | while f",f) 25 | y=z.find("

",f) 26 | bd='' 27 | if x>=0 and y>=0: 28 | s1=[] 29 | s2=[] 30 | s3=[] 31 | s4=[] 32 | s1=[m.start() for m in re.finditer('', z[x:y])] 33 | s2=[m.start() for m in re.finditer('', z[x:y])] 34 | r=0 35 | lh=0 36 | while r",f) 42 | s3=[m.start() for m in re.finditer('
', z[x:y])] 44 | lh=0 45 | while r=~,;`{}|]',' ',str(row['Title']).lower()+' '+bd) 54 | words=re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?]))''',' ',words) 55 | words=re.sub('\n',' ',words) 56 | words=re.sub('_','-',words) 57 | words=words.replace('[',' ') 58 | words=words.replace(']',' ') 59 | words=words.replace('/',' ') 60 | words=words.replace('\\',' ') 61 | words = re.sub(r'(\s)\-+(\s)',r'\1', words) 62 | words = re.sub(r'\.+(\s)',r'\1', words) 63 | words = re.sub(r'\.+\.(\w)',r'\1', words) 64 | words = re.sub(r'(\s)\.+(\s)',r'\1', words) 65 | words = re.sub("'",'', words) 66 | words = re.sub(r'\s\d+[\.\-\+]+\d+|\s[\.\-\+]+\d+|\s+\d+\s+|\s\d+[\+\-]+',' ',words) 67 | words= re.sub("^\d+\s|\s\d+\s|\s\d+$"," ", words) 68 | words= re.sub(r'\s\#+\s|\s\++\s',' ',words) 69 | stemmer = SnowballStemmer("english", ignore_stopwords=True) 70 | stemmed_words = [stemmer.stem(word) for word in words.split()] 71 | clean_text=filter(lambda w: not w in s,stemmed_words) 72 | words='' 73 | for word in clean_text: 74 | words+=word+' ' 75 | fh.write(words+'\n') 76 | ft.write(row['Tags']+'\n') 77 | print "Post Cleaned-",row['Id'] 78 | i=i+1 79 | except Exception,e: 80 | print "EXCEPTION: ",str(e) 81 | fe.write(str(row['Id'])+' '+str(e)+'\n') 82 | error.append(row['Id']) 83 | fh.write('\n') 84 | ft.write('\n') 85 | i=i-1 86 | pass 87 | 88 | fh.close() 89 | fi.close() 90 | ft.close() 91 | fe.close() 92 | print 93 | print 94 | print 95 | print "CLEANING COMPLETED" 96 | print i," Posts Cleaned." 
97 | print 98 | print "Error in the Following: ",error 99 | -------------------------------------------------------------------------------- /maingui.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Form implementation generated from reading ui file 'maingui.ui' 4 | # 5 | # Created by: PyQt5 UI code generator 5.6 6 | # 7 | # WARNING! All changes made in this file will be lost! 8 | 9 | from PyQt5 import QtCore, QtGui, QtWidgets 10 | 11 | class Ui_MainWindow(object): 12 | def setupUi(self, MainWindow): 13 | MainWindow.setObjectName("MainWindow") 14 | MainWindow.setEnabled(True) 15 | MainWindow.resize(920, 561) 16 | icon = QtGui.QIcon() 17 | icon.addPixmap(QtGui.QPixmap("GKbCl.png"), QtGui.QIcon.Normal, QtGui.QIcon.Off) 18 | MainWindow.setWindowIcon(icon) 19 | self.centralwidget = QtWidgets.QWidget(MainWindow) 20 | self.centralwidget.setObjectName("centralwidget") 21 | self.layoutWidget = QtWidgets.QWidget(self.centralwidget) 22 | self.layoutWidget.setGeometry(QtCore.QRect(15, 10, 481, 501)) 23 | self.layoutWidget.setObjectName("layoutWidget") 24 | self.verticalLayout_2 = QtWidgets.QVBoxLayout(self.layoutWidget) 25 | self.verticalLayout_2.setContentsMargins(0, 0, 0, 0) 26 | self.verticalLayout_2.setObjectName("verticalLayout_2") 27 | self.verticalLayout = QtWidgets.QVBoxLayout() 28 | self.verticalLayout.setObjectName("verticalLayout") 29 | self.horizontalLayout = QtWidgets.QHBoxLayout() 30 | self.horizontalLayout.setObjectName("horizontalLayout") 31 | self.label = QtWidgets.QLabel(self.layoutWidget) 32 | font = QtGui.QFont() 33 | font.setBold(True) 34 | font.setWeight(75) 35 | self.label.setFont(font) 36 | self.label.setObjectName("label") 37 | self.horizontalLayout.addWidget(self.label) 38 | self.lineEdit = QtWidgets.QLineEdit(self.layoutWidget) 39 | self.lineEdit.setObjectName("lineEdit") 40 | self.horizontalLayout.addWidget(self.lineEdit) 41 | 
self.verticalLayout.addLayout(self.horizontalLayout) 42 | self.horizontalLayout_2 = QtWidgets.QHBoxLayout() 43 | self.horizontalLayout_2.setObjectName("horizontalLayout_2") 44 | self.label_2 = QtWidgets.QLabel(self.layoutWidget) 45 | font = QtGui.QFont() 46 | font.setBold(True) 47 | font.setWeight(75) 48 | self.label_2.setFont(font) 49 | self.label_2.setObjectName("label_2") 50 | self.horizontalLayout_2.addWidget(self.label_2) 51 | self.plainTextEdit = QtWidgets.QPlainTextEdit(self.layoutWidget) 52 | self.plainTextEdit.setObjectName("plainTextEdit") 53 | self.horizontalLayout_2.addWidget(self.plainTextEdit) 54 | self.verticalLayout.addLayout(self.horizontalLayout_2) 55 | self.verticalLayout_2.addLayout(self.verticalLayout) 56 | self.pushButton = QtWidgets.QPushButton(self.layoutWidget) 57 | font = QtGui.QFont() 58 | font.setBold(True) 59 | font.setWeight(75) 60 | self.pushButton.setFont(font) 61 | self.pushButton.setObjectName("pushButton") 62 | self.verticalLayout_2.addWidget(self.pushButton) 63 | self.horizontalLayout_3 = QtWidgets.QHBoxLayout() 64 | self.horizontalLayout_3.setObjectName("horizontalLayout_3") 65 | self.label_3 = QtWidgets.QLabel(self.layoutWidget) 66 | font = QtGui.QFont() 67 | font.setBold(True) 68 | font.setWeight(75) 69 | self.label_3.setFont(font) 70 | self.label_3.setObjectName("label_3") 71 | self.horizontalLayout_3.addWidget(self.label_3) 72 | self.lineEdit_2 = QtWidgets.QLineEdit(self.layoutWidget) 73 | self.lineEdit_2.setObjectName("lineEdit_2") 74 | self.horizontalLayout_3.addWidget(self.lineEdit_2) 75 | self.verticalLayout_2.addLayout(self.horizontalLayout_3) 76 | self.horizontalLayout_5 = QtWidgets.QHBoxLayout() 77 | self.horizontalLayout_5.setObjectName("horizontalLayout_5") 78 | self.lotags = QtWidgets.QLabel(self.layoutWidget) 79 | font = QtGui.QFont() 80 | font.setBold(True) 81 | font.setWeight(75) 82 | self.lotags.setFont(font) 83 | self.lotags.setObjectName("lotags") 84 | self.horizontalLayout_5.addWidget(self.lotags) 85 | 
self.otags = QtWidgets.QLineEdit(self.layoutWidget) 86 | self.otags.setObjectName("otags") 87 | self.horizontalLayout_5.addWidget(self.otags) 88 | self.verticalLayout_2.addLayout(self.horizontalLayout_5) 89 | self.layoutWidget1 = QtWidgets.QWidget(self.centralwidget) 90 | self.layoutWidget1.setGeometry(QtCore.QRect(500, 150, 387, 218)) 91 | self.layoutWidget1.setObjectName("layoutWidget1") 92 | self.verticalLayout_4 = QtWidgets.QVBoxLayout(self.layoutWidget1) 93 | self.verticalLayout_4.setContentsMargins(0, 0, 0, 0) 94 | self.verticalLayout_4.setObjectName("verticalLayout_4") 95 | self.pushButton_4 = QtWidgets.QPushButton(self.layoutWidget1) 96 | font = QtGui.QFont() 97 | font.setBold(True) 98 | font.setWeight(75) 99 | self.pushButton_4.setFont(font) 100 | self.pushButton_4.setObjectName("pushButton_4") 101 | self.verticalLayout_4.addWidget(self.pushButton_4) 102 | self.label_9 = QtWidgets.QLabel(self.layoutWidget1) 103 | self.label_9.setEnabled(True) 104 | font = QtGui.QFont() 105 | font.setPointSize(14) 106 | font.setBold(True) 107 | font.setWeight(75) 108 | self.label_9.setFont(font) 109 | self.label_9.setObjectName("label_9") 110 | self.verticalLayout_4.addWidget(self.label_9) 111 | self.verticalLayout_3 = QtWidgets.QVBoxLayout() 112 | self.verticalLayout_3.setObjectName("verticalLayout_3") 113 | self.r1 = QtWidgets.QPushButton(self.layoutWidget1) 114 | self.r1.setText("") 115 | self.r1.setObjectName("r1") 116 | self.verticalLayout_3.addWidget(self.r1) 117 | self.r2 = QtWidgets.QPushButton(self.layoutWidget1) 118 | self.r2.setText("") 119 | self.r2.setObjectName("r2") 120 | self.verticalLayout_3.addWidget(self.r2) 121 | self.r3 = QtWidgets.QPushButton(self.layoutWidget1) 122 | self.r3.setText("") 123 | self.r3.setObjectName("r3") 124 | self.verticalLayout_3.addWidget(self.r3) 125 | self.r4 = QtWidgets.QPushButton(self.layoutWidget1) 126 | self.r4.setText("") 127 | self.r4.setObjectName("r4") 128 | self.verticalLayout_3.addWidget(self.r4) 129 | self.r5 = 
QtWidgets.QPushButton(self.layoutWidget1) 130 | self.r5.setText("") 131 | self.r5.setObjectName("r5") 132 | self.verticalLayout_3.addWidget(self.r5) 133 | self.verticalLayout_4.addLayout(self.verticalLayout_3) 134 | self.label_4 = QtWidgets.QLabel(self.centralwidget) 135 | self.label_4.setGeometry(QtCore.QRect(510, 10, 381, 131)) 136 | self.label_4.setText("") 137 | self.label_4.setPixmap(QtGui.QPixmap("stackoverflow-logo.png")) 138 | self.label_4.setScaledContents(True) 139 | self.label_4.setObjectName("label_4") 140 | self.layoutWidget2 = QtWidgets.QWidget(self.centralwidget) 141 | self.layoutWidget2.setGeometry(QtCore.QRect(500, 480, 421, 30)) 142 | self.layoutWidget2.setObjectName("layoutWidget2") 143 | self.horizontalLayout_6 = QtWidgets.QHBoxLayout(self.layoutWidget2) 144 | self.horizontalLayout_6.setContentsMargins(0, 0, 0, 0) 145 | self.horizontalLayout_6.setObjectName("horizontalLayout_6") 146 | self.ac = QtWidgets.QPushButton(self.layoutWidget2) 147 | self.ac.setObjectName("ac") 148 | self.horizontalLayout_6.addWidget(self.ac) 149 | self.pushButton_3 = QtWidgets.QPushButton(self.layoutWidget2) 150 | self.pushButton_3.setObjectName("pushButton_3") 151 | self.horizontalLayout_6.addWidget(self.pushButton_3) 152 | self.horizontalLayout_4 = QtWidgets.QHBoxLayout() 153 | self.horizontalLayout_4.setObjectName("horizontalLayout_4") 154 | self.pushButton_9 = QtWidgets.QPushButton(self.layoutWidget2) 155 | self.pushButton_9.setObjectName("pushButton_9") 156 | self.horizontalLayout_4.addWidget(self.pushButton_9) 157 | self.pushButton_8 = QtWidgets.QPushButton(self.layoutWidget2) 158 | self.pushButton_8.setObjectName("pushButton_8") 159 | self.horizontalLayout_4.addWidget(self.pushButton_8) 160 | self.horizontalLayout_6.addLayout(self.horizontalLayout_4) 161 | self.lf1 = QtWidgets.QLabel(self.centralwidget) 162 | self.lf1.setGeometry(QtCore.QRect(640, 460, 66, 17)) 163 | font = QtGui.QFont() 164 | font.setBold(True) 165 | font.setItalic(True) 166 | font.setWeight(75) 
167 | self.lf1.setFont(font) 168 | self.lf1.setObjectName("lf1") 169 | self.acl = QtWidgets.QLabel(self.centralwidget) 170 | self.acl.setGeometry(QtCore.QRect(520, 460, 66, 17)) 171 | font = QtGui.QFont() 172 | font.setBold(True) 173 | font.setItalic(True) 174 | font.setWeight(75) 175 | self.acl.setFont(font) 176 | self.acl.setObjectName("acl") 177 | MainWindow.setCentralWidget(self.centralwidget) 178 | self.menubar = QtWidgets.QMenuBar(MainWindow) 179 | self.menubar.setGeometry(QtCore.QRect(0, 0, 920, 25)) 180 | self.menubar.setObjectName("menubar") 181 | MainWindow.setMenuBar(self.menubar) 182 | self.statusbar = QtWidgets.QStatusBar(MainWindow) 183 | self.statusbar.setObjectName("statusbar") 184 | MainWindow.setStatusBar(self.statusbar) 185 | 186 | self.retranslateUi(MainWindow) 187 | self.pushButton_9.clicked.connect(self.plainTextEdit.clear) 188 | self.pushButton_9.clicked.connect(self.lineEdit_2.clear) 189 | self.pushButton_9.clicked.connect(self.lineEdit.clear) 190 | self.pushButton_9.clicked.connect(self.label_9.hide) 191 | self.pushButton_9.clicked.connect(self.r1.hide) 192 | self.pushButton_9.clicked.connect(self.r2.hide) 193 | self.pushButton_9.clicked.connect(self.r3.hide) 194 | self.pushButton_9.clicked.connect(self.r4.hide) 195 | self.pushButton_9.clicked.connect(self.r5.hide) 196 | self.pushButton_8.clicked.connect(MainWindow.close) 197 | self.pushButton_9.clicked.connect(self.acl.hide) 198 | self.pushButton_9.clicked.connect(self.lf1.hide) 199 | self.pushButton_9.clicked.connect(self.otags.clear) 200 | self.pushButton_9.clicked.connect(self.lotags.hide) 201 | self.pushButton_9.clicked.connect(self.otags.hide) 202 | self.pushButton_9.clicked.connect(self.otags.clear) 203 | self.ac.clicked.connect(self.acl.show) 204 | self.pushButton_3.clicked.connect(self.lf1.show) 205 | self.pushButton_4.clicked.connect(self.label_9.show) 206 | QtCore.QMetaObject.connectSlotsByName(MainWindow) 207 | 208 | def retranslateUi(self, MainWindow): 209 | _translate = 
QtCore.QCoreApplication.translate 210 | MainWindow.setWindowTitle(_translate("MainWindow", "StackOverflow Application")) 211 | self.label.setText(_translate("MainWindow", "Title")) 212 | self.label_2.setText(_translate("MainWindow", "Body")) 213 | self.pushButton.setText(_translate("MainWindow", "Predict")) 214 | self.label_3.setText(_translate("MainWindow", "Tags ")) 215 | self.lotags.setText(_translate("MainWindow", "Original Tags")) 216 | self.pushButton_4.setText(_translate("MainWindow", "Get Recommendations")) 217 | self.label_9.setText(_translate("MainWindow", " RECOMMENDATIONS")) 218 | self.ac.setText(_translate("MainWindow", "Get Accuracy")) 219 | self.pushButton_3.setText(_translate("MainWindow", "Get F1 Score")) 220 | self.pushButton_9.setText(_translate("MainWindow", "Clear")) 221 | self.pushButton_8.setText(_translate("MainWindow", "Exit")) 222 | self.lf1.setText(_translate("MainWindow", "F1 Score")) 223 | self.acl.setText(_translate("MainWindow", "Accuracy")) 224 | 225 | -------------------------------------------------------------------------------- /recgui.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Form implementation generated from reading ui file 'recgui.ui' 4 | # 5 | # Created by: PyQt5 UI code generator 5.6 6 | # 7 | # WARNING! All changes made in this file will be lost! 
from PyQt5 import QtCore, QtGui, QtWidgets


class Ui_Dialog(object):
    """Widget tree of the read-only "RECOMMENDATION" dialog (from recgui.ui).

    The dialog holds a heading, an "ID" caption and three captioned,
    read-only fields (title, body, tags) stacked vertically, plus an "Exit"
    button wired to close the dialog.
    """

    @staticmethod
    def _bold_font(point_size=None):
        """Return a bold QFont of weight 75, optionally with a point size."""
        bold = QtGui.QFont()
        if point_size is not None:
            bold.setPointSize(point_size)
        bold.setBold(True)
        bold.setWeight(75)
        return bold

    def _add_field_row(self, layout_attr, caption_attr, field_attr, field_class):
        """Append one "caption + read-only field" row to self.verticalLayout.

        The row layout, the caption label and the field are stored on self
        under the given attribute names, which double as their Qt object
        names.
        """
        row = QtWidgets.QHBoxLayout()
        row.setObjectName(layout_attr)
        setattr(self, layout_attr, row)

        caption = QtWidgets.QLabel(self.layoutWidget)
        caption.setFont(self._bold_font())
        caption.setObjectName(caption_attr)
        setattr(self, caption_attr, caption)
        row.addWidget(caption)

        field = field_class(self.layoutWidget)
        field.setReadOnly(True)
        field.setObjectName(field_attr)
        setattr(self, field_attr, field)
        row.addWidget(field)

        self.verticalLayout.addLayout(row)

    def setupUi(self, Dialog):
        """Create all child widgets and layouts on Dialog and wire signals."""
        Dialog.setObjectName("Dialog")
        Dialog.resize(722, 430)
        window_icon = QtGui.QIcon()
        window_icon.addPixmap(
            QtGui.QPixmap("GKbCl.png"), QtGui.QIcon.Normal, QtGui.QIcon.Off)
        Dialog.setWindowIcon(window_icon)

        # Heading, placed directly on the dialog with a fixed geometry.
        heading = QtWidgets.QLabel(Dialog)
        heading.setGeometry(QtCore.QRect(270, 10, 201, 20))
        heading.setFont(self._bold_font(point_size=14))
        heading.setObjectName("label_2")
        self.label_2 = heading

        # Container widget that hosts every layout-managed child.
        container = QtWidgets.QWidget(Dialog)
        container.setGeometry(QtCore.QRect(0, 30, 711, 361))
        container.setObjectName("layoutWidget")
        self.layoutWidget = container

        outer = QtWidgets.QVBoxLayout(container)
        outer.setContentsMargins(0, 0, 0, 0)
        outer.setObjectName("verticalLayout_2")
        self.verticalLayout_2 = outer

        id_caption = QtWidgets.QLabel(container)
        id_caption.setFont(self._bold_font())
        id_caption.setObjectName("label")
        self.label = id_caption
        outer.addWidget(id_caption)

        # Three captioned rows, added top to bottom: title, body, tags.
        self.verticalLayout = QtWidgets.QVBoxLayout()
        self.verticalLayout.setObjectName("verticalLayout")
        field_rows = (
            ("horizontalLayout", "label_5", "lineEdit",
             QtWidgets.QLineEdit),
            ("horizontalLayout_2", "label_3", "plainTextEdit",
             QtWidgets.QPlainTextEdit),
            ("horizontalLayout_3", "label_4", "lineEdit_2",
             QtWidgets.QLineEdit),
        )
        for layout_attr, caption_attr, field_attr, field_class in field_rows:
            self._add_field_row(
                layout_attr, caption_attr, field_attr, field_class)
        outer.addLayout(self.verticalLayout)

        close_button = QtWidgets.QPushButton(Dialog)
        close_button.setGeometry(QtCore.QRect(300, 400, 96, 26))
        close_button.setObjectName("pushButton")
        self.pushButton = close_button

        self.retranslateUi(Dialog)
        self.pushButton.clicked.connect(Dialog.close)
        QtCore.QMetaObject.connectSlotsByName(Dialog)

    def retranslateUi(self, Dialog):
        """Set every user-visible string through Qt's translation hook."""
        _translate = QtCore.QCoreApplication.translate
        Dialog.setWindowTitle(_translate("Dialog", "RECOMMENDATIONS"))
        # Literal calls are kept one per widget so translation-extraction
        # tools that scan for _translate(...) literals still see each string.
        self.label_2.setText(_translate("Dialog", "RECOMMENDATION"))
        self.label.setText(_translate("Dialog", "ID"))
        self.label_5.setText(_translate("Dialog", "Title"))
        self.label_3.setText(_translate("Dialog", "Body"))
        self.label_4.setText(_translate("Dialog", "Tags"))
        self.pushButton.setText(_translate("Dialog", "Exit"))
"""Train the tag classifier and persist it next to its label binarizer.

Reads one whitespace-separated tag row per line from ``Tags.txt`` and one
cleaned document per line from ``cleaned.txt`` (the two files are matched up
by line position), keeps only tags that occur more than ``MIN_TAG_COUNT``
times, fits a one-vs-rest linear SVM on TF-IDF features and dumps the fitted
pipeline to ``clf.txt`` and the ``MultiLabelBinarizer`` to ``multibin.txt``.

Run as a script; importing the module only defines the helpers.
"""
from collections import Counter

MAX_ROWS = 500000      # at most this many lines are read from each input file
MIN_TAG_COUNT = 4000   # a tag is kept only if it occurs more often than this


def read_rows(path, limit=MAX_ROWS):
    """Return the first ``limit`` newline-separated rows of ``path``.

    The file is closed as soon as it has been read. Splitting on ``'\\n'``
    (not ``splitlines``) keeps row positions identical for both input files.
    """
    with open(path, 'r') as handle:
        return handle.read().split('\n')[:limit]


def count_tags(tagrows):
    """Count how often every tag occurs across all tag rows."""
    counts = Counter()
    for row in tagrows:
        counts.update(row.split())
    return counts


def frequent_tags(counts, min_count=MIN_TAG_COUNT):
    """Return tags occurring more than ``min_count`` times, most frequent first."""
    ranked = sorted(counts, key=counts.get, reverse=True)
    return [tag for tag in ranked if counts[tag] > min_count]


def build_labels(tagrows, frequent, n_docs):
    """Return one label list per document, restricted to ``frequent`` tags.

    Row ``i`` of the result holds the frequent tags present in
    ``tagrows[i]``, in the order given by ``frequent``. Documents without a
    tag row get an empty list.

    Raises:
        ValueError: if there are more tag rows than documents, since the two
            inputs could then no longer be matched up by position.
    """
    if len(tagrows) > n_docs:
        raise ValueError(
            "%d tag rows but only %d documents" % (len(tagrows), n_docs))
    labels = [[] for _ in range(n_docs)]
    for row_labels, row in zip(labels, tagrows):
        present = set(row.split())  # split each row once, not once per tag
        row_labels.extend(tag for tag in frequent if tag in present)
    return labels


def main():
    """Load the data, fit the classifier and write both artefacts to disk."""
    # Imported here so the pure helpers above stay importable without
    # scikit-learn installed.
    from sklearn.pipeline import Pipeline
    from sklearn.svm import LinearSVC
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn.preprocessing import MultiLabelBinarizer
    try:
        from sklearn.externals import joblib
    except ImportError:
        # Newer scikit-learn releases no longer bundle joblib.
        import joblib

    tagrows = read_rows('Tags.txt')
    X = read_rows('cleaned.txt')
    classifier = Pipeline([
        ('vectorizer', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=-2))])

    freq = frequent_tags(count_tags(tagrows))
    print("Training...")
    Y = build_labels(tagrows, freq, len(X))

    multibin = MultiLabelBinarizer()
    Y = multibin.fit_transform(Y)
    classifier.fit(X, Y)
    joblib.dump(classifier, 'clf.txt', compress=9)
    joblib.dump(multibin, 'multibin.txt', compress=9)


if __name__ == "__main__":
    main()

",f) 30 | bd='' 31 | if x>=0 and y>=0: 32 | s1=[] 33 | s2=[] 34 | s3=[] 35 | s4=[] 36 | s1=[m.start() for m in re.finditer('', z[x:y])] 37 | s2=[m.start() for m in re.finditer('', z[x:y])] 38 | r=0 39 | lh=0 40 | while r",f) 46 | s3=[m.start() for m in re.finditer('
', z[x:y])] 48 | lh=0 49 | while r=~,;`{}|]',' ',t.lower()+' '+bd) 58 | words=re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?]))''',' ',words) 59 | words=re.sub('\n',' ',words) 60 | words=re.sub('_','-',words) 61 | words=words.replace('[',' ') 62 | words=words.replace(']',' ') 63 | words=words.replace('/',' ') 64 | words=words.replace('\\',' ') 65 | words = re.sub(r'(\s)\-+(\s)',r'\1', words) 66 | words = re.sub(r'\.+(\s)',r'\1', words) 67 | words = re.sub(r'\.+\.(\w)',r'\1', words) 68 | words = re.sub(r'(\s)\.+(\s)',r'\1', words) 69 | words = re.sub("'",'', words) 70 | words = re.sub(r'\s\d+[\.\-\+]+\d+|\s[\.\-\+]+\d+|\s+\d+\s+|\s\d+[\+\-]+',' ',words) 71 | words= re.sub("^\d+\s|\s\d+\s|\s\d+$"," ", words) 72 | words= re.sub(r'\s\#+\s|\s\++\s',' ',words) 73 | stemmer = SnowballStemmer("english", ignore_stopwords=True) 74 | stemmed_words = [stemmer.stem(word) for word in words.split()] 75 | clean_text=filter(lambda w: not w in s,stemmed_words) 76 | words='' 77 | for word in clean_text: 78 | words+=word+' ' 79 | fh.write(words+'\n') 80 | for tag in tags: 81 | ft.write(tag+' ') 82 | ft.write('\n') 83 | fi.write(str(_id)) 84 | print "Post Cleaned-",_id 85 | i=i+1 86 | except Exception,e: 87 | print "EXCEPTION: ",str(e) 88 | pass 89 | 90 | fh.close() 91 | fi.close() 92 | ft.close() 93 | 94 | print 95 | print 96 | print 97 | print "CLEANING COMPLETED" 98 | print i," Posts Cleaned." 99 | print 100 | --------------------------------------------------------------------------------