# Extractn.py
#
# Copy the first ROW_LIMIT rows of Train.csv into Train2.csv, printing the
# zero-based index of every row as it is copied.
import csv

ROW_LIMIT = 1000000  # 10 lakh rows

# Both files are opened in one `with` so they are flushed and closed even if
# copying fails part-way; previously neither handle was ever closed.
# NOTE(review): 'rb'/'wb' follow the Python 2 csv convention used by this
# repository; under Python 3 the csv module wants text mode with newline=''.
with open("Train.csv", 'rb') as fi, open("Train2.csv", 'wb') as fo:
    reader = csv.reader(fi)
    writer = csv.writer(fo)
    for i, row in enumerate(reader):
        if i >= ROW_LIMIT:
            break
        writer.writerow(row)
        print(i)

print("Extracted 10 Lakh Docs")
125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 500 137 | 150 138 | 387 139 | 218 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 75 148 | true 149 | 150 | 151 | 152 | Get Recommendations 153 | 154 | 155 | 156 | 157 | 158 | 159 | true 160 | 161 | 162 | 163 | 14 164 | 75 165 | true 166 | 167 | 168 | 169 | RECOMMENDATIONS 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 510 218 | 10 219 | 381 220 | 131 221 | 222 | 223 | 224 | 225 | 226 | 227 | stackoverflow-logo.png 228 | 229 | 230 | true 231 | 232 | 233 | 234 | 235 | 236 | 500 237 | 480 238 | 421 239 | 30 240 | 241 | 242 | 243 | 244 | 245 | 246 | Get Accuracy 247 | 248 | 249 | 250 | 251 | 252 | 253 | Get F1 Score 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | Clear 263 | 264 | 265 | 266 | 267 | 268 | 269 | Exit 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 640 281 | 460 282 | 66 283 | 17 284 | 285 | 286 | 287 | 288 | 75 289 | true 290 | true 291 | 292 | 293 | 294 | F1 Score 295 | 296 | 297 | 298 | 299 | 300 | 520 301 | 460 302 | 66 303 | 17 304 | 305 | 306 | 307 | 308 | 75 309 | true 310 | true 311 | 312 | 313 | 314 | Accuracy 315 | 316 | 317 | 318 | 319 | 320 | 321 | 0 322 | 0 323 | 920 324 | 25 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | pushButton_9 334 | clicked() 335 | plainTextEdit 336 | clear() 337 | 338 | 339 | 813 340 | 507 341 | 342 | 343 | 441 344 | 276 345 | 346 | 347 | 348 | 349 | pushButton_9 350 | clicked() 351 | lineEdit_2 352 | clear() 353 | 354 | 355 | 813 356 | 507 357 | 358 | 359 | 493 360 | 498 361 | 362 | 363 | 364 | 365 | pushButton_9 366 | clicked() 367 | lineEdit 368 | clear() 369 | 370 | 371 | 813 372 | 507 373 | 374 | 375 | 438 376 | 45 377 | 378 | 379 | 380 | 381 | pushButton_9 382 | clicked() 
383 | label_9 384 | hide() 385 | 386 | 387 | 813 388 | 507 389 | 390 | 391 | 562 392 | 208 393 | 394 | 395 | 396 | 397 | pushButton_9 398 | clicked() 399 | r1 400 | hide() 401 | 402 | 403 | 813 404 | 507 405 | 406 | 407 | 657 408 | 237 409 | 410 | 411 | 412 | 413 | pushButton_9 414 | clicked() 415 | r2 416 | hide() 417 | 418 | 419 | 813 420 | 507 421 | 422 | 423 | 699 424 | 269 425 | 426 | 427 | 428 | 429 | pushButton_9 430 | clicked() 431 | r3 432 | hide() 433 | 434 | 435 | 813 436 | 507 437 | 438 | 439 | 741 440 | 301 441 | 442 | 443 | 444 | 445 | pushButton_9 446 | clicked() 447 | r4 448 | hide() 449 | 450 | 451 | 813 452 | 507 453 | 454 | 455 | 768 456 | 333 457 | 458 | 459 | 460 | 461 | pushButton_9 462 | clicked() 463 | r5 464 | hide() 465 | 466 | 467 | 813 468 | 507 469 | 470 | 471 | 714 472 | 390 473 | 474 | 475 | 476 | 477 | pushButton_8 478 | clicked() 479 | MainWindow 480 | close() 481 | 482 | 483 | 918 484 | 507 485 | 486 | 487 | 811 488 | 594 489 | 490 | 491 | 492 | 493 | pushButton_9 494 | clicked() 495 | acl 496 | hide() 497 | 498 | 499 | 824 500 | 522 501 | 502 | 503 | 579 504 | 487 505 | 506 | 507 | 508 | 509 | pushButton_9 510 | clicked() 511 | lf1 512 | hide() 513 | 514 | 515 | 797 516 | 528 517 | 518 | 519 | 661 520 | 486 521 | 522 | 523 | 524 | 525 | pushButton_9 526 | clicked() 527 | otags 528 | clear() 529 | 530 | 531 | 752 532 | 529 533 | 534 | 535 | 264 536 | 528 537 | 538 | 539 | 540 | 541 | pushButton_9 542 | clicked() 543 | lotags 544 | hide() 545 | 546 | 547 | 778 548 | 521 549 | 550 | 551 | 54 552 | 523 553 | 554 | 555 | 556 | 557 | pushButton_9 558 | clicked() 559 | otags 560 | hide() 561 | 562 | 563 | 754 564 | 516 565 | 566 | 567 | 395 568 | 512 569 | 570 | 571 | 572 | 573 | pushButton_9 574 | clicked() 575 | otags 576 | clear() 577 | 578 | 579 | 773 580 | 520 581 | 582 | 583 | 384 584 | 512 585 | 586 | 587 | 588 | 589 | ac 590 | clicked() 591 | acl 592 | show() 593 | 594 | 595 | 528 596 | 518 597 | 598 | 599 | 538 600 | 495 601 | 
# MinePredict.py
#
# Train a one-vs-rest linear SVM tag predictor and print its predictions.
#
# Inputs (all read from the working directory):
#   Tags.txt     - one whitespace-separated tag list per line (training labels)
#   cleaned.txt  - one document per line, row-aligned with Tags.txt
#   Tcleaned.txt - documents to predict tags for (presumably the test split)
#   TTags.txt    - tags printed next to each prediction (presumably the
#                  recorded tags for the matching Tcleaned.txt row)
import time, re, json, numpy as np
from sklearn.svm import LinearSVC
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from nltk.stem.snowball import SnowballStemmer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

TRAIN_LIMIT = 500000   # only the first TRAIN_LIMIT rows are used for training
MIN_TAG_COUNT = 800    # a tag needs more occurrences than this to be learned

# Not used below; kept so the module-level names remain available.
s = set(stopwords.words('english'))
stemmer = SnowballStemmer("english", ignore_stopwords=True)

# All four files are opened up front (so a missing file fails before the long
# training step) and closed automatically when the block exits.
with open('Tags.txt', 'r') as fh, open('cleaned.txt', 'r') as fh2, \
        open('TTags.txt', 'r') as fh3, open('Tcleaned.txt', 'r') as fh4:
    tagrows = fh.read().split('\n')[:TRAIN_LIMIT]
    X = fh2.read().split('\n')[:TRAIN_LIMIT]

    # Count how often every tag occurs in the training rows.
    tags = {}
    for line in tagrows:
        for tag in line.split():
            tags[tag] = tags.get(tag, 0) + 1
    #34945 unique tags in 10 lakh posts

    # Frequent tags, most common first.
    freq = [tag for tag in sorted(tags, key=lambda tag: tags[tag], reverse=True)
            if tags[tag] > MIN_TAG_COUNT]
    count = len(freq)

    print("Training...")
    # Y[i] holds the frequent tags of row i, ordered by descending frequency.
    # Each row is split once instead of once per frequent tag.
    Y = [[] for _ in X]
    for i, row in enumerate(tagrows):
        row_tags = set(row.split())
        labels = [tag for tag in freq if tag in row_tags]
        if labels:
            Y[i] = labels

    # NOTE(review): class_weight='auto' and list-of-lists multilabel targets
    # are only accepted by old scikit-learn releases; newer ones expect
    # class_weight='balanced' and a binarized Y - confirm the pinned version.
    classifier = Pipeline([
        ('vectorizer', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', OneVsRestClassifier(LinearSVC(class_weight='auto'), n_jobs=-2))])
    classifier.fit(X, Y)
    print("Ready...")

    # Predict one document per line of Tcleaned.txt. The old loop read from
    # the already-exhausted Tags.txt handle, so it fed empty strings to the
    # classifier forever and never reached the code after it.
    for line in fh4:
        words = line.lower().replace(' \n', '')
        print("\n%s %s\n" % (classifier.predict([words]), fh3.readline()))

    print("Exiting...")

",f) 31 | if x>=0 and y>=0: 32 | s1=[m.start() for m in re.finditer('

', z[x+3:y])]
33 | 				s2=[m.start() for m in re.finditer('

",f) 26 | bd='' 27 | if x>=0 and y>=0: 28 | s1=[] 29 | s2=[] 30 | s3=[] 31 | s4=[] 32 | s1=[m.start() for m in re.finditer('

', z[x:y])]
33 | 				s2=[m.start() for m in re.finditer('

",f) 30 | bd='' 31 | if x>=0 and y>=0: 32 | s1=[] 33 | s2=[] 34 | s3=[] 35 | s4=[] 36 | s1=[m.start() for m in re.finditer('

', z[x:y])]
 37 | 				s2=[m.start() for m in re.finditer('

', z[x:y])] 38 | r=0 39 | lh=0 40 | while r",f) 46 | s3=[m.start() for m in re.finditer('', z[x:y])] 48 | lh=0 49 | while r=~,;`{}|]',' ',t.lower()+' '+bd) 58 | words=re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|$([^\s()<>]+|(\([^\s()<>]+$))*\))+(?:$([^\s()<>]+|(\([^\s()<>]+$))*\)|[^\s`!()\[\]{};:'".,<>?]))''',' ',words) 59 | words=re.sub('\n',' ',words) 60 | words=re.sub('_','-',words) 61 | words=words.replace('[',' ') 62 | words=words.replace(']',' ') 63 | words=words.replace('/',' ') 64 | words=words.replace('\\',' ') 65 | words = re.sub(r'(\s)\-+(\s)',r'\1', words) 66 | words = re.sub(r'\.+(\s)',r'\1', words) 67 | words = re.sub(r'\.+\.(\w)',r'\1', words) 68 | words = re.sub(r'(\s)\.+(\s)',r'\1', words) 69 | words = re.sub("'",'', words) 70 | words = re.sub(r'\s\d+[\.\-\+]+\d+|\s[\.\-\+]+\d+|\s+\d+\s+|\s\d+[\+\-]+',' ',words) 71 | words= re.sub("^\d+\s|\s\d+\s|\s\d+$"," ", words) 72 | words= re.sub(r'\s\#+\s|\s\++\s',' ',words) 73 | stemmer = SnowballStemmer("english", ignore_stopwords=True) 74 | stemmed_words = [stemmer.stem(word) for word in words.split()] 75 | clean_text=filter(lambda w: not w in s,stemmed_words) 76 | words='' 77 | for word in clean_text: 78 | words+=word+' ' 79 | fh.write(words+'\n') 80 | for tag in tags: 81 | ft.write(tag+' ') 82 | ft.write('\n') 83 | fi.write(str(_id)) 84 | print "Post Cleaned-",_id 85 | i=i+1 86 | except Exception,e: 87 | print "EXCEPTION: ",str(e) 88 | pass 89 | 90 | fh.close() 91 | fi.close() 92 | ft.close() 93 | 94 | print 95 | print 96 | print 97 | print "CLEANING COMPLETED" 98 | print i," Posts Cleaned." 99 | print 100 | --------------------------------------------------------------------------------