├── .gitignore
├── README.md
├── app
│   ├── __init__.py
│   ├── classifier.py
│   ├── evaluate.py
│   ├── generator.py
│   ├── parser.py
│   ├── plot_tags.py
│   └── stat.py
├── config
│   └── config.cfg.sample
├── database
│   ├── __init__.py
│   └── mongo.py
└── install.sh
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | 
5 | # C extensions
6 | *.so
7 | 
8 | # Distribution / packaging
9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | lib/
17 | lib64/
18 | parts/
19 | sdist/
20 | var/
21 | *.egg-info/
22 | .installed.cfg
23 | *.egg
24 | 
25 | # PyInstaller
26 | # Usually these files are written by a python script from a template
27 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
28 | *.manifest
29 | *.spec
30 | 
31 | # Installer logs
32 | pip-log.txt
33 | pip-delete-this-directory.txt
34 | 
35 | # Unit test / coverage reports
36 | htmlcov/
37 | .tox/
38 | .coverage
39 | .cache
40 | nosetests.xml
41 | coverage.xml
42 | 
43 | # Translations
44 | *.mo
45 | *.pot
46 | 
47 | # Django stuff:
48 | *.log
49 | 
50 | # Sphinx documentation
51 | docs/_build/
52 | 
53 | # PyBuilder
54 | target/
55 | 
56 | # config
57 | config/config.cfg
58 | 
59 | # dataset
60 | data/*
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # StackExchange-tagger
2 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.972132.svg)](https://doi.org/10.5281/zenodo.972132)
3 | 
4 | The goal of our project is to develop an accurate tagger for questions posted on Stack Exchange. Our problem is an instance of the more general problem of developing accurate classifiers for large-scale text datasets. We tackle the multilabel classification problem, where each item (in this case, a question) can belong to multiple classes (in this case, tags). We predict the tags (or keywords) for a Stack Exchange post given only the question text and the title of the post. In the process, we compare the performance of Support Vector Classification (SVC) across different kernel functions, loss functions, and multi-class techniques.
5 | 
6 | We found that a linear SVC trained with the Crammer-Singer technique produces the best results.
7 | 
8 | # Some Results
9 | 
10 | Testing error for SVC with different kernel functions (number of iterations = 10,000):
11 | 
12 | | Kernel | C = 1000 (hard margin) | C = 0.001 (soft margin) |
13 | |------------------|------------------------|-------------------------|
14 | | RBF | 43.1 % | 48.5 % |
15 | | Linear | 51.9 % | 45.2 % |
16 | | Polynomial (n=2) | 54.4 % | 65 % |
17 | | Polynomial (n=3) | 72.2 % | 84.4 % |
18 | | Sigmoid | 84.4 % | 84.4 % |
19 | 
20 | 
21 | Testing error for linear SVC with different multi-class techniques (C = 0.001, soft margin; number of iterations = 10,000):
22 | 
23 | | Technique | Hinge Loss Function | Squared Hinge Loss Function |
24 | |----------------|---------------------|-----------------------------|
25 | | One-vs-rest | 47.59 % | 68 % |
26 | | Crammer-Singer | 45.25 % | 45.25 % |
27 | 
28 | 
29 | # Report
30 | 
31 | Our detailed report and results are available [here](https://sites.google.com/site/sanketmehtaiitr/home/stack-exchange-tagger).
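# Reproducing the classifier setup

The configuration behind the best numbers above is scikit-learn's `LinearSVC` (soft margin, C = 0.001, squared hinge loss, Crammer-Singer multi-class strategy) wrapped in a one-vs-rest scheme, exactly as set up in `app/classifier.py`. The snippet below is only a minimal sketch on toy data; the real pipeline (TF-IDF features, SVD dimensionality reduction, MongoDB-backed corpus) lives in `app/stat.py` and `app/classifier.py`.

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC

# Toy stand-ins for the real corpus built by app/stat.py.
posts = ["how do i merge two branches in git",
         "segmentation fault when freeing a pointer in c",
         "git rebase vs git merge workflow"]
tags = [["git"], ["c"], ["git"]]

X = TfidfVectorizer().fit_transform(posts)
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(tags)

# Best-performing setup from the tables above.
clf = OneVsRestClassifier(
    LinearSVC(C=0.001, loss="squared_hinge",
              multi_class="crammer_singer", max_iter=10000))
clf.fit(X, Y)
print(mlb.inverse_transform(clf.predict(X)))
```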
32 | 33 | 34 | #Team 35 | 36 | * [Sanket Mehta](https://twitter.com/sanketvmehta) 37 | * [Shagun Sodhani](https://twitter.com/shagunsodhani) 38 | 39 | This work has been done as a part of a course project for Artificial Neural Network (IEE-03) at IIT Roorkee. 40 | -------------------------------------------------------------------------------- /app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shagunsodhani/StackExchange-tagger/8fac6a40f3de416776f236dfa4cf8e3dbd64cf5b/app/__init__.py -------------------------------------------------------------------------------- /app/classifier.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import json 4 | import string 5 | import operator 6 | import pickle 7 | from time import time 8 | 9 | import numpy as np 10 | from sklearn import svm 11 | from sklearn.multiclass import OneVsRestClassifier 12 | from sklearn.multiclass import OneVsOneClassifier 13 | from sklearn.preprocessing import MultiLabelBinarizer 14 | 15 | path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) 16 | 17 | if not path in sys.path: 18 | sys.path.insert(1, path) 19 | del path 20 | 21 | try: 22 | import database.mongo as mongo 23 | except ImportError as exc: 24 | print("Error: failed to import settings module ({})".format(exc)) 25 | 26 | try: 27 | from app import stat 28 | except ImportError as exc: 29 | print("Error: failed to import settings module ({})".format(exc)) 30 | 31 | try: 32 | from app import evaluate 33 | except ImportError as exc: 34 | print("Error: failed to import settings module ({})".format(exc)) 35 | 36 | def input_representation(result): 37 | tag_count = {} 38 | for i in result: 39 | for j in i: 40 | if j not in tag_count: 41 | tag_count[j] = 1 42 | else: 43 | tag_count[j]+=1 44 | for j in tag_count: 45 | print str(j)+" : "+str(tag_count[j]) 46 | 47 | def predict(input_size = 100000, select_transform = 1, read_database = 1, one_vs_one = 0, model = "LinearSVC", mode = "multilable", repeat = 0, k = 0.8, max_number_of_tags = 5, max_iter = 100000, use_cache = 0): 48 | 49 | to_print = 0 50 | raw_train_data, raw_train_results = stat.get_trainingdata(input_size, select_transform = select_transform, read_database = read_database, to_print = to_print, mode = mode, repeat = repeat, max_number_of_tags = max_number_of_tags) 51 | t0 = time() 52 | # k = 0.8 53 | 54 | # # print raw_train_data 55 | # print raw_train_data 56 | # print raw_train_results 57 | 58 | split_point = int(k*input_size) 59 | # print split_point 60 | train_data = raw_train_data[0:split_point,:] 61 | train_results = raw_train_results[0:split_point] 62 | # print train_results 63 | # print train_data 64 | # print train_results 65 | 66 | test_data = raw_train_data[split_point:,:] 67 | test_results = raw_train_results[split_point:] 68 | # print test_results 69 | 70 | fname_U = "SVD_U.txt" 71 | fname_V = "SVD_V.txt" 72 | fname_S = "SVD_S.txt" 73 | 74 | if use_cache==1: 75 | with open(fname_U, 'rb') as f: 76 | U = pickle.load(f) 77 | with open(fname_V, 'rb') as f: 78 | V = pickle.load(f) 79 | with open(fname_S, 'rb') as f: 80 | s = pickle.load(f) 81 | print "Using SVD from file" 82 | else: 83 | U, s, V = np.linalg.svd(train_data, full_matrices=True) 84 | with open(fname_U, 'wb') as f: 85 | pickle.dump(U, f) 86 | with open(fname_V, 'wb') as f: 87 | pickle.dump(V, f) 88 | with open(fname_S, 'wb') as f: 89 | pickle.dump(s, f) 90 | print "Using SVD by 
calculation" 91 | 92 | print("SVD decomposition done in %fs" % (time() - t0)) 93 | square_sum_s = np.square(s).sum() 94 | #not sure if this is the most optimal way for finding the sum of squares 95 | 96 | temp_sum = 0 97 | count = 0 98 | for i in s: 99 | temp_sum+= i*i 100 | count+=1 101 | if(temp_sum >= 0.9*square_sum_s): 102 | break; 103 | 104 | print "count = "+str(count) 105 | x = np.delete(V, np.s_[count::1], 0) 106 | processedV = np.transpose(x) 107 | train_X = np.dot(train_data, processedV) 108 | test_X = np.dot(test_data, processedV) 109 | 110 | # X = X_raw[0:k*input_size + 1, :] 111 | # test_X = X_raw[k*input_size+1:,:] 112 | 113 | 114 | # print "count = "+str(count) 115 | # print "V.shape = "+str(V.shape) 116 | # print "s.shape = "+str(s.shape) 117 | # x = np.delete(V, np.s_[count::1], 0) 118 | # print "x.shape = "+str(x.shape) 119 | # print "raw_train_data.shape = "+str(raw_train_data) 120 | # print "processedV.shape = "+str(processedV.shape) 121 | 122 | #can use splicing instead of delete 123 | 124 | # print "X.shape = "+str(X.shape) 125 | 126 | # train_results = stat.get_trainmatrix(input_size, read_database = read_database, to_print = to_print) 127 | 128 | mlb = MultiLabelBinarizer() 129 | trainingdata_results = mlb.fit_transform(raw_train_results) 130 | # print train_results 131 | train_Y = trainingdata_results[0:split_point,:] 132 | test_Y = trainingdata_results[split_point+1:,:] 133 | 134 | # print train_Y 135 | # test_Y = mlb.fit_transform(test_results) 136 | # print test_results 137 | 138 | 139 | # print Y.shape 140 | # test_X = X[0:k*input_size,:] 141 | # print train_X 142 | # print train_Y 143 | # print train_results 144 | 145 | if(one_vs_one == 1): 146 | clf = OneVsOneClassifier(svm.LinearSVC(random_state=0, max_iter =10000, verbose = 0)) 147 | prediction_Y = clf.fit(X, Y).predict(X) 148 | else: 149 | if model == "LinearSVC": 150 | print "Showing Results for one vs rest multilabel classifier using LinearSVC model" 151 | clf = OneVsRestClassifier(svm.LinearSVC(random_state=0, dual = True, max_iter = max_iter, verbose = 0, C = 0.001, loss = "squared_hinge", multi_class="crammer_singer")) 152 | 153 | elif model == "SVC": 154 | print "Showing Results for one vs rest multilabel classifier using SVC model" 155 | clf = OneVsRestClassifier(svm.SVC(C = 0.001, kernel = 'poly', max_iter = max_iter, verbose = 0, degree = 3)) 156 | clf.fit(train_X, train_Y) 157 | print clf.get_params 158 | scores = clf.decision_function(test_X) 159 | scores_train = clf.decision_function(train_X) 160 | 161 | indices = scores.argmax(axis = 1) 162 | indices_train = scores_train.argmax(axis = 1) 163 | 164 | prediction_Y = np.zeros(scores.shape) 165 | prediction_train = np.zeros(scores_train.shape) 166 | 167 | # print prediction_Y.shape 168 | for i in range(0, len(indices)): 169 | prediction_Y[i][indices[i]] = 1 170 | 171 | for i in range(0, len(indices_train)): 172 | prediction_train[i][indices_train[i]] = 1 173 | 174 | 175 | 176 | #class sklearn.svm.LinearSVC(penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None, max_iter=1000 177 | #class sklearn.svm.SVC(C=1.0, kernel='rbf', degree=3, gamma=0.0, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, random_state=None) 178 | 179 | prediction = mlb.inverse_transform(prediction_Y) 180 | # print prediction 181 | # for i in prediction: 182 | # print i 183 | # print "\n" 
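# Note: the block above collapses the one-vs-rest decision scores to a
# single predicted tag per post via argmax, so each prediction set passed
# to the evaluate.* metrics below contains exactly one tag, while the
# actual tag lists may contain several.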
184 | # for i in test_results: 185 | # print i 186 | # print clf.decision_function(test_X) 187 | # # # # print Y 188 | # print test_Y 189 | print "Testing Error : " 190 | evaluate.accuracy_atleast_one_match(test_results, prediction) 191 | evaluate.accuracy_null_results(prediction) 192 | evaluate.accuracy_exact_match(test_results, prediction) 193 | evaluate.accuracy_multilabel(test_results, prediction) 194 | evaluate.precision_multilabel(test_results, prediction) 195 | evaluate.recall_multilabel(test_results, prediction) 196 | evaluate.hamming_loss_multilabel(test_results, prediction) 197 | 198 | # print train_results 199 | # print prediction 200 | # print prediction 201 | print "Training Error : " 202 | prediction = mlb.inverse_transform(prediction_train) 203 | # for i in prediction: 204 | # print i 205 | # print "\n" 206 | # for i in test_results: 207 | # print i 208 | # print clf.decision_function(test_X) 209 | # # # # print Y 210 | # print test_Y 211 | # print prediction_Y 212 | evaluate.accuracy_atleast_one_match(train_results, prediction) 213 | evaluate.accuracy_null_results(prediction) 214 | evaluate.accuracy_exact_match(train_results, prediction) 215 | evaluate.accuracy_multilabel(train_results, prediction) 216 | evaluate.precision_multilabel(train_results, prediction) 217 | evaluate.recall_multilabel(train_results, prediction) 218 | evaluate.hamming_loss_multilabel(train_results, prediction) 219 | 220 | # print raw_train_data.shape 221 | 222 | 223 | 224 | 225 | if __name__ == "__main__": 226 | predict(10000, select_transform = 2, read_database = 1, one_vs_one = 0, model = "LinearSVC", mode="multiclass", repeat = 0, k = 0.8, max_number_of_tags = 2, max_iter = 10000, use_cache = 1) 227 | -------------------------------------------------------------------------------- /app/evaluate.py: -------------------------------------------------------------------------------- 1 | def accuracy_atleast_one_match(actual, prediction, verbose = 1): 2 | ''' 3 | actual - list of actual results 4 | prediction - list of predicted results 5 | ''' 6 | 7 | length = len(actual) 8 | count = 0.0 9 | for i in range(0, length): 10 | flag = 0 11 | for j in prediction[i]: 12 | if j in actual[i]: 13 | flag = 1 14 | count+=flag 15 | # print "Result : "+str(result[i]) 16 | # print "Prediction : "+str(prediction[i]) 17 | 18 | if(verbose): 19 | print "Accuracy for matching atleast one = "+str(count/length) 20 | return count/length 21 | 22 | def accuracy_null_results(prediction, verbose = 1): 23 | ''' 24 | actual - list of actual results 25 | prediction - list of predicted results 26 | ''' 27 | length = len(prediction) 28 | count = 0.0 29 | for i in range(0, length): 30 | if not prediction[i]: 31 | count+=1 32 | if(verbose): 33 | print "Percentage of null_results = "+str(count/length) 34 | return count/length 35 | 36 | def accuracy_exact_match(actual, prediction, verbose = 1): 37 | ''' 38 | actual - list of actual results 39 | prediction - list of predicted results 40 | ''' 41 | 42 | length = len(actual) 43 | count = 0.0 44 | for i in range(0, length): 45 | flag = 1 46 | if len(prediction[i]) == len(actual[i]): 47 | for j in prediction[i]: 48 | if j not in actual[i]: 49 | flag = 0 50 | else: 51 | flag = 0 52 | count+=flag 53 | # print "Result : "+str(result[i]) 54 | # print "Prediction : "+str(prediction[i]) 55 | 56 | if(verbose): 57 | print "Accuracy for exact matching = "+str(count/length) 58 | return count/length 59 | 60 | def hamming_loss_multilabel(actual, prediction, verbose = 1): 61 | ''' 62 | actual - list of 
actual results 63 | prediction - list of predicted results 64 | defination taken from http://lpis.csd.auth.gr/publications/tsoumakas-ijdwm.pdf 65 | ''' 66 | 67 | length = len(actual) 68 | hamming_loss = 0.0 69 | for i in range(0, length): 70 | yi = set() 71 | zi = set() 72 | 73 | for j in actual[i]: 74 | yi.add(j) 75 | 76 | for j in prediction[i]: 77 | zi.add(j) 78 | 79 | hamming_loss+=(len(yi.symmetric_difference(zi))+0.0)/len(zi) 80 | 81 | hamming_loss = hamming_loss/length 82 | if (verbose): 83 | print "Hamming Loss = "+str(hamming_loss) 84 | return hamming_loss 85 | 86 | def accuracy_multilabel(actual, prediction, verbose = 1): 87 | ''' 88 | actual - list of actual results 89 | prediction - list of predicted results 90 | defination taken from http://lpis.csd.auth.gr/publications/tsoumakas-ijdwm.pdf 91 | ''' 92 | 93 | length = len(actual) 94 | accuracy = 0.0 95 | for i in range(0, length): 96 | yi = set() 97 | zi = set() 98 | 99 | for j in actual[i]: 100 | yi.add(j) 101 | 102 | for j in prediction[i]: 103 | zi.add(j) 104 | 105 | accuracy+=(len(yi.intersection(zi))+0.0)/len(yi.union(zi)) 106 | 107 | accuracy = accuracy/length 108 | if (verbose): 109 | print "Accuracy (Godbole & Sarawagi) = "+str(accuracy) 110 | return accuracy 111 | 112 | def precision_multilabel(actual, prediction, verbose = 1): 113 | ''' 114 | actual - list of actual results 115 | prediction - list of predicted results 116 | defination taken from http://lpis.csd.auth.gr/publications/tsoumakas-ijdwm.pdf 117 | ''' 118 | 119 | length = len(actual) 120 | precision = 0.0 121 | for i in range(0, length): 122 | yi = set() 123 | zi = set() 124 | 125 | for j in actual[i]: 126 | yi.add(j) 127 | 128 | for j in prediction[i]: 129 | zi.add(j) 130 | 131 | precision+=(len(yi.intersection(zi))+0.0)/len(zi) 132 | # print zi 133 | precision = precision/length 134 | if (verbose): 135 | print "Precision (Godbole & Sarawagi) = "+str(precision) 136 | return precision 137 | 138 | def recall_multilabel(actual, prediction, verbose = 1): 139 | ''' 140 | actual - list of actual results 141 | prediction - list of predicted results 142 | defination taken from http://lpis.csd.auth.gr/publications/tsoumakas-ijdwm.pdf 143 | ''' 144 | 145 | length = len(actual) 146 | recall = 0.0 147 | for i in range(0, length): 148 | yi = set() 149 | zi = set() 150 | 151 | for j in actual[i]: 152 | yi.add(j) 153 | 154 | for j in prediction[i]: 155 | zi.add(j) 156 | 157 | recall+=(len(yi.intersection(zi))+0.0)/len(yi) 158 | 159 | recall = recall/length 160 | if (verbose): 161 | print "Recall (Godbole & Sarawagi) = "+str(recall) 162 | return recall 163 | -------------------------------------------------------------------------------- /app/generator.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import json 4 | import time 5 | import string 6 | 7 | path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) 8 | 9 | if not path in sys.path: 10 | sys.path.insert(1, path) 11 | del path 12 | 13 | try: 14 | import database.mongo as mongo 15 | except ImportError as exc: 16 | print("Error: failed to import settings module ({})".format(exc)) 17 | 18 | try: 19 | import nltk 20 | except ImportError as exc: 21 | print("Error: failed to import settings module ({})".format(exc)) 22 | 23 | try: 24 | from bs4 import BeautifulSoup 25 | except ImportError as exc: 26 | print("Error: failed to import settings module ({})".format(exc)) 27 | 28 | try: 29 | from parser import fetch_top_tags 30 | except 
ImportError as exc: 31 | print("Error: failed to import settings module ({})".format(exc)) 32 | 33 | test_data = "data/processed.csv" 34 | stopword_data = "data/stopword.txt" 35 | replaceword_data = "data/replaceword.txt" 36 | tag_data = "data/tag.txt" 37 | 38 | def generate_data(tag_count, question_count): 39 | #generate data using top 'tag_count' number of tags and 'question_count' number of questions 40 | db = mongo.connect() 41 | tags = fetch_top_tags(tag_count) 42 | print tags 43 | time.sleep(30) 44 | count = 0 45 | with open(test_data) as infile: 46 | for line in infile: 47 | striped_line = line.strip() 48 | if striped_line: 49 | a = striped_line.split(',', 2) 50 | post_id = str(a[0]).replace('\"', '').strip() 51 | title = str(a[1]).replace('\"', '').strip() 52 | a = a[2].rsplit(',', 1) 53 | tag_list = a[1].replace('\"', '').replace('\'', '').split() 54 | # print tag_list 55 | flag = 0 56 | for tag in tag_list: 57 | if tag not in tags: 58 | flag = 1 59 | if(flag==0): 60 | # print "printing" 61 | count+=1; 62 | body = a[0] 63 | code = "" 64 | soup = BeautifulSoup(body) 65 | body = soup.get_text() 66 | for code_snippet in soup.find_all('code'): 67 | temp_code = code_snippet.get_text().strip() 68 | code+= temp_code + "\n" 69 | body = body.replace(temp_code, "") 70 | body = ' '.join(body.split()) 71 | post = {} 72 | post['post_id'] = post_id 73 | post['title'] = title 74 | post['body'] = body 75 | post['tag'] = tag_list 76 | post['code'] = code 77 | mongo_id = db.insert(post) 78 | # print tag_list 79 | if(count%10000 == 0): 80 | print count, " number of questions processed" 81 | if(count > question_count): 82 | break; 83 | 84 | if __name__ == "__main__": 85 | generate_data(10, 40000) 86 | -------------------------------------------------------------------------------- /app/parser.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import json 4 | import time 5 | import string 6 | import operator 7 | 8 | path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) 9 | 10 | if not path in sys.path: 11 | sys.path.insert(1, path) 12 | del path 13 | 14 | try: 15 | import database.mongo as mongo 16 | except ImportError as exc: 17 | print("Error: failed to import settings module ({})".format(exc)) 18 | 19 | try: 20 | import nltk 21 | except ImportError as exc: 22 | print("Error: failed to import settings module ({})".format(exc)) 23 | 24 | try: 25 | from bs4 import BeautifulSoup 26 | except ImportError as exc: 27 | print("Error: failed to import settings module ({})".format(exc)) 28 | 29 | test_data = "data/processed.csv" 30 | stopword_data = "data/stopword.txt" 31 | replaceword_data = "data/replaceword.txt" 32 | 33 | def preprocess_dataset(): 34 | #preprocess the raw dataset we got online 35 | count = 0 36 | with open(test_data) as infile: 37 | for line in infile: 38 | if(line[-3:]=="\"\r\n"): 39 | #End of one post 40 | print line.strip() 41 | else: 42 | print line.strip(), 43 | count+=1 44 | # print count 45 | 46 | def remove_stopwords(): 47 | #remove all the stopwords 48 | porter_stemmer = nltk.stem.porter.PorterStemmer() 49 | wordnet_lemmatizer = nltk.stem.WordNetLemmatizer() 50 | nltk_stopwords = nltk.corpus.stopwords.words('english') 51 | 52 | stopwords = {} 53 | replace_words = {} 54 | stopword_count = 0 55 | takenword_count = 0 56 | 57 | with open(stopword_data) as infile: 58 | for line in infile: 59 | i = line.strip().split() 60 | for token in i: 61 | a = wordnet_lemmatizer.lemmatize(porter_stemmer.stem(token)) 
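# Stopwords are normalized with the same Porter stem + WordNet
# lemmatization applied to the document tokens further down, so
# membership checks against the stopword dict compare like with like.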
62 | if a not in stopwords: 63 | stopwords[a] = 1 64 | 65 | for token in nltk_stopwords: 66 | a = wordnet_lemmatizer.lemmatize(porter_stemmer.stem(token)) 67 | if a not in stopwords: 68 | stopwords[a] = 1 69 | 70 | for a in string.punctuation: 71 | if a not in replace_words: 72 | replace_words[a] = 1 73 | 74 | with open(replaceword_data) as infile: 75 | for line in infile: 76 | a = line.strip() 77 | if a not in replace_words: 78 | replace_words[a] = 1 79 | 80 | with open(test_data) as infile: 81 | for line in infile: 82 | striped_line = line.strip() 83 | if striped_line : 84 | a = striped_line.split(',',2) 85 | post_id = str(a[0]) 86 | title = str(a[1]) 87 | a = a[2].rsplit(',',1) 88 | tag_list_string = a[1] 89 | body = a[0] 90 | #print body 91 | soup = BeautifulSoup(body) 92 | body = soup.get_text() 93 | for i in replace_words: 94 | body = body.replace(i, '') 95 | body = ' '.join(body.split()) 96 | list_token = nltk.word_tokenize(body) 97 | for token in list_token: 98 | processed_token = wordnet_lemmatizer.lemmatize(porter_stemmer.stem(token.strip().lower())) 99 | if(processed_token in stopwords): 100 | stopword_count+=1 101 | else: 102 | takenword_count+=1 103 | try: 104 | print processed_token 105 | except UnicodeEncodeError as e: 106 | print "Unicode Encode Error ", e 107 | print "\n" 108 | print "stopword_count : ", stopword_count 109 | print "takenword_count : ", takenword_count 110 | 111 | def fetch_top_tags(k = 100): 112 | #script to fetch top k most popular tags from raw data 113 | tags = {} 114 | with open(test_data) as infile: 115 | for line in infile: 116 | striped_line = line.strip().replace('"','') 117 | if striped_line : 118 | a = striped_line.split(',',2) 119 | post_id = str(a[0]) 120 | title = str(a[1]) 121 | a = a[2].rsplit(',',1) 122 | tag_list = a[1].split(' ') 123 | for tag in tag_list: 124 | if tag not in tags: 125 | tags[tag]=1 126 | else: 127 | tags[tag]+=1 128 | sorted_tags = sorted(tags.items(), key=operator.itemgetter(1), reverse = True) 129 | tag_dict = {} 130 | with open("data/tag.txt", "w") as f: 131 | for i in range(0, k): 132 | f.write(sorted_tags[i][0]) 133 | f.write("\n") 134 | tag_dict[sorted_tags[i][0]]=0 135 | 136 | return tag_dict 137 | 138 | #preprocess_dataset() 139 | #remove_stopwords() 140 | # fetch_top_tags() 141 | -------------------------------------------------------------------------------- /app/plot_tags.py: -------------------------------------------------------------------------------- 1 | print(__doc__) 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_multilabel_classification 7 | from sklearn.multiclass import OneVsRestClassifier 8 | from sklearn.svm import SVC 9 | from sklearn.preprocessing import LabelBinarizer 10 | from sklearn.decomposition import PCA 11 | from sklearn.cross_decomposition import CCA 12 | 13 | 14 | def plot_hyperplane(clf, min_x, max_x, linestyle, label): 15 | # get the separating hyperplane 16 | w = clf.coef_[0] 17 | a = -w[0] / w[1] 18 | xx = np.linspace(min_x - 5, max_x + 5) # make sure the line is long enough 19 | yy = a * xx - (clf.intercept_[0]) / w[1] 20 | plt.plot(xx, yy, linestyle, label=label) 21 | 22 | 23 | def plot_subfigure(X, Y, subplot, title, transform): 24 | if transform == "pca": 25 | X = PCA(n_components=2).fit_transform(X) 26 | elif transform == "cca": 27 | X = CCA(n_components=2).fit(X, Y).transform(X) 28 | else: 29 | raise ValueError 30 | 31 | min_x = np.min(X[:, 0]) 32 | max_x = np.max(X[:, 0]) 33 | 34 | min_y = np.min(X[:, 1]) 35 | max_y = 
np.max(X[:, 1]) 36 | 37 | classif = OneVsRestClassifier(SVC(kernel='linear')) 38 | classif.fit(X, Y) 39 | 40 | plt.subplot(2, 2, subplot) 41 | plt.title(title) 42 | 43 | zero_class = np.where(Y[:, 0]) 44 | one_class = np.where(Y[:, 1]) 45 | plt.scatter(X[:, 0], X[:, 1], s=40, c='gray') 46 | plt.scatter(X[zero_class, 0], X[zero_class, 1], s=160, edgecolors='b', 47 | facecolors='none', linewidths=2, label='Class 1') 48 | plt.scatter(X[one_class, 0], X[one_class, 1], s=80, edgecolors='orange', 49 | facecolors='none', linewidths=2, label='Class 2') 50 | 51 | plot_hyperplane(classif.estimators_[0], min_x, max_x, 'k--', 52 | 'Boundary\nfor class 1') 53 | plot_hyperplane(classif.estimators_[1], min_x, max_x, 'k-.', 54 | 'Boundary\nfor class 2') 55 | plt.xticks(()) 56 | plt.yticks(()) 57 | 58 | plt.xlim(min_x - .5 * max_x, max_x + .5 * max_x) 59 | plt.ylim(min_y - .5 * max_y, max_y + .5 * max_y) 60 | if subplot == 2: 61 | plt.xlabel('First principal component') 62 | plt.ylabel('Second principal component') 63 | plt.legend(loc="upper left") 64 | 65 | 66 | plt.figure(figsize=(8, 6)) 67 | 68 | X, Y = make_multilabel_classification(n_classes=2, n_labels=1, 69 | allow_unlabeled=True, 70 | return_indicator=True, 71 | random_state=1) 72 | 73 | plot_subfigure(X, Y, 1, "With unlabeled samples + CCA", "cca") 74 | plot_subfigure(X, Y, 2, "With unlabeled samples + PCA", "pca") 75 | 76 | X, Y = make_multilabel_classification(n_classes=2, n_labels=1, 77 | allow_unlabeled=False, 78 | return_indicator=True, 79 | random_state=1) 80 | 81 | plot_subfigure(X, Y, 3, "Without unlabeled samples + CCA", "cca") 82 | plot_subfigure(X, Y, 4, "Without unlabeled samples + PCA", "pca") 83 | 84 | plt.subplots_adjust(.04, .02, .97, .94, .09, .2) 85 | plt.show() 86 | -------------------------------------------------------------------------------- /app/stat.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import json 4 | import time 5 | import string 6 | import operator 7 | 8 | path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) 9 | 10 | if not path in sys.path: 11 | sys.path.insert(1, path) 12 | del path 13 | 14 | try: 15 | import database.mongo as mongo 16 | except ImportError as exc: 17 | print("Error: failed to import settings module ({})".format(exc)) 18 | 19 | try: 20 | import nltk 21 | except ImportError as exc: 22 | print("Error: failed to import settings module ({})".format(exc)) 23 | 24 | try: 25 | from bs4 import BeautifulSoup 26 | except ImportError as exc: 27 | print("Error: failed to import settings module ({})".format(exc)) 28 | 29 | from sklearn.feature_extraction.text import CountVectorizer 30 | from sklearn.feature_extraction.text import TfidfVectorizer 31 | from time import time 32 | import numpy as np 33 | import pickle 34 | 35 | stopword_data = "data/stopword.txt" 36 | replaceword_data = "data/replaceword.txt" 37 | test_data = "data/processed.csv" 38 | takeword_data = "data/take_word.txt" 39 | 40 | def get_codewords(): 41 | #this function is meant prints all the code segments 42 | db = mongo.connect() 43 | code_word = {} 44 | for post in db.find(): 45 | code_temp = post['code'].split() 46 | for i in code_temp: 47 | if i not in code_word: 48 | try: 49 | print i 50 | except UnicodeEncodeError as e: 51 | pass 52 | code_word[i] = 1 53 | print "\n" 54 | 55 | def get_bodywords(): 56 | #this function is meant to print the unique words with their frequency so that some potential stopwords can be removed 57 | porter_stemmer = 
nltk.stem.porter.PorterStemmer() 58 | wordnet_lemmatizer = nltk.stem.WordNetLemmatizer() 59 | nltk_stopwords = nltk.corpus.stopwords.words('english') 60 | stopwords = {} 61 | replace_words = {} 62 | stopword_count = 0 63 | takenword_count = 0 64 | 65 | with open(stopword_data) as infile: 66 | for line in infile: 67 | i = line.strip().split() 68 | for token in i: 69 | a = wordnet_lemmatizer.lemmatize(porter_stemmer.stem(token)) 70 | if a not in stopwords: 71 | stopwords[a] = 1 72 | 73 | for token in nltk_stopwords: 74 | a = wordnet_lemmatizer.lemmatize(porter_stemmer.stem(token)) 75 | if a not in stopwords: 76 | stopwords[a] = 1 77 | 78 | for a in string.punctuation: 79 | if a not in replace_words: 80 | replace_words[a] = 1 81 | 82 | with open(replaceword_data) as infile: 83 | for line in infile: 84 | a = line.strip() 85 | if a not in replace_words: 86 | replace_words[a] = 1 87 | 88 | db= mongo.connect() 89 | word = {} 90 | 91 | for post in db.find(): 92 | body = post['body'].strip() 93 | for i in replace_words: 94 | body = body.replace(i, '') 95 | list_token = nltk.word_tokenize(body) 96 | for token in list_token: 97 | # print token 98 | processed_token = wordnet_lemmatizer.lemmatize(porter_stemmer.stem(token.strip().lower())) 99 | if processed_token not in stopwords: 100 | if processed_token not in word: 101 | word[processed_token]=1 102 | # print processed_token 103 | else: 104 | word[processed_token]=1 105 | sorted_word = sorted(word.items(), key=operator.itemgetter(1), reverse = True) 106 | #print sorted_word 107 | for i in sorted_word: 108 | try: 109 | print i[0], " : ",i[1] 110 | except UnicodeEncodeError as e: 111 | print "Unicode Error : ", i[1] 112 | 113 | def get_idf(): 114 | #this function is meant to print the unique words with their frequency so that some potential stopwords can be removed 115 | 116 | porter_stemmer = nltk.stem.porter.PorterStemmer() 117 | wordnet_lemmatizer = nltk.stem.WordNetLemmatizer() 118 | nltk_stopwords = nltk.corpus.stopwords.words('english') 119 | stopwords = {} 120 | replace_words = {} 121 | stopword_count = 0 122 | takenword_count = 0 123 | 124 | with open(stopword_data) as infile: 125 | for line in infile: 126 | i = line.strip().split() 127 | for token in i: 128 | a = wordnet_lemmatizer.lemmatize(porter_stemmer.stem(token)) 129 | if a not in stopwords: 130 | stopwords[a] = 1 131 | 132 | for token in nltk_stopwords: 133 | a = wordnet_lemmatizer.lemmatize(porter_stemmer.stem(token)) 134 | if a not in stopwords: 135 | stopwords[a] = 1 136 | 137 | for a in string.punctuation: 138 | if a not in replace_words: 139 | replace_words[a] = 1 140 | 141 | with open(replaceword_data) as infile: 142 | 143 | for line in infile: 144 | a = line.strip() 145 | if a not in replace_words: 146 | replace_words[a] = 1 147 | 148 | db= mongo.connect() 149 | word = {} 150 | idf = {} 151 | flag = {} 152 | 153 | for post in db.find(): 154 | body = post['body'].strip() 155 | flag = {} 156 | for i in replace_words: 157 | body = body.replace(i, '') 158 | list_token = nltk.word_tokenize(body) 159 | for token in list_token: 160 | # print token 161 | processed_token = wordnet_lemmatizer.lemmatize(porter_stemmer.stem(token.strip().lower())) 162 | if processed_token not in stopwords and not (processed_token.isdigit()): 163 | if processed_token not in word: 164 | word[processed_token]=1 165 | idf[processed_token] = 1 166 | flag[processed_token] = 1 167 | # print processed_token 168 | else: 169 | word[processed_token]=1 170 | if processed_token not in flag: 171 | flag[processed_token] = 
1 172 | idf[processed_token]=1 173 | 174 | for i in idf: 175 | if idf[i] > 7: 176 | try: 177 | print i 178 | except UnicodeEncodeError as e: 179 | pass 180 | 181 | # sorted_idf = sorted(idf.items(), key=operator.itemgetter(1), reverse = True) 182 | # for i in sorted_idf: 183 | # try: 184 | # print i[0], " : ",i[1] 185 | # except UnicodeEncodeError as e: 186 | # print "Unicode Error : ", i[1] 187 | 188 | def get_trainingdata(input_size = 100000, select_transform = 1, read_database = 1, to_print = 0, mode = "multiclass", repeat = 0, max_number_of_tags = 5): 189 | ''' 190 | generate training data 191 | if read_database == 0: 192 | All other options are ignored 193 | if mode == multilabel 194 | repeat option is ignored 195 | 196 | ''' 197 | 198 | fname_feature = "trainfeaturematrix.csv" 199 | fname_result = "trainresultmatrix.csv" 200 | fname_result_pickle = "trainresultmatrix" 201 | 202 | if read_database == 0: 203 | t0 = time() 204 | a = np.loadtxt(fname_feature, delimiter = ",") 205 | print("Loaded feature matrix for training from File in %fs" % (time() - t0)) 206 | # print "input_size = ", input_size 207 | # print a.size 208 | trainingdata_features = a.reshape(input_size, a.size/input_size) 209 | 210 | t0 = time() 211 | print("Loaded result matrix for training from File in %fs" % (time() - t0)) 212 | #print "input_size = ", input_size 213 | #print a.size 214 | with open(fname_python, 'rb') as f: 215 | trainingdata_result = pickle.load(f) 216 | #train = a.reshape(input_size, a.size/input_size) 217 | 218 | return trainingdata_features, trainingdata_result 219 | 220 | else: 221 | 222 | porter_stemmer = nltk.stem.porter.PorterStemmer() 223 | wordnet_lemmatizer = nltk.stem.WordNetLemmatizer() 224 | nltk_stopwords = nltk.corpus.stopwords.words('english') 225 | 226 | take_words = {} 227 | replace_words = {} 228 | stopwords = {} 229 | 230 | with open(stopword_data) as infile: 231 | for line in infile: 232 | i = line.strip().split() 233 | for token in i: 234 | a = wordnet_lemmatizer.lemmatize(porter_stemmer.stem(token)) 235 | if a not in stopwords: 236 | stopwords[a] = 1 237 | 238 | for a in string.punctuation: 239 | if a not in replace_words: 240 | replace_words[a] = 1 241 | 242 | with open(replaceword_data) as infile: 243 | for line in infile: 244 | a = line.strip() 245 | if a not in replace_words: 246 | replace_words[a] = 1 247 | 248 | db = mongo.connect() 249 | corpus = [] 250 | tag_set = set() 251 | question_tag = {} 252 | question_count = 0 253 | # counter = 0 254 | trainingdata_result = [] 255 | 256 | if mode == "multilabel": 257 | 258 | for post in list(db.find().skip(1).limit(input_size*(max_number_of_tags+1 ))): 259 | 260 | #not fool proof 261 | if(len(post['tag']) <= max_number_of_tags): 262 | question_tag[question_count] = [] 263 | 264 | trainingdata_result.append(post['tag']) 265 | 266 | body = post['body'].strip() 267 | for i in replace_words: 268 | body = body.replace(i, '') 269 | list_token = nltk.word_tokenize(body) 270 | processed_body = "" 271 | for token in list_token: 272 | processed_token = wordnet_lemmatizer.lemmatize(porter_stemmer.stem(token.strip().lower())) 273 | if processed_token not in stopwords and not (processed_token.isdigit()): 274 | processed_body+=processed_token+" " 275 | corpus.append(processed_body.strip()) 276 | 277 | for i in post['tag']: 278 | question_tag[question_count].append(i) 279 | tag_set.add(i) 280 | question_count+=1 281 | 282 | if(question_count>=input_size): 283 | break 284 | 285 | 286 | #entire point of writing to csv is to use the data with 
matlab 287 | sorted_taglist = sorted(tag_set) 288 | tag_dict = {} 289 | tag_count = 0 290 | # print "size of set" 291 | # print len(tag_set) 292 | for i in sorted_taglist: 293 | # print i 294 | tag_dict[i] = tag_count 295 | tag_count+=1 296 | # print "number of unique tags = "+str(tag_count) 297 | train_matrix = np.zeros((input_size, tag_count), dtype = np.int) 298 | for i in question_tag: 299 | for j in question_tag[i]: 300 | train_matrix[i][tag_dict[j]]=1 301 | with open(fname_result_pickle, 'wb') as f: 302 | pickle.dump(trainingdata_result, f) 303 | 304 | if(select_transform == 1): 305 | transform = CountVectorizer(min_df=1) 306 | elif(select_transform == 2): 307 | transform = TfidfVectorizer(min_df=1) 308 | a = transform.fit_transform(corpus) 309 | # print transform.get_feature_names() 310 | trainingdata_features = a.toarray() 311 | # print trainingdata_features 312 | np.savetxt(fname_feature, trainingdata_features, delimiter=",") 313 | 314 | elif mode == "multiclass": 315 | if repeat == 0: 316 | for post in list(db.find().skip(1).limit(input_size*(max_number_of_tags+1))): 317 | #not fool proof 318 | 319 | if(len(post['tag']) <= max_number_of_tags): 320 | # print len(post['tag']) 321 | body = post['body'].strip() 322 | for i in replace_words: 323 | body = body.replace(i, '') 324 | list_token = nltk.word_tokenize(body) 325 | processed_body = "" 326 | for token in list_token: 327 | processed_token = wordnet_lemmatizer.lemmatize(porter_stemmer.stem(token.strip().lower())) 328 | if processed_token not in stopwords and not (processed_token.isdigit()): 329 | processed_body+=processed_token+" " 330 | 331 | for i in post['tag']: 332 | # print i 333 | question_tag[question_count] = [] 334 | question_tag[question_count].append(i) 335 | #can be done in a single step - then do it 336 | tag_set.add(i) 337 | question_count+=1 338 | corpus.append(processed_body.strip()) 339 | trainingdata_result.append([i]) 340 | if(question_count >= input_size): 341 | break 342 | 343 | if(question_count >= input_size): 344 | break 345 | 346 | # print corpus 347 | # print 348 | #entire point of writing to csv is to use the data with matlab 349 | sorted_taglist = sorted(tag_set) 350 | tag_dict = {} 351 | tag_count = 0 352 | # print "size of set" 353 | # print len(tag_set) 354 | for i in sorted_taglist: 355 | # print i 356 | tag_dict[i] = tag_count 357 | tag_count+=1 358 | print "number of unique tags = "+str(tag_count) 359 | train_matrix = np.zeros((input_size, tag_count), dtype = np.int) 360 | for i in question_tag: 361 | for j in question_tag[i]: 362 | train_matrix[i][tag_dict[j]]=1 363 | with open(fname_result_pickle, 'wb') as f: 364 | pickle.dump(trainingdata_result, f) 365 | 366 | if(select_transform == 1): 367 | transform = CountVectorizer(min_df=1) 368 | elif(select_transform == 2): 369 | transform = TfidfVectorizer(min_df=1) 370 | a = transform.fit_transform(corpus) 371 | # print transform.get_feature_names() 372 | trainingdata_features = a.toarray() 373 | # print trainingdata_features 374 | np.savetxt(fname_feature, trainingdata_features, delimiter=",") 375 | 376 | 377 | if to_print == 1: 378 | for i in trainingdata_features: 379 | to_print = "" 380 | for j in i: 381 | to_print+=str(j)+", " 382 | to_print = to_print[:-2] 383 | print to_print 384 | 385 | return trainingdata_features, trainingdata_result 386 | 387 | if __name__ == "__main__": 388 | # get_trainmatrix(input_size = 10000) 389 | get_featurematrix(200, select_transform = 2, read_database = 1) 390 | get_trainmatrix(200, read_database = 1) 391 | 392 | 
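Taken together, the app scripts form a pipeline: app/generator.py (run as `python app/generator.py` from the repository root) filters the raw CSV down to posts whose tags are all among the most frequent ones and loads them into MongoDB; app/stat.py turns those posts into a count or TF-IDF feature matrix plus per-post tag lists; and app/classifier.py reduces the features with a truncated SVD and trains and evaluates the SVM. Once the database is populated, training can be driven from the repository root with a call like the sketch below (argument values are illustrative, mirroring the script's own __main__; assumes MongoDB is running and config/config.cfg is filled in).

from app import classifier

# TF-IDF features (select_transform=2), one tag per post (mode="multiclass"),
# 80/20 train/test split (k=0.8), LinearSVC with the Crammer-Singer strategy;
# use_cache=0 computes the SVD instead of loading a cached copy.
classifier.predict(10000, select_transform=2, read_database=1,
                   model="LinearSVC", mode="multiclass", k=0.8,
                   max_number_of_tags=2, max_iter=10000, use_cache=0)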
--------------------------------------------------------------------------------
/config/config.cfg.sample:
--------------------------------------------------------------------------------
1 | [tagger]
2 | host =
3 | port =
4 | db_name =
5 | collection_name =
6 | 
--------------------------------------------------------------------------------
/database/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shagunsodhani/StackExchange-tagger/8fac6a40f3de416776f236dfa4cf8e3dbd64cf5b/database/__init__.py
--------------------------------------------------------------------------------
/database/mongo.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | #---------------------------------------------------------Import Modules----------------------------------------------------------------------#
3 | 
4 | import os
5 | from ConfigParser import ConfigParser
6 | 
7 | try:
8 |     import pymongo
9 | except ImportError as exc:
10 |     print("Error: failed to import the pymongo module ({})".format(exc))
11 | 
12 | def connect(app_name = "tagger", config_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), '../config', 'config.cfg')):
13 | 
14 |     '''Open a connection to MongoDB and return the collection object to run queries on'''
15 |     config = ConfigParser()
16 |     config.read(config_path)
17 |     host = config.get(app_name, "host")
18 |     port = config.get(app_name, "port")
19 |     db_name = config.get(app_name, "db_name")
20 |     collection_name = config.get(app_name, "collection_name")
21 |     try:
22 |         client = pymongo.MongoClient(host, int(port))
23 |         db = client[db_name]
24 |         return db[collection_name]
25 |     except pymongo.errors.PyMongoError as e:
26 |         print "ERROR IN CONNECTION: %s" % e
27 |         return 0
28 | 
--------------------------------------------------------------------------------
/install.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 7F0CEB10
4 | echo "deb http://repo.mongodb.org/apt/ubuntu "$(lsb_release -sc)"/mongodb-org/3.0 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-3.0.list
5 | apt-get update
6 | # beautifulsoup4 provides the bs4 package imported by the app scripts
7 | pip install pymongo numpy nltk beautifulsoup4 cython sparsesvd scikit-learn
8 | apt-get install -y mongodb-org python-scipy
9 | service mongod start
10 | # Download the NLTK datasets used by the app (stopwords, punkt tokenizer, WordNet)
11 | python -m nltk.downloader stopwords punkt wordnet
--------------------------------------------------------------------------------
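database/mongo.py reads config/config.cfg with ConfigParser and looks the connection details up in the section named by app_name, which defaults to "tagger" (the app scripts call mongo.connect() with no arguments). After running install.sh, copy config/config.cfg.sample to config/config.cfg and fill it in; for a local MongoDB instance it would look something like the sketch below, where db_name and collection_name are arbitrary placeholder names, and 27017 is MongoDB's default port.

[tagger]
host = localhost
port = 27017
db_name = stackexchange
collection_name = posts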