├── .gitignore
├── README.md
├── app
│   ├── __init__.py
│   ├── classifier.py
│   ├── evaluate.py
│   ├── generator.py
│   ├── parser.py
│   ├── plot_tags.py
│   └── stat.py
├── config
│   └── config.cfg.sample
├── database
│   ├── __init__.py
│   └── mongo.py
└── install.sh
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | 
5 | # C extensions
6 | *.so
7 | 
8 | # Distribution / packaging
9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | lib/
17 | lib64/
18 | parts/
19 | sdist/
20 | var/
21 | *.egg-info/
22 | .installed.cfg
23 | *.egg
24 | 
25 | # PyInstaller
26 | # Usually these files are written by a python script from a template
27 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
28 | *.manifest
29 | *.spec
30 | 
31 | # Installer logs
32 | pip-log.txt
33 | pip-delete-this-directory.txt
34 | 
35 | # Unit test / coverage reports
36 | htmlcov/
37 | .tox/
38 | .coverage
39 | .cache
40 | nosetests.xml
41 | coverage.xml
42 | 
43 | # Translations
44 | *.mo
45 | *.pot
46 | 
47 | # Django stuff:
48 | *.log
49 | 
50 | # Sphinx documentation
51 | docs/_build/
52 | 
53 | # PyBuilder
54 | target/
55 | 
56 | # config
57 | config/config.cfg
58 | 
59 | # dataset
60 | data/*
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # StackExchange-tagger
2 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.972132.svg)](https://doi.org/10.5281/zenodo.972132)
3 | 
4 | The goal of our project is to develop an accurate tagger for questions posted on Stack Exchange. Our problem is an instance of the more general problem of developing accurate classifiers for large-scale text datasets. We tackle the multilabel classification problem, where each item (in this case, a question) can belong to multiple classes (in this case, tags). We predict the tags (or keywords) for a Stack Exchange post given only the question text and the title of the post. In the process, we compare the performance of Support Vector Classification (SVC) across different kernel functions, loss functions, and multi-class techniques.
5 | 
6 | We found that a linear SVC trained with the Crammer-Singer technique produces the best results.
7 | 
8 | # Some Results
9 | 
10 | Testing error for SVC with different kernel functions (number of iterations = 10,000):
11 | 
12 | | Kernel | C = 1000 (hard margin) | C = 0.001 (soft margin) |
13 | |------------------|------------------------|-------------------------|
14 | | RBF | 43.1 % | 48.5 % |
15 | | Linear | 51.9 % | 45.2 % |
16 | | Polynomial (n=2) | 54.4 % | 65 % |
17 | | Polynomial (n=3) | 72.2 % | 84.4 % |
18 | | Sigmoid | 84.4 % | 84.4 % |
19 | 
20 | 
21 | Testing error for linear SVC with different multi-class techniques (C = 0.001, soft margin; number of iterations = 10,000):
22 | 
23 | | Technique | Hinge Loss Function | Squared Hinge Loss Function |
24 | |----------------|---------------------|-----------------------------|
25 | | One-vs-rest | 47.59 % | 68 % |
26 | | Crammer-Singer | 45.25 % | 45.25 % |
27 | 
28 | 
29 | # Report
30 | 
31 | Our detailed report and results are available [here](https://sites.google.com/site/sanketmehtaiitr/home/stack-exchange-tagger).
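# Reproducing the classifier setup

The configuration behind the best numbers above is scikit-learn's `LinearSVC` (soft margin, C = 0.001, squared hinge loss, Crammer-Singer multi-class strategy) wrapped in a one-vs-rest scheme, exactly as set up in `app/classifier.py`. The snippet below is only a minimal sketch on toy data; the real pipeline (TF-IDF features, SVD dimensionality reduction, MongoDB-backed corpus) lives in `app/stat.py` and `app/classifier.py`.

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC

# Toy stand-ins for the real corpus built by app/stat.py.
posts = ["how do i merge two branches in git",
         "segmentation fault when freeing a pointer in c",
         "git rebase vs git merge workflow"]
tags = [["git"], ["c"], ["git"]]

X = TfidfVectorizer().fit_transform(posts)
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(tags)

# Best-performing setup from the tables above.
clf = OneVsRestClassifier(
    LinearSVC(C=0.001, loss="squared_hinge",
              multi_class="crammer_singer", max_iter=10000))
clf.fit(X, Y)
print(mlb.inverse_transform(clf.predict(X)))
```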
32 | 33 | 34 | #Team 35 | 36 | * [Sanket Mehta](https://twitter.com/sanketvmehta) 37 | * [Shagun Sodhani](https://twitter.com/shagunsodhani) 38 | 39 | This work has been done as a part of a course project for Artificial Neural Network (IEE-03) at IIT Roorkee. 40 | -------------------------------------------------------------------------------- /app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shagunsodhani/StackExchange-tagger/8fac6a40f3de416776f236dfa4cf8e3dbd64cf5b/app/__init__.py -------------------------------------------------------------------------------- /app/classifier.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import json 4 | import string 5 | import operator 6 | import pickle 7 | from time import time 8 | 9 | import numpy as np 10 | from sklearn import svm 11 | from sklearn.multiclass import OneVsRestClassifier 12 | from sklearn.multiclass import OneVsOneClassifier 13 | from sklearn.preprocessing import MultiLabelBinarizer 14 | 15 | path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) 16 | 17 | if not path in sys.path: 18 | sys.path.insert(1, path) 19 | del path 20 | 21 | try: 22 | import database.mongo as mongo 23 | except ImportError as exc: 24 | print("Error: failed to import settings module ({})".format(exc)) 25 | 26 | try: 27 | from app import stat 28 | except ImportError as exc: 29 | print("Error: failed to import settings module ({})".format(exc)) 30 | 31 | try: 32 | from app import evaluate 33 | except ImportError as exc: 34 | print("Error: failed to import settings module ({})".format(exc)) 35 | 36 | def input_representation(result): 37 | tag_count = {} 38 | for i in result: 39 | for j in i: 40 | if j not in tag_count: 41 | tag_count[j] = 1 42 | else: 43 | tag_count[j]+=1 44 | for j in tag_count: 45 | print str(j)+" : "+str(tag_count[j]) 46 | 47 | def predict(input_size = 100000, select_transform = 1, read_database = 1, one_vs_one = 0, model = "LinearSVC", mode = "multilable", repeat = 0, k = 0.8, max_number_of_tags = 5, max_iter = 100000, use_cache = 0): 48 | 49 | to_print = 0 50 | raw_train_data, raw_train_results = stat.get_trainingdata(input_size, select_transform = select_transform, read_database = read_database, to_print = to_print, mode = mode, repeat = repeat, max_number_of_tags = max_number_of_tags) 51 | t0 = time() 52 | # k = 0.8 53 | 54 | # # print raw_train_data 55 | # print raw_train_data 56 | # print raw_train_results 57 | 58 | split_point = int(k*input_size) 59 | # print split_point 60 | train_data = raw_train_data[0:split_point,:] 61 | train_results = raw_train_results[0:split_point] 62 | # print train_results 63 | # print train_data 64 | # print train_results 65 | 66 | test_data = raw_train_data[split_point:,:] 67 | test_results = raw_train_results[split_point:] 68 | # print test_results 69 | 70 | fname_U = "SVD_U.txt" 71 | fname_V = "SVD_V.txt" 72 | fname_S = "SVD_S.txt" 73 | 74 | if use_cache==1: 75 | with open(fname_U, 'rb') as f: 76 | U = pickle.load(f) 77 | with open(fname_V, 'rb') as f: 78 | V = pickle.load(f) 79 | with open(fname_S, 'rb') as f: 80 | s = pickle.load(f) 81 | print "Using SVD from file" 82 | else: 83 | U, s, V = np.linalg.svd(train_data, full_matrices=True) 84 | with open(fname_U, 'wb') as f: 85 | pickle.dump(U, f) 86 | with open(fname_V, 'wb') as f: 87 | pickle.dump(V, f) 88 | with open(fname_S, 'wb') as f: 89 | pickle.dump(s, f) 90 | print "Using SVD by 
calculation" 91 | 92 | print("SVD decomposition done in %fs" % (time() - t0)) 93 | square_sum_s = np.square(s).sum() 94 | #not sure if this is the most optimal way for finding the sum of squares 95 | 96 | temp_sum = 0 97 | count = 0 98 | for i in s: 99 | temp_sum+= i*i 100 | count+=1 101 | if(temp_sum >= 0.9*square_sum_s): 102 | break; 103 | 104 | print "count = "+str(count) 105 | x = np.delete(V, np.s_[count::1], 0) 106 | processedV = np.transpose(x) 107 | train_X = np.dot(train_data, processedV) 108 | test_X = np.dot(test_data, processedV) 109 | 110 | # X = X_raw[0:k*input_size + 1, :] 111 | # test_X = X_raw[k*input_size+1:,:] 112 | 113 | 114 | # print "count = "+str(count) 115 | # print "V.shape = "+str(V.shape) 116 | # print "s.shape = "+str(s.shape) 117 | # x = np.delete(V, np.s_[count::1], 0) 118 | # print "x.shape = "+str(x.shape) 119 | # print "raw_train_data.shape = "+str(raw_train_data) 120 | # print "processedV.shape = "+str(processedV.shape) 121 | 122 | #can use splicing instead of delete 123 | 124 | # print "X.shape = "+str(X.shape) 125 | 126 | # train_results = stat.get_trainmatrix(input_size, read_database = read_database, to_print = to_print) 127 | 128 | mlb = MultiLabelBinarizer() 129 | trainingdata_results = mlb.fit_transform(raw_train_results) 130 | # print train_results 131 | train_Y = trainingdata_results[0:split_point,:] 132 | test_Y = trainingdata_results[split_point+1:,:] 133 | 134 | # print train_Y 135 | # test_Y = mlb.fit_transform(test_results) 136 | # print test_results 137 | 138 | 139 | # print Y.shape 140 | # test_X = X[0:k*input_size,:] 141 | # print train_X 142 | # print train_Y 143 | # print train_results 144 | 145 | if(one_vs_one == 1): 146 | clf = OneVsOneClassifier(svm.LinearSVC(random_state=0, max_iter =10000, verbose = 0)) 147 | prediction_Y = clf.fit(X, Y).predict(X) 148 | else: 149 | if model == "LinearSVC": 150 | print "Showing Results for one vs rest multilabel classifier using LinearSVC model" 151 | clf = OneVsRestClassifier(svm.LinearSVC(random_state=0, dual = True, max_iter = max_iter, verbose = 0, C = 0.001, loss = "squared_hinge", multi_class="crammer_singer")) 152 | 153 | elif model == "SVC": 154 | print "Showing Results for one vs rest multilabel classifier using SVC model" 155 | clf = OneVsRestClassifier(svm.SVC(C = 0.001, kernel = 'poly', max_iter = max_iter, verbose = 0, degree = 3)) 156 | clf.fit(train_X, train_Y) 157 | print clf.get_params 158 | scores = clf.decision_function(test_X) 159 | scores_train = clf.decision_function(train_X) 160 | 161 | indices = scores.argmax(axis = 1) 162 | indices_train = scores_train.argmax(axis = 1) 163 | 164 | prediction_Y = np.zeros(scores.shape) 165 | prediction_train = np.zeros(scores_train.shape) 166 | 167 | # print prediction_Y.shape 168 | for i in range(0, len(indices)): 169 | prediction_Y[i][indices[i]] = 1 170 | 171 | for i in range(0, len(indices_train)): 172 | prediction_train[i][indices_train[i]] = 1 173 | 174 | 175 | 176 | #class sklearn.svm.LinearSVC(penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None, max_iter=1000 177 | #class sklearn.svm.SVC(C=1.0, kernel='rbf', degree=3, gamma=0.0, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, random_state=None) 178 | 179 | prediction = mlb.inverse_transform(prediction_Y) 180 | # print prediction 181 | # for i in prediction: 182 | # print i 183 | # print "\n" 
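# Note: the block above collapses the one-vs-rest decision scores to a
# single predicted tag per post via argmax, so each prediction set passed
# to the evaluate.* metrics below contains exactly one tag, while the
# actual tag lists may contain several.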
184 | # for i in test_results: 185 | # print i 186 | # print clf.decision_function(test_X) 187 | # # # # print Y 188 | # print test_Y 189 | print "Testing Error : " 190 | evaluate.accuracy_atleast_one_match(test_results, prediction) 191 | evaluate.accuracy_null_results(prediction) 192 | evaluate.accuracy_exact_match(test_results, prediction) 193 | evaluate.accuracy_multilabel(test_results, prediction) 194 | evaluate.precision_multilabel(test_results, prediction) 195 | evaluate.recall_multilabel(test_results, prediction) 196 | evaluate.hamming_loss_multilabel(test_results, prediction) 197 | 198 | # print train_results 199 | # print prediction 200 | # print prediction 201 | print "Training Error : " 202 | prediction = mlb.inverse_transform(prediction_train) 203 | # for i in prediction: 204 | # print i 205 | # print "\n" 206 | # for i in test_results: 207 | # print i 208 | # print clf.decision_function(test_X) 209 | # # # # print Y 210 | # print test_Y 211 | # print prediction_Y 212 | evaluate.accuracy_atleast_one_match(train_results, prediction) 213 | evaluate.accuracy_null_results(prediction) 214 | evaluate.accuracy_exact_match(train_results, prediction) 215 | evaluate.accuracy_multilabel(train_results, prediction) 216 | evaluate.precision_multilabel(train_results, prediction) 217 | evaluate.recall_multilabel(train_results, prediction) 218 | evaluate.hamming_loss_multilabel(train_results, prediction) 219 | 220 | # print raw_train_data.shape 221 | 222 | 223 | 224 | 225 | if __name__ == "__main__": 226 | predict(10000, select_transform = 2, read_database = 1, one_vs_one = 0, model = "LinearSVC", mode="multiclass", repeat = 0, k = 0.8, max_number_of_tags = 2, max_iter = 10000, use_cache = 1) 227 | -------------------------------------------------------------------------------- /app/evaluate.py: -------------------------------------------------------------------------------- 1 | def accuracy_atleast_one_match(actual, prediction, verbose = 1): 2 | ''' 3 | actual - list of actual results 4 | prediction - list of predicted results 5 | ''' 6 | 7 | length = len(actual) 8 | count = 0.0 9 | for i in range(0, length): 10 | flag = 0 11 | for j in prediction[i]: 12 | if j in actual[i]: 13 | flag = 1 14 | count+=flag 15 | # print "Result : "+str(result[i]) 16 | # print "Prediction : "+str(prediction[i]) 17 | 18 | if(verbose): 19 | print "Accuracy for matching atleast one = "+str(count/length) 20 | return count/length 21 | 22 | def accuracy_null_results(prediction, verbose = 1): 23 | ''' 24 | actual - list of actual results 25 | prediction - list of predicted results 26 | ''' 27 | length = len(prediction) 28 | count = 0.0 29 | for i in range(0, length): 30 | if not prediction[i]: 31 | count+=1 32 | if(verbose): 33 | print "Percentage of null_results = "+str(count/length) 34 | return count/length 35 | 36 | def accuracy_exact_match(actual, prediction, verbose = 1): 37 | ''' 38 | actual - list of actual results 39 | prediction - list of predicted results 40 | ''' 41 | 42 | length = len(actual) 43 | count = 0.0 44 | for i in range(0, length): 45 | flag = 1 46 | if len(prediction[i]) == len(actual[i]): 47 | for j in prediction[i]: 48 | if j not in actual[i]: 49 | flag = 0 50 | else: 51 | flag = 0 52 | count+=flag 53 | # print "Result : "+str(result[i]) 54 | # print "Prediction : "+str(prediction[i]) 55 | 56 | if(verbose): 57 | print "Accuracy for exact matching = "+str(count/length) 58 | return count/length 59 | 60 | def hamming_loss_multilabel(actual, prediction, verbose = 1): 61 | ''' 62 | actual - list of 
actual results 63 | prediction - list of predicted results 64 | defination taken from http://lpis.csd.auth.gr/publications/tsoumakas-ijdwm.pdf 65 | ''' 66 | 67 | length = len(actual) 68 | hamming_loss = 0.0 69 | for i in range(0, length): 70 | yi = set() 71 | zi = set() 72 | 73 | for j in actual[i]: 74 | yi.add(j) 75 | 76 | for j in prediction[i]: 77 | zi.add(j) 78 | 79 | hamming_loss+=(len(yi.symmetric_difference(zi))+0.0)/len(zi) 80 | 81 | hamming_loss = hamming_loss/length 82 | if (verbose): 83 | print "Hamming Loss = "+str(hamming_loss) 84 | return hamming_loss 85 | 86 | def accuracy_multilabel(actual, prediction, verbose = 1): 87 | ''' 88 | actual - list of actual results 89 | prediction - list of predicted results 90 | defination taken from http://lpis.csd.auth.gr/publications/tsoumakas-ijdwm.pdf 91 | ''' 92 | 93 | length = len(actual) 94 | accuracy = 0.0 95 | for i in range(0, length): 96 | yi = set() 97 | zi = set() 98 | 99 | for j in actual[i]: 100 | yi.add(j) 101 | 102 | for j in prediction[i]: 103 | zi.add(j) 104 | 105 | accuracy+=(len(yi.intersection(zi))+0.0)/len(yi.union(zi)) 106 | 107 | accuracy = accuracy/length 108 | if (verbose): 109 | print "Accuracy (Godbole & Sarawagi) = "+str(accuracy) 110 | return accuracy 111 | 112 | def precision_multilabel(actual, prediction, verbose = 1): 113 | ''' 114 | actual - list of actual results 115 | prediction - list of predicted results 116 | defination taken from http://lpis.csd.auth.gr/publications/tsoumakas-ijdwm.pdf 117 | ''' 118 | 119 | length = len(actual) 120 | precision = 0.0 121 | for i in range(0, length): 122 | yi = set() 123 | zi = set() 124 | 125 | for j in actual[i]: 126 | yi.add(j) 127 | 128 | for j in prediction[i]: 129 | zi.add(j) 130 | 131 | precision+=(len(yi.intersection(zi))+0.0)/len(zi) 132 | # print zi 133 | precision = precision/length 134 | if (verbose): 135 | print "Precision (Godbole & Sarawagi) = "+str(precision) 136 | return precision 137 | 138 | def recall_multilabel(actual, prediction, verbose = 1): 139 | ''' 140 | actual - list of actual results 141 | prediction - list of predicted results 142 | defination taken from http://lpis.csd.auth.gr/publications/tsoumakas-ijdwm.pdf 143 | ''' 144 | 145 | length = len(actual) 146 | recall = 0.0 147 | for i in range(0, length): 148 | yi = set() 149 | zi = set() 150 | 151 | for j in actual[i]: 152 | yi.add(j) 153 | 154 | for j in prediction[i]: 155 | zi.add(j) 156 | 157 | recall+=(len(yi.intersection(zi))+0.0)/len(yi) 158 | 159 | recall = recall/length 160 | if (verbose): 161 | print "Recall (Godbole & Sarawagi) = "+str(recall) 162 | return recall 163 | -------------------------------------------------------------------------------- /app/generator.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import json 4 | import time 5 | import string 6 | 7 | path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) 8 | 9 | if not path in sys.path: 10 | sys.path.insert(1, path) 11 | del path 12 | 13 | try: 14 | import database.mongo as mongo 15 | except ImportError as exc: 16 | print("Error: failed to import settings module ({})".format(exc)) 17 | 18 | try: 19 | import nltk 20 | except ImportError as exc: 21 | print("Error: failed to import settings module ({})".format(exc)) 22 | 23 | try: 24 | from bs4 import BeautifulSoup 25 | except ImportError as exc: 26 | print("Error: failed to import settings module ({})".format(exc)) 27 | 28 | try: 29 | from parser import fetch_top_tags 30 | except 
ImportError as exc: 31 | print("Error: failed to import settings module ({})".format(exc)) 32 | 33 | test_data = "data/processed.csv" 34 | stopword_data = "data/stopword.txt" 35 | replaceword_data = "data/replaceword.txt" 36 | tag_data = "data/tag.txt" 37 | 38 | def generate_data(tag_count, question_count): 39 | #generate data using top 'tag_count' number of tags and 'question_count' number of questions 40 | db = mongo.connect() 41 | tags = fetch_top_tags(tag_count) 42 | print tags 43 | time.sleep(30) 44 | count = 0 45 | with open(test_data) as infile: 46 | for line in infile: 47 | striped_line = line.strip() 48 | if striped_line: 49 | a = striped_line.split(',', 2) 50 | post_id = str(a[0]).replace('\"', '').strip() 51 | title = str(a[1]).replace('\"', '').strip() 52 | a = a[2].rsplit(',', 1) 53 | tag_list = a[1].replace('\"', '').replace('\'', '').split() 54 | # print tag_list 55 | flag = 0 56 | for tag in tag_list: 57 | if tag not in tags: 58 | flag = 1 59 | if(flag==0): 60 | # print "printing" 61 | count+=1; 62 | body = a[0] 63 | code = "" 64 | soup = BeautifulSoup(body) 65 | body = soup.get_text() 66 | for code_snippet in soup.find_all('code'): 67 | temp_code = code_snippet.get_text().strip() 68 | code+= temp_code + "\n" 69 | body = body.replace(temp_code, "") 70 | body = ' '.join(body.split()) 71 | post = {} 72 | post['post_id'] = post_id 73 | post['title'] = title 74 | post['body'] = body 75 | post['tag'] = tag_list 76 | post['code'] = code 77 | mongo_id = db.insert(post) 78 | # print tag_list 79 | if(count%10000 == 0): 80 | print count, " number of questions processed" 81 | if(count > question_count): 82 | break; 83 | 84 | if __name__ == "__main__": 85 | generate_data(10, 40000) 86 | -------------------------------------------------------------------------------- /app/parser.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import json 4 | import time 5 | import string 6 | import operator 7 | 8 | path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) 9 | 10 | if not path in sys.path: 11 | sys.path.insert(1, path) 12 | del path 13 | 14 | try: 15 | import database.mongo as mongo 16 | except ImportError as exc: 17 | print("Error: failed to import settings module ({})".format(exc)) 18 | 19 | try: 20 | import nltk 21 | except ImportError as exc: 22 | print("Error: failed to import settings module ({})".format(exc)) 23 | 24 | try: 25 | from bs4 import BeautifulSoup 26 | except ImportError as exc: 27 | print("Error: failed to import settings module ({})".format(exc)) 28 | 29 | test_data = "data/processed.csv" 30 | stopword_data = "data/stopword.txt" 31 | replaceword_data = "data/replaceword.txt" 32 | 33 | def preprocess_dataset(): 34 | #preprocess the raw dataset we got online 35 | count = 0 36 | with open(test_data) as infile: 37 | for line in infile: 38 | if(line[-3:]=="\"\r\n"): 39 | #End of one post 40 | print line.strip() 41 | else: 42 | print line.strip(), 43 | count+=1 44 | # print count 45 | 46 | def remove_stopwords(): 47 | #remove all the stopwords 48 | porter_stemmer = nltk.stem.porter.PorterStemmer() 49 | wordnet_lemmatizer = nltk.stem.WordNetLemmatizer() 50 | nltk_stopwords = nltk.corpus.stopwords.words('english') 51 | 52 | stopwords = {} 53 | replace_words = {} 54 | stopword_count = 0 55 | takenword_count = 0 56 | 57 | with open(stopword_data) as infile: 58 | for line in infile: 59 | i = line.strip().split() 60 | for token in i: 61 | a = wordnet_lemmatizer.lemmatize(porter_stemmer.stem(token)) 
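# Stopwords are normalized with the same Porter stem + WordNet
# lemmatization applied to the document tokens further down, so
# membership checks against the stopword dict compare like with like.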
62 | if a not in stopwords: 63 | stopwords[a] = 1 64 | 65 | for token in nltk_stopwords: 66 | a = wordnet_lemmatizer.lemmatize(porter_stemmer.stem(token)) 67 | if a not in stopwords: 68 | stopwords[a] = 1 69 | 70 | for a in string.punctuation: 71 | if a not in replace_words: 72 | replace_words[a] = 1 73 | 74 | with open(replaceword_data) as infile: 75 | for line in infile: 76 | a = line.strip() 77 | if a not in replace_words: 78 | replace_words[a] = 1 79 | 80 | with open(test_data) as infile: 81 | for line in infile: 82 | striped_line = line.strip() 83 | if striped_line : 84 | a = striped_line.split(',',2) 85 | post_id = str(a[0]) 86 | title = str(a[1]) 87 | a = a[2].rsplit(',',1) 88 | tag_list_string = a[1] 89 | body = a[0] 90 | #print body 91 | soup = BeautifulSoup(body) 92 | body = soup.get_text() 93 | for i in replace_words: 94 | body = body.replace(i, '') 95 | body = ' '.join(body.split()) 96 | list_token = nltk.word_tokenize(body) 97 | for token in list_token: 98 | processed_token = wordnet_lemmatizer.lemmatize(porter_stemmer.stem(token.strip().lower())) 99 | if(processed_token in stopwords): 100 | stopword_count+=1 101 | else: 102 | takenword_count+=1 103 | try: 104 | print processed_token 105 | except UnicodeEncodeError as e: 106 | print "Unicode Encode Error ", e 107 | print "\n" 108 | print "stopword_count : ", stopword_count 109 | print "takenword_count : ", takenword_count 110 | 111 | def fetch_top_tags(k = 100): 112 | #script to fetch top k most popular tags from raw data 113 | tags = {} 114 | with open(test_data) as infile: 115 | for line in infile: 116 | striped_line = line.strip().replace('"','') 117 | if striped_line : 118 | a = striped_line.split(',',2) 119 | post_id = str(a[0]) 120 | title = str(a[1]) 121 | a = a[2].rsplit(',',1) 122 | tag_list = a[1].split(' ') 123 | for tag in tag_list: 124 | if tag not in tags: 125 | tags[tag]=1 126 | else: 127 | tags[tag]+=1 128 | sorted_tags = sorted(tags.items(), key=operator.itemgetter(1), reverse = True) 129 | tag_dict = {} 130 | with open("data/tag.txt", "w") as f: 131 | for i in range(0, k): 132 | f.write(sorted_tags[i][0]) 133 | f.write("\n") 134 | tag_dict[sorted_tags[i][0]]=0 135 | 136 | return tag_dict 137 | 138 | #preprocess_dataset() 139 | #remove_stopwords() 140 | # fetch_top_tags() 141 | -------------------------------------------------------------------------------- /app/plot_tags.py: -------------------------------------------------------------------------------- 1 | print(__doc__) 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_multilabel_classification 7 | from sklearn.multiclass import OneVsRestClassifier 8 | from sklearn.svm import SVC 9 | from sklearn.preprocessing import LabelBinarizer 10 | from sklearn.decomposition import PCA 11 | from sklearn.cross_decomposition import CCA 12 | 13 | 14 | def plot_hyperplane(clf, min_x, max_x, linestyle, label): 15 | # get the separating hyperplane 16 | w = clf.coef_[0] 17 | a = -w[0] / w[1] 18 | xx = np.linspace(min_x - 5, max_x + 5) # make sure the line is long enough 19 | yy = a * xx - (clf.intercept_[0]) / w[1] 20 | plt.plot(xx, yy, linestyle, label=label) 21 | 22 | 23 | def plot_subfigure(X, Y, subplot, title, transform): 24 | if transform == "pca": 25 | X = PCA(n_components=2).fit_transform(X) 26 | elif transform == "cca": 27 | X = CCA(n_components=2).fit(X, Y).transform(X) 28 | else: 29 | raise ValueError 30 | 31 | min_x = np.min(X[:, 0]) 32 | max_x = np.max(X[:, 0]) 33 | 34 | min_y = np.min(X[:, 1]) 35 | max_y = 
np.max(X[:, 1]) 36 | 37 | classif = OneVsRestClassifier(SVC(kernel='linear')) 38 | classif.fit(X, Y) 39 | 40 | plt.subplot(2, 2, subplot) 41 | plt.title(title) 42 | 43 | zero_class = np.where(Y[:, 0]) 44 | one_class = np.where(Y[:, 1]) 45 | plt.scatter(X[:, 0], X[:, 1], s=40, c='gray') 46 | plt.scatter(X[zero_class, 0], X[zero_class, 1], s=160, edgecolors='b', 47 | facecolors='none', linewidths=2, label='Class 1') 48 | plt.scatter(X[one_class, 0], X[one_class, 1], s=80, edgecolors='orange', 49 | facecolors='none', linewidths=2, label='Class 2') 50 | 51 | plot_hyperplane(classif.estimators_[0], min_x, max_x, 'k--', 52 | 'Boundary\nfor class 1') 53 | plot_hyperplane(classif.estimators_[1], min_x, max_x, 'k-.', 54 | 'Boundary\nfor class 2') 55 | plt.xticks(()) 56 | plt.yticks(()) 57 | 58 | plt.xlim(min_x - .5 * max_x, max_x + .5 * max_x) 59 | plt.ylim(min_y - .5 * max_y, max_y + .5 * max_y) 60 | if subplot == 2: 61 | plt.xlabel('First principal component') 62 | plt.ylabel('Second principal component') 63 | plt.legend(loc="upper left") 64 | 65 | 66 | plt.figure(figsize=(8, 6)) 67 | 68 | X, Y = make_multilabel_classification(n_classes=2, n_labels=1, 69 | allow_unlabeled=True, 70 | return_indicator=True, 71 | random_state=1) 72 | 73 | plot_subfigure(X, Y, 1, "With unlabeled samples + CCA", "cca") 74 | plot_subfigure(X, Y, 2, "With unlabeled samples + PCA", "pca") 75 | 76 | X, Y = make_multilabel_classification(n_classes=2, n_labels=1, 77 | allow_unlabeled=False, 78 | return_indicator=True, 79 | random_state=1) 80 | 81 | plot_subfigure(X, Y, 3, "Without unlabeled samples + CCA", "cca") 82 | plot_subfigure(X, Y, 4, "Without unlabeled samples + PCA", "pca") 83 | 84 | plt.subplots_adjust(.04, .02, .97, .94, .09, .2) 85 | plt.show() 86 | -------------------------------------------------------------------------------- /app/stat.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import json 4 | import time 5 | import string 6 | import operator 7 | 8 | path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) 9 | 10 | if not path in sys.path: 11 | sys.path.insert(1, path) 12 | del path 13 | 14 | try: 15 | import database.mongo as mongo 16 | except ImportError as exc: 17 | print("Error: failed to import settings module ({})".format(exc)) 18 | 19 | try: 20 | import nltk 21 | except ImportError as exc: 22 | print("Error: failed to import settings module ({})".format(exc)) 23 | 24 | try: 25 | from bs4 import BeautifulSoup 26 | except ImportError as exc: 27 | print("Error: failed to import settings module ({})".format(exc)) 28 | 29 | from sklearn.feature_extraction.text import CountVectorizer 30 | from sklearn.feature_extraction.text import TfidfVectorizer 31 | from time import time 32 | import numpy as np 33 | import pickle 34 | 35 | stopword_data = "data/stopword.txt" 36 | replaceword_data = "data/replaceword.txt" 37 | test_data = "data/processed.csv" 38 | takeword_data = "data/take_word.txt" 39 | 40 | def get_codewords(): 41 | #this function is meant prints all the code segments 42 | db = mongo.connect() 43 | code_word = {} 44 | for post in db.find(): 45 | code_temp = post['code'].split() 46 | for i in code_temp: 47 | if i not in code_word: 48 | try: 49 | print i 50 | except UnicodeEncodeError as e: 51 | pass 52 | code_word[i] = 1 53 | print "\n" 54 | 55 | def get_bodywords(): 56 | #this function is meant to print the unique words with their frequency so that some potential stopwords can be removed 57 | porter_stemmer = 
nltk.stem.porter.PorterStemmer() 58 | wordnet_lemmatizer = nltk.stem.WordNetLemmatizer() 59 | nltk_stopwords = nltk.corpus.stopwords.words('english') 60 | stopwords = {} 61 | replace_words = {} 62 | stopword_count = 0 63 | takenword_count = 0 64 | 65 | with open(stopword_data) as infile: 66 | for line in infile: 67 | i = line.strip().split() 68 | for token in i: 69 | a = wordnet_lemmatizer.lemmatize(porter_stemmer.stem(token)) 70 | if a not in stopwords: 71 | stopwords[a] = 1 72 | 73 | for token in nltk_stopwords: 74 | a = wordnet_lemmatizer.lemmatize(porter_stemmer.stem(token)) 75 | if a not in stopwords: 76 | stopwords[a] = 1 77 | 78 | for a in string.punctuation: 79 | if a not in replace_words: 80 | replace_words[a] = 1 81 | 82 | with open(replaceword_data) as infile: 83 | for line in infile: 84 | a = line.strip() 85 | if a not in replace_words: 86 | replace_words[a] = 1 87 | 88 | db= mongo.connect() 89 | word = {} 90 | 91 | for post in db.find(): 92 | body = post['body'].strip() 93 | for i in replace_words: 94 | body = body.replace(i, '') 95 | list_token = nltk.word_tokenize(body) 96 | for token in list_token: 97 | # print token 98 | processed_token = wordnet_lemmatizer.lemmatize(porter_stemmer.stem(token.strip().lower())) 99 | if processed_token not in stopwords: 100 | if processed_token not in word: 101 | word[processed_token]=1 102 | # print processed_token 103 | else: 104 | word[processed_token]=1 105 | sorted_word = sorted(word.items(), key=operator.itemgetter(1), reverse = True) 106 | #print sorted_word 107 | for i in sorted_word: 108 | try: 109 | print i[0], " : ",i[1] 110 | except UnicodeEncodeError as e: 111 | print "Unicode Error : ", i[1] 112 | 113 | def get_idf(): 114 | #this function is meant to print the unique words with their frequency so that some potential stopwords can be removed 115 | 116 | porter_stemmer = nltk.stem.porter.PorterStemmer() 117 | wordnet_lemmatizer = nltk.stem.WordNetLemmatizer() 118 | nltk_stopwords = nltk.corpus.stopwords.words('english') 119 | stopwords = {} 120 | replace_words = {} 121 | stopword_count = 0 122 | takenword_count = 0 123 | 124 | with open(stopword_data) as infile: 125 | for line in infile: 126 | i = line.strip().split() 127 | for token in i: 128 | a = wordnet_lemmatizer.lemmatize(porter_stemmer.stem(token)) 129 | if a not in stopwords: 130 | stopwords[a] = 1 131 | 132 | for token in nltk_stopwords: 133 | a = wordnet_lemmatizer.lemmatize(porter_stemmer.stem(token)) 134 | if a not in stopwords: 135 | stopwords[a] = 1 136 | 137 | for a in string.punctuation: 138 | if a not in replace_words: 139 | replace_words[a] = 1 140 | 141 | with open(replaceword_data) as infile: 142 | 143 | for line in infile: 144 | a = line.strip() 145 | if a not in replace_words: 146 | replace_words[a] = 1 147 | 148 | db= mongo.connect() 149 | word = {} 150 | idf = {} 151 | flag = {} 152 | 153 | for post in db.find(): 154 | body = post['body'].strip() 155 | flag = {} 156 | for i in replace_words: 157 | body = body.replace(i, '') 158 | list_token = nltk.word_tokenize(body) 159 | for token in list_token: 160 | # print token 161 | processed_token = wordnet_lemmatizer.lemmatize(porter_stemmer.stem(token.strip().lower())) 162 | if processed_token not in stopwords and not (processed_token.isdigit()): 163 | if processed_token not in word: 164 | word[processed_token]=1 165 | idf[processed_token] = 1 166 | flag[processed_token] = 1 167 | # print processed_token 168 | else: 169 | word[processed_token]=1 170 | if processed_token not in flag: 171 | flag[processed_token] = 
1 172 | idf[processed_token]=1 173 | 174 | for i in idf: 175 | if idf[i] > 7: 176 | try: 177 | print i 178 | except UnicodeEncodeError as e: 179 | pass 180 | 181 | # sorted_idf = sorted(idf.items(), key=operator.itemgetter(1), reverse = True) 182 | # for i in sorted_idf: 183 | # try: 184 | # print i[0], " : ",i[1] 185 | # except UnicodeEncodeError as e: 186 | # print "Unicode Error : ", i[1] 187 | 188 | def get_trainingdata(input_size = 100000, select_transform = 1, read_database = 1, to_print = 0, mode = "multiclass", repeat = 0, max_number_of_tags = 5): 189 | ''' 190 | generate training data 191 | if read_database == 0: 192 | All other options are ignored 193 | if mode == multilabel 194 | repeat option is ignored 195 | 196 | ''' 197 | 198 | fname_feature = "trainfeaturematrix.csv" 199 | fname_result = "trainresultmatrix.csv" 200 | fname_result_pickle = "trainresultmatrix" 201 | 202 | if read_database == 0: 203 | t0 = time() 204 | a = np.loadtxt(fname_feature, delimiter = ",") 205 | print("Loaded feature matrix for training from File in %fs" % (time() - t0)) 206 | # print "input_size = ", input_size 207 | # print a.size 208 | trainingdata_features = a.reshape(input_size, a.size/input_size) 209 | 210 | t0 = time() 211 | print("Loaded result matrix for training from File in %fs" % (time() - t0)) 212 | #print "input_size = ", input_size 213 | #print a.size 214 | with open(fname_python, 'rb') as f: 215 | trainingdata_result = pickle.load(f) 216 | #train = a.reshape(input_size, a.size/input_size) 217 | 218 | return trainingdata_features, trainingdata_result 219 | 220 | else: 221 | 222 | porter_stemmer = nltk.stem.porter.PorterStemmer() 223 | wordnet_lemmatizer = nltk.stem.WordNetLemmatizer() 224 | nltk_stopwords = nltk.corpus.stopwords.words('english') 225 | 226 | take_words = {} 227 | replace_words = {} 228 | stopwords = {} 229 | 230 | with open(stopword_data) as infile: 231 | for line in infile: 232 | i = line.strip().split() 233 | for token in i: 234 | a = wordnet_lemmatizer.lemmatize(porter_stemmer.stem(token)) 235 | if a not in stopwords: 236 | stopwords[a] = 1 237 | 238 | for a in string.punctuation: 239 | if a not in replace_words: 240 | replace_words[a] = 1 241 | 242 | with open(replaceword_data) as infile: 243 | for line in infile: 244 | a = line.strip() 245 | if a not in replace_words: 246 | replace_words[a] = 1 247 | 248 | db = mongo.connect() 249 | corpus = [] 250 | tag_set = set() 251 | question_tag = {} 252 | question_count = 0 253 | # counter = 0 254 | trainingdata_result = [] 255 | 256 | if mode == "multilabel": 257 | 258 | for post in list(db.find().skip(1).limit(input_size*(max_number_of_tags+1 ))): 259 | 260 | #not fool proof 261 | if(len(post['tag']) <= max_number_of_tags): 262 | question_tag[question_count] = [] 263 | 264 | trainingdata_result.append(post['tag']) 265 | 266 | body = post['body'].strip() 267 | for i in replace_words: 268 | body = body.replace(i, '') 269 | list_token = nltk.word_tokenize(body) 270 | processed_body = "" 271 | for token in list_token: 272 | processed_token = wordnet_lemmatizer.lemmatize(porter_stemmer.stem(token.strip().lower())) 273 | if processed_token not in stopwords and not (processed_token.isdigit()): 274 | processed_body+=processed_token+" " 275 | corpus.append(processed_body.strip()) 276 | 277 | for i in post['tag']: 278 | question_tag[question_count].append(i) 279 | tag_set.add(i) 280 | question_count+=1 281 | 282 | if(question_count>=input_size): 283 | break 284 | 285 | 286 | #entire point of writing to csv is to use the data with 
matlab 287 | sorted_taglist = sorted(tag_set) 288 | tag_dict = {} 289 | tag_count = 0 290 | # print "size of set" 291 | # print len(tag_set) 292 | for i in sorted_taglist: 293 | # print i 294 | tag_dict[i] = tag_count 295 | tag_count+=1 296 | # print "number of unique tags = "+str(tag_count) 297 | train_matrix = np.zeros((input_size, tag_count), dtype = np.int) 298 | for i in question_tag: 299 | for j in question_tag[i]: 300 | train_matrix[i][tag_dict[j]]=1 301 | with open(fname_result_pickle, 'wb') as f: 302 | pickle.dump(trainingdata_result, f) 303 | 304 | if(select_transform == 1): 305 | transform = CountVectorizer(min_df=1) 306 | elif(select_transform == 2): 307 | transform = TfidfVectorizer(min_df=1) 308 | a = transform.fit_transform(corpus) 309 | # print transform.get_feature_names() 310 | trainingdata_features = a.toarray() 311 | # print trainingdata_features 312 | np.savetxt(fname_feature, trainingdata_features, delimiter=",") 313 | 314 | elif mode == "multiclass": 315 | if repeat == 0: 316 | for post in list(db.find().skip(1).limit(input_size*(max_number_of_tags+1))): 317 | #not fool proof 318 | 319 | if(len(post['tag']) <= max_number_of_tags): 320 | # print len(post['tag']) 321 | body = post['body'].strip() 322 | for i in replace_words: 323 | body = body.replace(i, '') 324 | list_token = nltk.word_tokenize(body) 325 | processed_body = "" 326 | for token in list_token: 327 | processed_token = wordnet_lemmatizer.lemmatize(porter_stemmer.stem(token.strip().lower())) 328 | if processed_token not in stopwords and not (processed_token.isdigit()): 329 | processed_body+=processed_token+" " 330 | 331 | for i in post['tag']: 332 | # print i 333 | question_tag[question_count] = [] 334 | question_tag[question_count].append(i) 335 | #can be done in a single step - then do it 336 | tag_set.add(i) 337 | question_count+=1 338 | corpus.append(processed_body.strip()) 339 | trainingdata_result.append([i]) 340 | if(question_count >= input_size): 341 | break 342 | 343 | if(question_count >= input_size): 344 | break 345 | 346 | # print corpus 347 | # print 348 | #entire point of writing to csv is to use the data with matlab 349 | sorted_taglist = sorted(tag_set) 350 | tag_dict = {} 351 | tag_count = 0 352 | # print "size of set" 353 | # print len(tag_set) 354 | for i in sorted_taglist: 355 | # print i 356 | tag_dict[i] = tag_count 357 | tag_count+=1 358 | print "number of unique tags = "+str(tag_count) 359 | train_matrix = np.zeros((input_size, tag_count), dtype = np.int) 360 | for i in question_tag: 361 | for j in question_tag[i]: 362 | train_matrix[i][tag_dict[j]]=1 363 | with open(fname_result_pickle, 'wb') as f: 364 | pickle.dump(trainingdata_result, f) 365 | 366 | if(select_transform == 1): 367 | transform = CountVectorizer(min_df=1) 368 | elif(select_transform == 2): 369 | transform = TfidfVectorizer(min_df=1) 370 | a = transform.fit_transform(corpus) 371 | # print transform.get_feature_names() 372 | trainingdata_features = a.toarray() 373 | # print trainingdata_features 374 | np.savetxt(fname_feature, trainingdata_features, delimiter=",") 375 | 376 | 377 | if to_print == 1: 378 | for i in trainingdata_features: 379 | to_print = "" 380 | for j in i: 381 | to_print+=str(j)+", " 382 | to_print = to_print[:-2] 383 | print to_print 384 | 385 | return trainingdata_features, trainingdata_result 386 | 387 | if __name__ == "__main__": 388 | # get_trainmatrix(input_size = 10000) 389 | get_featurematrix(200, select_transform = 2, read_database = 1) 390 | get_trainmatrix(200, read_database = 1) 391 | 392 | 
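Taken together, the app scripts form a pipeline: app/generator.py (run as `python app/generator.py` from the repository root) filters the raw CSV down to posts whose tags are all among the most frequent ones and loads them into MongoDB; app/stat.py turns those posts into a count or TF-IDF feature matrix plus per-post tag lists; and app/classifier.py reduces the features with a truncated SVD and trains and evaluates the SVM. Once the database is populated, training can be driven from the repository root with a call like the sketch below (argument values are illustrative, mirroring the script's own __main__; assumes MongoDB is running and config/config.cfg is filled in).

from app import classifier

# TF-IDF features (select_transform=2), one tag per post (mode="multiclass"),
# 80/20 train/test split (k=0.8), LinearSVC with the Crammer-Singer strategy;
# use_cache=0 computes the SVD instead of loading a cached copy.
classifier.predict(10000, select_transform=2, read_database=1,
                   model="LinearSVC", mode="multiclass", k=0.8,
                   max_number_of_tags=2, max_iter=10000, use_cache=0)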
--------------------------------------------------------------------------------
/config/config.cfg.sample:
--------------------------------------------------------------------------------
1 | [tagger]
2 | host =
3 | port =
4 | db_name =
5 | collection_name =
6 | 
--------------------------------------------------------------------------------
/database/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shagunsodhani/StackExchange-tagger/8fac6a40f3de416776f236dfa4cf8e3dbd64cf5b/database/__init__.py
--------------------------------------------------------------------------------
/database/mongo.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | #---------------------------------------------------------Import Modules----------------------------------------------------------------------#
3 | 
4 | import os
5 | from ConfigParser import ConfigParser
6 | 
7 | try:
8 |     import pymongo
9 | except ImportError as exc:
10 |     print("Error: failed to import the pymongo module ({})".format(exc))
11 | 
12 | def connect(app_name = "tagger", config_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), '../config', 'config.cfg')):
13 | 
14 |     '''Open a connection to MongoDB and return the collection object to run queries on'''
15 |     config = ConfigParser()
16 |     config.read(config_path)
17 |     host = config.get(app_name, "host")
18 |     port = config.get(app_name, "port")
19 |     db_name = config.get(app_name, "db_name")
20 |     collection_name = config.get(app_name, "collection_name")
21 |     try:
22 |         client = pymongo.MongoClient(host, int(port))
23 |         db = client[db_name]
24 |         return db[collection_name]
25 |     except pymongo.errors.PyMongoError as e:
26 |         print "ERROR IN CONNECTION: %s" % e
27 |         return 0
28 | 
--------------------------------------------------------------------------------
/install.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 7F0CEB10
4 | echo "deb http://repo.mongodb.org/apt/ubuntu "$(lsb_release -sc)"/mongodb-org/3.0 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-3.0.list
5 | apt-get update
6 | # beautifulsoup4 provides the bs4 package imported by the app scripts
7 | pip install pymongo numpy nltk beautifulsoup4 cython sparsesvd scikit-learn
8 | apt-get install -y mongodb-org python-scipy
9 | service mongod start
10 | # Download the NLTK datasets used by the app (stopwords, punkt tokenizer, WordNet)
11 | python -m nltk.downloader stopwords punkt wordnet
--------------------------------------------------------------------------------
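database/mongo.py reads config/config.cfg with ConfigParser and looks the connection details up in the section named by app_name, which defaults to "tagger" (the app scripts call mongo.connect() with no arguments). After running install.sh, copy config/config.cfg.sample to config/config.cfg and fill it in; for a local MongoDB instance it would look something like the sketch below, where db_name and collection_name are arbitrary placeholder names, and 27017 is MongoDB's default port.

[tagger]
host = localhost
port = 27017
db_name = stackexchange
collection_name = posts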