├── .gitignore
├── TISC.py
├── README.md
├── ccrf_module2.py
└── generate_ccrf_feature_fast_module_pin_info_follower.py
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
pin/
--------------------------------------------------------------------------------
/TISC.py:
--------------------------------------------------------------------------------
import pickle

import generate_ccrf_feature_fast_module_pin_info_follower as gen_ccrf
import ccrf_module2 as ccrf

# Load the training and test Pinterest graphs (open pickles in binary mode).
test_graph = pickle.load(open('graph/pinterest_test_graph.pickle', 'rb'))
test_u_graph = test_graph.to_undirected()
training_graph = pickle.load(open('graph/pinterest_training_graph.pickle', 'rb'))
training_u_graph = training_graph.to_undirected()

query = 'design'

# Generate node and edge feature files from the training graph.
feature_filename, edge_filename, edge_filename2, key_filename, regression_file_name = \
    gen_ccrf.generateCCRFFeature(training_graph, training_u_graph, "train", query)

# Learn the CCRF parameters on the training features.
alpha, beta, beta2 = ccrf.learning(feature_filename, edge_filename, edge_filename2)
print(alpha)
print(beta)
print(beta2)

# Generate feature files from the test graph and predict influence scores.
feature_filename, edge_filename, edge_filename2, key_filename, regression_file_name = \
    gen_ccrf.generateCCRFFeature(test_graph, test_u_graph, "test", query)
score_dict = ccrf.prediction_dict(alpha, beta, beta2, feature_filename, edge_filename, edge_filename2, key_filename)

with open("prediction_result.txt", 'w') as f:
    for user in score_dict:
        f.write(user + "\t" + str(score_dict[user]) + "\n")
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Topical-Influence
=========================

## Overview
Social curation services are emerging social media platforms that enable users to curate their content by topic and to express their interests at the topic level by following curated collections of other users' content rather than the users themselves. The topic-level information revealed through this new feature far exceeds what existing methods can solicit from traditional social networking services, and it greatly enhances the quality of topic-sensitive influence modeling. In this paper, we propose a novel model, called topical influence with social curation (TISC), to find influential users in social curation services. This model, formulated as a continuous conditional random field (CCRF), takes full advantage of the explicitly available topic-level information reflected in both contents and interactions. To validate its merits, we comprehensively compare TISC with state-of-the-art models using two real-world data sets collected from Pinterest and Scoop.it. The results show that TISC achieves up to around 80% higher accuracy and finds more convincing results in case studies than the other models. Moreover, we develop a distributed learning algorithm on Spark and demonstrate its excellent scalability on a cluster of 48 cores.
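
## Model
TISC models the real-valued influence scores **y** of all users jointly. As a rough sketch (the exact TISC potentials are defined in the paper), `ccrf_module2.py` learns the weights of a density of the standard continuous-CRF form, with per-user feature potentials weighted by `alpha` and two pairwise relation potentials weighted by `beta` and `beta2`:

```latex
P(\mathbf{y} \mid X) \;=\; \frac{1}{Z(X)} \exp\!\Big(
    -\sum_{i}\sum_{k} \alpha_k \,(y_i - x_{i,k})^2
    \;-\; \sum_{l=1}^{2} \beta_l \sum_{i,j} \tfrac{1}{2}\, S^{(l)}_{i,j} \,(y_i - y_j)^2
\Big)
```

Here $x_{i,k}$ is the $k$-th node feature of user $i$, and $S^{(1)}, S^{(2)}$ are the two edge-weight matrices built from the edge feature files (the code's `R` and `R2`). Because the density is Gaussian in **y**, prediction reduces to a closed-form expression in the learned parameters.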

## Data Sets
[Download](http://dmserver6.kaist.ac.kr/TISC/TISC_graph.zip)
--------------------------------------------------------------------------------
/ccrf_module2.py:
--------------------------------------------------------------------------------
import math

import numpy as np
from scipy import stats
from scipy.sparse import csr_matrix


def _load_relation_matrix(path):
    """Load a tab-separated (row, col, weight) edge list and return the dense
    transpose of the corresponding sparse matrix."""
    relation_sparse = np.loadtxt(fname=path, delimiter='\t')
    row = relation_sparse[:, 0].astype(int)
    col = relation_sparse[:, 1].astype(int)
    weight = relation_sparse[:, 2]
    R = csr_matrix((weight, (row, col))).todense()
    return np.transpose(R)


def learning(feature_filename, edge_filename, edge_filename2):
    # Column 0 of the feature file is the ground-truth score y; the remaining
    # columns are the node features X. Everything is z-scored.
    data = np.loadtxt(fname=feature_filename, delimiter='\t')
    data = stats.zscore(data)

    # Two relation (edge-weight) matrices, one per edge feature type.
    R = _load_relation_matrix(edge_filename)
    R2 = _load_relation_matrix(edge_filename2)

    y = np.transpose(np.matrix(data[:, 0]))
    X = np.matrix(data[:, 1:])

    num_data, num_x_feature = X.shape

    alpha = np.matrix(np.ones((num_x_feature, 1)))
    alpha_new = np.matrix(alpha)
    delta_log_alpha = np.matrix(np.zeros((num_x_feature, 1)))

    beta = 0.1
    beta_new = beta
    beta2 = 0.1
    beta2_new = beta2

    n = num_data

    # Row- and column-degree matrices of the two relation graphs.
    D_r_1 = np.matrix(np.diag(np.squeeze(np.asarray(R.sum(axis=1)))))
    D_c_1 = np.matrix(np.diag(np.squeeze(np.asarray(R.sum(axis=0)))))
    D_r_2 = np.matrix(np.diag(np.squeeze(np.asarray(R2.sum(axis=1)))))
    D_c_2 = np.matrix(np.diag(np.squeeze(np.asarray(R2.sum(axis=0)))))

    iterations = 100
    learning_rate = 0.000001
    precision = 0.0017

    for i in range(iterations):
        a = np.transpose(alpha) * np.matrix(np.ones((alpha.shape[0], 1)))
        b = 2*X*alpha + beta*(D_r_1 - D_c_1) * np.matrix(np.ones((D_r_1.shape[0], 1))) \
            + beta2*(D_r_2 - D_c_2) * np.matrix(np.ones((D_r_2.shape[0], 1)))
        b_t = np.transpose(b)

        # Gradient ascent in log-space keeps every alpha_k positive.
        for k in range(num_x_feature):
            x_k = X[:, k]
            alpha_k = alpha[k]
            log_alpha_k = math.log(alpha_k)

            delta_log_alpha_k = alpha_k * (n/(2*a) + (1/(4*a**2))*b_t*b - (1/(2*a))*b_t*x_k
                                           + np.transpose(x_k)*x_k - np.transpose(y - x_k)*(y - x_k))
            delta_log_alpha[k] = delta_log_alpha_k
            log_alpha_k = log_alpha_k + learning_rate*delta_log_alpha_k
            alpha_new[k] = math.exp(log_alpha_k)

        delta_beta = -1/(2*a)*b_t*(D_r_1 - D_c_1)*np.matrix(np.ones((D_r_1.shape[0], 1))) \
            + np.transpose(D_r_1*y)*np.matrix(np.ones((y.shape[0], 1))) \
            - np.transpose(D_c_1*y)*np.matrix(np.ones((y.shape[0], 1)))
        beta_new = beta + learning_rate*delta_beta

        delta_beta2 = -1/(2*a)*b_t*(D_r_2 - D_c_2)*np.matrix(np.ones((D_r_2.shape[0], 1))) \
            + np.transpose(D_r_2*y)*np.matrix(np.ones((y.shape[0], 1))) \
            - np.transpose(D_c_2*y)*np.matrix(np.ones((y.shape[0], 1)))
        beta2_new = beta2 + learning_rate*delta_beta2

        # Relative change of the full parameter vector; stop when either the
        # update or the gradient falls below the precision threshold.
        delta_alpha_beta = np.linalg.norm(np.concatenate((alpha_new, beta_new, beta2_new))
                                          - np.concatenate((alpha, [[beta], [beta2]]))) \
            / np.linalg.norm(np.concatenate((alpha, [[beta], [beta2]])))

        if math.isinf(delta_alpha_beta) or delta_alpha_beta < precision \
                or np.linalg.norm(np.concatenate((delta_log_alpha, delta_beta, delta_beta2))) < precision:
            break

        alpha = alpha_new.copy()
        beta = beta_new[0, 0]
        beta2 = beta2_new[0, 0]

    return alpha, beta, beta2

def prediction(alpha, beta1, beta2, feature_filename, edge_filename, edge_filename2):
    # Column 0 (the ground-truth score) is ignored at prediction time.
    data = np.loadtxt(fname=feature_filename, delimiter='\t')
    data = stats.zscore(data)

    R = _load_relation_matrix(edge_filename)
    R2 = _load_relation_matrix(edge_filename2)

    X = np.matrix(data[:, 1:])

    D_r_1 = np.matrix(np.diag(np.squeeze(np.asarray(R.sum(axis=1)))))
    D_c_1 = np.matrix(np.diag(np.squeeze(np.asarray(R.sum(axis=0)))))
    D_r_2 = np.matrix(np.diag(np.squeeze(np.asarray(R2.sum(axis=1)))))
    D_c_2 = np.matrix(np.diag(np.squeeze(np.asarray(R2.sum(axis=0)))))

    # Point prediction: closed-form estimate of y from the learned parameters.
    y_prec = (1.0 / (np.transpose(alpha) * np.matrix(np.ones((alpha.shape[0], 1)))))[0, 0] \
        * (2*X*alpha + beta1*(D_r_1 - D_c_1) * np.matrix(np.ones((D_r_1.shape[0], 1)))
           + beta2*(D_r_2 - D_c_2) * np.matrix(np.ones((D_r_2.shape[0], 1))))

    return np.array(y_prec)[:, 0]


def prediction_dict(alpha, beta, beta2, feature_filename, edge_filename, edge_filename2, key_filename):
    y_prec = prediction(alpha, beta, beta2, feature_filename, edge_filename, edge_filename2)

    # The key file maps integer node keys back to usernames.
    y_prec_dict = dict()
    for line in open(key_filename):
        line_info = line.split("\t")
        key = int(line_info[0])
        username = line_info[1].split("\n")[0]
        y_prec_dict[username] = y_prec[key]

    fp = open("ccrf_result.txt", "w")
    for user in y_prec_dict:
        fp.write(user + "\t" + str(y_prec_dict[user]) + "\n")
    fp.close()

    return y_prec_dict


def ccrf(feature_filename, edge_filename, edge_filename2, key_filename):
    # Convenience wrapper: learn the parameters, then predict.
    alpha, beta, beta2 = learning(feature_filename, edge_filename, edge_filename2)
    y_prec = prediction(alpha, beta, beta2, feature_filename, edge_filename, edge_filename2)

    y_prec_dict = dict()
    for line in open(key_filename):
        line_info = line.split("\t")
        key = int(line_info[0])
        username = line_info[1].split("\n")[0]
        y_prec_dict[username] = y_prec[key]

    return y_prec_dict
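
if __name__ == "__main__":
    # Minimal smoke test (not part of the original pipeline): builds tiny
    # synthetic tab-separated feature/edge files in a temp directory and runs
    # learning() and prediction() end to end. All values are arbitrary.
    import os
    import tempfile

    rng = np.random.RandomState(0)
    n = 20
    # Column 0 plays the role of the ground-truth score; columns 1..3 are
    # node features.
    feats = rng.rand(n, 4)
    # A simple ring graph so every node index appears as both row and column.
    edges = np.array([[i, (i + 1) % n, 1.0] for i in range(n)])

    tmp = tempfile.mkdtemp()
    f_feat = os.path.join(tmp, "features.txt")
    f_edge1 = os.path.join(tmp, "edges1.txt")
    f_edge2 = os.path.join(tmp, "edges2.txt")
    np.savetxt(f_feat, feats, delimiter="\t")
    np.savetxt(f_edge1, edges, delimiter="\t")
    np.savetxt(f_edge2, edges, delimiter="\t")

    alpha, beta, beta2 = learning(f_feat, f_edge1, f_edge2)
    print(prediction(alpha, beta, beta2, f_feat, f_edge1, f_edge2)[:5])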
--------------------------------------------------------------------------------
/generate_ccrf_feature_fast_module_pin_info_follower.py:
--------------------------------------------------------------------------------
import MySQLdb
import numpy as np
from scipy import stats

SELECT_TRAINING_USER_LIST_SQL = "SELECT * FROM userList WHERE state=2"
SELECT_BOARD_BY_USER_SQL = "SELECT * FROM board WHERE user_id = %s"
SELECT_BOARD_INFO_SQL = "SELECT * FROM board WHERE board_id = %s"
SELECT_FOLLOWING_BOARD_SQL = "SELECT * FROM following_board WHERE user_id = %s and board_id = %s"
SELECT_FOLLOWING_USER_SQL = "SELECT * FROM following_user WHERE user_id = %s and following_id = %s"
SELECT_COUNT_PIN_SQL = "SELECT count(*) FROM pin WHERE board_href = %s"
SELECT_USER_LIST_SQL = "SELECT * FROM userInfo ORDER BY follower_cnt DESC LIMIT 5000"
SELECT_GROUND_TRUTH_INFO_SQL = "SELECT * FROM ground_truth_info WHERE username = %s"


def connectDB(db_name):
    db = None
    cursor = None

    if db_name == 'pin':
        db = MySQLdb.connect(host='dmserver1.kaist.ac.kr', user='daehoon', passwd='rlaeogns',
                             db='pinterest_design_pin', charset='utf8', use_unicode=True)
        cursor = db.cursor()
        cursor.execute("set names utf8")
    elif db_name == 'all':
        db = MySQLdb.connect(host='dmserver1.kaist.ac.kr', user='daehoon', passwd='rlaeogns',
                             db='pinterest_design', charset='utf8', use_unicode=True)
        cursor = db.cursor()
        cursor.execute("set names utf8")

    return db, cursor


def closeDB(db, cur):
    cur.close()
    db.close()


def selectDB(db, cursor, SqlQuery, params=()):
    # Parameterized query; params must be a tuple.
    cursor.execute(SqlQuery, params)
    return cursor.fetchall()


def getUserOfBoard(u_graph, board_href):
    # The owner of a board is its neighbor connected by a 'curated' edge.
    neighbors = u_graph[board_href]
    for neighbor in neighbors:
        if neighbors[neighbor]['type'] == 'curated':
            return neighbor


def getBoardList(u_graph, username):
    board_list = list()
    neighbors = u_graph[username]
    for neighbor in neighbors:
        if neighbors[neighbor]['type'] == 'curated':
            board_list.append(neighbor)
    return board_list


def getBoardFollowerList(u_graph, board_id):
    follower_list = list()
    neighbors = u_graph[board_id]
    for neighbor in neighbors:
        if neighbors[neighbor]['type'] == 'following':
            follower_list.append(neighbor)
    return follower_list


def reverseScoreFeature(feature):
    # Map a score to its reciprocal, treating NaN as 0 and 0 as 1.
    if np.isnan(feature):
        feature = 0.0
    elif feature == 0.0:
        feature = 1.0
    else:
        feature = 1.0 / feature
    return feature


def isFollowingBoard(graph, user_id, board_id):
    if board_id in graph[user_id]:
        if graph[user_id][board_id]['type'] == 'following':
            return True
    return False


def getGroundTruthFeatureByFile(filepath):
    # Read per-user ground-truth features from a tab-separated file,
    # skipping the header row.
    user_ground_truth_feature = dict()
    f = open(filepath)
    idx = 0
    for line in f:
        if idx > 0:
            feature_info = line.split('\t')
            username = feature_info[0]
            user_ground_truth_feature[username] = dict()
            user_ground_truth_feature[username]['like'] = int(feature_info[1])
            user_ground_truth_feature[username]['repin'] = int(feature_info[2])
            user_ground_truth_feature[username]['comment'] = int(feature_info[3])
            user_ground_truth_feature[username]['word'] = int(feature_info[4])
            user_ground_truth_feature[username]['query_follower'] = int(feature_info[5])
            user_ground_truth_feature[username]['query_follower_weight'] = int(feature_info[6])
        idx += 1

    return user_ground_truth_feature
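
# For reference, the ground-truth feature file parsed above is expected to be
# tab-separated with one header row. The layout below is illustrative only
# (column order inferred from the parser; the values are made up):
#
#   username   like  repin  comment  word  query_follower  query_follower_weight
#   some_user  10    42     3        120   7               5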
def getGroundTruthFeatureInDB(db, cur, user_list):
    # Same features as getGroundTruthFeatureByFile, but read from the
    # ground_truth_info table; users without a row get all-zero features.
    user_ground_truth_feature = dict()
    for username in user_list:
        result = selectDB(db, cur, SELECT_GROUND_TRUTH_INFO_SQL, params=(username,))

        if len(result) > 0:
            ground_truth_info = result[0]
            comment_cnt = ground_truth_info[2]
            word_cnt = ground_truth_info[3]
            like_cnt = ground_truth_info[4]
            repin_cnt = ground_truth_info[5]
            query_follower = ground_truth_info[6]
            query_follower_weight = ground_truth_info[7]
        else:
            comment_cnt = 0
            word_cnt = 0
            like_cnt = 0
            repin_cnt = 0
            query_follower = 0
            query_follower_weight = 0

        user_ground_truth_feature[username] = dict()
        user_ground_truth_feature[username]['like'] = like_cnt
        user_ground_truth_feature[username]['repin'] = repin_cnt
        user_ground_truth_feature[username]['comment'] = comment_cnt
        user_ground_truth_feature[username]['word'] = word_cnt
        user_ground_truth_feature[username]['query_follower'] = query_follower
        user_ground_truth_feature[username]['query_follower_weight'] = query_follower_weight

    return user_ground_truth_feature


def convertZscore(score_dict):
    # Convert each ground-truth feature to its z-score across all users.
    repin_score = list()
    word_score = list()
    like_score = list()
    comment_score = list()
    query_follower_score = list()
    query_follower_weight_score = list()

    user_key_list = list()
    feature_zscore = dict()

    for key in score_dict:
        repin_score.append(score_dict[key]['repin'])
        word_score.append(score_dict[key]['word'])
        like_score.append(score_dict[key]['like'])
        comment_score.append(score_dict[key]['comment'])
        query_follower_score.append(score_dict[key]['query_follower'])
        query_follower_weight_score.append(score_dict[key]['query_follower_weight'])
        user_key_list.append(key)

    repin_zscore = stats.zscore(np.array(repin_score))
    word_zscore = stats.zscore(np.array(word_score))
    like_zscore = stats.zscore(np.array(like_score))
    comment_zscore = stats.zscore(np.array(comment_score))
    query_follower_zscore = stats.zscore(np.array(query_follower_score))
    query_follower_weight_zscore = stats.zscore(np.array(query_follower_weight_score))

    for idx, user in enumerate(user_key_list):
        feature_zscore[user] = dict()
        feature_zscore[user]['repin'] = repin_zscore[idx]
        feature_zscore[user]['word'] = word_zscore[idx]
        feature_zscore[user]['like'] = like_zscore[idx]
        feature_zscore[user]['comment'] = comment_zscore[idx]
        feature_zscore[user]['query_follower'] = query_follower_zscore[idx]
        feature_zscore[user]['query_follower_weight'] = query_follower_weight_zscore[idx]

    return feature_zscore

def convertDictToZscore(data_dict):
    # z-score a {key: score} dict, preserving the keys.
    z_score_dict = dict()
    score_list = list()
    key_list = list()
    for key in data_dict.keys():
        score_list.append(data_dict[key])
        key_list.append(key)

    z_score = stats.zscore(np.array(score_list))

    for idx, key in enumerate(key_list):
        z_score_dict[key] = z_score[idx]

    return z_score_dict


def calculateGroundTruth(feature_list):
    # Ground truth is the sum of the (z-scored) component features.
    # Alternatives kept from earlier experiments:
    #   product: result = 1.0; for feature in feature_list: result *= feature
    #   mean:    result = np.mean(feature_list)
    result = 0.0
    for feature in feature_list:
        result += feature
    return result


def getNumOfPinGraph(u_graph, board_id):
    # The pin count is stored as a node attribute; boards without the
    # attribute count as 0. (An older version counted 'curated-pin' neighbors
    # of the board instead.)
    pin_num = 0
    if 'pin_num' in u_graph.nodes[board_id]:
        pin_num = u_graph.nodes[board_id]['pin_num']
    return pin_num


############### MAIN LOGIC ###############

def generateCCRFFeature(graph, u_graph, file_postfix, query):
    # Collect the target users: all nodes of type 'user'.
    user_list = list()
    for node in graph.nodes():
        if graph.nodes[node]['type'] == 'user':
            user_list.append(node)

    print("User Cnt : " + str(len(user_list)))

    pin_db, pin_cur = connectDB('pin')
    all_db, all_cur = connectDB('all')

    user_ground_truth_feature = getGroundTruthFeatureInDB(pin_db, pin_cur, user_list)
    #user_ground_truth_feature = getGroundTruthFeatureByFile('ground_truth_feature_training_' + file_postfix + '.txt')
    user_ground_truth_feature_z_score = convertZscore(user_ground_truth_feature)

    feature_file_name = 'pinterest_train_' + file_postfix + '.txt'
    regression_file_name = 'regression_train_' + file_postfix + '.txt'

    # fp and fp3 are informational dumps; fp2 and fp_regression are the
    # feature files returned to the caller.
    fp = open('user_feature_train_' + file_postfix + '.txt', 'w')
    fp3 = open('all_feature_train_' + file_postfix + '.txt', 'w')
    fp2 = open(feature_file_name, 'w')
    fp_regression = open(regression_file_name, 'w')

    user_feature = dict()

    print("Start generating features")

    user_ground_truth_feature_follower = dict()

    # Pass 1: aggregate per-user pin and follower counts over their boards.
    for idx, username in enumerate(user_list):
        if username not in user_feature:
            user_feature[username] = dict()
            user_feature[username]['pin'] = list()
            user_feature[username]['follower'] = list()
            user_feature[username]['all_pin'] = list()
            user_feature[username]['unique_follower'] = set()
            user_feature[username]['all_follower'] = list()

        board_list = getBoardList(u_graph, username)

        for board in board_list:
            board_href = board
            board_category = graph.nodes[board]['category']
            pin_num = getNumOfPinGraph(u_graph, board_href)
            follower_list = getBoardFollowerList(u_graph, board_href)
            follower_num = len(follower_list)
            if board_category == query:
                user_feature[username]['pin'].append(pin_num)
                user_feature[username]['follower'].append(follower_num)
                for follower in follower_list:
                    user_feature[username]['unique_follower'].add(follower)

            user_feature[username]['all_pin'].append(pin_num)
            user_feature[username]['all_follower'].append(follower_num)

        follower_feature = sum(user_feature[username]['follower'])
        user_ground_truth_feature_follower[username] = follower_feature

    user_ground_truth_feature_follower_zscore = convertDictToZscore(user_ground_truth_feature_follower)

    # Pass 2: compute the node features and write the feature files.
    for idx, username in enumerate(user_list):
        #### Feature 1: number of query-topic pins ####
        feature1 = sum(user_feature[username]['pin'])
        #### Feature 2: number of query-topic board followers ####
        feature2 = sum(user_feature[username]['follower'])
        #### Feature 3: pin/follower inner product over boards ####
        feature3 = float(np.inner(user_feature[username]['pin'], user_feature[username]['follower']))
        #### Feature 4: ratio of query-topic pins to all pins ####
        feature4 = 0.0
        all_pin_num = sum(user_feature[username]['all_pin'])
        query_pin_num = sum(user_feature[username]['pin'])
        if all_pin_num > 0:
            feature4 = query_pin_num / float(all_pin_num)

        #### Feature 5: ratio of query-topic followers to all followers ####
        feature_follow_ratio = 0.0
        all_follow_num = sum(user_feature[username]['all_follower'])
        follow_num = sum(user_feature[username]['follower'])
        if all_follow_num > 0:
            feature_follow_ratio = follow_num / float(all_follow_num)

        #### Feature: number of unique followers over query-topic boards ####
        feature_u_f = len(user_feature[username]['unique_follower'])

        # Earlier experiments (features 5-7) applied reverseScoreFeature() to
        # the standard deviations of the pin/follower counts; disabled here.
        result_content = username + "\t" + str(feature1) + "\t" + str(feature2) + "\t" + str(feature3) + "\t" + str(feature4) + "\t" + str(feature_u_f)
        fp.write(result_content + "\n")

        # Ground truth: sum of the z-scored repin count, word count, and
        # query-topic follower count. (Earlier variants also included the
        # like, comment, and query_follower components.)
        score = calculateGroundTruth([user_ground_truth_feature_z_score[username]['repin'],
                                      user_ground_truth_feature_z_score[username]['word'],
                                      user_ground_truth_feature_follower_zscore[username]])

        # CCRF feature file: score followed by the selected node features.
        result_content = str(score) + "\t" + str(feature1) + "\t" + str(feature4) + "\t" + str(feature_follow_ratio)
        fp2.write(result_content + "\n")

        fp_regression.write(str(score) + "\t" + str(feature1) + "\t" + str(feature2) + "\t" + str(feature3) + "\t" + str(feature4) + "\t" + str(feature_follow_ratio) + "\n")

        # Informational dump of every ground-truth component and node feature.
        y_feature1 = user_ground_truth_feature_z_score[username]['repin']
        y_feature2 = user_ground_truth_feature_z_score[username]['like']
        y_feature3 = user_ground_truth_feature_z_score[username]['word']
        y_feature4 = user_ground_truth_feature_z_score[username]['comment']
        y_feature5 = user_ground_truth_feature_z_score[username]['query_follower']
        y_feature6 = user_ground_truth_feature_z_score[username]['query_follower_weight']
        y_feature7 = user_ground_truth_feature_follower_zscore[username]

        result_content = str(y_feature1) + "\t" + str(y_feature2) + "\t" + str(y_feature3) + "\t" + str(y_feature4) + "\t" + str(y_feature5) + "\t" + str(y_feature6) + "\t" + str(y_feature7) + "\t" + str(feature1) + "\t" + str(feature2) + "\t" + str(feature3) + "\t" + str(feature4) + "\t" + str(feature_u_f)
        fp3.write(result_content + "\n")

    fp.close()
    fp2.close()
    fp3.close()
    fp_regression.close()

    print("End generating features")

    print("Start generating edge features")

    #### Edge features (features 8 and 9) ####

    edge_file_name = 'edge_list_key_train_' + file_postfix + '.txt'
    edge_file_name2 = 'edge2_list_key_train_' + file_postfix + '.txt'
    key_file_name = 'edge_key_train_' + file_postfix + '.txt'

    fp = open('edge_feature_train_' + file_postfix + '.txt', 'w')
    fp2 = open('edge_list_train_' + file_postfix + '.txt', 'w')
    fp3 = open(key_file_name, 'w')
    fp4 = open(edge_file_name, 'w')
    fp5 = open(edge_file_name2, 'w')

    # Map each username to an integer key and aggregate per-user board sizes.
    user_key = dict()
    following_dict = dict()
    user_board_size = dict()
    user_all_board_size = dict()

    for idx, username in enumerate(user_list):
        user_key[username] = idx
        fp3.write(str(user_key[username]) + "\t" + username + "\n")

        following_dict[username] = dict()
        following_dict[username]['all_following_pin'] = 0.0
        user_board_size[username] = 0.0

        board_list = getBoardList(u_graph, username)
        for board in board_list:
            pin_num = getNumOfPinGraph(u_graph, board)
            board_category = graph.nodes[board]['category']
            if board_category == query:
                user_board_size[username] += pin_num

            if username not in user_all_board_size:
                user_all_board_size[username] = 0
            user_all_board_size[username] += pin_num

    # For every 'following' edge to a query-topic board, accumulate the number
    # of pins the follower follows from each board owner.
    for idx, edge in enumerate(graph.edges()):
        if graph[edge[0]][edge[1]]['type'] == 'following':
            board_href = edge[1]
            board_category = graph.nodes[board_href]['category']

            if board_category == query:
                board_user = getUserOfBoard(u_graph, board_href)
                if board_user not in following_dict[edge[0]]:
                    following_dict[edge[0]][board_user] = dict()
                    following_dict[edge[0]][board_user]['following'] = 0.0
                    following_dict[edge[0]][board_user]['all'] = 0.0

                pin_num = getNumOfPinGraph(u_graph, board_href)
                following_dict[edge[0]][board_user]['following'] += pin_num
                following_dict[edge[0]]['all_following_pin'] += pin_num

    # following_dict[user][followed_user]['following'] now holds the number of
    # query-topic pins that `user` follows from `followed_user`.

    # Per-user list of following ratios, used to normalize feature 8.
    following_ratio_dict = dict()
    for user in following_dict:
        following_ratio_dict[user] = list()
        for following_user in following_dict[user]:
            if following_user != "all_following_pin":
                query_following_pin_cnt = following_dict[user][following_user]['following']
                query_all_pin_cnt = user_board_size[following_user]
                if query_all_pin_cnt > 0.0:
                    ratio = float(query_following_pin_cnt) / query_all_pin_cnt
                    following_ratio_dict[user].append(ratio)

    for user in following_dict:
        for following_user in following_dict[user]:
            if following_user != 'all_following_pin':
                #### Feature 8: followed fraction of the followed user's ####
                #### query-topic pins, normalized by the follower's ratios ####
                query_following_pin_cnt = following_dict[user][following_user]['following']
                query_all_pin_cnt = user_board_size[following_user]
                feature8 = 0.0
                sum_following_ratio = sum(following_ratio_dict[user])

                if query_all_pin_cnt > 0.0 and sum_following_ratio > 0:
                    feature8 = (float(query_following_pin_cnt) / query_all_pin_cnt) / float(sum_following_ratio)

                if feature8 > 0.0:
                    fp2.write(user + "\t" + following_user + "\t" + str(feature8) + "\n")
                    fp4.write(str(user_key[user]) + "\t" + str(user_key[following_user]) + "\t" + str(feature8) + "\n")

                #### Feature 9: fraction of the follower's followed pins ####
                #### that come from this followed user ####
                feature9 = 0.0
                if following_dict[user]['all_following_pin'] > 0:
                    feature9 = float(query_following_pin_cnt) / following_dict[user]['all_following_pin']

                if feature9 > 0.0:
                    fp5.write(str(user_key[user]) + "\t" + str(user_key[following_user]) + "\t" + str(feature9) + "\n")

    # Pad both edge lists with zero-weight self-loops at keys 0 and max so
    # that csr_matrix later infers the full (num_users x num_users) shape.
    max_user_key = len(user_key) - 1
    fp4.write("0\t0\t0.0\n")
    fp4.write(str(max_user_key) + "\t" + str(max_user_key) + "\t" + str(0.0) + "\n")
    fp5.write("0\t0\t0.0\n")
    fp5.write(str(max_user_key) + "\t" + str(max_user_key) + "\t" + str(0.0) + "\n")
+ "\n") 542 | 543 | 544 | fp.close() 545 | fp2.close() 546 | fp3.close() 547 | fp4.close() 548 | fp5.close() 549 | 550 | print "End generating edge features" 551 | 552 | closeDB(pin_db, pin_cur) 553 | closeDB(all_db, all_cur) 554 | 555 | return feature_file_name, edge_file_name, edge_file_name2, key_file_name, regression_file_name 556 | 557 | #generateCCRFFeature() 558 | --------------------------------------------------------------------------------