├── .gitignore
├── TISC.py
├── README.md
├── ccrf_module2.py
└── generate_ccrf_feature_fast_module_pin_info_follower.py
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
pin/
--------------------------------------------------------------------------------
/TISC.py:
--------------------------------------------------------------------------------
import pickle

import generate_ccrf_feature_fast_module_pin_info_follower as gen_ccrf
import ccrf_module2 as ccrf

# Load the training and test Pinterest graphs (open pickles in binary mode).
test_graph = pickle.load(open('graph/pinterest_test_graph.pickle', 'rb'))
test_u_graph = test_graph.to_undirected()
training_graph = pickle.load(open('graph/pinterest_training_graph.pickle', 'rb'))
training_u_graph = training_graph.to_undirected()

query = 'design'

# Generate node and edge feature files from the training graph.
feature_filename, edge_filename, edge_filename2, key_filename, regression_file_name = \
    gen_ccrf.generateCCRFFeature(training_graph, training_u_graph, "train", query)

# Learn the CCRF parameters on the training features.
alpha, beta, beta2 = ccrf.learning(feature_filename, edge_filename, edge_filename2)
print(alpha)
print(beta)
print(beta2)

# Generate feature files from the test graph and predict influence scores.
feature_filename, edge_filename, edge_filename2, key_filename, regression_file_name = \
    gen_ccrf.generateCCRFFeature(test_graph, test_u_graph, "test", query)
score_dict = ccrf.prediction_dict(alpha, beta, beta2, feature_filename, edge_filename, edge_filename2, key_filename)

with open("prediction_result.txt", 'w') as f:
    for user in score_dict:
        f.write(user + "\t" + str(score_dict[user]) + "\n")
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Topical-Influence
=========================

## Overview
Social curation services are emerging social media platforms that enable users to curate their content by topic and to express their interests at the topic level by following curated collections of other users' content rather than the users themselves. The topic-level information revealed through this new feature far exceeds what existing methods can solicit from traditional social networking services, and it greatly enhances the quality of topic-sensitive influence modeling. In this paper, we propose a novel model, called topical influence with social curation (TISC), to find influential users in social curation services. This model, formulated as a continuous conditional random field (CCRF), takes full advantage of the explicitly available topic-level information reflected in both contents and interactions. To validate its merits, we comprehensively compare TISC with state-of-the-art models using two real-world data sets collected from Pinterest and Scoop.it. The results show that TISC achieves up to around 80% higher accuracy and finds more convincing results in case studies than the other models. Moreover, we develop a distributed learning algorithm on Spark and demonstrate its excellent scalability on a cluster of 48 cores.
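
## Model
TISC models the real-valued influence scores **y** of all users jointly. As a rough sketch (the exact TISC potentials are defined in the paper), `ccrf_module2.py` learns the weights of a density of the standard continuous-CRF form, with per-user feature potentials weighted by `alpha` and two pairwise relation potentials weighted by `beta` and `beta2`:

```latex
P(\mathbf{y} \mid X) \;=\; \frac{1}{Z(X)} \exp\!\Big(
    -\sum_{i}\sum_{k} \alpha_k \,(y_i - x_{i,k})^2
    \;-\; \sum_{l=1}^{2} \beta_l \sum_{i,j} \tfrac{1}{2}\, S^{(l)}_{i,j} \,(y_i - y_j)^2
\Big)
```

Here $x_{i,k}$ is the $k$-th node feature of user $i$, and $S^{(1)}, S^{(2)}$ are the two edge-weight matrices built from the edge feature files (the code's `R` and `R2`). Because the density is Gaussian in **y**, prediction reduces to a closed-form expression in the learned parameters.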

## Data Sets
[Download](http://dmserver6.kaist.ac.kr/TISC/TISC_graph.zip)
--------------------------------------------------------------------------------
/ccrf_module2.py:
--------------------------------------------------------------------------------
import math

import numpy as np
from scipy import stats
from scipy.sparse import csr_matrix


def _load_relation_matrix(path):
    """Load a tab-separated (row, col, weight) edge list and return the dense
    transpose of the corresponding sparse matrix."""
    relation_sparse = np.loadtxt(fname=path, delimiter='\t')
    row = relation_sparse[:, 0].astype(int)
    col = relation_sparse[:, 1].astype(int)
    weight = relation_sparse[:, 2]
    R = csr_matrix((weight, (row, col))).todense()
    return np.transpose(R)


def learning(feature_filename, edge_filename, edge_filename2):
    # Column 0 of the feature file is the ground-truth score y; the remaining
    # columns are the node features X. Everything is z-scored.
    data = np.loadtxt(fname=feature_filename, delimiter='\t')
    data = stats.zscore(data)

    # Two relation (edge-weight) matrices, one per edge feature type.
    R = _load_relation_matrix(edge_filename)
    R2 = _load_relation_matrix(edge_filename2)

    y = np.transpose(np.matrix(data[:, 0]))
    X = np.matrix(data[:, 1:])

    num_data, num_x_feature = X.shape

    alpha = np.matrix(np.ones((num_x_feature, 1)))
    alpha_new = np.matrix(alpha)
    delta_log_alpha = np.matrix(np.zeros((num_x_feature, 1)))

    beta = 0.1
    beta_new = beta
    beta2 = 0.1
    beta2_new = beta2

    n = num_data

    # Row- and column-degree matrices of the two relation graphs.
    D_r_1 = np.matrix(np.diag(np.squeeze(np.asarray(R.sum(axis=1)))))
    D_c_1 = np.matrix(np.diag(np.squeeze(np.asarray(R.sum(axis=0)))))
    D_r_2 = np.matrix(np.diag(np.squeeze(np.asarray(R2.sum(axis=1)))))
    D_c_2 = np.matrix(np.diag(np.squeeze(np.asarray(R2.sum(axis=0)))))

    iterations = 100
    learning_rate = 0.000001
    precision = 0.0017

    for i in range(iterations):
        a = np.transpose(alpha) * np.matrix(np.ones((alpha.shape[0], 1)))
        b = 2*X*alpha + beta*(D_r_1 - D_c_1) * np.matrix(np.ones((D_r_1.shape[0], 1))) \
            + beta2*(D_r_2 - D_c_2) * np.matrix(np.ones((D_r_2.shape[0], 1)))
        b_t = np.transpose(b)

        # Gradient ascent in log-space keeps every alpha_k positive.
        for k in range(num_x_feature):
            x_k = X[:, k]
            alpha_k = alpha[k]
            log_alpha_k = math.log(alpha_k)

            delta_log_alpha_k = alpha_k * (n/(2*a) + (1/(4*a**2))*b_t*b - (1/(2*a))*b_t*x_k
                                           + np.transpose(x_k)*x_k - np.transpose(y - x_k)*(y - x_k))
            delta_log_alpha[k] = delta_log_alpha_k
            log_alpha_k = log_alpha_k + learning_rate*delta_log_alpha_k
            alpha_new[k] = math.exp(log_alpha_k)

        delta_beta = -1/(2*a)*b_t*(D_r_1 - D_c_1)*np.matrix(np.ones((D_r_1.shape[0], 1))) \
            + np.transpose(D_r_1*y)*np.matrix(np.ones((y.shape[0], 1))) \
            - np.transpose(D_c_1*y)*np.matrix(np.ones((y.shape[0], 1)))
        beta_new = beta + learning_rate*delta_beta

        delta_beta2 = -1/(2*a)*b_t*(D_r_2 - D_c_2)*np.matrix(np.ones((D_r_2.shape[0], 1))) \
            + np.transpose(D_r_2*y)*np.matrix(np.ones((y.shape[0], 1))) \
            - np.transpose(D_c_2*y)*np.matrix(np.ones((y.shape[0], 1)))
        beta2_new = beta2 + learning_rate*delta_beta2

        # Relative change of the full parameter vector; stop when either the
        # update or the gradient falls below the precision threshold.
        delta_alpha_beta = np.linalg.norm(np.concatenate((alpha_new, beta_new, beta2_new))
                                          - np.concatenate((alpha, [[beta], [beta2]]))) \
            / np.linalg.norm(np.concatenate((alpha, [[beta], [beta2]])))

        if math.isinf(delta_alpha_beta) or delta_alpha_beta < precision \
                or np.linalg.norm(np.concatenate((delta_log_alpha, delta_beta, delta_beta2))) < precision:
            break

        alpha = alpha_new.copy()
        beta = beta_new[0, 0]
        beta2 = beta2_new[0, 0]

    return alpha, beta, beta2

def prediction(alpha, beta1, beta2, feature_filename, edge_filename, edge_filename2):
    # Column 0 (the ground-truth score) is ignored at prediction time.
    data = np.loadtxt(fname=feature_filename, delimiter='\t')
    data = stats.zscore(data)

    R = _load_relation_matrix(edge_filename)
    R2 = _load_relation_matrix(edge_filename2)

    X = np.matrix(data[:, 1:])

    D_r_1 = np.matrix(np.diag(np.squeeze(np.asarray(R.sum(axis=1)))))
    D_c_1 = np.matrix(np.diag(np.squeeze(np.asarray(R.sum(axis=0)))))
    D_r_2 = np.matrix(np.diag(np.squeeze(np.asarray(R2.sum(axis=1)))))
    D_c_2 = np.matrix(np.diag(np.squeeze(np.asarray(R2.sum(axis=0)))))

    # Point prediction: closed-form estimate of y from the learned parameters.
    y_prec = (1.0 / (np.transpose(alpha) * np.matrix(np.ones((alpha.shape[0], 1)))))[0, 0] \
        * (2*X*alpha + beta1*(D_r_1 - D_c_1) * np.matrix(np.ones((D_r_1.shape[0], 1)))
           + beta2*(D_r_2 - D_c_2) * np.matrix(np.ones((D_r_2.shape[0], 1))))

    return np.array(y_prec)[:, 0]


def prediction_dict(alpha, beta, beta2, feature_filename, edge_filename, edge_filename2, key_filename):
    y_prec = prediction(alpha, beta, beta2, feature_filename, edge_filename, edge_filename2)

    # The key file maps integer node keys back to usernames.
    y_prec_dict = dict()
    for line in open(key_filename):
        line_info = line.split("\t")
        key = int(line_info[0])
        username = line_info[1].split("\n")[0]
        y_prec_dict[username] = y_prec[key]

    fp = open("ccrf_result.txt", "w")
    for user in y_prec_dict:
        fp.write(user + "\t" + str(y_prec_dict[user]) + "\n")
    fp.close()

    return y_prec_dict


def ccrf(feature_filename, edge_filename, edge_filename2, key_filename):
    # Convenience wrapper: learn the parameters, then predict.
    alpha, beta, beta2 = learning(feature_filename, edge_filename, edge_filename2)
    y_prec = prediction(alpha, beta, beta2, feature_filename, edge_filename, edge_filename2)

    y_prec_dict = dict()
    for line in open(key_filename):
        line_info = line.split("\t")
        key = int(line_info[0])
        username = line_info[1].split("\n")[0]
        y_prec_dict[username] = y_prec[key]

    return y_prec_dict
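
if __name__ == "__main__":
    # Minimal smoke test (not part of the original pipeline): builds tiny
    # synthetic tab-separated feature/edge files in a temp directory and runs
    # learning() and prediction() end to end. All values are arbitrary.
    import os
    import tempfile

    rng = np.random.RandomState(0)
    n = 20
    # Column 0 plays the role of the ground-truth score; columns 1..3 are
    # node features.
    feats = rng.rand(n, 4)
    # A simple ring graph so every node index appears as both row and column.
    edges = np.array([[i, (i + 1) % n, 1.0] for i in range(n)])

    tmp = tempfile.mkdtemp()
    f_feat = os.path.join(tmp, "features.txt")
    f_edge1 = os.path.join(tmp, "edges1.txt")
    f_edge2 = os.path.join(tmp, "edges2.txt")
    np.savetxt(f_feat, feats, delimiter="\t")
    np.savetxt(f_edge1, edges, delimiter="\t")
    np.savetxt(f_edge2, edges, delimiter="\t")

    alpha, beta, beta2 = learning(f_feat, f_edge1, f_edge2)
    print(prediction(alpha, beta, beta2, f_feat, f_edge1, f_edge2)[:5])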
--------------------------------------------------------------------------------
/generate_ccrf_feature_fast_module_pin_info_follower.py:
--------------------------------------------------------------------------------
import MySQLdb
import numpy as np
from scipy import stats

SELECT_TRAINING_USER_LIST_SQL = "SELECT * FROM userList WHERE state=2"
SELECT_BOARD_BY_USER_SQL = "SELECT * FROM board WHERE user_id = %s"
SELECT_BOARD_INFO_SQL = "SELECT * FROM board WHERE board_id = %s"
SELECT_FOLLOWING_BOARD_SQL = "SELECT * FROM following_board WHERE user_id = %s and board_id = %s"
SELECT_FOLLOWING_USER_SQL = "SELECT * FROM following_user WHERE user_id = %s and following_id = %s"
SELECT_COUNT_PIN_SQL = "SELECT count(*) FROM pin WHERE board_href = %s"
SELECT_USER_LIST_SQL = "SELECT * FROM userInfo ORDER BY follower_cnt DESC LIMIT 5000"
SELECT_GROUND_TRUTH_INFO_SQL = "SELECT * FROM ground_truth_info WHERE username = %s"


def connectDB(db_name):
    db = None
    cursor = None

    if db_name == 'pin':
        db = MySQLdb.connect(host='dmserver1.kaist.ac.kr', user='daehoon', passwd='rlaeogns',
                             db='pinterest_design_pin', charset='utf8', use_unicode=True)
        cursor = db.cursor()
        cursor.execute("set names utf8")
    elif db_name == 'all':
        db = MySQLdb.connect(host='dmserver1.kaist.ac.kr', user='daehoon', passwd='rlaeogns',
                             db='pinterest_design', charset='utf8', use_unicode=True)
        cursor = db.cursor()
        cursor.execute("set names utf8")

    return db, cursor


def closeDB(db, cur):
    cur.close()
    db.close()


def selectDB(db, cursor, SqlQuery, params=()):
    # Parameterized query; params must be a tuple.
    cursor.execute(SqlQuery, params)
    return cursor.fetchall()


def getUserOfBoard(u_graph, board_href):
    # The owner of a board is its neighbor connected by a 'curated' edge.
    neighbors = u_graph[board_href]
    for neighbor in neighbors:
        if neighbors[neighbor]['type'] == 'curated':
            return neighbor


def getBoardList(u_graph, username):
    board_list = list()
    neighbors = u_graph[username]
    for neighbor in neighbors:
        if neighbors[neighbor]['type'] == 'curated':
            board_list.append(neighbor)
    return board_list


def getBoardFollowerList(u_graph, board_id):
    follower_list = list()
    neighbors = u_graph[board_id]
    for neighbor in neighbors:
        if neighbors[neighbor]['type'] == 'following':
            follower_list.append(neighbor)
    return follower_list


def reverseScoreFeature(feature):
    # Map a score to its reciprocal, treating NaN as 0 and 0 as 1.
    if np.isnan(feature):
        feature = 0.0
    elif feature == 0.0:
        feature = 1.0
    else:
        feature = 1.0 / feature
    return feature


def isFollowingBoard(graph, user_id, board_id):
    if board_id in graph[user_id]:
        if graph[user_id][board_id]['type'] == 'following':
            return True
    return False


def getGroundTruthFeatureByFile(filepath):
    # Read per-user ground-truth features from a tab-separated file,
    # skipping the header row.
    user_ground_truth_feature = dict()
    f = open(filepath)
    idx = 0
    for line in f:
        if idx > 0:
            feature_info = line.split('\t')
            username = feature_info[0]
            user_ground_truth_feature[username] = dict()
            user_ground_truth_feature[username]['like'] = int(feature_info[1])
            user_ground_truth_feature[username]['repin'] = int(feature_info[2])
            user_ground_truth_feature[username]['comment'] = int(feature_info[3])
            user_ground_truth_feature[username]['word'] = int(feature_info[4])
            user_ground_truth_feature[username]['query_follower'] = int(feature_info[5])
            user_ground_truth_feature[username]['query_follower_weight'] = int(feature_info[6])
        idx += 1

    return user_ground_truth_feature
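
# For reference, the ground-truth feature file parsed above is expected to be
# tab-separated with one header row. The layout below is illustrative only
# (column order inferred from the parser; the values are made up):
#
#   username   like  repin  comment  word  query_follower  query_follower_weight
#   some_user  10    42     3        120   7               5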
def getGroundTruthFeatureInDB(db, cur, user_list):
    # Same features as getGroundTruthFeatureByFile, but read from the
    # ground_truth_info table; users without a row get all-zero features.
    user_ground_truth_feature = dict()
    for username in user_list:
        result = selectDB(db, cur, SELECT_GROUND_TRUTH_INFO_SQL, params=(username,))

        if len(result) > 0:
            ground_truth_info = result[0]
            comment_cnt = ground_truth_info[2]
            word_cnt = ground_truth_info[3]
            like_cnt = ground_truth_info[4]
            repin_cnt = ground_truth_info[5]
            query_follower = ground_truth_info[6]
            query_follower_weight = ground_truth_info[7]
        else:
            comment_cnt = 0
            word_cnt = 0
            like_cnt = 0
            repin_cnt = 0
            query_follower = 0
            query_follower_weight = 0

        user_ground_truth_feature[username] = dict()
        user_ground_truth_feature[username]['like'] = like_cnt
        user_ground_truth_feature[username]['repin'] = repin_cnt
        user_ground_truth_feature[username]['comment'] = comment_cnt
        user_ground_truth_feature[username]['word'] = word_cnt
        user_ground_truth_feature[username]['query_follower'] = query_follower
        user_ground_truth_feature[username]['query_follower_weight'] = query_follower_weight

    return user_ground_truth_feature


def convertZscore(score_dict):
    # Convert each ground-truth feature to its z-score across all users.
    repin_score = list()
    word_score = list()
    like_score = list()
    comment_score = list()
    query_follower_score = list()
    query_follower_weight_score = list()

    user_key_list = list()
    feature_zscore = dict()

    for key in score_dict:
        repin_score.append(score_dict[key]['repin'])
        word_score.append(score_dict[key]['word'])
        like_score.append(score_dict[key]['like'])
        comment_score.append(score_dict[key]['comment'])
        query_follower_score.append(score_dict[key]['query_follower'])
        query_follower_weight_score.append(score_dict[key]['query_follower_weight'])
        user_key_list.append(key)

    repin_zscore = stats.zscore(np.array(repin_score))
    word_zscore = stats.zscore(np.array(word_score))
    like_zscore = stats.zscore(np.array(like_score))
    comment_zscore = stats.zscore(np.array(comment_score))
    query_follower_zscore = stats.zscore(np.array(query_follower_score))
    query_follower_weight_zscore = stats.zscore(np.array(query_follower_weight_score))

    for idx, user in enumerate(user_key_list):
        feature_zscore[user] = dict()
        feature_zscore[user]['repin'] = repin_zscore[idx]
        feature_zscore[user]['word'] = word_zscore[idx]
        feature_zscore[user]['like'] = like_zscore[idx]
        feature_zscore[user]['comment'] = comment_zscore[idx]
        feature_zscore[user]['query_follower'] = query_follower_zscore[idx]
        feature_zscore[user]['query_follower_weight'] = query_follower_weight_zscore[idx]

    return feature_zscore

def convertDictToZscore(data_dict):
    # z-score a {key: score} dict, preserving the keys.
    z_score_dict = dict()
    score_list = list()
    key_list = list()
    for key in data_dict.keys():
        score_list.append(data_dict[key])
        key_list.append(key)

    z_score = stats.zscore(np.array(score_list))

    for idx, key in enumerate(key_list):
        z_score_dict[key] = z_score[idx]

    return z_score_dict


def calculateGroundTruth(feature_list):
    # Ground truth is the sum of the (z-scored) component features.
    # Alternatives kept from earlier experiments:
    #   product: result = 1.0; for feature in feature_list: result *= feature
    #   mean:    result = np.mean(feature_list)
    result = 0.0
    for feature in feature_list:
        result += feature
    return result


def getNumOfPinGraph(u_graph, board_id):
    # The pin count is stored as a node attribute; boards without the
    # attribute count as 0. (An older version counted 'curated-pin' neighbors
    # of the board instead.)
    pin_num = 0
    if 'pin_num' in u_graph.nodes[board_id]:
        pin_num = u_graph.nodes[board_id]['pin_num']
    return pin_num


############### MAIN LOGIC ###############

def generateCCRFFeature(graph, u_graph, file_postfix, query):
    # Collect the target users: all nodes of type 'user'.
    user_list = list()
    for node in graph.nodes():
        if graph.nodes[node]['type'] == 'user':
            user_list.append(node)

    print("User Cnt : " + str(len(user_list)))

    pin_db, pin_cur = connectDB('pin')
    all_db, all_cur = connectDB('all')

    user_ground_truth_feature = getGroundTruthFeatureInDB(pin_db, pin_cur, user_list)
    #user_ground_truth_feature = getGroundTruthFeatureByFile('ground_truth_feature_training_' + file_postfix + '.txt')
    user_ground_truth_feature_z_score = convertZscore(user_ground_truth_feature)

    feature_file_name = 'pinterest_train_' + file_postfix + '.txt'
    regression_file_name = 'regression_train_' + file_postfix + '.txt'

    # fp and fp3 are informational dumps; fp2 and fp_regression are the
    # feature files returned to the caller.
    fp = open('user_feature_train_' + file_postfix + '.txt', 'w')
    fp3 = open('all_feature_train_' + file_postfix + '.txt', 'w')
    fp2 = open(feature_file_name, 'w')
    fp_regression = open(regression_file_name, 'w')

    user_feature = dict()

    print("Start generating features")

    user_ground_truth_feature_follower = dict()

    # Pass 1: aggregate per-user pin and follower counts over their boards.
    for idx, username in enumerate(user_list):
        if username not in user_feature:
            user_feature[username] = dict()
            user_feature[username]['pin'] = list()
            user_feature[username]['follower'] = list()
            user_feature[username]['all_pin'] = list()
            user_feature[username]['unique_follower'] = set()
            user_feature[username]['all_follower'] = list()

        board_list = getBoardList(u_graph, username)

        for board in board_list:
            board_href = board
            board_category = graph.nodes[board]['category']
            pin_num = getNumOfPinGraph(u_graph, board_href)
            follower_list = getBoardFollowerList(u_graph, board_href)
            follower_num = len(follower_list)
            if board_category == query:
                user_feature[username]['pin'].append(pin_num)
                user_feature[username]['follower'].append(follower_num)
                for follower in follower_list:
                    user_feature[username]['unique_follower'].add(follower)

            user_feature[username]['all_pin'].append(pin_num)
            user_feature[username]['all_follower'].append(follower_num)

        follower_feature = sum(user_feature[username]['follower'])
        user_ground_truth_feature_follower[username] = follower_feature

    user_ground_truth_feature_follower_zscore = convertDictToZscore(user_ground_truth_feature_follower)

    # Pass 2: compute the node features and write the feature files.
    for idx, username in enumerate(user_list):
        #### Feature 1: number of query-topic pins ####
        feature1 = sum(user_feature[username]['pin'])
        #### Feature 2: number of query-topic board followers ####
        feature2 = sum(user_feature[username]['follower'])
        #### Feature 3: pin/follower inner product over boards ####
        feature3 = float(np.inner(user_feature[username]['pin'], user_feature[username]['follower']))
        #### Feature 4: ratio of query-topic pins to all pins ####
        feature4 = 0.0
        all_pin_num = sum(user_feature[username]['all_pin'])
        query_pin_num = sum(user_feature[username]['pin'])
        if all_pin_num > 0:
            feature4 = query_pin_num / float(all_pin_num)

        #### Feature 5: ratio of query-topic followers to all followers ####
        feature_follow_ratio = 0.0
        all_follow_num = sum(user_feature[username]['all_follower'])
        follow_num = sum(user_feature[username]['follower'])
        if all_follow_num > 0:
            feature_follow_ratio = follow_num / float(all_follow_num)

        #### Feature: number of unique followers over query-topic boards ####
        feature_u_f = len(user_feature[username]['unique_follower'])

        # Earlier experiments (features 5-7) applied reverseScoreFeature() to
        # the standard deviations of the pin/follower counts; disabled here.
        result_content = username + "\t" + str(feature1) + "\t" + str(feature2) + "\t" + str(feature3) + "\t" + str(feature4) + "\t" + str(feature_u_f)
        fp.write(result_content + "\n")

        # Ground truth: sum of the z-scored repin count, word count, and
        # query-topic follower count. (Earlier variants also included the
        # like, comment, and query_follower components.)
        score = calculateGroundTruth([user_ground_truth_feature_z_score[username]['repin'],
                                      user_ground_truth_feature_z_score[username]['word'],
                                      user_ground_truth_feature_follower_zscore[username]])

        # CCRF feature file: score followed by the selected node features.
        result_content = str(score) + "\t" + str(feature1) + "\t" + str(feature4) + "\t" + str(feature_follow_ratio)
        fp2.write(result_content + "\n")

        fp_regression.write(str(score) + "\t" + str(feature1) + "\t" + str(feature2) + "\t" + str(feature3) + "\t" + str(feature4) + "\t" + str(feature_follow_ratio) + "\n")

        # Informational dump of every ground-truth component and node feature.
        y_feature1 = user_ground_truth_feature_z_score[username]['repin']
        y_feature2 = user_ground_truth_feature_z_score[username]['like']
        y_feature3 = user_ground_truth_feature_z_score[username]['word']
        y_feature4 = user_ground_truth_feature_z_score[username]['comment']
        y_feature5 = user_ground_truth_feature_z_score[username]['query_follower']
        y_feature6 = user_ground_truth_feature_z_score[username]['query_follower_weight']
        y_feature7 = user_ground_truth_feature_follower_zscore[username]

        result_content = str(y_feature1) + "\t" + str(y_feature2) + "\t" + str(y_feature3) + "\t" + str(y_feature4) + "\t" + str(y_feature5) + "\t" + str(y_feature6) + "\t" + str(y_feature7) + "\t" + str(feature1) + "\t" + str(feature2) + "\t" + str(feature3) + "\t" + str(feature4) + "\t" + str(feature_u_f)
        fp3.write(result_content + "\n")

    fp.close()
    fp2.close()
    fp3.close()
    fp_regression.close()

    print("End generating features")

    print("Start generating edge features")

    #### Edge features (features 8 and 9) ####

    edge_file_name = 'edge_list_key_train_' + file_postfix + '.txt'
    edge_file_name2 = 'edge2_list_key_train_' + file_postfix + '.txt'
    key_file_name = 'edge_key_train_' + file_postfix + '.txt'

    fp = open('edge_feature_train_' + file_postfix + '.txt', 'w')
    fp2 = open('edge_list_train_' + file_postfix + '.txt', 'w')
    fp3 = open(key_file_name, 'w')
    fp4 = open(edge_file_name, 'w')
    fp5 = open(edge_file_name2, 'w')

    # Map each username to an integer key and aggregate per-user board sizes.
    user_key = dict()
    following_dict = dict()
    user_board_size = dict()
    user_all_board_size = dict()

    for idx, username in enumerate(user_list):
        user_key[username] = idx
        fp3.write(str(user_key[username]) + "\t" + username + "\n")

        following_dict[username] = dict()
        following_dict[username]['all_following_pin'] = 0.0
        user_board_size[username] = 0.0

        board_list = getBoardList(u_graph, username)
        for board in board_list:
            pin_num = getNumOfPinGraph(u_graph, board)
            board_category = graph.nodes[board]['category']
            if board_category == query:
                user_board_size[username] += pin_num

            if username not in user_all_board_size:
                user_all_board_size[username] = 0
            user_all_board_size[username] += pin_num

    # For every 'following' edge to a query-topic board, accumulate the number
    # of pins the follower follows from each board owner.
    for idx, edge in enumerate(graph.edges()):
        if graph[edge[0]][edge[1]]['type'] == 'following':
            board_href = edge[1]
            board_category = graph.nodes[board_href]['category']

            if board_category == query:
                board_user = getUserOfBoard(u_graph, board_href)
                if board_user not in following_dict[edge[0]]:
                    following_dict[edge[0]][board_user] = dict()
                    following_dict[edge[0]][board_user]['following'] = 0.0
                    following_dict[edge[0]][board_user]['all'] = 0.0

                pin_num = getNumOfPinGraph(u_graph, board_href)
                following_dict[edge[0]][board_user]['following'] += pin_num
                following_dict[edge[0]]['all_following_pin'] += pin_num

    # following_dict[user][followed_user]['following'] now holds the number of
    # query-topic pins that `user` follows from `followed_user`.

    # Per-user list of following ratios, used to normalize feature 8.
    following_ratio_dict = dict()
    for user in following_dict:
        following_ratio_dict[user] = list()
        for following_user in following_dict[user]:
            if following_user != "all_following_pin":
                query_following_pin_cnt = following_dict[user][following_user]['following']
                query_all_pin_cnt = user_board_size[following_user]
                if query_all_pin_cnt > 0.0:
                    ratio = float(query_following_pin_cnt) / query_all_pin_cnt
                    following_ratio_dict[user].append(ratio)

    for user in following_dict:
        for following_user in following_dict[user]:
            if following_user != 'all_following_pin':
                #### Feature 8: followed fraction of the followed user's ####
                #### query-topic pins, normalized by the follower's ratios ####
                query_following_pin_cnt = following_dict[user][following_user]['following']
                query_all_pin_cnt = user_board_size[following_user]
                feature8 = 0.0
                sum_following_ratio = sum(following_ratio_dict[user])

                if query_all_pin_cnt > 0.0 and sum_following_ratio > 0:
                    feature8 = (float(query_following_pin_cnt) / query_all_pin_cnt) / float(sum_following_ratio)

                if feature8 > 0.0:
                    fp2.write(user + "\t" + following_user + "\t" + str(feature8) + "\n")
                    fp4.write(str(user_key[user]) + "\t" + str(user_key[following_user]) + "\t" + str(feature8) + "\n")

                #### Feature 9: fraction of the follower's followed pins ####
                #### that come from this followed user ####
                feature9 = 0.0
                if following_dict[user]['all_following_pin'] > 0:
                    feature9 = float(query_following_pin_cnt) / following_dict[user]['all_following_pin']

                if feature9 > 0.0:
                    fp5.write(str(user_key[user]) + "\t" + str(user_key[following_user]) + "\t" + str(feature9) + "\n")

    # Pad both edge lists with zero-weight self-loops at keys 0 and max so
    # that csr_matrix later infers the full (num_users x num_users) shape.
    max_user_key = len(user_key) - 1
    fp4.write("0\t0\t0.0\n")
    fp4.write(str(max_user_key) + "\t" + str(max_user_key) + "\t" + str(0.0) + "\n")
    fp5.write("0\t0\t0.0\n")
    fp5.write(str(max_user_key) + "\t" + str(max_user_key) + "\t" + str(0.0) + "\n")
+ "\n") 542 | 543 | 544 | fp.close() 545 | fp2.close() 546 | fp3.close() 547 | fp4.close() 548 | fp5.close() 549 | 550 | print "End generating edge features" 551 | 552 | closeDB(pin_db, pin_cur) 553 | closeDB(all_db, all_cur) 554 | 555 | return feature_file_name, edge_file_name, edge_file_name2, key_file_name, regression_file_name 556 | 557 | #generateCCRFFeature() 558 | --------------------------------------------------------------------------------