├── Baselines
│   ├── Content-based Active Learning
│   │   ├── MatrixFactorization.py
│   │   ├── content_based_active_learning.py
│   │   └── read_write.py
│   └── Factorized Decision Tree
│       ├── FactorizedDecisionTree.py
│       ├── MatrixFactorization.py
│       ├── factorized_decision_tree.py
│       └── read_write.py
├── README.md
├── Step1-Preprocessing
│   ├── Commands.txt
│   ├── LDA.py
│   ├── buildtree_preparation.py
│   ├── item_information.py
│   ├── item_similarity.py
│   ├── k_medoids.py
│   ├── my_fitnlm.m
│   ├── read2df.py
│   ├── read_write.py
│   ├── similarity_parameters.py
│   ├── user_clustering.py
│   ├── user_information.py
│   └── user_similarity.py
├── Step2-Model
│   ├── DecisionTree.py
│   ├── MatrixFactorization.py
│   ├── build_tree.py
│   └── read_write.py
└── script.py

/Baselines/Content-based Active Learning/MatrixFactorization.py:
--------------------------------------------------------------------------------
1 | import shelve
2 | import pickle
3 | import numpy as np
4 | from scipy.sparse import *
5 | from pyspark.mllib.recommendation import ALS
6 | from pyspark.sql import SparkSession
7 | from pyspark import SparkConf
8 | from pyspark import SparkContext
9 | 
10 | class MatrixFactorization:
11 |     def __init__(self, usernum, itemnum, maxIter=15, regParam=0.01, rank=10):
12 |         self.maxIter = maxIter
13 |         self.regParam = regParam
14 |         self.rank = rank
15 |         self.usernum = usernum
16 |         self.itemnum = itemnum
17 |         conf = SparkConf().setAppName("appName").setMaster("local[*]")
18 |         # self.spark = SparkSession.builder.master("local[*]").appName("Example").getOrCreate()
19 |         conf.set("spark.driver.memory","8g")
20 |         conf.set("spark.executor.memory","8g")
21 |         self.spark = SparkContext(conf=conf)
22 |         print("New SparkSession started...")
23 | 
24 |     def change_parameter(self, regParam):
25 |         self.regParam = regParam
26 | 
27 |     def matrix_factorization(self, train_lst):
28 |         ratings = self.spark.parallelize(train_lst)
29 |         model = ALS.train(ratings, rank=self.rank, seed=10, \
30 |                           iterations=self.maxIter, \
31 |                           lambda_=self.regParam)
32 |         print("MF DONE")
33 |         userFeatures = sorted(model.userFeatures().collect(), key=lambda d: d[0], reverse=False)
34 |         productFeatures = sorted(model.productFeatures().collect(), key=lambda d: d[0], reverse=False)
35 | 
36 |         userProfile = np.zeros((self.usernum, self.rank))
37 |         productProfile = np.zeros((self.itemnum, self.rank))
38 |         for i, each_user in zip(range(len(userFeatures)), userFeatures):
39 |             userProfile[i, :] = np.array(each_user[1])
40 |         for each_item in productFeatures:
41 |             productProfile[each_item[0], :] = np.array(each_item[1])
42 |         return userProfile, productProfile
43 | 
44 |     def end(self):
45 |         self.spark.stop()
46 |         print("SparkSession stopped.")
--------------------------------------------------------------------------------
/Baselines/Content-based Active Learning/content_based_active_learning.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import read_write as rw
3 | import numpy as np
4 | import scipy.sparse
5 | from MatrixFactorization import MatrixFactorization
6 | 
7 | if (__name__ == '__main__'):
8 |     finput_dataset = sys.argv[1]
9 |     finput_K = (int)(sys.argv[2])
10 |     iu_matrix_train_path = "../../Data/" + finput_dataset + "/iu_sparse_matrix_train.npz"
11 |     iu_matrix_test_path = "../../Data/" + finput_dataset + "/iu_sparse_matrix_test.npz"
12 |     train_item_id_path = "../../Data/" + finput_dataset + "/train_item_id"
13 |     test_item_id_path = "../../Data/" + finput_dataset + "/test_item_id"
14 |     item_sim_matrix_path = "../../Data/" + finput_dataset + "/item_sim_matrix" # pass
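    # ------------------------------------------------------------------
    # (added note) What this script computes below: for every new (test)
    # item j, an active-learning score per user u, i.e. a similarity-
    # weighted average of u's known training ratings R under the
    # item-item similarity S:
    #
    #     Score[u, j] = sum_i R[u, i] * S[i, j] / sum_{i: R[u, i] != 0} S[i, j]
    #
    # A minimal self-contained NumPy sketch of the same formula on toy
    # data (illustration only, not the project's real inputs):
    #
    #     import numpy as np
    #     R = np.array([[5., 0.], [3., 4.]])   # 2 users x 2 train items
    #     S = np.array([[0.9], [0.2]])         # 2 train items x 1 new item
    #     score = (R @ S) / ((R != 0) @ S)     # highest score = query first
    #
    # The top-K users by score are queried for item j, their ratings are
    # moved into the training matrix, and ALS is refit on the result.
    # ------------------------------------------------------------------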
15 | 
16 |     ui_matrix_train = scipy.sparse.load_npz(iu_matrix_train_path).T
17 |     ui_matrix_test = scipy.sparse.load_npz(iu_matrix_test_path).T
18 |     ui_matrix = scipy.sparse.csr_matrix(np.hstack((ui_matrix_train.toarray(), np.zeros(ui_matrix_test.shape))))
19 |     train_item_id = rw.readffile(train_item_id_path)
20 |     test_item_id = rw.readffile(test_item_id_path)
21 |     item_sim_matrix = rw.readffile(item_sim_matrix_path)
22 | 
23 |     # Computing Score for user (Score = [user number, new item number])
24 |     Score = (ui_matrix_train * item_sim_matrix.loc[train_item_id, test_item_id]) / \
25 |             ((ui_matrix_train != 0) * item_sim_matrix.loc[train_item_id, test_item_id])
26 | 
27 |     # Active Learning
28 |     train_item_num = len(train_item_id)
29 |     ui_matrix = ui_matrix.tolil()
30 |     ui_matrix_test = ui_matrix_test.tolil()
31 |     for i in range(len(test_item_id)):
32 |         ind = np.argsort(-Score[:, i])
33 |         if finput_K < ind.shape[0]:
34 |             topK = ind[:finput_K]  # take exactly the top-K users
35 |         else:
36 |             topK = ind
37 |         ui_matrix[topK, i+train_item_num] = ui_matrix_test[topK, i]
38 |         ui_matrix_test[topK, i] = 0
39 | 
40 |     # Matrix Factorization
41 |     nonzero = scipy.sparse.find(ui_matrix)
42 |     train_lst = []
43 |     for uid, itemid, rating in zip(nonzero[0], nonzero[1], nonzero[2]):
44 |         train_lst.append((uid, itemid, float(rating)))
45 |     MF = MatrixFactorization(usernum=ui_matrix.shape[0], itemnum=ui_matrix.shape[1])
46 |     try:
47 |         user_profile, item_profile = MF.matrix_factorization(train_lst)
48 |     except Exception:
49 |         MF.end()
50 |         MF = MatrixFactorization(usernum=ui_matrix.shape[0], itemnum=ui_matrix.shape[1])  # restart Spark with the same dimensions, then retry
51 |         user_profile, item_profile = MF.matrix_factorization(train_lst)
52 |     pred_rating = np.dot(user_profile, item_profile[train_item_num:, :].T)
53 |     nonzero_num = ui_matrix_test.getnnz()
54 |     ui_matrix_test_arr = ui_matrix_test.toarray()
55 |     RMSE = np.sum(((ui_matrix_test_arr != 0)*(pred_rating - ui_matrix_test_arr))**2 / nonzero_num)**0.5
56 |     print("RMSE: %.4f"%RMSE)
57 |     MF.end()
58 | 
59 | 
60 | 
61 | 
62 | 
--------------------------------------------------------------------------------
/Baselines/Content-based Active Learning/read_write.py:
--------------------------------------------------------------------------------
1 | import shelve
2 | import pickle
3 | 
4 | 
5 | def write2file(dct, path):
6 |     with shelve.open(path, protocol=pickle.HIGHEST_PROTOCOL) as d:
7 |         d['content'] = dct
8 | 
9 | def readffile(path):
10 |     with shelve.open(path, protocol=pickle.HIGHEST_PROTOCOL) as d:
11 |         return d['content']
--------------------------------------------------------------------------------
/Baselines/Factorized Decision Tree/FactorizedDecisionTree.py:
--------------------------------------------------------------------------------
1 | from scipy.sparse import *
2 | import numpy as np
3 | 
4 | class DecisionTreeModel:
5 |     def __init__(self, source, depth_threshold=10, plambda=7, MSP_item=200, flag=False, rU=None, biasU=None, sum_cur_t=None, sum_2_cur_t=None, sum_cntt=None):
6 | 
7 |         self.sMatrix = source
8 |         self.depth_threshold = depth_threshold
9 |         self.plambda = plambda
10 |         self.MSP_item = MSP_item
11 |         self.real_item_num = self.sMatrix.shape[0]
12 |         self.global_mean = self.sMatrix.sum()/self.sMatrix.getnnz()
13 |         x = find(source)
14 |         itemset = x[0]
15 |         userset = x[1]
16 | 
17 |         #### Calculate rate of progress ####
18 |         self.cur_depth = 0
19 |         self.node_num = 0
20 |         self.cur_node = 0
21 |         for i in range(self.depth_threshold):
22 |             self.node_num += 3 ** i
23 | 
24 |         #### Initiate Tree, lr_bound ####
25 |         self.tree = list(range(self.sMatrix.shape[1]))
26 |         self.split_item = []
27 |         self.lr_bound
= {'0': [[0, len(self.tree) - 1]]} 28 | 29 | #### Generate rU #### 30 | if flag == False: 31 | self.rU = {} 32 | num_ratings = len(userset) 33 | i = 0 34 | for itemid, userid in zip(itemset, userset): 35 | # put approximate 5000 user in each file. Divide user num with 5000. 36 | if i%100000 == 0: 37 | print("%.2f%%" %(100 * i/num_ratings)) 38 | i += 1 39 | self.rU.setdefault(userid, {})[itemid] = int(source[itemid, userid]) 40 | print("rU Generation DONE") 41 | 42 | #### Generate bias, sum_cur_t, sum_2_cur_t, sum_cntt #### 43 | self.biasU = np.zeros(self.sMatrix.shape[1]) 44 | self.sum_cur_t = np.zeros(self.real_item_num) 45 | self.sum_2_cur_t = np.zeros(self.real_item_num) 46 | self.sum_cntt = np.zeros(self.real_item_num) 47 | i = 0 48 | for userid in self.tree: 49 | if i % 50000 == 0: 50 | print("%.2f%%" % (100 * i / (0.75 * 480189))) 51 | i += 1 52 | 53 | self.biasU[userid] = (self.sMatrix.getcol(userid).sum() \ 54 | + self.plambda * self.global_mean) / \ 55 | (self.plambda + self.sMatrix.getcol(userid).getnnz()) 56 | user_all_rating_id = self.sMatrix.getcol(userid).nonzero()[0] 57 | user_all_rating = find(self.sMatrix.getcol(userid))[2] 58 | self.sum_cur_t[user_all_rating_id[:]] += user_all_rating[:] - self.biasU[userid] 59 | self.sum_2_cur_t[user_all_rating_id[:]] += (user_all_rating[:] - self.biasU[userid]) ** 2 60 | self.sum_cntt[user_all_rating_id[:]] += 1 61 | print("bias, sum_cur_t, sum_2_cur_t Generation DONE") 62 | else: 63 | self.rU = rU 64 | self.biasU = biasU 65 | self.sum_cur_t = sum_cur_t 66 | self.sum_2_cur_t = sum_2_cur_t 67 | self.sum_cntt = sum_cntt 68 | 69 | # initialize 70 | self.item_size = self.sMatrix.shape[0] 71 | self.user_size = len(self.tree) 72 | self.MPS = [] 73 | print("Initiation DONE!") 74 | 75 | 76 | def calculate_error(self, sumt, sumt_2, cntt): 77 | ''' Calculate error for one item-split in one node ''' 78 | Error_i = np.sum(sumt_2 - (sumt ** 2) / (cntt + 1e-9)) 79 | return Error_i 80 | 81 | def generate_decision_tree(self, lr_bound_for_node, chosen_id): 82 | 83 | #### Show Rating Progress #### 84 | for i in range(self.cur_depth - 1): 85 | print("┃", end="") 86 | print("┏", end="") 87 | self.cur_node += 1 88 | print("Current depth: " + str(self.cur_depth) + " %.2f%%" % (100 * self.cur_node / self.node_num)) 89 | 90 | #### Terminate #### 91 | self.cur_depth += 1 92 | if self.cur_depth >= self.depth_threshold or len(chosen_id) == self.item_size: 93 | return 94 | 95 | #### Choose Most Popular Items of This Node #### 96 | num_rec = np.zeros(self.item_size) 97 | for userid in self.tree[lr_bound_for_node[0]:(lr_bound_for_node[1] + 1)]: 98 | user_all_rating_id = np.array(list(self.rU[userid].keys())) 99 | num_rec[user_all_rating_id] += 1 100 | MPS_item_id = list(np.argsort(-num_rec)[:]) 101 | for item_id in chosen_id: 102 | MPS_item_id.remove(item_id) 103 | MPS_item_id = MPS_item_id[:self.MSP_item] 104 | 105 | #### Find optimum item to split #### 106 | min_sumtL, min_sumtD, min_sumtL_2, min_sumtD_2, min_sumtU, min_sumtU_2, Error = {}, {}, {}, {}, {}, {}, {} 107 | min_Error = "None" 108 | for itemid in MPS_item_id: 109 | if itemid in chosen_id: 110 | continue 111 | ''' 112 | user_rating_item_in_nodet: np.array([ [uid01, rating01], [uid02, rating02], ... 
]) 113 | to find all users in node t who rates item i 114 | ''' 115 | 116 | user_rating_item_in_nodet = np.array([[userid, self.rU[userid][itemid]] for userid in 117 | self.tree[lr_bound_for_node[0]:(lr_bound_for_node[1] + 1)] if 118 | itemid in self.rU[userid]]) 119 | sumt = np.zeros((self.item_size, 3)) 120 | sumt_2 = np.zeros((self.item_size, 3)) 121 | cntt = np.zeros((self.item_size, 3)) 122 | for user in user_rating_item_in_nodet: 123 | ''' user_all_rating: array [ [itemid11, rating11], [itemid12, rating12], ... ] ''' 124 | user_all_rating_id = np.array(list(self.rU[user[0]].keys())) 125 | user_all_rating = np.array(list(self.rU[user[0]].values())) 126 | #### calculate sumtL for node LIKE #### 127 | if user[1] > 3: 128 | # split.setdefault(itemid, []).append(user[0]) 129 | sumt[user_all_rating_id[:], 0] += user_all_rating[:] - self.biasU[user[0]] 130 | sumt_2[user_all_rating_id[:], 0] += (user_all_rating[:] - self.biasU[user[0]]) ** 2 131 | cntt[user_all_rating_id[:], 0] += 1 132 | #### calculate sumtD for node DISLIKE #### 133 | elif user[1] <= 3: 134 | sumt[user_all_rating_id[:], 1] += user_all_rating[:] - self.biasU[user[0]] 135 | sumt_2[user_all_rating_id[:], 1] += (user_all_rating[:] - self.biasU[user[0]]) ** 2 136 | cntt[user_all_rating_id[:], 1] += 1 137 | 138 | #### calculate sumtU for node UNKNOWN #### 139 | sumt[:, 2] = self.sum_cur_t[:] - sumt[:, 0] - sumt[:, 1] 140 | sumt_2[:, 2] = self.sum_2_cur_t[:] - sumt_2[:, 0] - sumt_2[:, 1] 141 | cntt[:, 2] = self.sum_cntt[:] - cntt[:, 0] - cntt[:, 1] 142 | Error[itemid] = self.calculate_error(sumt, sumt_2, cntt) 143 | if min_Error == "None" or Error[itemid] < min_Error: 144 | min_sumt = sumt 145 | min_sumt_2 = sumt_2 146 | min_cntt = cntt 147 | min_Error = Error[itemid] 148 | #### Find optimum split-item #### 149 | optimum_itemid = min(Error, key=Error.get) 150 | if len(self.split_item) == self.cur_depth - 1: 151 | self.split_item.append([optimum_itemid]) 152 | else: 153 | self.split_item[self.cur_depth - 1].append(optimum_itemid) 154 | chosen_id.append(optimum_itemid) 155 | # print(Error) 156 | # print("split item found!") 157 | # print(optimum_itemid) 158 | #### sort tree #### 159 | self.lr_bound.setdefault(str(self.cur_depth), []).append([]) # for LIKE 160 | self.lr_bound[str(self.cur_depth)].append([]) # for DISLIKE 161 | self.lr_bound[str(self.cur_depth)].append([]) # for UNKNOWN 162 | listU, listL, listD = [], [], [] 163 | for userid in self.tree[lr_bound_for_node[0]:(lr_bound_for_node[1] + 1)]: 164 | if optimum_itemid not in self.rU[userid]: 165 | listU.append(userid) 166 | elif self.rU[userid][optimum_itemid] > 3: 167 | listL.append(userid) 168 | elif self.rU[userid][optimum_itemid] <= 3: 169 | listD.append(userid) 170 | self.tree[lr_bound_for_node[0]:(lr_bound_for_node[1] + 1)] = listL + listD + listU 171 | self.lr_bound[str(self.cur_depth)][-3] = [lr_bound_for_node[0], 172 | lr_bound_for_node[0] + len(listL) - 1] # for LIKE 173 | self.lr_bound[str(self.cur_depth)][-2] = [lr_bound_for_node[0] + len(listL), 174 | lr_bound_for_node[0] + len(listL) + len(listD) - 1] # for DISLIKE 175 | self.lr_bound[str(self.cur_depth)][-1] = [lr_bound_for_node[0] + len(listL) + len(listD), 176 | lr_bound_for_node[0] + len(listL) + len(listD) + len(listU) - 1] # for UNKNOWN 177 | 178 | #### Generate Subtree of Node LIKE #### 179 | self.sum_cur_t = min_sumt[:, 0] 180 | self.sum_2_cur_t = min_sumt_2[:, 0] 181 | self.sum_cntt = min_cntt[:, 0] 182 | self.generate_decision_tree(self.lr_bound[str(self.cur_depth)][-3], chosen_id[:]) 183 | 
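        # --------------------------------------------------------------
        # (added note) calculate_error() above scores a candidate split
        # from the running sums alone, via the identity
        #
        #     sum((r - mean(r))**2) == sum(r**2) - sum(r)**2 / len(r)
        #
        # Toy check (illustration only):
        #
        #     import numpy as np
        #     r = np.array([4.0, 5.0, 2.0])
        #     assert np.isclose(np.sum((r - r.mean()) ** 2),
        #                       np.sum(r ** 2) - np.sum(r) ** 2 / len(r))
        #
        # The cur_depth decrement on the next line undoes the increment
        # made at the top of the recursive call, so the DISLIKE and
        # UNKNOWN siblings are generated from the same depth as LIKE.
        # --------------------------------------------------------------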
self.cur_depth -= 1 184 | 185 | #### Generate Subtree of Node DISLIKE #### 186 | self.sum_cur_t = min_sumt[:, 1] 187 | self.sum_2_cur_t = min_sumt_2[:, 1] 188 | self.sum_cntt = min_cntt[:, 1] 189 | self.generate_decision_tree(self.lr_bound[str(self.cur_depth)][-2], chosen_id[:]) 190 | self.cur_depth -= 1 191 | 192 | #### Generate Subtree of Node UNKNOWN #### 193 | self.sum_cur_t = min_sumt[:, 2] 194 | self.sum_2_cur_t = min_sumt_2[:, 2] 195 | self.sum_cntt = min_cntt[:, 2] 196 | self.generate_decision_tree(self.lr_bound[str(self.cur_depth)][-1], chosen_id[:]) 197 | self.cur_depth -= 1 198 | 199 | def build_model(self): 200 | #### Construct the tree & get the prediction model #### 201 | self.generate_decision_tree(self.lr_bound['0'][0], []) -------------------------------------------------------------------------------- /Baselines/Factorized Decision Tree/MatrixFactorization.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkConf 2 | from pyspark import SparkContext 3 | from pyspark.mllib.recommendation import ALS 4 | 5 | class MatrixFactorization: 6 | def __init__(self, maxIter=15, regParam=0.01, rank=10): 7 | self.maxIter = maxIter 8 | self.regParam = regParam 9 | self.rank = rank 10 | conf = SparkConf().setAppName("appName").setMaster("local[*]") 11 | conf.set("spark.driver.memory","16g") 12 | conf.set("spark.executor.memory","16g") 13 | self.spark = SparkContext(conf=conf) 14 | print("New SparkSession started...") 15 | 16 | def change_parameter(self, regParam): 17 | self.regParam = regParam 18 | 19 | def matrix_factorization(self, train_lst): 20 | ratings = self.spark.parallelize(train_lst) 21 | model = ALS.train(ratings, self.rank, seed=10, \ 22 | iterations=self.maxIter, \ 23 | lambda_=self.regParam) 24 | print("MF DONE") 25 | userFeatures = sorted(model.userFeatures().collect(), key=lambda d: d[0], reverse=False) 26 | productFeatures = sorted(model.productFeatures().collect(), key=lambda d: d[0], reverse=False) 27 | itemProfile = {each[0]: each[1].tolist() for each in userFeatures} 28 | userProfile = {each[0]: each[1].tolist() for each in productFeatures} 29 | 30 | return userProfile, itemProfile 31 | 32 | def end(self): 33 | self.spark.stop() 34 | print("SparkSession stopped.") -------------------------------------------------------------------------------- /Baselines/Factorized Decision Tree/factorized_decision_tree.py: -------------------------------------------------------------------------------- 1 | import time 2 | import sys 3 | import klepto 4 | import shelve 5 | import pickle 6 | import numpy as np 7 | from scipy.sparse import * 8 | import matplotlib.pyplot as plt 9 | import read_write as rw 10 | from FactorizedDecisionTree import DecisionTreeModel 11 | from MatrixFactorization import MatrixFactorization 12 | 13 | 14 | if (__name__ == '__main__'): 15 | # data path 16 | finput_dataset = sys.argv[1] 17 | finput_depth = (int)(sys.argv[2]) 18 | 19 | ui_matrix_train_csc = load_npz('../../Data/'+ finput_dataset + '/iu_sparse_matrix_train.npz').tocsc().T 20 | ui_matrix_test_csc = load_npz('../../Data/'+ finput_dataset + '/iu_sparse_matrix_test.npz').tocsc().T 21 | print("file load DONE") 22 | 23 | # build tree 24 | dtmodel = DecisionTreeModel(ui_matrix_train_csc, finput_depth) 25 | dtmodel.build_model() 26 | 27 | # parameter training 28 | split_item = dtmodel.split_item 29 | lr_bound = dtmodel.lr_bound 30 | tree = dtmodel.tree 31 | depth_threshold = dtmodel.depth_threshold 32 | lambda_list = [0.005, 0.025, 0.05, 0.075, 
0.10, 0.11, 0.12, 0.13, 0.14, 0.15] 33 | MF = MatrixFactorization() 34 | min_rmse_list = [] 35 | rmst_dict = {} 36 | prediction_model = {} 37 | for level in range(depth_threshold): 38 | #### Designate desired depth here if required #### 39 | # if level < 6: 40 | # continue 41 | #### ---------------------------------------- #### 42 | level = str(level) 43 | print("level:", level) 44 | prediction_model.setdefault(level, {}) 45 | train_lst = [] 46 | for pseudo_item_bound, itemid in zip(lr_bound[level], range(len(lr_bound[level]))): 47 | if pseudo_item_bound[0] > pseudo_item_bound[1]: 48 | continue 49 | pseudo_item_lst = tree[pseudo_item_bound[0]:(pseudo_item_bound[1] + 1)] 50 | pseudo_matrix = np.array(ui_matrix_train_csc[:, pseudo_item_lst].sum(axis=1))[:,0] / \ 51 | (ui_matrix_train_csc[:, pseudo_item_lst].getnnz(axis=1)+1e-9) 52 | train_lst += [(itemid, userid, float(pseudo_matrix[userid])) \ 53 | for userid in range(pseudo_matrix.shape[0]) if pseudo_matrix[userid]] 54 | 55 | print("Rating Number of level " + level + ": " + str(len(train_lst))) 56 | 57 | #### Train MF and Do validation #### 58 | min_RMSE = -1 59 | for plambda in lambda_list: 60 | #### Designate desired lambda here if required #### 61 | 62 | #### ----------------------------------------- #### 63 | print("Current plambda: " + str(plambda)) 64 | MF.change_parameter(plambda) 65 | user_profile, item_profile = MF.matrix_factorization(train_lst) 66 | # prediction_model[level]['upro'], prediction_model[level]['ipro'], prediction_model[level]['plambda'] \ 67 | # = user_profile, item_profile, plambda 68 | prediction_model[level]['P'], prediction_model[level]['plambda'] \ 69 | = np.dot(np.array(list(user_profile.values())), np.array(list(item_profile.values())).T), plambda 70 | prediction_model[level]['ipro'] = list(item_profile.keys()) 71 | P_test = np.zeros(ui_matrix_test_csc.shape) 72 | rating_matrix_test_unqueried = ui_matrix_test_csc.toarray() 73 | for itemid in range(ui_matrix_test_csc.shape[1]): 74 | # if userid % 100 == 0: 75 | # print("%.2f%%" % (100 * userid / rating_matrix_csc_test.shape[1])) 76 | pred_index = 0 77 | final_level = 0 78 | rated_user = [] 79 | user_all_ratings = ui_matrix_test_csc[:,itemid].nonzero()[0] 80 | for depth in range(int(level)): 81 | if split_item[depth][pred_index] not in user_all_ratings: 82 | tmp_pred_index = 3*pred_index + 2 83 | if tmp_pred_index in prediction_model[str(depth+1)]['ipro']: 84 | final_level += 1 85 | pred_index = tmp_pred_index 86 | else: 87 | break 88 | elif ui_matrix_test_csc[split_item[depth][pred_index], itemid] > 3: 89 | tmp_pred_index = 3*pred_index 90 | if tmp_pred_index in prediction_model[str(depth+1)]['ipro']: 91 | rated_user.append(split_item[depth][pred_index]-1) 92 | final_level += 1 93 | pred_index = tmp_pred_index 94 | else: 95 | break 96 | elif ui_matrix_test_csc[split_item[depth][pred_index], itemid] <= 3: 97 | tmp_pred_index = 3*pred_index + 1 98 | if tmp_pred_index in prediction_model[str(depth+1)]['ipro']: 99 | rated_user.append(split_item[depth][pred_index]-1) 100 | final_level += 1 101 | pred_index = tmp_pred_index 102 | else: 103 | break 104 | pred_index = prediction_model[str(final_level)]['ipro'].index(pred_index) 105 | P_test[:, itemid] = prediction_model[str(final_level)]['P'][:, pred_index] 106 | rating_matrix_test_unqueried[rated_user, itemid] = 0 107 | 108 | rating_matrix_test_unqueried = csc_matrix(rating_matrix_test_unqueried) 109 | P_test = (rating_matrix_test_unqueried!=0).multiply(P_test) 110 | dif = P_test - rating_matrix_test_unqueried 111 | 
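            # ----------------------------------------------------------
            # (added note) the RMSE on the next line is masked: P_test
            # was already multiplied by (rating_matrix_test_unqueried != 0),
            # so dif is nonzero only at observed, still-unqueried test
            # ratings and RMSE = sqrt(sum(dif**2) / #observed).
            # Dense toy sketch of the same masked RMSE (illustration only):
            #
            #     import numpy as np
            #     R = np.array([[4., 0.], [0., 2.]])   # 0 = unobserved
            #     P = np.array([[3.5, 1.], [5., 2.5]])
            #     mask = R != 0
            #     rmse = ((((P - R) * mask) ** 2).sum() / mask.sum()) ** 0.5   # -> 0.5
            # ----------------------------------------------------------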
RMSE = ( dif.multiply(dif).sum() / (rating_matrix_test_unqueried!=0).sum() )**0.5
112 |             print("Current RMSE: " + str(RMSE))
113 |             rmst_dict.setdefault(level, []).append(RMSE)
114 |             if min_RMSE == -1 or RMSE < min_RMSE:
115 |                 min_RMSE = RMSE
116 |                 min_item_profile = prediction_model[level]['ipro']
117 |                 min_P = prediction_model[level]['P']
118 |                 min_lambda = prediction_model[level]['plambda']
119 | 
120 |         min_rmse_list.append(min_RMSE)
121 |         prediction_model[level]['ipro'] = min_item_profile
122 |         prediction_model[level]['P'] = min_P
123 |         prediction_model[level]['plambda'] = min_lambda
124 |         print("min RMSE: " + str(min(rmst_dict[level])))
--------------------------------------------------------------------------------
/Baselines/Factorized Decision Tree/read_write.py:
--------------------------------------------------------------------------------
1 | import shelve
2 | import pickle
3 | 
4 | 
5 | def write2file(dct, path):
6 |     with shelve.open(path, protocol=pickle.HIGHEST_PROTOCOL) as d:
7 |         d['content'] = dct
8 | 
9 | def readffile(path):
10 |     with shelve.open(path, protocol=pickle.HIGHEST_PROTOCOL) as d:
11 |         return d['content']
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Active Learning for Recommender Systems
2 | 
3 | ### 0. Overview
4 | 
5 | - Dataset: [Amazon product data](http://jmcauley.ucsd.edu/data/amazon/)
6 | - Environment: Python, Matlab, Markdown
7 | 
8 | ### 1. Data Preprocessing
9 | 
10 | - Item information
11 |   - Extract the title and description fields from the dataset
12 |   - Command: `python item_information.py [file1, ..., file3]`
13 | 
14 | - User-item rating information
15 | 
16 |   - Extract user-item ratings and split the items into a train set and a test set
17 |   - Take the users appearing in the train set as the complete user set, to avoid users that have no rating in the train set
18 |   - Command: `python user_information.py [file1, ..., file7]`
19 | 
20 | - Item similarity generation
21 | 
22 |   - title: tokenization + LDA topic model (topic number = 15)
23 |   - description: tokenization + LDA topic model (topic number = 15)
24 |   - price is not used (too many missing values)
25 |   - category is not used (all items belong to the same category)
26 |   - Command: `python item_similarity.py [topic number, file1, ..., file6]`
27 | 
28 | - Weights for the item description and title similarities
29 | 
30 |   - Non-linear regression
31 |   - Similarity(i1, i2) = weight1 * S_title(i1, i2) + weight2 * S_description(i1, i2)
32 |   - Commands:
33 |     - `python similarity_parameters.py [file1, ..., file7]`
34 |     - `fitnlm(path, param1, param2)`
35 | 
36 | - User similarity generation
37 | 
38 |   - Rating similarity
39 |   - Command: `python user_similarity.py [file1, ..., file3]`
40 | 
41 | - User clustering
42 | 
43 |   - Users are clustered with K-medoids, using user similarity as the distance measure
44 |   - Main problem: because ratings are sparse, the distance between many users is 0
45 |   - Command: `python user_clustering.py input_file number_of_clusters output_file`
46 | 
47 | - Preparation for tree building
48 | 
49 |   - Precompute every user cluster's average rating for each item, so it can be looked up directly during tree construction
50 |   - Build the combined similarity matrix from the parameters fitted by the non-linear regression
51 |   - Command: `python buildtree_preparation.py input_file init_ptitle init_pdescrip output_file`
52 | 
53 | ### 2. Tree Building and Prediction
54 | 
55 | - Tree generation:
56 |   - A ternary tree whose children correspond to dislike, mediocre and like
57 |   - The generated node information is kept in the two variables *self.tree* and *self.node_interval*
58 | - Building the prediction model:
59 |   - ALS Matrix Factorization implemented with Spark's mllib package
60 |   - Generate latent vectors for the pseudo items (one per node) and for the users (computed for every level)
61 | - Rating prediction:
62 |   - For every test item, walk down from the root of the tree and take the latent vector of the target leaf node as the item's feature vector
63 |   - Predict ratings as the dot product of this feature vector with all item feature vectors, and compute the RMSE (for every level)
64 | - Command: `python build_tree.py [input_file1, ..., input_file5] desired_depth`
65 | 
66 | ### 3. Running
67 | 
68 | - A *Python* script runs all the steps above: `python script.py`
69 | - The dataset name (*dataset*) at the top of the script has to be changed accordingly
70 | 
71 | ### 4. Comparison Experiments
72 | 
73 | - FDT (Factorized Decision Tree)
74 |   - `python factorized_decision_tree.py dataset depth` (dataset is the dataset name; depth sets the height of the tree)
75 |   - **Input:** an *I\*U* matrix => *new-user problem*
76 |   - **Input:** a *U\*I* matrix => *new-item problem*
77 | - CAL (Content-based Active Learning)
78 |   - `python content_based_active_learning.py dataset K` (dataset is the dataset name; K sets how many Top-K users are queried)
79 | - CBCF (Content-based Collaborative Filtering)
80 | 
81 | ### 5. Current Problems
82 | 
83 | 1. On the Amazon dataset the first level of the tree predicts best; possible reasons:
84 | 
85 |    - The dataset is so sparse that most users have essentially a single rating; at the first level the pseudo item used for matrix factorization is densely rated and works well, and the results get worse further down the tree.
86 |    - The node splits are very unbalanced, so the chosen pseudo items are not good.
87 |    - Possible fixes:
88 |      - Split every node into evenly sized children
89 |      - Use similarity-fitted ratings instead of average ratings as the MF input
90 | 
91 | 2. *Memory Error* when computing *item similarity* on the Automotive collection, which contains more than 300,000 items
92 | 
93 |    - Fix: keep only items and users with more than 5 ratings
--------------------------------------------------------------------------------
/Step1-Preprocessing/Commands.txt:
--------------------------------------------------------------------------------
1 | python item_information.py "../Dataset/All_Beauty/meta_All_Beauty.json.gz" "Data/title" "Data/description"
2 | python user_information.py "../Dataset/All_Beauty/reviews_All_Beauty.json.gz" "Data/title" "Data/iu_sparse_matrix_train.npz" "Data/iu_sparse_matrix_test.npz" "Data/uid" "Data/train_item_id" "Data/test_item_id"
3 | python item_similarity.py 15 "Data/title" "Data/description" "Data/train_item_id" "Data/test_item_id" "Data/title_similarity_matrix" "Data/description_similarity_matrix"
4 | python similarity_parameters.py "Data/title_similarity_matrix" "Data/description_similarity_matrix" "Data/train_item_id" "Data/test_item_id" "Data/iu_sparse_matrix_train.npz" "Data/iu_sparse_matrix_test.npz" "Data/nonlinreg.mat"
5 | python user_similarity.py "Data/uid" "Data/iu_sparse_matrix_train.npz" "Data/user_similarity_matrix"
6 | python user_clustering.py "Data/user_similarity_matrix" 200 "Data/user_cluster_set"
7 | python buildtree_preparation.py "Data/iu_sparse_matrix_train.npz" "Data/iu_sparse_matrix_test.npz" "Data/title_similarity_matrix" "Data/description_similarity_matrix" "Data/user_cluster_set" "Data/train_item_id" "Data/test_item_id" "Data/nonlinreg" 1.0 1.0 "Data/iuclst_rating_matrix_train" "Data/iuclst_rating_matrix_test" "Data/item_sim_matrix"
8 | python build_tree.py "../Step1-Preprocessing/Data/iu_sparse_matrix_train.npz" "../Step1-Preprocessing/Data/iu_sparse_matrix_test.npz" "../Step1-Preprocessing/Data/iuclst_rating_matrix_train" "../Step1-Preprocessing/Data/iuclst_rating_matrix_test" "../Step1-Preprocessing/Data/user_cluster_set" 5
--------------------------------------------------------------------------------
/Step1-Preprocessing/LDA.py:
--------------------------------------------------------------------------------
1 | import string
2 | from nltk.corpus import stopwords
3 | import warnings
4 | warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
5 | from gensim import corpora, models, similarities
6 | 
7 | def text_preprocess(text):
8 |     # split into words
9 |     from nltk.tokenize import word_tokenize
10 |     tokens = word_tokenize(text)
11 |     # convert to lower case
12 |     tokens = [w.lower() for w in tokens]
13 |     # remove punctuation from each word
14 |     import string
15 |     table = str.maketrans('', '', string.punctuation)
16 |     stripped = [w.translate(table) for w in tokens]
17 |     # remove remaining tokens that are not alphabetic
18 |     words = [word for word in stripped if word.isalpha()]
19 |     # filter out stop words
20 |     from nltk.corpus import stopwords
21 |     stop_words = set(stopwords.words('english'))
22 |     words = [w for w in words if w not in stop_words]
23 |     # stemming of words
24 |     from nltk.stem.porter import PorterStemmer
25 |     porter = PorterStemmer()
26 |     stemmed = [porter.stem(word) for word in words]
27 |     # delete words with length 1
28 |     final = [word for word in stemmed if len(word) != 1]
29 |     return final
30 | 
31 | 
32 | def texts_preprocess(input_dict):
33 |     output_dict = {}
34 |     texts = list(input_dict.values())
35 |     for ind in range(len(texts)):
36 |         texts[ind] = text_preprocess(texts[ind])
37 |     from collections import defaultdict
38 |     frequency = defaultdict(int)
39 |     for text in texts:
40 |         for token in text:
41 |             frequency[token] += 1
42 |     output_dict = {key: [token for token in texts[ind] if frequency[token] > 1] for key, ind in zip(input_dict.keys(), range(len(texts)))}
43 |     return output_dict
44 | 
45 | 
46 | def LDA(texts, index_lst, num_topics=15):
47 |     # build a dictionary from the texts
48 |     dictionary = corpora.Dictionary(texts)
49 |     # print(dictionary)
50 |     # V = len(dictionary)
51 | 
52 |     # with the dictionary, convert every document into its bag-of-words index form
53 |     corpus = [dictionary.doc2bow(text) for text in texts]
54 |     corpus_selected = [corpus[index] for index in index_lst]
55 |     # print line by line
56 |     # for line in corpus:
57 |     #     print(line)
58 | 
59 |     # print('LDA Model:')
60 |     # train the model
61 |     lda = models.LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary,
62 |                           alpha='auto', eta='auto', minimum_probability=0.001)
63 | 
64 |     # compute similarities
65 |     index = similarities.MatrixSimilarity(lda[corpus_selected])
66 |     return index
--------------------------------------------------------------------------------
/Step1-Preprocessing/buildtree_preparation.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import numpy as np
4 | import scipy.sparse
5 | from scipy.sparse import csr_matrix, find
6 | import read_write as rw
7 | import matlab.engine
8 | 
9 | '''
10 | finput_iu_rating_matrix_train = "Data/iu_sparse_matrix_train.npz"
11 | finput_iu_rating_matrix_test = "Data/iu_sparse_matrix_test.npz"
12 | finput_title_sim_matrix = "Data/title_similarity_matrix"
13 | finput_description_sim_matrix = "Data/description_similarity_matrix"
14 | finput_user_cluster_set = "Data/user_cluster_set"
15 | finput_train_item_id = "Data/train_item_id"; finput_test_item_id = "Data/test_item_id"
16 | finput_nonlinreg = "Data/nonlinreg"
17 | finput_init_tp = 1.0; finput_init_dp = 1.0
18 | foutput_iuclst_rating_matrix_train = "Data/iuclst_rating_matrix_train"
19 | foutput_iuclst_rating_matrix_test = "Data/iuclst_rating_matrix_test"
20 | foutput_item_sim_matrix = "Data/item_sim_matrix"
21 | '''
22 | 
23 | if (__name__ == '__main__'):
24 |     #### data path
25 |     finput_iu_rating_matrix_train = sys.argv[1]
26 |     finput_iu_rating_matrix_test = sys.argv[2]
27 |     finput_title_sim_matrix = sys.argv[3]
28 |     finput_description_sim_matrix = sys.argv[4]
29 |     finput_user_cluster_set = sys.argv[5]
30 |     finput_train_item_id = sys.argv[6]
31 |     finput_test_item_id = sys.argv[7]
32 |     finput_nonlinreg = sys.argv[8]
33 |     finput_init_tp = float(sys.argv[9])
34 |     finput_init_dp = float(sys.argv[10])
35 |     foutput_iuclst_rating_matrix_train = sys.argv[11]
36 |     foutput_iuclst_rating_matrix_test = sys.argv[12]
37 |     foutput_item_sim_matrix = sys.argv[13]
38 | 
39 |     # load data
40 |     iu_rating_matrix_train = scipy.sparse.load_npz(finput_iu_rating_matrix_train)
41 |     iu_rating_matrix_test = scipy.sparse.load_npz(finput_iu_rating_matrix_test)
42 |     title_sim_matrix = rw.readffile(finput_title_sim_matrix)
43 |     description_sim_matrix = rw.readffile(finput_description_sim_matrix)
44 |     user_cluster_set = rw.readffile(finput_user_cluster_set)
45 |     train_item_id = rw.readffile(finput_train_item_id)
46 |     test_item_id = rw.readffile(finput_test_item_id)
47 | 
48 |     # run matlab script and get parameters for title and description
49 | 
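    # ------------------------------------------------------------------
    # (added note) my_fitnlm.m fits the weights of the combined item
    # similarity by non-linear regression on the model
    #
    #     rating ~ (b1*st_r + b2*sd_r) / (b1*st + b2*sd)
    #
    # If MATLAB is unavailable, a hedged SciPy sketch of the same fit
    # (hypothetical substitute, not what this pipeline actually runs;
    # assumes the .mat file written by similarity_parameters.py):
    #
    #     import scipy.io as sio
    #     from scipy.optimize import curve_fit
    #
    #     def model(X, b1, b2):
    #         st_r, sd_r, st, sd = X
    #         return (b1 * st_r + b2 * sd_r) / (b1 * st + b2 * sd)
    #
    #     m = sio.loadmat(finput_nonlinreg + ".mat")
    #     X = (m["st_r"][0], m["sd_r"][0], m["st"][0], m["sd"][0])
    #     y = m["ratings"][0].astype(float)
    #     (theta1, theta2), _ = curve_fit(model, X, y,
    #                                     p0=[finput_init_tp, finput_init_dp])
    # ------------------------------------------------------------------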
print("call matlab script....") 50 | cur_path = os.getcwd() 51 | os.chdir("D:\GitCode\Dissertation\Step1-Preprocessing") 52 | eng = matlab.engine.start_matlab() 53 | x = eng.my_fitnlm(finput_nonlinreg, finput_init_tp, finput_init_dp, nargout=3) 54 | theta1, theta2, RMSE = x[0], x[1], x[2] 55 | eng.quit() 56 | sim_matrix = theta1*title_sim_matrix + theta2*description_sim_matrix 57 | os.chdir(cur_path) 58 | rw.write2file(sim_matrix, foutput_item_sim_matrix) 59 | print("theta1 = ", theta1) 60 | print("theta2 = ", theta2) 61 | print("RMSE = ", RMSE) 62 | print("matlab finished") 63 | 64 | # extract similarity matrix for training and test item 65 | # resort_id = list(train_item_id.keys()) + list(test_item_id.keys()) 66 | sim_matrix_train = sim_matrix.loc[list(train_item_id.keys()), list(train_item_id.keys())].values 67 | sim_matrix_test = sim_matrix.loc[list(test_item_id.keys()), list(test_item_id.keys())].values 68 | 69 | # user cluster - item rating matrix 70 | iuclst_rating_matrix_train = np.zeros((len(train_item_id), len(user_cluster_set))) 71 | iuclst_rating_matrix_test = np.zeros((len(test_item_id), len(user_cluster_set))) 72 | item_in_node_train = list(range(iu_rating_matrix_train.shape[0])) 73 | item_in_node_test = list(range(iu_rating_matrix_test.shape[0])) 74 | for ind, user_cluster in zip(range(len(user_cluster_set)), user_cluster_set): 75 | print("user cluster: %d / %d"%(ind+1, len(user_cluster_set)), end="\r") 76 | user_cluster_size = len(user_cluster) 77 | sub_rating_matrix = iu_rating_matrix_train[np.ix_(item_in_node_train, user_cluster)].T.toarray() # user number * training item number 78 | sub_rating_matrix_pred = (np.dot(sub_rating_matrix, sim_matrix_train) / (1e-9+np.dot(sub_rating_matrix != 0, sim_matrix_train))) 79 | iuclst_rating_matrix_train[:, ind] = np.sum(sub_rating_matrix + 0.01*(sub_rating_matrix == 0) * sub_rating_matrix_pred, axis=0) / np.sum((sub_rating_matrix == 0)*0.01 + (sub_rating_matrix != 0)*1, axis=0) 80 | sub_rating_matrix = iu_rating_matrix_test[np.ix_(item_in_node_test, user_cluster)].T.toarray() # user number * test item number 81 | sub_rating_matrix_pred = (np.dot(sub_rating_matrix, sim_matrix_test) / (1e-9+np.dot(sub_rating_matrix != 0, sim_matrix_test))) 82 | iuclst_rating_matrix_test[:, ind] = np.sum(sub_rating_matrix + 0.01*(sub_rating_matrix == 0) * sub_rating_matrix_pred, axis=0) / np.sum((sub_rating_matrix == 0)*0.01 + (sub_rating_matrix != 0)*1, axis=0) 83 | print("\nuser cluster/item rating matrix generated done!") 84 | 85 | rw.write2file(iuclst_rating_matrix_train, foutput_iuclst_rating_matrix_train) 86 | rw.write2file(iuclst_rating_matrix_test, foutput_iuclst_rating_matrix_test) 87 | print("file saved done!") 88 | 89 | -------------------------------------------------------------------------------- /Step1-Preprocessing/item_information.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pandas as pd 3 | import numpy as np 4 | import read2df as rdf 5 | import read_write as rw 6 | 7 | ''' 8 | Input: input path ("../Dataset/All_Beauty/meta_All_Beauty.json.gz") 9 | output path ("Data/title" && "Data/description") 10 | output: files 11 | ''' 12 | if (__name__ == '__main__'): 13 | #### data path 14 | finput = sys.argv[1] 15 | foutput_title = sys.argv[2] 16 | foutput_description = sys.argv[3] 17 | 18 | #### read data into dataframe 19 | df = rdf.getDF(finput) 20 | 21 | #### delete rows where title or description is nan 22 | dict_title = {} 23 | dict_description = {} 24 | subdf = 
df[~(df['title'].isin([np.nan]) | df['description'].isin([np.nan]))] 25 | for indexs in subdf.index: 26 | dict_title[subdf.loc[indexs]['asin']] = subdf.loc[indexs]['title'] 27 | dict_description[subdf.loc[indexs]['asin']] = subdf.loc[indexs]['description'] 28 | 29 | #### write generated dictionary into files 30 | rw.write2file(dict_title, foutput_title) 31 | rw.write2file(dict_description, foutput_description) 32 | print("Write Done!") 33 | print("Info: %d/%d"%(subdf.shape[0], df.shape[0])) -------------------------------------------------------------------------------- /Step1-Preprocessing/item_similarity.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pandas as pd 3 | import numpy as np 4 | import gzip 5 | import read_write as rw 6 | import LDA as lda 7 | 8 | ''' 9 | finput_title = "Data/title" 10 | finput_description = "Data/description" 11 | finput_train_item_id = "Data/train_item_id" 12 | finput_test_item_id = "Data/test_item_id" 13 | foutput_title_similarity = "Data/title_similarity_matrix" 14 | foutput_description_similarity = "Data/description_similarity_matrix" 15 | ''' 16 | 17 | if (__name__ == '__main__'): 18 | #### data path 19 | finput_topic_num = int(sys.argv[1]) 20 | finput_title = sys.argv[2] 21 | finput_description = sys.argv[3] 22 | finput_train_item_id = sys.argv[4] 23 | finput_test_item_id = sys.argv[5] 24 | foutput_title_similarity = sys.argv[6] 25 | foutput_description_similarity = sys.argv[7] 26 | 27 | #### read into item title and description information (dict: {id : content}) 28 | dict_title = rw.readffile(finput_title) 29 | dict_description = rw.readffile(finput_description) 30 | train_item_id = rw.readffile(finput_train_item_id) 31 | test_item_id = rw.readffile(finput_test_item_id) 32 | 33 | #### preprocess before LDA 34 | dict_title_preprocessed = lda.texts_preprocess(dict_title) 35 | dict_description_preprocessed = lda.texts_preprocess(dict_description) 36 | list_title_preprocessed = list(dict_title_preprocessed.values()) 37 | list_description_preprocessed = list(dict_description_preprocessed.values()) 38 | print("text preprocessed done!") 39 | 40 | #### generate item title and description similarity for selected items 41 | item_tt_id_lst = list(train_item_id.keys())+list(test_item_id.keys()) 42 | item_total_id_lst = list(dict_title.keys()) 43 | index_lst = [] 44 | for id in item_tt_id_lst: 45 | index_lst.append(item_total_id_lst.index(id)) 46 | title_similarity = lda.LDA(texts=list_title_preprocessed, index_lst=index_lst, num_topics=finput_topic_num) 47 | description_similarity = lda.LDA(texts=list_description_preprocessed, index_lst=index_lst, num_topics=finput_topic_num) 48 | print("lda similarity calculated done!") 49 | 50 | #### generate train/test item similarity matrix 51 | df_title_similarity_matrix = pd.DataFrame(np.array(title_similarity),index=item_tt_id_lst,columns=item_tt_id_lst) 52 | df_description_similarity_matrix = pd.DataFrame(np.array(description_similarity),index=item_tt_id_lst,columns=item_tt_id_lst) 53 | # train_item_id = rw.readffile(finput_train_item_id) 54 | # test_item_id = rw.readffile(finput_test_item_id) 55 | # #### title/train 56 | # df_title_similarity_matrix_train = df_title_similarity_matrix.loc[list(train_item_id.keys()), list(train_item_id.keys())] 57 | # #### title/test 58 | # df_title_similarity_matrix_test = df_title_similarity_matrix.loc[list(test_item_id.keys()), list(test_item_id.keys())] 59 | # #### description/train 60 | # 
df_description_similarity_matrix_train = df_description_similarity_matrix.loc[list(train_item_id.keys()), list(train_item_id.keys())] 61 | # #### description/test 62 | # df_description_similarity_matrix_test = df_description_similarity_matrix.loc[list(test_item_id.keys()), list(test_item_id.keys())] 63 | print("similarity matrix generated done!") 64 | 65 | #### write data into files 66 | rw.write2file(df_title_similarity_matrix, foutput_title_similarity) 67 | rw.write2file(df_description_similarity_matrix, foutput_description_similarity) 68 | print("file saved done!") -------------------------------------------------------------------------------- /Step1-Preprocessing/k_medoids.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def k_medoids(distance_matrix, K, max_iterations=20): 4 | user_num = distance_matrix.shape[0] 5 | random_index = np.random.permutation(user_num) 6 | centroids = np.sort(random_index[0:K]) 7 | for time in range(max_iterations): 8 | # Assign user to clusters 9 | print("K-medoids: %d / %d"%(time+1, max_iterations), end="\r") 10 | user_centroids_dist = distance_matrix[:, centroids] 11 | indices = np.argmin(user_centroids_dist, axis=1) 12 | indices = centroids[indices] 13 | indices[centroids] = centroids 14 | centroids_update = np.zeros(K, 'int32') 15 | # Find new medoids 16 | for i in range(K): 17 | cluster_list = (indices == centroids[i]) 18 | cluster = distance_matrix[np.ix_(cluster_list, cluster_list)] 19 | cluster_list_ = np.where(cluster_list != 0)[0] 20 | new_center = np.argmin(np.sum(cluster, axis=1), axis=0) 21 | centroids_update[i] = cluster_list_[new_center] 22 | centroids_update = np.sort(centroids_update) 23 | # Termination condition 24 | if (centroids == centroids_update).all(): 25 | print('Iteration stop') 26 | break 27 | else: 28 | centroids = centroids_update 29 | # Assign user to clusters 30 | user_cluster_set = [] 31 | for i in range(K): 32 | user_cluster_set.append(list(np.where((indices==centroids[i]) != 0)[0])) 33 | return user_cluster_set -------------------------------------------------------------------------------- /Step1-Preprocessing/my_fitnlm.m: -------------------------------------------------------------------------------- 1 | function [ptitle, pdescrip, RMSE] = my_fitnlm(path, b1, b2) 2 | load(path) 3 | X = [st_r', sd_r', st', sd']; 4 | y = single(ratings); 5 | modelfun = @(b,x)(b(1)*X(:,1)+b(2)*X(:,2)) ./ (b(1)*X(:,3) + b(2)*X(:,4)); 6 | beta0 = [b1,b2]; 7 | mdl = fitnlm(X, y, modelfun, beta0); 8 | ptitle = mdl.Coefficients.Estimate(1); 9 | pdescrip = mdl.Coefficients.Estimate(2); 10 | RMSE = mdl.RMSE; -------------------------------------------------------------------------------- /Step1-Preprocessing/read2df.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import pandas as pd 3 | 4 | def parse(path): 5 | g = gzip.open(path, 'rb') 6 | for l in g: 7 | yield eval(l) 8 | 9 | def getDF(path): 10 | i = 0 11 | df = {} 12 | for d in parse(path): 13 | df[i] = d 14 | i += 1 15 | return pd.DataFrame.from_dict(df, orient='index') 16 | -------------------------------------------------------------------------------- /Step1-Preprocessing/read_write.py: -------------------------------------------------------------------------------- 1 | import shelve 2 | import pickle 3 | 4 | 5 | def write2file(dct, path): 6 | with shelve.open(path, protocol=pickle.HIGHEST_PROTOCOL) as d: 7 | d['content'] = dct 8 | 9 | def readffile(path): 10 | with 
shelve.open(path, protocol=pickle.HIGHEST_PROTOCOL) as d: 11 | return d['content'] -------------------------------------------------------------------------------- /Step1-Preprocessing/similarity_parameters.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pandas as pd 3 | import numpy as np 4 | import scipy.sparse 5 | from scipy.sparse import csr_matrix, find 6 | import scipy.io as sio 7 | import gzip 8 | import string 9 | import read_write as rw 10 | 11 | ''' 12 | finput_title = "Data/title_similarity_matrix" 13 | finput_description = "Data/description_similarity_matrix" 14 | finput_train_id = "Data/train_item_id" 15 | finput_test_id = "Data/test_item_id" 16 | finput_rating_matrix_train = "Data/iu_sparse_matrix_train.npz" 17 | finput_rating_matrix_test = "Data/iu_sparse_matrix_test.npz" 18 | foutput = "Data/nonlinreg.mat" 19 | ''' 20 | if (__name__ == '__main__'): 21 | 22 | #### data path 23 | finput_title = sys.argv[1] 24 | finput_description = sys.argv[2] 25 | finput_train_id = sys.argv[3] 26 | finput_test_id = sys.argv[4] 27 | finput_rating_matrix_train = sys.argv[5] 28 | finput_rating_matrix_test = sys.argv[6] 29 | foutput = sys.argv[7] 30 | 31 | # read into similarity file and train/test item id 32 | matrix_title = rw.readffile(finput_title) 33 | matrix_description = rw.readffile(finput_description) 34 | train_id = rw.readffile(finput_train_id) 35 | test_id = rw.readffile(finput_test_id) 36 | 37 | # combine these items and select corresponding matrix 38 | item_id = list(train_id.keys()) + list(test_id.keys()) 39 | matrix_title = matrix_title.loc[item_id, item_id] 40 | matrix_description = matrix_description.loc[item_id, item_id] 41 | 42 | # read into train/test rating sparse matrix and combine them up 43 | rating_matrix_train = scipy.sparse.load_npz(finput_rating_matrix_train) 44 | rating_matrix_test = scipy.sparse.load_npz(finput_rating_matrix_test) 45 | rating_matrix = scipy.sparse.csr_matrix(np.vstack((rating_matrix_train.toarray(),rating_matrix_test.toarray()))) 46 | 47 | # generate argument pairs for non linear regression 48 | x = find(rating_matrix) 49 | st_r = [] 50 | sd_r = [] 51 | st = [] 52 | sd = [] 53 | ratings = [] 54 | item_num = rating_matrix.shape[0] 55 | length = x[0].shape[0] 56 | cnt = 0 57 | for iid, uid, rating in zip(x[0], x[1], x[2]): 58 | cnt += 1 59 | print("progress: %d / %d"%(cnt, length), end="\r") 60 | flag = np.ones(item_num).reshape(1, item_num) # 1 * item number 61 | flag[0, iid] = 0 62 | ur_history = rating_matrix[:, uid].T.toarray() * flag # 1 * item number 63 | if ur_history.any() == False: 64 | continue 65 | 66 | it_similarity = np.array(matrix_title.iloc[:, iid]).reshape(1, item_num) # 1 * item number 67 | id_similarity = np.array(matrix_description.iloc[:, iid]).reshape(1, item_num) # 1 * item number 68 | st_r.append(np.dot(ur_history, it_similarity.T)[0,0]) 69 | sd_r.append(np.dot(ur_history, id_similarity.T)[0,0]) 70 | st.append((it_similarity*(ur_history!=0)).sum()) 71 | sd.append((id_similarity*(ur_history!=0)).sum()) 72 | ratings.append(rating) 73 | sio.savemat(foutput, {'st_r': st_r,'sd_r': sd_r,'st': st, 'sd': sd, 'ratings' : ratings}) 74 | print("\nfile saved done!") 75 | -------------------------------------------------------------------------------- /Step1-Preprocessing/user_clustering.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pandas as pd 3 | import numpy as np 4 | import read_write as rw 5 | import 
k_medoids as km 6 | 7 | ''' 8 | finput_user_similarity = "Data/user_similarity_matrix" 9 | finput_cluster_number = 200 10 | foutput_user_cluster_set = "Data/user_cluster_set" 11 | ''' 12 | 13 | if (__name__ == '__main__'): 14 | # data path 15 | finput_user_similarity = sys.argv[1] 16 | finput_cluster_number = int(sys.argv[2]) 17 | foutput_user_cluster_set = sys.argv[3] 18 | 19 | # read into user similarity matrix 20 | user_similarity_matrix = rw.readffile(finput_user_similarity) 21 | 22 | # k-medoids 23 | user_cluster_set = km.k_medoids(user_similarity_matrix.values, K=finput_cluster_number, max_iterations=20) 24 | print("\ndone!") 25 | 26 | rw.write2file(user_cluster_set, foutput_user_cluster_set) 27 | print("file saved done!") 28 | 29 | print("top 20% of user cluster:") 30 | length = [] 31 | for lst in user_cluster_set: 32 | length.append(len(lst)) 33 | length.sort(reverse=True) 34 | print(length[0:int(len(length)*0.2)]) -------------------------------------------------------------------------------- /Step1-Preprocessing/user_information.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pandas as pd 3 | import numpy as np 4 | import gzip 5 | import scipy.sparse 6 | from scipy.sparse import csr_matrix, find 7 | import random 8 | import read2df as rdf 9 | import read_write as rw 10 | 11 | ''' 12 | finput = "../Dataset/All_Beauty/reviews_All_Beauty.json.gz" 13 | finput_item = "Data/title" 14 | foutput1 = "Data/iu_sparse_matrix_train.npz" 15 | foutput2 = "Data/iu_sparse_matrix_test.npz" 16 | foutput_uid = "Data/uid" 17 | foutput_train_item_id = "Data/train_item_id" 18 | foutput_test_item_id = "Data/test_item_id" 19 | ''' 20 | if (__name__ == '__main__'): 21 | 22 | #### data path 23 | finput = sys.argv[1] 24 | finput_item = sys.argv[2] 25 | foutput1 = sys.argv[3] 26 | foutput2 = sys.argv[4] 27 | foutput_uid = sys.argv[5] 28 | foutput_train_item_id = sys.argv[6] 29 | foutput_test_item_id = sys.argv[7] 30 | 31 | # read into item id whose title and description is not null 32 | dict_item_id = rw.readffile(finput_item) 33 | 34 | # read into review file and select item 35 | df = rdf.getDF(finput) 36 | df = df.loc[df['asin'].isin(dict_item_id)] 37 | 38 | # split item into train and test 39 | itemid = list(df['asin'].unique()) 40 | train_item_id = random.sample(itemid, int(0.75*len(itemid))) 41 | test_item_id = [ele for ele in itemid if ele not in train_item_id] 42 | print("train: %d/%d, test: %d/%d"%(len(train_item_id), len(itemid), len(test_item_id), len(itemid))) 43 | 44 | 45 | df_train = df.loc[df['asin'].isin(train_item_id)] 46 | df_test = df.loc[df['asin'].isin(test_item_id)] 47 | # set user set as those who rate at least one item in the training set 48 | userid = list(set(list(df_train['reviewerID']))) 49 | print("user number: ", len(userid)) 50 | 51 | # map user/item to id 52 | user_id_dict = {} 53 | for i in range(len(userid)): 54 | user_id_dict[userid[i]] = i 55 | train_item_id_dict = {} 56 | for i in range(len(train_item_id)): 57 | train_item_id_dict[train_item_id[i]] = i 58 | test_item_id_dict = {} 59 | for i in range(len(test_item_id)): 60 | test_item_id_dict[test_item_id[i]] = i 61 | col = len(userid) 62 | train_row = len(train_item_id) 63 | test_row = len(test_item_id) 64 | 65 | # transfer ratings to array 66 | # train 67 | iu_matrix_train = np.zeros((train_row, col), dtype=np.int8) 68 | cnt = 0 69 | lenght = df_train.shape[0] 70 | for index, row in df_train.iterrows(): 71 | print("iu train matrix: %d / %d"%(cnt, lenght), 
end="\r") 72 | iu_matrix_train[train_item_id_dict[row['asin']], user_id_dict[row['reviewerID']]] = int(row['overall']) 73 | cnt += 1 74 | iu_sparse_matrix_train = scipy.sparse.csr_matrix(iu_matrix_train) 75 | print("density of iu train matrix is: %.4f%%"%(100*len(find(iu_sparse_matrix_train)[0])/(iu_sparse_matrix_train.shape[0]*iu_sparse_matrix_train.shape[1]))) 76 | scipy.sparse.save_npz(foutput1, iu_sparse_matrix_train) 77 | # test 78 | iu_matrix_test = np.zeros((test_row, col), dtype=np.int8) 79 | cnt = 0 80 | lenght = df_test.shape[0] 81 | for index, row in df_test.iterrows(): 82 | print("iu test matrix: %d / %d"%(cnt, lenght), end="\r") 83 | if row['reviewerID'] in user_id_dict.keys(): 84 | iu_matrix_test[test_item_id_dict[row['asin']], user_id_dict[row['reviewerID']]] = int(row['overall']) 85 | cnt += 1 86 | iu_sparse_matrix_test = scipy.sparse.csr_matrix(iu_matrix_test) 87 | print("density of iu test matrix is: %.4f%%"%(100*len(find(iu_sparse_matrix_test)[0])/(iu_sparse_matrix_test.shape[0]*iu_sparse_matrix_test.shape[1]))) 88 | scipy.sparse.save_npz(foutput2, iu_sparse_matrix_test) 89 | print("iu matrix generated done!") 90 | 91 | 92 | # write uid, train_item_id and test_item_id into files 93 | rw.write2file(user_id_dict, foutput_uid) 94 | rw.write2file(train_item_id_dict, foutput_train_item_id) 95 | rw.write2file(test_item_id_dict, foutput_test_item_id) 96 | print("write done!") -------------------------------------------------------------------------------- /Step1-Preprocessing/user_similarity.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | import pandas as pd 4 | import scipy.sparse 5 | from scipy.sparse import csr_matrix, find 6 | import read_write as rw 7 | 8 | ''' 9 | finput_uid = "Data/uid" 10 | finput_rating_matrix_train = "Data/iu_sparse_matrix_train.npz" 11 | foutput_user_similarity = "Data/user_similarity_matrix" 12 | ''' 13 | 14 | if (__name__ == '__main__'): 15 | #### data path 16 | finput_uid = sys.argv[1] 17 | finput_rating_matrix_train = sys.argv[2] 18 | foutput_user_similarity = sys.argv[3] 19 | 20 | # read into user id information and train rating matrix 21 | uid = rw.readffile(finput_uid) 22 | rating_matrix_train = scipy.sparse.load_npz(finput_rating_matrix_train).toarray() 23 | 24 | # generate user similarity 25 | rating_matrix_train = (rating_matrix_train - np.sum(rating_matrix_train, axis=0) / np.sum(rating_matrix_train != 0, axis=0)) * (rating_matrix_train!=0) 26 | rating_matrix_train_2 = rating_matrix_train**2 27 | # user_similarity_matrix = np.dot(rating_matrix_train.T, rating_matrix_train) / (np.dot(rating_matrix_train_2.T, rating_matrix_train_2)**0.5 + 1e-9) 28 | row_num = rating_matrix_train.shape[0] 29 | col_num = rating_matrix_train.shape[1] 30 | user_similarity_matrix = np.zeros((col_num, col_num)) 31 | nominatorM = np.dot(rating_matrix_train.T, rating_matrix_train) 32 | print("nominator done!") 33 | cnt = 0 34 | for i in range(col_num): 35 | cnt += 1 36 | print("progress: %d / %d"%(cnt, col_num), end="\r") 37 | flag = ((rating_matrix_train[:, i]!=0).reshape(row_num, 1))*(rating_matrix_train!=0) 38 | user_similarity_matrix[i] = nominatorM[i] / ((np.dot(rating_matrix_train_2[:, i].T, flag)**0.5) * (np.sum(rating_matrix_train_2*flag, axis=0)**0.5) + 1e-9) 39 | # or it will be 0 for some users 40 | # np.fill_diagonal(user_similarity_matrix, 1) 41 | print("\ndone!") 42 | 43 | # transfer to dataframe and save to file 44 | # rw.write2file(user_similarity_matrix, "Data/test") 45 | 
df_user_similarity_matrix = pd.DataFrame(user_similarity_matrix,index=list(uid.keys()),columns=list(uid.keys())) 46 | del user_similarity_matrix 47 | rw.write2file(df_user_similarity_matrix, foutput_user_similarity) 48 | print("file saved done!") -------------------------------------------------------------------------------- /Step2-Model/DecisionTree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from scipy.sparse import * 4 | from MatrixFactorization import MatrixFactorization 5 | 6 | class DecisionTree: 7 | 8 | def __init__(self, iu_sparse_matrix_train, iu_sparse_matrix_test, iuclst_rating_matrix_train, iuclst_rating_matrix_test, user_cluster_set, depth): 9 | self.iu_sparse_matrix_train = iu_sparse_matrix_train 10 | # consider user biases 11 | plambda = 7 12 | golbal_mean = np.sum(self.iu_sparse_matrix_train) / np.sum(self.iu_sparse_matrix_train!=0) 13 | user_bias = (np.sum(self.iu_sparse_matrix_train, 0) + plambda*golbal_mean) / (np.sum(self.iu_sparse_matrix_train!=0, 0) + plambda) 14 | self.iu_rating_matrix_bias = np.array(iu_sparse_matrix_train-user_bias) 15 | self.iu_rating_matrix_bias = csr_matrix((iu_sparse_matrix_train!=0).toarray() * self.iu_rating_matrix_bias) 16 | 17 | self.iu_sparse_matrix_test = iu_sparse_matrix_test 18 | self.iuclst_rating_matrix_train = iuclst_rating_matrix_train 19 | self.iuclst_rating_matrix_test = iuclst_rating_matrix_test 20 | self.user_cluster_set = user_cluster_set 21 | self.user_cluster_id_set = list(range(0, len(user_cluster_set))) 22 | self.item_num = iu_sparse_matrix_train.shape[0] 23 | self.user_num = iu_sparse_matrix_train.shape[1] 24 | self.tree = list(range(0, self.item_num)) 25 | self.depth_threshold = depth 26 | self.node_interval = [[] for i in range(depth)] 27 | self.node_interval[0].append([0, self.item_num-1]) 28 | self.user_set_id = [[] for i in range(depth-1)] 29 | self.node_assc_rating = [[] for i in range(depth-1)] 30 | # progress record 31 | self.cur_node = 1 32 | self.node_num = 0 33 | for i in range(depth): 34 | self.node_num += 3**i 35 | 36 | # pseudo item and user profile 37 | self.pseudo_item = {} 38 | self.user_profile = {} 39 | 40 | # prediction model 41 | self.prediction_model = {} 42 | 43 | def errorCalculation(self, item_in_node): 44 | sub_rating_matrix = self.iu_rating_matrix_bias[np.ix_(item_in_node)] # user rating matrix with bias 45 | sum_node = np.sum(sub_rating_matrix, axis=0) 46 | sum_node_2 = np.sum(sub_rating_matrix.power(2), axis=0) 47 | num_node = np.sum(sub_rating_matrix != 0, axis=0) 48 | deviation = np.sum(sum_node_2 - np.power(sum_node, 2)/(num_node+1e-9)) 49 | return deviation 50 | 51 | 52 | def findOptUserCluster(self, cur_depth, cur_index): 53 | min_error = -1 54 | opt_item_in_left_child = [] 55 | opt_item_in_middle_child = [] 56 | opt_item_in_right_child = [] 57 | item_in_node = self.tree[self.node_interval[cur_depth][cur_index][0]:self.node_interval[cur_depth][cur_index][1]+1] 58 | ratings = self.iuclst_rating_matrix_train[np.ix_(item_in_node)] 59 | 60 | if len(item_in_node) == 0: 61 | return [[], [], []], -1, -1, -1 62 | 63 | for user_cluster_id in self.user_cluster_id_set: 64 | # print(user_cluster_id) 65 | ratings_for_cluster = ratings[:, user_cluster_id] 66 | sorted_array = np.sort(ratings_for_cluster) 67 | node_size = len(sorted_array) 68 | itve1 = sorted_array[round(node_size/3)] 69 | itve2 = sorted_array[round((2*node_size)/3)] 70 | # itve1 = [] 71 | # itve2 = [] 72 | item_in_left_child = [] 73 | 
item_in_middle_child = [] 74 | item_in_right_child = [] 75 | for i in range(ratings_for_cluster.shape[0]): 76 | if ratings_for_cluster[i] > itve2: 77 | # if ratings_for_cluster[i] >= 4: 78 | item_in_right_child.append(item_in_node[i]) 79 | elif ratings_for_cluster[i] <= itve1: 80 | # elif ratings_for_cluster[i] <= 2.5: 81 | item_in_left_child.append(item_in_node[i]) 82 | else: 83 | item_in_middle_child.append(item_in_node[i]) 84 | 85 | error_dislike = self.errorCalculation(item_in_left_child) 86 | error_mediocre = self.errorCalculation(item_in_middle_child) 87 | error_like = self.errorCalculation(item_in_right_child) 88 | 89 | # if cur_depth == 0: 90 | # print("user_cluster_id:%d"%user_cluster_id) 91 | # print("error:%f"%(error_dislike+error_mediocre+error_like)) 92 | # # print("error_dislike:%f"%error_dislike) 93 | # # print("error_like:%f"%error_like) 94 | # # print("error_mediocre:%f"%error_mediocre) 95 | # # print(list(ratings_for_cluster)) 96 | # # print(ratings_for_cluster.shape[0]) 97 | # print("\n") 98 | 99 | error = error_dislike + error_mediocre + error_like 100 | if min_error == -1 or error < min_error: 101 | min_error = error 102 | opt_user_cluster_id = user_cluster_id 103 | opt_itve1 = itve1 104 | opt_itve2 = itve2 105 | opt_item_in_left_child = item_in_left_child[:] 106 | opt_item_in_middle_child = item_in_middle_child[:] 107 | opt_item_in_right_child = item_in_right_child[:] 108 | # print("opt_user_cluster_id:%d"%(opt_user_cluster_id)) 109 | return [opt_item_in_left_child, opt_item_in_middle_child, opt_item_in_right_child], opt_user_cluster_id, opt_itve1, opt_itve2 110 | 111 | 112 | def dividToChild(self, optRes, cur_depth, cur_index): 113 | # update tree 114 | self.tree[self.node_interval[cur_depth][cur_index][0]:self.node_interval[cur_depth][cur_index][1]+1] = optRes[0] + optRes[1] + optRes[2] 115 | if len(self.node_interval[cur_depth+1]) == 0: 116 | begin = 0 117 | else: 118 | begin = self.node_interval[cur_depth+1][-1][1] + 1 119 | interval1 = begin + len(optRes[0]) - 1 120 | interval2 = interval1 + len(optRes[1]) 121 | interval3 = interval2 + len(optRes[2]) 122 | # left child interval 123 | self.node_interval[cur_depth+1].append([begin, interval1]) 124 | # middle child interval 125 | self.node_interval[cur_depth+1].append([interval1+1, interval2]) 126 | # right child interval 127 | self.node_interval[cur_depth+1].append([interval2+1, interval3]) 128 | 129 | 130 | def treeConstruction(self, cur_depth, cur_index): 131 | 132 | # progress record 133 | print('Current depth: %d %.2f%%'%(cur_depth+1, 100*self.cur_node/self.node_num), end="\r") 134 | # termination condition 135 | if cur_depth >= self.depth_threshold - 1: 136 | return 137 | self.cur_node += 3 138 | 139 | # opt_itve1 -> left node; opt_itve2 -> right node 140 | optRes, opt_user_cluster_id, opt_itve1, opt_itve2 = self.findOptUserCluster(cur_depth, cur_index) 141 | self.user_set_id[cur_depth].append(opt_user_cluster_id) 142 | self.node_assc_rating[cur_depth].append([opt_itve1, opt_itve2]) 143 | self.dividToChild(optRes, cur_depth, cur_index) 144 | 145 | if opt_user_cluster_id != -1: 146 | self.user_cluster_id_set.remove(opt_user_cluster_id) 147 | # left child 148 | self.treeConstruction(cur_depth+1, cur_index*3) 149 | # middle child 150 | self.treeConstruction(cur_depth+1, cur_index*3+1) 151 | # right child 152 | self.treeConstruction(cur_depth+1, cur_index*3+2) 153 | self.user_cluster_id_set.append(opt_user_cluster_id) 154 | 155 | 156 | def buildTreeModel(self): 157 | self.treeConstruction(0, 0) 158 | 159 | 160 | 
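    # ------------------------------------------------------------------
    # (added note) intended usage of this class, sketched from the inputs
    # prepared in Step1 (names illustrative, per Commands.txt):
    #
    #     dt = DecisionTree(iu_train, iu_test,
    #                       iuclst_rating_matrix_train,
    #                       iuclst_rating_matrix_test,
    #                       user_cluster_set, depth=5)
    #     dt.buildTreeModel()               # recursive ternary splitting
    #     dt.buildPredModel(params=[0.01])  # per-level ALS + RMSE sweep
    #
    # buildPredModel() below treats every tree node as a "pseudo item":
    # the node's rating for user u is the average of the real ratings
    # that the node's items received from u, and those
    # (user, node, rating) triples form the ALS training set per level.
    # ------------------------------------------------------------------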
    def buildPredModel(self, params=[0.01], rank=10):
        min_rmse_dict = {}
        MF = MatrixFactorization()
        for test_depth in range(self.depth_threshold):
            print("level %d" % (test_depth))
            train_lst = []
            length = len(self.node_interval[test_depth])
            # generate input for spark ALS training: each tree node becomes a
            # pseudo item, rated by a user with that user's average rating over
            # the items in the node
            for index, interval in enumerate(self.node_interval[test_depth]):
                print("%d/%d" % (index + 1, length), end="\r")
                if interval[1] - interval[0] == -1:
                    continue
                sub_rating_matrix = self.iu_sparse_matrix_train[np.ix_(self.tree[interval[0]:interval[1] + 1])]
                avg_rating = np.sum(sub_rating_matrix, axis=0) / (1e-9 + sub_rating_matrix.getnnz(axis=0))
                uid = avg_rating.nonzero()[1]
                rating = np.array(avg_rating[np.ix_([0], uid)])[0]
                for i in range(len(uid)):
                    train_lst.append((uid[i], index, float(rating[i])))
            print("Rating Number of level " + str(test_depth) + ": " + str(len(train_lst)))

            # test different params for MF
            min_RMSE = -1
            self.prediction_model.setdefault(test_depth, {})
            for param in params:
                MF.change_parameter(regParam=param, rank=rank)
                #################################### Spark ####################################
                try:
                    user_profile, item_profile = MF.matrix_factorization(train_lst)
                except Exception:
                    # restart the SparkContext and retry once if the job failed
                    MF.end()
                    MF = MatrixFactorization()
                    MF.change_parameter(regParam=param, rank=rank)
                    user_profile, item_profile = MF.matrix_factorization(train_lst)
                #################################### Spark ####################################

                ################################ Calculate RMSE ##############################
                RMSE, P_test = self.predict(test_depth, item_profile, user_profile)
                print("Parameters: %f, RMSE: %f" % (param, RMSE))
                if min_RMSE == -1 or RMSE < min_RMSE:
                    min_user_profile = user_profile
                    min_item_profile = item_profile
                    min_plambda = param
                    min_RMSE = RMSE
                    min_Ptest = P_test
                if RMSE > min_RMSE:
                    # RMSE is increasing again; stop scanning the parameter list
                    break
                ################################ Calculate RMSE ##############################

            # save the best profiles and param corresponding to each level
            print("min RMSE: %f" % min_RMSE)
            min_rmse_dict[test_depth] = min_RMSE
            self.prediction_model[test_depth]['upro'] = min_user_profile
            self.prediction_model[test_depth]['ipro'] = min_item_profile
            self.prediction_model[test_depth]['plambda'] = min_plambda
            self.prediction_model[test_depth]['P_test'] = min_Ptest
        MF.end()


    def predict(self, test_depth, item_profile, user_profile):
        self.prediction_model[test_depth]['ipro'] = item_profile
        # P holds the predicted ratings of every node (pseudo item) at this level
        P = np.zeros((pow(3, test_depth), self.user_num))
        P[np.ix_(list(item_profile.keys()), list(user_profile.keys()))] = np.dot(np.array(list(item_profile.values())), np.array(list(user_profile.values())).T)
        P_test = np.zeros(self.iu_sparse_matrix_test.shape)
        rating_matrix_test_unqueried = self.iu_sparse_matrix_test.toarray()
        for itemid in range(self.iu_sparse_matrix_test.shape[0]):
            pred_index = 0
            final_level = 0
            rated_user = []
            for depth in range(test_depth):
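                # walk one level down: look up the item's rating from the user
                # cluster interviewed at the current node, then branch on the
                # stored thresholds (<= itve1 -> left, > itve2 -> right, else middle)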
                rating = self.iuclst_rating_matrix_test[itemid][self.user_set_id[final_level][pred_index]]
                if rating > self.node_assc_rating[final_level][pred_index][1]:
                    tmp_pred_index = 3 * pred_index + 2
                    if tmp_pred_index in self.prediction_model[depth + 1]['ipro']:
                        rated_user.append(self.user_set_id[depth][pred_index])
                        final_level += 1
                        pred_index = tmp_pred_index
                    else:
                        break
                elif rating <= self.node_assc_rating[final_level][pred_index][0]:
                    tmp_pred_index = 3 * pred_index
                    if tmp_pred_index in self.prediction_model[depth + 1]['ipro']:
                        rated_user.append(self.user_set_id[depth][pred_index])
                        final_level += 1
                        pred_index = tmp_pred_index
                    else:
                        break
                else:
                    tmp_pred_index = 3 * pred_index + 1
                    if tmp_pred_index in self.prediction_model[depth + 1]['ipro']:
                        rated_user.append(self.user_set_id[depth][pred_index])
                        final_level += 1
                        pred_index = tmp_pred_index
                    else:
                        break
            # map the node id to its row in P
            pred_index = list(self.prediction_model[final_level]['ipro'].keys()).index(pred_index)
            P_test[itemid, :] = P[pred_index, :]
            # ratings used to answer the interview do not count towards evaluation
            rating_matrix_test_unqueried[itemid, rated_user] = 0

        rating_matrix_test_unqueried = csc_matrix(rating_matrix_test_unqueried)
        P_test = (rating_matrix_test_unqueried != 0).multiply(P_test)
        P_test = P_test.tolil()
        # clip predictions to the valid rating range
        P_test[P_test > 5] = 5
        P_test[P_test < 0] = 0
        diff = P_test - rating_matrix_test_unqueried
        return (diff.multiply(diff).sum() / (rating_matrix_test_unqueried != 0).sum()) ** 0.5, P_test
--------------------------------------------------------------------------------
/Step2-Model/MatrixFactorization.py:
--------------------------------------------------------------------------------
from pyspark.mllib.recommendation import ALS
from pyspark import SparkConf
from pyspark import SparkContext

class MatrixFactorization:
    def __init__(self, maxIter=15, regParam=0.01, rank=10):
        self.maxIter = maxIter
        self.regParam = regParam
        self.rank = rank
        conf = SparkConf().setAppName("appName").setMaster("local[*]")
        conf.set("spark.driver.memory", "16g")
        conf.set("spark.executor.memory", "16g")
        self.spark = SparkContext(conf=conf)
        print("New SparkContext started...")

    def change_parameter(self, regParam=0.01, rank=10):
        self.regParam = regParam
        self.rank = rank

    def matrix_factorization(self, train_lst):
        ratings = self.spark.parallelize(train_lst)
        model = ALS.train(ratings, rank=self.rank, seed=10,
                          iterations=self.maxIter,
                          lambda_=self.regParam)
        print("MF DONE")
        userFeatures = sorted(model.userFeatures().collect(), key=lambda d: d[0])
        productFeatures = sorted(model.productFeatures().collect(), key=lambda d: d[0])
        # id -> latent-factor dictionaries, so callers can look profiles up by id
        userProfile = {each[0]: each[1].tolist() for each in userFeatures}
        itemProfile = {each[0]: each[1].tolist() for each in productFeatures}

        return userProfile, itemProfile
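    # Usage sketch (illustrative (user_id, pseudo_item_id, rating) triples,
    # not project data):
    #   mf = MatrixFactorization(maxIter=15, regParam=0.01, rank=10)
    #   upro, ipro = mf.matrix_factorization([(0, 0, 4.0), (1, 0, 3.5), (1, 1, 5.0)])
    #   mf.end()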
    def end(self):
        # stop the backing SparkContext so a fresh one can be started later
        self.spark.stop()
        print("SparkContext stopped.")
--------------------------------------------------------------------------------
/Step2-Model/build_tree.py:
--------------------------------------------------------------------------------
import sys
from DecisionTree import DecisionTree
import read_write as rw
import scipy.sparse

'''
Example arguments:
finput_iu_sparse_matrix_train = "Data/iu_sparse_matrix_train.npz"
finput_iu_sparse_matrix_test = "Data/iu_sparse_matrix_test.npz"
finput_iuclst_rating_matrix_train = "Data/iuclst_rating_matrix_train"
finput_iuclst_rating_matrix_test = "Data/iuclst_rating_matrix_test"
finput_user_cluster_set = "Data/user_cluster_set"
finput_desired_depth = 5
'''

if (__name__ == '__main__'):

    finput_iu_sparse_matrix_train = sys.argv[1]
    finput_iu_sparse_matrix_test = sys.argv[2]
    finput_iuclst_rating_matrix_train = sys.argv[3]
    finput_iuclst_rating_matrix_test = sys.argv[4]
    finput_user_cluster_set = sys.argv[5]
    finput_desired_depth = int(sys.argv[6])

    # read in data for tree construction
    iu_sparse_matrix_train = scipy.sparse.load_npz(finput_iu_sparse_matrix_train)
    iu_sparse_matrix_test = scipy.sparse.load_npz(finput_iu_sparse_matrix_test)
    iuclst_rating_matrix_train = rw.readffile(finput_iuclst_rating_matrix_train)
    iuclst_rating_matrix_test = rw.readffile(finput_iuclst_rating_matrix_test)
    user_cluster_set = rw.readffile(finput_user_cluster_set)

    # build tree
    dt_model = DecisionTree(iu_sparse_matrix_train, iu_sparse_matrix_test, iuclst_rating_matrix_train, iuclst_rating_matrix_test, user_cluster_set, finput_desired_depth)
    dt_model.buildTreeModel()
    print("\ntree construction finished")
    # build prediction model (also runs per-level evaluation via predict)
    dt_model.buildPredModel()
    print("prediction model finished")
--------------------------------------------------------------------------------
/Step2-Model/read_write.py:
--------------------------------------------------------------------------------
import shelve
import pickle


def write2file(dct, path):
    # persist an arbitrary picklable object under a fixed shelf key
    with shelve.open(path, protocol=pickle.HIGHEST_PROTOCOL) as d:
        d['content'] = dct

def readffile(path):
    with shelve.open(path, protocol=pickle.HIGHEST_PROTOCOL) as d:
        return d['content']
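
# Minimal round-trip sketch (illustrative only; "Data/demo_obj" is a
# hypothetical path and assumes the target directory exists):
if __name__ == '__main__':
    write2file({'a': 1}, 'Data/demo_obj')
    print(readffile('Data/demo_obj'))  # -> {'a': 1}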
--------------------------------------------------------------------------------
/script.py:
--------------------------------------------------------------------------------
import os


dataset = "All_Beauty"
METADATA = "Dataset/%s/meta_%s.json.gz" % (dataset, dataset)
REVIEWDATA = "Dataset/%s/reviews_%s.json.gz" % (dataset, dataset)
TITLEINFO = "Data/%s/title" % (dataset)
DESCRIPTIONINFO = "Data/%s/description" % (dataset)
TITLESIM = "Data/%s/title_similarity_matrix" % (dataset)
DESCRIPTIONSIM = "Data/%s/description_similarity_matrix" % (dataset)
USERSIM = "Data/%s/user_similarity_matrix" % (dataset)
TRAINNPZ = "Data/%s/iu_sparse_matrix_train.npz" % (dataset)
TESTNPZ = "Data/%s/iu_sparse_matrix_test.npz" % (dataset)
UID = "Data/%s/uid" % (dataset)
TRAINITEMID = "Data/%s/train_item_id" % (dataset)
TESTITEMID = "Data/%s/test_item_id" % (dataset)
TOPIC_NUM = 15
CLUSTER_NUM = 200
INIT_PARAM_TITLE = 1.0
INIT_PARAM_DESCRIPTION = 1.0
DEPTH_OF_TREE = 5
NON_LINEAR = "D:/GitCode/Dissertation/Data/%s/nonlinreg.mat" % (dataset)
USER_CLUSTER = "Data/%s/user_cluster_set" % (dataset)
USERCLUSTER_ITEM_RATING_MATRIX_TRAIN = "Data/%s/iuclst_rating_matrix_train" % (dataset)
USERCLUSTER_ITEM_RATING_MATRIX_TEST = "Data/%s/iuclst_rating_matrix_test" % (dataset)
ITEM_SIM_MATRIX = "Data/%s/item_sim_matrix" % (dataset)


print("################## item_information.py ####################")
os.system('python Step1-Preprocessing/item_information.py %s %s %s' % (METADATA, TITLEINFO, DESCRIPTIONINFO))
print("\n")
print("################## user_information.py ####################")
os.system('python Step1-Preprocessing/user_information.py %s %s %s %s %s %s %s' % (REVIEWDATA, TITLEINFO, TRAINNPZ, TESTNPZ, UID, TRAINITEMID, TESTITEMID))
print("\n")
print("################## item_similarity.py ####################")
os.system('python Step1-Preprocessing/item_similarity.py %s %s %s %s %s %s %s' % (TOPIC_NUM, TITLEINFO, DESCRIPTIONINFO, TRAINITEMID, TESTITEMID, TITLESIM, DESCRIPTIONSIM))
print("\n")
print("################## similarity_parameters.py ####################")
os.system('python Step1-Preprocessing/similarity_parameters.py %s %s %s %s %s %s %s' % (TITLESIM, DESCRIPTIONSIM, TRAINITEMID, TESTITEMID, TRAINNPZ, TESTNPZ, NON_LINEAR))
print("\n")
print("################## user_similarity.py ####################")
os.system('python Step1-Preprocessing/user_similarity.py %s %s %s' % (UID, TRAINNPZ, USERSIM))
print("\n")
print("################## user_clustering.py ####################")
os.system('python Step1-Preprocessing/user_clustering.py %s %s %s' % (USERSIM, CLUSTER_NUM, USER_CLUSTER))
print("\n")
print("################## buildtree_preparation.py ####################")
# thirteen format specifiers, matching the thirteen values passed below
os.system('python Step1-Preprocessing/buildtree_preparation.py %s %s %s %s %s %s %s %s %s %s %s %s %s' % (TRAINNPZ, TESTNPZ, TITLESIM, DESCRIPTIONSIM, USER_CLUSTER, TRAINITEMID, TESTITEMID, NON_LINEAR, INIT_PARAM_TITLE, INIT_PARAM_DESCRIPTION, USERCLUSTER_ITEM_RATING_MATRIX_TRAIN, USERCLUSTER_ITEM_RATING_MATRIX_TEST, ITEM_SIM_MATRIX))
print("\n")
print("################## build_tree.py ####################")
# six format specifiers, matching sys.argv[1..6] in build_tree.py
os.system('python Step2-Model/build_tree.py %s %s %s %s %s %s' % (TRAINNPZ, TESTNPZ, USERCLUSTER_ITEM_RATING_MATRIX_TRAIN, USERCLUSTER_ITEM_RATING_MATRIX_TEST, USER_CLUSTER, DEPTH_OF_TREE))
print("\n")
--------------------------------------------------------------------------------
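For reference, a minimal NumPy/SciPy sketch of the masked RMSE that DecisionTree.predict computes (the toy arrays and names here are illustrative, not data from this repository):

import numpy as np
from scipy.sparse import csc_matrix

# ground truth with 0 marking "unrated", as in iu_sparse_matrix_test
truth = csc_matrix(np.array([[5.0, 0.0, 3.0],
                             [0.0, 4.0, 0.0]]))
# dense predictions for the same items/users
pred = np.array([[4.5, 1.0, 2.0],
                 [3.0, 3.5, 5.0]])

mask = (truth != 0)                  # evaluate only observed ratings
diff = mask.multiply(pred) - truth   # masked prediction error
rmse = (diff.multiply(diff).sum() / mask.sum()) ** 0.5
print(rmse)  # ~0.707 on this toy example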