├── .gitignore ├── __init__.py ├── data.py ├── data_test.py ├── nmf.py ├── nmf_test.py ├── nmf_user_based_cf.py ├── nmf_user_based_cf_evaluation.py ├── progress.py └── test.py /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore PyCharm temporary files, build results, and data files 2 | 3 | # PyCharm 4 | /.idea/ 5 | /__pycache__/ 6 | 7 | # Data 8 | /data/ 9 | # "data/" matches all directory who's name is "data", but "/data/" matches the exact directory who's name is "data" only in the root directory 10 | *.npy 11 | *.txt 12 | *.mat 13 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | #!python3 2 | """ 3 | Created on 2015-12-12 4 | @author: yuqiang 5 | Main entrance of program 6 | """ 7 | 8 | -------------------------------------------------------------------------------- /data.py: -------------------------------------------------------------------------------- 1 | #!python3 2 | """ 3 | Created on 2015-12-13 4 | @author: yuqiang 5 | Read of given data and reform it to useful format 6 | """ 7 | 8 | import time 9 | import numpy 10 | 11 | 12 | def _form_user_news_array_from_list(user_news_pairs): 13 | """Form user-news array from list(private method) 14 | 15 | Form the user-news array from given list, whose rows represent users and cols represent newses. 16 | The meaning of A[i, j] in the result array is whether user "i" clicked news "j"(if true, 1, else, 0). 17 | 18 | Args: 19 | user_news_pairs(Type: list[(user_id, news_id)]): list of user-news pair which represents the user clicked the news 20 | 21 | Returns: 22 | user_news_array(Type: numpy.ndarray): The result user-news array(rows for users and cols for newses) 23 | user_ids(Type: numpy.ndarray(vector)): user's ids(from small to large) associated with the array 24 | news_ids(Type: numpy.ndarray(vector)): news's ids(from small to large) associated with the array 25 | """ 26 | 27 | user_ids = [user_id for (user_id, news_id) in user_news_pairs] 28 | news_ids = [news_id for (user_id, news_id) in user_news_pairs] 29 | 30 | # unique and sort user and news 31 | user_ids = numpy.array(list(set(user_ids))) 32 | user_ids.sort() 33 | news_ids = numpy.array(list(set(news_ids))) 34 | news_ids.sort() 35 | 36 | # form the array 37 | user_num = len(user_ids) 38 | news_num = len(news_ids) 39 | user_ids_dict = {value: index for (index, value) in enumerate(user_ids)} 40 | news_ids_dict = {value: index for (index, value) in enumerate(news_ids)} 41 | user_news_array = numpy.zeros((user_num, news_num), numpy.bool_) 42 | for (user_id, news_id) in user_news_pairs: 43 | user_news_array[user_ids_dict[user_id], news_ids_dict[news_id]] = 1 44 | 45 | return user_news_array, user_ids, news_ids 46 | 47 | 48 | def get_user_news_array(): 49 | """Form user-news array 50 | 51 | Form the user-news array, whose rows represent users and cols represent newses. 52 | The meaning of A[i, j] in the result array is whether user "i" clicked news "j"(if true, 1, else, 0). 53 | 54 | Returns: 55 | user_news_array(Type: numpy.ndarray): The result user-news array(rows for users and cols for newses) 56 | user_ids(Type: numpy.ndarray(vector)): user's ids(from small to large) associated with the array 57 | news_ids(Type: numpy.ndarray(vector)): news's ids(from small to large) associated with the array 58 | """ 59 | 60 | # read data from file 61 | f = open("user_click_data.txt", "r", 1, "utf-8") 62 | user_news_pairs = [] 63 | while True: 64 | line = f.readline() 65 | if line: 66 | p = line.split('\t') 67 | user_news_pairs.append((int(p[0]), int(p[1]))) 68 | else: 69 | break 70 | f.close() 71 | 72 | return _form_user_news_array_from_list(user_news_pairs) 73 | 74 | 75 | def get_user_news_arrays_of_train_and_test(remove_new_users_in_test=False, remove_new_newses_in_test=False): 76 | """Form user-news arrays of train and test 77 | 78 | Form the user-news array of train and test, whose rows represent users and cols represent newses. 79 | The meaning of A[i, j] in the result array is whether user "i" clicked news "j"(if true, 1, else, 0). 80 | 81 | Args: 82 | remove_new_users_in_test(Type: bool): whether to remove new users in test 83 | remove_new_newses_in_test(Type: bool): whether to remove new newses in test 84 | 85 | Returns: 86 | user_news_array_of_train(Type: numpy.ndarray): The result user-news array of train(rows for users and cols for newses) 87 | user_ids_of_train(Type: numpy.ndarray(vector)): user's ids(from small to large) associated with the user-news array of train 88 | news_ids_of_train(Type: numpy.ndarray(vector)): news's ids(from small to large) associated with the user-news array of train 89 | user_news_array_of_test(Type: numpy.ndarray): The result user-news array of test(rows for users and cols for newses) 90 | user_ids_of_test(Type: numpy.ndarray(vector)): user's ids(from small to large) associated with the user-news array of test 91 | news_ids_of_test(Type: numpy.ndarray(vector)): news's ids(from small to large) associated with the user-news array of test 92 | """ 93 | 94 | # read data from file 95 | f = open("user_click_data.txt", "r", 1, "utf-8") 96 | user_news_time_list = [] 97 | while True: 98 | line = f.readline() 99 | if line: 100 | p = line.split('\t') 101 | user_news_time_list.append((int(p[0]), int(p[1]), int(p[2]))) 102 | else: 103 | break 104 | f.close() 105 | 106 | # calculate the threshold of visit times to divide data into a train set and a test set 107 | click_times = [click_time for (user_id, news_id, click_time) in user_news_time_list] 108 | time_start = min(click_times) 109 | time_end = max(click_times) 110 | time_threshold = time_start + (time_end - time_start) * 2 // 3 111 | 112 | # divide data into train set and test set 113 | user_news_pairs_of_all = [(user_id, news_id) for (user_id, news_id, click_time) in user_news_time_list] 114 | user_news_pairs_of_train = [(user_id, news_id) for (user_id, news_id, click_time) in user_news_time_list if click_time < time_threshold] 115 | user_news_pairs_of_test = [(user_id, news_id) for (user_id, news_id, click_time) in user_news_time_list if click_time >= time_threshold] 116 | 117 | # form the train array 118 | user_news_array_of_all, user_ids_of_all, news_ids_of_all = _form_user_news_array_from_list(user_news_pairs_of_all) 119 | user_news_array_of_train, user_ids_of_train, news_ids_of_train = _form_user_news_array_from_list(user_news_pairs_of_train) 120 | # user_news_array_of_test, user_ids_of_test, news_ids_of_test = _form_user_news_array_from_list(user_news_pairs_of_test) 121 | 122 | # form the test array(remove new users or new newses) 123 | user_ids_of_all_dict = {user_id: index for (index, user_id) in enumerate(user_ids_of_all)} 124 | news_ids_of_all_dict = {news_id: index for (index, news_id) in enumerate(news_ids_of_all)} 125 | train_samples = numpy.array([[user_ids_of_all_dict[user_id], news_ids_of_all_dict[news_id]] for (user_id, news_id) in user_news_pairs_of_train]).transpose() # coordinates of train samples in user_news_array_of_all 126 | user_news_array_of_test = user_news_array_of_all.copy() 127 | user_news_array_of_test[train_samples[0, :], train_samples[1, :]] = 0 # remove train samples in user_news_array_of_all 128 | user_indices_of_train_in_all = numpy.array([index for (index, user_id) in enumerate(user_ids_of_all) if user_id in user_ids_of_train]) # indices of users in user_ids_of_all who appeared in train set 129 | user_indices_of_train_in_all.reshape((-1, 1)) # reshape to column vector 130 | news_indices_of_train_in_all = numpy.array([index for (index, news_id) in enumerate(news_ids_of_all) if news_id in news_ids_of_train]) # indices of newses in news_ids_of_all who appeared in train set 131 | news_indices_of_train_in_all.reshape((1, -1)) # reshape to row vector 132 | user_num = len(user_ids_of_all) 133 | news_num = len(news_ids_of_all) 134 | user_news_array_of_test = user_news_array_of_test[user_indices_of_train_in_all if remove_new_users_in_test else range(user_num), :] # remove new users 135 | user_news_array_of_test = user_news_array_of_test[:, news_indices_of_train_in_all if remove_new_newses_in_test else range(news_num)] # remove new newses 136 | user_ids_of_test = user_ids_of_all[user_indices_of_train_in_all if remove_new_users_in_test else range(user_num)] # remove new users 137 | news_ids_of_test = news_ids_of_all[news_indices_of_train_in_all if remove_new_newses_in_test else range(news_num)] # remove new newses 138 | 139 | # remove users who don't appear in test set 140 | hold_indices = [index for (index, user_id) in enumerate(user_ids_of_test) if user_id in [user_id for (user_id, news_id) in user_news_pairs_of_test]] 141 | if len(hold_indices) > 0: 142 | user_news_array_of_test = user_news_array_of_test[hold_indices, :] 143 | user_ids_of_test = user_ids_of_test[hold_indices] 144 | 145 | # write data to files 146 | numpy.save("user_news_array_of_train.npy", user_news_array_of_train) 147 | numpy.save("user_ids_of_train.npy", user_ids_of_train) 148 | numpy.save("news_ids_of_train.npy", news_ids_of_train) 149 | numpy.save("user_news_array_of_test.npy", user_news_array_of_test) 150 | numpy.save("user_ids_of_test.npy", user_ids_of_test) 151 | numpy.save("news_ids_of_test.npy", news_ids_of_test) 152 | 153 | return user_news_array_of_train, user_ids_of_train, news_ids_of_train, user_news_array_of_test, user_ids_of_test, news_ids_of_test 154 | 155 | 156 | def get_news_dict(): 157 | """Form news dict 158 | 159 | Form the news dict, whose key is news id and value is news title and content. 160 | 161 | Returns: 162 | news_dict(Type: dictionary): key: news id, value: news title and content 163 | """ 164 | 165 | # read data from file 166 | f = open("user_click_data.txt", "r", 1, "utf-8") 167 | news_dict = {} 168 | while True: 169 | line = f.readline() 170 | if line: 171 | p = line.split('\t') 172 | news_dict.setdefault(int(p[1]), p[3] + p[4]) 173 | else: 174 | break 175 | f.close() 176 | 177 | return news_dict 178 | 179 | 180 | def get_user_clicked_news_dict(): 181 | """Form user clicked news dict 182 | 183 | Form the user clicked news dict, whose key is user id and value is news ids the user has clicked. 184 | 185 | Returns: 186 | user_clicked_news_dict(Type: dictionary): key: user id, value: news ids the user has clicked 187 | """ 188 | 189 | user_news_array, user_ids, news_ids = get_user_news_array() 190 | user_num = len(user_ids) 191 | user_clicked_news_dict = {} 192 | for i in range(user_num): 193 | clicked_news_ids = news_ids[user_news_array[i] == 1] 194 | user_clicked_news_dict.setdefault(user_ids[i], clicked_news_ids) 195 | 196 | return user_clicked_news_dict 197 | -------------------------------------------------------------------------------- /data_test.py: -------------------------------------------------------------------------------- 1 | #!python3 2 | """ 3 | Created on 2015-12-12 4 | @author: yuqiang 5 | Test of given data of news 6 | """ 7 | 8 | # read data from file 9 | f = open("user_click_data.txt", "r", 1, "utf-8") 10 | user_ids = [] 11 | news_ids = [] 12 | visit_times = [] 13 | news_titles = [] 14 | news_bodies = [] 15 | while True: 16 | line = f.readline() 17 | if line: 18 | p = line.split('\t') 19 | user_ids.append(int(p[0])) 20 | news_ids.append(int(p[1])) 21 | visit_times.append(int(p[2])) 22 | news_titles.append(p[3]) 23 | news_bodies.append(p[4]) 24 | else: 25 | break 26 | f.close() 27 | 28 | print("total lines: %d" % len(user_ids)) 29 | 30 | # remove duplicate to calculate number of user and news 31 | unique_user_ids = list(set(user_ids)) 32 | # unique_user_ids.sort(user_ids.index) 33 | unique_news_ids = list(set(news_ids)) 34 | 35 | print("") 36 | print("unique user number: %d" % len(unique_user_ids)) 37 | print("unique news number: %d" % len(unique_news_ids)) 38 | 39 | # calculate the threshold of visit times to divide data into a train set and a test set 40 | # visit_times.sort() 41 | time_start = min(visit_times) 42 | time_end = max(visit_times) 43 | time_threshold = time_start + (time_end - time_start) * 2 // 3 44 | train_set_indexes = [index for (index, value) in enumerate(visit_times) if value < time_threshold] 45 | test_set_indexes = [index for (index, value) in enumerate(visit_times) if value >= time_threshold] 46 | 47 | print("") 48 | print("start time: %d" % time_start) 49 | print("end time: %d" % time_end) 50 | print("threshold time: %d (early 2/3 of time period)" % time_threshold) 51 | print("train set size: %d (before threshold time)" % len(train_set_indexes)) 52 | print("test set size: %d (after threshold time)" % len(test_set_indexes)) 53 | -------------------------------------------------------------------------------- /nmf.py: -------------------------------------------------------------------------------- 1 | #!python3 2 | """ 3 | Created on 2015-12-12 4 | @author: yuqiang 5 | Modified from Chih-Jen Lin's NMF Algorithm 6 | http://www.csie.ntu.edu.tw/~cjlin/nmf/ 7 | 8 | # NMF by alternative non-negative least squares using projected gradients 9 | # Author: Chih-Jen Lin, National Taiwan University 10 | # Python/numpy translation: Anthony Di Franco 11 | """ 12 | 13 | from numpy import * 14 | from numpy.linalg import norm 15 | from time import time 16 | from sys import stdout 17 | 18 | 19 | def nmf(V, K, tol=0.0000000000001, timelimit=25, maxiter=8000): 20 | """ 21 | (W,H) = nmf(V,K,tol,timelimit,maxiter) 22 | W,H: output solution 23 | K: column number of W 24 | tol: tolerance for a relative stopping condition 25 | timelimit, maxiter: limit of time and iterations 26 | """ 27 | 28 | print('NMF started.') 29 | time_start = time() 30 | 31 | N = len(V) 32 | M = len(V[0]) 33 | W = random.rand(N, K) 34 | H = random.rand(K, M) 35 | initt = time() 36 | 37 | gradW = dot(W, dot(H, H.T)) - dot(V, H.T) 38 | gradH = dot(dot(W.T, W), H) - dot(W.T, V) 39 | initgrad = norm(r_[gradW, gradH.T]) 40 | # print('Init gradient norm %f' % initgrad) 41 | tolW = max(0.001,tol)*initgrad 42 | tolH = tolW 43 | 44 | for iter in range(1,maxiter): 45 | # stopping condition 46 | projnorm = norm(r_[gradW[logical_or(gradW<0, W>0)], 47 | gradH[logical_or(gradH<0, H>0)]]) 48 | if projnorm < tol*initgrad or time() - initt > timelimit: break 49 | 50 | (W, gradW, iterW) = nlssubprob(V.T,H.T,W.T,tolW,1000) 51 | W = W.T 52 | gradW = gradW.T 53 | 54 | if iterW==1: tolW = 0.1 * tolW 55 | 56 | (H,gradH,iterH) = nlssubprob(V,W,H,tolH,1000) 57 | if iterH==1: tolH = 0.1 * tolH 58 | 59 | if iter % 2 == 0: stdout.write('.'); stdout.flush() 60 | 61 | print('') 62 | 63 | # print('Iter = %d Final proj-grad norm %f' % (iter, projnorm)) 64 | 65 | time_end = time() 66 | print('NMF ended. %fs cost.' % (time_end - time_start)) 67 | 68 | return (W,H) 69 | 70 | 71 | def nlssubprob(V,W,Hinit,tol,maxiter): 72 | """ 73 | H, grad: output solution and gradient 74 | iter: #iterations used 75 | V, W: constant matrices 76 | Hinit: initial solution 77 | tol: stopping tolerance 78 | maxiter: limit of iterations 79 | """ 80 | 81 | H = Hinit 82 | WtV = dot(W.T, V) 83 | WtW = dot(W.T, W) 84 | 85 | alpha = 1; beta = 0.1; 86 | for iter in range(1, maxiter): 87 | grad = dot(WtW, H) - WtV 88 | projgrad = norm(grad[logical_or(grad < 0, H >0)]) 89 | if projgrad < tol: break 90 | 91 | # search step size 92 | for inner_iter in range(1,20): 93 | Hn = H - alpha*grad 94 | Hn = where(Hn > 0, Hn, 0) 95 | d = Hn-H 96 | gradd = sum(grad * d) 97 | dQd = sum(dot(WtW,d) * d) 98 | suff_decr = 0.99*gradd + 0.5*dQd < 0; 99 | if inner_iter == 1: 100 | decr_alpha = not suff_decr; Hp = H; 101 | if decr_alpha: 102 | if suff_decr: 103 | H = Hn; break; 104 | else: 105 | alpha = alpha * beta; 106 | else: 107 | if not suff_decr or (Hp == Hn).all(): 108 | H = Hp; break; 109 | else: 110 | alpha = alpha/beta; Hp = Hn; 111 | 112 | if iter == maxiter: 113 | print('Max iter in nlssubprob') 114 | return (H, grad, iter) -------------------------------------------------------------------------------- /nmf_test.py: -------------------------------------------------------------------------------- 1 | #!python3 2 | """ 3 | Created on 2015-12-12 4 | @author: yuqiang 5 | Test of module "nmf" 6 | """ 7 | 8 | import numpy 9 | import time 10 | import nmf 11 | 12 | V = [ 13 | [5, 3, 0, 1], 14 | [4, 0, 0, 1], 15 | [1, 1, 0, 5], 16 | [1, 0, 0, 4], 17 | [0, 1, 5, 4], 18 | ] 19 | V = [ 20 | [1, 1, 0, 1, 1], 21 | [1, 0, 0, 1, 0], 22 | [1, 1, 0, 1, 0], 23 | [1, 0, 0, 1, 0], 24 | [0, 1, 1, 1, 0], 25 | ] 26 | 27 | V = numpy.array(V) 28 | print("V = ") 29 | print(V) 30 | 31 | time_start = time.time() 32 | K = 2 33 | W, H = nmf.nmf(V, K) 34 | time_end = time.time() 35 | estimatedV = numpy.dot(W, H) 36 | print("W = ") 37 | print(W) 38 | print("H = ") 39 | print(H) 40 | print("estimatedV = ") 41 | print(estimatedV) 42 | print(time_end - time_start) 43 | -------------------------------------------------------------------------------- /nmf_user_based_cf.py: -------------------------------------------------------------------------------- 1 | #!python3 2 | """ 3 | Created on 2015-12-13 4 | @author: yuqiang 5 | NMF-User-based Collaborative Filtering 6 | """ 7 | 8 | import time 9 | import numpy 10 | import nmf 11 | import data 12 | import progress 13 | 14 | """ 15 | # get user-news array 16 | user_news_array, user_ids, news_ids = data.get_user_news_array() 17 | user_num = len(user_ids) 18 | news_num = len(news_ids) 19 | 20 | # NMF 21 | V = numpy.float16(user_news_array.T) 22 | N = len(V) 23 | M = len(V[0]) 24 | K = 15 # TODO refine this parameter 25 | W_init = numpy.random.rand(N, K) 26 | H_init = numpy.random.rand(K, M) 27 | W, H = nmf.nmf(V, W_init, H_init) 28 | del V, W_init, H_init, W, M, N 29 | # estimatedV = numpy.dot(W, H) 30 | 31 | # calculate similarity between users 32 | print("Calculate similarity between users started.") 33 | time_start = time.time() 34 | user_user_similarities = numpy.zeros((user_num, user_num), numpy.float16) 35 | H_norm = numpy.power(H, 2) 36 | H_norm = H_norm.sum(0) 37 | H_norm = numpy.sqrt(H_norm) # norm of each column vector in H 38 | H_norm = numpy.tile(H_norm, (K, 1)) 39 | H_normalized = H / H_norm 40 | H_normalized_transpose = H_normalized.transpose() 41 | computed_count = 0 42 | compute_step = 1000 # to avoid MemoryError, only compute a part each time 43 | while computed_count < user_num: 44 | compute_upper_limit = min((computed_count + compute_step, user_num)) 45 | user_user_similarities[computed_count:compute_upper_limit, :] = numpy.dot(H_normalized_transpose[computed_count:compute_upper_limit, :], H_normalized) 46 | computed_count += compute_step 47 | del H, H_norm, H_normalized, H_normalized_transpose, computed_count, compute_step 48 | time_end = time.time() 49 | print("Calculate similarity between users ended. %f s cost." % (time_end - time_start)) 50 | 51 | # find k nearest neighbors of users 52 | print("Find k nearest neighbors of users started.") 53 | time_start = time.time() 54 | neighbor_size = 20 # TODO refine this parameter 55 | user_neighbors_indexes = numpy.zeros((user_num, neighbor_size), numpy.int16) 56 | inverse_indexes = range(user_num-2, user_num-neighbor_size-2, -1) # choose the last k in the sorted list, remove the last one which is oneself 57 | for i in range(user_num): 58 | sorted_indexes = numpy.argsort(user_user_similarities[i, :]) 59 | user_neighbors_indexes[i, :] = sorted_indexes[inverse_indexes] 60 | if i % 100 == 0: 61 | print("%.1f%%" % (i / user_num * 100)) 62 | del inverse_indexes 63 | time_end = time.time() 64 | print("Find k nearest neighbors of users ended. %f s cost." % (time_end - time_start)) 65 | 66 | # predict ratings 67 | print("Predict ratings started.") 68 | time_start = time.time() 69 | user_news_array = numpy.int8(user_news_array) # int is faster than bool_ 70 | user_news_predict_array = numpy.zeros((user_num, news_num), numpy.float16) 71 | eps = numpy.finfo(float).eps 72 | for i in range(user_num): 73 | similarities_sum = 0.0 74 | for user_neighbors_index in user_neighbors_indexes[i]: 75 | user_news_predict_array[i] += user_news_array[user_neighbors_index] * user_user_similarities[user_neighbors_index, i] 76 | similarities_sum += user_user_similarities[user_neighbors_index, i] 77 | user_news_predict_array[i] /= (similarities_sum + eps) 78 | if i % 100 == 0: 79 | print("%.1f%%" % (i / user_num * 100)) 80 | user_news_predict_array[user_news_array == 1] = 0 # remove news one user has clicked 81 | del eps 82 | time_end = time.time() 83 | print("Predict ratings ended. %f s cost." % (time_end - time_start)) 84 | 85 | # choose first k news to recommend to users 86 | print("Choose first k news to recommend to users started.") 87 | time_start = time.time() 88 | recommend_size = 10 # TODO refine this parameter 89 | user_news_recommend_indexes = numpy.zeros((user_num, recommend_size), numpy.int16) 90 | inverse_indexes = range(news_num-1, news_num-recommend_size-1, -1) # choose the last k in the sorted list 91 | for i in range(user_num): 92 | sorted_indexes = numpy.argsort(user_news_predict_array[i, :]) 93 | user_news_recommend_indexes[i, :] = sorted_indexes[inverse_indexes] 94 | if i % 100 == 0: 95 | print("%.1f%%" % (i / user_num * 100)) 96 | del inverse_indexes 97 | time_end = time.time() 98 | print("Choose first k news to recommend to users ended. %f s cost." % (time_end - time_start)) 99 | 100 | print("") 101 | """ 102 | 103 | 104 | def train(): 105 | """Train with the train set 106 | 107 | Train with the train set, and return the user_user_similarities array. 108 | 109 | Returns: 110 | user_user_similarities(Type: numpy.ndarray): The similarities between user and user(user_user_similarities[i, j] represents the similarity between user "i" and user "j". Similarity of oneself is "1".) 111 | user_ids(Type: numpy.ndarray(vector)): user's ids(from small to large) associated with the array 112 | """ 113 | 114 | # get user-news array 115 | user_news_array_of_train = numpy.load("user_news_array_of_train.npy") 116 | user_ids_of_train = numpy.load("user_ids_of_train.npy") 117 | user_news_array = user_news_array_of_train 118 | user_ids = user_ids_of_train 119 | user_num = len(user_ids) 120 | del user_news_array_of_train, user_ids_of_train 121 | 122 | # NMF 123 | V = numpy.float16(user_news_array.T) 124 | K = 15 # TODO refine this parameter 125 | W, H = nmf.nmf(V, K) 126 | del V, W 127 | # estimatedV = numpy.dot(W, H) 128 | 129 | # calculate similarity between users 130 | print("Calculate similarity between users started.") 131 | time_start = time.time() 132 | user_user_similarities = numpy.zeros((user_num, user_num), numpy.float16) 133 | H_norm = numpy.power(H, 2) 134 | H_norm = H_norm.sum(0) 135 | H_norm = numpy.sqrt(H_norm) # norm of each column vector in H 136 | H_norm = numpy.tile(H_norm, (K, 1)) 137 | eps = numpy.finfo(float).eps 138 | H_normalized = H / (H_norm + eps) 139 | H_normalized_transpose = H_normalized.transpose() 140 | computed_count = 0 141 | compute_step = 1000 # to avoid MemoryError, only compute a part each time 142 | while computed_count < user_num: 143 | compute_upper_limit = min((computed_count + compute_step, user_num)) 144 | user_user_similarities[computed_count:compute_upper_limit, :] = numpy.dot(H_normalized_transpose[computed_count:compute_upper_limit, :], H_normalized) 145 | computed_count += compute_step 146 | del H, H_norm, H_normalized, H_normalized_transpose, computed_count, compute_step 147 | time_end = time.time() 148 | print("Calculate similarity between users ended. %f s cost." % (time_end - time_start)) 149 | 150 | print("[NMF-User-based Collaborative Filtering] Train finished!") 151 | 152 | return user_user_similarities, user_ids 153 | 154 | 155 | def recommend(user_user_similarities, user_ids): 156 | """Recommend with the test set 157 | 158 | Recommend with the test set, and return the user_news_rating_predictions array. 159 | 160 | Args: 161 | user_user_similarities(Type: numpy.ndarray): The similarities between user and user(user_user_similarities[i, j] represents the similarity between user "i" and user "j". Similarity of oneself is "1".) 162 | user_ids(Type: numpy.ndarray(vector)): user's ids(from small to large) associated with the array 163 | 164 | Returns: 165 | user_news_rating_predictions(Type: numpy.ndarray): The rating prediction of each user to each news(rating_prediction[i, j] represents the rating prediction of user "i" to news "j".) 166 | user_ids(Type: numpy.ndarray(vector)): user's ids(from small to large) associated with the array 167 | news_ids(Type: numpy.ndarray(vector)): news's ids(from small to large) associated with the array 168 | """ 169 | 170 | # get user-news array 171 | user_news_array_of_train = numpy.load("user_news_array_of_train.npy") 172 | user_ids_of_train = numpy.load("user_ids_of_train.npy") 173 | news_ids_of_train = numpy.load("news_ids_of_train.npy") 174 | user_news_array_of_test = numpy.load("user_news_array_of_test.npy") 175 | user_ids_of_test = numpy.load("user_ids_of_test.npy") 176 | news_ids_of_test = numpy.load("news_ids_of_test.npy") 177 | user_num_of_train = len(user_ids_of_train) 178 | news_num_of_train = len(news_ids_of_train) 179 | user_num_of_test = len(user_ids_of_test) 180 | news_num_of_test = len(news_ids_of_test) 181 | 182 | # find k nearest neighbors of users 183 | print("Find k nearest neighbors of users started.") 184 | time_start = time.time() 185 | neighbor_size = min(20, user_num_of_train-1) # TODO refine this parameter 186 | user_neighbors_indexes = numpy.zeros((user_num_of_test, neighbor_size), numpy.int16) 187 | user_ids_of_train_dict = {user_id: index for (index, user_id) in enumerate(user_ids_of_train)} 188 | user_index_from_test_to_train_dict = {user_index_in_test: user_ids_of_train_dict[user_id_of_test] for (user_index_in_test, user_id_of_test) in enumerate(user_ids_of_test)} # dictionary of user index from test to train 189 | for i in range(user_num_of_test): 190 | sorted_indexes = numpy.argsort(-user_user_similarities[user_index_from_test_to_train_dict[i], :]) 191 | user_neighbors_indexes[i, :] = sorted_indexes[1:neighbor_size+1] # choose the first k in the sorted list, remove the first one which is oneself 192 | if i % 100 == 0: 193 | # print("%.1f%%" % (i / user_num_of_test * 100)) 194 | progress.update(i / user_num_of_test) 195 | progress.update(1) 196 | time_end = time.time() 197 | print("Find k nearest neighbors of users ended. %f s cost." % (time_end - time_start)) 198 | 199 | # predict ratings 200 | print("Predict ratings started.") 201 | time_start = time.time() 202 | news_ids_of_test_dict = {news_id: index for (index, news_id) in enumerate(news_ids_of_test)} 203 | news_index_from_train_to_test_dict = {news_index_in_train: news_ids_of_test_dict[news_id_of_train] for (news_index_in_train, news_id_of_train) in enumerate(news_ids_of_train)} # dictionary of news index from train to test 204 | user_news_array_of_train_expanded = numpy.zeros((user_num_of_train, news_num_of_test), numpy.int8) # int is faster than bool_ 205 | user_news_array_of_train_expanded[:, [news_index_from_train_to_test_dict[i] for i in range(news_num_of_train)]] = user_news_array_of_train # expand the column of user_news_array_of_train to the same size as user_news_array_of_test 206 | user_news_rating_predictions = numpy.zeros((user_num_of_test, news_num_of_test), numpy.float16) 207 | eps = numpy.finfo(float).eps 208 | for i in range(user_num_of_test): 209 | this_user_index_in_test = user_index_from_test_to_train_dict[i] 210 | similarities_sum = 0.0 211 | for user_neighbors_index in user_neighbors_indexes[i]: 212 | user_news_rating_predictions[i] += user_news_array_of_train_expanded[user_neighbors_index] * user_user_similarities[user_neighbors_index, this_user_index_in_test] 213 | similarities_sum += user_user_similarities[user_neighbors_index, this_user_index_in_test] 214 | user_news_rating_predictions[i] /= (similarities_sum + eps) 215 | if i % 100 == 0: 216 | # print("%.1f%%" % (i / user_num_of_test * 100)) 217 | progress.update(i / user_num_of_test) 218 | progress.update(1) 219 | user_news_rating_predictions[user_news_array_of_train_expanded[[user_index_from_test_to_train_dict[i] for i in range(user_num_of_test)], :] == 1] = 0 # remove news one user has clicked 220 | time_end = time.time() 221 | print("Predict ratings ended. %f s cost." % (time_end - time_start)) 222 | 223 | print("[NMF-User-based Collaborative Filtering] Recommend finished!") 224 | 225 | return user_news_rating_predictions, user_ids_of_test, news_ids_of_test 226 | 227 | 228 | user_user_similarities, user_ids_of_train = train() 229 | user_news_rating_predictions, user_ids_of_test, news_ids_of_test = recommend(user_user_similarities, user_ids_of_train) 230 | user_news_array_of_test = numpy.load("user_news_array_of_test.npy") 231 | import scipy.io 232 | scipy.io.savemat("data_for_evaluation.mat", 233 | {"user_news_rating_predictions": user_news_rating_predictions, 234 | "user_ids": user_ids_of_test, 235 | "news_ids": news_ids_of_test, 236 | "user_news_array_of_test": user_news_array_of_test}) 237 | -------------------------------------------------------------------------------- /nmf_user_based_cf_evaluation.py: -------------------------------------------------------------------------------- 1 | #!python3 2 | """ 3 | Created on 2015-12-24 4 | @author: yuqiang 5 | Evaluation of NMF-User-based Collaborative Filtering 6 | """ 7 | 8 | import numpy 9 | import data 10 | 11 | user_news_array, user_ids, news_ids = data.get_user_news_array() 12 | user_num = len(user_ids) 13 | news_num = len(news_ids) 14 | news_dict = data.get_news_dict() 15 | user_news_recommend_indexes = numpy.load("user_news_recommend_indexes.npy") 16 | 17 | for i in range(user_num): 18 | clicked_news_ids = news_ids[user_news_array[i] == 1] 19 | recommend_news_ids = news_ids[user_news_recommend_indexes[i]] 20 | clicked_news_contents = [news_dict[key] for key in clicked_news_ids] 21 | recommend_news_contents = [news_dict[key] for key in recommend_news_ids] 22 | 23 | print("") 24 | 25 | -------------------------------------------------------------------------------- /progress.py: -------------------------------------------------------------------------------- 1 | #!python3 2 | """ 3 | Created on 2015-12-27 4 | @author: yuqiang 5 | Tool for showing progress in console 6 | """ 7 | 8 | import sys 9 | 10 | 11 | def update(percent): 12 | """Update and show the progress in console 13 | 14 | Update and show the progress in console. 15 | Remember to call "update(1)" when all progress finished! 16 | 17 | Args: 18 | percent(Type: float): percent(0~1) of the part finished 19 | """ 20 | 21 | if percent < 0 or percent > 1: 22 | print("Error input of progress.update()!") 23 | return 24 | 25 | bar_length = 20 26 | hashes = '#' * int(percent * bar_length) 27 | spaces = ' ' * (bar_length - len(hashes)) 28 | sys.stdout.write("\rPercent: [%s] %d%%" % (hashes + spaces, percent * 100)) 29 | if percent == 1: 30 | sys.stdout.write("\n") 31 | sys.stdout.flush() 32 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | #!python3 2 | """ 3 | test 4 | """ 5 | 6 | 7 | ''' 8 | a = [2, 3, 3, 1, 1, 4, 5] 9 | b = [x for x in a if x < 3] 10 | inds = [i for (i, val) in enumerate(a) if val < 3] 11 | c = [(i, val) for (i, val) in enumerate(a)] 12 | c.sort(key=lambda x: x[1]) 13 | ''' 14 | 15 | ''' 16 | import time 17 | def linecount_1( ): 18 | return len(open("user_click_data.txt", "r", 1, "utf-8").readlines( )) 19 | def linecount_2( ): 20 | count = -1 21 | for count, line in enumerate(open("user_click_data.txt", "r", 1, "utf-8")): pass 22 | return count+1 23 | def linecount_3( ): 24 | count = 0 25 | thefile = open("user_click_data.txt", "r", 1, "utf-8") 26 | while True: 27 | buffer = thefile.read(65536) 28 | if not buffer: break 29 | count += buffer.count('\n') 30 | return count 31 | time_start = time.time() 32 | for i in list(range(10)): 33 | linecount_1() 34 | time_end = time.time() 35 | print("linecount_1: %f count=%d" % (time_end - time_start, linecount_1())) 36 | time_start = time.time() 37 | for i in list(range(10)): 38 | linecount_2() 39 | time_end = time.time() 40 | print("linecount_2: %f count=%d" % (time_end - time_start, linecount_2())) 41 | time_start = time.time() 42 | for i in list(range(10)): 43 | linecount_3() 44 | time_end = time.time() 45 | print("linecount_3: %f count=%d" % (time_end - time_start, linecount_3())) 46 | ''' 47 | 48 | """ 49 | #!/usr/bin/env python 50 | # -*- coding: utf-8 -*- 51 | import os 52 | import codecs 53 | import sys 54 | # reload(sys) 55 | # sys.setdefaultencoding('utf-8') 56 | 57 | user_ids = [] 58 | news_ids = [] 59 | time = [] 60 | news_title = [] 61 | news_body = [] 62 | 63 | def get_filedata(filename): 64 | try: 65 | with open(filename, "r", 1, "utf-8") as f: #with sentence open and close file automatically 66 | data = f.readline() 67 | print(data) 68 | #print data.split('\t') 69 | sp = data.split('\t') 70 | user_ids.append(sp[0]) 71 | news_ids.append(sp[1]) 72 | time.append(sp[2]) 73 | news_title.append(sp[3]) 74 | print(news_title[-1]) 75 | news_body.append(sp[4]) 76 | print(news_body[-1]) 77 | except IOError as ioerr: 78 | print('File Error' + str(ioerr)) #print the error 79 | return None 80 | 81 | get_filedata("user_click_data.txt") 82 | """ 83 | 84 | """ 85 | import time 86 | import sys 87 | 88 | time_start = time.time() 89 | for i in list(range(100)): 90 | print(".", end="") 91 | time_end = time.time() 92 | print(time_end - time_start) 93 | 94 | time_start = time.time() 95 | for i in list(range(100)): 96 | sys.stdout.write('.'); sys.stdout.flush() 97 | time_end = time.time() 98 | print(time_end - time_start) 99 | """ 100 | 101 | """ 102 | import numpy 103 | a = numpy.zeros((2, 2)) 104 | a.dump("a.numpydumpedarray") 105 | """ 106 | 107 | """ 108 | import numpy 109 | a = numpy.array([[1,2],[3,4],[2,4],[3,2],[2,4]]) 110 | b = a.tolist() 111 | b_key = [x for [x, y] in b] 112 | b_key_dict = {value: index for (index, value) in enumerate(b_key)} 113 | unique_ids = b_key_dict.values() 114 | c = [b[index] for index in unique_ids] 115 | """ 116 | 117 | """ 118 | # -*- coding:utf-8 -*- 119 | #!python3 120 | 121 | import math 122 | import numpy as np 123 | import os.path 124 | import time 125 | 126 | time_start = time.time() 127 | dim = 9 128 | def file_news_id(filename): 129 | ''' 130 | filename: 131 | ''' 132 | fr = open(filename) 133 | train_news = [] 134 | 135 | while True: 136 | line = fr.readline() 137 | if line: 138 | p = line.split('\n') 139 | train_news.append(int(p[0])) 140 | else: 141 | break 142 | 143 | 144 | return train_news 145 | 146 | def file_news_wordlist(filename): 147 | ''' 148 | filename: 149 | ''' 150 | fr = open(filename,'r', 1, "utf-8") 151 | wordlist = [] 152 | 153 | while True: 154 | line = fr.readline() 155 | if line: 156 | p = line.split('\n') 157 | wordlist.append(p[0]) 158 | else: 159 | break 160 | 161 | 162 | return wordlist 163 | 164 | s = os.getcwd() 165 | train_news_id = file_news_id('train_id.txt') 166 | train_wordlist = file_news_wordlist('frequence_word_use.txt') 167 | time_end = time.time() 168 | print(time_end - time_start) #prepare labels return 169 | """ 170 | 171 | """ 172 | import numpy 173 | eps = numpy.finfo(float).eps 174 | """ 175 | 176 | """ 177 | import numpy 178 | a = numpy.array([[0,1,1,0], [1,1,0,1]]) 179 | b = numpy.zeros((2, 4)) 180 | b[a==1] = 1 181 | """ 182 | 183 | """ 184 | import numpy 185 | a = numpy.array([[0,1,1,0], [1,1,0,1]]) 186 | b = numpy.zeros((2, 4)) 187 | c = a[0] * 0.5 + b[1] 188 | """ 189 | 190 | """ 191 | import numpy 192 | a = numpy.array([0,2,3,2,1]) 193 | index = numpy.argsort(a) 194 | """ 195 | 196 | """ 197 | import numpy 198 | a = numpy.array([1,2,3,4,5]) 199 | inverse_indexes = range(4, -1, -1) 200 | b = a[inverse_indexes] 201 | """ 202 | 203 | """ 204 | import numpy 205 | a = numpy.array([1,2,3]) 206 | b = numpy.array([4,5,6]) 207 | c = numpy.row_stack((a, b)) 208 | d = numpy.column_stack((a, b)) 209 | """ 210 | 211 | """ 212 | # calculate similarity between users 213 | print("Calculate similarity between users started.") 214 | time_start = time.time() 215 | user_user_similarities = numpy.zeros((user_num, user_num), numpy.float16) 216 | # progress_count = 0 217 | # for i in range(0, user_num): 218 | # for j in range(i + 1, user_num): # similarity to oneself is set to "0" 219 | # similarity = H[:, i].dot(H[:, j]) 220 | # user_user_similarities[i, j] = similarity 221 | # user_user_similarities[j, i] = similarity 222 | # # progress_count += 1 223 | # # if progress_count % 100000 == 0: 224 | # # print("%f %%" % (progress_count*2 / (user_num*(user_num-1)) * 100)) 225 | # # # sys.stdout.write("\r%f %%\r" % (progress_count*2 / (user_num*(user_num-1)) * 100)); sys.stdout.flush() 226 | # if i % 10 == 0: 227 | # print("%f%%. %f s elapsed." % (i / user_num * 100, time.time() - time_start)) 228 | H_transpose = H.transpose() 229 | computed_count = 0 230 | compute_step = 1000 # to avoid MemoryError, only compute a part each time 231 | while computed_count < user_num: 232 | compute_upper_limit = min((computed_count + compute_step, user_num)) 233 | user_user_similarities[computed_count:compute_upper_limit, :] = numpy.dot(H_transpose[computed_count:compute_upper_limit, :], H) 234 | computed_count += compute_step 235 | time_end = time.time() 236 | print("Calculate similarity between users ended. %f s cost." % (time_end - time_start)) 237 | # numpy.save("user_user_similarities.npy", user_user_similarities) 238 | # user_user_similarities = numpy.load("user_user_similarities.npy") 239 | """ 240 | 241 | """ 242 | import numpy 243 | a = numpy.eye(3, dtype=numpy.bool_) 244 | b = numpy.int16(a) 245 | c = a[0] * 3 246 | b[a == 1] = 2 247 | """ 248 | 249 | """ 250 | import numpy 251 | import scipy.io 252 | user_news_recommend_indexes = numpy.load("user_news_recommend_indexes.npy") 253 | scipy.io.savemat("user_news_recommend_indexes.mat", {"user_news_recommend_indexes": user_news_recommend_indexes}) 254 | # user_ids = numpy.load("user_ids.npy") 255 | # scipy.io.savemat("user_ids.mat", {"user_ids": user_ids}) 256 | # news_ids = numpy.load("news_ids.npy") 257 | # scipy.io.savemat("news_ids.mat", {"news_ids": news_ids}) 258 | """ 259 | 260 | """ 261 | import numpy 262 | a = numpy.array([[1,2,3], [1], [1,2,3,4]]) 263 | b = numpy.array([[1,2,3,5], [1,6,7,8], [1,2,3,4]]) 264 | """ 265 | 266 | """ 267 | import data 268 | # news_dict = data.get_news_dict() 269 | # user_clicked_news_dict = data.get_user_clicked_news_dict() 270 | user_news_array_for_train, user_ids_for_train, news_ids_for_train, user_news_array_for_test, user_ids_for_test, news_ids_for_test = data.get_user_news_arrays_of_train_and_test() 271 | """ 272 | 273 | """ 274 | a = list(range(5)) + list(range(5)) 275 | for b in a: 276 | if b % 2 == 0: 277 | a.remove(b) 278 | 279 | title_cuts_without_stop_word = [word for word in title_cuts if word not in stopkeys] 280 | """ 281 | 282 | """ 283 | import numpy 284 | a = numpy.array([[1,2], [3,4]]) 285 | # a.tofile("a.bin") 286 | # a.tofile("a.bin", sep=" ") 287 | numpy.savetxt("a.txt", a, fmt='%d', delimiter='\t') 288 | b = numpy.loadtxt("a.txt", dtype=numpy.int16, delimiter='\t') 289 | a = numpy.array([[0.1,2.2], [3.3,4]]) 290 | numpy.savetxt("a.txt", a, fmt='%.6f', delimiter='\t') 291 | b = numpy.loadtxt("a.txt", dtype=numpy.float32, delimiter='\t') 292 | """ 293 | 294 | """ 295 | import data 296 | user_news_array_for_train, user_ids_for_train, news_ids_for_train, user_news_array_for_test, user_ids_for_test, news_ids_for_test = data.get_user_news_arrays_of_train_and_test(True, True) 297 | """ 298 | 299 | """ 300 | import numpy 301 | a = numpy.array([[1, 2], [3, 4]]) 302 | b = a.transpose() 303 | """ 304 | 305 | """ 306 | import sys 307 | import time 308 | bar_length = 20 309 | for percent in range(0, 101): 310 | hashes = '#' * int(percent/100.0 * bar_length) 311 | spaces = ' ' * (bar_length - len(hashes)) 312 | sys.stdout.write("\rPercent: [%s] %d%%"%(hashes + spaces, percent)) 313 | sys.stdout.flush() 314 | time.sleep(0.1) 315 | """ 316 | 317 | print("") 318 | --------------------------------------------------------------------------------