├── .gitignore
├── __init__.py
├── data.py
├── data_test.py
├── nmf.py
├── nmf_test.py
├── nmf_user_based_cf.py
├── nmf_user_based_cf_evaluation.py
├── progress.py
└── test.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | ## Ignore PyCharm temporary files, build results, and data files
 2 | 
 3 | # PyCharm
 4 | /.idea/
 5 | /__pycache__/
 6 | 
 7 | # Data
 8 | /data/
 9 | # "data/" matches every directory whose name is "data", but "/data/" matches only the directory named "data" in the root directory
10 | *.npy
11 | *.txt
12 | *.mat
13 | 


--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | #!python3
2 | """
3 | Created on 2015-12-12
4 | @author: yuqiang
5 | Main entry point of the program
6 | """
7 | 
8 | 


--------------------------------------------------------------------------------
/data.py:
--------------------------------------------------------------------------------
  1 | #!python3
  2 | """
  3 | Created on 2015-12-13
  4 | @author: yuqiang
  5 | Read the given data and reshape it into a usable format
  6 | """
  7 | 
  8 | import time
  9 | import numpy
 10 | 
 11 | 
 12 | def _form_user_news_array_from_list(user_news_pairs):
 13 |     """Form user-news array from list(private method)
 14 | 
 15 |     Form the user-news array from given list, whose rows represent users and cols represent newses.
 16 |     The meaning of A[i, j] in the result array is whether user "i" clicked news "j"(if true, 1, else, 0).
 17 | 
 18 |     Args:
 19 |         user_news_pairs(Type: list[(user_id, news_id)]): list of user-news pair which represents the user clicked the news
 20 | 
 21 |     Returns:
 22 |         user_news_array(Type: numpy.ndarray): The result user-news array(rows for users and cols for newses)
 23 |         user_ids(Type: numpy.ndarray(vector)): user's ids(from small to large) associated with the array
 24 |         news_ids(Type: numpy.ndarray(vector)): news's ids(from small to large) associated with the array
 25 |     """
 26 | 
 27 |     user_ids = [user_id for (user_id, news_id) in user_news_pairs]
 28 |     news_ids = [news_id for (user_id, news_id) in user_news_pairs]
 29 | 
 30 |     # unique and sort user and news
 31 |     user_ids = numpy.array(list(set(user_ids)))
 32 |     user_ids.sort()
 33 |     news_ids = numpy.array(list(set(news_ids)))
 34 |     news_ids.sort()
 35 | 
 36 |     # form the array
 37 |     user_num = len(user_ids)
 38 |     news_num = len(news_ids)
 39 |     user_ids_dict = {value: index for (index, value) in enumerate(user_ids)}
 40 |     news_ids_dict = {value: index for (index, value) in enumerate(news_ids)}
 41 |     user_news_array = numpy.zeros((user_num, news_num), numpy.bool_)
 42 |     for (user_id, news_id) in user_news_pairs:
 43 |         user_news_array[user_ids_dict[user_id], news_ids_dict[news_id]] = 1
 44 | 
 45 |     return user_news_array, user_ids, news_ids
 46 | 
 47 | 
 48 | def get_user_news_array():
 49 |     """Form user-news array
 50 | 
 51 |     Form the user-news array, whose rows represent users and cols represent newses.
 52 |     The meaning of A[i, j] in the result array is whether user "i" clicked news "j"(if true, 1, else, 0).
 53 | 
 54 |     Returns:
 55 |         user_news_array(Type: numpy.ndarray): The result user-news array(rows for users and cols for newses)
 56 |         user_ids(Type: numpy.ndarray(vector)): user's ids(from small to large) associated with the array
 57 |         news_ids(Type: numpy.ndarray(vector)): news's ids(from small to large) associated with the array
 58 |     """
 59 | 
 60 |     # read data from file
 61 |     f = open("user_click_data.txt", "r", 1, "utf-8")
 62 |     user_news_pairs = []
 63 |     while True:
 64 |         line = f.readline()
 65 |         if line:
 66 |             p = line.split('\t')
 67 |             user_news_pairs.append((int(p[0]), int(p[1])))
 68 |         else:
 69 |             break
 70 |     f.close()
 71 | 
 72 |     return _form_user_news_array_from_list(user_news_pairs)
 73 | 
 74 | 
 75 | def get_user_news_arrays_of_train_and_test(remove_new_users_in_test=False, remove_new_newses_in_test=False):
 76 |     """Form user-news arrays of train and test
 77 | 
 78 |     Form the user-news array of train and test, whose rows represent users and cols represent newses.
 79 |     The meaning of A[i, j] in the result array is whether user "i" clicked news "j"(if true, 1, else, 0).
 80 | 
 81 |     Args:
 82 |         remove_new_users_in_test(Type: bool): whether to remove new users in test
 83 |         remove_new_newses_in_test(Type: bool): whether to remove new newses in test
 84 | 
 85 |     Returns:
 86 |         user_news_array_of_train(Type: numpy.ndarray): The result user-news array of train(rows for users and cols for newses)
 87 |         user_ids_of_train(Type: numpy.ndarray(vector)): user's ids(from small to large) associated with the user-news array of train
 88 |         news_ids_of_train(Type: numpy.ndarray(vector)): news's ids(from small to large) associated with the user-news array of train
 89 |         user_news_array_of_test(Type: numpy.ndarray): The result user-news array of test(rows for users and cols for newses)
 90 |         user_ids_of_test(Type: numpy.ndarray(vector)): user's ids(from small to large) associated with the user-news array of test
 91 |         news_ids_of_test(Type: numpy.ndarray(vector)): news's ids(from small to large) associated with the user-news array of test
 92 |     """
 93 | 
 94 |     # read data from file
 95 |     f = open("user_click_data.txt", "r", 1, "utf-8")
 96 |     user_news_time_list = []
 97 |     while True:
 98 |         line = f.readline()
 99 |         if line:
100 |             p = line.split('\t')
101 |             user_news_time_list.append((int(p[0]), int(p[1]), int(p[2])))
102 |         else:
103 |             break
104 |     f.close()
105 | 
106 |     # calculate the threshold of visit times to divide data into a train set and a test set
107 |     click_times = [click_time for (user_id, news_id, click_time) in user_news_time_list]
108 |     time_start = min(click_times)
109 |     time_end = max(click_times)
110 |     time_threshold = time_start + (time_end - time_start) * 2 // 3
111 | 
112 |     # divide data into train set and test set
113 |     user_news_pairs_of_all = [(user_id, news_id) for (user_id, news_id, click_time) in user_news_time_list]
114 |     user_news_pairs_of_train = [(user_id, news_id) for (user_id, news_id, click_time) in user_news_time_list if click_time < time_threshold]
115 |     user_news_pairs_of_test = [(user_id, news_id) for (user_id, news_id, click_time) in user_news_time_list if click_time >= time_threshold]
116 | 
117 |     # form the train array
118 |     user_news_array_of_all, user_ids_of_all, news_ids_of_all = _form_user_news_array_from_list(user_news_pairs_of_all)
119 |     user_news_array_of_train, user_ids_of_train, news_ids_of_train = _form_user_news_array_from_list(user_news_pairs_of_train)
120 |     # user_news_array_of_test, user_ids_of_test, news_ids_of_test = _form_user_news_array_from_list(user_news_pairs_of_test)
121 | 
122 |     # form the test array(remove new users or new newses)
123 |     user_ids_of_all_dict = {user_id: index for (index, user_id) in enumerate(user_ids_of_all)}
124 |     news_ids_of_all_dict = {news_id: index for (index, news_id) in enumerate(news_ids_of_all)}
125 |     train_samples = numpy.array([[user_ids_of_all_dict[user_id], news_ids_of_all_dict[news_id]] for (user_id, news_id) in user_news_pairs_of_train]).transpose()  # coordinates of train samples in user_news_array_of_all
126 |     user_news_array_of_test = user_news_array_of_all.copy()
127 |     user_news_array_of_test[train_samples[0, :], train_samples[1, :]] = 0  # remove train samples in user_news_array_of_all
128 |     user_indices_of_train_in_all = numpy.array([index for (index, user_id) in enumerate(user_ids_of_all) if user_id in user_ids_of_train])  # indices of users in user_ids_of_all who appeared in train set
129 |     user_indices_of_train_in_all.reshape((-1, 1))  # reshape to column vector
130 |     news_indices_of_train_in_all = numpy.array([index for (index, news_id) in enumerate(news_ids_of_all) if news_id in news_ids_of_train])  # indices of newses in news_ids_of_all who appeared in train set
131 |     news_indices_of_train_in_all.reshape((1, -1))  # reshape to row vector
132 |     user_num = len(user_ids_of_all)
133 |     news_num = len(news_ids_of_all)
134 |     user_news_array_of_test = user_news_array_of_test[user_indices_of_train_in_all if remove_new_users_in_test else range(user_num), :]  # remove new users
135 |     user_news_array_of_test = user_news_array_of_test[:, news_indices_of_train_in_all if remove_new_newses_in_test else range(news_num)]  # remove new newses
136 |     user_ids_of_test = user_ids_of_all[user_indices_of_train_in_all if remove_new_users_in_test else range(user_num)]  # remove new users
137 |     news_ids_of_test = news_ids_of_all[news_indices_of_train_in_all if remove_new_newses_in_test else range(news_num)]  # remove new newses
138 | 
139 |     # remove users who don't appear in test set
140 |     hold_indices = [index for (index, user_id) in enumerate(user_ids_of_test) if user_id in [user_id for (user_id, news_id) in user_news_pairs_of_test]]
141 |     if len(hold_indices) > 0:
142 |         user_news_array_of_test = user_news_array_of_test[hold_indices, :]
143 |         user_ids_of_test = user_ids_of_test[hold_indices]
144 | 
145 |     # write data to files
146 |     numpy.save("user_news_array_of_train.npy", user_news_array_of_train)
147 |     numpy.save("user_ids_of_train.npy", user_ids_of_train)
148 |     numpy.save("news_ids_of_train.npy", news_ids_of_train)
149 |     numpy.save("user_news_array_of_test.npy", user_news_array_of_test)
150 |     numpy.save("user_ids_of_test.npy", user_ids_of_test)
151 |     numpy.save("news_ids_of_test.npy", news_ids_of_test)
152 | 
153 |     return user_news_array_of_train, user_ids_of_train, news_ids_of_train, user_news_array_of_test, user_ids_of_test, news_ids_of_test
154 | 
155 | 
def get_news_dict():
    """Form news dict

    Form the news dict from "user_click_data.txt" (tab separated). The key is the news id
    (second field, parsed as integer) and the value is the fourth and fifth fields joined
    (news title and content; the line ending read from the file stays part of the value).
    When a news id occurs on several lines, the value of its first line is kept.

    Returns:
        news_dict(Type: dictionary): key: news id, value: news title and content

    Raises:
        OSError: if "user_click_data.txt" cannot be opened
        ValueError, IndexError: if a non-blank line has fewer than five fields or a non-integer news id
    """

    # read data from file; the "with" block closes the file even when a line fails to parse
    news_dict = {}
    with open("user_click_data.txt", "r", 1, "utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines, e.g. an empty last line
            p = line.split('\t')
            news_dict.setdefault(int(p[1]), p[3] + p[4])

    return news_dict
178 | 
179 | 
def get_user_clicked_news_dict():
    """Form user clicked news dict

    Map every user id returned by get_user_news_array() to the ids of the newses
    whose entry in that user's row of the user-news array equals 1.

    Returns:
        user_clicked_news_dict(Type: dictionary): key: user id, value: news ids the user has clicked
    """

    click_matrix, user_ids, news_ids = get_user_news_array()

    user_clicked_news_dict = {}
    for (row, user_id) in enumerate(user_ids):
        clicked_mask = click_matrix[row] == 1  # cols clicked by this user
        user_clicked_news_dict.setdefault(user_id, news_ids[clicked_mask])

    return user_clicked_news_dict
197 | 


--------------------------------------------------------------------------------
/data_test.py:
--------------------------------------------------------------------------------
 1 | #!python3
 2 | """
 3 | Created on 2015-12-12
 4 | @author: yuqiang
 5 | Test of given data of news
 6 | """
 7 | 
 8 | # read data from file
 9 | f = open("user_click_data.txt", "r", 1, "utf-8")
10 | user_ids = []
11 | news_ids = []
12 | visit_times = []
13 | news_titles = []
14 | news_bodies = []
15 | while True:
16 |     line = f.readline()
17 |     if line:
18 |         p = line.split('\t')
19 |         user_ids.append(int(p[0]))
20 |         news_ids.append(int(p[1]))
21 |         visit_times.append(int(p[2]))
22 |         news_titles.append(p[3])
23 |         news_bodies.append(p[4])
24 |     else:
25 |         break
26 | f.close()
27 | 
28 | print("total lines: %d" % len(user_ids))
29 | 
30 | # remove duplicate to calculate number of user and news
31 | unique_user_ids = list(set(user_ids))
32 | # unique_user_ids.sort(user_ids.index)
33 | unique_news_ids = list(set(news_ids))
34 | 
35 | print("")
36 | print("unique user number: %d" % len(unique_user_ids))
37 | print("unique news number: %d" % len(unique_news_ids))
38 | 
39 | # calculate the threshold of visit times to divide data into a train set and a test set
40 | # visit_times.sort()
41 | time_start = min(visit_times)
42 | time_end = max(visit_times)
43 | time_threshold = time_start + (time_end - time_start) * 2 // 3
44 | train_set_indexes = [index for (index, value) in enumerate(visit_times) if value < time_threshold]
45 | test_set_indexes = [index for (index, value) in enumerate(visit_times) if value >= time_threshold]
46 | 
47 | print("")
48 | print("start     time: %d" % time_start)
49 | print("end       time: %d" % time_end)
50 | print("threshold time: %d (early 2/3 of time period)" % time_threshold)
51 | print("train set size: %d (before threshold time)" % len(train_set_indexes))
52 | print("test  set size: %d (after  threshold time)" % len(test_set_indexes))
53 | 


--------------------------------------------------------------------------------
/nmf.py:
--------------------------------------------------------------------------------
  1 | #!python3
  2 | """
  3 | Created on 2015-12-12
  4 | @author: yuqiang
  5 | Modified from Chih-Jen Lin's NMF Algorithm
  6 | http://www.csie.ntu.edu.tw/~cjlin/nmf/
  7 | 
  8 | # NMF by alternative non-negative least squares using projected gradients
  9 | # Author: Chih-Jen Lin, National Taiwan University
 10 | # Python/numpy translation: Anthony Di Franco
 11 | """
 12 | 
 13 | from numpy import *
 14 | from numpy.linalg import norm
 15 | from time import time
 16 | from sys import stdout
 17 | 
 18 | 
 19 | def nmf(V, K, tol=0.0000000000001, timelimit=25, maxiter=8000):
 20 |  """
 21 |  (W,H) = nmf(V,K,tol,timelimit,maxiter)
 22 |  W,H: output solution
 23 |  K: column number of W
 24 |  tol: tolerance for a relative stopping condition
 25 |  timelimit, maxiter: limit of time and iterations
 26 |  """
 27 | 
 28 |  print('NMF started.')
 29 |  time_start = time()
 30 | 
 31 |  N = len(V)
 32 |  M = len(V[0])
 33 |  W = random.rand(N, K)
 34 |  H = random.rand(K, M)
 35 |  initt = time()
 36 | 
 37 |  gradW = dot(W, dot(H, H.T)) - dot(V, H.T)
 38 |  gradH = dot(dot(W.T, W), H) - dot(W.T, V)
 39 |  initgrad = norm(r_[gradW, gradH.T])
 40 |  # print('Init gradient norm %f' % initgrad)
 41 |  tolW = max(0.001,tol)*initgrad
 42 |  tolH = tolW
 43 | 
 44 |  for iter in range(1,maxiter):
 45 |   # stopping condition
 46 |   projnorm = norm(r_[gradW[logical_or(gradW<0, W>0)],
 47 |                                  gradH[logical_or(gradH<0, H>0)]])
 48 |   if projnorm < tol*initgrad or time() - initt > timelimit: break
 49 | 
 50 |   (W, gradW, iterW) = nlssubprob(V.T,H.T,W.T,tolW,1000)
 51 |   W = W.T
 52 |   gradW = gradW.T
 53 | 
 54 |   if iterW==1: tolW = 0.1 * tolW
 55 | 
 56 |   (H,gradH,iterH) = nlssubprob(V,W,H,tolH,1000)
 57 |   if iterH==1: tolH = 0.1 * tolH
 58 | 
 59 |   if iter % 2 == 0: stdout.write('.'); stdout.flush()
 60 | 
 61 |  print('')
 62 | 
 63 |  # print('Iter = %d Final proj-grad norm %f' % (iter, projnorm))
 64 | 
 65 |  time_end = time()
 66 |  print('NMF ended. %fs cost.' % (time_end - time_start))
 67 | 
 68 |  return (W,H)
 69 | 
 70 | 
 71 | def nlssubprob(V,W,Hinit,tol,maxiter):
 72 |  """
 73 |  H, grad: output solution and gradient
 74 |  iter: #iterations used
 75 |  V, W: constant matrices
 76 |  Hinit: initial solution
 77 |  tol: stopping tolerance
 78 |  maxiter: limit of iterations
 79 |  """
 80 | 
 81 |  H = Hinit
 82 |  WtV = dot(W.T, V)
 83 |  WtW = dot(W.T, W)
 84 | 
 85 |  alpha = 1; beta = 0.1;
 86 |  for iter in range(1, maxiter):
 87 |   grad = dot(WtW, H) - WtV
 88 |   projgrad = norm(grad[logical_or(grad < 0, H >0)])
 89 |   if projgrad < tol: break
 90 | 
 91 |   # search step size
 92 |   for inner_iter in range(1,20):
 93 |    Hn = H - alpha*grad
 94 |    Hn = where(Hn > 0, Hn, 0)
 95 |    d = Hn-H
 96 |    gradd = sum(grad * d)
 97 |    dQd = sum(dot(WtW,d) * d)
 98 |    suff_decr = 0.99*gradd + 0.5*dQd < 0;
 99 |    if inner_iter == 1:
100 |     decr_alpha = not suff_decr; Hp = H;
101 |    if decr_alpha:
102 |     if suff_decr:
103 |      H = Hn; break;
104 |     else:
105 |      alpha = alpha * beta;
106 |    else:
107 |       if not suff_decr or (Hp == Hn).all():
108 |        H = Hp; break;
109 |       else:
110 |        alpha = alpha/beta; Hp = Hn;
111 | 
112 |   if iter == maxiter:
113 |    print('Max iter in nlssubprob')
114 |  return (H, grad, iter)


--------------------------------------------------------------------------------
/nmf_test.py:
--------------------------------------------------------------------------------
 1 | #!python3
 2 | """
 3 | Created on 2015-12-12
 4 | @author: yuqiang
 5 | Test of module "nmf"
 6 | """
 7 | 
 8 | import numpy
 9 | import time
10 | import nmf
11 | 
# two toy inputs: a small rating matrix and a binary click matrix;
# only the click matrix is factorised below
rating_example = [
    [5, 3, 0, 1],
    [4, 0, 0, 1],
    [1, 1, 0, 5],
    [1, 0, 0, 4],
    [0, 1, 5, 4],
]
click_example = [
    [1, 1, 0, 1, 1],
    [1, 0, 0, 1, 0],
    [1, 1, 0, 1, 0],
    [1, 0, 0, 1, 0],
    [0, 1, 1, 1, 0],
]

V = numpy.array(click_example)
print("V = ")
print(V)

# factorise with K latent factors and time the call
K = 2
time_start = time.time()
W, H = nmf.nmf(V, K)
time_end = time.time()

# show both factors, their product and the elapsed time
estimatedV = numpy.dot(W, H)
for (label, matrix) in (("W = ", W), ("H = ", H), ("estimatedV = ", estimatedV)):
    print(label)
    print(matrix)
print(time_end - time_start)
43 | 


--------------------------------------------------------------------------------
/nmf_user_based_cf.py:
--------------------------------------------------------------------------------
  1 | #!python3
  2 | """
  3 | Created on 2015-12-13
  4 | @author: yuqiang
  5 | NMF-User-based Collaborative Filtering
  6 | """
  7 | 
  8 | import time
  9 | import numpy
 10 | import nmf
 11 | import data
 12 | import progress
 13 | 
 14 | """
 15 | # get user-news array
 16 | user_news_array, user_ids, news_ids = data.get_user_news_array()
 17 | user_num = len(user_ids)
 18 | news_num = len(news_ids)
 19 | 
 20 | # NMF
 21 | V = numpy.float16(user_news_array.T)
 22 | N = len(V)
 23 | M = len(V[0])
 24 | K = 15  # TODO refine this parameter
 25 | W_init = numpy.random.rand(N, K)
 26 | H_init = numpy.random.rand(K, M)
 27 | W, H = nmf.nmf(V, W_init, H_init)
 28 | del V, W_init, H_init, W, M, N
 29 | # estimatedV = numpy.dot(W, H)
 30 | 
 31 | # calculate similarity between users
 32 | print("Calculate similarity between users started.")
 33 | time_start = time.time()
 34 | user_user_similarities = numpy.zeros((user_num, user_num), numpy.float16)
 35 | H_norm = numpy.power(H, 2)
 36 | H_norm = H_norm.sum(0)
 37 | H_norm = numpy.sqrt(H_norm)  # norm of each column vector in H
 38 | H_norm = numpy.tile(H_norm, (K, 1))
 39 | H_normalized = H / H_norm
 40 | H_normalized_transpose = H_normalized.transpose()
 41 | computed_count = 0
 42 | compute_step = 1000  # to avoid MemoryError, only compute a part each time
 43 | while computed_count < user_num:
 44 |     compute_upper_limit = min((computed_count + compute_step, user_num))
 45 |     user_user_similarities[computed_count:compute_upper_limit, :] = numpy.dot(H_normalized_transpose[computed_count:compute_upper_limit, :], H_normalized)
 46 |     computed_count += compute_step
 47 | del H, H_norm, H_normalized, H_normalized_transpose, computed_count, compute_step
 48 | time_end = time.time()
 49 | print("Calculate similarity between users ended. %f s cost." % (time_end - time_start))
 50 | 
 51 | # find k nearest neighbors of users
 52 | print("Find k nearest neighbors of users started.")
 53 | time_start = time.time()
 54 | neighbor_size = 20  # TODO refine this parameter
 55 | user_neighbors_indexes = numpy.zeros((user_num, neighbor_size), numpy.int16)
 56 | inverse_indexes = range(user_num-2, user_num-neighbor_size-2, -1)  # choose the last k in the sorted list, remove the last one which is oneself
 57 | for i in range(user_num):
 58 |     sorted_indexes = numpy.argsort(user_user_similarities[i, :])
 59 |     user_neighbors_indexes[i, :] = sorted_indexes[inverse_indexes]
 60 |     if i % 100 == 0:
 61 |         print("%.1f%%" % (i / user_num * 100))
 62 | del inverse_indexes
 63 | time_end = time.time()
 64 | print("Find k nearest neighbors of users ended. %f s cost." % (time_end - time_start))
 65 | 
 66 | # predict ratings
 67 | print("Predict ratings started.")
 68 | time_start = time.time()
 69 | user_news_array = numpy.int8(user_news_array)  # int is faster than bool_
 70 | user_news_predict_array = numpy.zeros((user_num, news_num), numpy.float16)
 71 | eps = numpy.finfo(float).eps
 72 | for i in range(user_num):
 73 |     similarities_sum = 0.0
 74 |     for user_neighbors_index in user_neighbors_indexes[i]:
 75 |         user_news_predict_array[i] += user_news_array[user_neighbors_index] * user_user_similarities[user_neighbors_index, i]
 76 |         similarities_sum += user_user_similarities[user_neighbors_index, i]
 77 |     user_news_predict_array[i] /= (similarities_sum + eps)
 78 |     if i % 100 == 0:
 79 |         print("%.1f%%" % (i / user_num * 100))
 80 | user_news_predict_array[user_news_array == 1] = 0  # remove news one user has clicked
 81 | del eps
 82 | time_end = time.time()
 83 | print("Predict ratings ended. %f s cost." % (time_end - time_start))
 84 | 
 85 | # choose first k news to recommend to users
 86 | print("Choose first k news to recommend to users started.")
 87 | time_start = time.time()
 88 | recommend_size = 10  # TODO refine this parameter
 89 | user_news_recommend_indexes = numpy.zeros((user_num, recommend_size), numpy.int16)
 90 | inverse_indexes = range(news_num-1, news_num-recommend_size-1, -1)  # choose the last k in the sorted list
 91 | for i in range(user_num):
 92 |     sorted_indexes = numpy.argsort(user_news_predict_array[i, :])
 93 |     user_news_recommend_indexes[i, :] = sorted_indexes[inverse_indexes]
 94 |     if i % 100 == 0:
 95 |         print("%.1f%%" % (i / user_num * 100))
 96 | del inverse_indexes
 97 | time_end = time.time()
 98 | print("Choose first k news to recommend to users ended. %f s cost." % (time_end - time_start))
 99 | 
100 | print("")
101 | """
102 | 
103 | 
def train():
    """Train with the train set

    Load the train arrays from "user_news_array_of_train.npy" and "user_ids_of_train.npy",
    factorise the transposed user-news array with nmf.nmf and return the cosine similarity
    between the columns of the resulting H (one column per user).

    Returns:
        user_user_similarities(Type: numpy.ndarray of numpy.float16): The similarities between user and user(user_user_similarities[i, j] represents the similarity between user "i" and user "j")
        user_ids(Type: numpy.ndarray(vector)): user's ids, as loaded from "user_ids_of_train.npy", associated with the array
    """

    # get user-news array
    user_news_array = numpy.load("user_news_array_of_train.npy")
    user_ids = numpy.load("user_ids_of_train.npy")
    user_num = len(user_ids)

    # NMF of the news-user matrix; only H is used afterwards
    K = 15  # TODO refine this parameter
    factors = nmf.nmf(numpy.float16(user_news_array.T), K)
    H = factors[1]
    del factors

    # calculate similarity between users
    print("Calculate similarity between users started.")
    time_start = time.time()
    eps = numpy.finfo(float).eps  # keeps the division defined for all-zero columns
    column_norms = numpy.sqrt(numpy.power(H, 2).sum(0))  # norm of each column vector in H
    H_normalized = H / (column_norms + eps)  # the norms broadcast over the K rows
    H_normalized_transpose = H_normalized.transpose()
    user_user_similarities = numpy.zeros((user_num, user_num), numpy.float16)
    compute_step = 1000  # to avoid MemoryError, only compute a part each time
    for chunk_start in range(0, user_num, compute_step):
        chunk_stop = min((chunk_start + compute_step, user_num))
        user_user_similarities[chunk_start:chunk_stop, :] = numpy.dot(H_normalized_transpose[chunk_start:chunk_stop, :], H_normalized)
    time_end = time.time()
    print("Calculate similarity between users ended. %f s cost." % (time_end - time_start))

    print("[NMF-User-based Collaborative Filtering] Train finished!")

    return user_user_similarities, user_ids
153 | 
154 | 
def recommend(user_user_similarities, user_ids):
    """Recommend with the test set

    Recommend with the test set, and return the user_news_rating_predictions array.
    The train and test ids and the train array are loaded from the ".npy" files in the
    current directory. For every test user the prediction of a news is the similarity-weighted
    average of the train clicks of the user's nearest train neighbors; newses the user already
    clicked in the train set get prediction 0.

    Args:
        user_user_similarities(Type: numpy.ndarray): The similarities between user and user(user_user_similarities[i, j] represents the similarity between user "i" and user "j"), indexed like "user_ids_of_train.npy"
        user_ids(Type: numpy.ndarray(vector)): not used; the train user ids are loaded from "user_ids_of_train.npy"

    Returns:
        user_news_rating_predictions(Type: numpy.ndarray): The rating prediction of each user to each news(rating_prediction[i, j] represents the rating prediction of user "i" to news "j".)
        user_ids(Type: numpy.ndarray(vector)): user's ids of the test set associated with the array
        news_ids(Type: numpy.ndarray(vector)): news's ids of the test set associated with the array

    Raises:
        KeyError: if a test user id is missing from the train user ids, or a train news id is missing from the test news ids
    """

    # get user-news array (the test array itself is not needed here, so it is not loaded)
    user_news_array_of_train = numpy.load("user_news_array_of_train.npy")
    user_ids_of_train = numpy.load("user_ids_of_train.npy")
    news_ids_of_train = numpy.load("news_ids_of_train.npy")
    user_ids_of_test = numpy.load("user_ids_of_test.npy")
    news_ids_of_test = numpy.load("news_ids_of_test.npy")
    user_num_of_train = len(user_ids_of_train)
    user_num_of_test = len(user_ids_of_test)
    news_num_of_test = len(news_ids_of_test)

    # find k nearest neighbors of users
    print("Find k nearest neighbors of users started.")
    time_start = time.time()
    neighbor_size = max(0, min(20, user_num_of_train - 1))  # TODO refine this parameter
    # intp instead of int16: user indices above 32767 must not wrap around
    user_neighbors_indexes = numpy.zeros((user_num_of_test, neighbor_size), numpy.intp)
    user_ids_of_train_dict = {user_id: index for (index, user_id) in enumerate(user_ids_of_train)}
    # row of every test user in the train arrays
    train_index_of_test_user = numpy.array([user_ids_of_train_dict[user_id] for user_id in user_ids_of_test], dtype=numpy.intp)
    for i in range(user_num_of_test):
        own_index = train_index_of_test_user[i]
        sorted_indexes = numpy.argsort(-user_user_similarities[own_index, :])
        # drop oneself explicitly: with ties or an all-zero factor column oneself is not guaranteed to sort first
        sorted_indexes = sorted_indexes[sorted_indexes != own_index]
        user_neighbors_indexes[i, :] = sorted_indexes[:neighbor_size]  # choose the first k in the sorted list
        if i % 100 == 0:
            progress.update(i / user_num_of_test)
    progress.update(1)
    time_end = time.time()
    print("Find k nearest neighbors of users ended. %f s cost." % (time_end - time_start))

    # predict ratings
    print("Predict ratings started.")
    time_start = time.time()
    news_ids_of_test_dict = {news_id: index for (index, news_id) in enumerate(news_ids_of_test)}
    # col of every train news in the test arrays
    test_index_of_train_news = numpy.array([news_ids_of_test_dict[news_id] for news_id in news_ids_of_train], dtype=numpy.intp)
    user_news_array_of_train_expanded = numpy.zeros((user_num_of_train, news_num_of_test), numpy.int8)  # int is faster than bool_
    user_news_array_of_train_expanded[:, test_index_of_train_news] = user_news_array_of_train  # expand the column of user_news_array_of_train to the same size as the test array
    user_news_rating_predictions = numpy.zeros((user_num_of_test, news_num_of_test), numpy.float16)
    eps = numpy.finfo(float).eps  # keeps the division defined when all similarities are 0
    for i in range(user_num_of_test):
        this_user_index_in_train = train_index_of_test_user[i]
        similarities_sum = 0.0
        for user_neighbors_index in user_neighbors_indexes[i]:
            similarity = user_user_similarities[user_neighbors_index, this_user_index_in_train]
            user_news_rating_predictions[i] += user_news_array_of_train_expanded[user_neighbors_index] * similarity
            similarities_sum += similarity
        user_news_rating_predictions[i] /= (similarities_sum + eps)
        if i % 100 == 0:
            progress.update(i / user_num_of_test)
    progress.update(1)
    user_news_rating_predictions[user_news_array_of_train_expanded[train_index_of_test_user, :] == 1] = 0  # remove news one user has clicked
    time_end = time.time()
    print("Predict ratings ended. %f s cost." % (time_end - time_start))

    print("[NMF-User-based Collaborative Filtering] Recommend finished!")

    return user_news_rating_predictions, user_ids_of_test, news_ids_of_test
226 | 
227 | 
import scipy.io

# train on the train set, then predict ratings for the test users
user_user_similarities, user_ids_of_train = train()
user_news_rating_predictions, user_ids_of_test, news_ids_of_test = recommend(user_user_similarities, user_ids_of_train)

# store the predictions together with the test array in one ".mat" file
user_news_array_of_test = numpy.load("user_news_array_of_test.npy")
data_for_evaluation = {
    "user_news_rating_predictions": user_news_rating_predictions,
    "user_ids": user_ids_of_test,
    "news_ids": news_ids_of_test,
    "user_news_array_of_test": user_news_array_of_test,
}
scipy.io.savemat("data_for_evaluation.mat", data_for_evaluation)
237 | 


--------------------------------------------------------------------------------
/nmf_user_based_cf_evaluation.py:
--------------------------------------------------------------------------------
 1 | #!python3
 2 | """
 3 | Created on 2015-12-24
 4 | @author: yuqiang
 5 | Evaluation of NMF-User-based Collaborative Filtering
 6 | """
 7 | 
 8 | import numpy
 9 | import data
10 | 
# load the click array, the news texts and the stored recommendation indexes
user_news_array, user_ids, news_ids = data.get_user_news_array()
user_num = len(user_ids)
news_num = len(news_ids)
news_dict = data.get_news_dict()
user_news_recommend_indexes = numpy.load("user_news_recommend_indexes.npy")

# for every user collect the texts of the clicked and of the recommended newses
# (the lists are built but not used any further yet)
for i in range(user_num):
    clicked_mask = user_news_array[i] == 1
    clicked_news_ids = news_ids[clicked_mask]
    recommend_news_ids = news_ids[user_news_recommend_indexes[i]]
    clicked_news_contents = [news_dict[news_id] for news_id in clicked_news_ids]
    recommend_news_contents = [news_dict[news_id] for news_id in recommend_news_ids]

print("")
24 | 
25 | 


--------------------------------------------------------------------------------
/progress.py:
--------------------------------------------------------------------------------
 1 | #!python3
 2 | """
 3 | Created on 2015-12-27
 4 | @author: yuqiang
 5 | Tool for showing progress in console
 6 | """
 7 | 
 8 | import sys
 9 | 
10 | 
def update(percent, bar_length=20):
    """Update and show the progress in console

    Redraw a one-line text progress bar on stdout. Every call starts with a
    carriage return, so the bar is overwritten in place.
    Remember to call "update(1)" when all progress finished: only that call
    writes the closing newline!

    Args:
        percent(Type: float): fraction(0~1) of the part finished
        bar_length(Type: int): width of the bar in characters(default: 20)
    """

    # The chained comparison also rejects NaN, which would pass a
    # "percent < 0 or percent > 1" test and then crash inside int() below.
    if not (0 <= percent <= 1):
        print("Error input of progress.update()!")
        return

    # Snap away binary floating-point noise before truncating, so that e.g.
    # 0.29 * 100 == 28.999999999999996 is displayed as 29% instead of 28%.
    filled = int(round(percent * bar_length, 9))
    shown_percent = int(round(percent * 100, 9))

    bar = '#' * filled + ' ' * (bar_length - filled)
    sys.stdout.write("\rPercent: [%s] %d%%" % (bar, shown_percent))
    if percent == 1:
        # finished: move to the next line so later output does not overwrite the bar
        sys.stdout.write("\n")
    sys.stdout.flush()
32 | 


--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
  1 | #!python3
  2 | """
  3 | test
  4 | """
  5 | 
  6 | 
  7 | '''
  8 | a = [2, 3, 3, 1, 1, 4, 5]
  9 | b = [x for x in a if x < 3]
 10 | inds = [i for (i, val) in enumerate(a) if val < 3]
 11 | c = [(i, val) for (i, val) in enumerate(a)]
 12 | c.sort(key=lambda x: x[1])
 13 | '''
 14 | 
 15 | '''
 16 | import time
 17 | def linecount_1( ):
 18 |     return len(open("user_click_data.txt", "r", 1, "utf-8").readlines( ))
 19 | def linecount_2( ):
 20 |     count = -1
 21 |     for count, line in enumerate(open("user_click_data.txt", "r", 1, "utf-8")): pass
 22 |     return count+1
 23 | def linecount_3( ):
 24 |     count = 0
 25 |     thefile = open("user_click_data.txt", "r", 1, "utf-8")
 26 |     while True:
 27 |         buffer = thefile.read(65536)
 28 |         if not buffer: break
 29 |         count += buffer.count('\n')
 30 |     return count
 31 | time_start = time.time()
 32 | for i in list(range(10)):
 33 |     linecount_1()
 34 | time_end = time.time()
 35 | print("linecount_1: %f  count=%d" % (time_end - time_start, linecount_1()))
 36 | time_start = time.time()
 37 | for i in list(range(10)):
 38 |     linecount_2()
 39 | time_end = time.time()
 40 | print("linecount_2: %f  count=%d" % (time_end - time_start, linecount_2()))
 41 | time_start = time.time()
 42 | for i in list(range(10)):
 43 |     linecount_3()
 44 | time_end = time.time()
 45 | print("linecount_3: %f  count=%d" % (time_end - time_start, linecount_3()))
 46 | '''
 47 | 
 48 | """
 49 | #!/usr/bin/env python
 50 | # -*- coding: utf-8 -*-
 51 | import os
 52 | import codecs
 53 | import sys
 54 | # reload(sys)
 55 | # sys.setdefaultencoding('utf-8')
 56 | 
 57 | user_ids = []
 58 | news_ids = []
 59 | time = []
 60 | news_title = []
 61 | news_body = []
 62 | 
 63 | def get_filedata(filename):
 64 |     try:
 65 |         with open(filename, "r", 1, "utf-8") as f:   #with sentence open and close file automatically
 66 |             data = f.readline()
 67 |             print(data)
 68 |             #print data.split('\t')
 69 |             sp = data.split('\t')
 70 |             user_ids.append(sp[0])
 71 |             news_ids.append(sp[1])
 72 |             time.append(sp[2])
 73 |             news_title.append(sp[3])
 74 |             print(news_title[-1])
 75 |             news_body.append(sp[4])
 76 |             print(news_body[-1])
 77 |     except IOError as ioerr:
 78 |         print('File Error' + str(ioerr))    #print the error
 79 |         return None
 80 | 
 81 | get_filedata("user_click_data.txt")
 82 | """
 83 | 
 84 | """
 85 | import time
 86 | import sys
 87 | 
 88 | time_start = time.time()
 89 | for i in list(range(100)):
 90 |     print(".", end="")
 91 | time_end = time.time()
 92 | print(time_end - time_start)
 93 | 
 94 | time_start = time.time()
 95 | for i in list(range(100)):
 96 |     sys.stdout.write('.'); sys.stdout.flush()
 97 | time_end = time.time()
 98 | print(time_end - time_start)
 99 | """
100 | 
101 | """
102 | import numpy
103 | a = numpy.zeros((2, 2))
104 | a.dump("a.numpydumpedarray")
105 | """
106 | 
107 | """
108 | import numpy
109 | a = numpy.array([[1,2],[3,4],[2,4],[3,2],[2,4]])
110 | b = a.tolist()
111 | b_key = [x for [x, y] in b]
112 | b_key_dict = {value: index for (index, value) in enumerate(b_key)}
113 | unique_ids = b_key_dict.values()
114 | c = [b[index] for index in unique_ids]
115 | """
116 | 
117 | """
118 | # -*- coding:utf-8 -*-
119 | #!python3
120 | 
121 | import math
122 | import numpy as np
123 | import os.path
124 | import time
125 | 
126 | time_start = time.time()
127 | dim = 9
128 | def file_news_id(filename):
129 |     '''
130 |     filename:
131 |     '''
132 |     fr = open(filename)
133 |     train_news = []
134 | 
135 |     while True:
136 |         line = fr.readline()
137 |         if line:
138 |             p = line.split('\n')
139 |             train_news.append(int(p[0]))
140 |         else:
141 |             break
142 | 
143 | 
144 |     return train_news
145 | 
146 | def file_news_wordlist(filename):
147 |     '''
148 |     filename:
149 |     '''
150 |     fr = open(filename,'r', 1, "utf-8")
151 |     wordlist = []
152 | 
153 |     while True:
154 |         line = fr.readline()
155 |         if line:
156 |             p = line.split('\n')
157 |             wordlist.append(p[0])
158 |         else:
159 |             break
160 | 
161 | 
162 |     return wordlist
163 | 
164 | s = os.getcwd()
165 | train_news_id = file_news_id('train_id.txt')
166 | train_wordlist = file_news_wordlist('frequence_word_use.txt')
167 | time_end = time.time()
168 | print(time_end - time_start)                   #prepare labels return
169 | """
170 | 
171 | """
172 | import numpy
173 | eps = numpy.finfo(float).eps
174 | """
175 | 
176 | """
177 | import numpy
178 | a = numpy.array([[0,1,1,0], [1,1,0,1]])
179 | b = numpy.zeros((2, 4))
180 | b[a==1] = 1
181 | """
182 | 
183 | """
184 | import numpy
185 | a = numpy.array([[0,1,1,0], [1,1,0,1]])
186 | b = numpy.zeros((2, 4))
187 | c = a[0] * 0.5 + b[1]
188 | """
189 | 
190 | """
191 | import numpy
192 | a = numpy.array([0,2,3,2,1])
193 | index = numpy.argsort(a)
194 | """
195 | 
196 | """
197 | import numpy
198 | a = numpy.array([1,2,3,4,5])
199 | inverse_indexes = range(4, -1, -1)
200 | b = a[inverse_indexes]
201 | """
202 | 
203 | """
204 | import numpy
205 | a = numpy.array([1,2,3])
206 | b = numpy.array([4,5,6])
207 | c = numpy.row_stack((a, b))
208 | d = numpy.column_stack((a, b))
209 | """
210 | 
211 | """
212 | # calculate similarity between users
213 | print("Calculate similarity between users started.")
214 | time_start = time.time()
215 | user_user_similarities = numpy.zeros((user_num, user_num), numpy.float16)
216 | # progress_count = 0
217 | # for i in range(0, user_num):
218 | #     for j in range(i + 1, user_num):  # similarity to oneself is set to "0"
219 | #         similarity = H[:, i].dot(H[:, j])
220 | #         user_user_similarities[i, j] = similarity
221 | #         user_user_similarities[j, i] = similarity
222 | #         # progress_count += 1
223 | #         # if progress_count % 100000 == 0:
224 | #         #     print("%f %%" % (progress_count*2 / (user_num*(user_num-1)) * 100))
225 | #         #     # sys.stdout.write("\r%f %%\r" % (progress_count*2 / (user_num*(user_num-1)) * 100)); sys.stdout.flush()
226 | #     if i % 10 == 0:
227 | #         print("%f%%. %f s elapsed." % (i / user_num * 100, time.time() - time_start))
228 | H_transpose = H.transpose()
229 | computed_count = 0
230 | compute_step = 1000  # to avoid MemoryError, only compute a part each time
231 | while computed_count < user_num:
232 |     compute_upper_limit = min((computed_count + compute_step, user_num))
233 |     user_user_similarities[computed_count:compute_upper_limit, :] = numpy.dot(H_transpose[computed_count:compute_upper_limit, :], H)
234 |     computed_count += compute_step
235 | time_end = time.time()
236 | print("Calculate similarity between users ended. %f s cost." % (time_end - time_start))
237 | # numpy.save("user_user_similarities.npy", user_user_similarities)
238 | # user_user_similarities = numpy.load("user_user_similarities.npy")
239 | """
240 | 
241 | """
242 | import numpy
243 | a = numpy.eye(3, dtype=numpy.bool_)
244 | b = numpy.int16(a)
245 | c = a[0] * 3
246 | b[a == 1] = 2
247 | """
248 | 
249 | """
250 | import numpy
251 | import scipy.io
252 | user_news_recommend_indexes = numpy.load("user_news_recommend_indexes.npy")
253 | scipy.io.savemat("user_news_recommend_indexes.mat", {"user_news_recommend_indexes": user_news_recommend_indexes})
254 | # user_ids = numpy.load("user_ids.npy")
255 | # scipy.io.savemat("user_ids.mat", {"user_ids": user_ids})
256 | # news_ids = numpy.load("news_ids.npy")
257 | # scipy.io.savemat("news_ids.mat", {"news_ids": news_ids})
258 | """
259 | 
260 | """
261 | import numpy
262 | a = numpy.array([[1,2,3], [1], [1,2,3,4]])
263 | b = numpy.array([[1,2,3,5], [1,6,7,8], [1,2,3,4]])
264 | """
265 | 
266 | """
267 | import data
268 | # news_dict = data.get_news_dict()
269 | # user_clicked_news_dict = data.get_user_clicked_news_dict()
270 | user_news_array_for_train, user_ids_for_train, news_ids_for_train, user_news_array_for_test, user_ids_for_test, news_ids_for_test = data.get_user_news_arrays_of_train_and_test()
271 | """
272 | 
273 | """
274 | a = list(range(5)) + list(range(5))
275 | for b in a:
276 |     if b % 2 == 0:
277 |         a.remove(b)
278 | 
279 | title_cuts_without_stop_word = [word for word in title_cuts if word not in stopkeys]
280 | """
281 | 
282 | """
283 | import numpy
284 | a = numpy.array([[1,2], [3,4]])
285 | # a.tofile("a.bin")
286 | # a.tofile("a.bin", sep=" ")
287 | numpy.savetxt("a.txt", a, fmt='%d', delimiter='\t')
288 | b = numpy.loadtxt("a.txt", dtype=numpy.int16, delimiter='\t')
289 | a = numpy.array([[0.1,2.2], [3.3,4]])
290 | numpy.savetxt("a.txt", a, fmt='%.6f', delimiter='\t')
291 | b = numpy.loadtxt("a.txt", dtype=numpy.float32, delimiter='\t')
292 | """
293 | 
294 | """
295 | import data
296 | user_news_array_for_train, user_ids_for_train, news_ids_for_train, user_news_array_for_test, user_ids_for_test, news_ids_for_test = data.get_user_news_arrays_of_train_and_test(True, True)
297 | """
298 | 
299 | """
300 | import numpy
301 | a = numpy.array([[1, 2], [3, 4]])
302 | b = a.transpose()
303 | """
304 | 
305 | """
306 | import sys
307 | import time
308 | bar_length = 20
309 | for percent in range(0, 101):
310 |     hashes = '#' * int(percent/100.0 * bar_length)
311 |     spaces = ' ' * (bar_length - len(hashes))
312 |     sys.stdout.write("\rPercent: [%s] %d%%"%(hashes + spaces, percent))
313 |     sys.stdout.flush()
314 |     time.sleep(0.1)
315 | """
316 | 
# Only live statement of this scratch file (the snippets above are inert
# string literals): write a single empty line to stdout.
print()
318 | 


--------------------------------------------------------------------------------