# ------------------------------------------------------------------
# mf.py
# ------------------------------------------------------------------
"""Implicit matrix factorization (single-process version).

Alternating-least-squares factorization of a user x item confidence
matrix, following Hu, Koren and Volinsky, "Collaborative Filtering for
Implicit Feedback Datasets" (ICDM 2008).

The code runs unchanged on Python 2 and Python 3: ``print`` is always called
with a single pre-formatted string and ``range`` replaces ``xrange``.
"""
import time

import numpy as np
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve  # noqa: F401  (original module import, kept for importers)


def _as_dense(matrix):
    """Return ``matrix`` (scipy sparse or array-like) as a dense float ndarray."""
    if sparse.issparse(matrix):
        return matrix.toarray()
    return np.asarray(matrix, dtype=float)


def _canonical_csr(matrix):
    """Return ``matrix`` as CSR with one stored entry per cell.

    The input is never mutated: a copy is made before duplicates are summed,
    because ``csr_matrix(csr)`` may share its index/data buffers with the
    caller's matrix.
    """
    rows = sparse.csr_matrix(matrix)
    if not rows.has_canonical_format:
        rows = rows.copy()
        rows.sum_duplicates()
    return rows


def _solve_one(cols, weights, fixed, gram_reg):
    """Solve the regularized least-squares system for one user (or item).

    With Y = ``fixed``, c = the confidence row and p = its 0/1 indicator, the
    system is ``(Y^T Y + Y^T diag(c) Y + lambda I) x = Y^T (diag(c) + I) p``.
    Cells where c is zero contribute nothing to either side, so only the
    stored non-zero cells (``cols`` / ``weights``) are visited.

    ``gram_reg`` must already hold ``Y^T Y + lambda I``.
    """
    keep = weights != 0
    if not keep.all():
        # Explicitly stored zeros behave exactly like missing cells.
        cols, weights = cols[keep], weights[keep]
    chosen = fixed[cols]                                  # (k, num_factors)
    lhs = gram_reg + (chosen.T * weights).dot(chosen)     # + Y^T diag(c) Y
    rhs = chosen.T.dot(weights + 1.0)                     # Y^T (diag(c) + I) p
    return np.linalg.solve(lhs, rhs)


def load_matrix(filename, num_users, num_items):
    """Load a tab-separated ``user<TAB>item<TAB>count`` file as a CSR matrix.

    Every non-zero count is multiplied by
    ``alpha = (number of zero cells) / (sum of all loaded counts)``.

    Rows whose user id is outside ``[0, num_users)`` or whose item id is
    outside ``[0, num_items)`` are skipped, and so are blank lines.  When the
    same (user, item) pair occurs more than once, the last value is stored,
    but every occurrence still adds to the running total and decrements the
    zero-cell counter (as it always did).

    Args:
        filename: path of the text file to read.
        num_users: number of rows of the resulting matrix.
        num_items: number of columns of the resulting matrix.

    Returns:
        ``scipy.sparse.csr_matrix`` of shape ``(num_users, num_items)``.

    Raises:
        ZeroDivisionError: if the loaded counts sum to zero (for example an
            empty file), because ``alpha`` is then undefined.
        ValueError: if a non-blank line does not have exactly three
            tab-separated fields or a field is not numeric.
    """
    t0 = time.time()
    # Accumulate only the observed cells; a dense num_users x num_items array
    # does not fit in memory for realistically sized data sets.
    entries = {}
    total = 0.0
    num_zeros = num_users * num_items
    with open(filename, 'r') as handle:
        for i, line in enumerate(handle):
            line = line.strip()
            if not line:
                continue
            user, item, count = line.split('\t')
            user = int(user)
            item = int(item)
            count = float(count)
            if not 0 <= user < num_users:
                continue
            if not 0 <= item < num_items:
                continue
            if count != 0:
                entries[(user, item)] = count
                total += count
                num_zeros -= 1
            if i % 100000 == 0:
                print('loaded %i counts...' % i)
    alpha = num_zeros / total
    print('alpha %.2f' % alpha)
    # ``entries`` cannot be empty here: an empty dict implies total == 0.0,
    # which already raised ZeroDivisionError above.
    coords = np.array(list(entries.keys()), dtype=np.intp)
    values = np.array(list(entries.values()), dtype=float) * alpha
    counts = sparse.csr_matrix((values, (coords[:, 0], coords[:, 1])),
                               shape=(num_users, num_items))
    # alpha == 0 (no zero cells left) would otherwise leave explicit zeros.
    counts.eliminate_zeros()
    t1 = time.time()
    print('Finished loading matrix in %f seconds' % (t1 - t0))
    return counts


class ImplicitMF(object):
    """Alternating-least-squares implicit matrix factorization.

    After ``train_model()`` the factors are available as ``user_vectors``
    (``num_users x num_factors``) and ``item_vectors``
    (``num_items x num_factors``).
    """

    def __init__(self, counts, num_factors=40, num_iterations=30,
                 reg_param=0.8):
        """Store the confidence matrix and the hyper-parameters.

        Args:
            counts: scipy sparse matrix of shape (num_users, num_items), e.g.
                the value returned by ``load_matrix``.
            num_factors: dimensionality of the latent vectors.
            num_iterations: number of full user+item sweeps in ``train_model``.
            reg_param: L2 regularization weight (lambda).
        """
        self.counts = counts
        self.num_users = counts.shape[0]
        self.num_items = counts.shape[1]
        self.num_factors = num_factors
        self.num_iterations = num_iterations
        self.reg_param = reg_param

    def train_model(self):
        """Randomly initialize both factor matrices and run the ALS sweeps."""
        self.user_vectors = np.random.normal(size=(self.num_users,
                                                   self.num_factors))
        self.item_vectors = np.random.normal(size=(self.num_items,
                                                   self.num_factors))

        for i in range(self.num_iterations):
            t0 = time.time()
            print('Solving for user vectors...')
            self.user_vectors = self.iteration(True, self.item_vectors)
            print('Solving for item vectors...')
            self.item_vectors = self.iteration(False, self.user_vectors)
            t1 = time.time()
            print('iteration %i finished in %f seconds' % (i + 1, t1 - t0))

    def iteration(self, user, fixed_vecs):
        """Re-solve one side of the factorization with the other side fixed.

        Args:
            user: True to solve for user vectors (``fixed_vecs`` are the item
                vectors), False to solve for item vectors.
            fixed_vecs: the fixed factor matrix, dense or scipy sparse, with
                ``num_factors`` columns.

        Returns:
            Dense ndarray with one solved vector per row.

        Raises:
            numpy.linalg.LinAlgError: if a per-row system is singular, which
                the ``reg_param`` term normally prevents (it can happen with
                ``reg_param == 0``).
        """
        num_solve = self.num_users if user else self.num_items
        fixed = _as_dense(fixed_vecs)
        # Loop-invariant part of every left-hand side: Y^T Y + lambda I.
        gram_reg = fixed.T.dot(fixed) + self.reg_param * np.eye(self.num_factors)
        # Orient the counts so that row i is the vector being solved; this
        # replaces a per-item column slice of a CSR matrix with one transpose.
        rows = _canonical_csr(self.counts if user else self.counts.T)
        solve_vecs = np.zeros((num_solve, self.num_factors))

        t = time.time()
        for i in range(num_solve):
            start, stop = rows.indptr[i], rows.indptr[i + 1]
            solve_vecs[i] = _solve_one(rows.indices[start:stop],
                                       rows.data[start:stop],
                                       fixed, gram_reg)
            if i % 1000 == 0:
                print('Solved %i vecs in %d seconds' % (i, time.time() - t))
                t = time.time()

        return solve_vecs
# ------------------------------------------------------------------
# mf_threads.py
# ------------------------------------------------------------------
"""
Original code from Chris Johnson:
https://github.com/MrChrisJohnson/implicit-mf

Multithreading added by Thierry Bertin-Mahieux (2014)

Despite the "threads" naming, the work is spread over worker *processes*
(``multiprocessing.Process``) because of Python's GIL.  The code runs
unchanged on Python 2 and Python 3: ``print`` is always called with a single
pre-formatted string and ``range`` replaces ``xrange``.
"""

import copy  # noqa: F401  (original module import, kept for importers)
import time
from multiprocessing import Process, Queue

try:
    from queue import Empty
except ImportError:  # Python 2
    from Queue import Empty

import numpy as np
import scipy.linalg
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve  # noqa: F401  (original module import, kept for importers)


def _as_dense(matrix):
    """Return ``matrix`` (scipy sparse or array-like) as a dense float ndarray."""
    if sparse.issparse(matrix):
        return matrix.toarray()
    return np.asarray(matrix, dtype=float)


def _canonical_csr(matrix):
    """Return ``matrix`` as CSR with one stored entry per cell.

    The input is never mutated: a copy is made before duplicates are summed,
    because ``csr_matrix(csr)`` may share its index/data buffers with the
    caller's matrix.
    """
    rows = sparse.csr_matrix(matrix)
    if not rows.has_canonical_format:
        rows = rows.copy()
        rows.sum_duplicates()
    return rows


def _solve_one(cols, weights, fixed, gram_reg):
    """Solve the regularized least-squares system for one user (or item).

    With Y = ``fixed``, c = the confidence row and p = its 0/1 indicator, the
    system is ``(Y^T Y + Y^T diag(c) Y + lambda I) x = Y^T (diag(c) + I) p``.
    Cells where c is zero contribute nothing to either side, so only the
    stored non-zero cells (``cols`` / ``weights``) are visited.

    ``gram_reg`` must already hold ``Y^T Y + lambda I``.
    """
    keep = weights != 0
    if not keep.all():
        # Explicitly stored zeros behave exactly like missing cells.
        cols, weights = cols[keep], weights[keep]
    chosen = fixed[cols]                                  # (k, num_factors)
    lhs = gram_reg + (chosen.T * weights).dot(chosen)     # + Y^T diag(c) Y
    rhs = chosen.T.dot(weights + 1.0)                     # Y^T (diag(c) + I) p
    return np.linalg.solve(lhs, rhs)


def load_matrix(filename, num_users, num_items):
    """Load a tab-separated ``user<TAB>item<TAB>count`` file as a CSR matrix.

    Every non-zero count is multiplied by
    ``alpha = (number of zero cells) / (sum of all loaded counts)``.

    Rows whose user id is outside ``[0, num_users)`` or whose item id is
    outside ``[0, num_items)`` are skipped, and so are blank lines.  When the
    same (user, item) pair occurs more than once, the last value is stored,
    but every occurrence still adds to the running total and decrements the
    zero-cell counter (as it always did).

    Args:
        filename: path of the text file to read.
        num_users: number of rows of the resulting matrix.
        num_items: number of columns of the resulting matrix.

    Returns:
        ``scipy.sparse.csr_matrix`` of shape ``(num_users, num_items)``.

    Raises:
        ZeroDivisionError: if the loaded counts sum to zero (for example an
            empty file), because ``alpha`` is then undefined.
        ValueError: if a non-blank line does not have exactly three
            tab-separated fields or a field is not numeric.
    """
    t0 = time.time()
    # Accumulate only the observed cells; a dense num_users x num_items array
    # does not fit in memory for realistically sized data sets.
    entries = {}
    total = 0.0
    num_zeros = num_users * num_items
    with open(filename, 'r') as handle:
        for i, line in enumerate(handle):
            line = line.strip()
            if not line:
                continue
            user, item, count = line.split('\t')
            user = int(user)
            item = int(item)
            count = float(count)
            if not 0 <= user < num_users:
                continue
            if not 0 <= item < num_items:
                continue
            if count != 0:
                entries[(user, item)] = count
                total += count
                num_zeros -= 1
            if i % 100000 == 0:
                print('loaded %i counts...' % i)
    alpha = num_zeros / total
    print('alpha %.2f' % alpha)
    # ``entries`` cannot be empty here: an empty dict implies total == 0.0,
    # which already raised ZeroDivisionError above.
    coords = np.array(list(entries.keys()), dtype=np.intp)
    values = np.array(list(entries.values()), dtype=float) * alpha
    counts = sparse.csr_matrix((values, (coords[:, 0], coords[:, 1])),
                               shape=(num_users, num_items))
    # alpha == 0 (no zero cells left) would otherwise leave explicit zeros.
    counts.eliminate_zeros()
    t1 = time.time()
    print('Finished loading matrix in %f seconds' % (t1 - t0))
    return counts


class ImplicitMF(object):
    """Alternating-least-squares implicit matrix factorization, solved in
    parallel by ``num_threads`` worker processes.

    After ``train_model()`` the factors are available as ``user_vectors``
    (``num_users x num_factors``) and ``item_vectors``
    (``num_items x num_factors``).
    """

    def __init__(self, counts, num_factors=40, num_iterations=30,
                 reg_param=0.8, num_threads=1):
        """Store the confidence matrix and the hyper-parameters.

        Args:
            counts: scipy sparse matrix of shape (num_users, num_items), e.g.
                the value returned by ``load_matrix``.
            num_factors: dimensionality of the latent vectors.
            num_iterations: number of full user+item sweeps in ``train_model``.
            reg_param: L2 regularization weight (lambda).
            num_threads: number of worker processes per half-sweep; values
                below 1 are treated as 1 when the work is split.
        """
        self.counts = counts
        self.num_users = counts.shape[0]
        self.num_items = counts.shape[1]
        self.num_factors = num_factors
        self.num_iterations = num_iterations
        self.reg_param = reg_param
        self.num_threads = num_threads

    def train_model(self):
        """Randomly initialize both factor matrices and run the ALS sweeps,
        printing the timing and the change in the factors after each sweep."""
        self.user_vectors = np.random.normal(size=(self.num_users,
                                                   self.num_factors))
        self.item_vectors = np.random.normal(size=(self.num_items,
                                                   self.num_factors))

        for i in range(self.num_iterations):
            t0 = time.time()

            # ``iteration`` returns fresh arrays and never writes into the
            # previous ones, so plain references are enough to remember them.
            user_vectors_old = self.user_vectors
            item_vectors_old = self.item_vectors

            print('Solving for user vectors...')
            self.user_vectors = self.iteration(True, self.item_vectors)
            print('Solving for item vectors...')
            self.item_vectors = self.iteration(False, self.user_vectors)
            t1 = time.time()
            print('iteration %i finished in %f seconds' % (i + 1, t1 - t0))
            norm_diff = (scipy.linalg.norm(user_vectors_old - self.user_vectors) +
                         scipy.linalg.norm(item_vectors_old - self.item_vectors))
            print('norm difference: %s' % norm_diff)

    def iteration(self, user, fixed_vecs):
        """Re-solve one side of the factorization with the other side fixed,
        splitting the rows into contiguous batches, one worker process each.

        Args:
            user: True to solve for user vectors (``fixed_vecs`` are the item
                vectors), False to solve for item vectors.
            fixed_vecs: the fixed factor matrix, dense or scipy sparse, with
                ``num_factors`` columns.

        Returns:
            Dense ndarray with one solved vector per row.

        Raises:
            RuntimeError: if the workers exited without delivering one result
                per row (for example because a worker crashed).
        """
        num_solve = self.num_users if user else self.num_items
        fixed = _as_dense(fixed_vecs)
        num_fixed = fixed.shape[0]
        YTY = fixed.T.dot(fixed)
        eye = sparse.eye(num_fixed)
        lambda_eye = self.reg_param * np.eye(self.num_factors)
        solve_vecs = np.zeros((num_solve, self.num_factors))

        # A worker count below 1 would divide by zero or produce a negative
        # batch size (and then an endless stream of processes).
        num_workers = max(1, int(self.num_threads))
        batch_size = int(np.ceil(num_solve * 1. / num_workers))
        print('batch_size per thread is: %d' % batch_size)
        processes = []
        done_queue = Queue()
        # batch_size is 0 only when there is nothing to solve; max() keeps the
        # range step legal in that case (the range is then empty anyway).
        for min_i in range(0, num_solve, max(batch_size, 1)):
            max_i = min(min_i + batch_size, num_solve)
            p = Process(target=self.iteration_one_vec,
                        args=(user, YTY, eye, lambda_eye, fixed, min_i, max_i, done_queue))
            p.start()
            processes.append(p)

        # Drain results while the workers run: a worker cannot exit before its
        # queued results have been flushed, so joining first could deadlock.
        cnt_vecs = 0
        while cnt_vecs < num_solve:
            try:
                i, xu = done_queue.get(timeout=.1)
            except Empty:
                if any(p.is_alive() for p in processes) or not done_queue.empty():
                    continue
                break  # every worker is gone and nothing is left to read
            solve_vecs[i] = xu
            cnt_vecs += 1

        done_queue.close()
        for p in processes:
            p.join()

        # Explicit check instead of ``assert``, which is stripped under -O.
        if cnt_vecs != num_solve:
            raise RuntimeError('workers returned %d of %d vectors'
                               % (cnt_vecs, num_solve))

        print('All processes completed.')
        return solve_vecs

    def iteration_one_vec(self, user, YTY, eye, lambda_eye, fixed_vecs, min_i, max_i, output):
        """Worker body: solve rows ``min_i`` .. ``max_i - 1`` and push each
        result to ``output`` as ``(row_index, list_of_floats)``.

        Args:
            user: True when solving user vectors, False for item vectors.
            YTY: ``fixed_vecs^T fixed_vecs`` (dense or sparse).
            eye: unused; kept so the signature stays backward-compatible.
            lambda_eye: ``reg_param * I`` (dense or sparse).
            fixed_vecs: the fixed factor matrix (dense or sparse).
            min_i: first row index handled by this worker (inclusive).
            max_i: last row index handled by this worker (exclusive).
            output: object with ``put(item)`` and ``close()``, normally a
                ``multiprocessing.Queue``.
        """
        t = time.time()
        cnt = 0
        num_rows = max(0, max_i - min_i)
        if num_rows:
            fixed = _as_dense(fixed_vecs)
            gram_reg = _as_dense(YTY) + _as_dense(lambda_eye)
            # Slice this worker's batch once and orient it so that each row is
            # one vector to solve, instead of slicing a row/column per vector.
            if user:
                batch = _canonical_csr(self.counts[min_i:max_i])
            else:
                batch = _canonical_csr(self.counts[:, min_i:max_i].T)
            for offset in range(num_rows):
                start, stop = batch.indptr[offset], batch.indptr[offset + 1]
                xu = _solve_one(batch.indices[start:stop],
                                batch.data[start:stop],
                                fixed, gram_reg)
                output.put((min_i + offset, xu.tolist()))
                cnt += 1
                if cnt % 1000 == 0:
                    print('Solved %d vecs in %d seconds (one thread)' % (cnt, time.time() - t))
        output.close()
        print('Process done.')