├── README.md
├── mf.py
└── mf_threads.py


/README.md:
--------------------------------------------------------------------------------
 1 | implicitMF
 2 | ==========
 3 | 
 4 | Python implementation of implicit matrix factorization as outlined in http://labs.yahoo.com/files/HuKorenVolinsky-ICDM08.pdf.
 5 | 
 6 | Requires numpy version 1.7.1 or greater and scipy version 0.12.0 or greater.
 7 | 
This fork adds a multithreaded version, `mf_threads.py`, that speeds things up a little bit.
 9 | 
10 | Note: because of Python's GIL, we actually use processes, not threads.
11 | 


--------------------------------------------------------------------------------
/mf.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import scipy.sparse as sparse
 3 | from scipy.sparse.linalg import spsolve
 4 | import time
 5 | 
 6 | def load_matrix(filename, num_users, num_items):
 7 |     t0 = time.time()
 8 |     counts = np.zeros((num_users, num_items))
 9 |     total = 0.0
10 |     num_zeros = num_users * num_items
11 |     for i, line in enumerate(open(filename, 'r')):
12 |         user, item, count = line.strip().split('\t')
13 |         user = int(user)
14 |         item = int(item)
15 |         count = float(count)
16 |         if user >= num_users:
17 |             continue
18 |         if item >= num_items:
19 |             continue
20 |         if count != 0:
21 |             counts[user, item] = count
22 |             total += count
23 |             num_zeros -= 1
24 |         if i % 100000 == 0:
25 |             print 'loaded %i counts...' % i
26 |     alpha = num_zeros / total
27 |     print 'alpha %.2f' % alpha
28 |     counts *= alpha
29 |     counts = sparse.csr_matrix(counts)
30 |     t1 = time.time()
31 |     print 'Finished loading matrix in %f seconds' % (t1 - t0)
32 |     return counts
33 | 
34 | 
class ImplicitMF(object):
    """Implicit-feedback matrix factorization trained by alternating solves.

    ``counts`` is a sparse (num_users x num_items) matrix.  A non-zero entry
    ``c`` marks an observed (user, item) pair with preference 1 and
    confidence ``1 + c``; every other pair has preference 0 and confidence 1.
    After ``train_model`` the learned factors are available as
    ``user_vectors`` (num_users x num_factors) and ``item_vectors``
    (num_items x num_factors).
    """

    def __init__(self, counts, num_factors=40, num_iterations=30,
                 reg_param=0.8):
        """Store the training data and hyper-parameters.

        Args:
            counts: sparse matrix of shape (num_users, num_items).
            num_factors: length of every user / item vector.
            num_iterations: number of alternating sweeps run by
                ``train_model``.
            reg_param: weight of the identity term added to every
                least-squares system.
        """
        self.counts = counts
        self.num_users = counts.shape[0]
        self.num_items = counts.shape[1]
        self.num_factors = num_factors
        self.num_iterations = num_iterations
        self.reg_param = reg_param

    def train_model(self):
        """Randomly initialise both factor matrices, then alternate solves.

        Each sweep first re-solves all user vectors with the item vectors
        held fixed, then all item vectors with the new user vectors fixed.
        """
        self.user_vectors = np.random.normal(size=(self.num_users,
                                                   self.num_factors))
        self.item_vectors = np.random.normal(size=(self.num_items,
                                                   self.num_factors))

        for i in range(self.num_iterations):
            t0 = time.time()
            print('Solving for user vectors...')
            self.user_vectors = self.iteration(True, self.item_vectors)
            print('Solving for item vectors...')
            self.item_vectors = self.iteration(False, self.user_vectors)
            t1 = time.time()
            print('iteration %i finished in %f seconds' % (i + 1, t1 - t0))

    def iteration(self, user, fixed_vecs):
        """Solve for one side's vectors while the other side is held fixed.

        For every row ``u`` being solved this computes

            x_u = (Y^T Y + Y^T diag(c_u) Y + reg_param * I)^-1
                  Y^T (I + diag(c_u)) p_u

        where ``Y`` is ``fixed_vecs``, ``c_u`` the counts of that row and
        ``p_u`` the 0/1 indicator of ``c_u != 0``.  Only the non-zero entries
        of ``c_u`` contribute to the ``diag(c_u)`` and ``p_u`` terms, so each
        system is assembled from just those rows of ``Y``.

        Args:
            user: True to solve for user vectors (rows of ``counts``),
                False to solve for item vectors (columns of ``counts``).
            fixed_vecs: the factors held fixed, as a dense array or a scipy
                sparse matrix with one vector per row.

        Returns:
            A dense array of shape (num_solve, num_factors).

        Raises:
            numpy.linalg.LinAlgError: if a system is singular (possible, for
                instance, when ``reg_param`` is 0).
        """
        num_solve = self.num_users if user else self.num_items
        # Sparse input is still accepted for callers that pre-convert.
        if sparse.issparse(fixed_vecs):
            fixed_vecs = fixed_vecs.toarray()
        fixed = np.asarray(fixed_vecs, dtype=float)

        # Term shared by every system of this half-sweep.
        base = fixed.T.dot(fixed) + self.reg_param * np.eye(self.num_factors)

        # One CSR row per vector to solve.  Work on a private copy so that
        # merging duplicate entries never touches the caller's matrix.
        by_row = sparse.csr_matrix(self.counts if user else self.counts.T,
                                   copy=True)
        by_row.sum_duplicates()
        indptr, indices, data = by_row.indptr, by_row.indices, by_row.data

        solve_vecs = np.zeros((num_solve, self.num_factors))

        t = time.time()
        for i in range(num_solve):
            lo, hi = indptr[i], indptr[i + 1]
            conf = data[lo:hi]
            cols = indices[lo:hi]
            # Explicitly stored zeros are not observations.
            observed = conf != 0
            if not observed.all():
                conf = conf[observed]
                cols = cols[observed]
            seen = fixed[cols]
            lhs = base + (seen.T * conf).dot(seen)
            rhs = seen.T.dot(conf + 1.0)
            solve_vecs[i] = np.linalg.solve(lhs, rhs)
            if i % 1000 == 0:
                print('Solved %i vecs in %d seconds' % (i, time.time() - t))
                t = time.time()

        return solve_vecs
87 | 


--------------------------------------------------------------------------------
/mf_threads.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Original code from Chris Johnson:
  3 | https://github.com/MrChrisJohnson/implicit-mf
  4 | 
  5 | Multithreading added by Thierry Bertin-Mahieux (2014)
  6 | """
  7 | 
  8 | import copy
  9 | import numpy as np
 10 | import scipy.sparse as sparse
 11 | import scipy.linalg
 12 | from scipy.sparse.linalg import spsolve
 13 | from multiprocessing import Process, Queue
 14 | import time
 15 | 
 16 | def load_matrix(filename, num_users, num_items):
 17 |     t0 = time.time()
 18 |     counts = np.zeros((num_users, num_items))
 19 |     total = 0.0
 20 |     num_zeros = num_users * num_items
 21 |     for i, line in enumerate(open(filename, 'r')):
 22 |         user, item, count = line.strip().split('\t')
 23 |         user = int(user)
 24 |         item = int(item)
 25 |         count = float(count)
 26 |         if user >= num_users:
 27 |             continue
 28 |         if item >= num_items:
 29 |             continue
 30 |         if count != 0:
 31 |             counts[user, item] = count
 32 |             total += count
 33 |             num_zeros -= 1
 34 |         if i % 100000 == 0:
 35 |             print 'loaded %i counts...' % i
 36 |     alpha = num_zeros / total
 37 |     print 'alpha %.2f' % alpha
 38 |     counts *= alpha
 39 |     counts = sparse.csr_matrix(counts)
 40 |     t1 = time.time()
 41 |     print 'Finished loading matrix in %f seconds' % (t1 - t0)
 42 |     return counts
 43 | 
 44 | 
 45 | class ImplicitMF():
 46 | 
 47 |     def __init__(self, counts, num_factors=40, num_iterations=30,
 48 |                  reg_param=0.8, num_threads=1):
 49 |         self.counts = counts
 50 |         self.num_users = counts.shape[0]
 51 |         self.num_items = counts.shape[1]
 52 |         self.num_factors = num_factors
 53 |         self.num_iterations = num_iterations
 54 |         self.reg_param = reg_param
 55 |         self.num_threads = num_threads
 56 | 
 57 |     def train_model(self):
 58 |         self.user_vectors = np.random.normal(size=(self.num_users,
 59 |                                                    self.num_factors))
 60 |         self.item_vectors = np.random.normal(size=(self.num_items,
 61 |                                                    self.num_factors))
 62 | 
 63 |         for i in xrange(self.num_iterations):
 64 |             t0 = time.time()
 65 | 
 66 |             user_vectors_old = copy.deepcopy(self.user_vectors)
 67 |             item_vectors_old = copy.deepcopy(self.item_vectors)
 68 | 
 69 |             print 'Solving for user vectors...'
 70 |             self.user_vectors = self.iteration(True, sparse.csr_matrix(self.item_vectors))
 71 |             print 'Solving for item vectors...'
 72 |             self.item_vectors = self.iteration(False, sparse.csr_matrix(self.user_vectors))
 73 |             t1 = time.time()
 74 |             print 'iteration %i finished in %f seconds' % (i + 1, t1 - t0)
 75 |             norm_diff = scipy.linalg.norm(user_vectors_old - self.user_vectors) + scipy.linalg.norm(item_vectors_old - self.item_vectors)
 76 |             print 'norm difference:', norm_diff
 77 | 
 78 |     def iteration(self, user, fixed_vecs):
 79 |         num_solve = self.num_users if user else self.num_items
 80 |         num_fixed = fixed_vecs.shape[0]
 81 |         YTY = fixed_vecs.T.dot(fixed_vecs)
 82 |         eye = sparse.eye(num_fixed)
 83 |         lambda_eye = self.reg_param * sparse.eye(self.num_factors)
 84 |         solve_vecs = np.zeros((num_solve, self.num_factors))
 85 | 
 86 |         batch_size = int(np.ceil(num_solve * 1. / self.num_threads))
 87 |         print 'batch_size per thread is: %d' % batch_size
 88 |         idx = 0
 89 |         processes = []
 90 |         done_queue = Queue()
 91 |         while idx < num_solve:
 92 |             min_i = idx
 93 |             max_i = min(idx + batch_size, num_solve)
 94 |             p = Process(target=self.iteration_one_vec,
 95 |                         args=(user, YTY, eye, lambda_eye, fixed_vecs, min_i, max_i, done_queue))
 96 |             p.start()
 97 |             processes.append(p)
 98 |             idx += batch_size
 99 | 
100 |         cnt_vecs = 0
101 |         while True:
102 |             is_alive = False
103 |             for p in processes:
104 |                 if p.is_alive():
105 |                     is_alive = True
106 |                     break
107 |             if not is_alive and done_queue.empty():
108 |                 break
109 |             time.sleep(.1)
110 |             while not done_queue.empty():
111 |                 res = done_queue.get()
112 |                 i, xu = res
113 |                 solve_vecs[i] = xu
114 |                 cnt_vecs += 1
115 |         assert cnt_vecs == len(solve_vecs)
116 | 
117 |         done_queue.close()
118 |         for p in processes:
119 |             p.join()
120 | 
121 |         print 'All processes completed.'
122 |         return solve_vecs
123 | 
124 |     def iteration_one_vec(self, user, YTY, eye, lambda_eye, fixed_vecs, min_i, max_i, output):
125 |         t = time.time()
126 |         cnt = 0
127 |         for i in xrange(min_i, max_i):
128 |             if user:
129 |                 counts_i = self.counts[i].toarray()
130 |             else:
131 |                 counts_i = self.counts[:, i].T.toarray()
132 |             CuI = sparse.diags(counts_i, [0])
133 |             pu = counts_i.copy()
134 |             pu[np.where(pu != 0)] = 1.0
135 |             YTCuIY = fixed_vecs.T.dot(CuI).dot(fixed_vecs)
136 |             YTCupu = fixed_vecs.T.dot(CuI + eye).dot(sparse.csr_matrix(pu).T)
137 |             xu = spsolve(YTY + YTCuIY + lambda_eye, YTCupu)
138 |             output.put((i, list(xu)))
139 |             cnt += 1
140 |             if cnt % 1000 == 0:
141 |                 print 'Solved %d vecs in %d seconds (one thread)' % (cnt, time.time() - t)
142 |         output.close()
143 |         print 'Process done.'
144 | 


--------------------------------------------------------------------------------