├── .gitignore
├── README.md
├── bdoopr.py
├── bpr.py
├── bprslim.py
└── presampled_bpr.py

/.gitignore:
--------------------------------------------------------------------------------
*.py[cod]

# C extensions
*.so

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg
lib
lib64

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox
nosetests.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
bpr
===

Bayesian Personalized Ranking is a learning algorithm for collaborative filtering
first introduced in:

BPR: Bayesian Personalized Ranking from Implicit Feedback.
Steffen Rendle, Christoph Freudenthaler, Zeno Gantner and Lars Schmidt-Thieme, Proc. UAI 2009.

This is a simple Python implementation of BPR for matrix factorization, based on the
C# implementation provided by the original authors in MyMediaLite, see http://www.mymedialite.net.
I've also implemented a BPR version of:

SLIM: Sparse Linear Methods for Top-N Recommender Systems.
Xia Ning and George Karypis, Proc. IEEE ICDM 2011.

--------------------------------------------------------------------------------
/bdoopr.py:
--------------------------------------------------------------------------------
"""
Map-reduce algorithm to create a schedule of BPR samples.

The probability of emitting a candidate positive item
in the first mapper is designed to give a uniform
probability of any item in the dataset being output
as the positive item in the final list of triples.
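(A user with many items is more likely to have a candidate triple
rejected in the second pass, because a randomly proposed negative
item is more likely to be one of their positives, so the first mapper
accepts positive proposals from such users with a correspondingly
higher probability.)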
8 | """ 9 | 10 | import random 11 | 12 | J_IS_POSITIVE = '-' 13 | 14 | class Mapper1: 15 | 16 | def __init__(self,user_item_counts,oversampling=1): 17 | self.N = sum(user_item_counts.values()) # number of non-zeros 18 | self.user_item_counts = user_item_counts 19 | self.max_item_count = max(user_item_counts.values()) 20 | self.oversampling = oversampling 21 | 22 | def sample_positive(self,user): 23 | alpha = float(self.N-self.max_item_count)/(self.N-self.user_item_counts[user]) 24 | return random.uniform(0,1) < alpha 25 | 26 | def rand_idx(self): 27 | return random.randint(0,self.N*self.oversampling) 28 | 29 | def __call__(self,user,item): 30 | # send candidate items to random indices 31 | for _ in xrange(self.oversampling): 32 | if self.sample_positive(user): 33 | # propose a candidate positive item 34 | yield self.rand_idx(),(user,item,'+') 35 | # propose a candidate negative item 36 | yield self.rand_idx(),(user,item,'-') 37 | 38 | def reducer1(idx,values): 39 | # sample a positive and negative item uniformly to make a candidate triple 40 | seen = {'+':[],'-':[]} 41 | for user,item,c in values: 42 | seen[c].append((user,item)) 43 | if seen['+'] and seen['-']: 44 | # we've got at least one postive and one negative item, now pick one 45 | pos = random.choice(seen['+']) 46 | neg = random.choice(seen['-']) 47 | yield (pos[0],neg[1]),pos[1] # candidate triple as (u,j),i 48 | 49 | def mapper2(user,item): 50 | # map the data again with an indicator value 51 | # to help us spot negative items in candidate triples 52 | yield (int(user),int(item)),J_IS_POSITIVE 53 | 54 | def reducer2(key,values): 55 | user,j = key 56 | values = list(values) 57 | # check the positive items 58 | ii = set(i for i in values if i != J_IS_POSITIVE) 59 | if len(ii) == len(values): 60 | # j really is a negative item for u 61 | for i in ii: 62 | yield user,(i,j) 63 | 64 | if __name__ == '__main__': 65 | 66 | import toydoop 67 | 68 | # create some user-item data 69 | data = { 70 | 1:[10,20,30,40,50,60,70,80,90], 71 | 2:[10,30,110,120,130,140,150], 72 | 3:[20,30,40,90,120,160,170,180,190] 73 | } 74 | user_item_counts = dict((k,len(v)) for k,v in data.iteritems()) 75 | 76 | datafile = 'bdoopr.in' 77 | mapout1 = 'bdoopr.map1' 78 | mapout2 = 'bdoopr.map2' 79 | outfile = 'bdoopr.out' 80 | 81 | f = open(datafile,'w') 82 | for user,items in data.iteritems(): 83 | for item in items: 84 | print >>f,toydoop.default_formatter(user,item) 85 | f.close() 86 | 87 | # run two stages of mapreduce 88 | mapper1 = Mapper1(user_item_counts,oversampling=10) 89 | toydoop.mapreduce(datafile,mapout1,mapper=mapper1,reducer=reducer1) 90 | toydoop.mapreduce(datafile,mapout2,mapper=mapper2) # map the data again 91 | toydoop.mapreduce([mapout1,mapout2],outfile,reducer=reducer2) 92 | -------------------------------------------------------------------------------- /bpr.py: -------------------------------------------------------------------------------- 1 | """ 2 | Bayesian Personalized Ranking 3 | 4 | Matrix Factorization model and a variety of classes 5 | implementing different sampling strategies. 
6 | """ 7 | 8 | import numpy as np 9 | from math import exp 10 | import random 11 | 12 | class BPRArgs(object): 13 | 14 | def __init__(self,learning_rate=0.05, 15 | bias_regularization=1.0, 16 | user_regularization=0.0025, 17 | positive_item_regularization=0.0025, 18 | negative_item_regularization=0.00025, 19 | update_negative_item_factors=True): 20 | self.learning_rate = learning_rate 21 | self.bias_regularization = bias_regularization 22 | self.user_regularization = user_regularization 23 | self.positive_item_regularization = positive_item_regularization 24 | self.negative_item_regularization = negative_item_regularization 25 | self.update_negative_item_factors = update_negative_item_factors 26 | 27 | class BPR(object): 28 | 29 | def __init__(self,D,args): 30 | """initialise BPR matrix factorization model 31 | D: number of factors 32 | """ 33 | self.D = D 34 | self.learning_rate = args.learning_rate 35 | self.bias_regularization = args.bias_regularization 36 | self.user_regularization = args.user_regularization 37 | self.positive_item_regularization = args.positive_item_regularization 38 | self.negative_item_regularization = args.negative_item_regularization 39 | self.update_negative_item_factors = args.update_negative_item_factors 40 | 41 | def train(self,data,sampler,num_iters): 42 | """train model 43 | data: user-item matrix as a scipy sparse matrix 44 | users and items are zero-indexed 45 | """ 46 | self.init(data) 47 | 48 | print 'initial loss = {0}'.format(self.loss()) 49 | for it in xrange(num_iters): 50 | print 'starting iteration {0}'.format(it) 51 | for u,i,j in sampler.generate_samples(self.data): 52 | self.update_factors(u,i,j) 53 | print 'iteration {0}: loss = {1}'.format(it,self.loss()) 54 | 55 | def init(self,data): 56 | self.data = data 57 | self.num_users,self.num_items = self.data.shape 58 | 59 | self.item_bias = np.zeros(self.num_items) 60 | self.user_factors = np.random.random_sample((self.num_users,self.D)) 61 | self.item_factors = np.random.random_sample((self.num_items,self.D)) 62 | 63 | self.create_loss_samples() 64 | 65 | def create_loss_samples(self): 66 | # apply rule of thumb to decide num samples over which to compute loss 67 | num_loss_samples = int(100*self.num_users**0.5) 68 | 69 | print 'sampling {0} triples...'.format(num_loss_samples) 70 | sampler = UniformUserUniformItem(True) 71 | self.loss_samples = [t for t in sampler.generate_samples(data,num_loss_samples)] 72 | 73 | def update_factors(self,u,i,j,update_u=True,update_i=True): 74 | """apply SGD update""" 75 | update_j = self.update_negative_item_factors 76 | 77 | x = self.item_bias[i] - self.item_bias[j] \ 78 | + np.dot(self.user_factors[u,:],self.item_factors[i,:]-self.item_factors[j,:]) 79 | 80 | z = 1.0/(1.0+exp(x)) 81 | 82 | # update bias terms 83 | if update_i: 84 | d = z - self.bias_regularization * self.item_bias[i] 85 | self.item_bias[i] += self.learning_rate * d 86 | if update_j: 87 | d = -z - self.bias_regularization * self.item_bias[j] 88 | self.item_bias[j] += self.learning_rate * d 89 | 90 | if update_u: 91 | d = (self.item_factors[i,:]-self.item_factors[j,:])*z - self.user_regularization*self.user_factors[u,:] 92 | self.user_factors[u,:] += self.learning_rate*d 93 | if update_i: 94 | d = self.user_factors[u,:]*z - self.positive_item_regularization*self.item_factors[i,:] 95 | self.item_factors[i,:] += self.learning_rate*d 96 | if update_j: 97 | d = -self.user_factors[u,:]*z - self.negative_item_regularization*self.item_factors[j,:] 98 | self.item_factors[j,:] += self.learning_rate*d 99 
    def loss(self):
        ranking_loss = 0
        for u,i,j in self.loss_samples:
            x = self.predict(u,i) - self.predict(u,j)
            ranking_loss += 1.0/(1.0+exp(x))

        complexity = 0
        for u,i,j in self.loss_samples:
            complexity += self.user_regularization * np.dot(self.user_factors[u],self.user_factors[u])
            complexity += self.positive_item_regularization * np.dot(self.item_factors[i],self.item_factors[i])
            complexity += self.negative_item_regularization * np.dot(self.item_factors[j],self.item_factors[j])
            complexity += self.bias_regularization * self.item_bias[i]**2
            complexity += self.bias_regularization * self.item_bias[j]**2

        return ranking_loss + 0.5*complexity

    def predict(self,u,i):
        return self.item_bias[i] + np.dot(self.user_factors[u],self.item_factors[i])


# sampling strategies

class Sampler(object):

    def __init__(self,sample_negative_items_empirically):
        self.sample_negative_items_empirically = sample_negative_items_empirically

    def init(self,data,max_samples=None):
        self.data = data
        self.num_users,self.num_items = data.shape
        self.max_samples = max_samples

    def sample_user(self):
        u = self.uniform_user()
        num_items = self.data[u].getnnz()
        assert(num_items > 0 and num_items != self.num_items)
        return u

    def sample_negative_item(self,user_items):
        j = self.random_item()
        while j in user_items:
            j = self.random_item()
        return j

    def uniform_user(self):
        return random.randint(0,self.num_users-1)

    def random_item(self):
        """sample an item uniformly or from the empirical distribution
        observed in the training data
        """
        if self.sample_negative_items_empirically:
            # just pick something someone rated!
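            # (a user chosen uniformly at random, then one of that user's
            # rated items chosen uniformly at random)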
            u = self.uniform_user()
            i = random.choice(self.data[u].indices)
        else:
            i = random.randint(0,self.num_items-1)
        return i

    def num_samples(self,n):
        if self.max_samples is None:
            return n
        return min(n,self.max_samples)

class UniformUserUniformItem(Sampler):

    def generate_samples(self,data,max_samples=None):
        self.init(data,max_samples)
        for _ in xrange(self.num_samples(self.data.nnz)):
            u = self.uniform_user()
            # sample positive item
            i = random.choice(self.data[u].indices)
            j = self.sample_negative_item(self.data[u].indices)
            yield u,i,j

class UniformUserUniformItemWithoutReplacement(Sampler):

    def generate_samples(self,data,max_samples=None):
        self.init(data,max_samples)
        # make a local copy of data as we're going to "forget" some entries
        self.local_data = self.data.copy()
        for _ in xrange(self.num_samples(self.data.nnz)):
            u = self.uniform_user()
            # sample positive item without replacement if we can
            user_items = self.local_data[u].nonzero()[1]
            if len(user_items) == 0:
                # reset user data if it's all been sampled
                for ix in self.local_data[u].indices:
                    self.local_data[u,ix] = self.data[u,ix]
                user_items = self.local_data[u].nonzero()[1]
            i = random.choice(user_items)
            # forget this item so we don't sample it again for the same user
            self.local_data[u,i] = 0
            # negatives must avoid all of the user's positive items,
            # not just the ones we haven't sampled yet
            j = self.sample_negative_item(self.data[u].indices)
            yield u,i,j

class UniformPair(Sampler):

    def generate_samples(self,data,max_samples=None):
        self.init(data,max_samples)
        self.users,self.items = self.data.nonzero()
        for _ in xrange(self.num_samples(self.data.nnz)):
            idx = random.randint(0,self.data.nnz-1)
            u = self.users[idx]
            i = self.items[idx]
            j = self.sample_negative_item(self.data[u].indices)
            yield u,i,j

class UniformPairWithoutReplacement(Sampler):

    def generate_samples(self,data,max_samples=None):
        self.init(data,max_samples)
        idxs = range(self.data.nnz)
        random.shuffle(idxs)
        self.users,self.items = self.data.nonzero()
        self.users = self.users[idxs]
        self.items = self.items[idxs]
        self.idx = 0
        for _ in xrange(self.num_samples(self.data.nnz)):
            u = self.users[self.idx]
            i = self.items[self.idx]
            j = self.sample_negative_item(self.data[u].indices)
            self.idx += 1
            yield u,i,j

class ExternalSchedule(Sampler):

    def __init__(self,filepath,index_offset=0):
        self.filepath = filepath
        self.index_offset = index_offset

    def generate_samples(self,data,max_samples=None):
        self.init(data,max_samples)
        f = open(self.filepath)
        samples = [map(int,line.strip().split()) for line in f]
        random.shuffle(samples)  # important!
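        # (reducer output is typically grouped by key, i.e. long runs of
        # samples sharing the same user, so shuffling decorrelates
        # consecutive SGD updates)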
        num_samples = self.num_samples(len(samples))
        for u,i,j in samples[:num_samples]:
            yield u-self.index_offset,i-self.index_offset,j-self.index_offset

if __name__ == '__main__':

    # learn a matrix factorization with BPR like this:

    import sys
    from scipy.io import mmread

    data = mmread(sys.argv[1]).tocsr()

    args = BPRArgs()
    args.learning_rate = 0.3

    num_factors = 10
    model = BPR(num_factors,args)

    sample_negative_items_empirically = True
    sampler = UniformPairWithoutReplacement(sample_negative_items_empirically)
    num_iters = 10
    model.train(data,sampler,num_iters)

--------------------------------------------------------------------------------
/bprslim.py:
--------------------------------------------------------------------------------
"""
Sparse LInear Method for collaborative filtering
using BPR to optimise for AUC.

Uses pysparse for the item similarity matrix as
this appears to give much faster read/write access
than anything in scipy.sparse. Easiest to install
the Common Sense Computing version like this:
  pip install csc-pysparse
The data is still held in a scipy.sparse.csr_matrix
as for the access we need here that is actually faster
than the pysparse ll_mat.
"""

from pysparse.sparse.spmatrix import *
import numpy as np
from math import exp

from bpr import BPRArgs, ExternalSchedule

class BPRSLIM(object):

    def __init__(self,args):
        """
        initialise SLIM model
        """
        self.learning_rate = args.learning_rate
        self.positive_item_regularization = args.positive_item_regularization
        self.negative_item_regularization = args.negative_item_regularization
        self.init_similarity_std = 0.1

    def train(self,data,sampler,num_iters):
        """train model
        data: user-item matrix as a scipy sparse matrix
              users and items are zero-indexed
        sampler: must be ExternalSchedule
        """
        self.data = data
        self.num_users,self.num_items = self.data.shape

        print 'finding sparsity structure of item similarities...'
        # do a dry run of an iteration and collect the item indices
        indices = set()
        for u,i,j in sampler.generate_samples(self.data):
            for l in self.data[u].indices:
                if l != i:
                    indices.add((i,l))
                if l != j:
                    indices.add((j,l))
        print 'initialising item similarities...'
        self.item_similarities = ll_mat(self.num_items,self.num_items,len(indices))
        indices = np.array(list(indices))
        ii = indices[:,0]
        jj = indices[:,1]
        vals = self.init_similarity_std * np.random.randn(len(indices))
        for i,j,v in zip(ii,jj,vals):
            self.item_similarities[int(i),int(j)] = v

        # TODO: with pysparse we *might* get away with lazy initialization
        # and letting the item similarities grow over time...
        # i.e. we wouldn't be tied to a fixed schedule

        # create loss samples, again restrict to the scheduled samples
        # so we have initialised item similarities
        num_loss_samples = int(100*self.num_users**0.5)
        self.loss_samples = [t for t in sampler.generate_samples(data,num_loss_samples)]

        for it in xrange(num_iters):
            print 'starting iteration {0}'.format(it)
            for u,i,j in sampler.generate_samples(self.data):
                self.update_factors(u,i,j)
            print 'iteration {0}: loss = {1}'.format(it,self.loss())

    def loss(self):

        # TODO: this seems to take a lot of the training time - why??

        ranking_loss = 0
        for u,i,j in self.loss_samples:
            x = self.predict(u,i) - self.predict(u,j)
            ranking_loss += 1.0/(1.0+exp(x))

        complexity = 0
        for u,i,j in self.loss_samples:
            complexity += self.positive_item_regularization * self.item_similarities[i,:].norm('fro')**2
            complexity += self.negative_item_regularization * self.item_similarities[j,:].norm('fro')**2

        return ranking_loss + 0.5*complexity

    def predict(self,u,i):
        return sum(self.item_similarities[i,int(l)] for l in self.data[u].indices if l != i)

    def update_factors(self,u,i,j):
        """apply SGD update"""

        x = sum(self.item_similarities[i,int(l)]-self.item_similarities[j,int(l)] for l in self.data[u].indices)

        z = 1.0/(1.0+exp(x))

        # update item similarity weights
        for l in self.data[u].indices:
            l = int(l)
            if l != i:
                d = z - self.positive_item_regularization*self.item_similarities[i,l]
                self.item_similarities[i,l] += self.learning_rate*d
            if l != j:
                d = -z - self.negative_item_regularization*self.item_similarities[j,l]
                self.item_similarities[j,l] += self.learning_rate*d

if __name__ == '__main__':

    # learn SLIM item similarities with BPR like this:

    import sys
    from scipy.io import mmread

    data = mmread(sys.argv[1]).tocsr()
    sample_file = sys.argv[2]

    args = BPRArgs()
    args.learning_rate = 0.3

    model = BPRSLIM(args)

    num_iters = 10
    sampler = ExternalSchedule(sample_file,index_offset=1)  # schedule is one-indexed

    model.train(data,sampler,num_iters)

--------------------------------------------------------------------------------
/presampled_bpr.py:
--------------------------------------------------------------------------------
"""
Precompute a schedule of samples and use it to train
a BPR model.

Sampling is done in a map-reduce style.
"""

from bdoopr import *
import toydoop
from bpr import BPR, BPRArgs, ExternalSchedule
from numpy import loadtxt
from scipy.sparse import coo_matrix
import sys

def parser(line):
    return map(int,line.strip().split()[:2])

def formatter(key,val):
    return '{0}\t{1}\t{2}'.format(key,val[0],val[1])

datafile = sys.argv[1]  # one-indexed, whitespace separated: user item value
sample_file = datafile+'.samples'
tmp1 = sample_file+'.tmp1'
tmp2 = sample_file+'.tmp2'

print 'reading data...'
data = loadtxt(datafile)
print 'converting to zero-indexed sparse matrix...'
idxs = data[:,:2]-1
vals = data[:,2]
data = coo_matrix((vals,idxs.T)).tocsr()
user_item_counts = dict((i+1,data[i].getnnz()) for i in xrange(data.shape[0]))

print 'creating samples...'
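# stage 1 (Mapper1/reducer1) proposes candidate (u,i,j) triples at random
# indices; stage 2 (mapper2/reducer2) re-maps the data so that any triple
# whose j is actually a positive item for u can be dropped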
mapper1 = Mapper1(user_item_counts,oversampling=10)
print 'map-red1...'
toydoop.mapreduce(datafile,tmp1,mapper=mapper1,reducer=reducer1,parser=parser)
print 'map2...'
toydoop.mapreduce(datafile,tmp2,mapper=mapper2,parser=parser)  # map the data again
print 'red2...'
toydoop.mapreduce([tmp1,tmp2],sample_file,reducer=reducer2,formatter=formatter)

print 'training...'
args = BPRArgs()
args.learning_rate = 0.3
num_factors = 10
model = BPR(num_factors,args)
sampler = ExternalSchedule(sample_file,index_offset=1)  # schedule is one-indexed
num_iters = 10
model.train(data,sampler,num_iters)

--------------------------------------------------------------------------------
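
None of the scripts above show how to turn a trained model into recommendations.
As a rough sketch only (the recommend helper below is an illustration, not part of
the repository), the factors learned by bpr.py could be used to rank items for a
single user like this:

    import numpy as np

    def recommend(model, data, u, topn=10):
        # score every item for user u (same as BPR.predict, but vectorised)
        scores = model.item_bias + model.item_factors.dot(model.user_factors[u])
        # exclude items the user has already interacted with
        scores[data[u].indices] = -np.inf
        return np.argsort(-scores)[:topn]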