├── .gitignore
├── README.md
├── bdoopr.py
├── bpr.py
├── bprslim.py
└── presampled_bpr.py

/.gitignore:
--------------------------------------------------------------------------------
*.py[cod]

# C extensions
*.so

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg
lib
lib64

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox
nosetests.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
bpr
===

Bayesian Personalized Ranking is a learning algorithm for collaborative filtering
first introduced in:

BPR: Bayesian Personalized Ranking from Implicit Feedback.
Steffen Rendle, Christoph Freudenthaler, Zeno Gantner and Lars Schmidt-Thieme, Proc. UAI 2009.

This is a simple Python implementation of BPR for matrix factorization, based on the
C# implementation provided by the original authors in MyMediaLite, see http://www.mymedialite.net.
I've also implemented a BPR version of:

SLIM: Sparse Linear Methods for Top-N Recommender Systems.
Xia Ning and George Karypis, Proc. IEEE ICDM 2011.

--------------------------------------------------------------------------------
/bdoopr.py:
--------------------------------------------------------------------------------
"""
Map-reduce algorithm to create a schedule of BPR samples.

The probability of emitting a candidate positive item
in the first mapper is designed to give a uniform
probability of any item in the dataset being output
as the positive item in the final list of triples.
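(A user with many items is more likely to have a candidate triple
rejected in the second pass, because a randomly proposed negative
item is more likely to be one of their positives, so the first mapper
accepts positive proposals from such users with a correspondingly
higher probability.)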
8 | """ 9 | 10 | import random 11 | 12 | J_IS_POSITIVE = '-' 13 | 14 | class Mapper1: 15 | 16 | def __init__(self,user_item_counts,oversampling=1): 17 | self.N = sum(user_item_counts.values()) # number of non-zeros 18 | self.user_item_counts = user_item_counts 19 | self.max_item_count = max(user_item_counts.values()) 20 | self.oversampling = oversampling 21 | 22 | def sample_positive(self,user): 23 | alpha = float(self.N-self.max_item_count)/(self.N-self.user_item_counts[user]) 24 | return random.uniform(0,1) < alpha 25 | 26 | def rand_idx(self): 27 | return random.randint(0,self.N*self.oversampling) 28 | 29 | def __call__(self,user,item): 30 | # send candidate items to random indices 31 | for _ in xrange(self.oversampling): 32 | if self.sample_positive(user): 33 | # propose a candidate positive item 34 | yield self.rand_idx(),(user,item,'+') 35 | # propose a candidate negative item 36 | yield self.rand_idx(),(user,item,'-') 37 | 38 | def reducer1(idx,values): 39 | # sample a positive and negative item uniformly to make a candidate triple 40 | seen = {'+':[],'-':[]} 41 | for user,item,c in values: 42 | seen[c].append((user,item)) 43 | if seen['+'] and seen['-']: 44 | # we've got at least one postive and one negative item, now pick one 45 | pos = random.choice(seen['+']) 46 | neg = random.choice(seen['-']) 47 | yield (pos[0],neg[1]),pos[1] # candidate triple as (u,j),i 48 | 49 | def mapper2(user,item): 50 | # map the data again with an indicator value 51 | # to help us spot negative items in candidate triples 52 | yield (int(user),int(item)),J_IS_POSITIVE 53 | 54 | def reducer2(key,values): 55 | user,j = key 56 | values = list(values) 57 | # check the positive items 58 | ii = set(i for i in values if i != J_IS_POSITIVE) 59 | if len(ii) == len(values): 60 | # j really is a negative item for u 61 | for i in ii: 62 | yield user,(i,j) 63 | 64 | if __name__ == '__main__': 65 | 66 | import toydoop 67 | 68 | # create some user-item data 69 | data = { 70 | 1:[10,20,30,40,50,60,70,80,90], 71 | 2:[10,30,110,120,130,140,150], 72 | 3:[20,30,40,90,120,160,170,180,190] 73 | } 74 | user_item_counts = dict((k,len(v)) for k,v in data.iteritems()) 75 | 76 | datafile = 'bdoopr.in' 77 | mapout1 = 'bdoopr.map1' 78 | mapout2 = 'bdoopr.map2' 79 | outfile = 'bdoopr.out' 80 | 81 | f = open(datafile,'w') 82 | for user,items in data.iteritems(): 83 | for item in items: 84 | print >>f,toydoop.default_formatter(user,item) 85 | f.close() 86 | 87 | # run two stages of mapreduce 88 | mapper1 = Mapper1(user_item_counts,oversampling=10) 89 | toydoop.mapreduce(datafile,mapout1,mapper=mapper1,reducer=reducer1) 90 | toydoop.mapreduce(datafile,mapout2,mapper=mapper2) # map the data again 91 | toydoop.mapreduce([mapout1,mapout2],outfile,reducer=reducer2) 92 | -------------------------------------------------------------------------------- /bpr.py: -------------------------------------------------------------------------------- 1 | """ 2 | Bayesian Personalized Ranking 3 | 4 | Matrix Factorization model and a variety of classes 5 | implementing different sampling strategies. 
6 | """ 7 | 8 | import numpy as np 9 | from math import exp 10 | import random 11 | 12 | class BPRArgs(object): 13 | 14 | def __init__(self,learning_rate=0.05, 15 | bias_regularization=1.0, 16 | user_regularization=0.0025, 17 | positive_item_regularization=0.0025, 18 | negative_item_regularization=0.00025, 19 | update_negative_item_factors=True): 20 | self.learning_rate = learning_rate 21 | self.bias_regularization = bias_regularization 22 | self.user_regularization = user_regularization 23 | self.positive_item_regularization = positive_item_regularization 24 | self.negative_item_regularization = negative_item_regularization 25 | self.update_negative_item_factors = update_negative_item_factors 26 | 27 | class BPR(object): 28 | 29 | def __init__(self,D,args): 30 | """initialise BPR matrix factorization model 31 | D: number of factors 32 | """ 33 | self.D = D 34 | self.learning_rate = args.learning_rate 35 | self.bias_regularization = args.bias_regularization 36 | self.user_regularization = args.user_regularization 37 | self.positive_item_regularization = args.positive_item_regularization 38 | self.negative_item_regularization = args.negative_item_regularization 39 | self.update_negative_item_factors = args.update_negative_item_factors 40 | 41 | def train(self,data,sampler,num_iters): 42 | """train model 43 | data: user-item matrix as a scipy sparse matrix 44 | users and items are zero-indexed 45 | """ 46 | self.init(data) 47 | 48 | print 'initial loss = {0}'.format(self.loss()) 49 | for it in xrange(num_iters): 50 | print 'starting iteration {0}'.format(it) 51 | for u,i,j in sampler.generate_samples(self.data): 52 | self.update_factors(u,i,j) 53 | print 'iteration {0}: loss = {1}'.format(it,self.loss()) 54 | 55 | def init(self,data): 56 | self.data = data 57 | self.num_users,self.num_items = self.data.shape 58 | 59 | self.item_bias = np.zeros(self.num_items) 60 | self.user_factors = np.random.random_sample((self.num_users,self.D)) 61 | self.item_factors = np.random.random_sample((self.num_items,self.D)) 62 | 63 | self.create_loss_samples() 64 | 65 | def create_loss_samples(self): 66 | # apply rule of thumb to decide num samples over which to compute loss 67 | num_loss_samples = int(100*self.num_users**0.5) 68 | 69 | print 'sampling {0} triples...'.format(num_loss_samples) 70 | sampler = UniformUserUniformItem(True) 71 | self.loss_samples = [t for t in sampler.generate_samples(data,num_loss_samples)] 72 | 73 | def update_factors(self,u,i,j,update_u=True,update_i=True): 74 | """apply SGD update""" 75 | update_j = self.update_negative_item_factors 76 | 77 | x = self.item_bias[i] - self.item_bias[j] \ 78 | + np.dot(self.user_factors[u,:],self.item_factors[i,:]-self.item_factors[j,:]) 79 | 80 | z = 1.0/(1.0+exp(x)) 81 | 82 | # update bias terms 83 | if update_i: 84 | d = z - self.bias_regularization * self.item_bias[i] 85 | self.item_bias[i] += self.learning_rate * d 86 | if update_j: 87 | d = -z - self.bias_regularization * self.item_bias[j] 88 | self.item_bias[j] += self.learning_rate * d 89 | 90 | if update_u: 91 | d = (self.item_factors[i,:]-self.item_factors[j,:])*z - self.user_regularization*self.user_factors[u,:] 92 | self.user_factors[u,:] += self.learning_rate*d 93 | if update_i: 94 | d = self.user_factors[u,:]*z - self.positive_item_regularization*self.item_factors[i,:] 95 | self.item_factors[i,:] += self.learning_rate*d 96 | if update_j: 97 | d = -self.user_factors[u,:]*z - self.negative_item_regularization*self.item_factors[j,:] 98 | self.item_factors[j,:] += self.learning_rate*d 99 
    def loss(self):
        ranking_loss = 0
        for u,i,j in self.loss_samples:
            x = self.predict(u,i) - self.predict(u,j)
            ranking_loss += 1.0/(1.0+exp(x))

        complexity = 0
        for u,i,j in self.loss_samples:
            complexity += self.user_regularization * np.dot(self.user_factors[u],self.user_factors[u])
            complexity += self.positive_item_regularization * np.dot(self.item_factors[i],self.item_factors[i])
            complexity += self.negative_item_regularization * np.dot(self.item_factors[j],self.item_factors[j])
            complexity += self.bias_regularization * self.item_bias[i]**2
            complexity += self.bias_regularization * self.item_bias[j]**2

        return ranking_loss + 0.5*complexity

    def predict(self,u,i):
        return self.item_bias[i] + np.dot(self.user_factors[u],self.item_factors[i])


# sampling strategies

class Sampler(object):

    def __init__(self,sample_negative_items_empirically):
        self.sample_negative_items_empirically = sample_negative_items_empirically

    def init(self,data,max_samples=None):
        self.data = data
        self.num_users,self.num_items = data.shape
        self.max_samples = max_samples

    def sample_user(self):
        u = self.uniform_user()
        num_items = self.data[u].getnnz()
        assert(num_items > 0 and num_items != self.num_items)
        return u

    def sample_negative_item(self,user_items):
        j = self.random_item()
        while j in user_items:
            j = self.random_item()
        return j

    def uniform_user(self):
        return random.randint(0,self.num_users-1)

    def random_item(self):
        """sample an item uniformly or from the empirical distribution
        observed in the training data
        """
        if self.sample_negative_items_empirically:
            # just pick something someone rated!
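            # (a user chosen uniformly at random, then one of that user's
            # rated items chosen uniformly at random)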
            u = self.uniform_user()
            i = random.choice(self.data[u].indices)
        else:
            i = random.randint(0,self.num_items-1)
        return i

    def num_samples(self,n):
        if self.max_samples is None:
            return n
        return min(n,self.max_samples)

class UniformUserUniformItem(Sampler):

    def generate_samples(self,data,max_samples=None):
        self.init(data,max_samples)
        for _ in xrange(self.num_samples(self.data.nnz)):
            u = self.uniform_user()
            # sample positive item
            i = random.choice(self.data[u].indices)
            j = self.sample_negative_item(self.data[u].indices)
            yield u,i,j

class UniformUserUniformItemWithoutReplacement(Sampler):

    def generate_samples(self,data,max_samples=None):
        self.init(data,max_samples)
        # make a local copy of data as we're going to "forget" some entries
        self.local_data = self.data.copy()
        for _ in xrange(self.num_samples(self.data.nnz)):
            u = self.uniform_user()
            # sample positive item without replacement if we can
            user_items = self.local_data[u].nonzero()[1]
            if len(user_items) == 0:
                # reset user data if it's all been sampled
                for ix in self.local_data[u].indices:
                    self.local_data[u,ix] = self.data[u,ix]
                user_items = self.local_data[u].nonzero()[1]
            i = random.choice(user_items)
            # forget this item so we don't sample it again for the same user
            self.local_data[u,i] = 0
            # negatives must avoid all of the user's positive items,
            # not just the ones we haven't sampled yet
            j = self.sample_negative_item(self.data[u].indices)
            yield u,i,j

class UniformPair(Sampler):

    def generate_samples(self,data,max_samples=None):
        self.init(data,max_samples)
        self.users,self.items = self.data.nonzero()
        for _ in xrange(self.num_samples(self.data.nnz)):
            idx = random.randint(0,self.data.nnz-1)
            u = self.users[idx]
            i = self.items[idx]
            j = self.sample_negative_item(self.data[u].indices)
            yield u,i,j

class UniformPairWithoutReplacement(Sampler):

    def generate_samples(self,data,max_samples=None):
        self.init(data,max_samples)
        idxs = range(self.data.nnz)
        random.shuffle(idxs)
        self.users,self.items = self.data.nonzero()
        self.users = self.users[idxs]
        self.items = self.items[idxs]
        self.idx = 0
        for _ in xrange(self.num_samples(self.data.nnz)):
            u = self.users[self.idx]
            i = self.items[self.idx]
            j = self.sample_negative_item(self.data[u].indices)
            self.idx += 1
            yield u,i,j

class ExternalSchedule(Sampler):

    def __init__(self,filepath,index_offset=0):
        self.filepath = filepath
        self.index_offset = index_offset

    def generate_samples(self,data,max_samples=None):
        self.init(data,max_samples)
        f = open(self.filepath)
        samples = [map(int,line.strip().split()) for line in f]
        random.shuffle(samples)  # important!
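        # (reducer output is typically grouped by key, i.e. long runs of
        # samples sharing the same user, so shuffling decorrelates
        # consecutive SGD updates)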
        num_samples = self.num_samples(len(samples))
        for u,i,j in samples[:num_samples]:
            yield u-self.index_offset,i-self.index_offset,j-self.index_offset

if __name__ == '__main__':

    # learn a matrix factorization with BPR like this:

    import sys
    from scipy.io import mmread

    data = mmread(sys.argv[1]).tocsr()

    args = BPRArgs()
    args.learning_rate = 0.3

    num_factors = 10
    model = BPR(num_factors,args)

    sample_negative_items_empirically = True
    sampler = UniformPairWithoutReplacement(sample_negative_items_empirically)
    num_iters = 10
    model.train(data,sampler,num_iters)

--------------------------------------------------------------------------------
/bprslim.py:
--------------------------------------------------------------------------------
"""
Sparse LInear Method for collaborative filtering
using BPR to optimise for AUC.

Uses pysparse for the item similarity matrix as
this appears to give much faster read/write access
than anything in scipy.sparse. Easiest to install
the Common Sense Computing version like this:
  pip install csc-pysparse
The data is still held in a scipy.sparse.csr_matrix
as for the access we need here that is actually faster
than the pysparse ll_mat.
"""

from pysparse.sparse.spmatrix import *
import numpy as np
from math import exp

from bpr import BPRArgs, ExternalSchedule

class BPRSLIM(object):

    def __init__(self,args):
        """
        initialise SLIM model
        """
        self.learning_rate = args.learning_rate
        self.positive_item_regularization = args.positive_item_regularization
        self.negative_item_regularization = args.negative_item_regularization
        self.init_similarity_std = 0.1

    def train(self,data,sampler,num_iters):
        """train model
        data: user-item matrix as a scipy sparse matrix
              users and items are zero-indexed
        sampler: must be ExternalSchedule
        """
        self.data = data
        self.num_users,self.num_items = self.data.shape

        print 'finding sparsity structure of item similarities...'
        # do a dry run of an iteration and collect the item indices
        indices = set()
        for u,i,j in sampler.generate_samples(self.data):
            for l in self.data[u].indices:
                if l != i:
                    indices.add((i,l))
                if l != j:
                    indices.add((j,l))
        print 'initialising item similarities...'
        self.item_similarities = ll_mat(self.num_items,self.num_items,len(indices))
        indices = np.array(list(indices))
        ii = indices[:,0]
        jj = indices[:,1]
        vals = self.init_similarity_std * np.random.randn(len(indices))
        for i,j,v in zip(ii,jj,vals):
            self.item_similarities[int(i),int(j)] = v

        # TODO: with pysparse we *might* get away with lazy initialization
        # and letting the item similarities grow over time...
        # i.e. we wouldn't be tied to a fixed schedule

        # create loss samples, again restrict to the scheduled samples
        # so we have initialised item similarities
        num_loss_samples = int(100*self.num_users**0.5)
        self.loss_samples = [t for t in sampler.generate_samples(data,num_loss_samples)]

        for it in xrange(num_iters):
            print 'starting iteration {0}'.format(it)
            for u,i,j in sampler.generate_samples(self.data):
                self.update_factors(u,i,j)
            print 'iteration {0}: loss = {1}'.format(it,self.loss())

    def loss(self):

        # TODO: this seems to take a lot of the training time - why??

        ranking_loss = 0
        for u,i,j in self.loss_samples:
            x = self.predict(u,i) - self.predict(u,j)
            ranking_loss += 1.0/(1.0+exp(x))

        complexity = 0
        for u,i,j in self.loss_samples:
            complexity += self.positive_item_regularization * self.item_similarities[i,:].norm('fro')**2
            complexity += self.negative_item_regularization * self.item_similarities[j,:].norm('fro')**2

        return ranking_loss + 0.5*complexity

    def predict(self,u,i):
        return sum(self.item_similarities[i,int(l)] for l in self.data[u].indices if l != i)

    def update_factors(self,u,i,j):
        """apply SGD update"""

        x = sum(self.item_similarities[i,int(l)]-self.item_similarities[j,int(l)] for l in self.data[u].indices)

        z = 1.0/(1.0+exp(x))

        # update item similarity weights
        for l in self.data[u].indices:
            l = int(l)
            if l != i:
                d = z - self.positive_item_regularization*self.item_similarities[i,l]
                self.item_similarities[i,l] += self.learning_rate*d
            if l != j:
                d = -z - self.negative_item_regularization*self.item_similarities[j,l]
                self.item_similarities[j,l] += self.learning_rate*d

if __name__ == '__main__':

    # learn SLIM item similarities with BPR like this:

    import sys
    from scipy.io import mmread

    data = mmread(sys.argv[1]).tocsr()
    sample_file = sys.argv[2]

    args = BPRArgs()
    args.learning_rate = 0.3

    model = BPRSLIM(args)

    num_iters = 10
    sampler = ExternalSchedule(sample_file,index_offset=1)  # schedule is one-indexed

    model.train(data,sampler,num_iters)

--------------------------------------------------------------------------------
/presampled_bpr.py:
--------------------------------------------------------------------------------
"""
Precompute a schedule of samples and use it to train
a BPR model.

Sampling is done in a map-reduce style.
"""

from bdoopr import *
import toydoop
from bpr import BPR, BPRArgs, ExternalSchedule
from numpy import loadtxt
from scipy.sparse import coo_matrix
import sys

def parser(line):
    return map(int,line.strip().split()[:2])

def formatter(key,val):
    return '{0}\t{1}\t{2}'.format(key,val[0],val[1])

datafile = sys.argv[1]  # one-indexed, whitespace separated: user item value
sample_file = datafile+'.samples'
tmp1 = sample_file+'.tmp1'
tmp2 = sample_file+'.tmp2'

print 'reading data...'
data = loadtxt(datafile)
print 'converting to zero-indexed sparse matrix...'
idxs = data[:,:2]-1
vals = data[:,2]
data = coo_matrix((vals,idxs.T)).tocsr()
user_item_counts = dict((i+1,data[i].getnnz()) for i in xrange(data.shape[0]))

print 'creating samples...'
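# stage 1 (Mapper1/reducer1) proposes candidate (u,i,j) triples at random
# indices; stage 2 (mapper2/reducer2) re-maps the data so that any triple
# whose j is actually a positive item for u can be dropped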
mapper1 = Mapper1(user_item_counts,oversampling=10)
print 'map-red1...'
toydoop.mapreduce(datafile,tmp1,mapper=mapper1,reducer=reducer1,parser=parser)
print 'map2...'
toydoop.mapreduce(datafile,tmp2,mapper=mapper2,parser=parser)  # map the data again
print 'red2...'
toydoop.mapreduce([tmp1,tmp2],sample_file,reducer=reducer2,formatter=formatter)

print 'training...'
args = BPRArgs()
args.learning_rate = 0.3
num_factors = 10
model = BPR(num_factors,args)
sampler = ExternalSchedule(sample_file,index_offset=1)  # schedule is one-indexed
num_iters = 10
model.train(data,sampler,num_iters)

--------------------------------------------------------------------------------
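
None of the scripts above show how to turn a trained model into recommendations.
As a rough sketch only (the recommend helper below is an illustration, not part of
the repository), the factors learned by bpr.py could be used to rank items for a
single user like this:

    import numpy as np

    def recommend(model, data, u, topn=10):
        # score every item for user u (same as BPR.predict, but vectorised)
        scores = model.item_bias + model.item_factors.dot(model.user_factors[u])
        # exclude items the user has already interacted with
        scores[data[u].indices] = -np.inf
        return np.argsort(-scores)[:topn]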