├── data ├── track.py ├── user.py ├── __init__.py ├── artist.py └── record.py ├── tool ├── log.py ├── __init__.py ├── dataSplit.py ├── file.py ├── config.py ├── qmath.py └── TSNE.py ├── base ├── __init__.py ├── DeepRecommender ├── recommender.py └── IterativeRecommender.py ├── evaluation ├── __init__.py └── measure.py ├── parallel └── __init__.py ├── recommender ├── __init__.py ├── cf │ ├── __init__.py │ ├── UserKNN.py │ ├── WRMF.py │ ├── FISM.py │ ├── IPF.py │ └── BPR.py ├── advanced │ ├── __init__.py │ ├── LightGCN.py │ ├── CDAE.py │ ├── ExpoMF.py │ ├── RRN.py │ ├── APR.py │ ├── NGCF.py │ ├── DHCF.py │ ├── CoFactor.py │ ├── NeuMF.py │ ├── CUNE.py │ ├── DMF.py │ ├── Song2vec.py │ └── NeuTrans.py └── baseline │ ├── __init__.py │ ├── Rand.py │ └── MostPop.py ├── structure ├── __init__.py ├── sparseMatrix.py └── symmetricMatrix.py ├── .gitignore ├── config ├── Rand.conf ├── MostPop.conf ├── IPF.conf ├── UserKNN.conf ├── BPR.conf ├── WRMF.conf ├── FISM.conf ├── DHCF.conf ├── NeuMF.conf ├── RRN.conf ├── ExpoMF.conf ├── LightGCN.conf ├── MEM.conf ├── NGCF.conf ├── Song2vec.conf ├── ABLAH.conf ├── CUNE.conf ├── DMF.conf ├── APR.conf ├── CDAE.conf ├── CoFactor.conf └── NeuTrans.conf ├── main.py ├── Session-based RecSys Papers.md ├── yue.py └── README.md /data/track.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/user.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tool/log.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /base/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/artist.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tool/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /parallel/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /recommender/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /structure/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /recommender/cf/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /recommender/advanced/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /recommender/baseline/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | results/ 3 | *.pyc 4 | dataset/ 5 | -------------------------------------------------------------------------------- /config/Rand.conf: -------------------------------------------------------------------------------- 1 | record=./dataset/log.txt 2 | record.setup=-columns user:1,track:2,artist:3,time:0 -delim , 3 | recommender=Rand 4 | evaluation.setup=-ap 0.2 -target track 5 | item.ranking=-topN 10 6 | output.setup=on -dir ./results/ -------------------------------------------------------------------------------- /config/MostPop.conf: -------------------------------------------------------------------------------- 1 | record=./dataset/log.txt 2 | record.setup=-columns user:1,track:2,artist:3,time:0 -delim , 3 | recommender=MostPop 4 | evaluation.setup=-byTime 0.2 -target track 5 | item.ranking=-topN 5,10,15,20 6 | output.setup=on -dir ./results/ -------------------------------------------------------------------------------- /config/IPF.conf: -------------------------------------------------------------------------------- 1 | record=./dataset/log.txt 2 | record.setup=-columns user:1,track:2,time:0 -delim , 3 | recommender=IPF 4 | evaluation.setup=-target track -byTime 0.2 -sample 5 | item.ranking=-topN 5,10 6 | IPF=-rho 1 -beta 0.7 -eta 0.3 7 | output.setup=on -dir ./results/ -------------------------------------------------------------------------------- /config/UserKNN.conf: -------------------------------------------------------------------------------- 1 | record=./dataset/log.txt 2 | record.setup=-columns user:1,track:2,artist:3,time:0 -delim , 3 | evaluation.setup=-target track -byTime 0.2 -sample 4 | recommender=UserKNN 5 | item.ranking=-topN 5,10,15,20 6 | num.neighbors=20 7 | output.setup=on -dir ./results/ -------------------------------------------------------------------------------- /config/BPR.conf: -------------------------------------------------------------------------------- 1 | record=./dataset/log.txt 2 | record.setup=-columns user:1,track:2,artist:3,time:0 -delim , 3 | recommender=BPR 4 | evaluation.setup=-target track -byTime 0.2 5 | item.ranking=-topN 5,10 6 | num.factors=10 7 | num.max.iter=1 8 | learnRate=-init 0.02 -max 1 9 | reg.lambda=-u 0.01 -i 0.01 -b 0.2 -s 0.2 10 | output.setup=on -dir ./results/ -------------------------------------------------------------------------------- /config/WRMF.conf: -------------------------------------------------------------------------------- 1 | record=./dataset/log.txt 2 | record.setup=-columns user:1,track:2,artist:3,time:0 -delim , 3 | recommender=WRMF 4 | evaluation.setup=-target track -byTime 0.2 -sample 5 | item.ranking=-topN 5,10,15,20 6 | num.factors=20 7 | num.max.iter=10 8 | learnRate=-init 0.02 -max 1 9 | reg.lambda=-u 1 -i 0.1 -b 0.2 -s 0.2 10 | output.setup=on -dir ./results/ -------------------------------------------------------------------------------- /config/FISM.conf: -------------------------------------------------------------------------------- 1 | record=./dataset/log.txt 2 | record.setup=-columns user:1,track:2,artist:3,time:0 -delim , 
3 | recommender=FISM 4 | evaluation.setup=-target track -byTime 0.2 5 | item.ranking=-topN 5 6 | num.factors=10 7 | num.max.iter=20 8 | FISM=-rho 2 -alpha 0.5 9 | learnRate=-init 0.015 -max 1 10 | reg.lambda=-u 0.01 -i 0.01 -b 0.01 -s 0.01 11 | output.setup=on -dir ./results/ -------------------------------------------------------------------------------- /config/DHCF.conf: -------------------------------------------------------------------------------- 1 | record=./dataset/log.txt 2 | record.setup=-columns user:1,track:2,artist:3,time:0 -delim , 3 | recommender=DHCF 4 | evaluation.setup=-target track -byTime 0.2 5 | item.ranking=-topN 5,10 6 | num.factors=64 7 | num.max.iter=1 8 | batch_size=512 9 | learnRate=-init 0.003 -max 1 10 | reg.lambda=-u 0.002 -i 0.01 -b 0.2 -s 0.2 11 | output.setup=on -dir ./results/DHCF/ 12 | -------------------------------------------------------------------------------- /config/NeuMF.conf: -------------------------------------------------------------------------------- 1 | record=./dataset/log.txt 2 | record.setup=-columns user:1,track:2,artist:3,time:0 -delim , 3 | recommender=NeuMF 4 | evaluation.setup=-target track -byTime 0.2 5 | item.ranking=-topN 5,10 6 | num.factors=64 7 | num.max.iter=100 8 | batch_size=32 9 | learnRate=-init 0.02 -max 1 10 | reg.lambda=-u 0.01 -i 0.01 -b 0.2 -s 0.2 11 | output.setup=on -dir ./results/NeuMF/ -------------------------------------------------------------------------------- /config/RRN.conf: -------------------------------------------------------------------------------- 1 | record=./dataset/log.txt 2 | record.setup=-columns user:1,track:2,artist:3,time:0 -delim , 3 | recommender=RRN 4 | evaluation.setup=-target track -byTime 0.2 5 | item.ranking=-topN 10,20 6 | num.factors=20 7 | num.max.iter=100 8 | batch_size=32 9 | learnRate=-init 0.02 -max 1 10 | reg.lambda=-u 0.1 -i 0.1 -b 0.2 -s 0.2 11 | output.setup=on -dir ./results/RRN/ 12 | -------------------------------------------------------------------------------- /config/ExpoMF.conf: -------------------------------------------------------------------------------- 1 | record=./dataset/log.txt 2 | record.setup=-columns user:1,track:2,artist:3,time:0 -delim , 3 | recommender=ExpoMF 4 | evaluation.setup=-ap 0.2 -target track -b 1 5 | item.ranking=-topN 5,10,15,20 6 | num.factors=20 7 | num.max.iter=100 8 | learnRate=-init 0.01 -max 1 9 | CDAE=-batch_size 256 10 | reg.lambda=-u 0.1 -i 0.01 -b 0.01 11 | output.setup=on -dir ./results/ -------------------------------------------------------------------------------- /config/LightGCN.conf: -------------------------------------------------------------------------------- 1 | record=./dataset/log.txt 2 | record.setup=-columns user:1,track:2,artist:3,time:0 -delim , 3 | recommender=LightGCN 4 | evaluation.setup=-target track -byTime 0.2 5 | item.ranking=-topN 5,10 6 | num.factors=50 7 | num.max.iter=100 8 | batch_size=128 9 | learnRate=-init 0.002 -max 1 10 | reg.lambda=-u 0.001 -i 0.001 -b 0.2 -s 0.2 11 | output.setup=on -dir ./results/LightGCN/ 12 | -------------------------------------------------------------------------------- /config/MEM.conf: -------------------------------------------------------------------------------- 1 | record=./dataset/log.txt 2 | record.setup=-columns user:1,track:2,artist:3,time:0 -delim , 3 | recommender=MEM 4 | evaluation.setup=-byTime 0.2 -target track 5 | item.ranking=-topN 5 6 | num.factors=50 7 | num.max.iter=2 8 | learnRate=-init 0.01 -max 1 9 | MEM=-epoch 2 -winSize 5 -negCount 5 -beta 0.01 10 
| reg.lambda=-u 0.01 -i 0.01 -b 0.2 -s 0.2 11 | output.setup=on -dir ./results/ -------------------------------------------------------------------------------- /config/NGCF.conf: -------------------------------------------------------------------------------- 1 | record=./dataset/test.txt 2 | record.setup=-columns user:1,track:2,artist:3,time:0 -delim , 3 | recommender=NGCF 4 | evaluation.setup=-target track -byTime 0.2 5 | item.ranking=-topN 5,10 6 | num.factors=64 7 | num.max.iter=100 8 | batch_size=16 9 | learnRate=-init 0.003 -max 1 10 | reg.lambda=-u 0.001 -i 0.001 -b 0.2 -s 0.2 11 | output.setup=on -dir ./results/NGCF/ 12 | -------------------------------------------------------------------------------- /config/Song2vec.conf: -------------------------------------------------------------------------------- 1 | record=./dataset/log.txt 2 | record.setup=-columns user:1,track:2,artist:3,time:0 -delim , 3 | recommender=Song2vec 4 | evaluation.setup=-target track -byTime 0.2 5 | item.ranking=-topN 5,10,15,20 6 | num.factors=20 7 | num.max.iter=10 8 | learnRate=-init 0.02 -max 1 9 | Song2vec=-alpha 0.5 -k 10 10 | reg.lambda=-u 1 -i 0.1 -b 0.2 -s 0.2 11 | output.setup=on -dir ./results/ 12 | -------------------------------------------------------------------------------- /config/ABLAH.conf: -------------------------------------------------------------------------------- 1 | record=./dataset/t.txt 2 | record.setup=-columns user:1,track:2,artist:3,time:0 -delim , 3 | recommender=ABLAH 4 | evaluation.setup=-target track -ap 0.2 5 | item.ranking=-topN 5,10,20 6 | num.factors=32 7 | num.max.iter=1000 8 | learnRate=-init 0.001 -max 1 9 | ABLAH=-batch_size 32 -cutoff 3 10 | reg.lambda=-u 0.05 -i 0.05 -b 0.1 -s 0.1 11 | output.setup=on -dir ./results/ABLAH/ 12 | -------------------------------------------------------------------------------- /config/CUNE.conf: -------------------------------------------------------------------------------- 1 | record=./dataset/log.txt 2 | record.setup=-columns user:1,track:2,artist:3,time:0 -delim , 3 | recommender=CUNE 4 | evaluation.setup=-target track -byTime 0.2 5 | item.ranking=-topN 5,10 6 | num.factors=20 7 | num.max.iter=20 8 | CUNE=-T 20 -L 10 -l 20 -w 5 -k 50 -s 2 -ep 10 9 | learnRate=-init 0.02 -max 0.1 10 | reg.lambda=-u 0.01 -i 0.01 -b 0.01 -s 0.2 11 | output.setup=on -dir ./results/ -------------------------------------------------------------------------------- /config/DMF.conf: -------------------------------------------------------------------------------- 1 | record=./dataset/log.txt 2 | record.setup=-columns user:1,track:2,artist:3,time:0 -delim , 3 | recommender=DMF 4 | evaluation.setup=-target track -byTime 0.2 5 | item.ranking=-topN 5,10,15,20 6 | num.factors=20 7 | num.max.iter=100 8 | batch_size=512 9 | learnRate=-init 0.02 -max 1 10 | DMF=-alpha 0.2 -k 10 -neg 5 11 | reg.lambda=-u 0.1 -i 0.1 -b 0.2 12 | output.setup=on -dir ./results/DMF/ 13 | -------------------------------------------------------------------------------- /config/APR.conf: -------------------------------------------------------------------------------- 1 | record=./dataset/log.txt 2 | record.setup=-columns user:1,track:2,artist:3,time:0 -delim , 3 | recommender=APR 4 | evaluation.setup=-target track -byTime 0.2 5 | item.ranking=-topN 5,10 6 | num.factors=64 7 | num.max.iter=100 8 | batch_size=512 9 | APR=-regA 2 -eps 0.5 -advEpoch 100 10 | learnRate=-init 0.003 -max 1 11 | reg.lambda=-u 0.002 -i 0.01 -b 0.2 -s 0.2 12 | output.setup=on -dir ./results/APR/ 13 | 
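Note: all of the .conf files above share the same key=value layout and are parsed by tool/config.py (included later in this dump): Config reads whole-line values, while option strings such as "-topN 5,10" or "-u 0.01 -i 0.01" are read through LineConfig. The snippet below is an illustrative sketch only (not a file in the repository); the parameter values are taken from BPR.conf above.

# Illustrative sketch: reading the key=value configs above via tool/config.py
from tool.config import Config, LineConfig

conf = Config('./config/BPR.conf')               # whole-line values, e.g. conf['recommender'] == 'BPR'
ranking = LineConfig(conf['item.ranking'])       # option string '-topN 5,10'
topN = [int(n) for n in ranking['-topN'].split(',')]   # -> [5, 10]
regular = LineConfig(conf['reg.lambda'])         # '-u 0.01 -i 0.01 -b 0.2 -s 0.2'
regU, regI = float(regular['-u']), float(regular['-i'])
print(topN, regU, regI)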
-------------------------------------------------------------------------------- /config/CDAE.conf: -------------------------------------------------------------------------------- 1 | record=./dataset/xiami.txt 2 | record.setup=-columns user:0,track:1,artist:2,album:3,time:4 -delim , 3 | recommender=CDAE 4 | evaluation.setup=-ap 0.2 -target track 5 | item.ranking=-topN 5,10,15,20 6 | num.factors=100 7 | num.max.iter=100 8 | learnRate=-init 0.6 -max 10 9 | CDAE=-batch_size 256 -co 0.1 -nh 128 10 | reg.lambda=-u 0.1 -i 0.1 -b 0.1 11 | output.setup=on -dir ./results/CDAE/ 12 | -------------------------------------------------------------------------------- /config/CoFactor.conf: -------------------------------------------------------------------------------- 1 | record=./dataset/xiami.txt 2 | record.setup=-columns user:0,track:1,artist:2,album:3,time:4 -delim , 3 | recommender=CoFactor 4 | evaluation.setup=-ap 0.2 -b 5.0 -target track 5 | item.ranking=-topN 10,20 6 | num.factors=10 7 | num.max.iter=7 8 | learnRate=-init 0.01 -max 1 9 | reg.lambda=-u 0.01 -i 0.01 -b 0.01 -s 0.1 10 | CoFactor=-k 5 -gamma 0.03 -filter 10 11 | output.setup=on -dir ./results/CoFactor/ 12 | -------------------------------------------------------------------------------- /config/NeuTrans.conf: -------------------------------------------------------------------------------- 1 | record=./dataset/xiami1.txt 2 | record.setup=-columns user:0,track:1,artist:2,album:3,time:4 -delim , 3 | recommender=NeuTrans 4 | evaluation.setup=-target track -byTime 0.2 5 | item.ranking=-topN 5,10,15,20 6 | num.factors=64 7 | num.max.iter=1000 8 | batch_size=512 9 | NeuTrans=-alpha 0.2 -k 10 -neg 5 10 | learnRate=-init 0.02 -max 1 11 | reg.lambda=-u 0.02 -i 0.01 -b 0.2 12 | output.setup=on -dir ./results/NeuTrans/ 13 | -------------------------------------------------------------------------------- /recommender/baseline/Rand.py: -------------------------------------------------------------------------------- 1 | #coding:utf8 2 | from base.recommender import Recommender 3 | from random import shuffle 4 | import numpy as np 5 | class Rand(Recommender): 6 | 7 | # Recommend items for every user at random 8 | 9 | def __init__(self,conf,trainingSet=None,testSet=None,fold='[1]'): 10 | super(Rand, self).__init__(conf,trainingSet,testSet,fold) 11 | 12 | 13 | 14 | def predict(self, u): 15 | 'invoked to rank all the items for the user' 16 | # return all candidate items in a random order 17 | candidates = list(self.data.listened[self.recType].keys()) 18 | shuffle(candidates) 19 | return candidates 20 | 21 | 22 | -------------------------------------------------------------------------------- /structure/sparseMatrix.py: -------------------------------------------------------------------------------- 1 | from random import choice 2 | from collections import defaultdict 3 | class SparseMatrix(object): 4 | def __init__(self): 5 | self.matrix = defaultdict(dict) 6 | self.constructed = False 7 | 8 | def set(self,r,c,val): 9 | self.matrix[r][c]=val 10 | 11 | def get(self,r,c): 12 | if r in self.matrix and c in self.matrix[r]: 13 | return self.matrix[r][c] 14 | else: 15 | print ('No element in row',r,'and column',c) 16 | raise KeyError 17 | 18 | def anyone(self): 19 | if not self.constructed: 20 | self.rows = self.matrix.keys() 21 | ## to be continued 22 | -------------------------------------------------------------------------------- /recommender/baseline/MostPop.py: -------------------------------------------------------------------------------- 1 | #coding:utf8 2 | from
base.recommender import Recommender 3 | import numpy as np 4 | class MostPop(Recommender): 5 | 6 | # Recommend the most popular items for every user 7 | 8 | def __init__(self,conf,trainingSet=None,testSet=None,fold='[1]'): 9 | super(MostPop, self).__init__(conf,trainingSet,testSet,fold) 10 | 11 | # def readConfiguration(self): 12 | # super(BPR, self).readConfiguration() 13 | 14 | def buildModel(self): 15 | 16 | self.recommendation = [] 17 | self.recommendation = sorted(self.data.listened[self.recType].items(), key=lambda d: len(d[1]), reverse=True) 18 | self.recommendation = [item[0] for item in self.recommendation] 19 | 20 | 21 | def predict(self, u): 22 | 'invoked to rank all the items for the user' 23 | return self.recommendation 24 | 25 | 26 | -------------------------------------------------------------------------------- /structure/symmetricMatrix.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class SymmetricMatrix(object): 4 | def __init__(self, shape): 5 | self.symMatrix = {} 6 | self.shape = (shape,shape) 7 | 8 | def __getitem__(self, item): 9 | if item in self.symMatrix: 10 | return self.symMatrix[item] 11 | return {} 12 | 13 | def set(self,i,j,val): 14 | if i not in self.symMatrix: 15 | self.symMatrix[i] = {} 16 | self.symMatrix[i][j]=val 17 | if j not in self.symMatrix: 18 | self.symMatrix[j] = {} 19 | self.symMatrix[j][i] = val 20 | 21 | 22 | def get(self,i,j): 23 | if i not in self.symMatrix or j not in self.symMatrix[i]: 24 | return 0 25 | return self.symMatrix[i][j] 26 | 27 | def contains(self,i,j): 28 | if i in self.symMatrix and j in self.symMatrix[i]: 29 | return True 30 | else: 31 | return False 32 | 33 | -------------------------------------------------------------------------------- /tool/dataSplit.py: -------------------------------------------------------------------------------- 1 | from random import random 2 | from tool.file import FileIO 3 | class DataSplit(object): 4 | 5 | def __init__(self): 6 | pass 7 | 8 | @staticmethod 9 | def dataSplit(data,test_ratio = 0.3,output=False,path='./',order=1): 10 | if test_ratio>=1 or test_ratio <=0: 11 | test_ratio = 0.3 12 | testSet = [] 13 | trainingSet = [] 14 | for entry in data: 15 | if random() < test_ratio: 16 | testSet.append(entry) 17 | else: 18 | trainingSet.append(entry) 19 | 20 | if output: 21 | FileIO.writeFile(path,'testSet['+str(order)+']',testSet) 22 | FileIO.writeFile(path, 'trainingSet[' + str(order) + ']', trainingSet) 23 | return trainingSet,testSet 24 | 25 | @staticmethod 26 | def crossValidation(data,k): 27 | if k<=1 or k>10: 28 | k=3 29 | for i in range(k): 30 | trainingSet = [] 31 | testSet = [] 32 | for ind,entry in enumerate(data): 33 | if ind%k == i: 34 | testSet.append(entry) 35 | else: 36 | trainingSet.append(entry) 37 | yield trainingSet,testSet 38 | 39 | 40 | -------------------------------------------------------------------------------- /tool/file.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | from os import makedirs,remove 3 | from re import compile,findall,split 4 | from tool.config import LineConfig 5 | class FileIO(object): 6 | def __init__(self): 7 | pass 8 | 9 | @staticmethod 10 | def writeFile(dir,file,content,op = 'w'): 11 | if not os.path.exists(dir): 12 | os.makedirs(dir) 13 | with open(dir+file,op) as f: 14 | f.writelines(content) 15 | 16 | @staticmethod 17 | def deleteFile(filePath): 18 | if os.path.exists(filePath): 19 | remove(filePath) 20 | 21 |
22 | @staticmethod 23 | def loadDataSet(file, columns,binarized = False, threshold = 3, delim='' ): 24 | print ('load dataset...') 25 | record = [] 26 | colNames = columns.keys() 27 | if len(colNames) < 2: 28 | print ('The dataset needs more information or the record.setup setting has some problems...') 29 | exit(-1) 30 | index = [int(item) for item in columns.values()] 31 | delimiter=',| |\t' 32 | if delim != '': 33 | delimiter = delim 34 | with open(file) as f: 35 | lineNo = 0 36 | for line in f: 37 | lineNo += 1 38 | try: 39 | items = split(delimiter, line.strip()) 40 | event = {} 41 | for column,ind in zip(colNames,index): 42 | event[column] = items[ind] 43 | if binarized and 'play' in event: 44 | if int(event['play']) >= threshold: 45 | event['play'] = 1 46 | else: 47 | event['play'] = 0 48 | record.append(event) 49 | except IndexError: 50 | print ('The record file is not in a correct format. Error Location: Line num %d' % lineNo) 51 | exit(-1) 52 | return record 53 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("..") 3 | import yue 4 | from tool.config import Config 5 | import time 6 | 7 | if __name__ == '__main__': 8 | 9 | print ('='*80) 10 | print (' Yue: Library for Music Recommendation. ') 11 | print ('='*80) 12 | print ('CF-based Recommenders:') 13 | print ('1. BPR 2. FISM 3. WRMF 4. IPF') 14 | print ('5. UserKNN') 15 | 16 | print ('Content-based Recommenders:\n') 17 | 18 | print ('Hybrid Recommenders:\n') 19 | 20 | print ('Advanced Recommenders:') 21 | print ('a1. CUNE a2. Song2vec a3. BasicMF') 22 | print ('a4. CDAE a5. DMF') 23 | 24 | s = time.time() 25 | 26 | print ('Baselines:') 27 | print ('b1. MostPop b2. 
Rand') 28 | print ('='*80) 29 | algor = -1 30 | conf = -1 31 | order = input('Please enter the num of the algorithm to run it:') 32 | 33 | import time 34 | s = time.time() 35 | 36 | if order=='1': 37 | conf = Config('./config/BPR.conf') 38 | 39 | elif order=='2': 40 | conf = Config('./config/FISM.conf') 41 | 42 | elif order=='3': 43 | conf = Config('./config/WRMF.conf') 44 | 45 | elif order=='4': 46 | conf = Config('./config/IPF.conf') 47 | 48 | elif order=='5': 49 | conf = Config('./config/UserKNN.conf') 50 | 51 | elif order == 'b1': 52 | conf = Config('./config/MostPop.conf') 53 | 54 | elif order == 'b2': 55 | conf = Config('./config/rand.conf') 56 | 57 | 58 | elif order == 'a1': 59 | conf = Config('./config/CUNE.conf') 60 | 61 | elif order == 'a2': 62 | conf = Config('./config/Song2vec.conf') 63 | 64 | elif order == 'a3': 65 | conf = Config('./config/BasicMF.conf') 66 | 67 | elif order == 'a4': 68 | conf = Config('./config/APR.conf') 69 | 70 | elif order == 'a5': 71 | conf = Config('./config/DMF.conf') 72 | 73 | else: 74 | print ('Error num!') 75 | exit(-1) 76 | 77 | musicSys = yue.Yue(conf) 78 | musicSys.execute() 79 | e = time.time() 80 | print ("Run time: %f s" % (e - s)) 81 | 82 | -------------------------------------------------------------------------------- /base/DeepRecommender: -------------------------------------------------------------------------------- 1 | from base.IterativeRecommender import IterativeRecommender 2 | from tool import config 3 | import numpy as np 4 | from random import shuffle 5 | import tensorflow as tf 6 | 7 | class DeepRecommender(IterativeRecommender): 8 | def __init__(self,conf,trainingSet=None,testSet=None,fold='[1]'): 9 | super(DeepRecommender, self).__init__(conf,trainingSet,testSet,fold) 10 | 11 | def readConfiguration(self): 12 | super(DeepRecommender, self).readConfiguration() 13 | # set the reduced dimension 14 | self.batch_size = int(self.config['batch_size']) 15 | # regularization parameter 16 | regular = config.LineConfig(self.config['reg.lambda']) 17 | self.regU,self.regI,self.regB= float(regular['-u']),float(regular['-i']),float(regular['-b']) 18 | 19 | def printAlgorConfig(self): 20 | super(DeepRecommender, self).printAlgorConfig() 21 | 22 | def initModel(self): 23 | self.u_idx = tf.placeholder(tf.int32, [None], name="u_idx") 24 | self.v_idx = tf.placeholder(tf.int32, [None], name="v_idx") 25 | 26 | self.r = tf.placeholder(tf.float32, [None], name="rating") 27 | self.m = self.data.getSize('user') 28 | self.n = self.data.getSize(self.recType) 29 | self.train_size = len(self.data.trainingData) 30 | self.U = tf.Variable(tf.truncated_normal(shape=[self.m, self.k], stddev=0.005), name='U') 31 | self.V = tf.Variable(tf.truncated_normal(shape=[self.n, self.k], stddev=0.005), name='V') 32 | 33 | self.U_embed = tf.nn.embedding_lookup(self.U, self.u_idx) 34 | self.V_embed = tf.nn.embedding_lookup(self.V, self.v_idx) 35 | self.sess = tf.Session() 36 | 37 | def saveModel(self): 38 | pass 39 | 40 | def loadModel(self): 41 | pass 42 | 43 | def predictForRanking(self,u): 44 | 'used to rank all the items for the user' 45 | pass 46 | 47 | def isConverged(self,iter): 48 | from math import isnan 49 | if isnan(self.loss): 50 | print ('Loss = NaN or Infinity: current settings does not fit the recommender! 
Change the settings and try again!') 51 | exit(-1) 52 | deltaLoss = (self.lastLoss-self.loss) 53 | #check if converged 54 | cond = abs(deltaLoss) < 1e-8 55 | converged = cond 56 | if not converged: 57 | self.updateLearningRate(iter) 58 | self.lastLoss = self.loss 59 | return converged 60 | -------------------------------------------------------------------------------- /tool/config.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | from os.path import abspath 3 | class Config(object): 4 | def __init__(self,fileName): 5 | self.config = {} 6 | self.readConfiguration(fileName) 7 | 8 | def __getitem__(self, item): 9 | if not self.contains(item): 10 | print ('parameter '+item+' is invalid!') 11 | exit(-1) 12 | 13 | return self.config[item] 14 | 15 | def getOptions(self,item): 16 | if not self.contains(item): 17 | print ('parameter '+item+' is invalid!') 18 | exit(-1) 19 | return self.config[item] 20 | 21 | def contains(self,key): 22 | if key in self.config: 23 | return True 24 | else: 25 | return False 26 | # return self.config.has_key() 27 | 28 | def readConfiguration(self,fileName): 29 | path = abspath(fileName) 30 | if not os.path.exists(path): 31 | print ('config file is not found!') 32 | raise IOError 33 | with open(path) as f: 34 | for ind,line in enumerate(f): 35 | if line.strip()!='': 36 | try: 37 | key,value=line.strip().split('=') 38 | self.config[key]=value 39 | except ValueError: 40 | print ('config file is not in the correct format! Error Line:%d'%(ind)) 41 | 42 | 43 | class LineConfig(object): 44 | def __init__(self,content): 45 | self.line = content.strip().split(' ') 46 | self.options = {} 47 | self.mainOption = False 48 | if self.line[0] == 'on': 49 | self.mainOption = True 50 | elif self.line[0] == 'off': 51 | self.mainOption = False 52 | for i,item in enumerate(self.line): 53 | if (item.startswith('-') or item.startswith('--')) and not item[1:].isdigit(): 54 | ind = i+1 55 | for j,sub in enumerate(self.line[ind:]): 56 | if (sub.startswith('-') or sub.startswith('--')) and not sub[1:].isdigit(): 57 | ind = j 58 | break 59 | if j == len(self.line[ind:])-1: 60 | ind=j+1 61 | break 62 | try: 63 | self.options[item] = ' '.join(self.line[i+1:i+1+ind]) 64 | except IndexError: 65 | self.options[item] = 1 66 | 67 | 68 | def __getitem__(self, item): 69 | if not self.contains(item): 70 | print ('parameter '+item+' is invalid!') 71 | exit(-1) 72 | return self.options[item] 73 | 74 | def getOption(self,key): 75 | if not self.contains(key): 76 | print ('parameter '+key+' is invalid!') 77 | exit(-1) 78 | return self.options[key] 79 | 80 | def isMainOn(self): 81 | return self.mainOption 82 | 83 | def contains(self,key): 84 | if key in self.options: 85 | return True 86 | else: 87 | return False 88 | # return self.options.has_key(key) 89 | -------------------------------------------------------------------------------- /recommender/cf/UserKNN.py: -------------------------------------------------------------------------------- 1 | from base.recommender import Recommender 2 | from tool import qmath 3 | from structure.symmetricMatrix import SymmetricMatrix 4 | from collections import defaultdict 5 | 6 | class UserKNN(Recommender): 7 | def __init__(self,conf,trainingSet=None,testSet=None,fold='[1]'): 8 | super(UserKNN, self).__init__(conf,trainingSet,testSet,fold) 9 | self.userSim = SymmetricMatrix(len(self.data.name2id['user'])) 10 | self.topUsers = {} 11 | 12 | def readConfiguration(self): 13 | super(UserKNN, self).readConfiguration() 14 | 
self.neighbors = int(self.config['num.neighbors']) 15 | 16 | def printAlgorConfig(self): 17 | "show algorithm's configuration" 18 | super(UserKNN, self).printAlgorConfig() 19 | print ('Specified Arguments of',self.config['recommender']+':') 20 | print ('num.neighbors:',self.config['num.neighbors']) 21 | print ('='*80) 22 | 23 | def initModel(self): 24 | self.computeCorr() 25 | 26 | def predict(self,u): 27 | recommendations = [] 28 | for item in self.data.listened[self.recType]: 29 | sum, denom = 0, 0 30 | for simUser in self.topUsers[u]: 31 | #if user n has rating on item i 32 | if simUser[0] in self.data.listened[self.recType][item]: 33 | similarity = simUser[1] 34 | score = self.data.listened[self.recType][item][simUser[0]] 35 | sum += similarity*score 36 | denom += similarity 37 | if sum!=0: 38 | score = sum / float(denom) 39 | recommendations.append((item,score)) 40 | recommendations = sorted(recommendations,key=lambda d:d[1],reverse=True) 41 | recommendations = [item[0] for item in recommendations] 42 | return recommendations 43 | 44 | def computeCorr(self): 45 | 'compute correlation among users' 46 | userListen = defaultdict(dict) 47 | for user in self.data.userRecord: 48 | for item in self.data.userRecord[user]: 49 | if item[self.recType] in userListen[user]: 50 | userListen[user][item[self.recType]] += 1 51 | else: 52 | userListen[user][item[self.recType]] = 0 53 | print ('Computing user similarities...') 54 | for ind,u1 in enumerate(userListen): 55 | set1 = set(userListen[u1].keys()) 56 | for u2 in userListen: 57 | if u1 != u2: 58 | if self.userSim.contains(u1,u2): 59 | continue 60 | set2 = set(userListen[u2].keys()) 61 | sim = self.jaccard(set1,set2) 62 | self.userSim.set(u1,u2,sim) 63 | self.topUsers[u1] = sorted(self.userSim[u1].items(), key=lambda d: d[1], reverse=True)[:self.neighbors] 64 | if ind%100==0: 65 | print (ind,'/',len(userListen), 'finished.') 66 | print ('The user correlation has been figured out.') 67 | 68 | def jaccard(self,s1,s2): 69 | return 2*len(s1.intersection(s2))/(len(s1.union(s2))+0.0) 70 | 71 | -------------------------------------------------------------------------------- /tool/qmath.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics.pairwise import pairwise_distances,cosine_similarity 2 | import numpy as np 3 | from numpy.linalg import norm 4 | from scipy.stats.stats import pearsonr 5 | from math import sqrt,exp 6 | 7 | def l1(x): 8 | return norm(x,ord=1) 9 | 10 | def l2(x): 11 | return norm(x) 12 | 13 | def common(x1,x2): 14 | # find common ratings 15 | common = (x1!=0)&(x2!=0) 16 | new_x1 = x1[common] 17 | new_x2 = x2[common] 18 | return new_x1,new_x2 19 | 20 | def cosine_sp(x1,x2): 21 | 'x1,x2 are dicts,this version is for sparse representation' 22 | total = 0 23 | denom1 = 0 24 | denom2 =0 25 | for k in x1: 26 | if k in x2: 27 | total+=x1[k]*x2[k] 28 | denom1+=x1[k]**2 29 | denom2+=x2[k]**2 30 | try: 31 | return (total + 0.0) / (sqrt(denom1) * sqrt(denom2)) 32 | except ZeroDivisionError: 33 | return 0 34 | 35 | 36 | def cosine(x1,x2): 37 | #find common ratings 38 | #new_x1, new_x2 = common(x1,x2) 39 | #compute the cosine similarity between two vectors 40 | sum = x1.dot(x2) 41 | denom = sqrt(x1.dot(x1)*x2.dot(x2)) 42 | try: 43 | return float(sum)/denom 44 | except ZeroDivisionError: 45 | return 0 46 | 47 | #return cosine_similarity(x1,x2)[0][0] 48 | 49 | def pearson_sp(x1,x2): 50 | total = 0 51 | denom1 = 0 52 | denom2 = 0 53 | try: 54 | mean1 = sum(x1.values())/(len(x1)+0.0) 55 | mean2 = 
sum(x2.values()) / (len(x2) + 0.0) 56 | for k in x1: 57 | if k in x2: 58 | total += (x1[k]-mean1) * (x2[k]-mean2) 59 | denom1 += (x1[k]-mean1) ** 2 60 | denom2 += (x2[k]-mean2) ** 2 61 | 62 | return (total + 0.0) / (sqrt(denom1) * sqrt(denom2)) 63 | except ZeroDivisionError: 64 | return 0 65 | 66 | def euclidean(x1,x2): 67 | #find common ratings 68 | new_x1, new_x2 = common(x1, x2) 69 | #compute the euclidean between two vectors 70 | diff = new_x1-new_x2 71 | denom = sqrt((diff.dot(diff))) 72 | try: 73 | return 1/denom 74 | except ZeroDivisionError: 75 | return 0 76 | 77 | 78 | def pearson(x1,x2): 79 | #find common ratings 80 | #new_x1, new_x2 = common(x1, x2) 81 | #compute the pearson similarity between two vectors 82 | #ind1 = new_x1 > 0 83 | #ind2 = new_x2 > 0 84 | try: 85 | mean_x1 = float(x1.sum())/len(x1) 86 | mean_x2 = float(x2.sum())/len(x2) 87 | new_x1 = x1 - mean_x1 88 | new_x2 = x2 - mean_x2 89 | sum = new_x1.dot(new_x2) 90 | denom = sqrt((new_x1.dot(new_x1))*(new_x2.dot(new_x2))) 91 | return float(sum) / denom 92 | except ZeroDivisionError: 93 | return 0 94 | 95 | 96 | def similarity(x1,x2,sim): 97 | if sim == 'pcc': 98 | return pearson_sp(x1,x2) 99 | if sim == 'euclidean': 100 | return euclidean(x1,x2) 101 | else: 102 | return cosine_sp(x1, x2) 103 | 104 | 105 | def normalize(vec,maxVal,minVal): 106 | 'get the normalized value using min-max normalization' 107 | if maxVal > minVal: 108 | return float(vec-minVal)/(maxVal-minVal)+0.01 109 | elif maxVal==minVal: 110 | return vec/maxVal 111 | else: 112 | print ('error... maximum value is less than minimum value.') 113 | raise ArithmeticError 114 | 115 | def sigmoid(val): 116 | return 1/(1+exp(-val)) 117 | 118 | 119 | def denormalize(vec,maxVal,minVal): 120 | return minVal+(vec-0.01)*(maxVal-minVal) 121 | -------------------------------------------------------------------------------- /recommender/cf/WRMF.py: -------------------------------------------------------------------------------- 1 | #coding:utf8 2 | from base.IterativeRecommender import IterativeRecommender 3 | import math 4 | import numpy as np 5 | from tool import qmath 6 | from random import choice 7 | from tool.qmath import sigmoid 8 | from math import log 9 | from collections import defaultdict 10 | from scipy.sparse import * 11 | from scipy import * 12 | 13 | class WRMF(IterativeRecommender): 14 | def __init__(self,conf,trainingSet=None,testSet=None,fold='[1]'): 15 | super(WRMF, self).__init__(conf,trainingSet,testSet,fold) 16 | 17 | def initModel(self): 18 | super(WRMF, self).initModel() 19 | self.X=self.P*10 20 | self.Y=self.Q*10 21 | self.m = self.data.getSize('user') 22 | self.n = self.data.getSize(self.recType) 23 | 24 | def buildModel(self): 25 | userListen = defaultdict(dict) 26 | for user in self.data.userRecord: 27 | for item in self.data.userRecord[user]: 28 | if item[self.recType] not in userListen[user]: 29 | userListen[user][item[self.recType]] = 0 30 | userListen[user][item[self.recType]] += 1 31 | print ('training...') 32 | iteration = 0 33 | while iteration < self.maxIter: 34 | self.loss = 0 35 | YtY = self.Y.T.dot(self.Y) 36 | I = np.ones(self.n) 37 | for user in self.data.name2id['user']: 38 | #C_u = np.ones(self.data.getSize(self.recType)) 39 | H = np.ones(self.n) 40 | val = [] 41 | pos = [] 42 | P_u = np.zeros(self.n) 43 | uid = self.data.getId(user,'user') 44 | for item in userListen[user]: 45 | iid = self.data.getId(item,self.recType) 46 | r_ui = userListen[user][item] 47 | pos.append(iid) 48 | val.append(10*r_ui) 49 | H[iid]+=10*r_ui 50 | 
P_u[iid]=1 51 | error = (P_u[iid]-self.X[uid].dot(self.Y[iid])) 52 | self.loss+=pow(error,2) 53 | #sparse matrix 54 | C_u = coo_matrix((val,(pos,pos)),shape=(self.n,self.n)) 55 | A = (YtY+np.dot(self.Y.T,C_u.dot(self.Y))+self.regU*np.eye(self.k)) 56 | self.X[uid] = np.dot(np.linalg.inv(A),(self.Y.T*H).dot(P_u)) 57 | 58 | 59 | XtX = self.X.T.dot(self.X) 60 | I = np.ones(self.m) 61 | for item in self.data.name2id[self.recType]: 62 | P_i = np.zeros(self.m) 63 | iid = self.data.getId(item, self.recType) 64 | H = np.ones(self.m) 65 | val = [] 66 | pos = [] 67 | for user in self.data.listened[self.recType][item]: 68 | uid = self.data.getId(user, 'user') 69 | r_ui = self.data.listened[self.recType][item][user] 70 | pos.append(uid) 71 | val.append(10*r_ui) 72 | H[uid] += 10*r_ui 73 | P_i[uid] = 1 74 | # sparse matrix 75 | C_i = coo_matrix((val, (pos, pos)),shape=(self.m,self.m)) 76 | A = (XtX+np.dot(self.X.T,C_i.dot(self.X))+self.regU*np.eye(self.k)) 77 | self.Y[iid]=np.dot(np.linalg.inv(A), (self.X.T*H).dot(P_i)) 78 | 79 | #self.loss += (self.X * self.X).sum() + (self.Y * self.Y).sum() 80 | iteration += 1 81 | print ('iteration:',iteration,'loss:',self.loss) 82 | # if self.isConverged(iteration): 83 | # break 84 | 85 | def predict(self, u): 86 | 'invoked to rank all the items for the user' 87 | u = self.data.getId(u,'user') 88 | return self.Y.dot(self.X[u]) 89 | -------------------------------------------------------------------------------- /evaluation/measure.py: -------------------------------------------------------------------------------- 1 | import math 2 | class Measure(object): 3 | def __init__(self): 4 | pass 5 | 6 | @staticmethod 7 | def hits(origin,res): 8 | hitCount = {} 9 | for user in origin: 10 | items = origin[user].keys() 11 | predicted = [item for item in res[user]] 12 | hitCount[user] = len(set(items).intersection(set(predicted))) 13 | return hitCount 14 | 15 | @staticmethod 16 | def rankingMeasure(origin,res,N,itemCount): 17 | print('rank measure...') 18 | measure = [] 19 | for n in N: 20 | predicted = {} 21 | for user in res: 22 | predicted[user] = res[user][:n] 23 | indicators = [] 24 | if len(origin)!= len(predicted): 25 | print('The Lengths of test set and predicted set are not match!') 26 | exit(-1) 27 | hits = Measure.hits(origin,predicted) 28 | prec = Measure.precision(hits,n) 29 | indicators.append('Precision:' + str(prec)+'\n') 30 | recall = Measure.recall(hits,origin) 31 | indicators.append('Recall:' + str(recall)+'\n') 32 | F1 = Measure.F1(prec,recall) 33 | indicators.append('F1:' + str(F1) + '\n') 34 | MAP = Measure.MAP(origin,predicted,n) 35 | indicators.append('MAP:' + str(MAP) + '\n') 36 | #AUC = Measure.AUC(origin,res,rawRes) 37 | #measure.append('AUC:' + str(AUC) + '\n') 38 | indicators.append('Coverage:'+str(Measure.coverage(predicted,itemCount))+'\n') 39 | measure.append('Top '+str(n)+'\n') 40 | measure+=indicators 41 | return measure 42 | @staticmethod 43 | def coverage(res,itemCount): 44 | recommendations = set() 45 | for user in res: 46 | for item in res[user]: 47 | recommendations.add(item) 48 | return len(recommendations)/float(itemCount) 49 | 50 | @staticmethod 51 | def precision(hits,N): 52 | prec = sum([hits[user] for user in hits]) 53 | return float(prec)/(len(hits)*N) 54 | 55 | @staticmethod 56 | def MAP(origin, res, N): 57 | sum_prec = 0 58 | for user in res: 59 | hits = 0 60 | precision = 0 61 | for n, item in enumerate(res[user]): 62 | if item in origin[user]: 63 | hits += 1 64 | precision += hits / (n + 1.0) 65 | sum_prec += precision / 
(min(len(origin[user]), N) + 0.0) 66 | return sum_prec / (len(res)) 67 | 68 | @staticmethod 69 | def AUC(origin,res,rawRes): 70 | 71 | from random import choice 72 | sum_AUC = 0 73 | for user in origin: 74 | count = 0 75 | larger = 0 76 | itemList = rawRes[user].keys() 77 | for item in origin[user]: 78 | item2 = choice(itemList) 79 | count+=1 80 | try: 81 | if rawRes[user][item]>rawRes[user][item2]: 82 | larger+=1 83 | except KeyError: 84 | count-=1 85 | if count: 86 | sum_AUC+=float(larger)/count 87 | 88 | return float(sum_AUC)/len(origin) 89 | 90 | @staticmethod 91 | def recall(hits,origin): 92 | recallList = [float(hits[user])/len(origin[user]) for user in hits] 93 | recall = sum(recallList)/float(len(recallList)) 94 | return recall 95 | 96 | @staticmethod 97 | def F1(prec,recall): 98 | if (prec+recall)!=0: 99 | return 2*prec*recall/(prec+recall) 100 | else: 101 | return 0 102 | -------------------------------------------------------------------------------- /Session-based RecSys Papers.md: -------------------------------------------------------------------------------- 1 | ## Must-read papers on RecSys-session-based. 2 | 3 | Contributed by [Junwei Zhang](https://github.com/0411tony). 4 | 5 | We release [Yue](https://github.com/0411tony/Yue), the open source toolkit for recommendation system on music dataset. This repository provides a standard music recommend training and testing frameworks. And we will implement some session-based models. Currently, the implemented models in Yue include Song2vec, BPR..... 6 | 7 | ### Survey papers: 8 | 9 | 10 | ### Journal and Conference papers: 11 | 12 | 1. **Item-based collaborative filtering recommendation algorithms.** 13 | *Sarwar, Badrul, et al.* ACM World Wide Web 2001. [paper](http://delivery.acm.org/10.1145/380000/372071/p285-sarwar.pdf?ip=218.70.255.152&id=372071&acc=ACTIVE%20SERVICE&key=BF85BBA5741FDC6E%2E4C4B77574FA1A7CF%2E4D4702B0C3E38B35%2E4D4702B0C3E38B35&__acm__=1546431542_d02487f30ebaed9ffe0e29e6070bb591) 14 | 15 | 1. **Amazon.com Recommendations: Item-to-Item Collaborative Filtering.** 16 | *Linden G, Smith B, York J.* IEEE Internet Computing 2003. [paper](https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=1167344) 17 | 18 | 1. **An MDP-Based Recommender System** 19 | *Shani G, Heckerman D, Brafman R I.* Journal of Machine Learning Research 2005. [paper](https://arxiv.org/ftp/arxiv/papers/1301/1301.0600.pdf) 20 | 21 | 1. **Factorizing personalized Markov chains for next-basket recommendation** 22 | *Rendle S, Freudenthaler C, Schmidt-Thieme L.* WWW 2010. [paper](http://delivery.acm.org/10.1145/1780000/1772773/p811-rendle.pdf?ip=218.70.255.152&id=1772773&acc=ACTIVE%20SERVICE&key=BF85BBA5741FDC6E%2E4C4B77574FA1A7CF%2E4D4702B0C3E38B35%2E4D4702B0C3E38B35&__acm__=1546431741_3fdbc1a74b222bc0ad4fd104cba5c6ea) 23 | 24 | 1. **Session-based Recommendations with Recurrent Neural Networks.** 25 | *Hidasi, Balázs, Karatzoglou A, Baltrunas L, et al.* Computer Science 2015. [paper](https://arxiv.org/pdf/1511.06939.pdf) 26 | 27 | 1. **Improved Recurrent Neural Networks for Session-based Recommendations** 28 | *Tan Y K, Xu X, Liu Y.* CoRR 2016. [paper](https://arxiv.org/pdf/1606.08117.pdf) 29 | 30 | 1. 
**Parallel Recurrent Neural Network Architectures for Feature-rich Session-based Recommendations** 31 | *Balázs Hidasi, Quadrana M, Karatzoglou A, et al.* RecSys 2016 [paper](http://delivery.acm.org/10.1145/2960000/2959167/p241-hidasi.pdf?ip=218.70.255.152&id=2959167&acc=ACTIVE%20SERVICE&key=BF85BBA5741FDC6E%2E4C4B77574FA1A7CF%2E4D4702B0C3E38B35%2E4D4702B0C3E38B35&__acm__=1546432069_1cfe1cb45fec758fe7be509c823032df) 32 | 33 | 1. **Fusing Similarity Models with Markov Chains for Sparse Sequential Recommendation** 34 | *He R, Mcauley J.* CoRR 2016. [paper](http://arxiv.org/pdf/1609.09152) 35 | 36 | 1. **When Recurrent Neural Networks meet the Neighborhood for Session-Based Recommendation** 37 | *Jannach D, Ludewig M.* RecSys 2017. [paper](http://delivery.acm.org/10.1145/3110000/3109872/p306-jannach.pdf?ip=218.70.255.152&id=3109872&acc=ACTIVE%20SERVICE&key=BF85BBA5741FDC6E%2E4C4B77574FA1A7CF%2E4D4702B0C3E38B35%2E4D4702B0C3E38B35&__acm__=1546432012_ba48759e70cbbf85d79db91f20d4a8d4) 38 | 39 | 1. **Neural Attentive Session-based Recommendation** 40 | *Li J, Ren P, Chen Z, et al.* CIKM 2017. [paper](https://arxiv.org/pdf/1711.04725v1.pdf) 41 | 42 | 1. **Next Item Recommendation with Self-Attention** 43 | *Zhang S, Tay Y, Yao L, et al.* CoRR 2018. [paper](https://arxiv.org/pdf/1808.06414v2.pdf) 44 | 45 | 1. **Session-based Recommendation with Graph Neural Networks.** 46 | *Wu S, Tang Y, Zhu Y, et al.* AAAI 2018. [paper](https://arxiv.org/pdf/1811.00855.pdf) 47 | 48 | 1. **STAMP: Short-Term Attention/Memory Priority Model for Session-based Recommendation** 49 | *Liu Q, Zeng Y, Mokhosi R, et al.* KDD 2018. [paper](http://delivery.acm.org/10.1145/3220000/3219950/p1831-liu.pdf?ip=218.70.255.152&id=3219950&acc=ACTIVE%20SERVICE&key=BF85BBA5741FDC6E%2E4C4B77574FA1A7CF%2E4D4702B0C3E38B35%2E4D4702B0C3E38B35&__acm__=1546431853_60d5f08803e9946f72653ae2d3a624d7) 50 | 51 | 52 | ### Preprints 53 | 54 | 55 | -------------------------------------------------------------------------------- /recommender/cf/FISM.py: -------------------------------------------------------------------------------- 1 | #coding:utf8 2 | from base.IterativeRecommender import IterativeRecommender 3 | import math 4 | import numpy as np 5 | from tool.config import LineConfig 6 | from random import choice 7 | from tool.qmath import sigmoid 8 | from math import log 9 | from collections import defaultdict 10 | class FISM(IterativeRecommender): 11 | 12 | def __init__(self,conf,trainingSet=None,testSet=None,fold='[1]'): 13 | super(FISM, self).__init__(conf,trainingSet,testSet,fold) 14 | 15 | def initModel(self): 16 | super(FISM, self).initModel() 17 | self.Bi = np.random.rand(len(self.data.id2name[self.recType]))/100 18 | self.P = np.random.rand(len(self.data.id2name[self.recType]),self.k)/100 19 | 20 | def readConfiguration(self): 21 | super(FISM, self).readConfiguration() 22 | self.rho = int(LineConfig(self.config['FISM'])['-rho']) 23 | if self.rho<1: 24 | self.rho=1 25 | self.alpha = float(LineConfig(self.config['FISM'])['-alpha']) 26 | 27 | def buildModel(self): 28 | userListened = defaultdict(dict) 29 | for user in self.data.userRecord: 30 | for item in self.data.userRecord[user]: 31 | userListened[user][item[self.recType]] = 1 32 | 33 | print ('training...') 34 | iteration = 0 35 | itemList = list(self.data.name2id[self.recType].keys()) 36 | while iteration < self.maxIter: 37 | self.loss = 0 38 | for user in self.data.userRecord: 39 | nu = len(self.data.userRecord[user]) 40 | if nu==1: 41 | continue 42 | coef = pow(nu - 1, -self.alpha) 43 
| sum_Pj = np.zeros(self.k) 44 | for item in self.data.userRecord[user]: 45 | j = self.data.getId(item[self.recType], self.recType) 46 | sum_Pj += self.P[j] 47 | X = [] 48 | for item in self.data.userRecord[user]: 49 | x = np.zeros(self.k) 50 | i = self.data.getId(item[self.recType],self.recType) 51 | for count in range(self.rho): 52 | item_j = choice(itemList) 53 | while (item_j in userListened[user]): 54 | item_j = choice(itemList) 55 | j = self.data.getId(item_j,self.recType) 56 | r_ui=coef*(sum_Pj-self.P[i]).dot(self.Q[i])+self.Bi[i] 57 | r_uj=coef*(sum_Pj-self.P[j]).dot(self.Q[j])+self.Bi[j] 58 | error = 1-(r_ui-r_uj) 59 | self.loss += 0.5*error**2 60 | #update 61 | self.Bi[i]+=self.lRate*(error-self.regB*self.Bi[i]) 62 | self.Bi[j]-=self.lRate*(error+self.regB*self.Bi[j]) 63 | self.Q[i]+=self.lRate*(error*coef*(sum_Pj-self.P[i])-self.regI*self.Q[i]) 64 | self.Q[j]-=self.lRate*(error*coef*(sum_Pj-self.P[j])+self.regI*self.Q[j]) 65 | x+=error*(self.Q[i]-self.Q[j]) 66 | X.append(x) 67 | 68 | for ind,item in enumerate(self.data.userRecord[user]): 69 | j = self.data.getId(item[self.recType], self.recType) 70 | self.P[j]+=self.lRate*(1/float(self.rho)*coef*X[ind]-self.regI*self.P[j]) 71 | 72 | self.loss += self.regU*(self.P*self.P).sum() + self.regI*(self.Q*self.Q).sum() + self.regB*(self.Bi.dot(self.Bi)) 73 | iteration += 1 74 | if self.isConverged(iteration): 75 | break 76 | 77 | def predict(self, user): 78 | 'invoked to rank all the items for the user' 79 | #a trick for quick matrix computation 80 | sum_Pj = np.zeros(self.k) 81 | for item in self.data.userRecord[user]: 82 | j = self.data.getId(item[self.recType], self.recType) 83 | sum_Pj += self.P[j] 84 | return self.Bi+self.Q.dot(sum_Pj)-(self.P*self.Q).sum(axis=1) 85 | -------------------------------------------------------------------------------- /recommender/cf/IPF.py: -------------------------------------------------------------------------------- 1 | #coding:utf8 2 | from base.recommender import Recommender 3 | from tool.config import LineConfig 4 | from random import choice 5 | from collections import defaultdict 6 | class IPF(Recommender): 7 | def __init__(self,conf,trainingSet=None,testSet=None,fold='[1]'): 8 | super(IPF, self).__init__(conf,trainingSet,testSet,fold) 9 | 10 | def initModel(self): 11 | super(IPF, self).initModel() 12 | print ('initializing STG...') 13 | userListened = defaultdict(list) 14 | self.sessionNodes = {} 15 | for user in self.data.userRecord: 16 | for item in self.data.userRecord[user]: 17 | userListened[user].append(item[self.recType]) 18 | for user in userListened: 19 | t = max(0, len(userListened[user]) - 10) 20 | self.sessionNodes[user] = userListened[user][t:] 21 | self.STG = {} 22 | self.STG['user'] = userListened 23 | self.STG['session'] = self.sessionNodes 24 | 25 | item2session=defaultdict(list) 26 | for user in self.sessionNodes: 27 | for item in self.sessionNodes[user]: 28 | item2session[item].append(user) 29 | item2user = {} 30 | for item in self.data.listened[self.recType]: 31 | item2user[item]=self.data.listened[self.recType][item].keys() 32 | 33 | 34 | self.STG['item2user'] = item2user 35 | self.STG['item2session'] = item2session 36 | self.path = [['user','item2user','user','item'], 37 | ['user','item2session','session','item'], 38 | ['session','item2user','user','item'], 39 | ['session','item2session','session','item']] 40 | 41 | 42 | def readConfiguration(self): 43 | super(IPF, self).readConfiguration() 44 | self.rho = float(LineConfig(self.config['IPF'])['-rho']) 45 | if self.rho<0 
or self.rho>1: 46 | self.rho=0.5 47 | self.beta = float(LineConfig(self.config['IPF'])['-beta']) 48 | self.eta = float(LineConfig(self.config['IPF'])['-eta']) 49 | 50 | def probability(self,v1,v2): 51 | if (v1[0]=='user' and v2[0]=='item2user') or (v1[0]=='session' and v2[0]=='item2session') or \ 52 | (v1[0] == 'user' and v2[0] == 'item2session') or (v1[0]=='session'and v2[0] =='item2user') or \ 53 | (v1[0] == 'user' and v2[0] == 'item') or (v1[0] == 'session' and v2[0] == 'item'): 54 | return 1.0/pow(len(self.STG[v1[0]][v1[1]]),self.rho) 55 | elif v1[0]=='item2user' and v2[0]=='user': 56 | return pow(self.eta/(self.eta*len(self.STG['item2user'][v1[1]])+ 57 | len(self.STG['item2session'][v1[1]])),self.rho) 58 | elif v1[0]=='item2session' and v2[0]=='session': 59 | return pow(1/(self.eta*len(self.STG['item2user'][v1[1]])+ 60 | len(self.STG['item2session'][v1[1]])),self.rho) 61 | 62 | 63 | def predict(self, user): 64 | rank = {} 65 | for p in self.path: 66 | visited = {} 67 | queue = [] 68 | queue.append((p[0],user)) 69 | distance = {} 70 | distance[p[0]+user]=0 71 | if p[0]=='user': 72 | rank[p[0]+'_'+user]=self.beta 73 | else: 74 | rank[p[0]+'_'+user] = 1-self.beta 75 | while len(queue)>0: 76 | vType,v = queue.pop() 77 | if (vType+v) in visited and visited[vType+v]==1: 78 | continue 79 | visited[vType+v]=1 80 | if vType=='item': 81 | continue 82 | for nextNode in self.STG[p[distance[vType+v]]][v]: 83 | nextType = p[distance[vType+v]+1] 84 | if (nextType+nextNode) not in visited: 85 | distance[nextType+nextNode]=distance[vType+v]+1 86 | queue.append((nextType,nextNode)) 87 | visited[nextType+nextNode]=0 88 | else: 89 | continue 90 | if distance[vType+v]< distance[p[distance[vType+v]+1]+nextNode]: 91 | if (nextType+'_'+nextNode) not in rank: 92 | rank[nextType+'_'+nextNode]=0 93 | rank[nextType+'_'+nextNode]+=rank[vType+'_'+v]*self.probability((vType,v),(nextType,nextNode)) 94 | recommendedList = [(key[5:],value) for key,value in rank.items() if key[0:5]=='item_'] 95 | recommendedList = sorted(recommendedList,key=lambda d:d[1],reverse=True) 96 | recommendedList = [item[0] for item in recommendedList] 97 | #print ('user',user,'finished') 98 | return recommendedList 99 | -------------------------------------------------------------------------------- /recommender/advanced/LightGCN.py: -------------------------------------------------------------------------------- 1 | from base.DeepRecommender import DeepRecommender 2 | import tensorflow as tf 3 | from math import sqrt 4 | from tensorflow import set_random_seed 5 | from collections import defaultdict 6 | import random 7 | 8 | set_random_seed(2) 9 | 10 | class LightGCN(DeepRecommender): 11 | 12 | def __init__(self,conf,trainingSet=None,testSet=None,fold='[1]'): 13 | super(LightGCN, self).__init__(conf,trainingSet,testSet,fold) 14 | 15 | def initModel(self): 16 | super(LightGCN, self).initModel() 17 | 18 | self.negativeCount = 5 19 | 20 | self.userListen = defaultdict(dict) 21 | for entry in self.data.trainingData: 22 | if entry['track'] not in self.userListen[entry['user']]: 23 | self.userListen[entry['user']][entry['track']] = 0 24 | self.userListen[entry['user']][entry['track']] += 1 25 | print('training...') 26 | 27 | ego_embeddings = tf.concat([self.U, self.V], axis=0) 28 | 29 | indices = [[self.data.getId(item['user'], 'user'), self.m + self.data.getId(item['track'], 'track')] for item in self.data.trainingData] 30 | indices += [[self.m + self.data.getId(item['track'], 'track'), self.data.getId(item['user'], 'user')] for item in 
self.data.trainingData] 31 | # values = [float(self.userListen[item['user']][item['track']]) / sqrt(len(self.data.userRecord[item['user']])) / sqrt(len(self.data.trackRecord[item['track']])) for item in self.data.trainingData]*2 32 | values = [float(self.userListen[item['user']][item['track']]) for item in self.data.trainingData]*2 33 | 34 | norm_adj = tf.SparseTensor(indices=indices, values=values, dense_shape=[self.m+self.n, self.m+self.n]) 35 | 36 | self.n_layers = 3 37 | 38 | all_embeddings = [ego_embeddings] 39 | for k in range(self.n_layers): 40 | ego_embeddings = tf.sparse_tensor_dense_matmul(norm_adj, ego_embeddings) 41 | # normalize the distribution of embeddings. 42 | norm_embeddings = tf.nn.l2_normalize(ego_embeddings, axis=1) 43 | all_embeddings += [norm_embeddings] 44 | 45 | all_embeddings = tf.reduce_sum(all_embeddings, axis=0) 46 | 47 | self.multi_user_embeddings, self.multi_item_embeddings = tf.split(all_embeddings, [self.m, self.n], 0) 48 | 49 | self.neg_idx = tf.placeholder(tf.int32, name="neg_holder") 50 | self.neg_item_embedding = tf.nn.embedding_lookup(self.multi_item_embeddings, self.neg_idx) 51 | self.u_embedding = tf.nn.embedding_lookup(self.multi_user_embeddings, self.u_idx) 52 | self.v_embedding = tf.nn.embedding_lookup(self.multi_item_embeddings, self.v_idx) 53 | 54 | self.test = tf.reduce_sum(tf.multiply(self.u_embedding, self.multi_item_embeddings), 1) 55 | 56 | def next_batch_pairwise(self): 57 | batch_id = 0 58 | while batch_id < self.train_size: 59 | if batch_id + self.batch_size <= self.train_size: 60 | users = [self.data.trainingData[idx]['user'] for idx in range(batch_id, self.batch_size + batch_id)] 61 | items = [self.data.trainingData[idx]['track'] for idx in range(batch_id, self.batch_size + batch_id)] 62 | batch_id += self.batch_size 63 | else: 64 | users = [self.data.trainingData[idx]['user'] for idx in range(batch_id, self.train_size)] 65 | items = [self.data.trainingData[idx]['track'] for idx in range(batch_id, self.train_size)] 66 | batch_id = self.train_size 67 | 68 | u_idx, i_idx, j_idx = [], [], [] 69 | 70 | for i, user in enumerate(users): 71 | for j in range(self.negativeCount): 72 | item_j = random.randint(0, self.n-1) 73 | while self.data.id2name['track'][item_j] in self.userListen[user]: 74 | item_j = random.randint(0, self.n-1) 75 | u_idx.append(self.data.getId(user, 'user')) 76 | i_idx.append(self.data.getId(items[i], 'track')) 77 | j_idx.append(item_j) 78 | 79 | yield u_idx, i_idx, j_idx 80 | 81 | 82 | def buildModel(self): 83 | y = tf.reduce_sum(tf.multiply(self.u_embedding, self.v_embedding), 1) \ 84 | - tf.reduce_sum(tf.multiply(self.u_embedding, self.neg_item_embedding), 1) 85 | 86 | loss = -tf.reduce_sum(tf.log(tf.sigmoid(y))) + self.regU * (tf.nn.l2_loss(self.u_embedding) + 87 | tf.nn.l2_loss(self.v_embedding) + 88 | tf.nn.l2_loss(self.neg_item_embedding)) 89 | opt = tf.train.AdamOptimizer(self.lRate) 90 | 91 | train = opt.minimize(loss) 92 | 93 | init = tf.global_variables_initializer() 94 | self.sess.run(init) 95 | for iteration in range(self.maxIter): 96 | for n, batch in enumerate(self.next_batch_pairwise()): 97 | user_idx, i_idx, j_idx = batch 98 | _, l = self.sess.run([train, loss], 99 | feed_dict={self.u_idx: user_idx, self.neg_idx: j_idx, self.v_idx: i_idx}) 100 | print('training:', iteration + 1, 'batch', n, 'loss:', l) 101 | 102 | def predict(self, u): 103 | 'invoked to rank all the items for the user' 104 | if self.data.contains(u, 'user'): 105 | uid = self.data.name2id['user'][u] 106 | return self.sess.run(self.test, 
feed_dict={self.u_idx: [uid]}) 107 | else: 108 | uid = self.data.getId(u,'user') 109 | return np.divide(self.V.dot(self.U[uid]), self.normalized_U[uid]*self.normalized_V) 110 | -------------------------------------------------------------------------------- /yue.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from re import split 3 | from tool.config import Config,LineConfig 4 | from tool.file import FileIO 5 | from tool.dataSplit import * 6 | from multiprocessing import Process,Manager 7 | from time import strftime,localtime,time 8 | from json import loads 9 | import mkl 10 | class Yue(object): 11 | def __init__(self,config): 12 | self.trainingData = [] # training data 13 | self.testData = [] # testData 14 | self.measure = [] 15 | self.config = config 16 | setup = LineConfig(config['record.setup']) 17 | columns = {} 18 | labels = setup['-columns'].split(',') 19 | delim = '' 20 | if setup.contains('-delim'): 21 | delim=setup['-delim'] 22 | for col in labels: 23 | label = col.split(':') 24 | columns[label[0]] = int(label[1]) 25 | 26 | if self.config.contains('evaluation.setup'): 27 | self.evaluation = LineConfig(config['evaluation.setup']) 28 | binarized = False 29 | bottom = 0 30 | if self.evaluation.contains('-b'): 31 | binarized = True 32 | bottom = float(self.evaluation['-b']) 33 | if self.evaluation.contains('-testSet'): 34 | #specify testSet 35 | self.trainingData = FileIO.loadDataSet(config['record'],columns=columns,binarized=binarized,threshold=bottom,delim=delim) 36 | self.testData = FileIO.loadDataSet(self.evaluation['-testSet'],binarized=binarized,columns=columns,threshold=bottom,delim=delim) 37 | 38 | elif self.evaluation.contains('-ap'): 39 | #auto partition 40 | self.trainingData = FileIO.loadDataSet(config['record'],columns=columns,binarized=binarized,threshold=bottom,delim=delim) 41 | self.trainingData,self.testData = DataSplit.\ 42 | dataSplit(self.trainingData,test_ratio=float(self.evaluation['-ap'])) 43 | 44 | elif self.evaluation.contains('-byTime'): 45 | self.trainingData = FileIO.loadDataSet(config['record'], columns=columns, binarized=binarized,threshold=bottom, delim=delim) 46 | self.testData = [] 47 | 48 | elif self.evaluation.contains('-cv'): 49 | #cross validation 50 | self.trainingData = FileIO.loadDataSet(config['record'],columns=columns,binarized=binarized,threshold=bottom,delim=delim) 51 | #self.trainingData,self.testData = DataSplit.crossValidation(self.trainingData,int(self.evaluation['-cv'])) 52 | 53 | else: 54 | print ('Evaluation is not well configured!') 55 | exit(-1) 56 | 57 | print ('preprocessing...') 58 | 59 | def execute(self): 60 | #import the algorithm module 61 | try: 62 | importStr = 'from recommender.baseline.' + self.config['recommender'] + ' import ' + self.config['recommender'] 63 | exec (importStr) 64 | except ImportError: 65 | importStr = 'from recommender.cf.' + self.config['recommender'] + ' import ' + self.config['recommender'] 66 | try: 67 | exec (importStr) 68 | except ImportError: 69 | importStr = 'from recommender.advanced.' 
+ self.config['recommender'] + ' import ' + self.config['recommender'] 70 | exec (importStr) 71 | 72 | if self.evaluation.contains('-cv'): 73 | k = int(self.evaluation['-cv']) 74 | if k <= 1 or k > 10: 75 | k = 3 76 | mkl.set_num_threads(max(1,mkl.get_max_threads()/k)) 77 | #create the manager used to communication in multiprocess 78 | manager = Manager() 79 | m = manager.dict() 80 | i = 1 81 | tasks = [] 82 | 83 | binarized = False 84 | if self.evaluation.contains('-b'): 85 | binarized = True 86 | 87 | for train,test in DataSplit.crossValidation(self.trainingData,k): 88 | fold = '['+str(i)+']' 89 | # if self.config.contains('social'): 90 | # recommender = self.config['recommender'] + "(self.config,train,test,self.relation,fold)" 91 | # else: 92 | recommender = self.config['recommender']+ "(self.config,train,test,fold)" 93 | #create the process 94 | p = Process(target=run,args=(m,eval(recommender),i)) 95 | tasks.append(p) 96 | i+=1 97 | #start the processes 98 | for p in tasks: 99 | p.start() 100 | if not self.evaluation.contains('-p'): 101 | p.join() 102 | #wait until all processes are completed 103 | if self.evaluation.contains('-p'): 104 | for p in tasks: 105 | p.join() 106 | #compute the mean error of k-fold cross validation 107 | self.measure = [dict(m)[i] for i in range(1,k+1)] 108 | res = [] 109 | for i in range(len(self.measure[0])): 110 | if self.measure[0][i][:3]=='Top': 111 | res.append(self.measure[0][i]) 112 | continue 113 | measure = self.measure[0][i].split(':')[0] 114 | total = 0 115 | for j in range(k): 116 | total += float(self.measure[j][i].split(':')[1]) 117 | res.append(measure+':'+str(total/k)+'\n') 118 | #output result 119 | currentTime = strftime("%Y-%m-%d %H-%M-%S", localtime(time())) 120 | outDir = LineConfig(self.config['output.setup'])['-dir'] 121 | fileName = self.config['recommender'] +'@'+currentTime+'-'+str(k)+'-fold-cv' + '.txt' 122 | FileIO.writeFile(outDir,fileName,res) 123 | print ('The result of %d-fold cross validation:\n%s' %(k,''.join(res))) 124 | 125 | 126 | else: 127 | # if self.config.contains('social'): 128 | # recommender = self.config['recommender']+'(self.config,self.trainingData,self.testData,self.relation)' 129 | # else: 130 | recommender = self.config['recommender'] + '(self.config,self.trainingData,self.testData)' 131 | eval(recommender).execute() 132 | 133 | 134 | def run(measure,algor,order): 135 | measure[order] = algor.execute() 136 | -------------------------------------------------------------------------------- /recommender/cf/BPR.py: -------------------------------------------------------------------------------- 1 | #coding:utf8 2 | from base.IterativeRecommender import IterativeRecommender 3 | import math 4 | import numpy as np 5 | from tool import qmath 6 | from random import choice 7 | from tool.qmath import sigmoid 8 | from math import log 9 | from collections import defaultdict 10 | import tensorflow as tf 11 | from tensorflow import set_random_seed 12 | import random 13 | 14 | set_random_seed(2) 15 | 16 | class BPR(IterativeRecommender): 17 | 18 | # BPR:Bayesian Personalized Ranking from Implicit Feedback 19 | # Steffen Rendle,Christoph Freudenthaler,Zeno Gantner and Lars Schmidt-Thieme 20 | 21 | def __init__(self,conf,trainingSet=None,testSet=None,fold='[1]'): 22 | super(BPR, self).__init__(conf,trainingSet,testSet,fold) 23 | 24 | def initModel(self): 25 | super(BPR, self).initModel() 26 | self.m = self.data.getSize('user') 27 | self.n = self.data.getSize(self.recType) 28 | self.train_size = len(self.data.trainingData) 29 | 
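    # The triple-quoted block below is an earlier pure-NumPy SGD implementation of BPR,
    # apparently kept for reference only; the TensorFlow buildModel further down is the
    # one that actually runs. Both optimize the same pairwise objective, roughly
    #     maximize  sum over (u, i, j) of  log sigmoid(P_u . Q_i - P_u . Q_j)
    #               - reg * (||P_u||^2 + ||Q_i||^2 + ||Q_j||^2)
    # where i is a track user u has listened to and j is a randomly sampled negative.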
30 | ''' 31 | def buildModel(self): 32 | userListen = defaultdict(dict) 33 | for user in self.data.userRecord: 34 | for item in self.data.userRecord[user]: 35 | userListen[user][item[self.recType]] = 1 36 | 37 | print ('training...') 38 | iteration = 0 39 | itemList = list(self.data.name2id[self.recType].keys()) 40 | while iteration < self.maxIter: 41 | self.loss = 0 42 | for user in self.data.userRecord: 43 | u = self.data.getId(user,'user') 44 | for item in self.data.userRecord[user]: 45 | i = self.data.getId(item[self.recType],self.recType) 46 | item_j = choice(itemList) 47 | while (item_j in userListen[user]): 48 | item_j = choice(itemList) 49 | j = self.data.getId(item_j,self.recType) 50 | s = sigmoid(self.P[u].dot(self.Q[i]) - self.P[u].dot(self.Q[j])) 51 | self.P[u] += self.lRate * (1 - s) * (self.Q[i] - self.Q[j]) 52 | self.Q[i] += self.lRate * (1 - s) * self.P[u] 53 | self.Q[j] -= self.lRate * (1 - s) * self.P[u] 54 | 55 | self.P[u] -= self.lRate * self.regU * self.P[u] 56 | self.Q[i] -= self.lRate * self.regI * self.Q[i] 57 | self.Q[j] -= self.lRate * self.regI * self.Q[j] 58 | self.loss += -log(s) 59 | self.loss += self.regU * (self.P * self.P).sum() + self.regI * (self.Q * self.Q).sum() 60 | iteration += 1 61 | if self.isConverged(iteration): 62 | break 63 | ''' 64 | 65 | def next_batch(self): 66 | batch_idx = np.random.randint(len(self.data.trainingData), size=512) 67 | users = [self.data.trainingData[idx]['user'] for idx in batch_idx] 68 | items = [self.data.trainingData[idx]['track'] for idx in batch_idx] 69 | user_idx,item_idx=[],[] 70 | neg_item_idx = [] 71 | for i,user in enumerate(users): 72 | uid = self.data.getId(user, 'user') 73 | for j in range(100): #negative sampling 74 | item_j = random.randint(0, self.n- 1) 75 | while item_j in self.userListen[uid]: 76 | item_j = random.randint(0, self.n - 1) 77 | tid = self.data.getId(items[i], 'track') 78 | user_idx.append(uid) 79 | item_idx.append(tid) 80 | neg_item_idx.append(item_j) 81 | return user_idx, item_idx, neg_item_idx 82 | 83 | def buildModel(self): 84 | self.userListen = defaultdict(dict) 85 | for user in self.data.userRecord: 86 | uid = self.data.getId(user, 'user') 87 | for item in self.data.userRecord[user]: 88 | iid = self.data.getId(item[self.recType], 'track') 89 | if item[self.recType] not in self.userListen[user]: 90 | self.userListen[uid][iid] = 0 91 | self.userListen[uid][iid] += 1 92 | 93 | self.u_idx = tf.placeholder(tf.int32, [None], name="u_idx") 94 | self.v_idx = tf.placeholder(tf.int32, [None], name="v_idx") 95 | self.neg_idx = tf.placeholder(tf.int32, [None], name="n_idx") 96 | 97 | self.U = tf.Variable(tf.truncated_normal(shape=[self.m, self.k], stddev=0.005), name='U') 98 | self.V = tf.Variable(tf.truncated_normal(shape=[self.n, self.k], stddev=0.005), name='V') 99 | 100 | self.U_embed = tf.nn.embedding_lookup(self.U, self.u_idx) 101 | self.V_embed = tf.nn.embedding_lookup(self.V, self.v_idx) 102 | self.V_neg_embed = tf.nn.embedding_lookup(self.V, self.neg_idx) 103 | # 构造损失函数 设置优化器 104 | self.reg_lambda = tf.constant(self.regU, dtype=tf.float32) 105 | 106 | error = tf.subtract(tf.reduce_sum(tf.multiply(self.U_embed, self.V_embed), 1), tf.reduce_sum(tf.multiply(self.U_embed, self.V_neg_embed), 1)) 107 | self.loss = tf.reduce_sum(tf.nn.softplus(-error)) 108 | # 构造正则化项 完善损失函数 109 | self.reg_loss = tf.add(tf.multiply(self.reg_lambda, tf.nn.l2_loss(self.U_embed)), 110 | tf.multiply(self.reg_lambda, tf.nn.l2_loss(self.V_embed))) 111 | self.reg_loss = tf.add(tf.multiply(self.reg_lambda, 
tf.nn.l2_loss(self.V_neg_embed)), self.reg_loss) 112 | self.total_loss = tf.add(self.loss, self.reg_loss) 113 | 114 | self.optimizer = tf.train.AdamOptimizer(self.lRate) 115 | self.train = self.optimizer.minimize(self.total_loss) 116 | # 初始化会话 117 | with tf.Session() as sess: 118 | init = tf.global_variables_initializer() 119 | sess.run(init) 120 | # 迭代,传递变量 121 | for epoch in range(self.maxIter): 122 | # 按批优化 123 | user_idx, item_idx, neg_item_idx = self.next_batch() 124 | _,loss = sess.run([self.train,self.total_loss],feed_dict={self.u_idx: user_idx, self.v_idx: item_idx, self.neg_idx:neg_item_idx}) 125 | print ('iteration:', epoch, 'loss:',loss) 126 | # 输出训练完毕的矩阵 127 | self.P = sess.run(self.U) 128 | self.Q = sess.run(self.V) 129 | self.ranking_performance() 130 | 131 | def predict(self, u): 132 | 'invoked to rank all the items for the user' 133 | u = self.data.getId(u,'user') 134 | return self.Q.dot(self.P[u]) 135 | -------------------------------------------------------------------------------- /recommender/advanced/CDAE.py: -------------------------------------------------------------------------------- 1 | #coding:utf8 2 | 3 | from base.IterativeRecommender import IterativeRecommender 4 | import numpy as np 5 | from random import choice,random 6 | from tool import config 7 | import tensorflow as tf 8 | from collections import defaultdict 9 | from tensorflow import set_random_seed 10 | set_random_seed(2) 11 | 12 | class CDAE(IterativeRecommender): 13 | def __init__(self,conf,trainingSet=None,testSet=None,fold='[1]'): 14 | super(CDAE, self).__init__(conf,trainingSet,testSet,fold) 15 | 16 | def readConfiguration(self): 17 | super(CDAE, self).readConfiguration() 18 | eps = config.LineConfig(self.config['CDAE']) 19 | self.corruption_level = float(eps['-co']) 20 | self.n_hidden = int(eps['-nh']) 21 | self.batch_size = int(eps['-batch_size']) 22 | 23 | def initModel(self): 24 | super(CDAE, self).initModel() 25 | 26 | self.n_hidden = 128 27 | self.num_items = self.data.getSize(self.recType) 28 | self.num_users = self.data.getSize('user') 29 | 30 | self.negative_sp = 5 31 | initializer = tf.contrib.layers.xavier_initializer() 32 | self.X = tf.placeholder(tf.float32, [None, self.num_items]) 33 | self.mask_corruption = tf.placeholder(tf.float32, [None, self.num_items]) 34 | self.sample = tf.placeholder(tf.float32, [None, self.num_items]) 35 | 36 | self.U = tf.Variable(initializer([self.num_users, self.n_hidden])) 37 | self.u_idx = tf.placeholder(tf.int32, [None], name="u_idx") 38 | self.U_embed = tf.nn.embedding_lookup(self.U, self.u_idx) 39 | 40 | self.weights = { 41 | 'encoder': tf.Variable(tf.random_normal([self.num_items, self.n_hidden])), 42 | 'decoder': tf.Variable(tf.random_normal([self.n_hidden, self.num_items])), 43 | } 44 | 45 | self.biases = { 46 | 'encoder': tf.Variable(tf.random_normal([self.n_hidden])), 47 | 'decoder': tf.Variable(tf.random_normal([self.num_items])), 48 | } 49 | 50 | self.userListen = defaultdict(dict) 51 | for item in self.data.trainingData: 52 | uid = self.data.getId(item['user'], 'user') 53 | tid = self.data.getId(item['track'], 'track') 54 | if tid not in self.userListen[uid]: 55 | self.userListen[uid][tid] = 1 56 | else: 57 | self.userListen[uid][tid] += 1 58 | 59 | def encoder(self,x,v): 60 | layer = tf.nn.sigmoid(tf.matmul(x, self.weights['encoder'])+self.biases['encoder']+v) 61 | return layer 62 | 63 | def decoder(self,x): 64 | layer = tf.nn.sigmoid(tf.matmul(x, self.weights['decoder'])+self.biases['decoder']) 65 | return layer 66 | 67 | def row(self, u): 
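        # Builds the dense listen-count row for user u: a vector of length num_items
        # whose entry for track i is how many times u played it (0 for unplayed tracks).
        # next_batch feeds these rows to the denoising autoencoder as its input.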
68 | k = self.userListen[u].keys() 69 | v = self.userListen[u].values() 70 | vec = np.zeros(self.num_items) 71 | for pair in zip(k,v): 72 | iid = pair[0] 73 | vec[iid] = pair[1] 74 | return vec 75 | 76 | def next_batch(self): 77 | X = np.zeros((self.batch_size, self.num_items)) 78 | uids = [] 79 | sample = np.zeros((self.batch_size, self.num_items)) 80 | userList = list(self.data.name2id['user'].keys()) 81 | itemList = list(self.data.name2id['track'].keys()) 82 | for n in range(self.batch_size): 83 | user = choice(userList) 84 | uid = self.data.name2id['user'][user] 85 | uids.append(uid) 86 | vec = self.row(uid) 87 | ratedItems = self.userListen[uid].keys() 88 | values = self.userListen[uid].values() 89 | for iid in ratedItems: 90 | sample[n][iid]=1 91 | for i in range(self.negative_sp*len(ratedItems)): 92 | ng = choice(itemList) 93 | while ng in self.data.userRecord[user]: 94 | ng = choice(itemList) 95 | ng_id = self.data.name2id['track'][ng] 96 | sample[n][ng_id]=1 97 | X[n]=vec 98 | return X, uids, sample 99 | 100 | def buildModel(self): 101 | self.corruption_input = tf.multiply(self.X, self.mask_corruption) 102 | self.encoder_op = self.encoder(self.corruption_input, self.U_embed) 103 | self.decoder_op = self.decoder(self.encoder_op) 104 | 105 | self.y_pred = tf.multiply(self.sample, self.decoder_op) 106 | y_true = tf.multiply(self.sample, self.corruption_input) 107 | self.y_pred = tf.maximum(1e-6, self.y_pred) 108 | 109 | self.loss = -tf.multiply(y_true,tf.log(self.y_pred))-tf.multiply((1-y_true),tf.log(1-self.y_pred)) 110 | self.reg_loss = self.regU*(tf.nn.l2_loss(self.weights['encoder'])+tf.nn.l2_loss(self.weights['decoder'])+ 111 | tf.nn.l2_loss(self.biases['encoder'])+tf.nn.l2_loss(self.biases['decoder'])) 112 | 113 | self.reg_loss = self.reg_loss + self.regU*tf.nn.l2_loss(self.U_embed) 114 | self.loss = self.loss + self.reg_loss 115 | self.loss = tf.reduce_mean(self.loss) 116 | 117 | optimizer = tf.train.AdamOptimizer(self.lRate).minimize(self.loss) 118 | 119 | self.sess = tf.Session() 120 | init = tf.global_variables_initializer() 121 | self.sess.run(init) 122 | 123 | for epoch in range(self.maxIter): 124 | mask = np.random.binomial(1, self.corruption_level, (self.batch_size, self.num_items)) 125 | batch_xs,users,sample = self.next_batch() 126 | 127 | _, loss,y = self.sess.run([optimizer, self.loss, self.y_pred], feed_dict={self.X: batch_xs,self.mask_corruption:mask,self.u_idx:users,self.sample:sample}) 128 | 129 | print (self.foldInfo,"Epoch:", '%04d' % (epoch + 1),"loss=", "{:.9f}".format(loss)) 130 | # self.ranking_performance() 131 | print("Optimization Finished!") 132 | 133 | 134 | def predictForRanking(self, u): 135 | 'invoked to rank all the items for the user' 136 | if self.data.containsUser(u,'user'): 137 | vec = self.row(u).reshape((1,len(self.data.TrackRecord))) 138 | uid = [self.data.name2id['user'][u]] 139 | return self.sess.run(self.decoder_op, feed_dict={self.X:vec,self.v_idx:uid})[0] 140 | else: 141 | return [self.data.globalMean] * len(self.data.TrackRecord) 142 | -------------------------------------------------------------------------------- /recommender/advanced/ExpoMF.py: -------------------------------------------------------------------------------- 1 | from base.IterativeRecommender import IterativeRecommender 2 | from scipy.sparse import * 3 | from scipy import * 4 | import numpy as np 5 | from numpy import linalg as LA 6 | from joblib import Parallel, delayed 7 | from math import sqrt 8 | 9 | EPS = 1e-8 10 | # this algorithm refers to the following 
paper: 11 | # #########---- Modeling User Exposure in Recommendation ----############# 12 | 13 | class ExpoMF(IterativeRecommender): 14 | def __init__(self,conf,trainingSet=None,testSet=None,fold='[1]'): 15 | super(ExpoMF, self).__init__(conf,trainingSet,testSet,fold) 16 | 17 | def initModel(self): 18 | super(ExpoMF, self).initModel() 19 | self.lam_theta = 1e-5 20 | self.lam_beta = 1e-5 21 | self.lam_y = 1.0 22 | self.init_mu = 0.01 23 | self.a = 1.0 24 | self.b = 99.0 25 | self.init_std = 0.01 26 | self.theta = self.init_std * \ 27 | np.random.randn(self.m, self.k).astype(np.float32) 28 | self.beta = self.init_std * \ 29 | np.random.randn(self.n, self.k).astype(np.float32) 30 | self.mu = self.init_mu * np.ones(self.n, dtype=np.float32) 31 | self.n_jobs=1 32 | self.batch_size=300 33 | row,col,val = [],[],[] 34 | for user in self.data.userRecord: 35 | u = self.data.getId(user, 'user') 36 | for item in self.data.userRecord[user]: 37 | i = self.data.getId(item['track'], 'track') 38 | row.append(u) 39 | col.append(i) 40 | val.append(1) 41 | self.X = csr_matrix((np.array(val),(np.array(row),np.array(col))),(self.m,self.n)) 42 | 43 | def buildModel(self): 44 | print('training...') 45 | n_users = self.X.shape[0] 46 | XT = self.X.T.tocsr() # pre-compute this 47 | for i in range(self.maxIter): 48 | print('ITERATION #%d' % i) 49 | self._update_factors(self.X, XT) 50 | self._update_expo(self.X, n_users) 51 | 52 | 53 | def _update_factors(self, X, XT): 54 | '''Update user and item collaborative factors with ALS''' 55 | print('update factors...') 56 | self.theta = recompute_factors(self.beta, self.theta, X, 57 | self.lam_theta / self.lam_y, 58 | self.lam_y, 59 | self.mu, 60 | self.n_jobs, 61 | batch_size=self.batch_size) 62 | 63 | self.beta = recompute_factors(self.theta, self.beta, XT, 64 | self.lam_beta / self.lam_y, 65 | self.lam_y, 66 | self.mu, 67 | self.n_jobs, 68 | batch_size=self.batch_size) 69 | 70 | 71 | def _update_expo(self, X, n_users): 72 | '''Update exposure prior''' 73 | print('\tUpdating exposure prior...') 74 | 75 | start_idx = list(range(0, n_users, self.batch_size)) 76 | end_idx = start_idx[1:] + [n_users] 77 | 78 | A_sum = np.zeros_like(self.mu) 79 | for lo, hi in zip(start_idx, end_idx): 80 | A_sum += a_row_batch(X[lo:hi], self.theta[lo:hi], self.beta, 81 | self.lam_y, self.mu).sum(axis=0) 82 | print(self.mu) 83 | self.mu = (self.a + A_sum - 1) / (self.a + self.b + n_users - 2) 84 | 85 | 86 | def predictForRanking(self,u): 87 | 'invoked to rank all the items for the user' 88 | if self.data.contains(u,'user'): 89 | u = self.data.getId(u,'user') 90 | return self.beta.dot(self.theta[u]) 91 | else: 92 | return [self.data.globalMean] * len(self.listened['track']) 93 | 94 | # Utility functions # 95 | 96 | 97 | 98 | def get_row(Y, i): 99 | '''Given a scipy.sparse.csr_matrix Y, get the values and indices of the 100 | non-zero values in i_th row''' 101 | lo, hi = Y.indptr[i], Y.indptr[i + 1] 102 | return Y.data[lo:hi], Y.indices[lo:hi] 103 | 104 | def a_row_batch(Y_batch, theta_batch, beta, lam_y, mu): 105 | '''Compute the posterior of exposure latent variables A by batch''' 106 | pEX = sqrt(lam_y / 2 * np.pi) * \ 107 | np.exp(-lam_y * theta_batch.dot(beta.T) ** 2 / 2) 108 | #print pEX.shape,mu.shape 109 | A = (pEX + EPS) / (pEX + EPS + (1 - mu) / mu) 110 | A[Y_batch.nonzero()] = 1. 
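    # Items the user actually consumed (nonzero entries of Y_batch) were certainly
    # exposed, so their posterior exposure probabilities are clamped to 1 above.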
111 | return A 112 | 113 | def _solve(k, A_k, X, Y, f, lam, lam_y, mu): 114 | '''Update one single factor''' 115 | s_u, i_u = get_row(Y, k) 116 | a = np.dot(s_u * A_k[i_u], X[i_u]) 117 | B = X.T.dot(A_k[:, np.newaxis] * X) + lam * np.eye(f) 118 | return LA.solve(B, a) 119 | 120 | def _solve_batch(lo, hi, X, X_old_batch, Y, m, f, lam, lam_y, mu): 121 | '''Update factors by batch, will eventually call _solve() on each factor to 122 | keep the parallel process busy''' 123 | assert X_old_batch.shape[0] == hi - lo 124 | 125 | if mu.size == X.shape[0]: # update users 126 | A_batch = a_row_batch(Y[lo:hi], X_old_batch, X, lam_y, mu) 127 | else: # update items 128 | A_batch = a_row_batch(Y[lo:hi], X_old_batch, X, lam_y, mu[lo:hi, 129 | np.newaxis]) 130 | 131 | X_batch = np.empty_like(X_old_batch, dtype=X_old_batch.dtype) 132 | for ib, k in enumerate(range(lo, hi)): 133 | X_batch[ib] = _solve(k, A_batch[ib], X, Y, f, lam, lam_y, mu) 134 | return X_batch 135 | 136 | def recompute_factors(X, X_old, Y, lam, lam_y, mu, n_jobs, batch_size=100): 137 | '''Regress X to Y with exposure matrix (computed on-the-fly with X_old) and 138 | ridge term lam by embarrassingly parallelization. All the comments below 139 | are in the view of computing user factors''' 140 | m, n = Y.shape # m = number of users, n = number of items 141 | assert X.shape[0] == n 142 | assert X_old.shape[0] == m 143 | f = X.shape[1] # f = number of factors 144 | 145 | start_idx = list(range(0, m, batch_size)) 146 | end_idx = start_idx[1:] + [m] 147 | 148 | res = Parallel(n_jobs=n_jobs)(delayed(_solve_batch)( 149 | lo, hi, X, X_old[lo:hi], Y, m, f, lam, lam_y, mu) 150 | for lo, hi in zip(start_idx, end_idx)) 151 | 152 | X_new = np.vstack(res) 153 | return X_new -------------------------------------------------------------------------------- /recommender/advanced/RRN.py: -------------------------------------------------------------------------------- 1 | #coding:utf8 2 | from base.DeepRecommender import DeepRecommender 3 | import math 4 | import numpy as np 5 | from tool import config 6 | from collections import defaultdict 7 | from scipy.sparse import * 8 | from scipy import * 9 | import tensorflow as tf 10 | from random import shuffle 11 | 12 | np.random.seed(3) 13 | 14 | class RRN(DeepRecommender): 15 | 16 | # Recurrent Recommender Networks. WSDM 2017 17 | # Chao-Yuan Wu, Amr Ahmed, Alex Beutel, Alexander J. 
Smola, How Jing 18 | 19 | def __init__(self,conf,trainingSet=None,testSet=None,fold='[1]'): 20 | super(RRN, self).__init__(conf,trainingSet,testSet,fold) 21 | 22 | def initModel(self): 23 | super(RRN, self).initModel() 24 | 25 | self.negative_sp = 5 26 | self.n_step = 1 27 | self.userID = tf.placeholder(tf.int32, [None, 1], name='user_onehot') 28 | self.itemID = tf.placeholder(tf.int32, [None, 1], name='item_onehot') 29 | self.rating = tf.placeholder(tf.float32, [None,1], name="rating") 30 | self.dropout = tf.placeholder(tf.float32, name='dropout') 31 | 32 | self.U = np.zeros((self.m, self.k)) 33 | self.V = np.zeros((self.n, self.k)) 34 | 35 | self.userListen = defaultdict(dict) 36 | for user in self.data.userRecord: 37 | for item in self.data.userRecord[user]: 38 | if item[self.recType] not in self.userListen[user]: 39 | self.userListen[user][item[self.recType]] = 0 40 | self.userListen[user][item[self.recType]] += 1 41 | print ('training...') 42 | 43 | # def readConfiguration(self): 44 | # super(RRN, self).readConfiguration() 45 | # options = config.LineConfig(self.config['RRN']) 46 | # self.alpha = float(options['-alpha']) 47 | # self.topK = int(options['-k']) 48 | # self.negCount = int(options['-neg']) 49 | 50 | def buildModel(self): 51 | print ('the tensorflow...') 52 | with tf.name_scope("user_embedding"): 53 | # user id embedding 54 | uid_onehot = tf.reshape(tf.one_hot(self.userID, self.n), shape=[-1, self.n]) 55 | # uid_onehot_rating = tf.multiply(self.rating, uid_onehot) 56 | uid_layer = tf.layers.dense(uid_onehot, units=128, activation=tf.nn.relu) 57 | self.uid_layer = tf.reshape(uid_layer, shape=[-1, self.n_step, 128]) 58 | 59 | with tf.name_scope("item_embedding"): 60 | # movie id embedding 61 | vid_onehot = tf.reshape(tf.one_hot(self.itemID, self.m), shape=[-1, self.m]) 62 | # mid_onehot_rating = tf.multiply(self.rating, mid_onehot) 63 | vid_layer = tf.layers.dense(vid_onehot, units=128, activation=tf.nn.relu) 64 | self.vid_layer = tf.reshape(vid_layer, shape=[-1, self.n_step, 128]) 65 | 66 | with tf.variable_scope("user_rnn_cell", reuse=tf.AUTO_REUSE): 67 | userCell = tf.nn.rnn_cell.GRUCell(num_units=128) 68 | userInput = tf.transpose(self.vid_layer, [1, 0, 2]) 69 | userOutputs, userStates = tf.nn.dynamic_rnn(userCell, userInput, dtype=tf.float32) 70 | self.userOutput = userOutputs[-1] 71 | with tf.variable_scope("item_rnn_cell", reuse=tf.AUTO_REUSE): 72 | itemCell = tf.nn.rnn_cell.GRUCell(num_units=128) 73 | itemInput = tf.transpose(self.uid_layer, [1, 0, 2]) 74 | itemOutputs, itemStates = tf.nn.dynamic_rnn(itemCell, itemInput, dtype=tf.float32) 75 | self.itemOutput = itemOutputs[-1] 76 | 77 | user_W = tf.Variable(tf.random_normal(shape=[128, self.k], stddev=0.1)) 78 | item_W = tf.Variable(tf.random_normal(shape=[128, self.k], stddev=0.1)) 79 | user_b = tf.Variable(tf.random_normal(shape=[self.k], stddev=0.1)) 80 | item_b = tf.Variable(tf.random_normal(shape=[self.k], stddev=0.1)) 81 | 82 | self.U_embedding = tf.add(tf.matmul(self.userOutput, user_W), user_b) 83 | self.V_embedding = tf.add(tf.matmul(self.itemOutput, item_W), item_b) 84 | 85 | self.pred = tf.reduce_sum(tf.multiply(self.U_embedding, self.V_embedding), axis=1, keep_dims=True) 86 | 87 | loss = tf.losses.mean_squared_error(self.rating, self.pred) 88 | self.loss = tf.reduce_mean(loss) 89 | 90 | optimizer = tf.train.AdamOptimizer(self.lRate).minimize(self.loss) 91 | 92 | self.sess = tf.Session() 93 | init = tf.global_variables_initializer() 94 | self.sess.run(init) 95 | 96 | total_batch = 
int(len(self.data.trainingData)/ self.batch_size) 97 | for epoch in range(self.maxIter): 98 | user_idx, item_idx, ratings = self.next_batch() 99 | _,loss= self.sess.run([optimizer, self.loss], feed_dict={self.userID:user_idx, self.itemID:item_idx, self.rating:ratings, self.dropout:1.}) 100 | print('iteration:', epoch, 'loss:', loss) 101 | #print (self.foldInfo, "Epoch:", '%03d' % (epoch + 1), "Batch:", '%03d' % (i + 1), "loss=", "{:.9f}".format(loss)) 102 | U_embedding, V_embedding = self.sess.run([self.U_embedding, self.V_embedding], feed_dict={self.userID:user_idx, self.itemID:item_idx}) 103 | for ue,u in zip(U_embedding,user_idx): 104 | self.U[u] = ue 105 | for ve,v in zip(V_embedding,item_idx): 106 | self.V[v] = ve 107 | self.ranking_performance() 108 | print("Optimization Finished!") 109 | 110 | def next_batch(self): 111 | batch_idx = np.random.randint(len(self.data.trainingData), size=self.batch_size) 112 | users = [self.data.trainingData[idx]['user'] for idx in batch_idx] 113 | items = [self.data.trainingData[idx]['track'] for idx in batch_idx] 114 | user_idx,item_idx=[],[] 115 | ratings = [] 116 | for i,user in enumerate(users): 117 | uid = self.data.getId(user, 'user') 118 | tid = self.data.getId(items[i], 'track') 119 | rating = 0 120 | if items[i] in self.userListen[user]: 121 | rating = self.userListen[user][items[i]] 122 | user_idx.append([uid]) 123 | item_idx.append([tid]) 124 | ratings.append([rating]) 125 | return np.array(user_idx), np.array(item_idx), np.array(ratings) 126 | 127 | def predict(self, u): 128 | 'invoked to rank all the items for the user' 129 | uid = self.data.getId(u,'user') 130 | return self.V.dot(self.U[uid]) 131 | 132 | -------------------------------------------------------------------------------- /tool/TSNE.py: -------------------------------------------------------------------------------- 1 | # 2 | # tsne.py 3 | # 4 | # Implementation of t-SNE in Python. The implementation was tested on Python 5 | # 2.7.10, and it requires a working installation of NumPy. The implementation 6 | # comes with an example on the MNIST dataset. In order to plot the 7 | # results of this example, a working installation of matplotlib is required. 8 | # 9 | # The example can be run by executing: `ipython tsne.py` 10 | # 11 | # 12 | # Created by Laurens van der Maaten on 20-12-08. 13 | # Copyright (c) 2008 Tilburg University. All rights reserved. 14 | 15 | import numpy as np 16 | import pylab 17 | 18 | 19 | def Hbeta(D=np.array([]), beta=1.0): 20 | """ 21 | Compute the perplexity and the P-row for a specific value of the 22 | precision of a Gaussian distribution. 23 | """ 24 | 25 | # Compute P-row and corresponding perplexity 26 | P = np.exp(-D.copy() * beta) 27 | sumP = sum(P) 28 | H = np.log(sumP) + beta * np.sum(D * P) / sumP 29 | P = P / sumP 30 | return H, P 31 | 32 | 33 | def x2p(X=np.array([]), tol=1e-5, perplexity=30.0): 34 | """ 35 | Performs a binary search to get P-values in such a way that each 36 | conditional Gaussian has the same perplexity. 37 | """ 38 | 39 | # Initialize some variables 40 | print("Computing pairwise distances...") 41 | (n, d) = X.shape 42 | sum_X = np.sum(np.square(X), 1) 43 | D = np.add(np.add(-2 * np.dot(X, X.T), sum_X).T, sum_X) 44 | P = np.zeros((n, n)) 45 | beta = np.ones((n, 1)) 46 | logU = np.log(perplexity) 47 | 48 | # Loop over all datapoints 49 | for i in range(n): 50 | 51 | # Print progress 52 | if i % 500 == 0: 53 | print("Computing P-values for point %d of %d..." 
% (i, n)) 54 | 55 | # Compute the Gaussian kernel and entropy for the current precision 56 | betamin = -np.inf 57 | betamax = np.inf 58 | Di = D[i, np.concatenate((np.r_[0:i], np.r_[i+1:n]))] 59 | (H, thisP) = Hbeta(Di, beta[i]) 60 | 61 | # Evaluate whether the perplexity is within tolerance 62 | Hdiff = H - logU 63 | tries = 0 64 | while np.abs(Hdiff) > tol and tries < 50: 65 | 66 | # If not, increase or decrease precision 67 | if Hdiff > 0: 68 | betamin = beta[i].copy() 69 | if betamax == np.inf or betamax == -np.inf: 70 | beta[i] = beta[i] * 2. 71 | else: 72 | beta[i] = (beta[i] + betamax) / 2. 73 | else: 74 | betamax = beta[i].copy() 75 | if betamin == np.inf or betamin == -np.inf: 76 | beta[i] = beta[i] / 2. 77 | else: 78 | beta[i] = (beta[i] + betamin) / 2. 79 | 80 | # Recompute the values 81 | (H, thisP) = Hbeta(Di, beta[i]) 82 | Hdiff = H - logU 83 | tries += 1 84 | 85 | # Set the final row of P 86 | P[i, np.concatenate((np.r_[0:i], np.r_[i+1:n]))] = thisP 87 | 88 | # Return final P-matrix 89 | print("Mean value of sigma: %f" % np.mean(np.sqrt(1 / beta))) 90 | return P 91 | 92 | 93 | def pca(X=np.array([]), no_dims=50): 94 | """ 95 | Runs PCA on the NxD array X in order to reduce its dimensionality to 96 | no_dims dimensions. 97 | """ 98 | 99 | print("Preprocessing the data using PCA...") 100 | (n, d) = X.shape 101 | X = X - np.tile(np.mean(X, 0), (n, 1)) 102 | (l, M) = np.linalg.eig(np.dot(X.T, X)) 103 | Y = np.dot(X, M[:, 0:no_dims]) 104 | return Y 105 | 106 | 107 | def tsne(X=np.array([]), no_dims=2, initial_dims=50, perplexity=30.0): 108 | """ 109 | Runs t-SNE on the dataset in the NxD array X to reduce its 110 | dimensionality to no_dims dimensions. The syntaxis of the function is 111 | `Y = tsne.tsne(X, no_dims, perplexity), where X is an NxD NumPy array. 112 | """ 113 | 114 | # Check inputs 115 | if isinstance(no_dims, float): 116 | print("Error: array X should have type float.") 117 | return -1 118 | if round(no_dims) != no_dims: 119 | print("Error: number of dimensions should be an integer.") 120 | return -1 121 | 122 | # Initialize variables 123 | X = pca(X, initial_dims).real 124 | (n, d) = X.shape 125 | max_iter = 1000 126 | initial_momentum = 0.5 127 | final_momentum = 0.8 128 | eta = 500 129 | min_gain = 0.01 130 | Y = np.random.randn(n, no_dims) 131 | dY = np.zeros((n, no_dims)) 132 | iY = np.zeros((n, no_dims)) 133 | gains = np.ones((n, no_dims)) 134 | 135 | # Compute P-values 136 | P = x2p(X, 1e-5, perplexity) 137 | P = P + np.transpose(P) 138 | P = P / np.sum(P) 139 | P = P * 4. # early exaggeration 140 | P = np.maximum(P, 1e-12) 141 | 142 | # Run iterations 143 | for iter in range(max_iter): 144 | 145 | # Compute pairwise affinities 146 | sum_Y = np.sum(np.square(Y), 1) 147 | num = -2. * np.dot(Y, Y.T) 148 | num = 1. / (1. + np.add(np.add(num, sum_Y).T, sum_Y)) 149 | num[range(n), range(n)] = 0. 150 | Q = num / np.sum(num) 151 | Q = np.maximum(Q, 1e-12) 152 | 153 | # Compute gradient 154 | PQ = P - Q 155 | for i in range(n): 156 | dY[i, :] = np.sum(np.tile(PQ[:, i] * num[:, i], (no_dims, 1)).T * (Y[i, :] - Y), 0) 157 | 158 | # Perform the update 159 | if iter < 20: 160 | momentum = initial_momentum 161 | else: 162 | momentum = final_momentum 163 | gains = (gains + 0.2) * ((dY > 0.) != (iY > 0.)) + \ 164 | (gains * 0.8) * ((dY > 0.) 
== (iY > 0.)) 165 | gains[gains < min_gain] = min_gain 166 | iY = momentum * iY - eta * (gains * dY) 167 | Y = Y + iY 168 | Y = Y - np.tile(np.mean(Y, 0), (n, 1)) 169 | 170 | # Compute current value of cost function 171 | if (iter + 1) % 10 == 0: 172 | C = np.sum(P * np.log(P / Q)) 173 | print("Iteration %d: error is %f" % (iter + 1, C)) 174 | 175 | # Stop lying about P-values 176 | if iter == 100: 177 | P = P / 4. 178 | 179 | # Return solution 180 | return Y 181 | 182 | 183 | # if __name__ == "__main__": 184 | # print("Run Y = tsne.tsne(X, no_dims, perplexity) to perform t-SNE on your dataset.") 185 | # print("Running example on 2,500 MNIST digits...") 186 | # X = np.loadtxt("mnist2500_X.txt") 187 | # labels = np.loadtxt("mnist2500_labels.txt") 188 | # Y = tsne(X, 2, 50, 20.0) 189 | # pylab.scatter(Y[:, 0], Y[:, 1], 20, labels) 190 | # pylab.show() 191 | -------------------------------------------------------------------------------- /recommender/advanced/APR.py: -------------------------------------------------------------------------------- 1 | #coding:utf8 2 | from base.DeepRecommender import DeepRecommender 3 | import numpy as np 4 | import random 5 | from tool import config 6 | import tensorflow as tf 7 | from tensorflow import set_random_seed 8 | from collections import defaultdict 9 | 10 | set_random_seed(2) 11 | 12 | class APR(DeepRecommender): 13 | # APR:Adversarial Personalized Ranking for Recommendation 14 | def __init__(self,conf,trainingSet=None,testSet=None,fold='[1]'): 15 | super(APR, self).__init__(conf,trainingSet,testSet,fold) 16 | 17 | def readConfiguration(self): 18 | super(APR, self).readConfiguration() 19 | args = config.LineConfig(self.config['APR']) 20 | self.eps = float(args['-eps']) 21 | self.regAdv = float(args['-regA']) 22 | self.advEpoch = int(args['-advEpoch']) 23 | self.negativeCount = 3 24 | 25 | def _create_variables(self): 26 | #perturbation vectors 27 | self.adv_U = tf.Variable(tf.zeros(shape=[self.m, self.k]),dtype=tf.float32, trainable=False) 28 | self.adv_V = tf.Variable(tf.zeros(shape=[self.n, self.k]),dtype=tf.float32, trainable=False) 29 | 30 | self.neg_idx = tf.placeholder(tf.int32, [None], name="n_idx") 31 | self.V_neg_embed = tf.nn.embedding_lookup(self.V, self.neg_idx) 32 | #parameters 33 | self.eps = tf.constant(self.eps,dtype=tf.float32) 34 | self.regAdv = tf.constant(self.regAdv,dtype=tf.float32) 35 | 36 | def _create_inference(self): 37 | result = tf.subtract(tf.reduce_sum(tf.multiply(self.U_embed, self.V_embed), 1), 38 | tf.reduce_sum(tf.multiply(self.U_embed, self.V_neg_embed), 1)) 39 | return result 40 | 41 | def _create_adv_inference(self): 42 | self.U_plus_delta = tf.add(self.U_embed, tf.nn.embedding_lookup(self.adv_U, self.u_idx)) 43 | self.V_plus_delta = tf.add(self.V_embed, tf.nn.embedding_lookup(self.adv_V, self.v_idx)) 44 | self.V_neg_plus_delta = tf.add(self.V_neg_embed, tf.nn.embedding_lookup(self.adv_V, self.neg_idx)) 45 | result = tf.subtract(tf.reduce_sum(tf.multiply(self.U_plus_delta, self.V_plus_delta), 1), 46 | tf.reduce_sum(tf.multiply(self.U_plus_delta, self.V_neg_plus_delta), 1)) 47 | return result 48 | 49 | def _create_adversarial(self): 50 | #get gradients of Delta 51 | self.grad_U, self.grad_V = tf.gradients(self.loss_adv, [self.adv_U,self.adv_V]) 52 | 53 | # convert the IndexedSlice Data to Dense Tensor 54 | self.grad_U_dense = tf.stop_gradient(self.grad_U) 55 | self.grad_V_dense = tf.stop_gradient(self.grad_V) 56 | # normalization: new_grad = (grad / |grad|) * eps 57 | self.update_U = 
self.adv_U.assign(tf.nn.l2_normalize(self.grad_U_dense, 1) * self.eps) 58 | self.update_V = self.adv_V.assign(tf.nn.l2_normalize(self.grad_V_dense, 1) * self.eps) 59 | 60 | def _create_loss(self): 61 | self.reg_lambda = tf.constant(self.regU, dtype=tf.float32) 62 | self.loss = tf.reduce_sum(tf.nn.softplus(-self._create_inference())) 63 | self.reg_loss = tf.add(tf.multiply(self.reg_lambda, tf.nn.l2_loss(self.U_embed)), 64 | tf.multiply(self.reg_lambda, tf.nn.l2_loss(self.V_embed))) 65 | 66 | self.reg_loss = tf.add(tf.multiply(self.reg_lambda, tf.nn.l2_loss(self.U_embed)), self.reg_loss) 67 | self.total_loss = tf.add(self.loss, self.reg_loss) 68 | #loss of adversarial training 69 | self.loss_adv = tf.multiply(self.regAdv, tf.reduce_sum(tf.nn.softplus(-self._create_adv_inference()))) 70 | self.loss_adv = tf.add(self.loss,self.loss_adv) 71 | 72 | def _create_optimizer(self): 73 | self.optimizer = tf.train.AdamOptimizer(self.lRate) 74 | self.train = self.optimizer.minimize(self.total_loss) 75 | self.optimizer_adv = tf.train.AdamOptimizer(self.lRate) 76 | self.train_adv = self.optimizer.minimize(self.loss_adv) 77 | 78 | def initModel(self): 79 | super(APR, self).initModel() 80 | 81 | self.userListen = defaultdict(dict) 82 | for user in self.data.userRecord: 83 | for item in self.data.userRecord[user]: 84 | if item[self.recType] not in self.userListen[user]: 85 | self.userListen[user][item[self.recType]] = 1 86 | self.userListen[user][item[self.recType]] += 1 87 | print ('training...') 88 | 89 | self._create_variables() 90 | self._create_loss() 91 | self._create_adversarial() 92 | self._create_optimizer() 93 | 94 | 95 | def next_batch(self): 96 | batch_idx = np.random.randint(self.train_size, size=self.batch_size) 97 | 98 | users = [self.data.trainingData[idx]['user'] for idx in batch_idx] 99 | items = [self.data.trainingData[idx]['track'] for idx in batch_idx] 100 | user_idx,item_idx=[],[] 101 | neg_item_idx = [] 102 | 103 | for i,user in enumerate(users): 104 | for j in range(self.negativeCount): #negative sampling 105 | item_j = random.randint(0,self.n-1) 106 | while self.data.id2name['track'][item_j] in self.userListen[user]: 107 | item_j = random.randint(0, self.n - 1) 108 | user_idx.append(self.data.getId(user, 'user')) 109 | item_idx.append(self.data.getId(items[i], 'track')) 110 | neg_item_idx.append(item_j) 111 | return user_idx,item_idx,neg_item_idx 112 | 113 | def buildModel(self): 114 | print ('training...') 115 | iteration = 0 116 | with tf.Session() as sess: 117 | init = tf.global_variables_initializer() 118 | sess.run(init) 119 | # train the model until converged 120 | for epoch in range(self.maxIter): 121 | user_idx,item_idx,neg_item_idx = self.next_batch() 122 | _,loss = sess.run([self.train,self.total_loss],feed_dict={self.u_idx: user_idx, self.v_idx: item_idx, self.neg_idx:neg_item_idx}) 123 | print ('iteration:', epoch, 'loss:',loss) 124 | 125 | self.P = sess.run(self.U) 126 | self.Q = sess.run(self.V) 127 | self.ranking_performance() 128 | # start adversarial training 129 | for epoch in range(self.advEpoch): 130 | user_idx,item_idx,neg_item_idx = self.next_batch() 131 | sess.run([self.update_U, self.update_V], 132 | feed_dict={self.u_idx: user_idx, self.v_idx: item_idx, self.neg_idx: neg_item_idx}) 133 | _,loss = sess.run([self.train_adv,self.loss_adv],feed_dict={self.u_idx: user_idx, self.v_idx: item_idx, self.neg_idx:neg_item_idx}) 134 | print ('iteration:', epoch, 'loss:',loss) 135 | self.P = sess.run(self.U) 136 | self.Q = sess.run(self.V) 137 | self.ranking_performance() 
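    # Note on buildModel above: training runs in two phases. The first loop fits a plain
    # BPR model; the second loop, before each step, refreshes the perturbations
    # adv_U/adv_V as the L2-normalized gradients of the adversarial loss scaled by eps,
    # and then minimizes loss_adv (the BPR loss plus the regAdv-weighted loss computed
    # on the perturbed embeddings).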
138 | 139 | def predictForRanking(self, u): 140 | 'invoked to rank all the items for the user' 141 | uid = self.data.getId(u,'user') 142 | return np.divide(self.V.dot(self.U[uid]), self.normalized_U[uid]*self.normalized_V) 143 | 144 | -------------------------------------------------------------------------------- /base/recommender.py: -------------------------------------------------------------------------------- 1 | from data.record import Record 2 | from tool.file import FileIO 3 | from tool.qmath import denormalize 4 | from tool.config import Config,LineConfig 5 | from os.path import abspath 6 | from time import strftime,localtime,time 7 | from evaluation.measure import Measure 8 | from collections import defaultdict 9 | class Recommender(object): 10 | def __init__(self,conf,trainingSet=None,testSet=None,fold='[1]'): 11 | self.config = conf 12 | self.isSaveModel = False 13 | self.isLoadModel = False 14 | self.isOutput = True 15 | self.data = Record(self.config, trainingSet, testSet) 16 | self.foldInfo = fold 17 | self.evalConfig = LineConfig(self.config['evaluation.setup']) 18 | if self.evalConfig.contains('-target'): 19 | self.recType = self.evalConfig['-target'] 20 | else: 21 | self.recType = 'track' 22 | if LineConfig(self.config['evaluation.setup']).contains('-cold'): 23 | #evaluation on cold-start users 24 | threshold = int(LineConfig(self.config['evaluation.setup'])['-cold']) 25 | removedUser = [] 26 | removedTrack = defaultdict(list) 27 | #for user in self.data.testSet: 28 | # if user in self.data.userRecord and len(self.data.userRecord[user])>threshold: 29 | # removedUser.append(user) 30 | for user in self.data.testSet: 31 | if user in self.data.userRecord: 32 | for item in self.data.testSet[user]: 33 | if len(self.data.trackRecord[item]) > threshold: 34 | removedTrack[user].append(item) 35 | for user in removedTrack: 36 | for item in removedTrack[user]: 37 | del self.data.testSet[user][item] 38 | if len(self.data.testSet[user]) == 0: 39 | del self.data.testSet[user] 40 | #for user in removedUser: 41 | # del self.data.testSet[user] 42 | 43 | 44 | 45 | if LineConfig(self.config['evaluation.setup']).contains('-sample'): 46 | userList = list(self.data.testSet.keys()) 47 | removedUser=userList[:int(len(userList)*0.9)] 48 | for user in removedUser: 49 | del self.data.testSet[user] 50 | 51 | def readConfiguration(self): 52 | self.algorName = self.config['recommender'] 53 | self.output = LineConfig(self.config['output.setup']) 54 | self.isOutput = self.output.isMainOn() 55 | self.ranking = LineConfig(self.config['item.ranking']) 56 | 57 | def printAlgorConfig(self): 58 | "show algorithm's configuration" 59 | print ('Algorithm:',self.config['recommender']) 60 | print ('Training set:',abspath(self.config['record'])) 61 | if LineConfig(self.config['evaluation.setup']).contains('-testSet'): 62 | print ('Test set:',abspath(LineConfig(self.config['evaluation.setup']).getOption('-testSet'))) 63 | #print 'Count of the users in training set: ',len() 64 | self.data.printTrainingSize() 65 | print ('='*80) 66 | 67 | def initModel(self): 68 | pass 69 | 70 | def buildModel(self): 71 | 'build the model (for model-based algorithms )' 72 | pass 73 | 74 | def saveModel(self): 75 | pass 76 | 77 | def loadModel(self): 78 | pass 79 | 80 | def predict(self,user): 81 | return [] 82 | 83 | 84 | 85 | def evalRanking(self): 86 | res = [] # used to contain the text of the result 87 | N = 0 88 | threshold = 0 89 | top = self.ranking['-topN'].split(',') 90 | top = [int(num) for num in top] 91 | N = int(top[-1]) 
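        # -topN may list several cut-offs (e.g. 5,10); N is the last (largest) of them,
        # and each user's recommendation list is truncated to N items further down.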
92 | if N > 100 or N < 0: 93 | print ('N can not be larger than 100! It has been reassigned with 10') 94 | N = 10 95 | 96 | res.append('userId: recommendations in (itemId, ranking score) pairs, * means the item matches, $ means the unpop item\n') 97 | # predict 98 | recList = {} 99 | userCount = len(self.data.testSet) 100 | 101 | for i, user in enumerate(self.data.testSet): 102 | 103 | num_pop = 0 104 | 105 | line = user + ':' 106 | if user in self.data.userRecord: 107 | predictedItems = self.predict(user) 108 | else: 109 | predictedItems = ['0']*N 110 | predicted = {} 111 | for k,item in enumerate(predictedItems): 112 | predicted[item] = k 113 | for item in self.data.userRecord[user]: 114 | if item[self.recType] in predicted: 115 | del predicted[item[self.recType]] 116 | predicted = sorted(predicted.items(),key=lambda d:d[1]) 117 | predictedItems = [item[0] for item in predicted] 118 | recList[user] = predictedItems[:N] 119 | #print('user', user, 'the recList:', type(self.data.testSet[user])) 120 | 121 | if i % 100 == 0: 122 | print (self.algorName, self.foldInfo, 'progress:' + str(i) + '/' + str(userCount)) 123 | for item in recList[user]: 124 | if item in self.data.testSet[user]: 125 | line += '*' 126 | if item in self.data.PopTrack: 127 | num_pop += 1 128 | line += '$' 129 | line += item + ',' 130 | 131 | line += '\n' 132 | res.append(line) 133 | currentTime = strftime("%Y-%m-%d %H-%M-%S", localtime(time())) 134 | # output prediction result 135 | if self.isOutput: 136 | fileName = '' 137 | outDir = self.output['-dir'] 138 | if self.ranking.contains('-topN'): 139 | fileName = self.config['recommender'] + '@' + currentTime + '-top-' + self.ranking['-topN']\ 140 | + 'items' + self.foldInfo + '.txt' 141 | FileIO.writeFile(outDir, fileName, res) 142 | print ('The result has been output to ', abspath(outDir), '.') 143 | # output evaluation result 144 | outDir = self.output['-dir'] 145 | fileName = self.config['recommender'] + '@' + currentTime + '-measure' + self.foldInfo + '.txt' 146 | 147 | self.measure = Measure.rankingMeasure(self.data.testSet, recList,top,self.data.getSize(self.recType)) 148 | 149 | FileIO.writeFile(outDir, fileName, self.measure) 150 | print ('The result of %s %s:\n%s' % (self.algorName, self.foldInfo, ''.join(self.measure))) 151 | 152 | def execute(self): 153 | self.readConfiguration() 154 | if self.foldInfo == '[1]': 155 | self.printAlgorConfig() 156 | #load model from disk or build model 157 | if self.isLoadModel: 158 | print ('Loading model %s...' %(self.foldInfo)) 159 | self.loadModel() 160 | else: 161 | print ('Initializing model %s...' %(self.foldInfo)) 162 | self.initModel() 163 | print ('Building Model %s...' %(self.foldInfo)) 164 | self.buildModel() 165 | 166 | #preict the ratings or item ranking 167 | print ('Predicting %s...' %(self.foldInfo)) 168 | self.evalRanking() 169 | #save model 170 | if self.isSaveModel: 171 | print ('Saving model %s...' 
%(self.foldInfo)) 172 | self.saveModel() 173 | 174 | return self.measure 175 | -------------------------------------------------------------------------------- /recommender/advanced/NGCF.py: -------------------------------------------------------------------------------- 1 | from base.DeepRecommender import DeepRecommender 2 | from random import choice 3 | import tensorflow as tf 4 | import numpy as np 5 | from math import sqrt 6 | from tensorflow import set_random_seed 7 | from collections import defaultdict 8 | 9 | set_random_seed(2) 10 | 11 | class NGCF(DeepRecommender): 12 | 13 | def __init__(self,conf,trainingSet=None,testSet=None,fold='[1]'): 14 | super(NGCF, self).__init__(conf,trainingSet,testSet,fold) 15 | 16 | def next_batch(self): 17 | batch_id = 0 18 | while batch_id < self.train_size: 19 | if batch_id + self.batch_size <= self.train_size: 20 | users = [self.data.trainingData[idx]['user'] for idx in range(batch_id, self.batch_size + batch_id)] 21 | items = [self.data.trainingData[idx]['track'] for idx in range(batch_id, self.batch_size + batch_id)] 22 | # 可以不断迭代 23 | batch_id += self.batch_size 24 | else: 25 | users = [self.data.trainingData[idx]['user'] for idx in range(batch_id, self.train_size)] 26 | items = [self.data.trainingData[idx]['track'] for idx in range(batch_id, self.train_size)] 27 | # 当训练数据小于batch_id时,让batch_id最大为trainingData_id 28 | batch_id = self.train_size 29 | 30 | u_idx, i_idx, j_idx = [], [], [] 31 | item_list = list(self.data.trackRecord.keys()) 32 | for i, user in enumerate(users): 33 | u_idx.append(self.data.getId(user, 'user')) 34 | i_idx.append(self.data.getId(items[i], 'track')) 35 | 36 | neg_item = choice(item_list) 37 | while neg_item in self.data.userRecord[user]: 38 | neg_item = choice(item_list) 39 | j_idx.append(self.data.getId(neg_item, 'track')) 40 | 41 | yield u_idx, i_idx, j_idx 42 | 43 | 44 | def initModel(self): 45 | super(NGCF, self).initModel() 46 | 47 | # 构建流程图 48 | self.userListen = defaultdict(dict) 49 | for entry in self.data.trainingData: 50 | user = entry['user'] 51 | track = entry['track'] 52 | if track not in self.userListen[user]: 53 | self.userListen[user][track] = 1 54 | self.userListen[user][track] += 1 55 | 56 | self.isTraining = tf.placeholder(tf.int32) 57 | self.isTraining = tf.cast(self.isTraining, tf.bool) 58 | self.user_embeddings = self.U 59 | self.item_embeddings = self.V 60 | ego_embeddings = tf.concat([self.user_embeddings,self.item_embeddings], axis=0) 61 | 62 | indices = [[self.data.getId(entry['user'], 'user'), self.m+self.data.getId(entry['track'], 'track')] for entry in self.data.trainingData] 63 | indices += [[self.m+self.data.getId(entry['user'], 'user'), self.data.getId(entry['track'], 'track')] for entry in self.data.trainingData] 64 | # 评分(1/sqrt(Nu)/sqrt(Ni)) 65 | values = [] 66 | for entry in self.data.trainingData: 67 | if len(self.data.userRecord[entry['user']]) == 0 or len(self.data.trackRecord[entry['track']]) == 0: 68 | values.append(0) 69 | else: 70 | values.append(float(self.userListen[entry['user']][entry['track']])/sqrt(len(self.data.userRecord[entry['user']]))/sqrt(len(self.data.trackRecord[entry['track']]))) 71 | values = values*2 72 | 73 | norm_adj = tf.SparseTensor(indices=indices, values=values, dense_shape=[self.m+self.n, self.m+self.n]) 74 | 75 | self.weights = dict() 76 | 77 | initializer = tf.contrib.layers.xavier_initializer() 78 | weight_size = [self.k, self.k, self.k] #can be changed 79 | weight_size_list = [self.k] + weight_size 80 | 81 | self.n_layers = 3 82 | 83 | #initialize 
parameters 84 | for k in range(self.n_layers): 85 | self.weights['W_%d_1' % k] = tf.Variable( 86 | initializer([weight_size_list[k], weight_size_list[k + 1]]), name='W_%d_1' % k) 87 | self.weights['W_%d_2' % k] = tf.Variable( 88 | initializer([weight_size_list[k], weight_size_list[k + 1]]), name='W_%d_2' % k) 89 | 90 | all_embeddings = [ego_embeddings] 91 | for k in range(self.n_layers): 92 | side_embeddings = tf.sparse_tensor_dense_matmul(norm_adj, ego_embeddings) 93 | sum_embeddings = tf.matmul(side_embeddings + ego_embeddings, self.weights['W_%d_1' % k]) 94 | bi_embeddings = tf.multiply(ego_embeddings, side_embeddings) 95 | bi_embeddings = tf.matmul(bi_embeddings, self.weights['W_%d_2' % k]) 96 | 97 | ego_embeddings = tf.nn.leaky_relu(sum_embeddings + bi_embeddings) 98 | 99 | # message dropout. 100 | def without_dropout(): 101 | return ego_embeddings 102 | def dropout(): 103 | return tf.nn.dropout(ego_embeddings, keep_prob=0.9) 104 | 105 | ego_embeddings = tf.cond(self.isTraining, lambda:dropout(), lambda:without_dropout()) 106 | 107 | # normalize the distribution of embeddings. 108 | norm_embeddings = tf.nn.l2_normalize(ego_embeddings, axis=1) 109 | 110 | all_embeddings += [norm_embeddings] 111 | 112 | all_embeddings = tf.concat(all_embeddings, 1) 113 | self.multi_user_embeddings, self.multi_item_embeddings = tf.split(all_embeddings, [self.m, self.n], 0) 114 | 115 | self.neg_idx = tf.placeholder(tf.int32, name="neg_holder") 116 | self.neg_item_embedding = tf.nn.embedding_lookup(self.multi_item_embeddings, self.neg_idx) 117 | self.u_embedding = tf.nn.embedding_lookup(self.multi_user_embeddings, self.u_idx) 118 | self.v_embedding = tf.nn.embedding_lookup(self.multi_item_embeddings, self.v_idx) 119 | 120 | self.test = tf.reduce_sum(tf.multiply(self.u_embedding,self.multi_item_embeddings),1) 121 | 122 | def buildModel(self): 123 | 124 | y = tf.reduce_sum(tf.multiply(self.u_embedding, self.v_embedding), 1) \ 125 | - tf.reduce_sum(tf.multiply(self.u_embedding, self.neg_item_embedding), 1) 126 | 127 | loss = -tf.reduce_sum(tf.log(tf.sigmoid(y))) + self.regU * (tf.nn.l2_loss(self.u_embedding) + 128 | tf.nn.l2_loss(self.v_embedding) + 129 | tf.nn.l2_loss(self.neg_item_embedding)) 130 | opt = tf.train.AdamOptimizer(self.lRate) 131 | 132 | train = opt.minimize(loss) 133 | 134 | init = tf.global_variables_initializer() 135 | self.sess.run(init) 136 | for iteration in range(self.maxIter): 137 | for n, batch in enumerate(self.next_batch()): 138 | user_idx, i_idx, j_idx = batch 139 | _, l = self.sess.run([train, loss], 140 | feed_dict={self.u_idx: user_idx, self.neg_idx: j_idx, self.v_idx: i_idx,self.isTraining:1}) 141 | print('training:', iteration + 1, 'batch', n, 'loss:', l) 142 | 143 | def predictForRanking(self, u): 144 | 'invoked to rank all the items for the user' 145 | if u in self.data.userRecord: 146 | u = self.data.getId(u, 'user') 147 | return self.sess.run(self.test,feed_dict={self.u_idx:u,self.isTraining:0}) 148 | else: 149 | return [self.data.globalMean] * self.n -------------------------------------------------------------------------------- /recommender/advanced/DHCF.py: -------------------------------------------------------------------------------- 1 | #coding:utf8 2 | from base.DeepRecommender import DeepRecommender 3 | import numpy as np 4 | import random 5 | from tool import config 6 | import tensorflow as tf 7 | from tensorflow import set_random_seed 8 | from collections import defaultdict 9 | from scipy.sparse import coo_matrix,hstack 10 | 11 | set_random_seed(2) 12 | 13 | 
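# Sketch of the model built below: buildAdjacencyMatrix turns the user-track interactions
# into an incidence matrix H; initModel then forms a normalized hypergraph adjacency for
# the user side (roughly D_v^{-1/2} H D_e^{-1} H^T D_v^{-1/2}) and, symmetrically, one for
# the item side from H^T. Each of the n_layer propagation steps multiplies the base
# embeddings U/V by these adjacencies, applies a learned dense layer with a leaky-ReLU
# residual connection, then dropout (during training) and L2 normalization. The per-layer
# embeddings are concatenated and trained with a BPR-style pairwise log-sigmoid loss over
# sampled negatives.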
class DHCF(DeepRecommender): 14 | # Dual Channel Hypergraph Collaborative Filtering (KDD 2020). 15 | def __init__(self,conf,trainingSet=None,testSet=None,fold='[1]'): 16 | super(DHCF, self).__init__(conf,trainingSet,testSet,fold) 17 | 18 | def buildAdjacencyMatrix(self): 19 | row, col, entries = [], [], [] 20 | for pair in self.data.trainingData: 21 | row += [self.data.getId(pair['user'], 'user')] 22 | col += [self.data.getId(pair['track'], 'track')] 23 | entries += [1] 24 | u_i_adj = coo_matrix((entries, (row, col)), shape=(self.m, self.n),dtype=np.float32) 25 | return u_i_adj 26 | 27 | def initModel(self): 28 | super(DHCF, self).initModel() 29 | 30 | self.negativeCount = 5 31 | 32 | self.userListen = defaultdict(dict) 33 | for entry in self.data.trainingData: 34 | if entry['track'] not in self.userListen[entry['user']]: 35 | self.userListen[entry['user']][entry['track']] = 0 36 | self.userListen[entry['user']][entry['track']] += 1 37 | 38 | # Build adjacency matrix 39 | A = self.buildAdjacencyMatrix() 40 | # Build incidence matrix 41 | # H_u = hstack([A, A.dot(A.transpose().dot(A))]) 42 | H_u = A 43 | D_u_v = H_u.sum(axis=1).reshape(1, -1) 44 | D_u_e = H_u.sum(axis=0).reshape(1, -1) 45 | temp1 = (H_u.transpose().multiply(np.sqrt(1.0/D_u_v))).transpose() 46 | temp2 = temp1.transpose() 47 | A_u = temp1.multiply(1.0/D_u_e).dot(temp2) 48 | A_u = A_u.tocoo() 49 | indices = np.mat([A_u.row, A_u.col]).transpose() 50 | H_u = tf.SparseTensor(indices, A_u.data.astype(np.float32), A_u.shape) 51 | 52 | H_i = A.transpose() 53 | D_i_v = H_i.sum(axis=1).reshape(1, -1) 54 | D_i_e = H_i.sum(axis=0).reshape(1, -1) 55 | temp1 = (H_i.transpose().multiply(np.sqrt(1.0 / D_i_v))).transpose() 56 | temp2 = temp1.transpose() 57 | A_i = temp1.multiply(1.0 / D_i_e).dot(temp2) 58 | A_i = A_i.tocoo() 59 | indices = np.mat([A_i.row, A_i.col]).transpose() 60 | H_i = tf.SparseTensor(indices, A_i.data.astype(np.float32), A_i.shape) 61 | 62 | # Build network 63 | self.isTraining = tf.placeholder(tf.int32) 64 | self.isTraining = tf.cast(self.isTraining, tf.bool) 65 | initializer = tf.contrib.layers.xavier_initializer() 66 | self.n_layer = 2 67 | self.weights = {} 68 | for i in range(self.n_layer): 69 | self.weights['layer_%d' %(i+1)] = tf.Variable(initializer([self.k, self.k])) 70 | 71 | user_embeddings = self.U 72 | item_embeddings = self.V 73 | all_user_embeddings = [user_embeddings] 74 | all_item_embeddings = [item_embeddings] 75 | 76 | def without_dropout(embedding): 77 | return embedding 78 | 79 | def dropout(embedding): 80 | return tf.nn.dropout(embedding, keep_prob=0.1) 81 | 82 | for i in range(self.n_layer): 83 | new_user_embeddings = tf.sparse_tensor_dense_matmul(H_u, self.U) 84 | new_item_embeddings = tf.sparse_tensor_dense_matmul(H_i, self.V) 85 | 86 | user_embeddings = tf.nn.leaky_relu(tf.matmul(new_user_embeddings, self.weights['layer_%d' %(i+1)])+ user_embeddings) 87 | item_embeddings = tf.nn.leaky_relu(tf.matmul(new_item_embeddings, self.weights['layer_%d' %(i+1)])+ item_embeddings) 88 | 89 | user_embeddings = tf.cond(self.isTraining, lambda: dropout(user_embeddings), 90 | lambda: without_dropout(user_embeddings)) 91 | item_embeddings = tf.cond(self.isTraining, lambda: dropout(item_embeddings), 92 | lambda: without_dropout(item_embeddings)) 93 | 94 | user_embeddings = tf.nn.l2_normalize(user_embeddings,axis=1) 95 | item_embeddings = tf.nn.l2_normalize(item_embeddings,axis=1) 96 | 97 | all_item_embeddings.append(item_embeddings) 98 | all_user_embeddings.append(user_embeddings) 99 | 100 | # user_embeddings = 
tf.reduce_sum(all_user_embeddings,axis=0)/(1+self.n_layer) 101 | # item_embeddings = tf.reduce_sum(all_item_embeddings, axis=0) / (1 + self.n_layer) 102 | # 103 | user_embeddings = tf.concat(all_user_embeddings,axis=1) 104 | item_embeddings = tf.concat(all_item_embeddings, axis=1) 105 | 106 | self.neg_idx = tf.placeholder(tf.int32, name="neg_holder") 107 | self.neg_item_embedding = tf.nn.embedding_lookup(item_embeddings, self.neg_idx) 108 | self.u_embedding = tf.nn.embedding_lookup(user_embeddings, self.u_idx) 109 | self.v_embedding = tf.nn.embedding_lookup(item_embeddings, self.v_idx) 110 | self.test = tf.reduce_sum(tf.multiply(self.u_embedding,item_embeddings),1) 111 | 112 | def next_batch_pairwise(self): 113 | batch_id = 0 114 | while batch_id < self.train_size: 115 | if batch_id + self.batch_size <= self.train_size: 116 | users = [self.data.trainingData[idx]['user'] for idx in range(batch_id, self.batch_size + batch_id)] 117 | items = [self.data.trainingData[idx]['track'] for idx in range(batch_id, self.batch_size + batch_id)] 118 | batch_id += self.batch_size 119 | else: 120 | users = [self.data.trainingData[idx]['user'] for idx in range(batch_id, self.train_size)] 121 | items = [self.data.trainingData[idx]['track'] for idx in range(batch_id, self.train_size)] 122 | batch_id = self.train_size 123 | 124 | user_idx,item_idx=[],[] 125 | neg_item_idx = [] 126 | 127 | for i,user in enumerate(users): 128 | for j in range(self.negativeCount): #negative sampling 129 | item_j = random.randint(0,self.n-1) 130 | while self.data.id2name['track'][item_j] in self.userListen[user]: 131 | item_j = random.randint(0, self.n - 1) 132 | user_idx.append(self.data.getId(user, 'user')) 133 | item_idx.append(self.data.getId(items[i], 'track')) 134 | neg_item_idx.append(item_j) 135 | yield user_idx, item_idx, neg_item_idx 136 | 137 | def buildModel(self): 138 | print ('training...') 139 | y = tf.reduce_sum(tf.multiply(self.u_embedding, self.v_embedding), 1) \ 140 | - tf.reduce_sum(tf.multiply(self.u_embedding, self.neg_item_embedding), 1) 141 | reg_loss = self.regU * (tf.nn.l2_loss(self.u_embedding) + tf.nn.l2_loss(self.v_embedding) + 142 | tf.nn.l2_loss(self.neg_item_embedding)) 143 | for i in range(self.n_layer): 144 | reg_loss+= self.regU*tf.nn.l2_loss(self.weights['layer_%d' %(i+1)]) 145 | loss = -tf.reduce_sum(tf.log(tf.sigmoid(y))) + reg_loss 146 | opt = tf.train.AdamOptimizer(self.lRate) 147 | train = opt.minimize(loss) 148 | init = tf.global_variables_initializer() 149 | self.sess.run(init) 150 | for iteration in range(self.maxIter): 151 | for n, batch in enumerate(self.next_batch_pairwise()): 152 | user_idx, i_idx, j_idx = batch 153 | _, l = self.sess.run([train, loss], 154 | feed_dict={self.u_idx: user_idx, self.neg_idx: j_idx, self.v_idx: i_idx, self.isTraining:1}) 155 | print ('training:', iteration + 1, 'batch', n, 'loss:', l) 156 | 157 | 158 | def predict(self, u): 159 | 'invoked to rank all the items for the user' 160 | if self.data.contains(u, 'user'): 161 | uid = self.data.name2id['user'][u] 162 | return self.sess.run(self.test, feed_dict={self.u_idx: [uid], self.isTraining:0}) 163 | else: 164 | uid = self.data.getId(u,'user') 165 | return np.divide(self.V.dot(self.U[uid]), self.normalized_U[uid]*self.normalized_V) 166 | -------------------------------------------------------------------------------- /recommender/advanced/CoFactor.py: -------------------------------------------------------------------------------- 1 | from base.IterativeRecommender import IterativeRecommender 2 | import numpy 
as np 3 | from tool import config 4 | from collections import defaultdict 5 | from math import log,exp 6 | from scipy.sparse import * 7 | from scipy import * 8 | 9 | class CoFactor(IterativeRecommender): 10 | def __init__(self, conf, trainingSet=None, testSet=None, fold='[1]'): 11 | super(CoFactor, self).__init__(conf, trainingSet, testSet, fold) 12 | 13 | def readConfiguration(self): 14 | super(CoFactor, self).readConfiguration() 15 | extraSettings = config.LineConfig(self.config['CoFactor']) 16 | self.negCount = int(extraSettings['-k']) #the number of negative samples 17 | if self.negCount < 1: 18 | self.negCount = 1 19 | self.regR = float(extraSettings['-gamma']) 20 | self.filter = int(extraSettings['-filter']) 21 | 22 | def printAlgorConfig(self): 23 | super(CoFactor, self).printAlgorConfig() 24 | print('Specified Arguments of', self.config['recommender'] + ':') 25 | print('k: %d' % self.negCount) 26 | print('regR: %.5f' %self.regR) 27 | print('filter: %d' %self.filter) 28 | print('=' * 80) 29 | 30 | def initModel(self): 31 | super(CoFactor, self).initModel() 32 | self.num_items = self.n 33 | self.num_users = self.m 34 | #constructing SPPMI matrix 35 | self.SPPMI = defaultdict(dict) 36 | 37 | self.userListen = defaultdict(dict) 38 | for user in self.data.userRecord: 39 | for item in self.data.userRecord[user]: 40 | if item[self.recType] not in self.userListen[user]: 41 | self.userListen[user][item[self.recType]] = 0 42 | self.userListen[user][item[self.recType]] += 1 43 | 44 | print('Constructing SPPMI matrix...') 45 | #for larger data set has many items, the process will be time consuming 46 | occurrence = defaultdict(dict) 47 | i=0 48 | for item1 in self.data.name2id[self.recType]: 49 | i += 1 50 | if i % 100 == 0: 51 | print(str(i) + '/' + str(self.num_items)) 52 | uList1 = self.data.listened[self.recType][item1] 53 | 54 | if len(self.data.trackRecord[item1]) < self.filter: 55 | continue 56 | for item2 in self.data.name2id[self.recType]: 57 | if item1 == item2: 58 | continue 59 | if item2 not in occurrence[item1]: 60 | uList2 = self.data.listened[self.recType][item2] 61 | if len(self.data.trackRecord[item2]) < self.filter: 62 | continue 63 | count = len(set(uList1).intersection(set(uList2))) 64 | if count > self.filter: 65 | occurrence[item1][item2] = count 66 | occurrence[item2][item1] = count 67 | 68 | maxVal = 0 69 | frequency = {} 70 | for item1 in occurrence: 71 | frequency[item1] = sum(list(occurrence[item1].values())) * 1.0 72 | D = sum(list(frequency.values())) * 1.0 73 | # maxx = -1 74 | for item1 in occurrence: 75 | for item2 in occurrence[item1]: 76 | try: 77 | val = max([log(occurrence[item1][item2] * D / (frequency[item1] * frequency[item2])) - log( 78 | self.negCount), 0]) 79 | except ValueError: 80 | print(self.SPPMI[item1][item2]) 81 | print(self.SPPMI[item1][item2] * D / (frequency[item1] * frequency[item2])) 82 | 83 | if val > 0: 84 | if maxVal < val: 85 | maxVal = val 86 | self.SPPMI[item1][item2] = val 87 | self.SPPMI[item2][item1] = self.SPPMI[item1][item2] 88 | #normalize 89 | for item1 in self.SPPMI: 90 | for item2 in self.SPPMI[item1]: 91 | self.SPPMI[item1][item2] = self.SPPMI[item1][item2]/maxVal 92 | 93 | 94 | def buildModel(self): 95 | iteration = 0 96 | 97 | self.X=self.P*10 #Theta 98 | self.Y=self.Q*10 #Beta 99 | self.w = np.random.rand(self.num_items) / 10 # bias value of item 100 | self.c = np.random.rand(self.num_items) / 10 # bias value of context 101 | self.G = np.random.rand(self.num_items, self.k) / 10 # context embedding 102 | 103 | 
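        # Annotation (sketch, not part of the original file): the SPPMI entries assembled in
        # initModel above follow
        #     SPPMI(i, j) = max( log( #(i, j) * D / (#(i) * #(j)) ) - log(k), 0 )
        # where #(i, j) is the number of users shared by items i and j, #(i) is the total
        # co-occurrence count of item i, D is the sum of all such counts, and k is the number
        # of negative samples (the -k option); the entries are finally rescaled into [0, 1] by
        # the largest value. In the ALS loop below, the listening matrix is factorized as
        # X * Y^T while the SPPMI matrix is jointly factorized as Y * G^T plus the item and
        # context biases w and c, which is the CoFactor objective.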
print('training...') 104 | iteration = 0 105 | while iteration < self.maxIter: 106 | self.loss = 0 107 | YtY = self.Y.T.dot(self.Y) 108 | for user in self.data.name2id['user']: 109 | H = np.ones(self.num_items) 110 | val, pos = [],[] 111 | P_u = np.zeros(self.num_items) 112 | uid = self.data.getId(user,'user') 113 | for item in self.userListen[user]: 114 | iid = self.data.getId(item,self.recType) 115 | r_ui = self.userListen[user][item] 116 | pos.append(iid) 117 | val.append(10 * r_ui) 118 | H[iid] += 10 * r_ui 119 | P_u[iid] = 1 120 | error = (P_u[iid] - self.X[uid].dot(self.Y[iid])) 121 | self.loss += pow(error, 2) 122 | # sparse matrix 123 | C_u = coo_matrix((val, (pos, pos)), shape=(self.num_items, self.num_items)) 124 | A = (YtY + np.dot(self.Y.T, C_u.dot(self.Y)) + self.regU * np.eye(self.k)) 125 | self.X[uid] = np.dot(np.linalg.inv(A), (self.Y.T * H).dot(P_u)) 126 | 127 | XtX = self.X.T.dot(self.X) 128 | for item in self.data.name2id[self.recType]: 129 | P_i = np.zeros(self.num_users) 130 | iid = self.data.getId(item, self.recType) 131 | H = np.ones(self.num_users) 132 | val,pos = [],[] 133 | for user in self.data.listened[self.recType][item]: 134 | uid = self.data.getId(user, 'user') 135 | r_ui = self.data.listened[self.recType][item][user] 136 | pos.append(uid) 137 | val.append(10*r_ui) 138 | H[uid] += 10*r_ui 139 | P_i[uid] = 1 140 | 141 | matrix_g1 = np.zeros((self.k,self.k)) 142 | matrix_g2 = np.zeros((self.k,self.k)) 143 | vector_m1 = np.zeros(self.k) 144 | vector_m2 = np.zeros(self.k) 145 | update_w = 0 146 | update_c = 0 147 | 148 | if len(self.SPPMI[item])>0: 149 | for context in self.SPPMI[item]: 150 | cid = self.data.getId(context, self.recType) 151 | gamma = self.G[cid] 152 | beta = self.Y[cid] 153 | matrix_g1 += gamma.reshape(self.k,1).dot(gamma.reshape(1,self.k)) 154 | vector_m1 += (self.SPPMI[item][context]-self.w[iid]-self.c[cid])*gamma 155 | 156 | matrix_g2 += beta.reshape(self.k,1).dot(beta.reshape(1,self.k)) 157 | vector_m2 += (self.SPPMI[item][context] - self.w[cid]-self.c[iid]) * beta 158 | 159 | update_w += self.SPPMI[item][context]-self.Y[iid].dot(gamma)-self.c[cid] 160 | update_c += self.SPPMI[item][context]-beta.dot(self.G[iid])-self.w[cid] 161 | 162 | C_i = coo_matrix((val, (pos, pos)), shape=(self.num_users, self.num_users)) 163 | A = (XtX + np.dot(self.X.T, C_i.dot(self.X)) + self.regU * np.eye(self.k) + matrix_g1) 164 | self.Y[iid] = np.dot(np.linalg.inv(A), (self.X.T * H).dot(P_i) + vector_m1) 165 | if len(self.SPPMI[item]) > 0: 166 | self.G[iid] = np.dot(np.linalg.inv(matrix_g2+self.regR * np.eye(self.k)),vector_m2) 167 | self.w[iid] = update_w/len(self.SPPMI[item]) 168 | self.c[iid] = update_c/len(self.SPPMI[item]) 169 | 170 | # self.loss += (self.X * self.X).sum() + (self.Y * self.Y).sum() 171 | iteration += 1 172 | print('iteration:', iteration, 'loss:', self.loss) 173 | # if self.isConverged(iteration): 174 | # break 175 | 176 | def predictForRanking(self,u): 177 | 'invoked to rank all the items for the user' 178 | u = self.data.getId(u,'user') 179 | return self.Y.dot(self.X[u]) 180 | -------------------------------------------------------------------------------- /recommender/advanced/NeuMF.py: -------------------------------------------------------------------------------- 1 | #coding:utf8 2 | from base.DeepRecommender import DeepRecommender 3 | import numpy as np 4 | from random import randint 5 | 6 | try: 7 | import tensorflow as tf 8 | except ImportError: 9 | print('This method can only run on tensorflow!') 10 | exit(-1) 11 | from tensorflow 
import set_random_seed 12 | set_random_seed(2) 13 | tf.reset_default_graph() 14 | 15 | class NeuMF(DeepRecommender): 16 | 17 | def __init__(self,conf,trainingSet=None,testSet=None,fold='[1]'): 18 | super(NeuMF, self).__init__(conf,trainingSet,testSet,fold) 19 | 20 | def next_batch(self): 21 | batch_idx = np.random.randint(self.train_size, size=self.batch_size) 22 | users = [self.data.trainingData[idx]['user'] for idx in batch_idx] 23 | items = [self.data.trainingData[idx]['track'] for idx in batch_idx] 24 | user_idx,item_idx=[],[] 25 | y = [] 26 | for i,user in enumerate(users): 27 | user_idx.append(self.data.getId(user, 'user')) 28 | item_idx.append(self.data.getId(items[i], 'track')) 29 | y.append(1) 30 | #According to the paper, we sampled four negative instances per positive instance 31 | for instance in range(4): 32 | item_j = randint(0, self.n - 1) 33 | while self.data.id2name[item_j] in self.data.userRecord[user]: 34 | item_j = randint(0, self.n - 1) 35 | user_idx.append(self.data.getId(user, 'user')) 36 | item_idx.append(item_j) 37 | y.append(0) 38 | return user_idx,item_idx,y 39 | 40 | 41 | def initModel(self): 42 | super(NeuMF, self).initModel() 43 | 44 | # parameters used are consistent with default settings in the original paper 45 | mlp_regularizer = tf.contrib.layers.l2_regularizer(scale=0.005) 46 | mf_regularizer = tf.contrib.layers.l2_regularizer(scale=0.005) 47 | initializer = tf.contrib.layers.xavier_initializer() 48 | with tf.variable_scope("latent_factors",reuse=tf.AUTO_REUSE): 49 | self.PG = tf.get_variable(name='PG',initializer=initializer([self.m, self.k]),regularizer=mlp_regularizer) 50 | self.QG = tf.get_variable(name='QG',initializer=initializer([self.n, self.k]),regularizer=mlp_regularizer) 51 | 52 | self.PM = tf.get_variable(name='PM', initializer=initializer([self.m, self.k]), regularizer=mf_regularizer) 53 | self.QM = tf.get_variable(name='QM', initializer=initializer([self.n, self.k]), regularizer=mf_regularizer) 54 | 55 | with tf.name_scope("input"): 56 | self.r = tf.placeholder(tf.float32, [None], name="rating") 57 | self.u_idx = tf.placeholder(tf.int32, [None], name="u_idx") 58 | self.i_idx = tf.placeholder(tf.int32, [None], name="i_idx") 59 | self.UG_embedding = tf.nn.embedding_lookup(self.PG, self.u_idx) 60 | self.IG_embedding = tf.nn.embedding_lookup(self.QG, self.i_idx) 61 | self.UM_embedding = tf.nn.embedding_lookup(self.PM, self.u_idx) 62 | self.IM_embedding = tf.nn.embedding_lookup(self.QM, self.i_idx) 63 | 64 | # Generic Matrix Factorization 65 | with tf.variable_scope("mf_output",reuse=tf.AUTO_REUSE): 66 | self.GMF_Layer = tf.multiply(self.UG_embedding,self.IG_embedding) 67 | self.h_mf = tf.get_variable(name='mf_out', initializer=initializer([self.k]), regularizer=mf_regularizer) 68 | 69 | # MLP 70 | with tf.variable_scope("mlp_params",reuse=tf.AUTO_REUSE): 71 | MLP_W1 = tf.get_variable(initializer=initializer([self.k*2, self.k*2]), name='W1',regularizer=mlp_regularizer) 72 | MLP_b1 = tf.get_variable(initializer=tf.zeros(shape=[self.k*2]), name='b1',regularizer=mlp_regularizer) 73 | self.h_out = tf.nn.relu(tf.add(tf.matmul(tf.concat([self.UM_embedding,self.IM_embedding], 1), MLP_W1), MLP_b1)) 74 | 75 | MLP_W2 = tf.get_variable(initializer=initializer([self.k*2, self.k]), name='W2',regularizer=mlp_regularizer) 76 | MLP_b2 = tf.get_variable(initializer=tf.zeros(shape=[self.k]), name='b2',regularizer=mlp_regularizer) 77 | self.h_out = tf.nn.relu(tf.add(tf.matmul(self.h_out,MLP_W2), MLP_b2)) 78 | 79 | MLP_W3 = 
tf.get_variable(initializer=initializer([self.k, self.k]), name='W3',regularizer=mlp_regularizer) 80 | MLP_b3 = tf.get_variable(initializer=tf.zeros(shape=[self.k]), name='b3',regularizer=mlp_regularizer) 81 | self.MLP_Layer = tf.nn.relu(tf.add(tf.matmul(self.h_out,MLP_W3), MLP_b3)) 82 | self.h_mlp = tf.get_variable(name='mlp_out', initializer=initializer([self.k]), regularizer=mlp_regularizer) 83 | 84 | reg_variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) 85 | mlp_reg = tf.contrib.layers.apply_regularization(mlp_regularizer, reg_variables) 86 | mf_reg = tf.contrib.layers.apply_regularization(mf_regularizer, reg_variables) 87 | 88 | #single inference 89 | #GMF 90 | self.y_mf = tf.reduce_sum(tf.multiply(self.GMF_Layer,self.h_mf),1) 91 | self.y_mf = tf.sigmoid(self.y_mf) 92 | self.mf_loss = self.r * tf.log(self.y_mf) + (1 - self.r) * tf.log(1 - self.y_mf) 93 | self.mf_loss = tf.subtract(self.mf_loss,mf_reg) 94 | self.mf_loss = -tf.reduce_sum(self.mf_loss) 95 | self.mf_optimizer = tf.train.AdamOptimizer(self.lRate).minimize(self.mf_loss) 96 | #MLP 97 | self.y_mlp = tf.reduce_sum(tf.multiply(self.MLP_Layer,self.h_mlp),1) 98 | self.y_mlp = tf.sigmoid(self.y_mlp) 99 | self.mlp_loss = self.r * tf.log(self.y_mlp) + (1 - self.r) * tf.log(1 - self.y_mlp) 100 | self.mlp_loss = tf.subtract(self.mlp_loss, mlp_reg) 101 | self.mlp_loss = -tf.reduce_sum(self.mlp_loss) 102 | self.mlp_optimizer = tf.train.AdamOptimizer(self.lRate).minimize(self.mlp_loss) 103 | #fusion 104 | self.NeuMF_Layer = tf.concat([self.GMF_Layer,self.MLP_Layer], 1) 105 | self.h_NeuMF = tf.concat([0.5*self.h_mf,0.5*self.h_mlp], 0) 106 | self.y_neu = tf.reduce_sum(tf.multiply(self.NeuMF_Layer, self.h_NeuMF), 1) 107 | self.y_neu = tf.sigmoid(self.y_neu) 108 | self.neu_loss = self.r * tf.log(self.y_neu) + (1 - self.r) * tf.log(1 - self.y_neu) 109 | self.neu_loss = tf.subtract(self.neu_loss, mlp_reg) 110 | self.neu_loss = tf.subtract(self.neu_loss, mf_reg) 111 | self.neu_loss = -tf.reduce_sum(self.neu_loss) 112 | ###it seems Adam is better than SGD here... 113 | self.neu_optimizer = tf.train.AdamOptimizer(self.lRate).minimize(self.neu_loss) 114 | 115 | def buildModel(self): 116 | 117 | init = tf.global_variables_initializer() 118 | self.sess.run(init) 119 | 120 | print('pretraining... (GMF)') 121 | for epoch in range(self.maxIter): 122 | user_idx, item_idx, r = self.next_batch() 123 | 124 | _, loss,y_mf = self.sess.run([self.mf_optimizer, self.mf_loss,self.y_mf], 125 | feed_dict={self.u_idx: user_idx, self.i_idx: item_idx, self.r: r}) 126 | print('iteration:', epoch, 'loss:', loss) 127 | 128 | print('pretraining... (MLP)') 129 | for epoch in range(self.maxIter): 130 | user_idx, item_idx, r = self.next_batch() 131 | _, loss, y_mlp = self.sess.run([self.mlp_optimizer, self.mlp_loss, self.y_mlp], 132 | feed_dict={self.u_idx: user_idx, self.i_idx: item_idx, self.r: r}) 133 | print('iteration:', epoch, 'loss:', loss) 134 | 135 | print('training... 
(NeuMF)') 136 | for epoch in range(self.maxIter): 137 | user_idx, item_idx, r = self.next_batch() 138 | _, loss, y_neu = self.sess.run([self.neu_optimizer, self.neu_loss, self.y_neu], 139 | feed_dict={self.u_idx: user_idx, self.i_idx: item_idx, self.r: r}) 140 | print('iteration:', epoch, 'loss:', loss) 141 | 142 | 143 | def predict_mlp(self,uid): 144 | user_idx = [uid]*self.n 145 | y_mlp = self.sess.run([self.y_mlp],feed_dict={self.u_idx: user_idx, self.i_idx: range(self.n)}) 146 | return y_mlp[0] 147 | 148 | def predict_mf(self,uid): 149 | user_idx = [uid]*self.n 150 | y_mf = self.sess.run([self.y_mf],feed_dict={self.u_idx: user_idx, self.i_idx: range(self.n)}) 151 | return y_mf[0] 152 | 153 | def predict_neu(self,uid): 154 | user_idx = [uid]*self.n 155 | y_neu = self.sess.run([self.y_neu],feed_dict={self.u_idx: user_idx, self.i_idx: range(self.n)}) 156 | return y_neu[0] 157 | 158 | def predictForRanking(self, u): 159 | 'invoked to rank all the items for the user' 160 | if self.data.containsUser(u): 161 | u = self.data.user[u] 162 | return self.predict_neu(u) 163 | else: 164 | return [self.data.globalMean] * len(self.data.item) 165 | 166 | -------------------------------------------------------------------------------- /recommender/advanced/CUNE.py: -------------------------------------------------------------------------------- 1 | from base.IterativeRecommender import IterativeRecommender 2 | from tool import config 3 | from random import randint 4 | from random import shuffle,choice 5 | from collections import defaultdict 6 | import numpy as np 7 | from tool.qmath import sigmoid,cosine 8 | from math import log 9 | import gensim.models.word2vec as w2v 10 | 11 | class CUNE(IterativeRecommender): 12 | def __init__(self,conf,trainingSet=None,testSet=None,fold='[1]'): 13 | super(CUNE, self).__init__(conf,trainingSet,testSet,fold) 14 | 15 | def readConfiguration(self): 16 | super(CUNE, self).readConfiguration() 17 | options = config.LineConfig(self.config['CUNE']) 18 | self.walkCount = int(options['-T']) 19 | self.walkLength = int(options['-L']) 20 | self.walkDim = int(options['-l']) 21 | self.winSize = int(options['-w']) 22 | self.topK = int(options['-k']) 23 | self.s = float(options['-s']) 24 | self.epoch = int(options['-ep']) 25 | 26 | def printAlgorConfig(self): 27 | super(CUNE, self).printAlgorConfig() 28 | print ('Specified Arguments of', self.config['recommender'] + ':') 29 | print ('Walks count per user', self.walkCount) 30 | print ('Length of each walk', self.walkLength) 31 | print ('Dimension of user embedding', self.walkDim) 32 | print ('='*80) 33 | 34 | def buildModel(self): 35 | print ('Kind Note: This method will probably take much time.') 36 | #build C-U-NET 37 | print ('Building collaborative user network...') 38 | 39 | userListen = defaultdict(dict) 40 | for user in self.data.userRecord: 41 | for item in self.data.userRecord[user]: 42 | userListen[user][item[self.recType]] = 1 43 | self.CUNet = defaultdict(list) 44 | 45 | for user1 in userListen: 46 | s1 = set(userListen[user1].keys()) 47 | for user2 in userListen: 48 | if user1 != user2: 49 | s2 = set(userListen[user2].keys()) 50 | weight = len(s1.intersection(s2)) 51 | if weight > 0: 52 | self.CUNet[user1]+=[user2]*weight 53 | 54 | print ('Generating random deep walks...') 55 | self.walks = [] 56 | self.visited = defaultdict(dict) 57 | for user in self.CUNet: 58 | for t in range(self.walkCount): 59 | path = [user] 60 | lastNode = user 61 | for i in range(1,self.walkLength): 62 | nextNode = choice(self.CUNet[lastNode]) 63 
| count=0 64 | while(nextNode in self.visited[lastNode]): 65 | nextNode = choice(self.CUNet[lastNode]) 66 | #break infinite loop 67 | count+=1 68 | if count==10: 69 | break 70 | path.append(nextNode) 71 | self.visited[user][nextNode] = 1 72 | lastNode = nextNode 73 | self.walks.append(path) 74 | shuffle(self.walks) 75 | 76 | #Training get top-k friends 77 | print ('Generating user embedding...') 78 | model = w2v.Word2Vec(self.walks, size=self.walkDim, window=self.winSize, min_count=0, iter=self.epoch) 79 | print ('User embedding generated.') 80 | 81 | print ('Constructing similarity matrix...') 82 | self.W = np.random.rand(self.data.getSize('user'), self.k) / 10 # global user preference 83 | self.topKSim = {} 84 | i = 0 85 | for user in self.CUNet: 86 | u = self.data.getId(user,'user') 87 | self.W[u] = model.wv[user] 88 | for user1 in self.CUNet: 89 | sims = [] 90 | u1 = self.data.getId(user1,'user') 91 | for user2 in self.CUNet: 92 | if user1 != user2: 93 | u2 = self.data.getId(user2,'user') 94 | sims.append((user2,cosine(self.W[u1],self.W[u2]))) 95 | self.topKSim[user1] = sorted(sims, key=lambda d: d[1], reverse=True)[:self.topK] 96 | i += 1 97 | if i % 200 == 0: 98 | print ('progress:', i, '/', len(self.CUNet)) 99 | print ('Similarity matrix finished.') 100 | #print self.topKSim 101 | 102 | #prepare Pu set, IPu set, and Nu set 103 | print ('Preparing item sets...') 104 | self.PositiveSet = defaultdict(list) 105 | self.IPositiveSet = defaultdict(list) 106 | #self.NegativeSet = defaultdict(list) 107 | for user in self.data.userRecord: 108 | for event in self.data.userRecord[user]: 109 | self.PositiveSet[user].append(event[self.recType]) 110 | 111 | 112 | for user in self.CUNet: 113 | for friend in self.topKSim[user]: 114 | self.IPositiveSet[user] += list(set(self.PositiveSet[friend[0]]).difference(self.PositiveSet[user])) 115 | 116 | 117 | 118 | print ('Training...') 119 | iteration = 0 120 | while iteration < self.maxIter: 121 | self.loss = 0 122 | itemList = list(self.data.name2id[self.recType].keys()) 123 | for user in self.PositiveSet: 124 | u = self.data.getId(user,'user') 125 | 126 | for item in self.PositiveSet[user]: 127 | i = self.data.getId(item,self.recType) 128 | for n in range(3): 129 | if len(self.IPositiveSet[user]) > 0: 130 | item_k = choice(self.IPositiveSet[user]) 131 | 132 | k = self.data.getId(item_k,self.recType) 133 | self.P[u] += self.lRate * (1 - sigmoid(self.P[u].dot(self.Q[i]) - self.P[u].dot(self.Q[k]))) * ( 134 | self.Q[i] - self.Q[k]) 135 | self.Q[i] += self.lRate * (1 - sigmoid(self.P[u].dot(self.Q[i]) - self.P[u].dot(self.Q[k]))) * \ 136 | self.P[u] 137 | self.Q[k] -= self.lRate * (1 - sigmoid(self.P[u].dot(self.Q[i]) - self.P[u].dot(self.Q[k]))) * \ 138 | self.P[u] 139 | 140 | item_j = '' 141 | # if len(self.NegativeSet[user])>0: 142 | # item_j = choice(self.NegativeSet[user]) 143 | # else: 144 | item_j = choice(itemList) 145 | while (user in self.data.listened[self.recType][item_j]): 146 | item_j = choice(itemList) 147 | j = self.data.getId(item_j,self.recType) 148 | self.P[u] += (1 / self.s) * self.lRate * ( 149 | 1 - sigmoid((1 / self.s) * (self.P[u].dot(self.Q[k]) - self.P[u].dot(self.Q[j])))) * ( 150 | self.Q[k] - self.Q[j]) 151 | self.Q[k] += (1 / self.s) * self.lRate * ( 152 | 1 - sigmoid((1 / self.s) * (self.P[u].dot(self.Q[k]) - self.P[u].dot(self.Q[j])))) * self.P[u] 153 | self.Q[j] -= (1 / self.s) * self.lRate * ( 154 | 1 - sigmoid((1 / self.s) * (self.P[u].dot(self.Q[k]) - self.P[u].dot(self.Q[j])))) * self.P[u] 155 | 156 | self.P[u] -= 
self.lRate * self.regU * self.P[u] 157 | self.Q[i] -= self.lRate * self.regI * self.Q[i] 158 | self.Q[j] -= self.lRate * self.regI * self.Q[j] 159 | self.Q[k] -= self.lRate * self.regI * self.Q[k] 160 | 161 | self.loss += -log(sigmoid(self.P[u].dot(self.Q[i]) - self.P[u].dot(self.Q[k]))) - \ 162 | log(sigmoid((1 / self.s) * (self.P[u].dot(self.Q[k]) - self.P[u].dot(self.Q[j])))) 163 | else: 164 | item_j = choice(itemList) 165 | while (user in self.data.listened[self.recType][item_j]): 166 | item_j = choice(itemList) 167 | j = self.data.getId(item_j,self.recType) 168 | self.P[u] += self.lRate * (1 - sigmoid(self.P[u].dot(self.Q[i]) - self.P[u].dot(self.Q[j]))) * (self.Q[i] - self.Q[j]) 169 | self.Q[i] += self.lRate * (1 - sigmoid(self.P[u].dot(self.Q[i]) - self.P[u].dot(self.Q[j]))) * self.P[u] 170 | self.Q[j] -= self.lRate * (1 - sigmoid(self.P[u].dot(self.Q[i]) - self.P[u].dot(self.Q[j]))) * self.P[u] 171 | 172 | self.loss += -log(sigmoid(self.P[u].dot(self.Q[i]) - self.P[u].dot(self.Q[j]))) 173 | 174 | 175 | self.loss += self.regU*(self.P*self.P).sum() + self.regI*(self.Q*self.Q).sum() 176 | iteration += 1 177 | if self.isConverged(iteration): 178 | break 179 | 180 | 181 | def predict(self, u): 182 | 'invoked to rank all the items for the user' 183 | u = self.data.getId(u, 'user') 184 | return self.Q.dot(self.P[u]) 185 | -------------------------------------------------------------------------------- /recommender/advanced/DMF.py: -------------------------------------------------------------------------------- 1 | #coding:utf8 2 | from base.DeepRecommender import DeepRecommender 3 | import math 4 | import numpy as np 5 | from tool import qmath 6 | from tool import config 7 | from random import choice 8 | from random import shuffle 9 | from tool.qmath import sigmoid 10 | from math import log 11 | from collections import defaultdict 12 | from scipy.sparse import * 13 | from scipy import * 14 | import gensim.models.word2vec as w2v 15 | from tool.qmath import cosine 16 | from sklearn.manifold import TSNE 17 | from time import time 18 | import matplotlib.pyplot as plt 19 | from sklearn import manifold, datasets 20 | import gc 21 | import pickle 22 | import tensorflow as tf 23 | 24 | from tensorflow import set_random_seed 25 | set_random_seed(2) 26 | 27 | ### implement by Xue et al., Deep Matrix Factorization Models for Recommender Systems, IJCAI 2017. 
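# Annotation (sketch, not part of the original file): the structure implemented by the class below.
#   - user tower: a user's listening-count row over all n tracks -> ReLU layers (n -> 256 -> 512)
#   - item tower: a track's listening-count column over all m users -> ReLU layers (m -> 256 -> 512)
#   - prediction: cosine similarity of the two tower outputs, clipped from below at 1e-6
#   - loss: cross-entropy between that score and the observed listen count (0 for the sampled
#     negatives), plus L2 regularization on all tower weights and biases, scaled by regU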
28 | 29 | class DMF(DeepRecommender): 30 | def __init__(self,conf,trainingSet=None,testSet=None,fold='[1]'): 31 | super(DMF, self).__init__(conf,trainingSet,testSet,fold) 32 | 33 | def initModel(self): 34 | super(DMF, self).initModel() 35 | gc.collect() 36 | 37 | self.negative_sp = 5 38 | 39 | n_input_u = self.n 40 | n_input_i = self.m 41 | self.n_hidden_u=[256,512] 42 | self.n_hidden_i=[256,512] 43 | self.input_u = tf.placeholder("float", [None, n_input_u]) 44 | self.input_i = tf.placeholder("float", [None, n_input_i]) 45 | 46 | self.userListen = defaultdict(dict) 47 | for user in self.data.userRecord: 48 | for item in self.data.userRecord[user]: 49 | if item[self.recType] not in self.userListen[user]: 50 | self.userListen[user][item[self.recType]] = 1 51 | self.userListen[user][item[self.recType]] += 1 52 | print ('training...') 53 | 54 | def readConfiguration(self): 55 | super(DMF, self).readConfiguration() 56 | options = config.LineConfig(self.config['DMF']) 57 | self.alpha = float(options['-alpha']) 58 | self.topK = int(options['-k']) 59 | self.negCount = int(options['-neg']) 60 | 61 | def buildModel(self): 62 | ###################### 构建神经网络非线性映射 #################################### 63 | print ('the tensorflow...') 64 | initializer = tf.truncated_normal#tf.contrib.layers.xavier_initializer() 65 | #user net 66 | user_W1 = tf.Variable(initializer([self.n, self.n_hidden_u[0]],stddev=0.01)) 67 | self.user_out = tf.nn.relu(tf.matmul(self.input_u, user_W1)) 68 | self.regLoss = tf.nn.l2_loss(user_W1) 69 | for i in range(1, len(self.n_hidden_u)): 70 | W = tf.Variable(initializer([self.n_hidden_u[i-1], self.n_hidden_u[i]],stddev=0.01)) 71 | b = tf.Variable(initializer([self.n_hidden_u[i]],stddev=0.01)) 72 | self.regLoss = tf.add(self.regLoss, tf.nn.l2_loss(W)) 73 | self.regLoss = tf.add(self.regLoss, tf.nn.l2_loss(b)) 74 | self.user_out = tf.nn.relu(tf.add(tf.matmul(self.user_out, W), b)) 75 | 76 | #item net 77 | item_W1 = tf.Variable(initializer([self.m, self.n_hidden_i[0]],stddev=0.01)) 78 | self.item_out = tf.nn.relu(tf.matmul(self.input_i, item_W1)) 79 | self.regLoss = tf.add(self.regLoss, tf.nn.l2_loss(item_W1)) 80 | for i in range(1, len(self.n_hidden_i)): 81 | W = tf.Variable(initializer([self.n_hidden_i[i-1], self.n_hidden_i[i]],stddev=0.01)) 82 | b = tf.Variable(initializer([self.n_hidden_i[i]],stddev=0.01)) 83 | self.regLoss = tf.add(self.regLoss, tf.nn.l2_loss(W)) 84 | self.regLoss = tf.add(self.regLoss, tf.nn.l2_loss(b)) 85 | self.item_out = tf.nn.relu(tf.add(tf.matmul(self.item_out, W), b)) 86 | 87 | norm_user_output = tf.sqrt(tf.reduce_sum(tf.square(self.user_out), axis=1)) 88 | norm_item_output = tf.sqrt(tf.reduce_sum(tf.square(self.item_out), axis=1)) 89 | 90 | self.y_ = tf.reduce_sum(tf.multiply(self.user_out, self.item_out), axis=1) / ( 91 | norm_item_output * norm_user_output) 92 | self.y_ = tf.maximum(1e-6, self.y_) 93 | 94 | self.loss = self.r*tf.log(self.y_) + (1 - self.r) * tf.log(1 - self.y_) 95 | #tf.nn.sigmoid_cross_entropy_with_logits(logits=self.y_,labels=self.r) 96 | #self.loss = tf.nn.l2_loss(tf.subtract(self.y_,self.r)) 97 | self.loss = -tf.reduce_sum(self.loss) 98 | reg_lambda = tf.constant(self.regU, dtype=tf.float32) 99 | self.regLoss = tf.multiply(reg_lambda,self.regLoss) 100 | self.loss = tf.add(self.loss,self.regLoss) 101 | 102 | optimizer = tf.train.AdamOptimizer(self.lRate).minimize(self.loss) 103 | 104 | self.U = np.zeros((self.m, self.n_hidden_u[-1])) 105 | self.V = np.zeros((self.n, self.n_hidden_i[-1])) 106 | 107 | self.sess = tf.Session() 108 | init 
= tf.global_variables_initializer() 109 | self.sess.run(init) 110 | 111 | total_batch = int(len(self.data.trainingData)/ self.batch_size) 112 | for epoch in range(self.maxIter): 113 | shuffle(self.data.trainingData) 114 | for i in range(total_batch): 115 | users, items, ratings, u_idx, v_idx = self.next_batch(i) 116 | 117 | shuffle_idx = np.random.permutation(range(len(users))) 118 | users = users[shuffle_idx] 119 | items = items[shuffle_idx] 120 | ratings = ratings[shuffle_idx] 121 | u_idx = u_idx[shuffle_idx] 122 | v_idx = v_idx[shuffle_idx] 123 | 124 | _,loss= self.sess.run([optimizer, self.loss], feed_dict={self.input_u:users, self.input_i:items, self.r:ratings}) 125 | #print (self.foldInfo, "Epoch:", '%03d' % (epoch + 1), "Batch:", '%03d' % (i + 1), "loss=", "{:.9f}".format(loss)) 126 | 127 | U_embedding, V_embedding = self.sess.run([self.user_out, self.item_out], feed_dict={self.input_u:users, self.input_i:items}) 128 | for ue,u in zip(U_embedding,u_idx): 129 | self.U[u] = ue 130 | for ve,v in zip(V_embedding,v_idx): 131 | self.V[v] = ve 132 | self.normalized_V = np.sqrt(np.sum(self.V * self.V, axis=1)) 133 | self.normalized_U = np.sqrt(np.sum(self.U * self.U, axis=1)) 134 | self.ranking_performance() 135 | print("Optimization Finished!") 136 | 137 | # return 1xm 的向量 138 | def row(self, u): 139 | user = self.data.id2name['user'][u] 140 | k = self.userListen[user].keys() 141 | v = self.userListen[user].values() 142 | vec = np.zeros(self.n) 143 | for pair in zip(k,v): 144 | iid = self.data.getId(pair[0], 'track') 145 | vec[iid] = pair[1] 146 | return vec 147 | 148 | # return 1xn 的向量 149 | def col(self,i): 150 | item = self.data.id2name['track'][i] 151 | k = self.data.listened['track'][item].keys() 152 | v = self.data.listened['track'][item].values() 153 | vec = np.zeros(self.m) 154 | for pair in zip(k, v): 155 | uid = self.data.getId(pair[0], 'user') 156 | vec[uid] = pair[1] 157 | return vec 158 | 159 | # 返回一个1xk的特征向量 160 | def col_Shallow(self,i): 161 | return self.Y[i] 162 | 163 | def next_batch(self,i): 164 | rows = np.zeros(((self.negative_sp+1)*self.batch_size,self.n)) 165 | cols = np.zeros(((self.negative_sp+1)*self.batch_size,self.m)) 166 | #cols = np.zeros(((self.negative_sp+1)*self.batch_size,self.m)) 167 | batch_idx = range(self.batch_size*i, self.batch_size*(i+1)) 168 | 169 | u_idx = [] 170 | v_idx = [] 171 | ratings = [] 172 | for idx in batch_idx: 173 | user = self.data.trainingData[idx]['user'] 174 | item = self.data.trainingData[idx]['track'] 175 | rating = 0 176 | if item in self.userListen[user]: 177 | rating = self.userListen[user][item] 178 | 179 | u_idx.append(self.data.getId(user, 'user')) 180 | v_idx.append(self.data.getId(item, 'track')) 181 | ratings.append(rating) 182 | 183 | for i,u in enumerate(u_idx): 184 | rows[i] = self.row(u) 185 | 186 | for i,t in enumerate(v_idx): 187 | cols[i] = self.col(t) 188 | #cols[i] = self.col_Shallow(t) 189 | 190 | #negative sample 191 | for i in range(self.negative_sp*self.batch_size): 192 | u = np.random.randint(self.m) 193 | v = np.random.randint(self.n) 194 | while self.data.id2name['track'][v] in self.userListen[self.data.id2name['user'][u]]: 195 | u = np.random.randint(self.m) 196 | v = np.random.randint(self.n) 197 | 198 | rows[self.batch_size-1+i]=self.row(u) 199 | cols[self.batch_size-1+i]=self.col(v) 200 | #cols[self.batch_size-1+i] = self.col_Shallow(v) 201 | u_idx.append(u) 202 | v_idx.append(v) 203 | ratings.append(0) 204 | return rows,cols,np.array(ratings),np.array(u_idx),np.array(v_idx) 205 | 206 | 207 | def 
predict(self, u): 208 | 'invoked to rank all the items for the user' 209 | uid = self.data.getId(u,'user') 210 | return np.divide(self.V.dot(self.U[uid]), self.normalized_U[uid]*self.normalized_V) 211 | -------------------------------------------------------------------------------- /recommender/advanced/Song2vec.py: -------------------------------------------------------------------------------- 1 | #coding:utf8 2 | from base.IterativeRecommender import IterativeRecommender 3 | import math 4 | import numpy as np 5 | from tool import qmath 6 | from tool import config 7 | from random import choice 8 | from tool.qmath import sigmoid 9 | from math import log 10 | from collections import defaultdict 11 | from scipy.sparse import * 12 | from scipy import * 13 | import gensim.models.word2vec as w2v 14 | from tool.qmath import cosine 15 | class Song2vec(IterativeRecommender): 16 | def __init__(self,conf,trainingSet=None,testSet=None,fold='[1]'): 17 | super(Song2vec, self).__init__(conf,trainingSet,testSet,fold) 18 | 19 | def initModel(self): 20 | super(Song2vec, self).initModel() 21 | self.X=self.P*10 22 | self.Y=self.Q*10 23 | self.m = self.data.getSize('user') 24 | self.n = self.data.getSize(self.recType) 25 | self.Bu = np.random.rand(self.m)/10 # bias value of user 26 | self.Bi = np.random.rand(self.n)/10 # bias value of item 27 | 28 | def readConfiguration(self): 29 | super(Song2vec, self).readConfiguration() 30 | options = config.LineConfig(self.config['Song2vec']) 31 | self.alpha = float(options['-alpha']) 32 | self.topK = int(options['-k']) 33 | 34 | def buildModel(self): 35 | self.T = np.random.rand(self.data.getSize('track'),self.k) 36 | sentences = [] 37 | self.listenTrack = set() 38 | self.user = defaultdict(list) 39 | for user in self.data.userRecord: 40 | playList = [] 41 | if len(self.data.userRecord[user]) > 10: 42 | self.user[user] = self.data.userRecord[user] 43 | for item in self.data.userRecord[user]: 44 | playList.append(item['track']) 45 | self.listenTrack.add(item['track']) 46 | sentences.append(playList) 47 | model = w2v.Word2Vec(sentences, size=self.k, window=5, min_count=0, iter=10) 48 | for track in self.listenTrack: 49 | tid = self.data.getId(track, 'track') 50 | self.T[tid] = model.wv[track] 51 | print ('song embedding generated.') 52 | 53 | print ('Constructing similarity matrix...') 54 | i = 0 55 | self.topKSim = {} 56 | for track1 in self.listenTrack: 57 | tSim = [] 58 | i += 1 59 | if i % 200 == 0: 60 | print (i, '/', len(self.listenTrack)) 61 | vec1 = self.T[self.data.getId(track1, 'track')] 62 | for track2 in self.listenTrack: 63 | if track1 != track2: 64 | vec2 = self.T[self.data.getId(track2, 'track')] 65 | sim = cosine(vec1, vec2) 66 | tSim.append((track2, sim)) 67 | 68 | self.topKSim[track1] = sorted(tSim, key=lambda d: d[1], reverse=True)[:self.topK] 69 | 70 | userListen = defaultdict(dict) 71 | for user in self.user: 72 | for item in self.user[user]: 73 | if item[self.recType] not in userListen[user]: 74 | userListen[user][item[self.recType]] = 0 75 | userListen[user][item[self.recType]] += 1 76 | print ('training...') 77 | ''' 78 | iteration = 0 79 | itemList = list(self.data.name2id[self.recType].keys()) 80 | while iteration < self.maxIter: 81 | self.loss = 0 82 | 83 | YtY = self.Y.T.dot(self.Y) 84 | I = np.ones(self.n) 85 | for user in self.data.name2id['user']: 86 | #C_u = np.ones(self.data.getSize(self.recType)) 87 | H = np.ones(self.n) 88 | val = [] 89 | pos = [] 90 | P_u = np.zeros(self.n) 91 | uid = self.data.getId(user,'user') 92 | for item in 
userListen[user]: 93 | iid = self.data.getId(item,self.recType) 94 | r_ui = userListen[user][item] 95 | pos.append(iid) 96 | val.append(10*r_ui) 97 | H[iid]+=10*r_ui 98 | P_u[iid]=1 99 | error = (P_u[iid]-self.X[uid].dot(self.Y[iid])) 100 | self.loss+=pow(error,2) 101 | #sparse matrix 102 | C_u = coo_matrix((val,(pos,pos)),shape=(self.n,self.n)) 103 | A = (YtY+np.dot(self.Y.T,C_u.dot(self.Y))+self.regU*np.eye(self.k)) 104 | self.X[uid] = np.dot(np.linalg.inv(A),(self.Y.T*H).dot(P_u)) 105 | 106 | 107 | XtX = self.X.T.dot(self.X) 108 | I = np.ones(self.m) 109 | for item in self.data.name2id[self.recType]: 110 | P_i = np.zeros(self.m) 111 | iid = self.data.getId(item, self.recType) 112 | H = np.ones(self.m) 113 | val = [] 114 | pos = [] 115 | for user in self.data.listened[self.recType][item]: 116 | uid = self.data.getId(user, 'user') 117 | r_ui = self.data.listened[self.recType][item][user] 118 | pos.append(uid) 119 | val.append(10*r_ui) 120 | H[uid] += 10*r_ui 121 | P_i[uid] = 1 122 | # sparse matrix 123 | C_i = coo_matrix((val, (pos, pos)),shape=(self.m,self.m)) 124 | A = (XtX+np.dot(self.X.T,C_i.dot(self.X))+self.regU*np.eye(self.k)) 125 | self.Y[iid]=np.dot(np.linalg.inv(A), (self.X.T*H).dot(P_i)) 126 | 127 | for user in self.user: 128 | u = self.data.getId(user,'user') 129 | for item in self.user[user]: 130 | i = self.data.getId(item[self.recType],self.recType) 131 | for ind in range(3): 132 | item_j = choice(itemList) 133 | while (item_j in userListen[user]): 134 | item_j = choice(itemList) 135 | j = self.data.getId(item_j,self.recType) 136 | s = sigmoid(self.X[u].dot(self.Y[i]) - self.X[u].dot(self.Y[j])) 137 | self.X[u] += self.lRate * (1 - s) * (self.Y[i] - self.Y[j]) 138 | self.Y[i] += self.lRate * (1 - s) * self.X[u] 139 | self.Y[j] -= self.lRate * (1 - s) * self.X[u] 140 | 141 | self.X[u] -= self.lRate * self.regU * self.X[u] 142 | self.Y[i] -= self.lRate * self.regI * self.Y[i] 143 | self.Y[j] -= self.lRate * self.regI * self.Y[j] 144 | self.loss += -log(s) 145 | 146 | for t1 in self.topKSim: 147 | tid1 = self.data.getId(t1,'track') 148 | for t2 in self.topKSim[t1]: 149 | tid2 = self.data.getId(t2[0],'track') 150 | sim = t2[1] 151 | error = (sim-self.Y[tid1].dot(self.Y[tid2])) 152 | self.loss+=error**2 153 | self.Y[tid1]+=0.5*self.alpha*self.lRate*(error)*self.Y[tid2] 154 | self.Y[tid2]+=0.5*self.alpha*self.lRate*(error)*self.Y[tid1] 155 | 156 | #self.loss += (self.X * self.X).sum() + (self.Y * self.Y).sum() 157 | iteration += 1 158 | print ('iteration:',iteration,'loss:',self.loss) 159 | # if self.isConverged(iteration): 160 | # break 161 | ''' 162 | iteration = 0 163 | while iteration < self.maxIter: 164 | self.loss = 0 165 | for user in self.data.name2id['user']: 166 | u = self.data.getId(user,'user') 167 | bu = self.Bu[u] 168 | for item in userListen[user]: 169 | i = self.data.getId(item, self.recType) 170 | 171 | bi = self.Bi[i] 172 | rating = self.Y[i].dot(self.X[u]) + self.data.globalMean + self.Bu[u] + self.Bi[i] 173 | error = userListen[user][item] - rating 174 | self.loss += error**2 175 | self.X[u] += self.lRate * (error*self.Y[i] - self.regU*self.X[u]) 176 | self.Y[i] += self.lRate * (error*self.X[u] - self.regI*self.Y[i]) 177 | 178 | self.Bu[u] += self.lRate * (error - self.regB * bu) 179 | self.Bi[i] += self.lRate * (error - self.regB * bi) 180 | 181 | for t1 in self.topKSim: 182 | tid1 = self.data.getId(t1,'track') 183 | for t2 in self.topKSim[t1]: 184 | tid2 = self.data.getId(t2[0],'track') 185 | sim = t2[1] 186 | error2 = (sim-self.Y[tid1].dot(self.Y[tid2])) 187 
| self.loss+=error2**2 188 | self.Y[tid1]+=0.5*self.alpha*self.lRate*(error2)*self.Y[tid2] 189 | self.Y[tid2]+=0.5*self.alpha*self.lRate*(error2)*self.Y[tid1] 190 | self.loss += self.regB*(self.Bu * self.Bu).sum() + self.regB*(self.Bi*self.Bi).sum() + (self.X * self.X).sum() + (self.Y * self.Y).sum() 191 | iteration += 1 192 | print ('iteration:',iteration,'loss:',self.loss) 193 | #if self.isConverged(iteration): 194 | # break 195 | 196 | 197 | def predict(self, u): 198 | 'invoked to rank all the items for the user' 199 | u = self.data.getId(u,'user') 200 | # return self.Y.dot(self.X[u]) 201 | return self.Y.dot(self.X[u]) + self.data.globalMean + self.Bu[u] 202 | -------------------------------------------------------------------------------- /base/IterativeRecommender.py: -------------------------------------------------------------------------------- 1 | from base.recommender import Recommender 2 | from tool import config 3 | import numpy as np 4 | from random import shuffle 5 | from tool.file import FileIO 6 | from os.path import abspath 7 | from time import strftime,localtime,time 8 | from evaluation.measure import Measure 9 | 10 | 11 | class IterativeRecommender(Recommender): 12 | def __init__(self,conf,trainingSet=None,testSet=None,fold='[1]'): 13 | super(IterativeRecommender, self).__init__(conf,trainingSet,testSet,fold) 14 | 15 | def readConfiguration(self): 16 | super(IterativeRecommender, self).readConfiguration() 17 | # set the reduced dimension 18 | self.k = int(self.config['num.factors']) 19 | # set maximum iteration 20 | self.maxIter = int(self.config['num.max.iter']) 21 | # set learning rate 22 | learningRate = config.LineConfig(self.config['learnRate']) 23 | self.lRate = float(learningRate['-init']) 24 | self.maxLRate = float(learningRate['-max']) 25 | # regularization parameter 26 | regular = config.LineConfig(self.config['reg.lambda']) 27 | self.regU,self.regI,self.regB= float(regular['-u']),float(regular['-i']),float(regular['-b']) 28 | 29 | def printAlgorConfig(self): 30 | super(IterativeRecommender, self).printAlgorConfig() 31 | print ('Reduced Dimension:',self.k) 32 | print ('Maximum Iteration:',self.maxIter) 33 | print ('Regularization parameter: regU %.3f, regI %.3f, regB %.3f' %(self.regU,self.regI,self.regB)) 34 | print ('='*80) 35 | 36 | def initModel(self): 37 | self.P = np.random.rand(self.data.getSize('user'), self.k).astype(np.float32)/10 # latent user matrix 38 | self.Q = np.random.rand(self.data.getSize(self.recType), self.k).astype(np.float32)/10 # latent item matrix 39 | self.loss, self.lastLoss = 0, 0 40 | 41 | def saveModel(self): 42 | pass 43 | 44 | def loadModel(self): 45 | pass 46 | 47 | def updateLearningRate(self,iter): 48 | if iter > 1: 49 | if abs(self.lastLoss) > abs(self.loss): 50 | self.lRate *= 1.01 51 | else: 52 | self.lRate *= 0.5 53 | 54 | if self.maxLRate > 0 and self.lRate > self.maxLRate: 55 | self.lRate = self.maxLRate 56 | 57 | 58 | def predict(self,u): 59 | u = self.data.getId(u, 'user') 60 | return self.Q.dot(self.P[u]) 61 | 62 | def isConverged(self,iter): 63 | from math import isnan 64 | if isnan(self.loss): 65 | print ('Loss = NaN or Infinity: current settings does not fit the recommender! 
Change the settings and try again!') 66 | exit(-1) 67 | deltaLoss = (self.lastLoss-self.loss) 68 | print ('%s %s iteration %d: loss = %.4f, delta_loss = %.5f learning_Rate = %.5f' %(self.algorName,self.foldInfo,iter,self.loss,deltaLoss,self.lRate)) 69 | #check if converged 70 | cond = abs(deltaLoss) < 1e-3 71 | converged = cond 72 | if not converged: 73 | self.updateLearningRate(iter) 74 | self.lastLoss = self.loss 75 | return converged 76 | 77 | def evalRanking(self): 78 | res = [] # used to contain the text of the result 79 | N = 0 80 | top = self.ranking['-topN'].split(',') 81 | top = [int(num) for num in top] 82 | N = max(top) 83 | 84 | if N > 100 or N < 0: 85 | print ('N can not be larger than 100! It has been reassigned with 10') 86 | N = 10 87 | 88 | res.append('userId: recommendations in (itemId, ranking score) pairs, * means the item matches.\n') 89 | # predict 90 | recList = {} 91 | userCount = len(self.data.testSet) 92 | 93 | for i, user in enumerate(self.data.testSet): 94 | itemSet = {} 95 | line = user + ':' 96 | predictedItems = self.predict(user) 97 | 98 | for id, score in enumerate(predictedItems): 99 | 100 | itemSet[self.data.id2name[self.recType][id]] = score 101 | 102 | for item in self.data.userRecord[user]: 103 | try: 104 | del itemSet[item[self.recType]] 105 | except KeyError: 106 | pass 107 | Nrecommendations = [] 108 | for item in itemSet: 109 | if len(Nrecommendations) < N: 110 | Nrecommendations.append((item, itemSet[item])) 111 | else: 112 | break 113 | 114 | Nrecommendations.sort(key=lambda d: d[1], reverse=True) 115 | recommendations = [item[1] for item in Nrecommendations] 116 | resNames = [item[0] for item in Nrecommendations] 117 | 118 | # itemSet = sorted(itemSet.iteritems(), key=lambda d: d[1], reverse=True) 119 | # if bTopN: 120 | # find the K biggest scores 121 | for item in itemSet: 122 | ind = N 123 | l = 0 124 | r = N - 1 125 | 126 | if recommendations[r] < itemSet[item]: 127 | while True: 128 | 129 | mid = (l + r) // 2 130 | if recommendations[mid] >= itemSet[item]: 131 | l = mid + 1 132 | elif recommendations[mid] < itemSet[item]: 133 | r = mid - 1 134 | else: 135 | ind = mid 136 | break 137 | if r < l: 138 | ind = r 139 | break 140 | # ind = bisect(recommendations, itemSet[item]) 141 | 142 | if ind < N - 1: 143 | recommendations[ind + 1] = itemSet[item] 144 | resNames[ind + 1] = item 145 | recList[user] = resNames 146 | 147 | if i % 100 == 0: 148 | print (self.algorName, self.foldInfo, 'progress:' + str(i) + '/' + str(userCount)) 149 | for item in recList[user]: 150 | line += item 151 | if item in self.data.testSet[user]: 152 | line += '*' 153 | 154 | line += '\n' 155 | res.append(line) 156 | currentTime = strftime("%Y-%m-%d %H-%M-%S", localtime(time())) 157 | # output prediction result 158 | if self.isOutput: 159 | fileName = '' 160 | outDir = self.output['-dir'] 161 | if self.ranking.contains('-topN'): 162 | fileName = self.config['recommender'] + '@' + currentTime + '-top-' + self.ranking['-topN']\ 163 | + 'items' + self.foldInfo + '.txt' 164 | FileIO.writeFile(outDir, fileName, res) 165 | print ('The result has been output to ', abspath(outDir), '.') 166 | # output evaluation result 167 | outDir = self.output['-dir'] 168 | fileName = self.config['recommender'] + '@' + currentTime + '-measure' + self.foldInfo + '.txt' 169 | 170 | self.measure = Measure.rankingMeasure(self.data.testSet, recList,top,self.data.getSize(self.recType)) 171 | 172 | FileIO.writeFile(outDir, fileName, self.measure) 173 | print ('The result of %s %s:\n%s' % 
(self.algorName, self.foldInfo, ''.join(self.measure))) 174 | 175 | def ranking_performance(self): 176 | N = 10 177 | itemcount = 0 178 | recList = {} 179 | testSample = {} 180 | for user in self.data.testSet: 181 | itemcount+=len(self.data.testSet[user]) 182 | if len(testSample) == 300: 183 | break 184 | testSample[user] = self.data.testSet[user] 185 | 186 | for user in testSample: 187 | itemSet = {} 188 | predictedItems = self.predict(user) 189 | for id, rating in enumerate(predictedItems): 190 | itemSet[self.data.id2name['track'][id]] = rating 191 | 192 | ratedList = self.data.testSet[user].keys() 193 | ratingList = self.data.testSet[user].values() 194 | for item in ratedList: 195 | del itemSet[item] 196 | 197 | Nrecommendations = [] 198 | for item in itemSet: 199 | if len(Nrecommendations) < N: 200 | Nrecommendations.append((item, itemSet[item])) 201 | else: 202 | break 203 | 204 | Nrecommendations.sort(key=lambda d: d[1], reverse=True) 205 | recommendations = [item[1] for item in Nrecommendations] 206 | resNames = [item[0] for item in Nrecommendations] 207 | 208 | # find the K biggest scores 209 | for item in itemSet: 210 | ind = N 211 | l = 0 212 | r = N - 1 213 | 214 | if recommendations[r] < itemSet[item]: 215 | while True: 216 | mid = int((l + r) / 2) 217 | if recommendations[mid] >= itemSet[item]: 218 | l = mid + 1 219 | elif recommendations[mid] < itemSet[item]: 220 | r = mid - 1 221 | if r < l: 222 | ind = r 223 | break 224 | # ind = bisect(recommendations, itemSet[item]) 225 | if ind < N - 1: 226 | recommendations[ind + 1] = itemSet[item] 227 | resNames[ind + 1] = item 228 | recList[user] = resNames 229 | measure = Measure.rankingMeasure(testSample, recList, [10], itemcount) 230 | print ('-'*80) 231 | print ('Ranking Performance '+self.foldInfo+' (Top-10 On 300 sampled users)') 232 | for m in measure[1:]: 233 | print (m.strip()) 234 | print ('-'*80) 235 | return measure 236 | 237 | 238 | -------------------------------------------------------------------------------- /data/record.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | from tool.config import Config,LineConfig 4 | from tool.qmath import normalize 5 | from tool.dataSplit import DataSplit 6 | import os.path 7 | from re import split 8 | from collections import defaultdict 9 | import time 10 | import pickle 11 | class Record(object): 12 | 'data access control' 13 | def __init__(self,config,trainingSet,testSet): 14 | self.config = config 15 | self.recordConfig = LineConfig(config['record.setup']) 16 | self.evalConfig = LineConfig(config['evaluation.setup']) 17 | self.name2id = defaultdict(dict) 18 | self.id2name = defaultdict(dict) 19 | self.listened = {} 20 | self.listened['artist']=defaultdict(dict) 21 | self.listened['track']=defaultdict(dict) 22 | self.listened['album']=defaultdict(dict) 23 | self.artist2Album = defaultdict(dict) #key:artist id, value:{album id1:1, album id2:1 ...} 24 | self.album2Track = defaultdict(dict) # 25 | self.artist2Track = defaultdict(dict) # 26 | self.Track2artist = defaultdict(dict) # 27 | self.Track2album = defaultdict(dict) # 28 | self.userRecord = defaultdict(list) #user data in training set. form: {user:[record1,record2]} 29 | self.trackRecord = defaultdict(list) # track data in training set. form: {track:[record1, record2]} 30 | self.testSet = defaultdict(dict) #user data in test set. 
form: {user:{recommenedObject1:1,recommendedObject:1}} 31 | self.recordCount = 0 32 | self.columns = {} 33 | self.globalMean = 0 34 | self.userMeans = {} #used to store the mean values of users's listen tims 35 | self.trackListen = {} 36 | 37 | self.trainingData = trainingSet 38 | 39 | self.computeUserMean() 40 | self.globalAverage() 41 | self.PopTrack = {} 42 | 43 | labels = self.recordConfig['-columns'].split(',') 44 | for col in labels: 45 | label = col.split(':') 46 | self.columns[label[0]] = int(label[1]) 47 | if self.evalConfig.contains('-byTime'): 48 | trainingSet,testSet = self.splitDataByTime(trainingSet) 49 | 50 | self.preprocess(trainingSet,testSet) 51 | 52 | 53 | self.computePop(trainingSet) 54 | 55 | def globalAverage(self): 56 | total = sum(self.userMeans.values()) 57 | if total==0: 58 | self.globalMean = 0 59 | else: 60 | self.globalMean = total/len(self.userMeans) 61 | 62 | def computeUserMean(self): 63 | for user in self.userRecord: 64 | for item in self.userRecord[user]: 65 | userSum += self.listened['track'][item].values() 66 | 67 | self.userMeans[user] = userSum/float(len(self.userRecord[user])) 68 | 69 | ''' 70 | def splitDataByTime(self,dataset): 71 | trainingSet = [] 72 | testSet = [] 73 | listened = {} 74 | ratio = float(self.evalConfig['-byTime']) 75 | records = defaultdict(list) 76 | for event in dataset: 77 | records[event['user']].append(event) 78 | if event['user'] not in listened: 79 | listened[event['user']] = 1 80 | else: 81 | listened[event['user']] += 1 82 | orderlist = sorted(listened.items(), key=lambda item:item[1], reverse=True) 83 | dellist = orderlist[:int(len(orderlist)*ratio)] 84 | for i in range(len(dellist)): 85 | if dellist[i][0] in records: 86 | del records[dellist[i][0]] 87 | 88 | #print('The amount of data after deletion:', len(records)) 89 | 90 | for user in records: 91 | orderedList = sorted(records[user],key=lambda d:d['time']) 92 | training = orderedList[0:int(len(orderedList)*(1-ratio))] 93 | test = orderedList[int(len(orderedList)*(1-ratio)):] 94 | trainingSet += training 95 | testSet += test 96 | 97 | #print ('the type1 :', type(trainingSet), type(testSet)) 98 | #file_train = 'trainset.txt' 99 | #file_test = 'testset.txt' 100 | #trainf = open(file_train, 'wb') 101 | #testf = open(file_test, 'wb') 102 | #pickle.dump(trainingSet, trainf, 2) 103 | #pickle.dump(testSet, testf, 2) 104 | #trainf.close() 105 | #testf.close() 106 | return trainingSet,testSet 107 | ''' 108 | def splitDataByTime(self,dataset): 109 | trainingSet = [] 110 | testSet = [] 111 | ratio = float(self.evalConfig['-byTime']) 112 | records = defaultdict(list) 113 | for event in dataset: 114 | records[event['user']].append(event) 115 | 116 | for user in records: 117 | orderedList = sorted(records[user],key=lambda d:d['time']) 118 | training = orderedList[0:int(len(orderedList)*(1-ratio))] 119 | test = orderedList[int(len(orderedList)*(1-ratio)):] 120 | trainingSet += training 121 | testSet += test 122 | 123 | return trainingSet,testSet 124 | 125 | def computePop(self, dataset): 126 | print('computePop...') 127 | for event in dataset: 128 | total = 0 129 | for value in self.listened['track'][event['track']].values(): 130 | total += value 131 | if value > 0: 132 | self.PopTrack[event['track']] = total 133 | 134 | print('computePop is finished...') 135 | print('PopTrack', len(self.PopTrack)) 136 | 137 | 138 | def preprocess(self,trainingSet,testSet): 139 | for entry in trainingSet: 140 | self.recordCount+=1 141 | for key in entry: 142 | if key!='time': 143 | if entry[key] not 
in self.name2id[key]: 144 | self.name2id[key][entry[key]] = len(self.name2id[key]) 145 | self.id2name[key][len(self.id2name[key])] = entry[key] 146 | 147 | if key=='user': 148 | self.userRecord[entry['user']].append(entry) 149 | if 'artist' in entry: 150 | if entry[key] not in self.listened['artist'][entry['artist']]: 151 | self.listened['artist'][entry['artist']][entry[key]] = 1 152 | else: 153 | self.listened['artist'][entry['artist']][entry[key]] += 1 154 | if 'album' in entry: 155 | if entry[key] not in self.listened['album'][entry['album']]: 156 | self.listened['album'][entry['album']][entry[key]] = 1 157 | else: 158 | self.listened['album'][entry['album']][entry[key]] += 1 159 | if 'track' in entry: 160 | if entry[key] not in self.listened['track'][entry['track']]: 161 | self.listened['track'][entry['track']][entry[key]] = 1 162 | else: 163 | self.listened['track'][entry['track']][entry[key]] += 1 164 | 165 | if key == 'artist' and 'album' in entry: 166 | self.artist2Album[entry[key]][entry['album']] = 1 167 | 168 | if key == 'album' and 'track' in entry: 169 | self.album2Track[entry[key]] = self.name2id['track'][entry['track']] 170 | self.Track2album[entry['track']] = self.name2id[key][entry[key]] 171 | 172 | if key == 'artist' and 'track' in entry: 173 | self.artist2Track[entry[key]] = self.name2id['track'][entry['track']] 174 | self.Track2artist[entry['track']] = self.name2id[key][entry[key]] 175 | 176 | if key == 'track': 177 | self.trackRecord[entry['track']].append(entry) 178 | 179 | 180 | 181 | recType = self.evalConfig['-target'] 182 | for entry in testSet: 183 | for key in entry: 184 | if key != 'time': 185 | if entry[key] not in self.name2id[key]: 186 | self.name2id[key][entry[key]] = len(self.name2id[key]) 187 | self.id2name[key][len(self.id2name[key])] = entry[key] 188 | if key=='user': 189 | if recType in entry and entry[recType] not in self.testSet[entry['user']]: 190 | self.testSet[entry['user']][entry[recType]]=1 191 | else: 192 | self.testSet[entry['user']][entry[recType]]+=1 193 | 194 | #remove items appearing in the training set from the test set 195 | for item in self.listened[recType]: 196 | for user in self.listened[recType][item]: 197 | try: 198 | del self.testSet[user][item] 199 | except KeyError: 200 | pass 201 | if user in self.testSet and len(self.testSet[user])==0: 202 | del self.testSet[user] 203 | 204 | 205 | 206 | def printTrainingSize(self): 207 | if 'user' in self.name2id: 208 | print ('user count:',len(self.name2id['user'])) 209 | if 'artist' in self.name2id: 210 | print ('artist count:',len(self.name2id['artist'])) 211 | if 'album' in self.name2id: 212 | print ('album count:',len(self.name2id['album'])) 213 | if 'track' in self.name2id: 214 | print ('track count:', len(self.name2id['track'])) 215 | print ('Training set size:',self.recordCount) 216 | 217 | 218 | def getId(self,obj,t): 219 | if obj in self.name2id[t]: 220 | return self.name2id[t][obj] 221 | else: 222 | print ('No '+t+' '+obj+' exists!') 223 | exit(-1) 224 | 225 | def getSize(self,t): 226 | return len(self.name2id[t]) 227 | 228 | def contains(self, obj, t): 229 | 'whether the recType t is in trainging set' 230 | if obj in self.name2id[t]: 231 | return True 232 | else: 233 | return False 234 | 235 | 236 | 237 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
| Entry | Example | Description |
|---|---|---|
| record | D:/xiami/100K.txt | Set the path to the input dataset. |
| record.setup | -columns user:0,track:1,artist:2,album:3 -delim , | -columns: specifies what the columns in the dataset mean. Four types of entities are supported; if some type of information is missing, just skip the corresponding type (see the parsing sketch after this table). -delim: specifies which symbol separates the columns. |
| recommender | UserKNN/ItemKNN/MostPop/etc. | The name of the recommender. |
| evaluation.setup | -testSet ../dataset/testset.txt | Main options: -testSet, -ap, -cv, -byTime.<br>-testSet path/to/test/file (specify the test set manually)<br>-ap ratio (the ratings are automatically partitioned into a training set and a test set; the number is the ratio of the test set, e.g. -ap 0.2)<br>-cv k (cross validation; k is the number of folds, e.g. -cv 5)<br>-byTime ratio (sort each user's records by time; ratio decides the percentage of the test set, i.e. the most recently played records)<br>Secondary options: -target, -b, -p, -cold.<br>-target track (decides which type of object will be recommended: artist, track or album; only available for some general recommenders such as MostPop)<br>-b val (binarize the rating values: ratings equal to or greater than val become 1 and ratings lower than val become 0, e.g. -b 3.0)<br>-p (if this option is added, cross validation is executed in parallel, otherwise fold by fold)<br>-cold threshold (evaluation on cold-start users: users in the training set with more than threshold ratings are removed from the test set) |
| item.ranking | off -topN 5,10,20 | -topN N1,N2,N3...: the lengths of the recommendation lists. Yue can generate evaluation results for several values of N at the same time. |
| output.setup | on -dir ./Results/ | Main option: whether to output recommendation results.<br>-dir path: the directory path of the output results. |
| num.factors | 5/10/20/number | Set the number of latent factors. |
| num.max.iter | 100/200/number | Set the maximum number of iterations for iterative recommendation algorithms. |
| learnRate | -init 0.01 -max 1 | -init: the initial learning rate for iterative recommendation algorithms;<br>-max: the maximum learning rate (default 1). |
| reg.lambda | -u 0.05 -i 0.05 -b 0.1 | -u: user regularization; -i: item regularization; -b: bias regularization. |
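The `-columns` option in `record.setup` is easiest to understand with a small example. The sketch below is hypothetical (the `parse_columns` helper is not part of the project) but mirrors how the data module splits the option value on `,` and `:` to build a column-index map:

```python
def parse_columns(columns_option):
    """Map a '-columns' value such as 'user:0,track:1,artist:2,album:3'
    to a dict of {entity name: column index}."""
    columns = {}
    for pair in columns_option.split(','):
        name, index = pair.split(':')
        columns[name] = int(index)
    return columns

# Hypothetical usage with the example value from the table above.
print(parse_columns('user:0,track:1,artist:2,album:3'))
# -> {'user': 0, 'track': 1, 'artist': 2, 'album': 3}
```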
Note: We use SGD to obtain a local minimum, so there are some differences between the original papers and the code in terms of formula presentation. If you have problems understanding the code, please open an issue to ask for help. We can guarantee that all the implementations have been carefully reviewed and tested.
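To make the previous note concrete, the sketch below shows a generic BPR-style pairwise SGD step of the kind used by the ranking models listed next. It is only an illustration written for this README, not the code of any recommender in the repository; the matrices `P` and `Q`, the triple `(u, i, j)` and the default hyper-parameters are assumptions.

```python
import numpy as np

def bpr_sgd_step(P, Q, u, i, j, lr=0.02, reg=0.01):
    """One SGD step on a (user u, positive item i, negative item j) triple.
    Ascends the gradient of ln(sigmoid(x_ui - x_uj)) minus L2 regularization."""
    pu, qi, qj = P[u].copy(), Q[i].copy(), Q[j].copy()
    x_uij = pu.dot(qi - qj)
    g = 1.0 / (1.0 + np.exp(x_uij))   # sigmoid(-x_uij), shared gradient factor
    P[u] += lr * (g * (qi - qj) - reg * pu)
    Q[i] += lr * (g * pu - reg * qi)
    Q[j] += lr * (-g * pu - reg * qj)

# Hypothetical usage: 100 users, 500 items, 10 latent factors.
P = np.random.normal(scale=0.1, size=(100, 10))
Q = np.random.normal(scale=0.1, size=(500, 10))
bpr_sgd_step(P, Q, u=0, i=42, j=7)
```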
| Item Ranking | Paper |
|---|---|
| Rand | Recommend tracks, artists or albums randomly. |
| MostPop | Recommend the most popular tracks, artists or albums. |
| UserKNN | A common user-based collaborative filtering method. |
| BPR | Rendle et al., BPR: Bayesian Personalized Ranking from Implicit Feedback, UAI 2009. |
| WRMF | Hu et al., Collaborative Filtering for Implicit Feedback Datasets, ICDM 2008. |
| IPF | Xiang et al., Temporal Recommendation on Graphs via Long- and Short-term Preference Fusion, KDD 2010. |
| FISM | Kabbur et al., FISM: Factored Item Similarity Models for Top-N Recommender Systems, KDD 2013. |
| ExpoMF | Liang et al., Modeling User Exposure in Recommendation, WWW 2016. |
| CDAE | Wu et al., Collaborative Denoising Auto-Encoders for Top-N Recommender Systems, WSDM 2016. |
| CUNE | Zhang et al., Collaborative User Network Embedding for Social Recommender Systems, SDM 2017. |
| Song2vec | Cheng et al., Exploiting Music Play Sequence for Music Recommendation, IJCAI 2017. |
| APR | He et al., Adversarial Personalized Ranking for Recommendation, SIGIR 2018. |
In the table below, Users, Tracks, Artists and Albums are basic meta-data; Records, Tag, User Profile and Artist Profile are context information.

| Data Set | Users | Tracks | Artists | Albums | Records | Tag | User Profile | Artist Profile |
|---|---|---|---|---|---|---|---|---|
| NowPlaying [1] | 1,744 | 16,864 | 2,108 | N/A | 1,117,335 | N/A | N/A | N/A |
| Xiami [2] | 4,270 | 177,289 | 25,844 | 68,479 | 1,337,948 | N/A | N/A | N/A |
| LastFM [3] | 1,892 | N/A | 17,632 | N/A | 92,834 | Yes | N/A | N/A |
| Yahoo Music [source] | 1,800,000 | 136,000 | many | many | 717,000,000 | Yes | N/A | N/A |
| 30 Music [source][4] | 45,167 | 5,023,108 | 595,049 | 217,337 | many | Yes | Yes | N/A |
| Ali Music | 349,946 | 10,278 | 50 | N/A | 5,652,232 | N/A | N/A | Yes |
[1]. Eva Zangerle, Martin Pichl, Wolfgang Gassler, and Günther Specht. 2014. #nowplaying Music Dataset: Extracting Listening Behavior from Twitter. In Proceedings of the First International Workshop on Internet-Scale Multimedia Management (WISMM '14). ACM, New York, NY, USA, 21-26.

[2]. Wang, Dongjing, et al. "Learning music embedding with metadata for context aware recommendation." Proceedings of the 2016 ACM on International Conference on Multimedia Retrieval. ACM, 2016.

[3]. Iván Cantador, Peter Brusilovsky, and Tsvi Kuflik. 2011. 2nd Workshop on Information Heterogeneity and Fusion in Recommender Systems (HetRec 2011). In Proceedings of the 5th ACM Conference on Recommender Systems (RecSys 2011). ACM, New York, NY, USA.

[4]. Turrin R, Quadrana M, Condorelli A, et al. 30Music Listening and Playlists Dataset. RecSys Posters, 2015.
290 | -------------------------------------------------------------------------------- /recommender/advanced/NeuTrans.py: -------------------------------------------------------------------------------- 1 | #coding:utf8 2 | from base.DeepRecommender import DeepRecommender 3 | import os 4 | import numpy as np 5 | from tool import config 6 | from random import choice 7 | from random import shuffle 8 | from collections import defaultdict 9 | from scipy.sparse import * 10 | from scipy import * 11 | import gensim.models.word2vec as w2v 12 | from tool.qmath import cosine 13 | import tensorflow as tf 14 | import pickle 15 | 16 | class NeuTrans(DeepRecommender): 17 | def __init__(self,conf,trainingSet=None,testSet=None,fold='[1]'): 18 | super(NeuTrans, self).__init__(conf,trainingSet,testSet,fold) 19 | 20 | 21 | def readConfiguration(self): 22 | super(NeuTrans, self).readConfiguration() 23 | options = config.LineConfig(self.config['NeuTrans']) 24 | self.alpha = float(options['-alpha']) 25 | self.topK = int(options['-k']) 26 | self.negCount = int(options['-neg']) 27 | 28 | def buildNetwork(self): 29 | self.trainingData = [] 30 | print('Kind Note: This method will take much time') 31 | # build C-T-NET 32 | print('Building collaborative track network') 33 | self.trackNet = {} 34 | self.filteredListen = defaultdict(list) 35 | 36 | for track in self.data.trackRecord: 37 | if len(self.data.trackRecord[track]) > 0: 38 | self.trackNet[track] = self.data.trackRecord[track] 39 | for track in self.trackNet: 40 | tid = self.data.getId(track, 'track') 41 | for item in self.trackNet[track]: 42 | uid = self.data.getId(item['user'], 'user') 43 | if self.userListen[uid][tid] >= 0: 44 | self.filteredListen[track].append(item['user']) 45 | self.trainingData.append(item) 46 | 47 | self.CTNet = defaultdict(list) 48 | i=0 49 | for track1 in self.filteredListen: 50 | i += 1 51 | if i % 200 == 0: 52 | print (i, '/', len(self.filteredListen)) 53 | s1 = set(self.filteredListen[track1]) 54 | for track2 in self.filteredListen: 55 | if track1 != track2: 56 | s2 = set(self.filteredListen[track2]) 57 | weight = len(s1.intersection(s2)) 58 | if weight > 0: 59 | self.CTNet[track1] += [track2]*weight 60 | ######################## 歌曲 C-T-N-E-T 构建结束 ############################ 61 | 62 | print('Genrerating random deep walks...') 63 | self.T_walks = [] 64 | self.T_visited = defaultdict(dict) 65 | for track in self.CTNet: 66 | for t in range(10): 67 | path = [track] 68 | lastNode = track 69 | for i in range(1, 10): 70 | nextNode = choice(self.CTNet[lastNode]) 71 | count = 0 72 | #while(nextNode in self.T_visited[lastNode] or nextNode not in self.aSim[lastNode]): 73 | while(nextNode in self.T_visited[lastNode]): 74 | nextNode = choice(self.CTNet[lastNode]) 75 | count+=1 76 | if count==10: 77 | break 78 | path.append(nextNode) 79 | self.T_visited[track][lastNode] = 1 80 | lastNode = nextNode 81 | self.T_walks.append(path) 82 | shuffle(self.T_walks) 83 | ##del self.aSim 84 | 85 | print('Generating track embedding') 86 | model = w2v.Word2Vec(self.T_walks, size=self.k, window=5, min_count=0, iter=3) 87 | print('Track embedding generated') 88 | 89 | self.T = np.random.rand(self.data.getSize('track'), self.k) 90 | 91 | print ('Constructing similarity matrix...') 92 | i = 0 93 | self.nSim = {} 94 | for track1 in self.CTNet: 95 | tSim = [] 96 | i += 1 97 | if i % 1000 == 0: 98 | print (i, '/', len(self.CTNet)) 99 | vec1 = model.wv[track1] 100 | tid1 = self.data.getId(track1, 'track') 101 | for track2 in self.CTNet: 102 | if track1 != track2: 
103 | tid2 = self.data.getId(track2, 'track') 104 | vec2 = model.wv[track2] 105 | sim = max(1e-6, cosine(vec1, vec2)) 106 | tSim.append((tid2, sim)) 107 | #self.nSim[t1][t2] = sim 108 | self.nSim[tid1] = sorted(tSim, key=lambda d: d[1], reverse=True)[:20] 109 | 110 | file1 = 'nsim.txt' 111 | df1 = open(file1, 'wb') 112 | #df1 = open(file1, 'rb') 113 | pickle.dump(self.nSim, df1) 114 | #self.nSim = pickle.load(df1) 115 | 116 | def attributeSim(self): 117 | ######################## 训练属性并计算属性相似度 ############################ 118 | print ('train the attribute...') 119 | self.attr = {} 120 | for track in self.CTNet: 121 | val = [] 122 | artist = self.data.Track2artist[track] 123 | #album = self.data.Track2album[tid1] 124 | val.append(artist) 125 | #val.append(album) 126 | self.attr[track] = val 127 | 128 | ## construct the S matrix 129 | print ('Constructing the attribute similarity matrix...') 130 | i = 0 131 | # self.aSim = defaultdict(dict) 132 | self.aSim = {} 133 | for track1 in self.CTNet: 134 | tSim = [] 135 | i += 1 136 | if i % 1000 == 0: 137 | print (i, '/', len(self.CTNet)) 138 | att1 = set(self.attr[track1]) 139 | tid1 = self.data.getId(track1, 'track') 140 | for track2 in self.CTNet: 141 | if track1 != track2: 142 | tid2 = self.data.getId(track2, 'track') 143 | att2 = set(self.attr[track2]) 144 | num1 = len(list(att1&att2)) 145 | num2 = len(list(att1|att2)) 146 | sim = num1/num2 147 | tSim.append((tid2, sim)) 148 | #self.aSim[t1][t2] = sim 149 | self.aSim[tid1] = sorted(tSim, key=lambda d: d[1], reverse=True)[:20] 150 | 151 | file2 = 'asim.txt' 152 | df1 = open(file2, 'wb') 153 | #df1 = open(file2, 'rb') 154 | pickle.dump(self.aSim, df1) 155 | #self.aSim = pickle.load(df1) 156 | 157 | 158 | def initModel(self): 159 | super(NeuTrans, self).initModel() 160 | self.userListen = defaultdict(dict) 161 | for user in self.data.userRecord: 162 | uid = self.data.getId(user, 'user') 163 | for item in self.data.userRecord[user]: 164 | tid = self.data.getId(item['track'], 'track') 165 | if item['track'] not in self.userListen[user]: 166 | self.userListen[uid][tid] = 0 167 | self.userListen[uid][tid] += 1 168 | 169 | self.buildNetwork() 170 | self.attributeSim() 171 | 172 | def buildModel(self): 173 | ###################### 构建神经网络非线性映射 #################################### 174 | print ('the tensorflow...') 175 | self.itemLayer = [64,64,64] 176 | self.userLayer = [64,64,64] 177 | 178 | #self.u_jdx = tf.placeholder(tf.int32, [None], name="u_jdx") 179 | self.v_jdx = tf.placeholder(tf.int32, [None], name="v_jdx") 180 | self.netSim = tf.placeholder(tf.float32, [None], name="netSim") 181 | self.attSim = tf.placeholder(tf.float32, [None], name="attSim") 182 | self.v_pdx = tf.placeholder(tf.int32, [None], name="v_pdx") 183 | self.v_qdx = tf.placeholder(tf.int32, [None], name="v_qdx") 184 | 185 | #userj_input = tf.nn.embedding_lookup(self.U, self.u_idx) 186 | self.V_neg_embed = tf.nn.embedding_lookup(self.V, self.v_jdx) 187 | 188 | self.V_net_pembed = tf.nn.embedding_lookup(self.V, self.v_pdx) 189 | self.V_net_qembed = tf.nn.embedding_lookup(self.V, self.v_qdx) 190 | 191 | def init_variable(shape, name): 192 | return tf.Variable(tf.truncated_normal(shape=shape, dtype=tf.float32, stddev=0.01), name=name) 193 | 194 | with tf.name_scope("User_Layer"): 195 | user_W1 = init_variable([self.k, self.userLayer[0]], "user_W1") 196 | self.U_embed_out = tf.matmul(self.U_embed, user_W1) 197 | self.regLoss = tf.nn.l2_loss(user_W1) 198 | 199 | with tf.name_scope("Item_Layer"): 200 | item_W1 = init_variable([self.k, 
self.itemLayer[0]], "item_W1") 201 | self.V_embed_out = tf.matmul(self.V_embed, item_W1) 202 | self.V_neg_embed_out = tf.matmul(self.V_neg_embed, item_W1) 203 | self.regLoss = tf.add(self.regLoss, tf.nn.l2_loss(item_W1)) 204 | 205 | self.reg_lambda = tf.constant(self.regU, dtype=tf.float32) 206 | 207 | error = tf.subtract(tf.reduce_sum(tf.multiply(self.U_embed_out, self.V_embed_out), 1), tf.reduce_sum(tf.multiply(self.U_embed_out, self.V_neg_embed_out), 1)) 208 | self.loss = tf.reduce_sum(tf.nn.softplus(-error)) 209 | # 构造正则化项 完善损失函数 210 | self.regLoss = tf.add(tf.multiply(self.reg_lambda, tf.nn.l2_loss(self.U_embed_out)),self.regLoss) 211 | self.regLoss = tf.add(tf.multiply(self.reg_lambda, tf.nn.l2_loss(self.V_embed_out)),self.regLoss) 212 | self.regLoss = tf.add(tf.multiply(self.reg_lambda, tf.nn.l2_loss(self.V_neg_embed_out)), self.regLoss) 213 | self.total_loss = tf.add(self.loss, self.regLoss) 214 | 215 | ##### 网络结构损失函数 #### 216 | error_net = tf.subtract(self.netSim, tf.reduce_sum(tf.multiply(self.V_net_pembed, self.V_net_qembed), 1)) 217 | self.loss_net = tf.reduce_sum(tf.nn.softplus(-error_net)) 218 | self.reg_loss_net = tf.add(tf.multiply(self.reg_lambda, tf.nn.l2_loss(self.V_net_pembed)), 219 | tf.multiply(self.reg_lambda, tf.nn.l2_loss(self.V_net_qembed))) 220 | self.total_loss_net = tf.add(self.loss_net, self.reg_loss_net) 221 | 222 | ##### 属性损失函数 #### 223 | error_att = tf.subtract(self.attSim, tf.reduce_sum(tf.multiply(self.V_net_pembed, self.V_net_qembed), 1)) 224 | self.loss_att = tf.reduce_sum(tf.nn.softplus(-error_att)) 225 | self.reg_loss_att = tf.add(tf.multiply(self.reg_lambda, tf.nn.l2_loss(self.V_net_pembed)), 226 | tf.multiply(self.reg_lambda, tf.nn.l2_loss(self.V_net_qembed))) 227 | self.total_loss_att = tf.add(self.loss_att, self.reg_loss_att) 228 | 229 | self.optimizer = tf.train.AdamOptimizer(self.lRate) 230 | self.train = self.optimizer.minimize(self.total_loss) 231 | 232 | self.optimizer_net = tf.train.AdamOptimizer(self.lRate) 233 | self.train_net = self.optimizer_net.minimize(self.total_loss_net) 234 | 235 | self.optimizer_att = tf.train.AdamOptimizer(self.lRate) 236 | self.train_att = self.optimizer_att.minimize(self.total_loss_att) 237 | 238 | self.U = np.zeros((self.m, self.itemLayer[0])) 239 | self.V = np.zeros((self.n, self.itemLayer[0])) 240 | 241 | with tf.Session() as sess: 242 | init = tf.global_variables_initializer() 243 | sess.run(init) 244 | for epoch in range(self.maxIter): 245 | item_pdx, item_qdx, nSim = self._net_next_batch() 246 | _,loss = sess.run([self.train_net,self.total_loss_net], feed_dict={self.v_pdx:item_pdx, self.v_qdx:item_qdx, self.netSim:nSim}) 247 | print ('iteration:', epoch, 'loss:',loss) 248 | self.ranking_performance() 249 | 250 | for epoch in range(self.maxIter): 251 | item_pdx, item_qdx, aSim = self._att_next_batch() 252 | _,loss = sess.run([self.train_att,self.total_loss_att], feed_dict={self.v_pdx:item_pdx, self.v_qdx:item_qdx, self.attSim:aSim}) 253 | print ('iteration:', epoch, 'loss:',loss) 254 | self.ranking_performance() 255 | 256 | for epoch in range(self.maxIter): 257 | user_idx, item_idx, neg_item_idx = self.next_batch() 258 | _,loss = sess.run([self.train,self.total_loss], feed_dict={self.u_idx:user_idx, self.v_idx:item_idx, self.v_jdx:neg_item_idx}) 259 | print ('iteration:', epoch, 'loss:',loss) 260 | U_embedding, V_embedding = sess.run([self.U_embed_out,self.V_embed_out], feed_dict={self.u_idx:user_idx, self.v_idx:item_idx, self.v_jdx:neg_item_idx}) 261 | for ue,u in zip(U_embedding,user_idx): 262 | 
self.U[u]=ue 263 | for ve,v in zip(V_embedding,item_idx): 264 | self.V[v]=ve 265 | 266 | self.ranking_performance() 267 | 268 | 269 | def next_batch(self): 270 | batch_idx = np.random.randint(len(self.data.trainingData), size=256) 271 | users = [self.data.trainingData[idx]['user'] for idx in batch_idx] 272 | items = [self.data.trainingData[idx]['track'] for idx in batch_idx] 273 | user_idx,item_idx=[],[] 274 | neg_item_idx = [] 275 | for i,user in enumerate(users): 276 | uid = self.data.getId(user, 'user') 277 | tid = self.data.getId(items[i], 'track') 278 | for j in range(100): #negative sampling 279 | neg_id = random.randint(0, self.n- 1) 280 | while neg_id in self.userListen[uid]: 281 | neg_id = random.randint(0, self.n - 1) 282 | user_idx.append(uid) 283 | item_idx.append(tid) 284 | neg_item_idx.append(neg_id) 285 | return user_idx, item_idx, neg_item_idx 286 | 287 | def _net_next_batch(self): 288 | batch_idx = np.random.randint(len(self.nSim)) 289 | item_pdx, item_qdx=[],[] 290 | nSim=[] 291 | for i in range(batch_idx): 292 | t1 = choice(list(self.nSim)) 293 | for t2 in self.nSim[t1]: 294 | item_pdx.append(t1) 295 | item_qdx.append(t2[0]) 296 | nSim.append(t2[1]) 297 | return item_pdx, item_qdx, nSim 298 | 299 | def _att_next_batch(self): 300 | batch_idx = np.random.randint(len(self.aSim)) 301 | item_pdx, item_qdx=[],[] 302 | aSim=[] 303 | for i in range(batch_idx): 304 | t1 = choice(list(self.aSim)) 305 | for t2 in self.aSim[t1]: 306 | item_pdx.append(t1) 307 | item_qdx.append(t2[0]) 308 | aSim.append(t2[1]) 309 | return item_pdx, item_qdx, aSim 310 | 311 | def predict(self, u): 312 | 'invoked to rank all the items for the user' 313 | u = self.data.getId(u,'user') 314 | return self.V.dot(self.U[u]) 315 | #pass --------------------------------------------------------------------------------
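As a closing illustration, the sketch below (assumed, not part of NeuTrans or the repository) shows how a score vector like the one returned by `predict` above could be turned into a top-N recommendation list while skipping tracks the user has already listened to:

```python
import numpy as np

def top_n(scores, seen_ids, n=10):
    """Return the ids of the n highest-scoring items the user has not
    interacted with. `scores` is a 1-D array indexed by item id."""
    order = np.argsort(scores)[::-1]              # best score first
    ranked = [int(idx) for idx in order if idx not in seen_ids]
    return ranked[:n]

# Hypothetical usage: scores = recommender.predict(user), seen = set of listened ids.
print(top_n(np.array([0.1, 0.9, 0.3, 0.7]), seen_ids={1}, n=2))  # [3, 2]
```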