├── .gitignore ├── BPR.ipynb ├── BPR.py ├── README.md ├── main.py ├── preprocessor.py └── ratings_small.csv /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /BPR.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "C:\\Users\\k12s35h813g\\AppData\\Local\\Continuum\\anaconda3\\envs\\tf_gpu\\lib\\site-packages\\h5py\\__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. 
In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", 13 | " from ._conv import register_converters as _register_converters\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "#!/usr/bin/env python\n", 19 | "# coding: utf-8\n", 20 | "\n", 21 | "import pandas as pd\n", 22 | "import numpy as np\n", 23 | "import pickle\n", 24 | "import time\n", 25 | "import tensorflow as tf\n", 26 | "import random\n", 27 | "import math\n", 28 | "\n", 29 | "__author__ = \"Bo-Syun Cheng\"\n", 30 | "__email__ = \"k12s35h813g@gmail.com\"" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "class Data_preprocessor():\n", 40 | " def __init__(self,data,filter_user=1,filter_item=5):\n", 41 | " self.data = data\n", 42 | " self.filter_user = filter_user\n", 43 | " self.filter_item = filter_item\n", 44 | "\n", 45 | " def preprocess(self):\n", 46 | " self.filter_()\n", 47 | " return self.train_test_split()\n", 48 | " def filter_(self):\n", 49 | " \"\"\"\n", 50 | " 過濾掉session長度過短的user和評分數過少的item\n", 51 | "\n", 52 | " :param filter_user: 少於這個session長度的user要被過濾掉 default=1\n", 53 | " :param filter_item: 少於這個評分數的item要被過濾掉 default=5\n", 54 | " :return: dataframe\n", 55 | " \"\"\"\n", 56 | " session_lengths = self.data.groupby('userId').size()\n", 57 | " self.data = self.data[np.in1d(self.data['userId'], session_lengths[session_lengths>1].index)] #將長度不足2的session過濾掉\n", 58 | " print(\"剩餘data : %d\"%(len(self.data)))\n", 59 | " item_supports = self.data.groupby('movieId').size() #統計每個item被幾個使用者評過分\n", 60 | " self.data = self.data[np.in1d(self.data['movieId'], item_supports[item_supports>5].index)] #將被評分次數低於5的item過濾掉\n", 61 | " print(\"剩餘data : %d\"%(len(self.data)))\n", 62 | " \"\"\"再把只有一個click的user過濾掉 因為過濾掉商品可能會導致新的單一click的user出現\"\"\"\n", 63 | " session_lengths = self.data.groupby('userId').size()\n", 64 | " self.data = self.data[np.in1d(self.data['userId'], session_lengths[session_lengths>1].index)]\n", 65 | " print(\"剩餘data : %d\"%(len(self.data)))\n", 66 | " def train_test_split(self,time_range=86400):\n", 67 | " \"\"\"\n", 68 | " 切割訓練和測試資料集\n", 69 | "\n", 70 | " :param time_range:session若在這個區間內,將被分為test_data default=86400(1day)\n", 71 | " :retrun: a tuple of two dataframe\n", 72 | " \"\"\"\n", 73 | " tmax = self.data['timestamp'].max()\n", 74 | " session_tmax = self.data.groupby('userId')['timestamp'].max()\n", 75 | " train = self.data[np.in1d(self.data['userId'] , session_tmax[session_tmax<=tmax -86400].index)]\n", 76 | " test = self.data[np.in1d(self.data['userId'] , session_tmax[session_tmax>tmax -86400].index)]\n", 77 | " print(\"訓練資料集統計: session個數:%d , item個數:%d , event數:%d\"%(train['userId'].nunique(),train['movieId'].nunique(),len(train)))\n", 78 | " \"\"\"\n", 79 | " 基於協同式過濾的特性,若test data中含有train data沒出現過的item,將該item過濾掉\n", 80 | " \"\"\"\n", 81 | " test = test[np.in1d(test['movieId'], train['movieId'])]\n", 82 | " tslength = test.groupby('userId').size()\n", 83 | " test = test[np.in1d(test['userId'], tslength[tslength>=2].index)]\n", 84 | " print(\"測試資料集統計: session個數:%d , item個數:%d , event數:%d\"%(test['userId'].nunique(),test['movieId'].nunique(),len(test)))\n", 85 | "\n", 86 | " return train" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 3, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "class BPR():\n", 96 | " '''\n", 97 | " parameter\n", 98 | " train_sample_size : 訓練時,每個正樣本,我sample多少負樣本\n", 99 | " test_sample_size : 測試時,每個正樣本,我sample多少負樣本\n", 100 | " num_k : item 
embedding的維度大小\n", 101 | " evaluation_at : recall@多少,及正樣本要排前幾名,我們才視為推薦正確\n", 102 | " '''\n", 103 | " def __init__(self,data,n_epochs=10,batch_size=32,train_sample_size=10,test_sample_size=50,num_k=100,evaluation_at=10):\n", 104 | " self.n_epochs = n_epochs\n", 105 | " self.batch_size = batch_size\n", 106 | " self.train_sample_size = train_sample_size\n", 107 | " self.test_sample_size = test_sample_size\n", 108 | " self.num_k = num_k\n", 109 | " self.evaluation_at = evaluation_at\n", 110 | "\n", 111 | " self.data = data\n", 112 | " self.num_user = len(self.data['userId'].unique())\n", 113 | " self.num_item = len(self.data['movieId'].unique())\n", 114 | " self.num_event = len(self.data)\n", 115 | "\n", 116 | " self.all_item = set(self.data['movieId'].unique())\n", 117 | " self.experiment = []\n", 118 | "\n", 119 | " #Because the id is not always continuous , we build a map to normalize id . For example:[1,3,5,156]->[0,1,2,3]\n", 120 | " user_id = self.data['userId'].unique()\n", 121 | " self.user_id_map = {user_id[i] : i for i in range(self.num_user)}\n", 122 | " item_id = self.data['movieId'].unique()\n", 123 | " self.item_id_map = {item_id[i] : i for i in range(self.num_item)}\n", 124 | " training_data = self.data.loc[:,['userId','movieId']].values\n", 125 | " self.training_data = [[self.user_id_map[training_data[i][0]],self.item_id_map[training_data[i][1]]] for i in range(self.num_event)]\n", 126 | "\n", 127 | "\n", 128 | "\n", 129 | " #data preprocess\n", 130 | " self.split_data() #split data into training_data and testing\n", 131 | " self.sample_dict = self.negative_sample() #for each trainging data (user,item+) , we sample 10 negative item for bpr training\n", 132 | "\n", 133 | " self.build_model() #build TF graph\n", 134 | " self.sess = tf.Session() #create session\n", 135 | " self.sess.run(tf.global_variables_initializer())\n", 136 | "\n", 137 | "\n", 138 | " def split_data(self):\n", 139 | " user_session = self.data.groupby('userId')['movieId'].apply(set).reset_index().loc[:,['movieId']].values.reshape(-1)\n", 140 | " self.testing_data =[]\n", 141 | " for index,session in enumerate(user_session):\n", 142 | " random_pick = self.item_id_map[random.sample(session,1)[0]]\n", 143 | " self.training_data.remove([index,random_pick])\n", 144 | " self.testing_data.append([index,random_pick])\n", 145 | "\n", 146 | "\n", 147 | " def negative_sample(self):\n", 148 | " user_session = self.data.groupby('userId')['movieId'].apply(set).reset_index().loc[:,['movieId']].values.reshape(-1)\n", 149 | " sample_dict = {}\n", 150 | "\n", 151 | " for td in self.training_data:\n", 152 | " sample_dict[tuple(td)] = [self.item_id_map[s] for s in random.sample(self.all_item.difference(user_session[td[0]]) , self.train_sample_size)]\n", 153 | "\n", 154 | " return sample_dict\n", 155 | "\n", 156 | " def build_model(self):\n", 157 | " self.X_user = tf.placeholder(tf.int32,shape=(None , 1))\n", 158 | " self.X_pos_item = tf.placeholder(tf.int32,shape=(None , 1))\n", 159 | " self.X_neg_item = tf.placeholder(tf.int32,shape=(None , 1))\n", 160 | " self.X_predict = tf.placeholder(tf.int32,shape=(1))\n", 161 | "\n", 162 | " user_embedding = tf.Variable(tf.truncated_normal(shape=[self.num_user,self.num_k],mean=0.0,stddev=0.5))\n", 163 | " item_embedding = tf.Variable(tf.truncated_normal(shape=[self.num_item,self.num_k],mean=0.0,stddev=0.5))\n", 164 | "\n", 165 | " embed_user = tf.nn.embedding_lookup(user_embedding , self.X_user)\n", 166 | " embed_pos_item = tf.nn.embedding_lookup(item_embedding , self.X_pos_item)\n", 167 | 
" embed_neg_item = tf.nn.embedding_lookup(item_embedding , self.X_neg_item)\n", 168 | "\n", 169 | " pos_score = tf.matmul(embed_user , embed_pos_item , transpose_b=True)\n", 170 | " neg_score = tf.matmul(embed_user , embed_neg_item , transpose_b=True)\n", 171 | "\n", 172 | " self.loss = tf.reduce_mean(-tf.log(tf.nn.sigmoid(pos_score-neg_score)))\n", 173 | " self.optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(self.loss)\n", 174 | "\n", 175 | " predict_user_embed = tf.nn.embedding_lookup(user_embedding , self.X_predict)\n", 176 | " self.predict = tf.matmul(predict_user_embed , item_embedding , transpose_b=True)\n", 177 | "\n", 178 | " def fit(self):\n", 179 | " self.experiment = []\n", 180 | " for epoch in range(self.n_epochs):\n", 181 | " np.random.shuffle(self.training_data)\n", 182 | " total_loss = 0\n", 183 | " for i in range(0 , len(self.training_data) , self.batch_size):\n", 184 | " training_batch = self.training_data[i:i+self.batch_size]\n", 185 | " user_id = []\n", 186 | " pos_item_id = []\n", 187 | " neg_item_id = []\n", 188 | " for single_training in training_batch:\n", 189 | " for neg_sample in list(self.sample_dict[tuple(single_training)]):\n", 190 | " user_id.append(single_training[0])\n", 191 | " pos_item_id.append(single_training[1])\n", 192 | " neg_item_id.append(neg_sample)\n", 193 | "\n", 194 | " user_id = np.array(user_id).reshape(-1,1)\n", 195 | " pos_item_id = np.array(pos_item_id).reshape(-1,1)\n", 196 | " neg_item_id = np.array(neg_item_id).reshape(-1,1)\n", 197 | "\n", 198 | " _ , loss = self.sess.run([self.optimizer , self.loss] ,\n", 199 | " feed_dict = {self.X_user : user_id , self.X_pos_item : pos_item_id , self.X_neg_item : neg_item_id}\n", 200 | " )\n", 201 | " total_loss += loss\n", 202 | "\n", 203 | " num_true = 0\n", 204 | " for test in self.testing_data:\n", 205 | " result = self.sess.run(self.predict , feed_dict = {self.X_predict : [test[0]]})\n", 206 | " result = result.reshape(-1)\n", 207 | " if (result[[self.item_id_map[s] for s in random.sample(self.all_item , self.test_sample_size)]] > result[test[1]]).sum()+1 <= self.evaluation_at:\n", 208 | " num_true += 1\n", 209 | "\n", 210 | " print(\"epoch:%d , loss:%.2f , recall:%.2f\"%(epoch , total_loss , num_true/len(self.testing_data)))\n", 211 | " self.experiment.append([epoch , total_loss , num_true/len(self.testing_data)])" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 4, 217 | "metadata": {}, 218 | "outputs": [ 219 | { 220 | "name": "stdout", 221 | "output_type": "stream", 222 | "text": [ 223 | "剩餘data : 100004\n", 224 | "剩餘data : 88087\n", 225 | "剩餘data : 88087\n", 226 | "訓練資料集統計: session個數:669 , item個數:3099 , event數:86870\n", 227 | "測試資料集統計: session個數:2 , item個數:1154 , event數:1217\n", 228 | "epoch:0 , loss:3196.12 , recall:0.23\n", 229 | "epoch:1 , loss:2039.38 , recall:0.27\n", 230 | "epoch:2 , loss:1141.34 , recall:0.41\n", 231 | "epoch:3 , loss:668.30 , recall:0.54\n", 232 | "epoch:4 , loss:427.90 , recall:0.58\n", 233 | "epoch:5 , loss:282.31 , recall:0.63\n", 234 | "epoch:6 , loss:187.26 , recall:0.63\n", 235 | "epoch:7 , loss:123.10 , recall:0.64\n", 236 | "epoch:8 , loss:79.32 , recall:0.67\n", 237 | "epoch:9 , loss:49.86 , recall:0.67\n" 238 | ] 239 | } 240 | ], 241 | "source": [ 242 | "if __name__ == \"__main__\":\n", 243 | " data = pd.read_csv('ratings_small.csv')\n", 244 | " dp = Data_preprocessor(data)\n", 245 | " processed_data = dp.preprocess()\n", 246 | " \n", 247 | " bpr = BPR(processed_data)\n", 248 | " bpr.fit()" 249 | ] 250 | } 251 | 
],
252 | "metadata": {
253 |  "kernelspec": {
254 |   "display_name": "Python 3",
255 |   "language": "python",
256 |   "name": "python3"
257 |  },
258 |  "language_info": {
259 |   "codemirror_mode": {
260 |    "name": "ipython",
261 |    "version": 3
262 |   },
263 |   "file_extension": ".py",
264 |   "mimetype": "text/x-python",
265 |   "name": "python",
266 |   "nbconvert_exporter": "python",
267 |   "pygments_lexer": "ipython3",
268 |   "version": "3.6.5"
269 |  }
270 | },
271 | "nbformat": 4,
272 | "nbformat_minor": 2
273 | }
274 | 
--------------------------------------------------------------------------------
/BPR.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 | 
4 | import pandas as pd
5 | import numpy as np
6 | import pickle
7 | import time
8 | import tensorflow as tf
9 | import random
10 | import math
11 | 
12 | __author__ = "Bo-Syun Cheng"
13 | __email__ = "k12s35h813g@gmail.com"
14 | 
15 | 
16 | class BPR():
17 |     '''
18 |     Parameters
19 |     train_sample_size : how many negative items are sampled for each positive item during training
20 |     test_sample_size : how many negative items are sampled for each positive item during evaluation
21 |     num_k : dimensionality of the user/item embeddings
22 |     evaluation_at : the k in recall@k, i.e. the positive item has to rank within the top k for the recommendation to count as correct
23 |     '''
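    # Typical usage (mirroring main.py): `data` is a dataframe with userId and
    # movieId columns, e.g. the output of preprocessor.Data_preprocessor.preprocess().
    #
    #     bpr = BPR(processed_data)
    #     bpr.fit()   # prints loss and sampled recall@evaluation_at after every epoch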
24 |     def __init__(self,data,n_epochs=10,batch_size=32,train_sample_size=10,test_sample_size=50,num_k=100,evaluation_at=10):
25 |         self.n_epochs = n_epochs
26 |         self.batch_size = batch_size
27 |         self.train_sample_size = train_sample_size
28 |         self.test_sample_size = test_sample_size
29 |         self.num_k = num_k
30 |         self.evaluation_at = evaluation_at
31 | 
32 |         self.data = data
33 |         self.num_user = len(self.data['userId'].unique())
34 |         self.num_item = len(self.data['movieId'].unique())
35 |         self.num_event = len(self.data)
36 | 
37 |         self.all_item = set(self.data['movieId'].unique())
38 |         self.experiment = []
39 | 
40 |         #Because the ids are not always contiguous, we build a map to normalize them. For example: [1,3,5,156] -> [0,1,2,3]
41 |         user_id = self.data['userId'].unique()
42 |         self.user_id_map = {user_id[i] : i for i in range(self.num_user)}
43 |         item_id = self.data['movieId'].unique()
44 |         self.item_id_map = {item_id[i] : i for i in range(self.num_item)}
45 |         training_data = self.data.loc[:,['userId','movieId']].values
46 |         self.training_data = [[self.user_id_map[training_data[i][0]],self.item_id_map[training_data[i][1]]] for i in range(self.num_event)]
47 | 
48 | 
49 | 
50 |         #data preprocess
51 |         self.split_data() #split data into training_data and testing_data (one held-out item per user)
52 |         self.sample_dict = self.negative_sample() #for each training pair (user,item+) we sample train_sample_size negative items for BPR training
53 | 
54 |         self.build_model() #build TF graph
55 |         self.sess = tf.Session() #create session
56 |         self.sess.run(tf.global_variables_initializer())
57 | 
58 | 
59 |     def split_data(self):
60 |         user_session = self.data.groupby('userId')['movieId'].apply(set).reset_index().loc[:,['movieId']].values.reshape(-1) #note: assumes userId first appears in ascending order, so the groupby order matches user_id_map
61 |         self.testing_data =[]
62 |         for index,session in enumerate(user_session):
63 |             random_pick = self.item_id_map[random.sample(session,1)[0]]
64 |             self.training_data.remove([index,random_pick])
65 |             self.testing_data.append([index,random_pick])
66 | 
67 | 
68 |     def negative_sample(self):
69 |         user_session = self.data.groupby('userId')['movieId'].apply(set).reset_index().loc[:,['movieId']].values.reshape(-1)
70 |         sample_dict = {}
71 | 
72 |         for td in self.training_data:
73 |             sample_dict[tuple(td)] = [self.item_id_map[s] for s in random.sample(self.all_item.difference(user_session[td[0]]) , self.train_sample_size)]
74 | 
75 |         return sample_dict
76 | 
77 |     def build_model(self):
78 |         self.X_user = tf.placeholder(tf.int32,shape=(None , 1))
79 |         self.X_pos_item = tf.placeholder(tf.int32,shape=(None , 1))
80 |         self.X_neg_item = tf.placeholder(tf.int32,shape=(None , 1))
81 |         self.X_predict = tf.placeholder(tf.int32,shape=(1))
82 | 
83 |         user_embedding = tf.Variable(tf.truncated_normal(shape=[self.num_user,self.num_k],mean=0.0,stddev=0.5))
84 |         item_embedding = tf.Variable(tf.truncated_normal(shape=[self.num_item,self.num_k],mean=0.0,stddev=0.5))
85 | 
86 |         embed_user = tf.nn.embedding_lookup(user_embedding , self.X_user)
87 |         embed_pos_item = tf.nn.embedding_lookup(item_embedding , self.X_pos_item)
88 |         embed_neg_item = tf.nn.embedding_lookup(item_embedding , self.X_neg_item)
89 | 
90 |         pos_score = tf.matmul(embed_user , embed_pos_item , transpose_b=True)
91 |         neg_score = tf.matmul(embed_user , embed_neg_item , transpose_b=True)
92 | 
93 |         self.loss = tf.reduce_mean(-tf.log(tf.nn.sigmoid(pos_score-neg_score)))
94 |         self.optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(self.loss)
95 | 
96 |         predict_user_embed = tf.nn.embedding_lookup(user_embedding , self.X_predict)
97 |         self.predict = tf.matmul(predict_user_embed , item_embedding , transpose_b=True)
98 | 
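    # Training objective (Rendle et al. 2009): for every sampled triple
    # (user, item+, item-), the model scores a user-item pair with the inner
    # product of their embeddings and minimizes -log sigmoid(score(user,item+) -
    # score(user,item-)), pushing observed items above sampled unobserved ones.
    # Evaluation in fit() is a sampled recall@evaluation_at: each user's held-out
    # positive item is ranked against test_sample_size randomly drawn items and
    # counts as a hit if it lands in the top evaluation_at.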
99 |     def fit(self):
100 |         self.experiment = []
101 |         for epoch in range(self.n_epochs):
102 |             np.random.shuffle(self.training_data)
103 |             total_loss = 0
104 |             for i in range(0 , len(self.training_data) , self.batch_size):
105 |                 training_batch = self.training_data[i:i+self.batch_size]
106 |                 user_id = []
107 |                 pos_item_id = []
108 |                 neg_item_id = []
109 |                 for single_training in training_batch:
110 |                     for neg_sample in list(self.sample_dict[tuple(single_training)]):
111 |                         user_id.append(single_training[0])
112 |                         pos_item_id.append(single_training[1])
113 |                         neg_item_id.append(neg_sample)
114 | 
115 |                 user_id = np.array(user_id).reshape(-1,1)
116 |                 pos_item_id = np.array(pos_item_id).reshape(-1,1)
117 |                 neg_item_id = np.array(neg_item_id).reshape(-1,1)
118 | 
119 |                 _ , loss = self.sess.run([self.optimizer , self.loss] ,
120 |                                          feed_dict = {self.X_user : user_id , self.X_pos_item : pos_item_id , self.X_neg_item : neg_item_id}
121 |                                          )
122 |                 total_loss += loss
123 | 
124 |             num_true = 0 #number of test cases whose positive item ranks within the top evaluation_at
125 |             for test in self.testing_data:
126 |                 result = self.sess.run(self.predict , feed_dict = {self.X_predict : [test[0]]})
127 |                 result = result.reshape(-1)
128 |                 if (result[[self.item_id_map[s] for s in random.sample(self.all_item , self.test_sample_size)]] > result[test[1]]).sum()+1 <= self.evaluation_at:
129 |                     num_true += 1
130 | 
131 |             print("epoch:%d , loss:%.2f , recall:%.2f"%(epoch , total_loss , num_true/len(self.testing_data)))
132 |             self.experiment.append([epoch , total_loss , num_true/len(self.testing_data)])
133 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # BPR-by-tensorflow
2 | 
3 | An implementation of Bayesian Personalized Ranking (BPR) in TensorFlow.
4 | 
5 | -----------------------------------------------------------------------------
6 | 
7 | ## Introduction
8 | Bayesian personalized ranking (BPR) is a popular method in recommender systems.
9 | In recent years, many recommender-system studies have combined deep learning models with the BPR training loss.
10 | 
11 | ## Dataset
12 | The dataset contains users' ratings of movies. It comes from Kaggle: [The Movies Dataset](https://www.kaggle.com/rounakbanik/the-movies-dataset).
13 | 
14 | ## Usage
15 | 
16 | Training tuple: each training example is a (user, positive item, negative item) triple, as sketched below.
17 | 
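For each observed `(userId, movieId)` pair, `BPR.negative_sample()` draws `train_sample_size` movies the user has never rated, and the model scores a user-item pair with the inner product of their embeddings while minimizing `-log sigmoid(score(user, item+) - score(user, item-))`. The snippet below is a minimal NumPy sketch of that per-triple loss; the vectors are random stand-ins rather than anything taken from this repo:

    import numpy as np

    def bpr_loss(user_vec, pos_vec, neg_vec):
        """-ln sigmoid(x_ui - x_uj): the quantity BPR.fit() minimizes per triple."""
        x_uij = np.dot(user_vec, pos_vec) - np.dot(user_vec, neg_vec)
        return -np.log(1.0 / (1.0 + np.exp(-x_uij)))

    # one training tuple: a user, an item they rated (positive), an unrated item (negative)
    rng = np.random.default_rng(0)
    user_vec, pos_vec, neg_vec = rng.normal(size=(3, 100))  # num_k = 100, as in BPR.py
    print(bpr_loss(user_vec, pos_vec, neg_vec))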
18 | 
19 | To train a model:
20 | 
21 |     $ python main.py
22 | 
23 | ## Acknowledgement
24 | This is a TensorFlow implementation of [BPR: Bayesian Personalized Ranking from Implicit Feedback. Steffen Rendle, Christoph Freudenthaler, Zeno Gantner and Lars Schmidt-Thieme, Proc. UAI 2009](https://arxiv.org/ftp/arxiv/papers/1205/1205.2618.pdf).
25 | 
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 | 
4 | from preprocessor import Data_preprocessor
5 | from BPR import BPR
6 | import pandas as pd
7 | 
8 | __author__ = "Bo-Syun Cheng"
9 | __email__ = "k12s35h813g@gmail.com"
10 | 
11 | if __name__ == "__main__":
12 |     data = pd.read_csv('ratings_small.csv')
13 |     dp = Data_preprocessor(data)
14 |     processed_data = dp.preprocess()
15 | 
16 |     bpr = BPR(processed_data)
17 |     bpr.fit()
18 | 
--------------------------------------------------------------------------------
/preprocessor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 | 
4 | import pandas as pd
5 | import numpy as np
6 | 
7 | __author__ = "Bo-Syun Cheng"
8 | __email__ = "k12s35h813g@gmail.com"
9 | 
10 | 
11 | # In[2]:
12 | 
13 | 
14 | class Data_preprocessor():
15 |     def __init__(self,data,filter_user=1,filter_item=5):
16 |         self.data = data
17 |         self.filter_user = filter_user
18 |         self.filter_item = filter_item
19 | 
20 |     def preprocess(self):
21 |         self.filter_()
22 |         return self.train_test_split()
23 |     def filter_(self):
24 |         """
25 |         Filter out users whose sessions are too short and items with too few ratings.
26 | 
27 |         :param filter_user: users whose session length is not greater than this value are removed, default=1
28 |         :param filter_item: items rated this many times or fewer are removed, default=5
29 |         :return: None (self.data is filtered in place)
30 |         """
31 |         session_lengths = self.data.groupby('userId').size()
32 |         self.data = self.data[np.in1d(self.data['userId'], session_lengths[session_lengths>self.filter_user].index)] #drop users whose session length is not greater than filter_user
33 |         print("remaining data : %d"%(len(self.data)))
34 |         item_supports = self.data.groupby('movieId').size() #count how many users rated each item
35 |         self.data = self.data[np.in1d(self.data['movieId'], item_supports[item_supports>self.filter_item].index)] #drop items rated filter_item times or fewer
36 |         print("remaining data : %d"%(len(self.data)))
37 |         """filter single-click users again, because removing items may create new single-click users"""
38 |         session_lengths = self.data.groupby('userId').size()
39 |         self.data = self.data[np.in1d(self.data['userId'], session_lengths[session_lengths>self.filter_user].index)]
40 |         print("remaining data : %d"%(len(self.data)))
41 |     def train_test_split(self,time_range=86400):
42 |         """
43 |         Split the data into training and test sets.
44 | 
45 |         :param time_range: sessions whose last event falls within this window before the global maximum timestamp become test data, default=86400 (1 day)
46 |         :return: the training dataframe (the test split is only used for the statistics printed below)
47 |         """
48 |         tmax = self.data['timestamp'].max()
49 |         session_tmax = self.data.groupby('userId')['timestamp'].max()
50 |         train = self.data[np.in1d(self.data['userId'] , session_tmax[session_tmax<=tmax - time_range].index)]
51 |         test = self.data[np.in1d(self.data['userId'] , session_tmax[session_tmax>tmax - time_range].index)]
52 |         print("training set stats: #sessions:%d , #items:%d , #events:%d"%(train['userId'].nunique(),train['movieId'].nunique(),len(train)))
53 |         """
54 |         Because collaborative filtering cannot score unseen items, items that appear in the test data but not in the training data are removed.
55 |         """
56 |         test = test[np.in1d(test['movieId'], train['movieId'])]
57 |         tslength = test.groupby('userId').size()
58 |         test = test[np.in1d(test['userId'], tslength[tslength>=2].index)]
59 |         print("test set stats: #sessions:%d , #items:%d , #events:%d"%(test['userId'].nunique(),test['movieId'].nunique(),len(test)))
60 | 
61 |         return train
62 | 
--------------------------------------------------------------------------------