├── .gitignore
├── Basic-DeepFM-model
│   ├── .idea
│   │   ├── Basic-DeepFM-model.iml
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   └── workspace.xml
│   ├── .ipynb_checkpoints
│   │   └── DeepFM-StepByStep-checkpoint.ipynb
│   ├── DataReader.py
│   ├── DeepFM-StepByStep.ipynb
│   ├── DeepFM.py
│   ├── __pycache__
│   │   ├── DataReader.cpython-36.pyc
│   │   ├── DeepFM.cpython-36.pyc
│   │   ├── config.cpython-36.pyc
│   │   └── metrics.cpython-36.pyc
│   ├── config.py
│   ├── data
│   │   ├── test.csv
│   │   └── train.csv
│   ├── fig
│   │   ├── DNN.png
│   │   ├── DeepFM.png
│   │   └── FM.png
│   ├── main.py
│   ├── metrics.py
│   └── output
│       ├── DNN_Mean-0.31183_Std0.29369.csv
│       ├── DNN_Mean0.13436_Std0.06001.csv
│       ├── DNN_Mean0.13817_Std0.06173.csv
│       ├── DeepFM_Mean-0.11470_Std0.37335.csv
│       ├── DeepFM_Mean0.01434_Std0.10176.csv
│       ├── DeepFM_Mean0.05735_Std0.20027.csv
│       ├── DeepFM_Mean0.06921_Std0.06395.csv
│       ├── DeepFM_Mean0.17892_Std0.01572.csv
│       ├── DeepFM_Mean0.26137_Std0.00210.csv
│       ├── FM_Mean0.15581_Std0.02785.csv
│       ├── FM_Mean0.19988_Std0.03441.csv
│       └── FM_Mean0.23297_Std0.05576.csv
├── README.md
├── SVD_jia_jia_demo
│   ├── .idea
│   │   ├── SVD_jia_jia_demo.iml
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   └── workspace.xml
│   └── main.py
├── SVD_recom_demo
│   └── main.py
├── recommendation-FFM-Demo
│   ├── .idea
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   ├── recommendation-FFM-Demo.iml
│   │   └── workspace.xml
│   ├── FFM_model.py
│   └── TFModel
│       ├── FFM-0.data-00000-of-00001
│       ├── FFM-0.index
│       ├── FFM-0.meta
│       └── checkpoint
└── recommendation-FM-demo
    ├── .idea
    │   ├── misc.xml
    │   ├── modules.xml
    │   ├── recommendation-FM-demo.iml
    │   └── workspace.xml
    ├── FM_data.rar
    ├── FM_model.py
    └── data
        ├── ua.base
        └── ua.test

/.gitignore:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/isthegoal/recommendation_model_master/77c2117631e779527b92e04dc53358a88abbbbbc/.gitignore
--------------------------------------------------------------------------------
/SVD_jia_jia_demo/main.py:
--------------------------------------------------------------------------------
# -*- encoding:utf-8 -*-
# author: wangle

import numpy as np
import random

'''
See the write-up at https://zhuanlan.zhihu.com/p/42269534 for background,
though training turns out to be quite slow.

SVD++ is a further refinement of SVD. The main change is in how the interest
(rating) matrix is estimated: compared with the plain SVD prediction, more
feature terms are folded in as additional evidence. Specifically, it adds an
implicit-feedback vector per item (computed, trained, and updated in getY)
together with the per-user set of items that user has interacted with.

The adjustment relative to plain SVD is essentially this prediction formula:
    rating = self.avg + self.bi[iid] + self.bu[uid] + np.sum(self.qi[iid] * (self.pu[uid] + u_impl_prf))
For the reasoning behind the added terms, see
https://blog.csdn.net/winone361/article/details/49427627 and
https://blog.csdn.net/akiyamamio11/article/details/79313339
'''

'''
author: huang
SVD++ algorithm
'''
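# For reference (standard SVD++ notation; this note is not part of the original file):
# the prediction implemented below is
#     r_hat(u, i) = mu + b_i + b_u + q_i^T * (p_u + |N(u)|^(-1/2) * sum_{j in N(u)} y_j)
# where N(u) is the set of items user u has rated, p_u and q_i are the latent user and
# item factors, and the y_j are the implicit-feedback item factors accumulated in getY().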

class SVDPP:
    def __init__(self, mat, K=20):
        self.mat = np.array(mat)
        self.K = K
        self.bi = {}
        self.bu = {}
        self.qi = {}
        self.pu = {}
        self.avg = np.mean(self.mat[:, 2])
        self.y = {}
        self.u_dict = {}
        for i in range(self.mat.shape[0]):
            uid = self.mat[i, 0]
            iid = self.mat[i, 1]
            self.u_dict.setdefault(uid, [])
            self.u_dict[uid].append(iid)
            self.bi.setdefault(iid, 0)
            self.bu.setdefault(uid, 0)
            self.qi.setdefault(iid, np.random.random((self.K, 1)) / 10 * np.sqrt(self.K))
            self.pu.setdefault(uid, np.random.random((self.K, 1)) / 10 * np.sqrt(self.K))
            self.y.setdefault(iid, np.zeros((self.K, 1)) + .1)

    def predict(self, uid, iid):  # predict a rating
        # setdefault creates bi, bu, qi, pu and the rated-item list u_dict with initial
        # value 0 whenever the user or item has not been seen before
        self.bi.setdefault(iid, 0)
        self.bu.setdefault(uid, 0)
        self.qi.setdefault(iid, np.zeros((self.K, 1)))
        self.pu.setdefault(uid, np.zeros((self.K, 1)))
        self.y.setdefault(iid, np.zeros((self.K, 1)))  # keyed by item id; the original used uid here, which looks like a typo since y is an item table
        self.u_dict.setdefault(uid, [])
        u_impl_prf, sqrt_Nu = self.getY(uid, iid)
        rating = self.avg + self.bi[iid] + self.bu[uid] + np.sum(self.qi[iid] * (self.pu[uid] + u_impl_prf))  # prediction formula
        # ratings range from 1 to 5, so clamp anything above 5 or below 1
        if rating > 5:
            rating = 5
        if rating < 1:
            rating = 1
        return rating

    # compute sqrt_Nu and the sum of y_j -- the core of SVD++
    def getY(self, uid, iid):
        Nu = self.u_dict[uid]
        I_Nu = len(Nu)
        sqrt_Nu = np.sqrt(I_Nu)
        y_u = np.zeros((self.K, 1))
        if I_Nu == 0:
            u_impl_prf = y_u
        else:
            for i in Nu:
                y_u += self.y[i]
            u_impl_prf = y_u / sqrt_Nu

        return u_impl_prf, sqrt_Nu

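    # For reference (standard SVD++ SGD derivation; this note is not part of the original
    # file): each update in train() below is a gradient step on the regularized squared error
    #     L = (r_ui - r_hat_ui)^2 + Lambda * (bu^2 + bi^2 + ||pu||^2 + ||qi||^2 + sum_j ||y_j||^2)
    # e.g. dL/d(qi) = -2 * e * (pu + u_impl_prf) + 2 * Lambda * qi, which (absorbing the
    # factor 2 into gamma) gives qi += gamma * (e * (pu + u_impl_prf) - Lambda * qi).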
    def train(self, steps=30, gamma=0.04, Lambda=0.15):  # training loop; steps is the number of iterations
        print('train data size', self.mat.shape)
        for step in range(steps):
            print('step', step + 1, 'is running')
            KK = np.random.permutation(self.mat.shape[0])  # stochastic gradient descent; KK is a random shuffle of the row indices
            rmse = 0.0
            for i in range(self.mat.shape[0]):
                j = KK[i]
                uid = self.mat[j, 0]
                iid = self.mat[j, 1]
                rating = self.mat[j, 2]
                predict = self.predict(uid, iid)
                u_impl_prf, sqrt_Nu = self.getY(uid, iid)
                eui = rating - predict
                rmse += eui ** 2
                self.bu[uid] += gamma * (eui - Lambda * self.bu[uid])
                self.bi[iid] += gamma * (eui - Lambda * self.bi[iid])
                self.pu[uid] += gamma * (eui * self.qi[iid] - Lambda * self.pu[uid])
                self.qi[iid] += gamma * (eui * (self.pu[uid] + u_impl_prf) - Lambda * self.qi[iid])
                for j in self.u_dict[uid]:
                    self.y[j] += gamma * (eui * self.qi[j] / sqrt_Nu - Lambda * self.y[j])

            gamma = 0.93 * gamma  # decay the learning rate by a factor of 0.93 each step
            print('rmse is', np.sqrt(rmse / self.mat.shape[0]))

    def test(self, test_data):
        test_data = np.array(test_data)
        print('test data size', test_data.shape)
        rmse = 0.0
        for i in range(test_data.shape[0]):
            uid = test_data[i, 0]
            iid = test_data[i, 1]
            rating = test_data[i, 2]
            eui = rating - self.predict(uid, iid)
            rmse += eui ** 2
        print('rmse of test data is', np.sqrt(rmse / test_data.shape[0]))


def getMLData():  # load the training and test sets
    import re
    f = open("../data/ml-100k/u1.base", 'r')
    lines = f.readlines()
    f.close()
    data = []
    for line in lines:
        fields = re.split('\t|\n', line)
        if int(fields[2]) != 0:
            data.append([int(i) for i in fields[:3]])
    train_data = data
    f = open("../data/ml-100k/u1.test", 'r')
    lines = f.readlines()
    f.close()
    data = []
    for line in lines:
        fields = re.split('\t|\n', line)
        if int(fields[2]) != 0:
            data.append([int(i) for i in fields[:3]])
    test_data = data

    return train_data, test_data


train_data, test_data = getMLData()
a = SVDPP(train_data, 30)
a.train()
a.test(test_data)
--------------------------------------------------------------------------------
/SVD_recom_demo/main.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import numpy as np
import time
import pickle
'''
See the write-up at https://zhuanlan.zhihu.com/p/42147194 for background.
This SVD is essentially a simplified version of LFM.
'''
class SVD(object):
    """
    implementation of SVD for CF (https://zhuanlan.zhihu.com/p/42147194/)
    Reference:
    A Guide to Singular Value Decomposition for Collaborative Filtering

    Core idea: when predicting user u's rating of item i, SVD combines a user feature
    vector and an item feature vector with bias terms that measure how far the user and
    the item deviate from the global average rating.

    This is essentially the same as the LFM algorithm; the heart of it is the formula
    below. The interest matrix is approximated through learned parameters -- here the
    U (user) matrix, the M (item) matrix, and the biases bu and bi -- and everything
    else follows the usual training procedure for a predicted-interest model. No
    negative sampling is needed here, because the known rating values serve directly
    as labels.

    In essence the model learns the user feature matrix U and the item feature matrix M.
    The added bi and bu terms supply exactly the information that pure coordinate lookup
    misses: bi is item i's deviation from the average rating, and bu is user u's
    deviation from the average rating -- a kind of complementary signal.

    p = self.meanV + self.bu[uid] + self.bi[iid] + np.sum(self.U[uid] * self.M[iid])
    """

    def __init__(self, epoch, eta, userNums, itemNums, ku=0.001, km=0.001, f=30, save_model=False):
        super(SVD, self).__init__()
        self.epoch = epoch
        self.userNums = userNums
        self.itemNums = itemNums
        self.eta = eta
        self.ku = ku
        self.km = km
        self.f = f
        self.save_model = save_model

        self.U = None
        self.M = None

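    # For reference (standard regularized MF derivation; this note is not part of the
    # original file): fit() below does SGD on the regularized squared error
    #     L = (vij - p)^2 + ku * (||U_u||^2 + bu^2) + km * (||M_i||^2 + bi^2)
    # so, absorbing the constant factor 2 into eta, the gradient steps are
    #     U_u += eta * (error * M_i - ku * U_u),   M_i += eta * (error * U_u - km * M_i)
    # with analogous updates for the biases bu and bi.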
    def fit(self, train, val=None):
        rateNums = train.shape[0]
        self.meanV = np.sum(train[:, 2]) / rateNums
        initv = np.sqrt((self.meanV - 1) / self.f)
        self.U = initv + np.random.uniform(-0.01, 0.01, (self.userNums + 1, self.f))
        self.M = initv + np.random.uniform(-0.01, 0.01, (self.itemNums + 1, self.f))
        self.bu = np.zeros(self.userNums + 1)
        self.bi = np.zeros(self.itemNums + 1)

        start = time.time()
        for i in range(self.epoch):
            sumRmse = 0.0
            for sample in train:
                uid = sample[0]
                iid = sample[1]
                vij = float(sample[2])
                # p(U_i, M_j) = mu + b_i + b_u + U_i^T M_j
                p = self.meanV + self.bu[uid] + self.bi[iid] + \
                    np.sum(self.U[uid] * self.M[iid])
                error = vij - p
                sumRmse += error ** 2
                # gradients of Ui and Mj
                deltaU = error * self.M[iid] - self.ku * self.U[uid]
                deltaM = error * self.U[uid] - self.km * self.M[iid]
                # parameter updates
                self.U[uid] += self.eta * deltaU
                self.M[iid] += self.eta * deltaM

                self.bu[uid] += self.eta * (error - self.ku * self.bu[uid])
                self.bi[iid] += self.eta * (error - self.km * self.bi[iid])

            trainRmse = np.sqrt(sumRmse / rateNums)

            if val is not None and val.any():  # guard against val=None; the original called val.any() unconditionally
                _, valRmse = self.evaluate(val)
                print("Epoch %d cost time %.4f, train RMSE: %.4f, validation RMSE: %.4f" % \
                      (i, time.time() - start, trainRmse, valRmse))
            else:
                print("Epoch %d cost time %.4f, train RMSE: %.4f" % \
                      (i, time.time() - start, trainRmse))

        if self.save_model:
            save_model = '../data'
            model = (self.meanV, self.bu, self.bi, self.U, self.M)
            pickle.dump(model, open(save_model + '/svcRecModel.pkl', 'wb'))

    def evaluate(self, val):
        '''
        Use the user id and item id to look up the learned parameters and compute the
        predicted score, i.e. the estimated interest level.
        '''
        loss = 0
        pred = []
        for sample in val:
            uid = sample[0]
            iid = sample[1]
            if uid > self.userNums or iid > self.itemNums:
                continue

            predi = self.meanV + self.bu[uid] + self.bi[iid] \
                    + np.sum(self.U[uid] * self.M[iid])
            if predi < 1:
                predi = 1
            elif predi > 5:
                predi = 5
            pred.append(predi)

            if val.shape[1] == 3:
                vij = sample[2]
                loss += (predi - vij) ** 2

        if val.shape[1] == 3:
            rmse = np.sqrt(loss / val.shape[0])
            return pred, rmse

        return pred

    def predict(self, test):
        return self.evaluate(test)


def test():
    import pandas as pd
    data_path = '../data/ml-1m/ratings.dat'
    data = pd.read_csv(data_path, sep='::', header=None, names=['user', 'item', 'rate', 'time'], engine='python')

    data = data.sample(frac=1)
    print(data.head())

    del data['time']
    trainNum = int(data.shape[0] * 0.8)
    train = data[:trainNum].values
    val = data[trainNum:].values

    userNums = data['user'].max()
    itemNums = data['item'].max()

    svd = SVD(35, 0.001, userNums, itemNums, f=50)
    svd.fit(train, val=val)
    svd.predict(val)


if __name__ == '__main__':
    test()
--------------------------------------------------------------------------------
/recommendation-FM-demo/FM_data.rar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/isthegoal/recommendation_model_master/77c2117631e779527b92e04dc53358a88abbbbbc/recommendation-FM-demo/FM_data.rar
--------------------------------------------------------------------------------
/recommendation-FM-demo/FM_model.py:
--------------------------------------------------------------------------------
# -*- encoding:utf-8 -*-
from itertools import count
from collections import defaultdict
from scipy.sparse import csr
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
import tensorflow as tf
from tqdm import tqdm_notebook as tqdm

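# A minimal sketch of the encoding idea (illustrative note with made-up numbers; not part
# of the original file): vectorize_dic() below turns {'users': [...], 'items': [...]} into
# a sparse design matrix with one row per rating event and one non-zero entry per field,
# so a pair (user=1, item=42) becomes a row whose only non-zeros sit in the columns that
# stand for "user 1" and "item 42" -- the usual one-hot style input layout for an FM.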

def vectorize_dic(dic, ix=None, p=None, n=0, g=0):
    """
    dic -- dictionary of feature lists. Keys are the name of features
    ix -- index generator (default None)
    p -- dimension of feature space (number of columns in the sparse matrix) (default None)

    Feeding the raw dict straight into the network would be a poor fit, so this extracts
    features from the ids and builds a vectorized representation of them.

    It converts the dict of feature lists into a sparse matrix encoding. The way the plain
    ids get vectorized is interesting: before compression, the meaningful matrix is the
    occurrence-count matrix, with all-zero and all-one arrays also taking part in
    assembling the sparse matrix, e.g.:
    csr.csr_matrix: [[0. 2. 0. ... 0. 0. 0.]
                     [0. 1. 1. ... 0. 0. 0.]
                     [0. 1. 0. ... 0. 0. 0.]
                     ...
                     [0. 0. 0. ... 0. 0. 0.]
                     [0. 0. 0. ... 0. 0. 0.]
                     [0. 0. 0. ... 0. 0. 0.]]
    The column indices come from a simple running count of occurrences:
        for k, lis in dic.items():
            for t in range(len(lis)):
                ix[str(lis[t]) + str(k)] = ix.get(str(lis[t]) + str(k), 0) + 1
                col_ix[i + t * g] = ix[str(lis[t]) + str(k)]
            i += 1

    To compress a sparse np.array one typically uses scipy's sparse.csr_matrix
    (csr: Compressed Sparse Row matrix) or sparse.csc_matrix (csc: Compressed Sparse
    Column matrix).

    So in short: this turns the dict-of-features storage into a 2-D matrix plus matrix
    compression, converting id indices into vector rows. The vectors then simply go into
    the model for training; since FM is a linear-style model the network handling is
    straightforward, and fields are not treated separately here.
    """
    print('dic:', dic)
    if ix is None:
        ix = dict()

    nz = n * g

    col_ix = np.empty(nz, dtype=int)

    i = 0
    for k, lis in dic.items():
        for t in range(len(lis)):
            ix[str(lis[t]) + str(k)] = ix.get(str(lis[t]) + str(k), 0) + 1
            col_ix[i + t * g] = ix[str(lis[t]) + str(k)]
        i += 1

    row_ix = np.repeat(np.arange(0, n), g)
    data = np.ones(nz)
    if p is None:
        p = len(ix)

    ixx = np.where(col_ix < p)  # row_ix[ixx] and col_ix[ixx] together define the count matrix built by the running accumulation above: one row per sample, one column per accumulated field value
    print('csr.csr_matrix:', csr.csr_matrix((data[ixx], (row_ix[ixx], col_ix[ixx])), shape=(n, p)).todense())  # the original printed c[ixx] here, but no c exists; data[ixx] matches the return value below
    print('csr.csr_matrix shape:', csr.csr_matrix((data[ixx], (row_ix[ixx], col_ix[ixx])), shape=(n, p)).todense().shape)
    print('data[ixx] shape :', data)
    print('row_ix[ixx] shape :', row_ix)
    print('col_ix[ixx] shape :', col_ix)

    return csr.csr_matrix((data[ixx], (row_ix[ixx], col_ix[ixx])), shape=(n, p)), ix


def batcher(X_, y_=None, batch_size=-1):
    n_samples = X_.shape[0]

    if batch_size == -1:
        batch_size = n_samples
    if batch_size < 1:
        raise ValueError('Parameter batch_size={} is unsupported'.format(batch_size))

    for i in range(0, n_samples, batch_size):
        upper_bound = min(i + batch_size, n_samples)
        ret_x = X_[i:upper_bound]
        ret_y = None
        if y_ is not None:
            ret_y = y_[i:i + batch_size]
        yield (ret_x, ret_y)


cols = ['user', 'item', 'rating', 'timestamp']

train = pd.read_csv('data/ua.base', delimiter='\t', names=cols)
test = pd.read_csv('data/ua.test', delimiter='\t', names=cols)

x_train, ix = vectorize_dic({'users': train['user'].values,
                             'items': train['item'].values}, n=len(train.index), g=2)
print('x_train:', x_train)

x_test, ix = vectorize_dic({'users': test['user'].values,
                            'items': test['item'].values}, ix, x_train.shape[1], n=len(test.index), g=2)

print(x_train)
y_train = train['rating'].values
y_test = test['rating'].values

x_train = x_train.todense()
x_test = x_test.todense()

print('x_train todense:', x_train)

print(x_train.shape)
print(x_test.shape)

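# For reference (the O(k*p) reformulation from Rendle's FM paper; this note is not part of
# the original file): pair_interactions below relies on the identity
#     sum_{i<j} <v_i, v_j> * x_i * x_j
#         = 0.5 * sum_f [ (sum_i v_{i,f} * x_i)^2 - sum_i v_{i,f}^2 * x_i^2 ]
# which is why the code squares x @ v^T and subtracts x^2 @ (v^2)^T instead of looping
# over all feature pairs.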

n, p = x_train.shape

k = 10

x = tf.placeholder('float', [None, p])
y = tf.placeholder('float', [None, 1])

w0 = tf.Variable(tf.zeros([1]))
w = tf.Variable(tf.zeros([p]))

v = tf.Variable(tf.random_normal([k, p], mean=0, stddev=0.01))

# y_hat = tf.Variable(tf.zeros([n, 1]))

linear_terms = tf.add(w0, tf.reduce_sum(tf.multiply(w, x), 1, keep_dims=True))  # n * 1
pair_interactions = 0.5 * tf.reduce_sum(
    tf.subtract(
        tf.pow(
            tf.matmul(x, tf.transpose(v)), 2),
        tf.matmul(tf.pow(x, 2), tf.transpose(tf.pow(v, 2)))
    ), axis=1, keep_dims=True)

y_hat = tf.add(linear_terms, pair_interactions)

lambda_w = tf.constant(0.001, name='lambda_w')
lambda_v = tf.constant(0.001, name='lambda_v')

l2_norm = tf.reduce_sum(
    tf.add(
        tf.multiply(lambda_w, tf.pow(w, 2)),
        tf.multiply(lambda_v, tf.pow(v, 2))
    )
)

error = tf.reduce_mean(tf.square(y - y_hat))
loss = tf.add(error, l2_norm)


train_op = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(loss)


epochs = 10
batch_size = 1000

# Launch the graph
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)

    for epoch in tqdm(range(epochs), unit='epoch'):
        perm = np.random.permutation(x_train.shape[0])  # random shuffle; np.random.permutation(x) returns a new array and leaves x unchanged
        # iterate over batches
        for bX, bY in batcher(x_train[perm], y_train[perm], batch_size):
            _, t = sess.run([train_op, loss], feed_dict={x: bX.reshape(-1, p), y: bY.reshape(-1, 1)})
            print(t)

    errors = []
    for bX, bY in batcher(x_test, y_test):
        errors.append(sess.run(error, feed_dict={x: bX.reshape(-1, p), y: bY.reshape(-1, 1)}))
    print(errors)
    RMSE = np.sqrt(np.array(errors).mean())
    print(RMSE)
--------------------------------------------------------------------------------