├── .idea └── vcs.xml ├── AFM └── AFM.py ├── CollaborativeFiltering.ipynb ├── DCN ├── DCN-keras.ipynb ├── DCN-tf2.0.ipynb ├── DCN-tf2.0.py └── DCN.ipynb ├── GBDT_LR.ipynb ├── MLR.ipynb ├── NFM └── NFM.py ├── PNN ├── PNN-tf2.0.ipynb └── PNN.py ├── README.md ├── Wide-Deep ├── Wide-Deep.ipynb ├── Wide-Deep.py └── data_process.py ├── data ├── Criteo │ ├── data_process.py │ └── train.txt └── Driver │ └── train.csv ├── embedding.ipynb └── util └── train_model.py /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /AFM/AFM.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | TensorFlow 2.0 implementation of AFM 4 | Reference: 5 | https://www.jianshu.com/p/83d3b2a1e55d 6 | Attentional Factorization Machines: 7 | Learning the Weight of Feature Interactions via Attention Networks 8 | """ 9 | import tensorflow as tf 10 | 11 | import pickle 12 | from util.train_model import train_test_model_demo 13 | 14 | 15 | class AttentionNet(tf.keras.layers.Layer): 16 | def __init__(self, embedding_size=10,attention_size=3, **kwargs): 17 | self.embedding_size = embedding_size 18 | self.attention_size = attention_size 19 | super(AttentionNet, self).__init__(**kwargs) 20 | 21 | def build(self, input_shape): 22 | input_dim = input_shape[2] 23 | 24 | self.linearlayer = tf.keras.layers.Dense(input_dim, activation='relu', use_bias=True) 25 | self.attention_w = self.add_weight(shape=(self.embedding_size,self.attention_size), 26 | initializer='random_normal',trainable=True) 27 | self.attention_b = self.add_weight(shape=(self.attention_size,), 28 | initializer='random_normal',trainable=True) 29 | self.attention_h = self.add_weight(shape=(self.attention_size,), 30 | initializer='random_normal',trainable=True) 31 | self.attention_p = self.add_weight(shape=(self.embedding_size,1), 32 | initializer='ones',trainable=True) 33 | 34 | def call(self, input): 35 | # element_wise 36 | num_feat = input.shape[1] 37 | element_wise_product_list = [] 38 | for i in range(num_feat): 39 | for j in range(i+1,num_feat): 40 | element_wise_product_list.append(tf.multiply(input[:,i,:],input[:,j,:])) # None * embedding_size 41 | self.element_wise_product = tf.stack(element_wise_product_list) # (F * F - 1 / 2) * None * embedding_size 42 | self.element_wise_product = tf.transpose(self.element_wise_product,perm=[1,0,2],name='element_wise_product') # None * (F * F - 1 / 2) * embedding_size 43 | print("element_wise_product",self.element_wise_product.get_shape()) 44 | # attention part 45 | num_interaction = int(num_feat*(num_feat-1)/2) 46 | # wx+b->relu(wx+b)->h*relu(wx+b) 47 | self.attention_wx_plus_b = tf.reshape(tf.add(tf.matmul(tf.reshape(self.element_wise_product,shape=(-1,self.embedding_size)), 48 | self.attention_w),self.attention_b),shape = [-1,num_interaction,self.attention_size]) # N *(F*F-1/2)*1 49 | self.attention_exp = tf.exp(tf.reduce_sum(tf.multiply(tf.nn.relu(self.attention_wx_plus_b), 50 | self.attention_h),axis=2))# N * ( F * F - 1 / 2) * 1 51 | 52 | self.attention_exp_sum = tf.reshape(tf.reduce_sum(self.attention_exp,axis=1),shape=(-1,1)) # N * 1 * 1 53 | 54 | self.attention_out = tf.divide(self.attention_exp,self.attention_exp_sum,name='attention_out') # N * ( F * F - 1 / 2) * 1 55 | self.attention_x_product = tf.reduce_sum(tf.einsum('bn,bnm->bnm',self.attention_out,self.element_wise_product),axis=1,name='afm') # N * embedding_size 56 | 
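        # The attention weights above form a softmax over the F*(F-1)/2 pairwise
        # interactions: a_ij = softmax( h^T relu( W (v_i x_i * v_j x_j) + b ) ).
        # attention_x_product is the attention-weighted sum of those element-wise
        # products, shape None * embedding_size; the projection by attention_p on the
        # next line collapses it to the scalar AFM interaction term.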
self.attention_part_sum = tf.matmul(self.attention_x_product,self.attention_p) # N * 1 57 | 58 | return self.attention_part_sum 59 | 60 | class AFM(tf.keras.Model): 61 | def __init__(self, num_feat,embedding_size=10,attention_size=3): 62 | super().__init__() 63 | self.num_feat = num_feat # F features nums 字典数量 64 | self.embedding_size = embedding_size 65 | self.attention_size = attention_size 66 | # Embedding 这里采用embeddings层 因此大小为F* M F为field特征数量,N 为 feature的种类数 M为embedding的维度 67 | feat_embeddings = tf.keras.layers.Embedding(num_feat, embedding_size, 68 | embeddings_initializer='uniform') # N * embedding_size 69 | self.feat_embeddings = feat_embeddings 70 | self.attentionlayer = AttentionNet(self.embedding_size,self.attention_size) 71 | # linear part 72 | self.linearlayer = tf.keras.layers.Dense(1, activation='relu', use_bias=True) 73 | 74 | def call(self, feat_index, feat_value): 75 | # call函数接收输入变量 76 | # embedding part feat_index = inputs为输入 feat_embeddings为一个layer。 77 | feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * F * embedding_size 78 | feat_embedding = tf.einsum('bnm,bn->bnm', feat_embedding_0, feat_value) # # Batch * F * embedding_size 79 | feat_embedding_1 = tf.transpose(feat_embedding,perm=[0,2,1]) 80 | y_deep = self.attentionlayer(feat_embedding) 81 | 82 | y_linear = tf.reduce_sum(self.linearlayer(feat_embedding_1),axis=1) 83 | output = y_deep + y_linear 84 | return output 85 | if __name__ == '__main__': 86 | AID_DATA_DIR = "../data/Criteo/" 87 | feat_dict_ = pickle.load(open(AID_DATA_DIR + '/feat_dict_10.pkl2', 'rb')) 88 | 89 | afm = AFM(num_feat=len(feat_dict_) + 1,embedding_size=10,attention_size=3) 90 | 91 | train_label_path = AID_DATA_DIR + 'train_label' 92 | train_idx_path = AID_DATA_DIR + 'train_idx' 93 | train_value_path = AID_DATA_DIR + 'train_value' 94 | 95 | test_label_path = AID_DATA_DIR + 'test_label' 96 | test_idx_path = AID_DATA_DIR + 'test_idx' 97 | test_value_path = AID_DATA_DIR + 'test_value' 98 | 99 | train_test_model_demo(afm,train_label_path, train_idx_path, train_value_path) 100 | -------------------------------------------------------------------------------- /CollaborativeFiltering.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "movies = pd.read_csv(r\"F:\\baidudownload\\ml-20m\\ml-20m\\movies.csv\")" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 4, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "ratings = pd.read_csv(r'F:\\baidudownload\\ml-20m\\ml-20m\\ratings.csv')" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 5, 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "data": { 37 | "text/html": [ 38 | "
\n", 39 | "\n", 52 | "\n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | "
movieIdtitlegenres
01Toy Story (1995)Adventure|Animation|Children|Comedy|Fantasy
12Jumanji (1995)Adventure|Children|Fantasy
23Grumpier Old Men (1995)Comedy|Romance
34Waiting to Exhale (1995)Comedy|Drama|Romance
45Father of the Bride Part II (1995)Comedy
\n", 94 | "
" 95 | ], 96 | "text/plain": [ 97 | " movieId title \\\n", 98 | "0 1 Toy Story (1995) \n", 99 | "1 2 Jumanji (1995) \n", 100 | "2 3 Grumpier Old Men (1995) \n", 101 | "3 4 Waiting to Exhale (1995) \n", 102 | "4 5 Father of the Bride Part II (1995) \n", 103 | "\n", 104 | " genres \n", 105 | "0 Adventure|Animation|Children|Comedy|Fantasy \n", 106 | "1 Adventure|Children|Fantasy \n", 107 | "2 Comedy|Romance \n", 108 | "3 Comedy|Drama|Romance \n", 109 | "4 Comedy " 110 | ] 111 | }, 112 | "execution_count": 5, 113 | "metadata": {}, 114 | "output_type": "execute_result" 115 | } 116 | ], 117 | "source": [ 118 | "movies.head()" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 6, 124 | "metadata": {}, 125 | "outputs": [ 126 | { 127 | "data": { 128 | "text/html": [ 129 | "
\n", 130 | "\n", 143 | "\n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | "
[garbled HTML rendering of ratings.head() removed -- the same table follows as the text/plain output]
" 192 | ], 193 | "text/plain": [ 194 | " userId movieId rating timestamp\n", 195 | "0 1 2 3.5 1112486027\n", 196 | "1 1 29 3.5 1112484676\n", 197 | "2 1 32 3.5 1112484819\n", 198 | "3 1 47 3.5 1112484727\n", 199 | "4 1 50 3.5 1112484580" 200 | ] 201 | }, 202 | "execution_count": 6, 203 | "metadata": {}, 204 | "output_type": "execute_result" 205 | } 206 | ], 207 | "source": [ 208 | "ratings.head()" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 10, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "data = pd.merge(movies,ratings,on='movieId',how='left')" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 12, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "data[['userId','rating','movieId','title']].sort_values('userId').to_csv(r\"F:\\baidudownload\\ml-20m\\ml-20m\\data.csv\",index=False)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "### 采用python字典来表示每位用户评论的电影和评分" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 25, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "files = open(r\"F:\\baidudownload\\ml-20m\\ml-20m\\data.csv\",'r',encoding=\"UTF-8\")\n", 243 | "# 读取data文件中每行中除了名字的数据\n", 244 | "data = {} ## 存放每个用户评论的电影和评分\n", 245 | "for line in files.readlines():\n", 246 | " line = line.strip().split(',')\n", 247 | " # 如果字典中没有某位用户,则使用用户ID来创建这位用户\n", 248 | " if not line[0] in data.keys():\n", 249 | " data[line[0]] = {line[3]:line[1]} # 子字典\n", 250 | " else:\n", 251 | " data[line[0]][line[3]] = line[1]" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "## 计算任何两位用户之间的相似度,由于每位用户评论的电影不完全一样,所以兽先要找到两位用户共同评论过的电影然后计算两者之间的欧式距离,最后算出两者之间的相似度" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 16, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "from math import *" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 38, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "def Euclidean(user1,user2):\n", 277 | " # 取出两位用户评论过的电影和评分\n", 278 | " user1_data = data[user1]\n", 279 | " user2_data = data[user2]\n", 280 | " \n", 281 | " # 找到两位用户都评论过的电影,并计算两者的欧式距离\n", 282 | " for key in user1_data.keys():\n", 283 | " if key in user2_data.keys():\n", 284 | "# print(user1_data[key],user2_data[key])\n", 285 | " try:\n", 286 | " distance +=pow((float(user1_data[key])-float(user2_data[key])),2)\n", 287 | " except:\n", 288 | " print(\"error:\",user2_data[key])\n", 289 | " return 1/(1+sqrt(distance)) # 计算返回值越小,相似度越大" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 39, 295 | "metadata": {}, 296 | "outputs": [ 297 | { 298 | "name": "stdout", 299 | "output_type": "stream", 300 | "text": [ 301 | "error: \n", 302 | "[('17602.0', 0.037535053785096986), ('67346.0', 0.03923924660549805), ('116900.0', 0.03938151824124737), ('130390.0', 0.042373278587501804)]\n" 303 | ] 304 | } 305 | ], 306 | "source": [ 307 | "# 计算某个用户与其他用的相似度\n", 308 | "def top10_simliar(userID):\n", 309 | " res = []\n", 310 | " for userid in data.keys():\n", 311 | " # 排除自己计算相似度\n", 312 | " if not userid == userID:\n", 313 | " simliar = Euclidean(userID,userid)\n", 314 | " res.append((userid,simliar))\n", 315 | " res.sort(key=lambda val:val[1])# 按照相似度最大顺序排序\n", 316 | " return res[:4]\n", 317 | "RES = top10_simliar('1.0')\n", 318 | "print(RES)" 319 | ] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | 
"metadata": {}, 324 | "source": [ 325 | "## 根据相似度来推荐用户" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 45, 331 | "metadata": {}, 332 | "outputs": [ 333 | { 334 | "name": "stdout", 335 | "output_type": "stream", 336 | "text": [ 337 | "error: \n", 338 | "[('Good Will Hunting (1997)', '5.0'), ('Horton Hears a Who! (2008)', '5.0'), ('Billy Madison (1995)', '5.0'), ('Julie & Julia (2009)', '5.0'), ('Chocolat (2000)', '5.0'), ('Harry Potter and the Order of the Phoenix (2007)', '5.0'), ('\"Sisterhood of the Traveling Pants', '5.0'), ('\"Secret Life of Bees', '5.0'), ('Happy Gilmore (1996)', '5.0'), ('Big Daddy (1999)', '5.0')]\n" 339 | ] 340 | } 341 | ], 342 | "source": [ 343 | "def recommend(user):\n", 344 | " # 相似度最高用户\n", 345 | " top_sim_user = top10_simliar(user)[0][0]\n", 346 | " # 相似度最高用户的观影记录\n", 347 | " items = data[top_sim_user]\n", 348 | " recommendations = []\n", 349 | " # 筛选出该用户未观看的电影病添加到列表中\n", 350 | " for item in items.keys():\n", 351 | " if item not in data[user].keys():\n", 352 | " recommendations.append((item,items[item]))\n", 353 | " recommendations.sort(key=lambda val :val[1],reverse=True) # 按照评分排序\n", 354 | " # 返回评分最高的10部电影\n", 355 | " return recommendations[:10]\n", 356 | "Recommend = recommend('1.0')\n", 357 | "print(Recommend)" 358 | ] 359 | }, 360 | { 361 | "cell_type": "markdown", 362 | "metadata": {}, 363 | "source": [ 364 | "#### \n", 365 | "但有时我们会碰到因为两个用户之间数据由于数据膨胀,一方数据大,一方数据小,但是两者称明显的线性关系\n", 366 | "\n", 367 | "我们引入Pearson相关系数来衡量两个变量之间的线性相关性。\n", 368 | "\n", 369 | "Pearson:-1~1   -1:完全负相关  1:完全正相关  0:不相关              \n", 370 | "\n", 371 | "相关系数 0.8-1.0 极强相关\n", 372 | "\n", 373 | "0.6-0.8 强相关\n", 374 | "\n", 375 | "0.4-0.6 中等程度相关\n", 376 | "\n", 377 | "0.2-0.4 弱相关\n", 378 | "\n", 379 | "0.0-0.2 极弱相关或无相关\n", 380 | "\n", 381 | "公式:" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 61, 387 | "metadata": {}, 388 | "outputs": [ 389 | { 390 | "name": "stdout", 391 | "output_type": "stream", 392 | "text": [ 393 | "0.22531203182281434\n" 394 | ] 395 | } 396 | ], 397 | "source": [ 398 | "########################################################################\n", 399 | "##计算两用户之间的Pearson相关系数\n", 400 | "def pearson_sim(user1,user2):\n", 401 | " # 取出两位用户评论过的电影和评分\n", 402 | " user1_data = data[user1]\n", 403 | " user2_data = data[user2]\n", 404 | " distance = 0\n", 405 | " common = {}\n", 406 | " \n", 407 | " # 找到两位用户都评论过的电影\n", 408 | " for key in user1_data.keys():\n", 409 | " if key in user2_data.keys():\n", 410 | " common[key] = 1\n", 411 | " if len(common) == 0:\n", 412 | " return 0#如果没有共同评论过的电影,则返回0\n", 413 | " n = len(common)#共同电影数目\n", 414 | "# print(n,common)\n", 415 | " \n", 416 | " ##计算评分和\n", 417 | " try:\n", 418 | " sum1 = sum([float(user1_data[movie]) for movie in common])\n", 419 | " sum2 = sum([float(user2_data[movie]) for movie in common])\n", 420 | "\n", 421 | " ##计算评分平方和\n", 422 | " sum1Sq = sum([pow(float(user1_data[movie]),2) for movie in common])\n", 423 | " sum2Sq = sum([pow(float(user2_data[movie]),2) for movie in common])\n", 424 | "\n", 425 | " ##计算乘积和\n", 426 | " PSum = sum([float(user1_data[it])*float(user2_data[it]) for it in common])\n", 427 | " \n", 428 | " ##计算相关系数\n", 429 | " num = PSum - (sum1*sum2/n)\n", 430 | " den = sqrt((sum1Sq-pow(sum1,2)/n)*(sum2Sq-pow(sum2,2)/n))\n", 431 | " except:\n", 432 | " den = 999\n", 433 | " num = 0\n", 434 | " print('error:') \n", 435 | " if den == 0:\n", 436 | " return 0\n", 437 | " r = num/den\n", 438 | " return r\n", 439 | " \n", 440 | "R = 
pearson_sim('1.0','3.0')\n", 441 | "print(R)" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": 63, 447 | "metadata": {}, 448 | "outputs": [], 449 | "source": [ 450 | "# 计算某个用户与其他用的相似度\n", 451 | "def top10_simliar(userID):\n", 452 | " res = []\n", 453 | " for userid in data.keys():\n", 454 | " # 排除自己计算相似度\n", 455 | " if not userid == userID:\n", 456 | " simliar = pearson_sim(userID,userid)\n", 457 | " res.append((userid,simliar))\n", 458 | " res.sort(key=lambda val:val[1])# 按照相似度最大顺序排序\n", 459 | " return res[-4:]" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": 64, 465 | "metadata": {}, 466 | "outputs": [ 467 | { 468 | "name": "stdout", 469 | "output_type": "stream", 470 | "text": [ 471 | "error:\n", 472 | "[('79721.0', 1.000000000000017), ('60581.0', 1.0000000000000187), ('83906.0', 1.0000000000000213), ('103682.0', 1.0000000000000255)]\n" 473 | ] 474 | } 475 | ], 476 | "source": [ 477 | "RES = top10_simliar('1.0')\n", 478 | "print(RES)" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": 65, 484 | "metadata": {}, 485 | "outputs": [ 486 | { 487 | "name": "stdout", 488 | "output_type": "stream", 489 | "text": [ 490 | "error:\n", 491 | "[('\"Italian Job', '5.0'), ('\"Clockwork Orange', '5.0'), ('RocknRolla (2008)', '5.0'), ('No Country for Old Men (2007)', '5.0'), ('21 Grams (2003)', '5.0'), ('Layer Cake (2004)', '5.0'), ('Seven Pounds (2008)', '5.0'), ('Trainspotting (1996)', '5.0'), (\"Carlito's Way (1993)\", '5.0'), ('Crash (2004)', '5.0')]\n" 492 | ] 493 | } 494 | ], 495 | "source": [ 496 | "def recommend(user):\n", 497 | " # 相似度最高用户\n", 498 | " top_sim_user = top10_simliar(user)[0][0]\n", 499 | " # 相似度最高用户的观影记录\n", 500 | " items = data[top_sim_user]\n", 501 | " recommendations = []\n", 502 | " # 筛选出该用户未观看的电影病添加到列表中\n", 503 | " for item in items.keys():\n", 504 | " if item not in data[user].keys():\n", 505 | " recommendations.append((item,items[item]))\n", 506 | " recommendations.sort(key=lambda val :val[1],reverse=True) # 按照评分排序\n", 507 | " # 返回评分最高的10部电影\n", 508 | " return recommendations[:10]\n", 509 | "Recommend = recommend('1.0')\n", 510 | "print(Recommend)" 511 | ] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "execution_count": null, 516 | "metadata": {}, 517 | "outputs": [], 518 | "source": [] 519 | } 520 | ], 521 | "metadata": { 522 | "kernelspec": { 523 | "display_name": "Python 3", 524 | "language": "python", 525 | "name": "python3" 526 | }, 527 | "language_info": { 528 | "codemirror_mode": { 529 | "name": "ipython", 530 | "version": 3 531 | }, 532 | "file_extension": ".py", 533 | "mimetype": "text/x-python", 534 | "name": "python", 535 | "nbconvert_exporter": "python", 536 | "pygments_lexer": "ipython3", 537 | "version": "3.6.5" 538 | } 539 | }, 540 | "nbformat": 4, 541 | "nbformat_minor": 1 542 | } 543 | -------------------------------------------------------------------------------- /DCN/DCN-tf2.0.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "D:\\anaconda3\\julianxu\\lib\\site-packages\\h5py\\__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. 
In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", 13 | " from ._conv import register_converters as _register_converters\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import numpy as np\n", 19 | "import pandas as pd\n", 20 | "import tensorflow as tf" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "from collections import Counter" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "class CrossLayer(tf.keras.layers.Layer):\n", 39 | " def __init__(self,output_dim,num_layer,**kwargs):\n", 40 | " self.output_dim = output_dim\n", 41 | " self.num_layer = num_layer\n", 42 | " super(CrossLayer,self).__init__(**kwargs)\n", 43 | " \n", 44 | " def build(self,input_shape):\n", 45 | " self.input_dim = input_shape[1]\n", 46 | " # print(self.input_dim)\n", 47 | " self.W = []\n", 48 | " self.bias = []\n", 49 | " for i in range(self.num_layer):\n", 50 | " self.W.append(self.add_weight(shape=[self.input_dim,1],initializer = 'glorot_uniform',name='w_{}'.format(i),trainable=True))\n", 51 | " self.bias.append(self.add_weight(shape=[self.input_dim,1],initializer = 'zeros',name='b_{}'.format(i),trainable=True))\n", 52 | " self.built = True\n", 53 | " def call(self,input):\n", 54 | "\n", 55 | " x0 = tf.einsum('bij->bji',input) # output[j][i] = m[i][j]\n", 56 | " # print(\"x0_shape\",x0.get_shape())\n", 57 | " x1 = tf.einsum('bmn,bnk->bmk',input,x0)\n", 58 | " cross = tf.einsum('bmn,nk->bmk',x1,self.W[0]) + self.bias[0] + input\n", 59 | " \n", 60 | " for i in range(1,self.num_layer):\n", 61 | " x0 = tf.einsum('bij->bji',cross) # output[j][i] = m[i][j]\n", 62 | " x1 = tf.einsum('bmn,bnk->bmk',input,x0)\n", 63 | " cross = tf.einsum('bmn,nk->bmk',x1,self.W[i]) + self.bias[i] + cross\n", 64 | " return cross\n", 65 | " \n", 66 | "class Deep(tf.keras.layers.Layer):\n", 67 | " def __init__(self,dropout_deep,deep_layer_sizes):\n", 68 | " # input_dim = num_size + embed_size = input_size\n", 69 | " super(Deep, self).__init__()\n", 70 | " self.dropout_deep = dropout_deep\n", 71 | " # fc layer\n", 72 | " self.deep_layer_sizes = deep_layer_sizes\n", 73 | " # 神经网络方面的参数\n", 74 | " for i in range(len(deep_layer_sizes)):\n", 75 | " setattr(self, 'dense_' + str(i),tf.keras.layers.Dense(deep_layer_sizes[i]))\n", 76 | " setattr(self, 'batchNorm_' + str(i),tf.keras.layers.BatchNormalization())\n", 77 | " setattr(self, 'activation_' + str(i),tf.keras.layers.Activation('relu'))\n", 78 | " setattr(self, 'dropout_' + str(i),tf.keras.layers.Dropout(dropout_deep[i]))\n", 79 | " # last layer\n", 80 | " self.fc = tf.keras.layers.Dense(128,activation=None,use_bias=True)\n", 81 | " \n", 82 | " def call(self,input):\n", 83 | " y_deep = getattr(self,'dense_' + str(0))(input)\n", 84 | " y_deep = getattr(self,'batchNorm_' + str(0))(y_deep)\n", 85 | " y_deep = getattr(self,'activation_' + str(0))(y_deep)\n", 86 | " y_deep = getattr(self,'dropout_' + str(0))(y_deep)\n", 87 | " \n", 88 | " for i in range(1,len(self.deep_layer_sizes)):\n", 89 | " y_deep = getattr(self,'dense_' + str(i))(y_deep)\n", 90 | " y_deep = getattr(self,'batchNorm_' + str(i))(y_deep)\n", 91 | " y_deep = getattr(self,'activation_' + str(i))(y_deep)\n", 92 | " y_deep = getattr(self,'dropout_' + str(i))(y_deep)\n", 93 | " \n", 94 | " output = self.fc(y_deep)\n", 95 | " return output\n", 96 | " \n", 97 | "class DCN(tf.keras.Model):\n", 98 | " def 
__init__(self,num_feat,num_field,dropout_deep,deep_layer_sizes,embedding_size=10):\n", 99 | " super().__init__()\n", 100 | " self.num_feat = num_feat # F =features nums\n", 101 | " self.num_field = num_field # N =fields of a feature \n", 102 | " self.dropout_deep = dropout_deep\n", 103 | " \n", 104 | " # Embedding 这里采用embeddings层因此大小为F* M F为特征数量,M为embedding的维度\n", 105 | " feat_embeddings = tf.keras.layers.Embedding(num_feat, embedding_size, embeddings_initializer='uniform') # F * M \n", 106 | " self.feat_embeddings = feat_embeddings\n", 107 | " \n", 108 | " self.crosslayer = CrossLayer(output_dim = 128,num_layer=8)\n", 109 | " \n", 110 | " self.deep = Deep(dropout_deep,deep_layer_sizes)\n", 111 | " self.fc = tf.keras.layers.Dense(1,activation='sigmoid',use_bias=True)\n", 112 | " \n", 113 | " def call(self,feat_index,feat_value):\n", 114 | " \n", 115 | " # embedding part feat_index = inputs为输入 feat_embeddings为一个layer。\n", 116 | " feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M \n", 117 | "# print(feat_value.get_shape())\n", 118 | " feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value)\n", 119 | " # print(\"feat_embedding:\",feat_embedding.get_shape()) # 32 * 39 * 10\n", 120 | " stack_input = tf.keras.layers.Reshape((1,-1))(feat_embedding)\n", 121 | " # print(\"stack_input:\",stack_input.get_shape()) # 32 * 1 * 390\n", 122 | " \n", 123 | " x1 = self.crosslayer(stack_input)\n", 124 | " x2 = self.deep(stack_input)\n", 125 | " \n", 126 | " x3 = tf.keras.layers.concatenate([x1,x2],axis=-1)\n", 127 | " output = self.fc(x3)\n", 128 | " return output" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 4, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "train = pd.read_table('../data/Criteo/train.txt')\n", 138 | "train.columns=['label','I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9',\n", 139 | " 'I10', 'I11', 'I12', 'I13','C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7',\n", 140 | " 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17',\n", 141 | " 'C18', 'C19', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26']" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 5, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "cont_features=['I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9',\n", 151 | " 'I10', 'I11', 'I12', 'I13']\n", 152 | "dist_features = ['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7',\n", 153 | " 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17',\n", 154 | " 'C18', 'C19', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26']" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 6, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "freq_ = 10\n", 164 | "# dir_feat_dict_ = 'feat_dict_' + str(freq_) + '.pkl2'\n", 165 | "continuous_range_ = range(1, 14)\n", 166 | "categorical_range_ = range(14, 40)\n", 167 | "\n", 168 | "# 统计离散特征每个离散值出现的次数组成字典\n", 169 | "feat_cnt = Counter()\n", 170 | "with open('../data/Criteo/train.txt', 'r') as fin:\n", 171 | " for line_idx, line in enumerate(fin):\n", 172 | " features = line.rstrip('\\n').split('\\t')\n", 173 | " for idx in categorical_range_:\n", 174 | " if features[idx] == '': continue\n", 175 | " feat_cnt.update([features[idx]])\n", 176 | "# Only retain discrete features with high frequency\n", 177 | "dis_feat_set = set() # 高频段的离散字符\n", 178 | "for feat, ot in feat_cnt.items():\n", 179 | " if ot >= freq_:\n", 180 | " dis_feat_set.add(feat)" 181 | ] 182 | }, 183 | { 184 | 
"cell_type": "code", 185 | "execution_count": 7, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "# Create a dictionary for continuous and discrete features\n", 190 | "feat_dict = {}\n", 191 | "tc = 1\n", 192 | "# Continuous features\n", 193 | "for idx in continuous_range_:\n", 194 | " feat_dict[idx] = tc\n", 195 | " tc += 1 # 代表占据一列\n", 196 | "\n", 197 | "# Discrete features\n", 198 | "cnt_feat_set = set()\n", 199 | "with open('../data/Criteo/train.txt', 'r') as fin:\n", 200 | " for line_idx, line in enumerate(fin):\n", 201 | " features = line.rstrip('\\n').split('\\t')\n", 202 | " for idx in categorical_range_:\n", 203 | " # 排除空字符和低频离散字符\n", 204 | " if features[idx] == '' or features[idx] not in dis_feat_set:\n", 205 | " continue\n", 206 | " # 排除连续性数值\n", 207 | " if features[idx] not in cnt_feat_set:\n", 208 | " cnt_feat_set.add(features[idx])\n", 209 | " # 获取种类数\n", 210 | " feat_dict[features[idx]] = tc\n", 211 | " tc += 1" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 8, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "train_label = []\n", 221 | "train_value = []\n", 222 | "train_idx = []\n", 223 | "\n", 224 | "continuous_range_ = range(1, 14)\n", 225 | "categorical_range_ = range(14, 40)\n", 226 | "cont_max_=[]\n", 227 | "cont_min_=[]\n", 228 | "for cf in cont_features:\n", 229 | " cont_max_.append(max(train[cf]))\n", 230 | " cont_min_.append(min(train[cf]))\n", 231 | "cont_diff_ = [cont_max_[i] - cont_min_[i] for i in range(len(cont_min_))]\n", 232 | "\n", 233 | "def process_line_(line):\n", 234 | " features = line.rstrip('\\n').split('\\t')\n", 235 | " feat_idx, feat_value, label = [], [], []\n", 236 | "\n", 237 | " # MinMax Normalization\n", 238 | " for idx in continuous_range_:\n", 239 | " if features[idx] == '':\n", 240 | " feat_idx.append(0)\n", 241 | " feat_value.append(0.0)\n", 242 | " else:\n", 243 | " feat_idx.append(feat_dict[idx])\n", 244 | " # 归一化\n", 245 | " feat_value.append(round((float(features[idx]) - cont_min_[idx - 1]) / cont_diff_[idx - 1], 6))\n", 246 | "\n", 247 | " # 处理离散型数据\n", 248 | " for idx in categorical_range_:\n", 249 | " if features[idx] == '' or features[idx] not in feat_dict:\n", 250 | " feat_idx.append(0)\n", 251 | " feat_value.append(0.0)\n", 252 | " else:\n", 253 | " feat_idx.append(feat_dict[features[idx]])\n", 254 | " feat_value.append(1.0)\n", 255 | " return feat_idx, feat_value, [int(features[0])]\n", 256 | "\n", 257 | "with open('../data/Criteo/train.txt', 'r') as fin:\n", 258 | " for line_idx, line in enumerate(fin):\n", 259 | "\n", 260 | " feat_idx, feat_value, label = process_line_(line)\n", 261 | " train_label.append(label)\n", 262 | " train_idx.append(feat_idx)\n", 263 | " train_value.append(feat_value)" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 9, 269 | "metadata": {}, 270 | "outputs": [], 271 | "source": [ 272 | "dcn= DCN(num_feat=len(feat_dict) + 1, num_field=39, dropout_deep=[0.5, 0.5, 0.5],\n", 273 | " deep_layer_sizes=[400, 400])" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 10, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "train_ds = tf.data.Dataset.from_tensor_slices(\n", 283 | " (train_label,train_idx,train_value)).shuffle(10000).batch(32)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 11, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "@tf.function\n", 293 | "def train_one_step(model, optimizer, idx, value, label):\n", 294 
| " with tf.GradientTape() as tape:\n", 295 | " output = model(idx,value)\n", 296 | " loss = loss_object(y_true=label, y_pred=output)\n", 297 | " grads = tape.gradient(loss, model.trainable_variables)\n", 298 | " grads = [tf.clip_by_norm(g, 100) for g in grads]\n", 299 | " optimizer.apply_gradients(grads_and_vars=zip(grads, model.trainable_variables))\n", 300 | " \n", 301 | " train_loss(loss)\n", 302 | " train_accuracy(label,output)" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 12, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "train_loss = tf.keras.metrics.Mean(name='train_loss')\n", 312 | "train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_acc')\n", 313 | "\n", 314 | "loss_object = tf.keras.losses.BinaryCrossentropy()\n", 315 | "\n", 316 | "optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 13, 322 | "metadata": { 323 | "scrolled": true 324 | }, 325 | "outputs": [ 326 | { 327 | "name": "stdout", 328 | "output_type": "stream", 329 | "text": [ 330 | "WARNING:tensorflow:Entity > could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: Failed to parse source code of >, which Python reported as:\n", 331 | " def call(self,feat_index,feat_value):\n", 332 | " \n", 333 | " # embedding part feat_index = inputs为输入 feat_embeddings为一个layer。\n", 334 | " feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M \n", 335 | "# print(feat_value.get_shape())\n", 336 | " feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value)\n", 337 | " # print(\"feat_embedding:\",feat_embedding.get_shape()) # 32 * 39 * 10\n", 338 | " stack_input = tf.keras.layers.Reshape((1,-1))(feat_embedding)\n", 339 | " # print(\"stack_input:\",stack_input.get_shape()) # 32 * 1 * 390\n", 340 | " \n", 341 | " x1 = self.crosslayer(stack_input)\n", 342 | " x2 = self.deep(stack_input)\n", 343 | " \n", 344 | " x3 = tf.keras.layers.concatenate([x1,x2],axis=-1)\n", 345 | " output = self.fc(x3)\n", 346 | " return output\n", 347 | "\n", 348 | "This may be caused by multiline strings or comments not indented at the same level as the code.\n", 349 | "WARNING: Entity > could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. 
Cause: Failed to parse source code of >, which Python reported as:\n", 426 | " def call(self,feat_index,feat_value):\n", 427 | " \n", 428 | " # embedding part feat_index = inputs为输入 feat_embeddings为一个layer。\n", 429 | " feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M \n", 430 | "# print(feat_value.get_shape())\n", 431 | " feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value)\n", 432 | " # print(\"feat_embedding:\",feat_embedding.get_shape()) # 32 * 39 * 10\n", 433 | " stack_input = tf.keras.layers.Reshape((1,-1))(feat_embedding)\n", 434 | " # print(\"stack_input:\",stack_input.get_shape()) # 32 * 1 * 390\n", 435 | " \n", 436 | " x1 = self.crosslayer(stack_input)\n", 437 | " x2 = self.deep(stack_input)\n", 438 | " \n", 439 | " x3 = tf.keras.layers.concatenate([x1,x2],axis=-1)\n", 440 | " output = self.fc(x3)\n", 441 | " return output\n", 442 | "\n", 443 | "This may be caused by multiline strings or comments not indented at the same level as the code.\n", 444 | "Epoch 1, Loss: 0.565358579158783, Accuracy: 0.790395200252533\n", 445 | "Epoch 2, Loss: 0.5333142280578613, Accuracy: 0.7906453013420105\n", 446 | "Epoch 3, Loss: 0.5188921093940735, Accuracy: 0.7907286882400513\n", 447 | "Epoch 4, Loss: 0.5085805654525757, Accuracy: 0.790770411491394\n", 448 | "Epoch 5, Loss: 0.5001382231712341, Accuracy: 0.7907953858375549\n", 449 | "Epoch 6, Loss: 0.49196508526802063, Accuracy: 0.790812075138092\n", 450 | "Epoch 7, Loss: 0.4845847487449646, Accuracy: 0.791538655757904\n", 451 | "Epoch 8, Loss: 0.4777772128582001, Accuracy: 0.7933967113494873\n", 452 | "Epoch 9, Loss: 0.4712851643562317, Accuracy: 0.7953976988792419\n" 453 | ] 454 | }, 455 | { 456 | "name": "stdout", 457 | "output_type": "stream", 458 | "text": [ 459 | "Epoch 10, Loss: 0.46522337198257446, Accuracy: 0.797548770904541\n", 460 | "Epoch 11, Loss: 0.4593830108642578, Accuracy: 0.799308717250824\n", 461 | "Epoch 12, Loss: 0.4535185396671295, Accuracy: 0.8014007210731506\n", 462 | "Epoch 13, Loss: 0.4476926326751709, Accuracy: 0.8034401535987854\n", 463 | "Epoch 14, Loss: 0.4420176148414612, Accuracy: 0.8057957291603088\n", 464 | "Epoch 15, Loss: 0.43604835867881775, Accuracy: 0.8078039288520813\n", 465 | "Epoch 16, Loss: 0.430029958486557, Accuracy: 0.8101238012313843\n", 466 | "Epoch 17, Loss: 0.4236184060573578, Accuracy: 0.8130241632461548\n", 467 | "Epoch 18, Loss: 0.41711094975471497, Accuracy: 0.8157690167427063\n", 468 | "Epoch 19, Loss: 0.410213828086853, Accuracy: 0.8188567757606506\n", 469 | "Epoch 20, Loss: 0.40275657176971436, Accuracy: 0.8226613402366638\n", 470 | "Epoch 21, Loss: 0.3947707712650299, Accuracy: 0.8265085220336914\n", 471 | "Epoch 22, Loss: 0.3864079415798187, Accuracy: 0.8308699727058411\n", 472 | "Epoch 23, Loss: 0.37755030393600464, Accuracy: 0.8352872133255005\n", 473 | "Epoch 24, Loss: 0.3682657480239868, Accuracy: 0.8399407863616943\n", 474 | "Epoch 25, Loss: 0.3589519262313843, Accuracy: 0.8447423577308655\n", 475 | "Epoch 26, Loss: 0.3493313491344452, Accuracy: 0.8495401740074158\n", 476 | "Epoch 27, Loss: 0.33972665667533875, Accuracy: 0.8542419075965881\n", 477 | "Epoch 28, Loss: 0.33029282093048096, Accuracy: 0.8588579893112183\n", 478 | "Epoch 29, Loss: 0.3210965692996979, Accuracy: 0.8632591962814331\n", 479 | "Epoch 30, Loss: 0.3121466338634491, Accuracy: 0.8674670457839966\n", 480 | "Epoch 31, Loss: 0.3034890294075012, Accuracy: 0.8714196085929871\n", 481 | "Epoch 32, Loss: 0.2950327396392822, Accuracy: 0.8753126859664917\n", 482 | "Epoch 33, Loss: 
0.2869029939174652, Accuracy: 0.8790152668952942\n", 483 | "Epoch 34, Loss: 0.27917614579200745, Accuracy: 0.8824853897094727\n", 484 | "Epoch 35, Loss: 0.27175894379615784, Accuracy: 0.8858000636100769\n", 485 | "Epoch 36, Loss: 0.2646080255508423, Accuracy: 0.8889583945274353\n", 486 | "Epoch 37, Loss: 0.2577793300151825, Accuracy: 0.8919594883918762\n", 487 | "Epoch 38, Loss: 0.2512573003768921, Accuracy: 0.8948026895523071\n", 488 | "Epoch 39, Loss: 0.24505917727947235, Accuracy: 0.897487223148346\n", 489 | "Epoch 40, Loss: 0.23911045491695404, Accuracy: 0.9000500440597534\n", 490 | "Epoch 41, Loss: 0.23342998325824738, Accuracy: 0.9024878144264221\n", 491 | "Epoch 42, Loss: 0.22800736129283905, Accuracy: 0.9048095345497131\n", 492 | "Epoch 43, Loss: 0.22281348705291748, Accuracy: 0.9070232510566711\n", 493 | "Epoch 44, Loss: 0.21784667670726776, Accuracy: 0.9091364145278931\n", 494 | "Epoch 45, Loss: 0.2130877673625946, Accuracy: 0.9111555814743042\n", 495 | "Epoch 46, Loss: 0.20853079855442047, Accuracy: 0.9130869507789612\n", 496 | "Epoch 47, Loss: 0.2041596919298172, Accuracy: 0.9149361848831177\n", 497 | "Epoch 48, Loss: 0.19996346533298492, Accuracy: 0.9167083501815796\n", 498 | "Epoch 49, Loss: 0.19593490660190582, Accuracy: 0.9184081554412842\n", 499 | "Epoch 50, Loss: 0.19206039607524872, Accuracy: 0.9200400114059448\n" 500 | ] 501 | } 502 | ], 503 | "source": [ 504 | "EPOCHS = 50\n", 505 | "for epoch in range(EPOCHS):\n", 506 | " for label, idx, value in train_ds:\n", 507 | " train_one_step(dcn,optimizer,idx, value,label)\n", 508 | " template = 'Epoch {}, Loss: {}, Accuracy: {}'\n", 509 | " print (template.format(epoch+1,\n", 510 | " train_loss.result(),train_accuracy.result()))" 511 | ] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "execution_count": null, 516 | "metadata": {}, 517 | "outputs": [], 518 | "source": [] 519 | } 520 | ], 521 | "metadata": { 522 | "kernelspec": { 523 | "display_name": "Python 3", 524 | "language": "python", 525 | "name": "python3" 526 | }, 527 | "language_info": { 528 | "codemirror_mode": { 529 | "name": "ipython", 530 | "version": 3 531 | }, 532 | "file_extension": ".py", 533 | "mimetype": "text/x-python", 534 | "name": "python", 535 | "nbconvert_exporter": "python", 536 | "pygments_lexer": "ipython3", 537 | "version": "3.6.5" 538 | } 539 | }, 540 | "nbformat": 4, 541 | "nbformat_minor": 2 542 | } 543 | -------------------------------------------------------------------------------- /DCN/DCN-tf2.0.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | import pandas as pd 4 | import tensorflow as tf 5 | from collections import Counter 6 | import pickle 7 | from util.train_model import train_test_model_demo 8 | 9 | 10 | class CrossLayer(tf.keras.layers.Layer): 11 | def __init__(self,output_dim,num_layer,**kwargs): 12 | self.output_dim = output_dim 13 | self.num_layer = num_layer 14 | super(CrossLayer,self).__init__(**kwargs) 15 | 16 | def build(self,input_shape): 17 | self.input_dim = input_shape[2] 18 | # print(self.input_dim) 19 | self.W = [] 20 | self.bias = [] 21 | for i in range(self.num_layer): 22 | self.W.append(self.add_weight(shape=[1,self.input_dim],initializer = 'glorot_uniform',name='w_{}'.format(i),trainable=True)) 23 | self.bias.append(self.add_weight(shape=[1,self.input_dim],initializer = 'zeros',name='b_{}'.format(i),trainable=True)) 24 | self.built = True 25 | 26 | def call(self,input): 27 | # 按照论文的公式 28 | # x0 = tf.einsum('bij->bji',input) # 
output[j][i] = m[i][j] 29 | # print("x0_shape",x0.get_shape())# (9, 390, 1) 30 | # x1 = tf.einsum('bmn,bkm->bnk', input, x0) 31 | # print("x1_shape", x1.get_shape()) # (9, 390, 390) 32 | # print("self.W[0]_shape", self.W[0].get_shape()) 33 | # cross = tf.einsum('bmn,kn->bkm',x1,self.W[0]) + self.bias[0] + input 34 | # print("cross0", cross.get_shape())# (9, 1, 390) 35 | # for i in range(1,self.num_layer): 36 | # x0 = tf.einsum('bij->bji',cross) # output[j][i] = m[i][j] 37 | # x1 = tf.einsum('bmn,bkm->bnk',input,x0) 38 | # cross = tf.einsum('bmn,kn->bkm',x1,self.W[i]) + self.bias[i] + cross 39 | 40 | # 优化论文公式 改变结合律 41 | x0 = tf.einsum('bij->bji',input) # output[j][i] = m[i][j] 42 | x1 = tf.einsum('bmn,km->bnk', x0, self.W[0]) 43 | cross = tf.einsum('bkm,bnk->bnm',input,x1) + self.bias[0] + input 44 | for i in range(1,self.num_layer): 45 | x0 = tf.einsum('bij->bji',cross) # output[j][i] = m[i][j] 46 | x1 = tf.einsum('bmn,km->bnk', x0, self.W[i]) 47 | cross = tf.einsum('bkm,bnk->bnm', cross,x1) + self.bias[i] + cross 48 | return cross 49 | 50 | class Deep(tf.keras.layers.Layer): 51 | def __init__(self,dropout_deep,deep_layer_sizes): 52 | # input_dim = num_size + embed_size = input_size 53 | super(Deep, self).__init__() 54 | self.dropout_deep = dropout_deep 55 | # fc layer 56 | self.deep_layer_sizes = deep_layer_sizes 57 | # 神经网络方面的参数 58 | for i in range(len(deep_layer_sizes)): 59 | setattr(self, 'dense_' + str(i),tf.keras.layers.Dense(deep_layer_sizes[i])) 60 | setattr(self, 'batchNorm_' + str(i),tf.keras.layers.BatchNormalization()) 61 | setattr(self, 'activation_' + str(i),tf.keras.layers.Activation('relu')) 62 | setattr(self, 'dropout_' + str(i),tf.keras.layers.Dropout(dropout_deep[i])) 63 | # last layer 64 | self.fc = tf.keras.layers.Dense(128,activation=None,use_bias=True) 65 | 66 | def call(self,input): 67 | y_deep = getattr(self,'dense_' + str(0))(input) 68 | y_deep = getattr(self,'batchNorm_' + str(0))(y_deep) 69 | y_deep = getattr(self,'activation_' + str(0))(y_deep) 70 | y_deep = getattr(self,'dropout_' + str(0))(y_deep) 71 | 72 | for i in range(1,len(self.deep_layer_sizes)): 73 | y_deep = getattr(self,'dense_' + str(i))(y_deep) 74 | y_deep = getattr(self,'batchNorm_' + str(i))(y_deep) 75 | y_deep = getattr(self,'activation_' + str(i))(y_deep) 76 | y_deep = getattr(self,'dropout_' + str(i))(y_deep) 77 | 78 | output = self.fc(y_deep) 79 | return output 80 | 81 | class DCN(tf.keras.Model): 82 | def __init__(self,num_feat,num_field,dropout_deep,deep_layer_sizes,embedding_size=10): 83 | super().__init__() 84 | self.num_feat = num_feat # F =features nums 85 | self.num_field = num_field # N =fields of a feature 86 | self.dropout_deep = dropout_deep 87 | 88 | # Embedding 这里采用embeddings层因此大小为F* M F为特征数量,M为embedding的维度 89 | feat_embeddings = tf.keras.layers.Embedding(num_feat, embedding_size, embeddings_initializer='uniform') # F * M 90 | self.feat_embeddings = feat_embeddings 91 | 92 | self.crosslayer = CrossLayer(output_dim = 128,num_layer=8) 93 | 94 | self.deep = Deep(dropout_deep,deep_layer_sizes) 95 | self.fc = tf.keras.layers.Dense(1,activation='sigmoid',use_bias=True) 96 | 97 | def call(self,feat_index,feat_value): 98 | 99 | # embedding part feat_index = inputs为输入 feat_embeddings为一个layer。 100 | feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M 101 | # print(feat_value.get_shape()) 102 | feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value) 103 | # print("feat_embedding:",feat_embedding.get_shape()) # 32 * 39 * 10 104 | stack_input = 
tf.keras.layers.Reshape((1,-1))(feat_embedding) 105 | # print("stack_input:",stack_input.get_shape()) # 32 * 1 * 390 106 | 107 | x1 = self.crosslayer(stack_input) 108 | x2 = self.deep(stack_input) 109 | 110 | x3 = tf.keras.layers.concatenate([x1,x2],axis=-1) 111 | output = self.fc(x3) 112 | return output 113 | 114 | if __name__ == '__main__': 115 | AID_DATA_DIR = "../data/Criteo/" 116 | feat_dict_ = pickle.load(open(AID_DATA_DIR + '/feat_dict_10.pkl2', 'rb')) 117 | 118 | dcn = DCN(num_feat=len(feat_dict_) + 1, num_field=39, dropout_deep=[0.5, 0.5, 0.5], 119 | deep_layer_sizes=[400, 400]) 120 | 121 | train_label_path = AID_DATA_DIR + 'train_label' 122 | train_idx_path = AID_DATA_DIR + 'train_idx' 123 | train_value_path = AID_DATA_DIR + 'train_value' 124 | 125 | train_test_model_demo(dcn,train_label_path, train_idx_path, train_value_path) -------------------------------------------------------------------------------- /GBDT_LR.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "## GBDT+LR代码分析" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# Scikit-learn实现" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "import lightgbm as lgb\n", 28 | "import numpy as np\n", 29 | "import pandas as pd\n", 30 | "from sklearn.metrics import mean_squared_error\n", 31 | "from sklearn.linear_model import LogisticRegression" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 10, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "from sklearn.preprocessing import OneHotEncoder\n", 41 | "from sklearn.ensemble import GradientBoostingClassifier" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 2, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "df_train = pd.read_csv(r'F:\\Data\\recsys-data\\gbdt+lr/train.csv')\n", 51 | "df_test = pd.read_csv(r'F:\\Data\\recsys-data\\gbdt+lr/test.csv')" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 3, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "NUMERIC_COLS = [\n", 61 | " \"ps_reg_01\", \"ps_reg_02\", \"ps_reg_03\",\n", 62 | " \"ps_car_12\", \"ps_car_13\", \"ps_car_14\", \"ps_car_15\",\n", 63 | "]" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 11, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "gbdt = GradientBoostingClassifier(n_estimators=50,random_state=10,subsample = 0.6,max_depth=7,min_samples_split=900)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 4, 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/html": [ 83 | "
\n", 84 | "\n", 97 | "\n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | "
Unnamed: 0idtargetps_ind_01ps_ind_02_catps_ind_03ps_ind_04_catps_ind_05_catps_ind_06_binps_ind_07_bin...ps_calc_11ps_calc_12ps_calc_13ps_calc_14ps_calc_15_binps_calc_16_binps_calc_17_binps_calc_18_binps_calc_19_binps_calc_20_bin
080002022717151001...4265001110
180012022810161010...52410000001
280022022903180000...10135001110
380032023502180000...2229000110
480042023600121000...3255001010
\n", 247 | "

5 rows × 60 columns

\n", 248 | "
" 249 | ], 250 | "text/plain": [ 251 | " Unnamed: 0 id target ps_ind_01 ps_ind_02_cat ps_ind_03 \\\n", 252 | "0 8000 20227 1 7 1 5 \n", 253 | "1 8001 20228 1 0 1 6 \n", 254 | "2 8002 20229 0 3 1 8 \n", 255 | "3 8003 20235 0 2 1 8 \n", 256 | "4 8004 20236 0 0 1 2 \n", 257 | "\n", 258 | " ps_ind_04_cat ps_ind_05_cat ps_ind_06_bin ps_ind_07_bin ... \\\n", 259 | "0 1 0 0 1 ... \n", 260 | "1 1 0 1 0 ... \n", 261 | "2 0 0 0 0 ... \n", 262 | "3 0 0 0 0 ... \n", 263 | "4 1 0 0 0 ... \n", 264 | "\n", 265 | " ps_calc_11 ps_calc_12 ps_calc_13 ps_calc_14 ps_calc_15_bin \\\n", 266 | "0 4 2 6 5 0 \n", 267 | "1 5 2 4 10 0 \n", 268 | "2 10 1 3 5 0 \n", 269 | "3 2 2 2 9 0 \n", 270 | "4 3 2 5 5 0 \n", 271 | "\n", 272 | " ps_calc_16_bin ps_calc_17_bin ps_calc_18_bin ps_calc_19_bin \\\n", 273 | "0 0 1 1 1 \n", 274 | "1 0 0 0 0 \n", 275 | "2 0 1 1 1 \n", 276 | "3 0 0 1 1 \n", 277 | "4 0 1 0 1 \n", 278 | "\n", 279 | " ps_calc_20_bin \n", 280 | "0 0 \n", 281 | "1 1 \n", 282 | "2 0 \n", 283 | "3 0 \n", 284 | "4 0 \n", 285 | "\n", 286 | "[5 rows x 60 columns]" 287 | ] 288 | }, 289 | "execution_count": 4, 290 | "metadata": {}, 291 | "output_type": "execute_result" 292 | } 293 | ], 294 | "source": [ 295 | "df_test.head()" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 100, 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "y_train = df_train['target']\n", 305 | "y_test = df_test['target']\n", 306 | "X_train = df_train[NUMERIC_COLS]\n", 307 | "X_test = df_test[NUMERIC_COLS]" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 6, 313 | "metadata": {}, 314 | "outputs": [ 315 | { 316 | "data": { 317 | "text/plain": [ 318 | "(8001, 7)" 319 | ] 320 | }, 321 | "execution_count": 6, 322 | "metadata": {}, 323 | "output_type": "execute_result" 324 | } 325 | ], 326 | "source": [ 327 | "X_train.shape" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": 23, 333 | "metadata": {}, 334 | "outputs": [], 335 | "source": [ 336 | "lgb_train = lgb.Dataset(X_train,y_train)\n", 337 | "lgb_eval = lgb.Dataset(X_test,y_test,reference=lgb_train)" 338 | ] 339 | }, 340 | { 341 | "cell_type": "markdown", 342 | "metadata": {}, 343 | "source": [ 344 | "## 设置子树为100颗,每颗树包含64支叶子的树模型。那么形成的中间特征向量为100*64" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": 24, 350 | "metadata": {}, 351 | "outputs": [], 352 | "source": [ 353 | "params = {\n", 354 | " 'task': 'train',\n", 355 | " 'boosting_type': 'gbdt',\n", 356 | " 'objective': 'binary',\n", 357 | " 'metric': {'binary_logloss'},\n", 358 | " 'num_leaves': 64,\n", 359 | " 'num_trees': 100,\n", 360 | " 'learning_rate': 0.01,\n", 361 | " 'feature_fraction': 0.9,\n", 362 | " 'bagging_fraction': 0.8,\n", 363 | " 'bagging_freq': 5,\n", 364 | " 'verbose': 0\n", 365 | "}" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 25, 371 | "metadata": {}, 372 | "outputs": [], 373 | "source": [ 374 | "# 叶子节点数,用来进行特征转换使用\n", 375 | "num_leaf = 64" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": 26, 381 | "metadata": { 382 | "scrolled": true 383 | }, 384 | "outputs": [ 385 | { 386 | "name": "stderr", 387 | "output_type": "stream", 388 | "text": [ 389 | "D:\\anaconda3\\julianxu\\lib\\site-packages\\lightgbm\\engine.py:148: UserWarning: Found `num_trees` in params. Will use it instead of argument\n", 390 | " warnings.warn(\"Found `{}` in params. 
Will use it instead of argument\".format(alias))\n" 391 | ] 392 | }, 393 | { 394 | "name": "stdout", 395 | "output_type": "stream", 396 | "text": [ 397 | "[1]\ttraining's binary_logloss: 0.155602\n", 398 | "[2]\ttraining's binary_logloss: 0.155022\n", 399 | "[3]\ttraining's binary_logloss: 0.15441\n", 400 | "[4]\ttraining's binary_logloss: 0.153819\n", 401 | "[5]\ttraining's binary_logloss: 0.153267\n", 402 | "[6]\ttraining's binary_logloss: 0.152685\n", 403 | "[7]\ttraining's binary_logloss: 0.152144\n", 404 | "[8]\ttraining's binary_logloss: 0.151545\n", 405 | "[9]\ttraining's binary_logloss: 0.151029\n", 406 | "[10]\ttraining's binary_logloss: 0.15049\n", 407 | "[11]\ttraining's binary_logloss: 0.150069\n", 408 | "[12]\ttraining's binary_logloss: 0.149553\n", 409 | "[13]\ttraining's binary_logloss: 0.149064\n", 410 | "[14]\ttraining's binary_logloss: 0.148592\n", 411 | "[15]\ttraining's binary_logloss: 0.148111\n", 412 | "[16]\ttraining's binary_logloss: 0.147618\n", 413 | "[17]\ttraining's binary_logloss: 0.147086\n", 414 | "[18]\ttraining's binary_logloss: 0.146624\n", 415 | "[19]\ttraining's binary_logloss: 0.146184\n", 416 | "[20]\ttraining's binary_logloss: 0.145696\n", 417 | "[21]\ttraining's binary_logloss: 0.145182\n", 418 | "[22]\ttraining's binary_logloss: 0.144704\n", 419 | "[23]\ttraining's binary_logloss: 0.144244\n", 420 | "[24]\ttraining's binary_logloss: 0.143804\n", 421 | "[25]\ttraining's binary_logloss: 0.14335\n", 422 | "[26]\ttraining's binary_logloss: 0.142893\n", 423 | "[27]\ttraining's binary_logloss: 0.142461\n", 424 | "[28]\ttraining's binary_logloss: 0.141992\n", 425 | "[29]\ttraining's binary_logloss: 0.14154\n", 426 | "[30]\ttraining's binary_logloss: 0.141097\n", 427 | "[31]\ttraining's binary_logloss: 0.14065\n", 428 | "[32]\ttraining's binary_logloss: 0.14021\n", 429 | "[33]\ttraining's binary_logloss: 0.139826\n", 430 | "[34]\ttraining's binary_logloss: 0.139455\n", 431 | "[35]\ttraining's binary_logloss: 0.139101\n", 432 | "[36]\ttraining's binary_logloss: 0.138699\n", 433 | "[37]\ttraining's binary_logloss: 0.138313\n", 434 | "[38]\ttraining's binary_logloss: 0.137922\n", 435 | "[39]\ttraining's binary_logloss: 0.13748\n", 436 | "[40]\ttraining's binary_logloss: 0.13711\n", 437 | "[41]\ttraining's binary_logloss: 0.136669\n", 438 | "[42]\ttraining's binary_logloss: 0.136245\n", 439 | "[43]\ttraining's binary_logloss: 0.135825\n", 440 | "[44]\ttraining's binary_logloss: 0.135446\n", 441 | "[45]\ttraining's binary_logloss: 0.135044\n", 442 | "[46]\ttraining's binary_logloss: 0.134611\n", 443 | "[47]\ttraining's binary_logloss: 0.134199\n", 444 | "[48]\ttraining's binary_logloss: 0.133789\n", 445 | "[49]\ttraining's binary_logloss: 0.133391\n", 446 | "[50]\ttraining's binary_logloss: 0.133004\n", 447 | "[51]\ttraining's binary_logloss: 0.132586\n", 448 | "[52]\ttraining's binary_logloss: 0.132205\n", 449 | "[53]\ttraining's binary_logloss: 0.131787\n", 450 | "[54]\ttraining's binary_logloss: 0.131378\n", 451 | "[55]\ttraining's binary_logloss: 0.131014\n", 452 | "[56]\ttraining's binary_logloss: 0.130628\n", 453 | "[57]\ttraining's binary_logloss: 0.130253\n", 454 | "[58]\ttraining's binary_logloss: 0.129902\n", 455 | "[59]\ttraining's binary_logloss: 0.12956\n", 456 | "[60]\ttraining's binary_logloss: 0.129185\n", 457 | "[61]\ttraining's binary_logloss: 0.128838\n", 458 | "[62]\ttraining's binary_logloss: 0.128492\n", 459 | "[63]\ttraining's binary_logloss: 0.128169\n", 460 | "[64]\ttraining's binary_logloss: 0.127838\n", 461 | "[65]\ttraining's 
binary_logloss: 0.12748\n", 462 | "[66]\ttraining's binary_logloss: 0.127149\n", 463 | "[67]\ttraining's binary_logloss: 0.126845\n", 464 | "[68]\ttraining's binary_logloss: 0.126493\n", 465 | "[69]\ttraining's binary_logloss: 0.126139\n", 466 | "[70]\ttraining's binary_logloss: 0.125797\n", 467 | "[71]\ttraining's binary_logloss: 0.125492\n", 468 | "[72]\ttraining's binary_logloss: 0.125175\n", 469 | "[73]\ttraining's binary_logloss: 0.12489\n", 470 | "[74]\ttraining's binary_logloss: 0.124602\n", 471 | "[75]\ttraining's binary_logloss: 0.124281\n", 472 | "[76]\ttraining's binary_logloss: 0.123981\n", 473 | "[77]\ttraining's binary_logloss: 0.123696\n", 474 | "[78]\ttraining's binary_logloss: 0.123414\n", 475 | "[79]\ttraining's binary_logloss: 0.123113\n", 476 | "[80]\ttraining's binary_logloss: 0.122799\n", 477 | "[81]\ttraining's binary_logloss: 0.122486\n", 478 | "[82]\ttraining's binary_logloss: 0.122147\n", 479 | "[83]\ttraining's binary_logloss: 0.121818\n", 480 | "[84]\ttraining's binary_logloss: 0.121483\n", 481 | "[85]\ttraining's binary_logloss: 0.12115\n", 482 | "[86]\ttraining's binary_logloss: 0.120842\n", 483 | "[87]\ttraining's binary_logloss: 0.120546\n", 484 | "[88]\ttraining's binary_logloss: 0.12025\n", 485 | "[89]\ttraining's binary_logloss: 0.119959\n", 486 | "[90]\ttraining's binary_logloss: 0.119682\n", 487 | "[91]\ttraining's binary_logloss: 0.11935\n", 488 | "[92]\ttraining's binary_logloss: 0.119037\n", 489 | "[93]\ttraining's binary_logloss: 0.118712\n", 490 | "[94]\ttraining's binary_logloss: 0.118397\n", 491 | "[95]\ttraining's binary_logloss: 0.118085\n", 492 | "[96]\ttraining's binary_logloss: 0.117773\n", 493 | "[97]\ttraining's binary_logloss: 0.117491\n", 494 | "[98]\ttraining's binary_logloss: 0.117192\n", 495 | "[99]\ttraining's binary_logloss: 0.116892\n", 496 | "[100]\ttraining's binary_logloss: 0.116629\n" 497 | ] 498 | } 499 | ], 500 | "source": [ 501 | "# train\n", 502 | "gbm = lgb.train(params,\n", 503 | " lgb_train,\n", 504 | " num_boost_round=100,\n", 505 | " valid_sets=lgb_train)" 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": 27, 511 | "metadata": {}, 512 | "outputs": [ 513 | { 514 | "name": "stdout", 515 | "output_type": "stream", 516 | "text": [ 517 | "Save model...\n" 518 | ] 519 | }, 520 | { 521 | "data": { 522 | "text/plain": [ 523 | "" 524 | ] 525 | }, 526 | "execution_count": 27, 527 | "metadata": {}, 528 | "output_type": "execute_result" 529 | } 530 | ], 531 | "source": [ 532 | "print('Save model...')\n", 533 | "# save model to file\n", 534 | "gbm.save_model(r'F:\\Data\\recsys-data\\gbdt+lr/model.txt')" 535 | ] 536 | }, 537 | { 538 | "cell_type": "code", 539 | "execution_count": 54, 540 | "metadata": {}, 541 | "outputs": [ 542 | { 543 | "name": "stdout", 544 | "output_type": "stream", 545 | "text": [ 546 | "Start predicting...\n" 547 | ] 548 | } 549 | ], 550 | "source": [ 551 | "print('Start predicting...')\n", 552 | "# predict and get data on leaves, training data\n", 553 | "y_pred = gbm.predict(X_train, pred_leaf=True)" 554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "execution_count": 56, 559 | "metadata": {}, 560 | "outputs": [ 561 | { 562 | "data": { 563 | "text/plain": [ 564 | "(8001, 7)" 565 | ] 566 | }, 567 | "execution_count": 56, 568 | "metadata": {}, 569 | "output_type": "execute_result" 570 | } 571 | ], 572 | "source": [ 573 | "X_train.shape" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": 55, 579 | "metadata": {}, 580 | "outputs": [ 581 | { 582 | "data": 
{ 583 | "text/plain": [ 584 | "array([[17, 0, 55, ..., 4, 63, 63],\n", 585 | " [62, 8, 58, ..., 47, 9, 57],\n", 586 | " [44, 0, 58, ..., 34, 62, 45],\n", 587 | " ...,\n", 588 | " [51, 19, 16, ..., 23, 33, 56],\n", 589 | " [61, 28, 58, ..., 53, 28, 18],\n", 590 | " [53, 29, 54, ..., 4, 63, 63]])" 591 | ] 592 | }, 593 | "execution_count": 55, 594 | "metadata": {}, 595 | "output_type": "execute_result" 596 | } 597 | ], 598 | "source": [ 599 | "y_pred" 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": 29, 605 | "metadata": {}, 606 | "outputs": [ 607 | { 608 | "data": { 609 | "text/plain": [ 610 | "(8001, 100)" 611 | ] 612 | }, 613 | "execution_count": 29, 614 | "metadata": {}, 615 | "output_type": "execute_result" 616 | } 617 | ], 618 | "source": [ 619 | "np.array(y_pred).shape" 620 | ] 621 | }, 622 | { 623 | "cell_type": "code", 624 | "execution_count": 32, 625 | "metadata": {}, 626 | "outputs": [ 627 | { 628 | "data": { 629 | "text/plain": [ 630 | "array([17, 0, 55, 44, 47, 8, 8, 39, 8, 8, 0, 0, 0, 0, 0, 0, 38,\n", 631 | " 36, 36, 26, 15, 13, 38, 18, 41, 54, 45, 51, 55, 59, 15, 20, 2, 2,\n", 632 | " 2, 63, 56, 26, 7, 25, 46, 58, 62, 26, 19, 48, 6, 51, 5, 45, 44,\n", 633 | " 1, 44, 14, 33, 41, 10, 39, 49, 63, 51, 63, 20, 48, 52, 47, 8, 36,\n", 634 | " 8, 8, 50, 0, 32, 21, 8, 23, 48, 48, 17, 49, 46, 10, 28, 12, 59,\n", 635 | " 22, 12, 51, 34, 32, 15, 15, 53, 29, 29, 59, 59, 4, 63, 63])" 636 | ] 637 | }, 638 | "execution_count": 32, 639 | "metadata": {}, 640 | "output_type": "execute_result" 641 | } 642 | ], 643 | "source": [ 644 | "y_pred[0]\n", 645 | "# 17,0每个数字代表每颗树的叶子节点索引" 646 | ] 647 | }, 648 | { 649 | "cell_type": "code", 650 | "execution_count": 36, 651 | "metadata": {}, 652 | "outputs": [], 653 | "source": [ 654 | "transform_training_matrix = np.zeros([len(y_pred),len(y_pred[0])*num_leaf],dtype=np.int64) # N**num_tress*num_leaf" 655 | ] 656 | }, 657 | { 658 | "cell_type": "code", 659 | "execution_count": 47, 660 | "metadata": {}, 661 | "outputs": [], 662 | "source": [ 663 | "for i in range(0,len(y_pred)):\n", 664 | " temp = np.arange(len(y_pred[0]))*num_leaf + np.array(y_pred[i]) # 以64为一个周期,然后加上相应的节点位置\n", 665 | " transform_training_matrix[i][temp] += 1 # 找出索引对应的值,然后加1" 666 | ] 667 | }, 668 | { 669 | "cell_type": "code", 670 | "execution_count": 83, 671 | "metadata": {}, 672 | "outputs": [ 673 | { 674 | "data": { 675 | "text/plain": [ 676 | "(8001, 6400)" 677 | ] 678 | }, 679 | "execution_count": 83, 680 | "metadata": {}, 681 | "output_type": "execute_result" 682 | } 683 | ], 684 | "source": [ 685 | "transform_training_matrix.shape" 686 | ] 687 | }, 688 | { 689 | "cell_type": "code", 690 | "execution_count": 95, 691 | "metadata": {}, 692 | "outputs": [], 693 | "source": [ 694 | "y_test_lgb = gbm.predict(X_test,pred_leaf=True)" 695 | ] 696 | }, 697 | { 698 | "cell_type": "code", 699 | "execution_count": 65, 700 | "metadata": {}, 701 | "outputs": [], 702 | "source": [ 703 | "# 将预测集进行onehot转换" 704 | ] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": 86, 709 | "metadata": {}, 710 | "outputs": [ 711 | { 712 | "data": { 713 | "text/plain": [ 714 | "2000" 715 | ] 716 | }, 717 | "execution_count": 86, 718 | "metadata": {}, 719 | "output_type": "execute_result" 720 | } 721 | ], 722 | "source": [ 723 | "len(y_test)" 724 | ] 725 | }, 726 | { 727 | "cell_type": "code", 728 | "execution_count": 96, 729 | "metadata": {}, 730 | "outputs": [], 731 | "source": [ 732 | "transform_test_matrix = 
np.zeros([len(y_test_lgb),len(y_test_lgb[0])*num_leaf],dtype=np.int64)" 733 | ] 734 | }, 735 | { 736 | "cell_type": "code", 737 | "execution_count": 88, 738 | "metadata": {}, 739 | "outputs": [ 740 | { 741 | "data": { 742 | "text/plain": [ 743 | "(2000, 6400)" 744 | ] 745 | }, 746 | "execution_count": 88, 747 | "metadata": {}, 748 | "output_type": "execute_result" 749 | } 750 | ], 751 | "source": [ 752 | "transform_test_matrix.shape" 753 | ] 754 | }, 755 | { 756 | "cell_type": "code", 757 | "execution_count": 97, 758 | "metadata": {}, 759 | "outputs": [], 760 | "source": [ 761 | "for i in range(len(y_test_lgb)):\n", 762 | " temp = np.arange(len(y_test[0]))*num_leaf + np.array(y_test_lgb[i])\n", 763 | " transform_test_matrix[i][temp] += 1" 764 | ] 765 | }, 766 | { 767 | "cell_type": "code", 768 | "execution_count": 98, 769 | "metadata": {}, 770 | "outputs": [], 771 | "source": [ 772 | "lm = LogisticRegression(penalty='l2',C=0.05)\n", 773 | "lm.fit(transform_training_matrix,y_train)\n", 774 | "y_pred_test = lm.predict_proba(transform_test_matrix)" 775 | ] 776 | }, 777 | { 778 | "cell_type": "code", 779 | "execution_count": 99, 780 | "metadata": {}, 781 | "outputs": [ 782 | { 783 | "data": { 784 | "text/plain": [ 785 | "(2000, 2)" 786 | ] 787 | }, 788 | "execution_count": 99, 789 | "metadata": {}, 790 | "output_type": "execute_result" 791 | } 792 | ], 793 | "source": [ 794 | "y_pred_test.shape" 795 | ] 796 | }, 797 | { 798 | "cell_type": "code", 799 | "execution_count": 101, 800 | "metadata": {}, 801 | "outputs": [ 802 | { 803 | "name": "stdout", 804 | "output_type": "stream", 805 | "text": [ 806 | "Normalized Cross Entropy 2.213280152050503\n" 807 | ] 808 | } 809 | ], 810 | "source": [ 811 | "NE = (-1) / len(y_pred_test) * sum(((1+y_test)/2 * np.log(y_pred_test[:,1]) + (1-y_test)/2 * np.log(1 - y_pred_test[:,1])))\n", 812 | "print(\"Normalized Cross Entropy \" + str(NE))" 813 | ] 814 | }, 815 | { 816 | "cell_type": "code", 817 | "execution_count": null, 818 | "metadata": {}, 819 | "outputs": [], 820 | "source": [] 821 | } 822 | ], 823 | "metadata": { 824 | "kernelspec": { 825 | "display_name": "Python 3", 826 | "language": "python", 827 | "name": "python3" 828 | }, 829 | "language_info": { 830 | "codemirror_mode": { 831 | "name": "ipython", 832 | "version": 3 833 | }, 834 | "file_extension": ".py", 835 | "mimetype": "text/x-python", 836 | "name": "python", 837 | "nbconvert_exporter": "python", 838 | "pygments_lexer": "ipython3", 839 | "version": "3.6.5" 840 | } 841 | }, 842 | "nbformat": 4, 843 | "nbformat_minor": 1 844 | } 845 | -------------------------------------------------------------------------------- /MLR.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 1.处理数据过程" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 3, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd\n", 17 | "from sklearn.preprocessing import StandardScaler" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 81, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "# get_data\n", 27 | "train_data = pd.read_table(r'F:\\Data\\recsys-data\\mlr\\adult.data',header=None,delimiter=',')" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "test_data = 
pd.read_table(r'F:\\Data\\recsys-data\\mlr\\adult.test',header=None,delimiter=',')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/html": [ 47 | "
\n", 48 | "\n", 61 | "\n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | "
01234567891011121314
039State-gov77516Bachelors13Never-marriedAdm-clericalNot-in-familyWhiteMale2174040United-States<=50K
150Self-emp-not-inc83311Bachelors13Married-civ-spouseExec-managerialHusbandWhiteMale0013United-States<=50K
238Private215646HS-grad9DivorcedHandlers-cleanersNot-in-familyWhiteMale0040United-States<=50K
353Private23472111th7Married-civ-spouseHandlers-cleanersHusbandBlackMale0040United-States<=50K
428Private338409Bachelors13Married-civ-spouseProf-specialtyWifeBlackFemale0040Cuba<=50K
\n", 175 | "
" 176 | ], 177 | "text/plain": [ 178 | " 0 1 2 3 4 5 \\\n", 179 | "0 39 State-gov 77516 Bachelors 13 Never-married \n", 180 | "1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse \n", 181 | "2 38 Private 215646 HS-grad 9 Divorced \n", 182 | "3 53 Private 234721 11th 7 Married-civ-spouse \n", 183 | "4 28 Private 338409 Bachelors 13 Married-civ-spouse \n", 184 | "\n", 185 | " 6 7 8 9 10 11 12 \\\n", 186 | "0 Adm-clerical Not-in-family White Male 2174 0 40 \n", 187 | "1 Exec-managerial Husband White Male 0 0 13 \n", 188 | "2 Handlers-cleaners Not-in-family White Male 0 0 40 \n", 189 | "3 Handlers-cleaners Husband Black Male 0 0 40 \n", 190 | "4 Prof-specialty Wife Black Female 0 0 40 \n", 191 | "\n", 192 | " 13 14 \n", 193 | "0 United-States <=50K \n", 194 | "1 United-States <=50K \n", 195 | "2 United-States <=50K \n", 196 | "3 United-States <=50K \n", 197 | "4 Cuba <=50K " 198 | ] 199 | }, 200 | "execution_count": 83, 201 | "metadata": {}, 202 | "output_type": "execute_result" 203 | } 204 | ], 205 | "source": [ 206 | "train_data.head()" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 98, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "test_data[14] = test_data[14].apply(lambda x: x[:-1])" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 85, 221 | "metadata": {}, 222 | "outputs": [ 223 | { 224 | "name": "stdout", 225 | "output_type": "stream", 226 | "text": [ 227 | "\n", 228 | "RangeIndex: 32561 entries, 0 to 32560\n", 229 | "Data columns (total 15 columns):\n", 230 | "0 32561 non-null int64\n", 231 | "1 32561 non-null object\n", 232 | "2 32561 non-null int64\n", 233 | "3 32561 non-null object\n", 234 | "4 32561 non-null int64\n", 235 | "5 32561 non-null object\n", 236 | "6 32561 non-null object\n", 237 | "7 32561 non-null object\n", 238 | "8 32561 non-null object\n", 239 | "9 32561 non-null object\n", 240 | "10 32561 non-null int64\n", 241 | "11 32561 non-null int64\n", 242 | "12 32561 non-null int64\n", 243 | "13 32561 non-null object\n", 244 | "14 32561 non-null object\n", 245 | "dtypes: int64(6), object(9)\n", 246 | "memory usage: 3.7+ MB\n" 247 | ] 248 | } 249 | ], 250 | "source": [ 251 | "train_data.info()" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 86, 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "all_columns = ['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','label','type']\n", 261 | "continus_columns =['age','fnlwgt','education-num','capital-gain','capital-loss','hours-per-week']\n", 262 | "dummy_columns = ['workclass','education','marital-status','occupation','relationship','race','sex','native-country']" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 87, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "train_data['type'] = 1\n", 272 | "test_data['type'] = 2" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 88, 278 | "metadata": {}, 279 | "outputs": [ 280 | { 281 | "data": { 282 | "text/html": [ 283 | "
\n", 284 | "\n", 297 | "\n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | "
01234567891011121314type
039State-gov77516Bachelors13Never-marriedAdm-clericalNot-in-familyWhiteMale2174040United-States<=50K1
150Self-emp-not-inc83311Bachelors13Married-civ-spouseExec-managerialHusbandWhiteMale0013United-States<=50K1
238Private215646HS-grad9DivorcedHandlers-cleanersNot-in-familyWhiteMale0040United-States<=50K1
353Private23472111th7Married-civ-spouseHandlers-cleanersHusbandBlackMale0040United-States<=50K1
428Private338409Bachelors13Married-civ-spouseProf-specialtyWifeBlackFemale0040Cuba<=50K1
\n", 417 | "
" 418 | ], 419 | "text/plain": [ 420 | " 0 1 2 3 4 5 \\\n", 421 | "0 39 State-gov 77516 Bachelors 13 Never-married \n", 422 | "1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse \n", 423 | "2 38 Private 215646 HS-grad 9 Divorced \n", 424 | "3 53 Private 234721 11th 7 Married-civ-spouse \n", 425 | "4 28 Private 338409 Bachelors 13 Married-civ-spouse \n", 426 | "\n", 427 | " 6 7 8 9 10 11 12 \\\n", 428 | "0 Adm-clerical Not-in-family White Male 2174 0 40 \n", 429 | "1 Exec-managerial Husband White Male 0 0 13 \n", 430 | "2 Handlers-cleaners Not-in-family White Male 0 0 40 \n", 431 | "3 Handlers-cleaners Husband Black Male 0 0 40 \n", 432 | "4 Prof-specialty Wife Black Female 0 0 40 \n", 433 | "\n", 434 | " 13 14 type \n", 435 | "0 United-States <=50K 1 \n", 436 | "1 United-States <=50K 1 \n", 437 | "2 United-States <=50K 1 \n", 438 | "3 United-States <=50K 1 \n", 439 | "4 Cuba <=50K 1 " 440 | ] 441 | }, 442 | "execution_count": 88, 443 | "metadata": {}, 444 | "output_type": "execute_result" 445 | } 446 | ], 447 | "source": [ 448 | "train_data.head()" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": 89, 454 | "metadata": {}, 455 | "outputs": [ 456 | { 457 | "data": { 458 | "text/html": [ 459 | "
\n", 460 | "\n", 473 | "\n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | "
01234567891011121314type
025Private22680211th7Never-marriedMachine-op-inspctOwn-childBlackMale0040United-States<=50K.2
138Private89814HS-grad9Married-civ-spouseFarming-fishingHusbandWhiteMale0050United-States<=50K.2
228Local-gov336951Assoc-acdm12Married-civ-spouseProtective-servHusbandWhiteMale0040United-States>50K.2
344Private160323Some-college10Married-civ-spouseMachine-op-inspctHusbandBlackMale7688040United-States>50K.2
418?103497Some-college10Never-married?Own-childWhiteFemale0030United-States<=50K.2
\n", 593 | "
" 594 | ], 595 | "text/plain": [ 596 | " 0 1 2 3 4 5 \\\n", 597 | "0 25 Private 226802 11th 7 Never-married \n", 598 | "1 38 Private 89814 HS-grad 9 Married-civ-spouse \n", 599 | "2 28 Local-gov 336951 Assoc-acdm 12 Married-civ-spouse \n", 600 | "3 44 Private 160323 Some-college 10 Married-civ-spouse \n", 601 | "4 18 ? 103497 Some-college 10 Never-married \n", 602 | "\n", 603 | " 6 7 8 9 10 11 12 \\\n", 604 | "0 Machine-op-inspct Own-child Black Male 0 0 40 \n", 605 | "1 Farming-fishing Husband White Male 0 0 50 \n", 606 | "2 Protective-serv Husband White Male 0 0 40 \n", 607 | "3 Machine-op-inspct Husband Black Male 7688 0 40 \n", 608 | "4 ? Own-child White Female 0 0 30 \n", 609 | "\n", 610 | " 13 14 type \n", 611 | "0 United-States <=50K. 2 \n", 612 | "1 United-States <=50K. 2 \n", 613 | "2 United-States >50K. 2 \n", 614 | "3 United-States >50K. 2 \n", 615 | "4 United-States <=50K. 2 " 616 | ] 617 | }, 618 | "execution_count": 89, 619 | "metadata": {}, 620 | "output_type": "execute_result" 621 | } 622 | ], 623 | "source": [ 624 | "test_data.head()" 625 | ] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "execution_count": 99, 630 | "metadata": {}, 631 | "outputs": [], 632 | "source": [ 633 | "all_data = pd.concat([train_data,test_data],axis = 0)\n", 634 | "all_data.columns = all_columns" 635 | ] 636 | }, 637 | { 638 | "cell_type": "code", 639 | "execution_count": 101, 640 | "metadata": {}, 641 | "outputs": [], 642 | "source": [ 643 | "all_data = pd.get_dummies(all_data,columns=dummy_columns)" 644 | ] 645 | }, 646 | { 647 | "cell_type": "code", 648 | "execution_count": 102, 649 | "metadata": {}, 650 | "outputs": [], 651 | "source": [ 652 | "all_data['label'] = all_data['label'].map(lambda x:1 if x.strip()=='>50K' else 0)" 653 | ] 654 | }, 655 | { 656 | "cell_type": "code", 657 | "execution_count": 103, 658 | "metadata": {}, 659 | "outputs": [], 660 | "source": [ 661 | "for col in continus_columns:\n", 662 | " ss = StandardScaler()\n", 663 | " all_data[col] = ss.fit_transform(all_data[[col]])" 664 | ] 665 | }, 666 | { 667 | "cell_type": "code", 668 | "execution_count": 104, 669 | "metadata": {}, 670 | "outputs": [], 671 | "source": [ 672 | "test_data = all_data[all_data['type']==2].drop(['type'],axis=1)\n", 673 | "train_data = all_data[all_data['type']==1].drop(['type'],axis=1)" 674 | ] 675 | }, 676 | { 677 | "cell_type": "code", 678 | "execution_count": 105, 679 | "metadata": {}, 680 | "outputs": [ 681 | { 682 | "data": { 683 | "text/html": [ 684 | "
\n", 685 | "\n", 698 | "\n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | "
agefnlwgteducation-numcapital-gaincapital-losshours-per-weeklabelworkclass_ ?workclass_ Federal-govworkclass_ Local-gov...native-country_ Portugalnative-country_ Puerto-Riconative-country_ Scotlandnative-country_ Southnative-country_ Taiwannative-country_ Thailandnative-country_ Trinadad&Tobagonative-country_ United-Statesnative-country_ Vietnamnative-country_ Yugoslavia
00.025996-1.0619791.1365120.146932-0.217127-0.0340870000...0000000100
10.828308-1.0071041.136512-0.144804-0.217127-2.2130320000...0000000100
2-0.0469420.246034-0.419335-0.144804-0.217127-0.0340870000...0000000100
31.0471210.426663-1.197259-0.144804-0.217127-0.0340870000...0000000100
4-0.7763161.4085301.136512-0.144804-0.217127-0.0340870000...0000000000
\n", 848 | "

5 rows × 109 columns

\n", 849 | "
" 850 | ], 851 | "text/plain": [ 852 | " age fnlwgt education-num capital-gain capital-loss \\\n", 853 | "0 0.025996 -1.061979 1.136512 0.146932 -0.217127 \n", 854 | "1 0.828308 -1.007104 1.136512 -0.144804 -0.217127 \n", 855 | "2 -0.046942 0.246034 -0.419335 -0.144804 -0.217127 \n", 856 | "3 1.047121 0.426663 -1.197259 -0.144804 -0.217127 \n", 857 | "4 -0.776316 1.408530 1.136512 -0.144804 -0.217127 \n", 858 | "\n", 859 | " hours-per-week label workclass_ ? workclass_ Federal-gov \\\n", 860 | "0 -0.034087 0 0 0 \n", 861 | "1 -2.213032 0 0 0 \n", 862 | "2 -0.034087 0 0 0 \n", 863 | "3 -0.034087 0 0 0 \n", 864 | "4 -0.034087 0 0 0 \n", 865 | "\n", 866 | " workclass_ Local-gov ... native-country_ Portugal \\\n", 867 | "0 0 ... 0 \n", 868 | "1 0 ... 0 \n", 869 | "2 0 ... 0 \n", 870 | "3 0 ... 0 \n", 871 | "4 0 ... 0 \n", 872 | "\n", 873 | " native-country_ Puerto-Rico native-country_ Scotland \\\n", 874 | "0 0 0 \n", 875 | "1 0 0 \n", 876 | "2 0 0 \n", 877 | "3 0 0 \n", 878 | "4 0 0 \n", 879 | "\n", 880 | " native-country_ South native-country_ Taiwan native-country_ Thailand \\\n", 881 | "0 0 0 0 \n", 882 | "1 0 0 0 \n", 883 | "2 0 0 0 \n", 884 | "3 0 0 0 \n", 885 | "4 0 0 0 \n", 886 | "\n", 887 | " native-country_ Trinadad&Tobago native-country_ United-States \\\n", 888 | "0 0 1 \n", 889 | "1 0 1 \n", 890 | "2 0 1 \n", 891 | "3 0 1 \n", 892 | "4 0 0 \n", 893 | "\n", 894 | " native-country_ Vietnam native-country_ Yugoslavia \n", 895 | "0 0 0 \n", 896 | "1 0 0 \n", 897 | "2 0 0 \n", 898 | "3 0 0 \n", 899 | "4 0 0 \n", 900 | "\n", 901 | "[5 rows x 109 columns]" 902 | ] 903 | }, 904 | "execution_count": 105, 905 | "metadata": {}, 906 | "output_type": "execute_result" 907 | } 908 | ], 909 | "source": [ 910 | "train_data.head()" 911 | ] 912 | }, 913 | { 914 | "cell_type": "code", 915 | "execution_count": 106, 916 | "metadata": {}, 917 | "outputs": [], 918 | "source": [ 919 | "train_y = train_data['label']\n", 920 | "train_x = train_data.drop(['label'],axis = 1)\n", 921 | "test_y = test_data['label']\n", 922 | "test_x = test_data.drop(['label'],axis = 1)" 923 | ] 924 | }, 925 | { 926 | "cell_type": "code", 927 | "execution_count": 107, 928 | "metadata": {}, 929 | "outputs": [ 930 | { 931 | "data": { 932 | "text/plain": [ 933 | "((24720, 109), (7841, 109))" 934 | ] 935 | }, 936 | "execution_count": 107, 937 | "metadata": {}, 938 | "output_type": "execute_result" 939 | } 940 | ], 941 | "source": [ 942 | "train_data[train_data['label']==0].shape,train_data[train_data['label']==1].shape" 943 | ] 944 | }, 945 | { 946 | "cell_type": "code", 947 | "execution_count": 108, 948 | "metadata": {}, 949 | "outputs": [ 950 | { 951 | "data": { 952 | "text/plain": [ 953 | "((12435, 109), (3846, 109))" 954 | ] 955 | }, 956 | "execution_count": 108, 957 | "metadata": {}, 958 | "output_type": "execute_result" 959 | } 960 | ], 961 | "source": [ 962 | "test_data[test_data['label']==0].shape,test_data[test_data['label']==1].shape" 963 | ] 964 | }, 965 | { 966 | "cell_type": "markdown", 967 | "metadata": {}, 968 | "source": [ 969 | "# 数据处理完后,特征的维度是108维。" 970 | ] 971 | }, 972 | { 973 | "cell_type": "code", 974 | "execution_count": 36, 975 | "metadata": {}, 976 | "outputs": [], 977 | "source": [ 978 | "import tensorflow as tf\n", 979 | "import time\n", 980 | "from sklearn.metrics import roc_auc_score" 981 | ] 982 | }, 983 | { 984 | "cell_type": "code", 985 | "execution_count": 37, 986 | "metadata": {}, 987 | "outputs": [], 988 | "source": [ 989 | "x = tf.placeholder(tf.float32,shape=[None,108])\n", 990 | "y = 
tf.placeholder(tf.float32,shape=[None])" 991 | ] 992 | }, 993 | { 994 | "cell_type": "code", 995 | "execution_count": 38, 996 | "metadata": {}, 997 | "outputs": [], 998 | "source": [ 999 | "m = 2\n", 1000 | "learning_rate = 0.3\n", 1001 | "# 聚类参数\n", 1002 | "u = tf.Variable(tf.random_normal([108,m],0.0,0.5),name='u')\n", 1003 | "w = tf.Variable(tf.random_normal([108,m],0.0,0.5),name='w')" 1004 | ] 1005 | }, 1006 | { 1007 | "cell_type": "code", 1008 | "execution_count": 39, 1009 | "metadata": {}, 1010 | "outputs": [], 1011 | "source": [ 1012 | "U = tf.matmul(x,u)\n", 1013 | "p1 = tf.nn.softmax(U)" 1014 | ] 1015 | }, 1016 | { 1017 | "cell_type": "code", 1018 | "execution_count": 40, 1019 | "metadata": {}, 1020 | "outputs": [], 1021 | "source": [ 1022 | "W = tf.matmul(x,w)\n", 1023 | "p2 = tf.nn.softmax(W)" 1024 | ] 1025 | }, 1026 | { 1027 | "cell_type": "code", 1028 | "execution_count": 43, 1029 | "metadata": {}, 1030 | "outputs": [], 1031 | "source": [ 1032 | "pred = tf.reduce_sum(tf.multiply(p1,p2),1)" 1033 | ] 1034 | }, 1035 | { 1036 | "cell_type": "code", 1037 | "execution_count": 111, 1038 | "metadata": {}, 1039 | "outputs": [ 1040 | { 1041 | "name": "stdout", 1042 | "output_type": "stream", 1043 | "text": [ 1044 | "0 0 cost:11242.442383,train_auc:0.712778,test_auc:0.820132\n", 1045 | "100 6 cost:10882.122070,train_auc:0.899455,test_auc:0.896667\n", 1046 | "200 13 cost:10876.142578,train_auc:0.899216,test_auc:0.896111\n", 1047 | "300 20 cost:10874.057617,train_auc:0.898944,test_auc:0.895544\n", 1048 | "400 27 cost:10873.227539,train_auc:0.898751,test_auc:0.895171\n", 1049 | "500 34 cost:10872.964844,train_auc:0.898600,test_auc:0.894889\n", 1050 | "600 41 cost:10872.932617,train_auc:0.898473,test_auc:0.894680\n", 1051 | "700 47 cost:10873.029297,train_auc:0.898341,test_auc:0.894479\n", 1052 | "800 54 cost:10873.178711,train_auc:0.898198,test_auc:0.894281\n", 1053 | "900 61 cost:10873.346680,train_auc:0.898057,test_auc:0.894087\n", 1054 | "1000 67 cost:10873.514648,train_auc:0.897925,test_auc:0.893923\n", 1055 | "1100 74 cost:10873.672852,train_auc:0.897806,test_auc:0.893769\n", 1056 | "1200 81 cost:10873.840820,train_auc:0.897707,test_auc:0.893632\n", 1057 | "1300 88 cost:10874.041992,train_auc:0.897628,test_auc:0.893517\n", 1058 | "1400 94 cost:10874.262695,train_auc:0.897528,test_auc:0.893393\n", 1059 | "1500 101 cost:10874.436523,train_auc:0.897415,test_auc:0.893272\n", 1060 | "1600 108 cost:10874.587891,train_auc:0.897307,test_auc:0.893159\n", 1061 | "1700 114 cost:10874.765625,train_auc:0.897202,test_auc:0.893054\n", 1062 | "1800 121 cost:10874.981445,train_auc:0.897095,test_auc:0.892930\n", 1063 | "1900 128 cost:10875.167969,train_auc:0.896978,test_auc:0.892802\n", 1064 | "2000 134 cost:10875.333008,train_auc:0.896860,test_auc:0.892688\n", 1065 | "2100 141 cost:10875.524414,train_auc:0.896738,test_auc:0.892574\n", 1066 | "2200 148 cost:10875.757812,train_auc:0.896615,test_auc:0.892461\n", 1067 | "2300 155 cost:10875.997070,train_auc:0.896485,test_auc:0.892339\n", 1068 | "2400 161 cost:10876.208984,train_auc:0.896360,test_auc:0.892227\n", 1069 | "2500 168 cost:10876.394531,train_auc:0.896243,test_auc:0.892121\n", 1070 | "2600 175 cost:10876.570312,train_auc:0.896136,test_auc:0.892023\n", 1071 | "2700 181 cost:10876.739258,train_auc:0.896034,test_auc:0.891935\n", 1072 | "2800 188 cost:10876.905273,train_auc:0.895939,test_auc:0.891854\n", 1073 | "2900 195 cost:10877.067383,train_auc:0.895846,test_auc:0.891770\n", 1074 | "3000 202 
cost:10877.227539,train_auc:0.895758,test_auc:0.891693\n", 1075 | "3100 208 cost:10877.381836,train_auc:0.895672,test_auc:0.891614\n", 1076 | "3200 215 cost:10877.538086,train_auc:0.895587,test_auc:0.891543\n", 1077 | "3300 222 cost:10877.686523,train_auc:0.895506,test_auc:0.891467\n", 1078 | "3400 228 cost:10877.834961,train_auc:0.895428,test_auc:0.891393\n", 1079 | "3500 235 cost:10877.979492,train_auc:0.895351,test_auc:0.891326\n", 1080 | "3600 242 cost:10878.118164,train_auc:0.895276,test_auc:0.891256\n", 1081 | "3700 249 cost:10878.254883,train_auc:0.895202,test_auc:0.891195\n", 1082 | "3800 255 cost:10878.383789,train_auc:0.895131,test_auc:0.891134\n", 1083 | "3900 262 cost:10878.516602,train_auc:0.895061,test_auc:0.891071\n", 1084 | "4000 269 cost:10878.642578,train_auc:0.894996,test_auc:0.891007\n", 1085 | "4100 275 cost:10878.762695,train_auc:0.894933,test_auc:0.890941\n", 1086 | "4200 282 cost:10878.880859,train_auc:0.894870,test_auc:0.890885\n", 1087 | "4300 289 cost:10878.995117,train_auc:0.894808,test_auc:0.890830\n", 1088 | "4400 296 cost:10879.104492,train_auc:0.894749,test_auc:0.890773\n", 1089 | "4500 302 cost:10879.211914,train_auc:0.894694,test_auc:0.890719\n", 1090 | "4600 309 cost:10879.315430,train_auc:0.894638,test_auc:0.890661\n", 1091 | "4700 316 cost:10879.413086,train_auc:0.894587,test_auc:0.890599\n", 1092 | "4800 322 cost:10879.512695,train_auc:0.894538,test_auc:0.890540\n", 1093 | "4900 329 cost:10879.610352,train_auc:0.894489,test_auc:0.890475\n", 1094 | "5000 336 cost:10879.701172,train_auc:0.894441,test_auc:0.890410\n", 1095 | "5100 343 cost:10879.782227,train_auc:0.894394,test_auc:0.890351\n", 1096 | "5200 349 cost:10879.862305,train_auc:0.894349,test_auc:0.890298\n", 1097 | "5300 356 cost:10879.940430,train_auc:0.894302,test_auc:0.890250\n", 1098 | "5400 363 cost:10880.012695,train_auc:0.894259,test_auc:0.890202\n", 1099 | "5500 369 cost:10880.083984,train_auc:0.894218,test_auc:0.890157\n", 1100 | "5600 376 cost:10880.151367,train_auc:0.894178,test_auc:0.890111\n", 1101 | "5700 383 cost:10880.219727,train_auc:0.894140,test_auc:0.890069\n", 1102 | "5800 390 cost:10880.284180,train_auc:0.894102,test_auc:0.890030\n", 1103 | "5900 396 cost:10880.351562,train_auc:0.894066,test_auc:0.889992\n", 1104 | "6000 403 cost:10880.413086,train_auc:0.894032,test_auc:0.889956\n", 1105 | "6100 410 cost:10880.476562,train_auc:0.893997,test_auc:0.889925\n", 1106 | "6200 416 cost:10880.538086,train_auc:0.893965,test_auc:0.889892\n", 1107 | "6300 423 cost:10880.598633,train_auc:0.893930,test_auc:0.889859\n", 1108 | "6400 430 cost:10880.655273,train_auc:0.893898,test_auc:0.889828\n", 1109 | "6500 436 cost:10880.713867,train_auc:0.893868,test_auc:0.889799\n", 1110 | "6600 443 cost:10880.770508,train_auc:0.893839,test_auc:0.889774\n", 1111 | "6700 450 cost:10880.827148,train_auc:0.893811,test_auc:0.889746\n", 1112 | "6800 456 cost:10880.886719,train_auc:0.893784,test_auc:0.889719\n", 1113 | "6900 463 cost:10880.939453,train_auc:0.893758,test_auc:0.889696\n", 1114 | "7000 470 cost:10880.992188,train_auc:0.893733,test_auc:0.889676\n", 1115 | "7100 476 cost:10881.047852,train_auc:0.893709,test_auc:0.889654\n", 1116 | "7200 483 cost:10881.102539,train_auc:0.893686,test_auc:0.889627\n", 1117 | "7300 490 cost:10881.153320,train_auc:0.893661,test_auc:0.889606\n", 1118 | "7400 496 cost:10881.208008,train_auc:0.893640,test_auc:0.889589\n", 1119 | "7500 503 cost:10881.259766,train_auc:0.893618,test_auc:0.889570\n", 1120 | "7600 510 cost:10881.309570,train_auc:0.893597,test_auc:0.889553\n", 
1121 | "7700 516 cost:10881.361328,train_auc:0.893579,test_auc:0.889539\n", 1122 | "7800 523 cost:10881.413086,train_auc:0.893559,test_auc:0.889522\n", 1123 | "7900 530 cost:10881.461914,train_auc:0.893539,test_auc:0.889508\n", 1124 | "8000 536 cost:10881.512695,train_auc:0.893519,test_auc:0.889491\n", 1125 | "8100 543 cost:10881.560547,train_auc:0.893500,test_auc:0.889474\n", 1126 | "8200 550 cost:10881.610352,train_auc:0.893482,test_auc:0.889456\n", 1127 | "8300 556 cost:10881.657227,train_auc:0.893465,test_auc:0.889440\n", 1128 | "8400 563 cost:10881.704102,train_auc:0.893447,test_auc:0.889421\n", 1129 | "8500 570 cost:10881.750977,train_auc:0.893431,test_auc:0.889410\n", 1130 | "8600 576 cost:10881.797852,train_auc:0.893416,test_auc:0.889400\n", 1131 | "8700 583 cost:10881.842773,train_auc:0.893401,test_auc:0.889388\n", 1132 | "8800 590 cost:10881.889648,train_auc:0.893389,test_auc:0.889375\n", 1133 | "8900 596 cost:10881.931641,train_auc:0.893376,test_auc:0.889363\n", 1134 | "9000 603 cost:10881.979492,train_auc:0.893363,test_auc:0.889353\n", 1135 | "9100 610 cost:10882.022461,train_auc:0.893352,test_auc:0.889344\n", 1136 | "9200 616 cost:10882.066406,train_auc:0.893340,test_auc:0.889334\n", 1137 | "9300 623 cost:10882.108398,train_auc:0.893328,test_auc:0.889323\n", 1138 | "9400 629 cost:10882.151367,train_auc:0.893317,test_auc:0.889311\n", 1139 | "9500 636 cost:10882.194336,train_auc:0.893307,test_auc:0.889305\n", 1140 | "9600 643 cost:10882.236328,train_auc:0.893295,test_auc:0.889294\n", 1141 | "9700 649 cost:10882.278320,train_auc:0.893284,test_auc:0.889286\n", 1142 | "9800 656 cost:10882.318359,train_auc:0.893269,test_auc:0.889278\n", 1143 | "9900 663 cost:10882.359375,train_auc:0.893260,test_auc:0.889270\n" 1144 | ] 1145 | } 1146 | ], 1147 | "source": [ 1148 | "cost1 = tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=pred,labels=y))\n", 1149 | "cost = tf.add_n([cost1])\n", 1150 | "train_op = tf.train.FtrlOptimizer(learning_rate).minimize(cost)\n", 1151 | "time_s = time.time()\n", 1152 | "result=[]\n", 1153 | "with tf.Session() as sess:\n", 1154 | " sess.run(tf.global_variables_initializer())# 初始化\n", 1155 | " for epoch in range(0,10000):\n", 1156 | " f_dict ={x:train_x,y:train_y}\n", 1157 | " \n", 1158 | " _,cost_,predict_ = sess.run([train_op,cost,pred],feed_dict=f_dict)\n", 1159 | " \n", 1160 | " auc = roc_auc_score(train_y,predict_)\n", 1161 | " time_t =time.time()\n", 1162 | " # 测试集\n", 1163 | " if epoch % 100 ==0:\n", 1164 | " f_dict ={x:test_x,y:test_y}\n", 1165 | " _,cost_,predict_test =sess.run([train_op,cost,pred],feed_dict=f_dict)\n", 1166 | " test_auc = roc_auc_score(test_y,predict_test)\n", 1167 | " print(\"%d %1d cost:%f,train_auc:%f,test_auc:%f\"%(epoch,(time_t-time_s),cost_,auc,test_auc))\n", 1168 | " result.append([epoch,(time_t-time_s),auc,test_auc])" 1169 | ] 1170 | }, 1171 | { 1172 | "cell_type": "code", 1173 | "execution_count": null, 1174 | "metadata": {}, 1175 | "outputs": [], 1176 | "source": [] 1177 | } 1178 | ], 1179 | "metadata": { 1180 | "kernelspec": { 1181 | "display_name": "Python 3", 1182 | "language": "python", 1183 | "name": "python3" 1184 | }, 1185 | "language_info": { 1186 | "codemirror_mode": { 1187 | "name": "ipython", 1188 | "version": 3 1189 | }, 1190 | "file_extension": ".py", 1191 | "mimetype": "text/x-python", 1192 | "name": "python", 1193 | "nbconvert_exporter": "python", 1194 | "pygments_lexer": "ipython3", 1195 | "version": "3.6.5" 1196 | } 1197 | }, 1198 | "nbformat": 4, 1199 | "nbformat_minor": 2 1200 | } 1201 | 
-------------------------------------------------------------------------------- /NFM/NFM.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | TensorFlow 2.0 implementation of Product-based Neural Network[1] 4 | Reference: 5 | https://zhuanlan.zhihu.com/p/37522285 6 | Neural Factorization Machines for Sparse Predictive Analytics 7 | """ 8 | import tensorflow as tf 9 | 10 | import pickle 11 | from util.train_model import train_test_model_demo 12 | 13 | 14 | class BiInteraction(tf.keras.layers.Layer): 15 | def __init__(self, Units=1, **kwargs): 16 | self.units = Units 17 | super(BiInteraction, self).__init__(**kwargs) 18 | 19 | def build(self, input_shape): 20 | input_dim = input_shape[2] 21 | # self.W = self.add_weight(shape=(input_dim, self.units), initializer='random_normal', trainable=True) 22 | # self.b = self.add_weight(shape=(input_dim, self.units), initializer='random_normal', trainable=True) 23 | self.linearlayer = tf.keras.layers.Dense(input_dim, activation='relu', use_bias=True) 24 | 25 | def call(self, input): 26 | # sum-square-part 27 | self.summed_features_emb = tf.reduce_sum(input,1) # None * K 28 | # print("self.summed_features_emb:",self.summed_features_emb.get_shape()) 29 | self.summed_features_emb_square = tf.square(self.summed_features_emb) # None * K 30 | # square-sum-part 31 | self.squared_features_emb = tf.square(input) 32 | self.squared_sum_features_emb = tf.reduce_sum(self.squared_features_emb,1) # None * K 33 | 34 | # second order 35 | self.y_second_order = 0.5 * tf.subtract(self.summed_features_emb_square,self.squared_sum_features_emb) # None * K 36 | print("y_second_order:",self.y_second_order.get_shape()) # 128 * 10 37 | output = self.linearlayer(self.y_second_order) 38 | return output 39 | 40 | class NFM(tf.keras.Model): 41 | def __init__(self, num_feat, num_field, dropout_deep, deep_layer_sizes, embedding_size=10): 42 | super().__init__() 43 | self.num_feat = num_feat # F =features nums 44 | self.num_field = num_field # N =fields of a feature 45 | self.dropout_deep = dropout_deep 46 | 47 | # Embedding 这里采用embeddings层因此大小为F* M F为特征数量,M为embedding的维度 48 | feat_embeddings = tf.keras.layers.Embedding(num_feat, embedding_size, 49 | embeddings_initializer='uniform') # F * M 50 | self.feat_embeddings = feat_embeddings 51 | 52 | # fc layer 53 | self.deep_layer_sizes = deep_layer_sizes 54 | # 神经网络方面的参数 55 | for i in range(len(deep_layer_sizes)): 56 | setattr(self, 'dense_' + str(i), tf.keras.layers.Dense(deep_layer_sizes[i])) 57 | setattr(self, 'batchNorm_' + str(i), tf.keras.layers.BatchNormalization()) 58 | setattr(self, 'activation_' + str(i), tf.keras.layers.Activation('relu')) 59 | setattr(self, 'dropout_' + str(i), tf.keras.layers.Dropout(dropout_deep[i])) 60 | self.bilayer = BiInteraction(1) 61 | # last layer 62 | self.fc = tf.keras.layers.Dense(1, activation=None, use_bias=True) 63 | 64 | self.linearlayer = tf.keras.layers.Dense(deep_layer_sizes[-1], activation='relu', use_bias=True) 65 | 66 | def call(self, feat_index, feat_value): 67 | # call函数接收输入变量 68 | # embedding part feat_index = inputs为输入 feat_embeddings为一个layer。 69 | feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M 70 | # print(feat_value.get_shape()) 71 | feat_embedding = tf.einsum('bnm,bn->bnm', feat_embedding_0, feat_value) 72 | 73 | y_deep = self.bilayer(feat_embedding) 74 | y_linear = self.linearlayer(tf.reduce_sum(feat_embedding,1)) 75 | 76 | for i in range(len(self.deep_layer_sizes)): 77 | y_deep = getattr(self, 'dense_' + 
str(i))(y_deep) 78 | y_deep = getattr(self, 'batchNorm_' + str(i))(y_deep) 79 | y_deep = getattr(self, 'activation_' + str(i))(y_deep) 80 | y_deep = getattr(self, 'dropout_' + str(i))(y_deep) 81 | y = y_deep + y_linear 82 | output = self.fc(y) 83 | 84 | return output 85 | if __name__ == '__main__': 86 | AID_DATA_DIR = "../data/Criteo/" 87 | feat_dict_ = pickle.load(open(AID_DATA_DIR + '/feat_dict_10.pkl2', 'rb')) 88 | 89 | nfm = NFM(num_feat=len(feat_dict_) + 1, num_field=39, dropout_deep=[0.5, 0.5, 0.5], 90 | deep_layer_sizes=[400, 400], embedding_size=10) 91 | 92 | train_label_path = AID_DATA_DIR + 'train_label' 93 | train_idx_path = AID_DATA_DIR + 'train_idx' 94 | train_value_path = AID_DATA_DIR + 'train_value' 95 | 96 | test_label_path = AID_DATA_DIR + 'test_label' 97 | test_idx_path = AID_DATA_DIR + 'test_idx' 98 | test_value_path = AID_DATA_DIR + 'test_value' 99 | 100 | train_test_model_demo(nfm,train_label_path, train_idx_path, train_value_path) 101 | -------------------------------------------------------------------------------- /PNN/PNN-tf2.0.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "D:\\anaconda3\\julianxu\\lib\\site-packages\\h5py\\__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", 13 | " from ._conv import register_converters as _register_converters\n", 14 | "Using TensorFlow backend.\n" 15 | ] 16 | } 17 | ], 18 | "source": [ 19 | "import numpy as np\n", 20 | "import pandas as pd\n", 21 | "# from sklearn.preprocessing import OneHotEncoder,StandarScaler\n", 22 | "from sklearn.metrics import accuracy_score\n", 23 | "import random\n", 24 | "from keras.utils import to_categorical\n", 25 | "from sklearn.preprocessing import LabelEncoder\n", 26 | "\n", 27 | "from sklearn.metrics import roc_auc_score\n", 28 | "\n", 29 | "import tensorflow as tf\n", 30 | "\n", 31 | "from collections import Counter\n", 32 | "\n", 33 | "import math" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 23, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "class PNN(tf.keras.Model):\n", 43 | " def __init__(self,num_feat,num_field,dropout_deep,deep_layer_sizes,product_layer_dim=10,reg_l1=0.01,reg_l2=1e-5,embedding_size=10,product_type='outer'):\n", 44 | " super().__init__()\n", 45 | " self.reg_l1 = reg_l1\n", 46 | " self.reg_l2 = reg_l2\n", 47 | " self.num_feat = num_feat # F =features nums\n", 48 | " self.num_field = num_field # N =fields of a feature \n", 49 | " self.product_layer_dim = product_layer_dim # D1 pnn dim\n", 50 | " self.dropout_deep = dropout_deep\n", 51 | " \n", 52 | " # Embedding 这里采用embeddings层因此大小为F* M F为特征数量,M为embedding的维度\n", 53 | " feat_embeddings = tf.keras.layers.Embedding(num_feat, embedding_size, embeddings_initializer='uniform') # F * M \n", 54 | " self.feat_embeddings = feat_embeddings\n", 55 | " \n", 56 | " # 定义随机初始化\n", 57 | " initializer = tf.initializers.GlorotUniform()\n", 58 | " \n", 59 | " # linear part 线性层就是embedding层的复制,因此线性信号权重大小是D1 * N * M,为什么因此是线性层维度为 D1,embedding层维度为N* M\n", 60 | " # 因此权重大小为D1 * N *M\n", 61 | " self.linear_weights = tf.Variable(initializer(shape=(product_layer_dim,num_field,embedding_size))) # D1 * N * M\n", 62 | " \n", 63 | " # 
quadratic part \n", 64 | " self.product_type = product_type\n", 65 | " if product_type == 'inner':\n", 66 | " self.theta = tf.Variable(initializer(shape=(product_layer_dim,num_field))) # D1 * N\n", 67 | "\n", 68 | " else:\n", 69 | " self.quadratic_weights = tf.Variable(initializer(shape=(product_layer_dim,embedding_size, embedding_size)))# D1 * M * M\n", 70 | " \n", 71 | " # fc layer\n", 72 | " self.deep_layer_sizes = deep_layer_sizes\n", 73 | " #神经网络方面的参数\n", 74 | " for i in range(len(deep_layer_sizes)):\n", 75 | " setattr(self, 'dense_' + str(i),tf.keras.layers.Dense(deep_layer_sizes[i]))\n", 76 | " setattr(self, 'batchNorm_' + str(i),tf.keras.layers.BatchNormalization())\n", 77 | " setattr(self, 'activation_' + str(i),tf.keras.layers.Activation('relu'))\n", 78 | " setattr(self, 'dropout_' + str(i),tf.keras.layers.Dropout(dropout_deep[i]))\n", 79 | " \n", 80 | " # last layer\n", 81 | " self.fc = tf.keras.layers.Dense(1,activation=None,use_bias=True)\n", 82 | " \n", 83 | " def call(self,feat_index,feat_value):\n", 84 | " # call函数接收输入变量\n", 85 | " # embedding part feat_index = inputs为输入 feat_embeddings为一个layer。\n", 86 | " feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M \n", 87 | "# print(feat_value.get_shape())\n", 88 | " feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value)\n", 89 | " # linear part \n", 90 | " lz = tf.einsum('bnm,dnm->bd',feat_embedding,self.linear_weights) # Batch * D1\n", 91 | " \n", 92 | " # quadratic part\n", 93 | " if self.product_type == 'inner':\n", 94 | " theta = tf.einsum('bnm,dn->bdnm',feat_embedding,self.theta) # Batch * D1 * N * M \n", 95 | " lp = tf.einsum('bdnm,bdnm->bd',theta,theta) # Batch * D1\n", 96 | " else:\n", 97 | " embed_sum = tf.reduce_sum(feat_embedding,axis=1) # Batch * M\n", 98 | " p = tf.einsum('bm,bn->bmn',embed_sum,embed_sum)\n", 99 | " lp = tf.einsum('bmn,dmn->bd',p,self.quadratic_weights) # Batch * D1\n", 100 | " \n", 101 | " y_deep = tf.concat((lz,lp),axis=1)\n", 102 | " y_deep = tf.keras.layers.Dropout(self.dropout_deep[0])(y_deep)\n", 103 | " \n", 104 | " for i in range(len(self.deep_layer_sizes)):\n", 105 | " y_deep = getattr(self,'dense_' + str(i))(y_deep)\n", 106 | " y_deep = getattr(self,'batchNorm_' + str(i))(y_deep)\n", 107 | " y_deep = getattr(self,'activation_' + str(i))(y_deep)\n", 108 | " y_deep = getattr(self,'dropout_' + str(i))(y_deep)\n", 109 | " \n", 110 | " output = self.fc(y_deep)\n", 111 | " \n", 112 | " return output " 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 7, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "train = pd.read_csv(r'F:\\baidudownload\\kaggle-2014-criteo-master\\kaggle-2014-criteo-master\\train.tiny.csv')\n", 122 | "\n", 123 | "train = train.fillna(0)\n", 124 | "\n", 125 | "traindrop = train.drop(columns = ['Id'])\n", 126 | "\n", 127 | "traindrop.to_csv(r'F:\\baidudownload\\kaggle-2014-criteo-master\\kaggle-2014-criteo-master\\train.txt',sep='\\t', index=False,header=None)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 11, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "freq_ = 10\n", 137 | "# dir_feat_dict_ = 'feat_dict_' + str(freq_) + '.pkl2'\n", 138 | "continuous_range_ = range(1, 14)\n", 139 | "categorical_range_ = range(14, 40)\n", 140 | "\n", 141 | "# 统计离散特征每个离散值出现的次数组成字典\n", 142 | "feat_cnt = Counter()\n", 143 | "with open(r'F:\\baidudownload\\kaggle-2014-criteo-master\\kaggle-2014-criteo-master\\train.txt', 'r') as fin:\n", 144 | " for line_idx, line in 
enumerate(fin):\n", 145 | " features = line.rstrip('\\n').split('\\t')\n", 146 | " for idx in categorical_range_:\n", 147 | " if features[idx] == '': continue\n", 148 | " feat_cnt.update([features[idx]])" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 13, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "# Only retain discrete features with high frequency\n", 158 | "dis_feat_set = set() # 高频段的离散字符\n", 159 | "for feat, ot in feat_cnt.items():\n", 160 | " if ot >= freq_:\n", 161 | " dis_feat_set.add(feat)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 14, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "# Create a dictionary for continuous and discrete features\n", 171 | "feat_dict = {}\n", 172 | "tc = 1\n", 173 | "# Continuous features\n", 174 | "for idx in continuous_range_:\n", 175 | " feat_dict[idx] = tc\n", 176 | " tc += 1 # 代表占据一列\n", 177 | "\n", 178 | "# Discrete features\n", 179 | "cnt_feat_set = set()\n", 180 | "with open(r'F:\\baidudownload\\kaggle-2014-criteo-master\\kaggle-2014-criteo-master\\train.txt', 'r') as fin:\n", 181 | " for line_idx, line in enumerate(fin):\n", 182 | " features = line.rstrip('\\n').split('\\t')\n", 183 | " for idx in categorical_range_:\n", 184 | " # 排除空字符和低频离散字符\n", 185 | " if features[idx] == '' or features[idx] not in dis_feat_set:\n", 186 | " continue\n", 187 | " # 排除连续性数值\n", 188 | " if features[idx] not in cnt_feat_set:\n", 189 | " cnt_feat_set.add(features[idx])\n", 190 | " # 获取种类数\n", 191 | " feat_dict[features[idx]] = tc\n", 192 | " tc += 1" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 16, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "file_path = \"F:\\\\baidudownload\\\\kaggle-2014-criteo-master\\\\kaggle-2014-criteo-master\\\\\"" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 18, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "cont_features=['I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9',\n", 211 | " 'I10', 'I11', 'I12', 'I13']\n", 212 | "dist_features = ['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7',\n", 213 | " 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17',\n", 214 | " 'C18', 'C19', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26']" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 21, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "train_label = []\n", 224 | "train_value = []\n", 225 | "train_idx = []\n", 226 | "test_label = []\n", 227 | "test_value = []\n", 228 | "test_idx = []\n", 229 | "\n", 230 | "continuous_range_ = range(1, 14)\n", 231 | "categorical_range_ = range(14, 40)\n", 232 | "cont_max_=[]\n", 233 | "cont_min_=[]\n", 234 | "for cf in cont_features:\n", 235 | " cont_max_.append(max(train[cf]))\n", 236 | " cont_min_.append(min(train[cf]))\n", 237 | "cont_diff_ = [cont_max_[i] - cont_min_[i] for i in range(len(cont_min_))]\n", 238 | "\n", 239 | "def process_line_(line):\n", 240 | " features = line.rstrip('\\n').split('\\t')\n", 241 | " feat_idx, feat_value, label = [], [], []\n", 242 | "\n", 243 | " # MinMax Normalization\n", 244 | " for idx in continuous_range_:\n", 245 | " if features[idx] == '':\n", 246 | " feat_idx.append(0)\n", 247 | " feat_value.append(0.0)\n", 248 | " else:\n", 249 | " feat_idx.append(feat_dict[idx])\n", 250 | " # 归一化\n", 251 | " feat_value.append(round((float(features[idx]) - cont_min_[idx - 1]) / cont_diff_[idx - 1], 6))\n", 252 | "\n", 253 
| " # 处理离散型数据\n", 254 | " for idx in categorical_range_:\n", 255 | " if features[idx] == '' or features[idx] not in feat_dict:\n", 256 | " feat_idx.append(0)\n", 257 | " feat_value.append(0.0)\n", 258 | " else:\n", 259 | " feat_idx.append(feat_dict[features[idx]])\n", 260 | " feat_value.append(1.0)\n", 261 | " return feat_idx, feat_value, [int(features[0])]\n", 262 | "split_ratio = 0.9\n", 263 | "with open(file_path + 'train.txt', 'r') as fin:\n", 264 | " for line_idx, line in enumerate(fin):\n", 265 | "\n", 266 | " feat_idx, feat_value, label = process_line_(line)\n", 267 | " if np.random.random() <= split_ratio:\n", 268 | " train_label.append(label)\n", 269 | " train_idx.append(feat_idx)\n", 270 | " train_value.append(feat_value)\n", 271 | " else:\n", 272 | " test_label.append(label)\n", 273 | " test_idx.append(feat_idx)\n", 274 | " test_value.append(feat_value)" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 24, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "pnn = PNN(num_feat=len(feat_dict) + 1, num_field=39, dropout_deep=[0.5, 0.5, 0.5],\n", 284 | " deep_layer_sizes=[400, 400], product_layer_dim=10,\n", 285 | " reg_l1=0.01, reg_l2=1e-5, embedding_size=10, product_type='outer')" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 25, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "train_ds = tf.data.Dataset.from_tensor_slices(\n", 295 | " (train_label,train_idx,train_value)).shuffle(10000).batch(32)" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 26, 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "@tf.function\n", 305 | "def train_one_step(model, optimizer, idx, value, label):\n", 306 | " with tf.GradientTape() as tape:\n", 307 | " output = model(idx,value)\n", 308 | " loss = loss_object(y_true=label, y_pred=output)\n", 309 | " grads = tape.gradient(loss, model.trainable_variables)\n", 310 | " grads = [tf.clip_by_norm(g, 100) for g in grads]\n", 311 | " optimizer.apply_gradients(grads_and_vars=zip(grads, model.trainable_variables))\n", 312 | " \n", 313 | " train_loss(loss)\n", 314 | " train_accuracy(label,output)" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 27, 320 | "metadata": {}, 321 | "outputs": [], 322 | "source": [ 323 | "train_loss = tf.keras.metrics.Mean(name='train_loss')\n", 324 | "train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_acc')\n", 325 | "\n", 326 | "loss_object = tf.keras.losses.BinaryCrossentropy()\n", 327 | "\n", 328 | "optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": 35, 334 | "metadata": { 335 | "scrolled": true 336 | }, 337 | "outputs": [ 338 | { 339 | "name": "stdout", 340 | "output_type": "stream", 341 | "text": [ 342 | "WARNING:tensorflow:Entity > could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. 
Cause: Failed to parse source code of >, which Python reported as:\n", 514 | " def call(self,feat_index,feat_value):\n", 515 | " # call函数接收输入变量\n", 516 | " # embedding part feat_index = inputs为输入 feat_embeddings为一个layer。\n", 517 | " feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M \n", 518 | "# print(feat_value.get_shape())\n", 519 | " feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value)\n", 520 | " # linear part \n", 521 | " lz = tf.einsum('bnm,dnm->bd',feat_embedding,self.linear_weights) # Batch * D1\n", 522 | " \n", 523 | " # quadratic part\n", 524 | " if self.product_type == 'inner':\n", 525 | " theta = tf.einsum('bnm,dn->bdnm',feat_embedding,self.theta) # Batch * D1 * N * M \n", 526 | " lp = tf.einsum('bdnm,bdnm->bd',theta,theta) # Batch * D1\n", 527 | " else:\n", 528 | " embed_sum = tf.reduce_sum(feat_embedding,axis=1) # Batch * M\n", 529 | " p = tf.einsum('bm,bn->bmn',embed_sum,embed_sum)\n", 530 | " lp = tf.einsum('bmn,dmn->bd',p,self.quadratic_weights) # Batch * D1\n", 531 | " \n", 532 | " y_deep = tf.concat((lz,lp),axis=1)\n", 533 | " y_deep = tf.keras.layers.Dropout(self.dropout_deep[0])(y_deep)\n", 534 | " \n", 535 | " for i in range(len(self.deep_layer_sizes)):\n", 536 | " y_deep = getattr(self,'dense_' + str(i))(y_deep)\n", 537 | " y_deep = getattr(self,'batchNorm_' + str(i))(y_deep)\n", 538 | " y_deep = getattr(self,'activation_' + str(i))(y_deep)\n", 539 | " y_deep = getattr(self,'dropout_' + str(i))(y_deep)\n", 540 | " \n", 541 | " output = self.fc(y_deep)\n", 542 | " \n", 543 | " return output \n", 544 | "\n", 545 | "This may be caused by multiline strings or comments not indented at the same level as the code.\n", 546 | "Epoch 1, Loss: 0.6576068997383118, Accuracy: 0.7935805320739746\n", 547 | "Epoch 2, Loss: 0.5885103344917297, Accuracy: 0.7927504181861877\n", 548 | "Epoch 3, Loss: 0.5613061785697937, Accuracy: 0.7932115793228149\n", 549 | "Epoch 4, Loss: 0.5463097095489502, Accuracy: 0.7933038473129272\n", 550 | "Epoch 5, Loss: 0.5362721681594849, Accuracy: 0.7933591604232788\n", 551 | "Epoch 6, Loss: 0.5272665023803711, Accuracy: 0.7933960556983948\n", 552 | "Epoch 7, Loss: 0.519040048122406, Accuracy: 0.7934224009513855\n", 553 | "Epoch 8, Loss: 0.5109833478927612, Accuracy: 0.7934421896934509\n", 554 | "Epoch 9, Loss: 0.5033180713653564, Accuracy: 0.7937650084495544\n", 555 | "Epoch 10, Loss: 0.4961019456386566, Accuracy: 0.7946319580078125\n", 556 | "Epoch 11, Loss: 0.4890766143798828, Accuracy: 0.7957438230514526\n", 557 | "Epoch 12, Loss: 0.4827176034450531, Accuracy: 0.7968548536300659\n", 558 | "Epoch 13, Loss: 0.47674980759620667, Accuracy: 0.797752320766449\n", 559 | "Epoch 14, Loss: 0.4711345434188843, Accuracy: 0.7988378405570984\n", 560 | "Epoch 15, Loss: 0.4657707214355469, Accuracy: 0.800000011920929\n", 561 | "Epoch 16, Loss: 0.46048682928085327, Accuracy: 0.8010168671607971\n", 562 | "Epoch 17, Loss: 0.45555245876312256, Accuracy: 0.8017839193344116\n", 563 | "Epoch 18, Loss: 0.45082414150238037, Accuracy: 0.8027116656303406\n", 564 | "Epoch 19, Loss: 0.44618847966194153, Accuracy: 0.8038330674171448\n", 565 | "Epoch 20, Loss: 0.4417554438114166, Accuracy: 0.8047869205474854\n", 566 | "Epoch 21, Loss: 0.4372897744178772, Accuracy: 0.8057553768157959\n", 567 | "Epoch 22, Loss: 0.4330126941204071, Accuracy: 0.8065603375434875\n", 568 | "Epoch 23, Loss: 0.4288385808467865, Accuracy: 0.8073434233665466\n", 569 | "Epoch 24, Loss: 0.4247806966304779, Accuracy: 0.8084993362426758\n", 570 | "Epoch 25, Loss: 
0.4208265244960785, Accuracy: 0.8095406889915466\n", 571 | "Epoch 26, Loss: 0.4169451892375946, Accuracy: 0.8103954792022705\n", 572 | "Epoch 27, Loss: 0.4130288362503052, Accuracy: 0.8114328980445862\n", 573 | "Epoch 28, Loss: 0.40920114517211914, Accuracy: 0.8125543594360352\n", 574 | "Epoch 29, Loss: 0.40538835525512695, Accuracy: 0.8135412335395813\n", 575 | "Epoch 30, Loss: 0.4015306830406189, Accuracy: 0.8148680925369263\n", 576 | "Epoch 31, Loss: 0.3976829946041107, Accuracy: 0.8161808252334595\n", 577 | "Epoch 32, Loss: 0.3940720558166504, Accuracy: 0.8173941373825073\n", 578 | "Epoch 33, Loss: 0.3902757167816162, Accuracy: 0.8186178207397461\n", 579 | "Epoch 34, Loss: 0.3865205645561218, Accuracy: 0.8198021054267883\n", 580 | "Epoch 35, Loss: 0.3826711177825928, Accuracy: 0.8213613629341125\n", 581 | "Epoch 36, Loss: 0.3788437247276306, Accuracy: 0.8228186964988708\n", 582 | "Epoch 37, Loss: 0.37512916326522827, Accuracy: 0.8244065642356873\n", 583 | "Epoch 38, Loss: 0.37138548493385315, Accuracy: 0.8259109258651733\n", 584 | "Epoch 39, Loss: 0.3679004907608032, Accuracy: 0.8273097276687622\n", 585 | "Epoch 40, Loss: 0.36420953273773193, Accuracy: 0.8287769556045532\n", 586 | "Epoch 41, Loss: 0.3605599105358124, Accuracy: 0.8302671313285828\n", 587 | "Epoch 42, Loss: 0.3569217622280121, Accuracy: 0.8318312168121338\n", 588 | "Epoch 43, Loss: 0.3537658751010895, Accuracy: 0.8332582712173462\n", 589 | "Epoch 44, Loss: 0.35020676255226135, Accuracy: 0.8348342180252075\n", 590 | "Epoch 45, Loss: 0.3465138077735901, Accuracy: 0.8364631533622742\n", 591 | "Epoch 46, Loss: 0.34285783767700195, Accuracy: 0.8380813598632812\n", 592 | "Epoch 47, Loss: 0.3392927646636963, Accuracy: 0.8396543264389038\n", 593 | "Epoch 48, Loss: 0.33572396636009216, Accuracy: 0.8413115739822388\n", 594 | "Epoch 49, Loss: 0.33254799246788025, Accuracy: 0.8427882790565491\n", 595 | "Epoch 50, Loss: 0.3292645514011383, Accuracy: 0.8442501425743103\n" 596 | ] 597 | } 598 | ], 599 | "source": [ 600 | "EPOCHS = 50\n", 601 | "for epoch in range(EPOCHS):\n", 602 | " for label, idx, value in train_ds:\n", 603 | " train_one_step(pnn,optimizer,idx, value,label)\n", 604 | " template = 'Epoch {}, Loss: {}, Accuracy: {}'\n", 605 | " print (template.format(epoch+1,\n", 606 | " train_loss.result(),train_accuracy.result()))" 607 | ] 608 | }, 609 | { 610 | "cell_type": "code", 611 | "execution_count": null, 612 | "metadata": {}, 613 | "outputs": [], 614 | "source": [] 615 | } 616 | ], 617 | "metadata": { 618 | "kernelspec": { 619 | "display_name": "Python 3", 620 | "language": "python", 621 | "name": "python3" 622 | }, 623 | "language_info": { 624 | "codemirror_mode": { 625 | "name": "ipython", 626 | "version": 3 627 | }, 628 | "file_extension": ".py", 629 | "mimetype": "text/x-python", 630 | "name": "python", 631 | "nbconvert_exporter": "python", 632 | "pygments_lexer": "ipython3", 633 | "version": "3.6.5" 634 | } 635 | }, 636 | "nbformat": 4, 637 | "nbformat_minor": 2 638 | } 639 | -------------------------------------------------------------------------------- /PNN/PNN.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | TensorFlow 2.0 implementation of Product-based Neural Network[1] 4 | Reference: 5 | [1] Product-based Neural Networks for User ResponsePrediction, 6 | Yanru Qu, Han Cai, Kan Ren, Weinan Zhang, Yong Yu, Ying Wen, Jun Wang 7 | [2] Tensorflow implementation of PNN 8 | https://github.com/Snail110/Awesome-RecSystem-Models/blob/master/Model/PNN_TensorFlow.py 9 | """ 
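# Illustrative sketch (not part of the original file): a quick shape check for the
# einsum-based product layer defined below. B, N, M and D1 are arbitrary example sizes
# standing in for batch size, number of fields, embedding size and product-layer
# dimension; the helper name _product_layer_shape_check is ours.
def _product_layer_shape_check(B=4, N=39, M=10, D1=10):
    import tensorflow as tf
    e = tf.random.normal((B, N, M))                                         # field embeddings, Batch * N * M
    # linear signals lz: project the embedding tensor onto D1 weight "templates"
    lz = tf.einsum('bnm,dnm->bd', e, tf.random.normal((D1, N, M)))          # Batch * D1
    # inner-product signals: theta weights each field, then squared-sum over (n, m)
    theta = tf.einsum('bnm,dn->bdnm', e, tf.random.normal((D1, N)))         # Batch * D1 * N * M
    lp_inner = tf.einsum('bdnm,bdnm->bd', theta, theta)                     # Batch * D1
    # outer-product signals: outer product of the field-summed embedding
    s = tf.reduce_sum(e, axis=1)                                            # Batch * M
    p = tf.einsum('bm,bn->bmn', s, s)                                       # Batch * M * M
    lp_outer = tf.einsum('bmn,dmn->bd', p, tf.random.normal((D1, M, M)))    # Batch * D1
    print(lz.shape, lp_inner.shape, lp_outer.shape)                         # (4, 10) (4, 10) (4, 10)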
10 | import tensorflow as tf 11 | 12 | import pickle 13 | from util.train_model import train_test_model_demo 14 | class PNN(tf.keras.Model): 15 | def __init__(self, num_feat, num_field, dropout_deep, deep_layer_sizes, product_layer_dim=10, reg_l1=0.01, 16 | reg_l2=1e-5, embedding_size=10, product_type='outer'): 17 | super().__init__() 18 | self.reg_l1 = reg_l1 19 | self.reg_l2 = reg_l2 20 | self.num_feat = num_feat # F =features nums 21 | self.num_field = num_field # N =fields of a feature 22 | self.product_layer_dim = product_layer_dim # D1 pnn dim 23 | self.dropout_deep = dropout_deep 24 | 25 | # Embedding 这里采用embeddings层因此大小为F* M F为特征数量,M为embedding的维度 26 | feat_embeddings = tf.keras.layers.Embedding(num_feat, embedding_size, 27 | embeddings_initializer='uniform') # F * M 28 | self.feat_embeddings = feat_embeddings 29 | 30 | # 定义随机初始化 31 | initializer = tf.initializers.GlorotUniform() 32 | 33 | # linear part 线性层就是embedding层的复制,因此线性信号权重大小是D1 * N * M,为什么因此是线性层维度为 D1,embedding层维度为N* M 34 | # 因此权重大小为D1 * N *M 35 | self.linear_weights = tf.Variable( 36 | initializer(shape=(product_layer_dim, num_field, embedding_size))) # D1 * N * M 37 | 38 | # quadratic part 39 | self.product_type = product_type 40 | if product_type == 'inner': 41 | self.theta = tf.Variable(initializer(shape=(product_layer_dim, num_field))) # D1 * N 42 | 43 | else: 44 | self.quadratic_weights = tf.Variable( 45 | initializer(shape=(product_layer_dim, embedding_size, embedding_size))) # D1 * M * M 46 | 47 | # fc layer 48 | self.deep_layer_sizes = deep_layer_sizes 49 | # 神经网络方面的参数 50 | for i in range(len(deep_layer_sizes)): 51 | setattr(self, 'dense_' + str(i), tf.keras.layers.Dense(deep_layer_sizes[i])) 52 | setattr(self, 'batchNorm_' + str(i), tf.keras.layers.BatchNormalization()) 53 | setattr(self, 'activation_' + str(i), tf.keras.layers.Activation('relu')) 54 | setattr(self, 'dropout_' + str(i), tf.keras.layers.Dropout(dropout_deep[i])) 55 | 56 | # last layer 57 | self.fc = tf.keras.layers.Dense(1, activation=None, use_bias=True) 58 | 59 | def call(self, feat_index, feat_value): 60 | # call函数接收输入变量 61 | # embedding part feat_index = inputs为输入 feat_embeddings为一个layer。 62 | feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M 63 | # print(feat_value.get_shape()) 64 | feat_embedding = tf.einsum('bnm,bn->bnm', feat_embedding_0, feat_value) 65 | # linear part 66 | lz = tf.einsum('bnm,dnm->bd', feat_embedding, self.linear_weights) # Batch * D1 67 | 68 | # quadratic part 69 | if self.product_type == 'inner': 70 | theta = tf.einsum('bnm,dn->bdnm', feat_embedding, self.theta) # Batch * D1 * N * M 71 | lp = tf.einsum('bdnm,bdnm->bd', theta, theta) # Batch * D1 72 | else: 73 | embed_sum = tf.reduce_sum(feat_embedding, axis=1) # Batch * M 74 | p = tf.einsum('bm,bn->bmn', embed_sum, embed_sum) 75 | lp = tf.einsum('bmn,dmn->bd', p, self.quadratic_weights) # Batch * D1 76 | 77 | y_deep = tf.concat((lz, lp), axis=1) 78 | y_deep = tf.keras.layers.Dropout(self.dropout_deep[0])(y_deep) 79 | 80 | for i in range(len(self.deep_layer_sizes)): 81 | y_deep = getattr(self, 'dense_' + str(i))(y_deep) 82 | y_deep = getattr(self, 'batchNorm_' + str(i))(y_deep) 83 | y_deep = getattr(self, 'activation_' + str(i))(y_deep) 84 | y_deep = getattr(self, 'dropout_' + str(i))(y_deep) 85 | 86 | output = self.fc(y_deep) 87 | 88 | return output 89 | if __name__ == '__main__': 90 | AID_DATA_DIR = "../data/Criteo/" 91 | feat_dict_ = pickle.load(open(AID_DATA_DIR + '/feat_dict_10.pkl2', 'rb')) 92 | 93 | pnn = PNN(num_feat=len(feat_dict_) + 1, 
num_field=39, dropout_deep=[0.5, 0.5, 0.5], 94 | deep_layer_sizes=[400, 400], product_layer_dim=10, 95 | reg_l1=0.01, reg_l2=1e-5, embedding_size=10, product_type='outer') 96 | 97 | train_label_path = AID_DATA_DIR + 'train_label' 98 | train_idx_path = AID_DATA_DIR + 'train_idx' 99 | train_value_path = AID_DATA_DIR + 'train_value' 100 | 101 | test_label_path = AID_DATA_DIR + 'test_label' 102 | test_idx_path = AID_DATA_DIR + 'test_idx' 103 | test_value_path = AID_DATA_DIR + 'test_value' 104 | 105 | train_test_model_demo(pnn,train_label_path, train_idx_path, train_value_path) 106 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # recsys 2 | 3 | ## 1.Requirements 4 | Note that the models here are built mainly with the TensorFlow 2.0 API. 5 | 6 | TensorFlow2.0,Keras, Python3.6, NumPy, sk-learn, Pandas 7 | 8 | ## 2.Datasets 9 | 10 | ### 2.1 Criteo 11 | 12 | This dataset contains about 45 million records. There are 13 features taking integer values (mostly count features) and 26 categorical features. The dataset is available at http://labs.criteo.com/2014/02/download-kaggle-display-advertising-challenge-dataset/ 14 | 15 | Only a subset of the data is used here for model training: data = ../data/Criteo/train.txt 16 | 17 | ### 2.2 Seguro-safe-driver 18 | 19 | In the train and test data, features that belong to similar groupings are 20 | tagged as such in the feature names (e.g., ind, reg, car, calc). In addition, 21 | feature names include the postfix bin to indicate binary features and 22 | cat to indicate categorical features. Features without these designations 23 | are either continuous or ordinal. Values of -1 indicate that the feature was 24 | missing from the observation. The target column signifies whether or not a 25 | claim was filed for that policy holder. 26 | 27 | The dataset is available at https://www.kaggle.com/c/porto-seguro-safe-driver-prediction 28 | 29 | ## 3.
推荐系统实战 30 | 31 | ![image](https://pic2.zhimg.com/80/v2-763b523bd17349cd6cfecae2765db3d5_hd.jpg) 32 | 来自https://zhuanlan.zhihu.com/p/69050253 33 | 34 | ![image](https://pic3.zhimg.com/v2-dd98a58d2676f20ded7d7b0c61e88fa2_r.jpg) 35 | 来自https://zhuanlan.zhihu.com/p/53231955 36 | ### 3.1 第一章.协同过滤 37 | 38 | ### 3.2 第二章 GBDT+LR 39 | 40 | 本质上GBDT+LR是一种具有stacking思想的二分类,所以用来解决二分类问题,这个方法出自于Facebook 2014年的论文 Practical Lessons from Predicting Clicks on Ads at Facebook 。 41 | https://zhuanlan.zhihu.com/p/29053940 42 | 43 | ### 3.3 第三章 MLR 44 | 45 | 算法简单实现 我们这里只是简单实现一个tensorflow版本的MLR模型 46 | https://www.jianshu.com/p/627fc0d755b2 47 | 48 | ### 3.4 第四章 DCN 49 | 50 | Deep Cross Network模型 51 | 52 | https://www.jianshu.com/p/77719fc252fa 53 | 54 | https://github.com/Nirvanada/Deep-and-Cross-Keras 55 | 56 | https://blog.csdn.net/roguesir/article/details/797632 57 | 58 | https://arxiv.org/abs/1708.05123 59 | 60 | ### 3.5 第五章 PNN 61 | 62 | https://github.com/JianzhouZhan/Awesome-RecSystem-Models 63 | 64 | https://github.com/Snail110/tensorflow_practice/blob/master/recommendation/Basic-PNN-Demo/PNN.py 65 | 66 | https://www.jianshu.com/p/be784ab4abc2 67 | 68 | 69 | ### 3.6 第六章 Wide-Deep 70 | 71 | https://zhuanlan.zhihu.com/p/92279796 72 | 73 | https://github.com/busesese/Wide_Deep_Model 74 | 75 | ### 3.6 第七章 NFM 76 | 77 | https://zhuanlan.zhihu.com/p/37522285 78 | Neural Factorization Machines for Sparse Predictive Analytics -------------------------------------------------------------------------------- /Wide-Deep/Wide-Deep.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | import pickle 4 | from util.train_model import train_test_model_demo 5 | 6 | 7 | class Wide(tf.keras.layers.Layer): 8 | def __init__(self,units=1): 9 | # input_dim = num_size + embed_size = input_size 10 | super(Wide, self).__init__() 11 | # self.units = units 12 | self.linear = tf.keras.layers.Dense(units=units,activation='relu') 13 | def call(self, inputs): 14 | output = self.linear(inputs) 15 | return output 16 | 17 | class Deep(tf.keras.layers.Layer): 18 | def __init__(self,num_feat,num_field,dropout_deep,deep_layer_sizes,embedding_size=10): 19 | # input_dim = num_size + embed_size = input_size 20 | super(Deep, self).__init__() 21 | self.num_feat = num_feat # F =features nums 22 | self.num_field = num_field # N =fields of a feature 23 | self.dropout_deep = dropout_deep 24 | 25 | # Embedding 这里采用embeddings层因此大小为F* M F为特征数量,M为embedding的维度 26 | feat_embeddings = tf.keras.layers.Embedding(num_feat, embedding_size, embeddings_initializer='uniform') # F * M 27 | self.feat_embeddings = feat_embeddings 28 | 29 | # fc layer 30 | self.deep_layer_sizes = deep_layer_sizes 31 | #神经网络方面的参数 32 | for i in range(len(deep_layer_sizes)): 33 | setattr(self, 'dense_' + str(i),tf.keras.layers.Dense(deep_layer_sizes[i])) 34 | setattr(self, 'batchNorm_' + str(i),tf.keras.layers.BatchNormalization()) 35 | setattr(self, 'activation_' + str(i),tf.keras.layers.Activation('relu')) 36 | setattr(self, 'dropout_' + str(i),tf.keras.layers.Dropout(dropout_deep[i])) 37 | # last layer 38 | self.fc = tf.keras.layers.Dense(1,activation=None,use_bias=True) 39 | 40 | def call(self,feat_index,feat_value): 41 | # embedding part feat_index = inputs为输入 feat_embeddings为一个layer。 42 | feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M 43 | # print(feat_value.get_shape()) 44 | feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value) 45 | 46 | y_deep = 
tf.keras.layers.Flatten()(feat_embedding) 47 | for i in range(len(self.deep_layer_sizes)): 48 | y_deep = getattr(self,'dense_' + str(i))(y_deep) 49 | y_deep = getattr(self,'batchNorm_' + str(i))(y_deep) 50 | y_deep = getattr(self,'activation_' + str(i))(y_deep) 51 | y_deep = getattr(self,'dropout_' + str(i))(y_deep) 52 | 53 | output = self.fc(y_deep) 54 | return output 55 | 56 | class WideDeep(tf.keras.Model): 57 | def __init__(self,num_feat,num_field,dropout_deep,deep_layer_sizes,embedding_size=10): 58 | super().__init__() 59 | self.num_feat = num_feat # F =features nums 60 | self.num_field = num_field # N =fields of a feature 61 | self.dropout_deep = dropout_deep 62 | 63 | self.wide = Wide(units=1) 64 | self.deep = Deep(num_feat,num_field,dropout_deep,deep_layer_sizes) 65 | self.fc = tf.keras.layers.Dense(1,activation=None,use_bias=True) 66 | 67 | def call(self,num_input,feat_index,feat_value): 68 | x1 = self.wide(num_input) 69 | x2 = self.deep(feat_index,feat_value) 70 | 71 | x3 = tf.keras.layers.concatenate([x1,x2],axis=-1) 72 | output = self.fc(x3) 73 | return output 74 | 75 | 76 | if __name__ == '__main__': 77 | AID_DATA_DIR = "../data/Criteo/" 78 | feat_dict_ = pickle.load(open(AID_DATA_DIR + '/cross_feat_dict_10.pkl2', 'rb')) 79 | 80 | widedeep = WideDeep(num_feat=len(feat_dict_) + 1, num_field=52, dropout_deep=[0.5, 0.5, 0.5], 81 | deep_layer_sizes=[400, 400],embedding_size=10) 82 | 83 | train_label_path = AID_DATA_DIR + 'traincross_label' 84 | train_idx_path = AID_DATA_DIR + 'traincross_idx' 85 | train_value_path = AID_DATA_DIR + 'traincross_value' 86 | train_num_path = AID_DATA_DIR + 'traincross_num' 87 | 88 | # 这种读取数据方式采用TextLineDataset,数据为大文件时,节省内存,效率训练 89 | def get_batch_dataset(label_path, idx_path, value_path,num_path): 90 | label = tf.data.TextLineDataset(label_path) 91 | idx = tf.data.TextLineDataset(idx_path) 92 | value = tf.data.TextLineDataset(value_path) 93 | num = tf.data.TextLineDataset(num_path) 94 | 95 | label = label.map(lambda x: tf.strings.to_number(tf.strings.split(x, sep='\t')), num_parallel_calls=12) 96 | idx = idx.map(lambda x: tf.strings.to_number(tf.strings.split(x, sep='\t')), num_parallel_calls=12) 97 | value = value.map(lambda x: tf.strings.to_number(tf.strings.split(x, sep='\t')), num_parallel_calls=12) 98 | num = num.map(lambda x: tf.strings.to_number(tf.strings.split(x, sep='\t')), num_parallel_calls=12) 99 | 100 | batch_dataset = tf.data.Dataset.zip((num,label, idx, value)) 101 | batch_dataset = batch_dataset.shuffle(buffer_size=128) 102 | batch_dataset = batch_dataset.batch(128) 103 | batch_dataset = batch_dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE) 104 | return batch_dataset 105 | train_batch_dataset = get_batch_dataset(train_label_path, train_idx_path, train_value_path,train_num_path) 106 | 107 | train_loss = tf.keras.metrics.Mean(name='train_loss') 108 | train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_acc') 109 | loss_object = tf.keras.losses.BinaryCrossentropy() 110 | optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4) 111 | 112 | 113 | @tf.function 114 | def train_one_step(model, optimizer, idx, value, label, num): 115 | with tf.GradientTape() as tape: 116 | output = model(num, idx, value) 117 | loss = loss_object(y_true=label, y_pred=output) 118 | grads = tape.gradient(loss, model.trainable_variables) 119 | grads = [tf.clip_by_norm(g, 100) for g in grads] 120 | optimizer.apply_gradients(grads_and_vars=zip(grads, model.trainable_variables)) 121 | 122 | train_loss(loss) 123 | train_accuracy(label, output) 124 
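    # Batch layout consumed by train_one_step (see get_batch_dataset above):
    #   num   - the 13 min-max normalized continuous features, fed directly to the Wide part;
    #   idx   - feature-dictionary ids for all fields (continuous, categorical and the
    #           hand-crafted cross features built by data_process.py), looked up by the
    #           Embedding layer in the Deep part;
    #   value - the matching feature values (normalized floats for continuous fields,
    #           1.0 / 0.0 indicators for categorical and cross fields);
    #   label - the click label from column 0 of traincross.txt.
    # Each step records the forward pass on a GradientTape, clips every gradient to
    # norm 100 with tf.clip_by_norm, applies Adam, and updates the running
    # loss / accuracy metrics.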
| 125 | EPOCHS = 50 126 | for epoch in range(EPOCHS): 127 | for num, label, idx, value in train_batch_dataset: 128 | train_one_step(widedeep, optimizer, idx, value, label,num) 129 | template = 'Epoch {}, Loss: {}, Accuracy: {}' 130 | print(template.format(epoch + 1, 131 | train_loss.result(), train_accuracy.result())) 132 | 133 | -------------------------------------------------------------------------------- /Wide-Deep/data_process.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import pickle 4 | from collections import Counter 5 | 6 | """ 7 | Data Process for Wide-Deep network 8 | https://github.com/busesese/Wide_Deep_Model 9 | https://github.com/aviraj-sinha/ML5/blob/master/10.%20Keras%20Wide%20and%20Deep.ipynb 10 | """ 11 | def get_train_test_file(file_path, feat_dict_, split_ratio=0.9): 12 | #定义训练集与测试集 13 | train_label_fout = open(file_path+'traincross_label', 'w') 14 | train_value_fout = open(file_path+'traincross_value', 'w') 15 | train_idx_fout = open(file_path+'traincross_idx', 'w') 16 | train_num_fout = open(file_path + 'traincross_num', 'w') 17 | 18 | continuous_range_ = range(1, 14) 19 | categorical_range_ = range(14, 52) 20 | 21 | def process_line_(line): 22 | features = line.rstrip('\n').split('\t') 23 | feat_idx, feat_value, label= [], [], [] 24 | # 自己获取每列特征中的最大值,最小值 25 | cont_min_ = [0.0, -2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] 26 | cont_max_ = [95.0,7864,8457.0,87.0,1015215.0,4638.0,1658.0,547.0,5637.0,4.0,37.0,98.0,770.0] 27 | cont_diff_ = [cont_max_[i] - cont_min_[i] for i in range(len(cont_min_))] 28 | # MinMax Normalization 29 | for idx in continuous_range_: 30 | if features[idx] == '': 31 | feat_idx.append(0) 32 | feat_value.append(0.0) 33 | else: 34 | feat_idx.append(feat_dict_[idx]) 35 | feat_value.append(round((float(features[idx]) - cont_min_[idx - 1]) / cont_diff_[idx - 1], 6)) 36 | # 获取数值型特征 37 | num = feat_value[:] 38 | # 处理分类型数据 39 | for idx in categorical_range_: 40 | if features[idx] == '' or features[idx] not in feat_dict_: 41 | feat_idx.append(0) 42 | feat_value.append(0.0) 43 | else: 44 | feat_idx.append(feat_dict_[features[idx]]) 45 | feat_value.append(1.0) 46 | return feat_idx, feat_value, [int(features[0])], num 47 | 48 | with open(file_path+'traincross.txt', 'r') as fin: 49 | for line_idx, line in enumerate(fin): 50 | feat_idx, feat_value, label, num = process_line_(line) 51 | 52 | feat_value = '\t'.join([str(v) for v in feat_value]) + '\n' 53 | feat_idx = '\t'.join([str(idx) for idx in feat_idx]) + '\n' 54 | label = '\t'.join([str(idx) for idx in label]) + '\n' 55 | feat_num = '\t'.join([str(idx) for idx in num]) + '\n' 56 | 57 | train_label_fout.write(label) 58 | train_idx_fout.write(feat_idx) 59 | train_value_fout.write(feat_value) 60 | train_num_fout.write(feat_num) 61 | 62 | fin.close() 63 | 64 | train_label_fout.close() 65 | train_idx_fout.close() 66 | train_value_fout.close() 67 | train_num_fout.close() 68 | 69 | 70 | def cross_feature(file_path,cross_range): 71 | # 构建交叉特征数据集 72 | traincross = open(file_path+'traincross.txt', 'w') 73 | with open(file_path+'train.txt', 'r') as fin: 74 | for line_idx, line in enumerate(fin): 75 | features = line.rstrip('\n').split('\t') 76 | for i in cross_range: 77 | features.append('_'.join([features[i[0]], features[i[1]]])) 78 | string_features = '\t'.join(features) + '\n' 79 | traincross.write(string_features) 80 | fin.close() 81 | traincross.close() 82 | 83 | def get_feat_dict(file_path): 84 | 85 | freq_ = 10 86 
| # pkl2格式用来保存字典形式的wide-deep数据pickle 87 | dir_feat_dict_ = file_path+'cross_feat_dict_' + str(freq_) + '.pkl2' 88 | continuous_range_ = range(1, 14) 89 | categorical_range_ = range(14, 52) 90 | 91 | if os.path.exists(dir_feat_dict_): 92 | feat_dict = pickle.load(open(dir_feat_dict_, 'rb')) 93 | else: 94 | # print('generate a feature dict') 95 | # Count the number of occurrences of discrete features 96 | feat_cnt = Counter() 97 | with open(file_path+'traincross.txt', 'r') as fin: 98 | for line_idx, line in enumerate(fin): 99 | features = line.rstrip('\n').split('\t') 100 | for idx in categorical_range_: 101 | if features[idx] == '': continue 102 | feat_cnt.update([features[idx]]) 103 | fin.close() 104 | # Only retain discrete features with high frequency 105 | dis_feat_set = set() 106 | for feat, ot in feat_cnt.items(): 107 | if ot >= freq_: 108 | dis_feat_set.add(feat) 109 | 110 | # Create a dictionary for continuous and discrete features 111 | feat_dict = {} 112 | tc = 1 113 | # Continuous features 114 | for idx in continuous_range_: 115 | feat_dict[idx] = tc 116 | tc += 1 117 | # Discrete features 118 | cnt_feat_set = set() 119 | with open(file_path+'traincross.txt', 'r') as fin: 120 | for line_idx, line in enumerate(fin): 121 | features = line.rstrip('\n').split('\t') 122 | 123 | for idx in categorical_range_: 124 | if features[idx] == '' or features[idx] not in dis_feat_set: 125 | continue 126 | if features[idx] not in cnt_feat_set: 127 | cnt_feat_set.add(features[idx]) 128 | feat_dict[features[idx]] = tc 129 | tc += 1 130 | # Save dictionary 131 | fin.close() 132 | with open(dir_feat_dict_, 'wb') as fout: 133 | pickle.dump(feat_dict, fout) 134 | print('args.num_feat ', len(feat_dict) + 1) 135 | return feat_dict 136 | 137 | 138 | if __name__ == '__main__': 139 | file_path = '../data/Criteo/' 140 | # 交叉特征 141 | cross_range = [[14, 15], [16, 17], [18, 19], [20, 21], [22, 23], [24, 25], [26, 27], [28, 29], [30, 31], 142 | [32, 33], [34, 35], [36, 37], [38, 39]] 143 | cross_feature(file_path,cross_range) 144 | feat_dict = get_feat_dict(file_path) 145 | get_train_test_file(file_path, feat_dict) 146 | print('Done!') -------------------------------------------------------------------------------- /data/Criteo/data_process.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import pickle 4 | from collections import Counter 5 | 6 | """ 7 | Data Process for FM, PNN, and DeepFM. 
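Output layout (one sample per line, tab separated, with a 90/10 random train/test split):
    train_label / test_label : the click label taken from column 0 of train.txt
    train_idx   / test_idx   : 39 feature ids from feat_dict (13 continuous + 26 categorical fields)
    train_value / test_value : 39 values (min-max normalized floats, or 1.0/0.0 indicators)
feat_dict_10.pkl2 maps every continuous column index and every categorical value seen
at least 10 times to a consecutive integer id; unknown or rare values fall back to id 0.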
8 | [1] PaddlePaddle implementation of DeepFM for CTR prediction 9 | https://github.com/Snail110/Awesome-RecSystem-Models/blob/master/data/Criteo/forOtherModels/dataPreprocess_TensorFlow.py 10 | """ 11 | def get_train_test_file(file_path, feat_dict_, split_ratio=0.9): 12 | #定义训练集与测试集 13 | train_label_fout = open('train_label', 'w') 14 | train_value_fout = open('train_value', 'w') 15 | train_idx_fout = open('train_idx', 'w') 16 | test_label_fout = open('test_label', 'w') 17 | test_value_fout = open('test_value', 'w') 18 | test_idx_fout = open('test_idx', 'w') 19 | 20 | continuous_range_ = range(1, 14) 21 | categorical_range_ = range(14, 40) 22 | 23 | def process_line_(line): 24 | features = line.rstrip('\n').split('\t') 25 | feat_idx, feat_value, label = [], [], [] 26 | # 自己获取每列特征中的最大值,最小值 27 | cont_min_ = [0.0, -2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] 28 | cont_max_ = [95.0,7864,8457.0,87.0,1015215.0,4638.0,1658.0,547.0,5637.0,4.0,37.0,98.0,770.0] 29 | cont_diff_ = [cont_max_[i] - cont_min_[i] for i in range(len(cont_min_))] 30 | # MinMax Normalization 31 | for idx in continuous_range_: 32 | if features[idx] == '': 33 | feat_idx.append(0) 34 | feat_value.append(0.0) 35 | else: 36 | feat_idx.append(feat_dict_[idx]) 37 | feat_value.append(round((float(features[idx]) - cont_min_[idx - 1]) / cont_diff_[idx - 1], 6)) 38 | 39 | # 处理分类型数据 40 | for idx in categorical_range_: 41 | if features[idx] == '' or features[idx] not in feat_dict_: 42 | feat_idx.append(0) 43 | feat_value.append(0.0) 44 | else: 45 | feat_idx.append(feat_dict_[features[idx]]) 46 | feat_value.append(1.0) 47 | return feat_idx, feat_value, [int(features[0])] 48 | 49 | with open(file_path, 'r') as fin: 50 | for line_idx, line in enumerate(fin): 51 | feat_idx, feat_value, label = process_line_(line) 52 | 53 | feat_value = '\t'.join([str(v) for v in feat_value]) + '\n' 54 | feat_idx = '\t'.join([str(idx) for idx in feat_idx]) + '\n' 55 | label = '\t'.join([str(idx) for idx in label]) + '\n' 56 | 57 | if np.random.random() <= split_ratio: 58 | train_label_fout.write(label) 59 | train_idx_fout.write(feat_idx) 60 | train_value_fout.write(feat_value) 61 | else: 62 | test_label_fout.write(label) 63 | test_idx_fout.write(feat_idx) 64 | test_value_fout.write(feat_value) 65 | 66 | fin.close() 67 | 68 | train_label_fout.close() 69 | train_idx_fout.close() 70 | train_value_fout.close() 71 | test_label_fout.close() 72 | test_idx_fout.close() 73 | test_value_fout.close() 74 | 75 | 76 | def get_feat_dict(file_path): 77 | freq_ = 10 78 | # pkl2格式用来保存字典形式的数据pickle 79 | dir_feat_dict_ = 'feat_dict_' + str(freq_) + '.pkl2' 80 | continuous_range_ = range(1, 14) 81 | categorical_range_ = range(14, 40) 82 | 83 | if os.path.exists(dir_feat_dict_): 84 | feat_dict = pickle.load(open(dir_feat_dict_, 'rb')) 85 | else: 86 | # print('generate a feature dict') 87 | # Count the number of occurrences of discrete features 88 | feat_cnt = Counter() 89 | with open(file_path, 'r') as fin: 90 | for line_idx, line in enumerate(fin): 91 | features = line.rstrip('\n').split('\t') 92 | for idx in categorical_range_: 93 | if features[idx] == '': continue 94 | feat_cnt.update([features[idx]]) 95 | 96 | # Only retain discrete features with high frequency 97 | dis_feat_set = set() 98 | for feat, ot in feat_cnt.items(): 99 | if ot >= freq_: 100 | dis_feat_set.add(feat) 101 | 102 | # Create a dictionary for continuous and discrete features 103 | feat_dict = {} 104 | tc = 1 105 | # Continuous features 106 | for idx in continuous_range_: 107 | feat_dict[idx] = 
tc 108 | tc += 1 109 | # Discrete features 110 | cnt_feat_set = set() 111 | with open(file_path, 'r') as fin: 112 | for line_idx, line in enumerate(fin): 113 | features = line.rstrip('\n').split('\t') 114 | for idx in categorical_range_: 115 | if features[idx] == '' or features[idx] not in dis_feat_set: 116 | continue 117 | if features[idx] not in cnt_feat_set: 118 | cnt_feat_set.add(features[idx]) 119 | feat_dict[features[idx]] = tc 120 | tc += 1 121 | 122 | # Save dictionary 123 | with open(dir_feat_dict_, 'wb') as fout: 124 | pickle.dump(feat_dict, fout) 125 | print('args.num_feat ', len(feat_dict) + 1) 126 | 127 | return feat_dict 128 | 129 | 130 | if __name__ == '__main__': 131 | file_path = './train.txt' 132 | feat_dict = get_feat_dict(file_path) 133 | get_train_test_file(file_path, feat_dict) 134 | print('Done!') -------------------------------------------------------------------------------- /embedding.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "D:\\anaconda3\\julianxu\\lib\\site-packages\\h5py\\__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", 13 | " from ._conv import register_converters as _register_converters\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import tensorflow as tf" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "csv = [\n", 28 | " \"1,harden|james|curry\",\n", 29 | " \"2,wrestbrook|harden|durant\",\n", 30 | " \"3,|paul|towns\",\n", 31 | "]" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "TAG_SET = [\"harden\", \"james\", \"curry\", \"durant\", \"paul\",\"towns\",\"wrestbrook\"]" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 4, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "# 处理得到SpareTensor\n", 50 | "ids,post_tags_str = tf.decode_csv(csv,[[-1],[\"\"]])" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 7, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "table = tf.contrib.lookup.index_table_from_tensor(\n", 60 | "mapping=TAG_SET,default_value=-1) # 构造一个查找表" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 9, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "split_tags = tf.string_split(post_tags_str,\"|\")" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 12, 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "data": { 79 | "text/plain": [ 80 | "" 81 | ] 82 | }, 83 | "execution_count": 12, 84 | "metadata": {}, 85 | "output_type": "execute_result" 86 | } 87 | ], 88 | "source": [ 89 | "split_tags.indices" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 14, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "tags = tf.SparseTensor(\n", 99 | "indices = split_tags.indices,\n", 100 | "values = table.lookup(split_tags.values),\n", 101 | " dense_shape=split_tags.dense_shape)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 15, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "# 
定义embedding变量\n", 111 | "# 大小为3 因为 只有7个类型\n", 112 | "TAG_EMBEDDING_DIM = 3\n", 113 | "embedding_params = tf.Variable(tf.truncated_normal([len(TAG_SET),TAG_EMBEDDING_DIM]))" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 16, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "name": "stdout", 123 | "output_type": "stream", 124 | "text": [ 125 | "WARNING:tensorflow:The default value of combiner will change from \"mean\" to \"sqrtn\" after 2016/11/01.\n" 126 | ] 127 | } 128 | ], 129 | "source": [ 130 | "embedding_tags = tf.nn.embedding_lookup_sparse(embedding_params,sp_ids=tags,sp_weights=None)\n", 131 | "# sp_ids就是我们刚刚得到的SparseTensor,而sp_weights=None代表的每一个取值的权重,如果是None的话,所有权重都是1,也就是相当于取了平均\n", 132 | "# 如果不是None的话,我们需要同样传入一个SparseTensor,代表不同球员的喜欢权重。大家感兴趣可以自己去尝试" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 18, 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "name": "stdout", 142 | "output_type": "stream", 143 | "text": [ 144 | "[SparseTensorValue(indices=array([[0, 0],\n", 145 | " [0, 1],\n", 146 | " [0, 2],\n", 147 | " [1, 0],\n", 148 | " [1, 1],\n", 149 | " [1, 2],\n", 150 | " [2, 0],\n", 151 | " [2, 1]], dtype=int64), values=array([0, 1, 2, 6, 0, 3, 4, 5], dtype=int64), dense_shape=array([3, 3], dtype=int64)), array([[ 0.06023904, 1.0575624 , -0.9093878 ],\n", 152 | " [-0.42566654, 0.26845995, -0.6602178 ],\n", 153 | " [-0.6277443 , 0.28916246, -0.15512544]], dtype=float32), array([b'harden|james|curry', b'wrestbrook|harden|durant', b'|paul|towns'],\n", 154 | " dtype=object)]\n" 155 | ] 156 | } 157 | ], 158 | "source": [ 159 | "with tf.Session() as s:\n", 160 | " s.run([tf.global_variables_initializer(),tf.tables_initializer()])\n", 161 | " print(s.run([tags,embedding_tags,post_tags_str]))" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [] 170 | } 171 | ], 172 | "metadata": { 173 | "kernelspec": { 174 | "display_name": "Python 3", 175 | "language": "python", 176 | "name": "python3" 177 | }, 178 | "language_info": { 179 | "codemirror_mode": { 180 | "name": "ipython", 181 | "version": 3 182 | }, 183 | "file_extension": ".py", 184 | "mimetype": "text/x-python", 185 | "name": "python", 186 | "nbconvert_exporter": "python", 187 | "pygments_lexer": "ipython3", 188 | "version": "3.6.5" 189 | } 190 | }, 191 | "nbformat": 4, 192 | "nbformat_minor": 2 193 | } 194 | -------------------------------------------------------------------------------- /util/train_model.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | """ 4 | import tensorflow as tf 5 | 6 | def train_test_model_demo(model,train_label_path, train_idx_path, train_value_path): 7 | # 这种读取数据方式采用TextLineDataset,数据为大文件时,节省内存,效率训练 8 | def get_batch_dataset(label_path, idx_path, value_path): 9 | label = tf.data.TextLineDataset(label_path) 10 | idx = tf.data.TextLineDataset(idx_path) 11 | value = tf.data.TextLineDataset(value_path) 12 | 13 | label = label.map(lambda x: tf.strings.to_number(tf.strings.split(x, sep='\t')), num_parallel_calls=12) 14 | idx = idx.map(lambda x: tf.strings.to_number(tf.strings.split(x, sep='\t')), num_parallel_calls=12) 15 | value = value.map(lambda x: tf.strings.to_number(tf.strings.split(x, sep='\t')), num_parallel_calls=12) 16 | 17 | batch_dataset = tf.data.Dataset.zip((label, idx, value)) 18 | batch_dataset = batch_dataset.shuffle(buffer_size=128) 19 | batch_dataset = batch_dataset.batch(128) 20 | 
batch_dataset = batch_dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE) 21 | return batch_dataset 22 | train_batch_dataset = get_batch_dataset(train_label_path, train_idx_path, train_value_path) 23 | 24 | train_loss = tf.keras.metrics.Mean(name='train_loss') 25 | train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_acc') 26 | loss_object = tf.keras.losses.BinaryCrossentropy() 27 | optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4) 28 | 29 | 30 | @tf.function 31 | def train_one_step(model, optimizer, idx, value, label): 32 | with tf.GradientTape() as tape: 33 | output = model(idx, value) 34 | loss = loss_object(y_true=label, y_pred=output) 35 | grads = tape.gradient(loss, model.trainable_variables) 36 | grads = [tf.clip_by_norm(g, 100) for g in grads] 37 | optimizer.apply_gradients(grads_and_vars=zip(grads, model.trainable_variables)) 38 | 39 | train_loss(loss) 40 | train_accuracy(label, output) 41 | 42 | EPOCHS = 50 43 | for epoch in range(EPOCHS): 44 | for label, idx, value in train_batch_dataset: 45 | train_one_step(model, optimizer, idx, value, label) 46 | template = 'Epoch {}, Loss: {}, Accuracy: {}' 47 | print(template.format(epoch + 1, 48 | train_loss.result(), train_accuracy.result())) 49 | 50 | def train_test_model_demo_1(model,train_label, train_idx, train_value): 51 | # 这种读取数据方式采用tf.data.Dataset.from_tensor_slices,数据为小文件时,便于进行大数据前的调试模型使用。 52 | def get_dataset(train_label, train_idx, train_value): 53 | train_ds = tf.data.Dataset.from_tensor_slices( 54 | (train_label, train_idx, train_value)).shuffle(10000).batch(32) 55 | return train_ds 56 | train_batch_dataset = get_dataset(train_label, train_idx, train_value) 57 | 58 | train_loss = tf.keras.metrics.Mean(name='train_loss') 59 | train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_acc') 60 | # 二分类 61 | loss_object = tf.keras.losses.BinaryCrossentropy() 62 | optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4) 63 | 64 | @tf.function 65 | def train_one_step(model, optimizer, idx, value, label): 66 | with tf.GradientTape() as tape: 67 | output = model(idx, value) 68 | loss = loss_object(y_true=label, y_pred=output) 69 | grads = tape.gradient(loss, model.trainable_variables) 70 | grads = [tf.clip_by_norm(g, 100) for g in grads] 71 | optimizer.apply_gradients(grads_and_vars=zip(grads, model.trainable_variables)) 72 | 73 | train_loss(loss) 74 | train_accuracy(label, output) 75 | 76 | EPOCHS = 50 77 | for epoch in range(EPOCHS): 78 | for label, idx, value in train_batch_dataset: 79 | train_one_step(model, optimizer, idx, value, label) 80 | template = 'Epoch {}, Loss: {}, Accuracy: {}' 81 | print(template.format(epoch + 1, 82 | train_loss.result(), train_accuracy.result())) --------------------------------------------------------------------------------
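A note on evaluation: both train_test_model_demo variants above only report training loss and accuracy, although data/Criteo/data_process.py also writes test_label / test_idx / test_value files. A minimal held-out evaluation pass could reuse the same TextLineDataset pipeline, roughly as sketched below; the helper name evaluate_model_demo and the sigmoid applied to the Dense(1) output are our additions, not part of the repository.

import tensorflow as tf

def evaluate_model_demo(model, test_label_path, test_idx_path, test_value_path):
    # Same tab-separated text files and parsing as train_test_model_demo above.
    def to_numbers(path):
        ds = tf.data.TextLineDataset(path)
        return ds.map(lambda x: tf.strings.to_number(tf.strings.split(x, sep='\t')),
                      num_parallel_calls=12)

    test_ds = tf.data.Dataset.zip((to_numbers(test_label_path),
                                   to_numbers(test_idx_path),
                                   to_numbers(test_value_path))).batch(128)

    test_auc = tf.keras.metrics.AUC(name='test_auc')
    test_acc = tf.keras.metrics.BinaryAccuracy(name='test_acc')
    for label, idx, value in test_ds:
        # The models end in Dense(1) with no activation, so squash the scores to
        # (0, 1) before feeding the threshold-based metrics.
        prob = tf.sigmoid(model(idx, value))
        test_auc(label, prob)
        test_acc(label, prob)
    print('Test AUC: {}, Test Accuracy: {}'.format(test_auc.result(), test_acc.result()))

For example, after train_test_model_demo(afm, train_label_path, train_idx_path, train_value_path) finishes, evaluate_model_demo(afm, test_label_path, test_idx_path, test_value_path) prints the held-out AUC and accuracy.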