├── .idea
│   └── vcs.xml
├── AFM
│   └── AFM.py
├── CollaborativeFiltering.ipynb
├── DCN
│   ├── DCN-keras.ipynb
│   ├── DCN-tf2.0.ipynb
│   ├── DCN-tf2.0.py
│   └── DCN.ipynb
├── GBDT_LR.ipynb
├── MLR.ipynb
├── NFM
│   └── NFM.py
├── PNN
│   ├── PNN-tf2.0.ipynb
│   └── PNN.py
├── README.md
├── Wide-Deep
│   ├── Wide-Deep.ipynb
│   ├── Wide-Deep.py
│   └── data_process.py
├── data
│   ├── Criteo
│   │   ├── data_process.py
│   │   └── train.txt
│   └── Driver
│       └── train.csv
├── embedding.ipynb
└── util
    └── train_model.py
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="VcsDirectoryMappings">
4 |     <mapping directory="$PROJECT_DIR$" vcs="Git" />
5 |   </component>
6 | </project>
--------------------------------------------------------------------------------
/AFM/AFM.py:
--------------------------------------------------------------------------------
1 |
2 | """
3 | TensorFlow 2.0 implementation of AFM
4 | Reference:
5 | https://www.jianshu.com/p/83d3b2a1e55d
6 | Attentional Factorization Machines:
7 | Learning the Weight of Feature Interactions via Attention Networks
8 | """
9 | import tensorflow as tf
10 |
11 | import pickle
12 | from util.train_model import train_test_model_demo
13 |
14 |
15 | class AttentionNet(tf.keras.layers.Layer):
16 | def __init__(self, embedding_size=10,attention_size=3, **kwargs):
17 | self.embedding_size = embedding_size
18 | self.attention_size = attention_size
19 | super(AttentionNet, self).__init__(**kwargs)
20 |
21 | def build(self, input_shape):
22 | input_dim = input_shape[2]
23 |
24 | self.linearlayer = tf.keras.layers.Dense(input_dim, activation='relu', use_bias=True) # note: defined here but never used in call()
25 | self.attention_w = self.add_weight(shape=(self.embedding_size,self.attention_size),
26 | initializer='random_normal',trainable=True)
27 | self.attention_b = self.add_weight(shape=(self.attention_size,),
28 | initializer='random_normal',trainable=True)
29 | self.attention_h = self.add_weight(shape=(self.attention_size,),
30 | initializer='random_normal',trainable=True)
31 | self.attention_p = self.add_weight(shape=(self.embedding_size,1),
32 | initializer='ones',trainable=True)
33 |
34 | def call(self, input):
35 | # element_wise
36 | num_feat = input.shape[1]
37 | element_wise_product_list = []
38 | for i in range(num_feat):
39 | for j in range(i+1,num_feat):
40 | element_wise_product_list.append(tf.multiply(input[:,i,:],input[:,j,:])) # None * embedding_size
41 | self.element_wise_product = tf.stack(element_wise_product_list) # (F*(F-1)/2) * None * embedding_size
42 | self.element_wise_product = tf.transpose(self.element_wise_product,perm=[1,0,2],name='element_wise_product') # None * (F*(F-1)/2) * embedding_size
43 | print("element_wise_product",self.element_wise_product.get_shape())
44 | # attention part
45 | num_interaction = int(num_feat*(num_feat-1)/2)
46 | # wx+b->relu(wx+b)->h*relu(wx+b)
47 | self.attention_wx_plus_b = tf.reshape(tf.add(tf.matmul(tf.reshape(self.element_wise_product,shape=(-1,self.embedding_size)),
48 | self.attention_w),self.attention_b),shape = [-1,num_interaction,self.attention_size]) # N * (F*(F-1)/2) * attention_size
49 | self.attention_exp = tf.exp(tf.reduce_sum(tf.multiply(tf.nn.relu(self.attention_wx_plus_b),
50 | self.attention_h),axis=2)) # N * (F*(F-1)/2)
51 |
52 | self.attention_exp_sum = tf.reshape(tf.reduce_sum(self.attention_exp,axis=1),shape=(-1,1)) # N * 1
53 |
54 | self.attention_out = tf.divide(self.attention_exp,self.attention_exp_sum,name='attention_out') # N * (F*(F-1)/2)
55 | self.attention_x_product = tf.reduce_sum(tf.einsum('bn,bnm->bnm',self.attention_out,self.element_wise_product),axis=1,name='afm') # N * embedding_size
56 | self.attention_part_sum = tf.matmul(self.attention_x_product,self.attention_p) # N * 1
57 |
58 | return self.attention_part_sum
59 |
60 | class AFM(tf.keras.Model):
61 | def __init__(self, num_feat,embedding_size=10,attention_size=3):
62 | super().__init__()
63 | self.num_feat = num_feat # F: total number of distinct feature values (vocabulary size)
64 | self.embedding_size = embedding_size
65 | self.attention_size = attention_size
66 | # Embedding layer: the table has shape F * M, where F is the number of distinct feature values and M is the embedding dimension
67 | feat_embeddings = tf.keras.layers.Embedding(num_feat, embedding_size,
68 | embeddings_initializer='uniform') # N * embedding_size
69 | self.feat_embeddings = feat_embeddings
70 | self.attentionlayer = AttentionNet(self.embedding_size,self.attention_size)
71 | # linear part
72 | self.linearlayer = tf.keras.layers.Dense(1, activation='relu', use_bias=True)
73 |
74 | def call(self, feat_index, feat_value):
75 | # call receives the input tensors
76 | # embedding part: feat_index is the input; feat_embeddings is an Embedding layer
77 | feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * F * embedding_size
78 | feat_embedding = tf.einsum('bnm,bn->bnm', feat_embedding_0, feat_value) # Batch * F * embedding_size
79 | feat_embedding_1 = tf.transpose(feat_embedding,perm=[0,2,1])
80 | y_deep = self.attentionlayer(feat_embedding)
81 |
82 | y_linear = tf.reduce_sum(self.linearlayer(feat_embedding_1),axis=1)
83 | output = y_deep + y_linear
84 | return output
85 | if __name__ == '__main__':
86 | AID_DATA_DIR = "../data/Criteo/"
87 | feat_dict_ = pickle.load(open(AID_DATA_DIR + '/feat_dict_10.pkl2', 'rb'))
88 |
89 | afm = AFM(num_feat=len(feat_dict_) + 1,embedding_size=10,attention_size=3)
90 |
91 | train_label_path = AID_DATA_DIR + 'train_label'
92 | train_idx_path = AID_DATA_DIR + 'train_idx'
93 | train_value_path = AID_DATA_DIR + 'train_value'
94 |
95 | test_label_path = AID_DATA_DIR + 'test_label'
96 | test_idx_path = AID_DATA_DIR + 'test_idx'
97 | test_value_path = AID_DATA_DIR + 'test_value'
98 |
99 | train_test_model_demo(afm,train_label_path, train_idx_path, train_value_path)
100 |
--------------------------------------------------------------------------------
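The AttentionNet layer above scores each pairwise interaction of field embeddings with a small network (relu(Wx + b) projected onto h, then softmax-normalized) and pools the interactions with those weights. A minimal usage sketch on random embeddings, assuming TensorFlow 2.x is installed and that the class can be imported from AFM/AFM.py (the import path below is an assumption):

import tensorflow as tf
from AFM.AFM import AttentionNet  # assumed import path for the layer defined above

# 4 samples, 5 feature fields, embedding size 10 -> 5*4/2 = 10 pairwise interactions
dummy_embeddings = tf.random.normal((4, 5, 10))
attention = AttentionNet(embedding_size=10, attention_size=3)
pooled = attention(dummy_embeddings)   # attention-weighted pairwise interactions projected to a scalar
print(pooled.shape)                    # expected: (4, 1)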
/CollaborativeFiltering.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 2,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "movies = pd.read_csv(r\"F:\\baidudownload\\ml-20m\\ml-20m\\movies.csv\")"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 4,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "ratings = pd.read_csv(r'F:\\baidudownload\\ml-20m\\ml-20m\\ratings.csv')"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 5,
33 | "metadata": {},
34 | "outputs": [
35 | {
36 | "data": {
37 | "text/html": [
38 | "
\n",
39 | "\n",
52 | "
\n",
53 | " \n",
54 | " \n",
55 | " | \n",
56 | " movieId | \n",
57 | " title | \n",
58 | " genres | \n",
59 | "
\n",
60 | " \n",
61 | " \n",
62 | " \n",
63 | " 0 | \n",
64 | " 1 | \n",
65 | " Toy Story (1995) | \n",
66 | " Adventure|Animation|Children|Comedy|Fantasy | \n",
67 | "
\n",
68 | " \n",
69 | " 1 | \n",
70 | " 2 | \n",
71 | " Jumanji (1995) | \n",
72 | " Adventure|Children|Fantasy | \n",
73 | "
\n",
74 | " \n",
75 | " 2 | \n",
76 | " 3 | \n",
77 | " Grumpier Old Men (1995) | \n",
78 | " Comedy|Romance | \n",
79 | "
\n",
80 | " \n",
81 | " 3 | \n",
82 | " 4 | \n",
83 | " Waiting to Exhale (1995) | \n",
84 | " Comedy|Drama|Romance | \n",
85 | "
\n",
86 | " \n",
87 | " 4 | \n",
88 | " 5 | \n",
89 | " Father of the Bride Part II (1995) | \n",
90 | " Comedy | \n",
91 | "
\n",
92 | " \n",
93 | "
\n",
94 | "
"
95 | ],
96 | "text/plain": [
97 | " movieId title \\\n",
98 | "0 1 Toy Story (1995) \n",
99 | "1 2 Jumanji (1995) \n",
100 | "2 3 Grumpier Old Men (1995) \n",
101 | "3 4 Waiting to Exhale (1995) \n",
102 | "4 5 Father of the Bride Part II (1995) \n",
103 | "\n",
104 | " genres \n",
105 | "0 Adventure|Animation|Children|Comedy|Fantasy \n",
106 | "1 Adventure|Children|Fantasy \n",
107 | "2 Comedy|Romance \n",
108 | "3 Comedy|Drama|Romance \n",
109 | "4 Comedy "
110 | ]
111 | },
112 | "execution_count": 5,
113 | "metadata": {},
114 | "output_type": "execute_result"
115 | }
116 | ],
117 | "source": [
118 | "movies.head()"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": 6,
124 | "metadata": {},
125 | "outputs": [
126 | {
127 | "data": {
128 | "text/html": [
129 | "\n",
130 | "\n",
143 | "
\n",
144 | " \n",
145 | " \n",
146 | " | \n",
147 | " userId | \n",
148 | " movieId | \n",
149 | " rating | \n",
150 | " timestamp | \n",
151 | "
\n",
152 | " \n",
153 | " \n",
154 | " \n",
155 | " 0 | \n",
156 | " 1 | \n",
157 | " 2 | \n",
158 | " 3.5 | \n",
159 | " 1112486027 | \n",
160 | "
\n",
161 | " \n",
162 | " 1 | \n",
163 | " 1 | \n",
164 | " 29 | \n",
165 | " 3.5 | \n",
166 | " 1112484676 | \n",
167 | "
\n",
168 | " \n",
169 | " 2 | \n",
170 | " 1 | \n",
171 | " 32 | \n",
172 | " 3.5 | \n",
173 | " 1112484819 | \n",
174 | "
\n",
175 | " \n",
176 | " 3 | \n",
177 | " 1 | \n",
178 | " 47 | \n",
179 | " 3.5 | \n",
180 | " 1112484727 | \n",
181 | "
\n",
182 | " \n",
183 | " 4 | \n",
184 | " 1 | \n",
185 | " 50 | \n",
186 | " 3.5 | \n",
187 | " 1112484580 | \n",
188 | "
\n",
189 | " \n",
190 | "
\n",
191 | "
"
192 | ],
193 | "text/plain": [
194 | " userId movieId rating timestamp\n",
195 | "0 1 2 3.5 1112486027\n",
196 | "1 1 29 3.5 1112484676\n",
197 | "2 1 32 3.5 1112484819\n",
198 | "3 1 47 3.5 1112484727\n",
199 | "4 1 50 3.5 1112484580"
200 | ]
201 | },
202 | "execution_count": 6,
203 | "metadata": {},
204 | "output_type": "execute_result"
205 | }
206 | ],
207 | "source": [
208 | "ratings.head()"
209 | ]
210 | },
211 | {
212 | "cell_type": "code",
213 | "execution_count": 10,
214 | "metadata": {},
215 | "outputs": [],
216 | "source": [
217 | "data = pd.merge(movies,ratings,on='movieId',how='left')"
218 | ]
219 | },
220 | {
221 | "cell_type": "code",
222 | "execution_count": 12,
223 | "metadata": {},
224 | "outputs": [],
225 | "source": [
226 | "data[['userId','rating','movieId','title']].sort_values('userId').to_csv(r\"F:\\baidudownload\\ml-20m\\ml-20m\\data.csv\",index=False)"
227 | ]
228 | },
229 | {
230 | "cell_type": "markdown",
231 | "metadata": {},
232 | "source": [
233 | "### 采用python字典来表示每位用户评论的电影和评分"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": 25,
239 | "metadata": {},
240 | "outputs": [],
241 | "source": [
242 | "files = open(r\"F:\\baidudownload\\ml-20m\\ml-20m\\data.csv\",'r',encoding=\"UTF-8\")\n",
243 | "# 读取data文件中每行中除了名字的数据\n",
244 | "data = {} ## 存放每个用户评论的电影和评分\n",
245 | "for line in files.readlines():\n",
246 | " line = line.strip().split(',')\n",
247 | " # 如果字典中没有某位用户,则使用用户ID来创建这位用户\n",
248 | " if not line[0] in data.keys():\n",
249 | " data[line[0]] = {line[3]:line[1]} # 子字典\n",
250 | " else:\n",
251 | " data[line[0]][line[3]] = line[1]"
252 | ]
253 | },
254 | {
255 | "cell_type": "markdown",
256 | "metadata": {},
257 | "source": [
258 | "## 计算任何两位用户之间的相似度,由于每位用户评论的电影不完全一样,所以兽先要找到两位用户共同评论过的电影然后计算两者之间的欧式距离,最后算出两者之间的相似度"
259 | ]
260 | },
261 | {
262 | "cell_type": "code",
263 | "execution_count": 16,
264 | "metadata": {},
265 | "outputs": [],
266 | "source": [
267 | "from math import *"
268 | ]
269 | },
270 | {
271 | "cell_type": "code",
272 | "execution_count": 38,
273 | "metadata": {},
274 | "outputs": [],
275 | "source": [
276 | "def Euclidean(user1,user2):\n",
277 | " # 取出两位用户评论过的电影和评分\n",
278 | " user1_data = data[user1]\n",
279 | " user2_data = data[user2]\n",
280 | " \n",
281 | " # 找到两位用户都评论过的电影,并计算两者的欧式距离\n",
282 | " for key in user1_data.keys():\n",
283 | " if key in user2_data.keys():\n",
284 | "# print(user1_data[key],user2_data[key])\n",
285 | " try:\n",
286 | " distance +=pow((float(user1_data[key])-float(user2_data[key])),2)\n",
287 | " except:\n",
288 | " print(\"error:\",user2_data[key])\n",
289 | " return 1/(1+sqrt(distance)) # 计算返回值越小,相似度越大"
290 | ]
291 | },
292 | {
293 | "cell_type": "code",
294 | "execution_count": 39,
295 | "metadata": {},
296 | "outputs": [
297 | {
298 | "name": "stdout",
299 | "output_type": "stream",
300 | "text": [
301 | "error: \n",
302 | "[('17602.0', 0.037535053785096986), ('67346.0', 0.03923924660549805), ('116900.0', 0.03938151824124737), ('130390.0', 0.042373278587501804)]\n"
303 | ]
304 | }
305 | ],
306 | "source": [
307 | "# 计算某个用户与其他用的相似度\n",
308 | "def top10_simliar(userID):\n",
309 | " res = []\n",
310 | " for userid in data.keys():\n",
311 | " # 排除自己计算相似度\n",
312 | " if not userid == userID:\n",
313 | " simliar = Euclidean(userID,userid)\n",
314 | " res.append((userid,simliar))\n",
315 | " res.sort(key=lambda val:val[1])# 按照相似度最大顺序排序\n",
316 | " return res[:4]\n",
317 | "RES = top10_simliar('1.0')\n",
318 | "print(RES)"
319 | ]
320 | },
321 | {
322 | "cell_type": "markdown",
323 | "metadata": {},
324 | "source": [
325 | "## 根据相似度来推荐用户"
326 | ]
327 | },
328 | {
329 | "cell_type": "code",
330 | "execution_count": 45,
331 | "metadata": {},
332 | "outputs": [
333 | {
334 | "name": "stdout",
335 | "output_type": "stream",
336 | "text": [
337 | "error: \n",
338 | "[('Good Will Hunting (1997)', '5.0'), ('Horton Hears a Who! (2008)', '5.0'), ('Billy Madison (1995)', '5.0'), ('Julie & Julia (2009)', '5.0'), ('Chocolat (2000)', '5.0'), ('Harry Potter and the Order of the Phoenix (2007)', '5.0'), ('\"Sisterhood of the Traveling Pants', '5.0'), ('\"Secret Life of Bees', '5.0'), ('Happy Gilmore (1996)', '5.0'), ('Big Daddy (1999)', '5.0')]\n"
339 | ]
340 | }
341 | ],
342 | "source": [
343 | "def recommend(user):\n",
344 | " # 相似度最高用户\n",
345 | " top_sim_user = top10_simliar(user)[0][0]\n",
346 | " # 相似度最高用户的观影记录\n",
347 | " items = data[top_sim_user]\n",
348 | " recommendations = []\n",
349 | " # 筛选出该用户未观看的电影病添加到列表中\n",
350 | " for item in items.keys():\n",
351 | " if item not in data[user].keys():\n",
352 | " recommendations.append((item,items[item]))\n",
353 | " recommendations.sort(key=lambda val :val[1],reverse=True) # 按照评分排序\n",
354 | " # 返回评分最高的10部电影\n",
355 | " return recommendations[:10]\n",
356 | "Recommend = recommend('1.0')\n",
357 | "print(Recommend)"
358 | ]
359 | },
360 | {
361 | "cell_type": "markdown",
362 | "metadata": {},
363 | "source": [
364 | "#### \n",
365 | "但有时我们会碰到因为两个用户之间数据由于数据膨胀,一方数据大,一方数据小,但是两者称明显的线性关系\n",
366 | "\n",
367 | "我们引入Pearson相关系数来衡量两个变量之间的线性相关性。\n",
368 | "\n",
369 | "Pearson:-1~1 -1:完全负相关 1:完全正相关 0:不相关 \n",
370 | "\n",
371 | "相关系数 0.8-1.0 极强相关\n",
372 | "\n",
373 | "0.6-0.8 强相关\n",
374 | "\n",
375 | "0.4-0.6 中等程度相关\n",
376 | "\n",
377 | "0.2-0.4 弱相关\n",
378 | "\n",
379 | "0.0-0.2 极弱相关或无相关\n",
380 | "\n",
381 | "公式:"
382 | ]
383 | },
384 | {
385 | "cell_type": "code",
386 | "execution_count": 61,
387 | "metadata": {},
388 | "outputs": [
389 | {
390 | "name": "stdout",
391 | "output_type": "stream",
392 | "text": [
393 | "0.22531203182281434\n"
394 | ]
395 | }
396 | ],
397 | "source": [
398 | "########################################################################\n",
399 | "##计算两用户之间的Pearson相关系数\n",
400 | "def pearson_sim(user1,user2):\n",
401 | " # 取出两位用户评论过的电影和评分\n",
402 | " user1_data = data[user1]\n",
403 | " user2_data = data[user2]\n",
404 | " distance = 0\n",
405 | " common = {}\n",
406 | " \n",
407 | " # 找到两位用户都评论过的电影\n",
408 | " for key in user1_data.keys():\n",
409 | " if key in user2_data.keys():\n",
410 | " common[key] = 1\n",
411 | " if len(common) == 0:\n",
412 | " return 0#如果没有共同评论过的电影,则返回0\n",
413 | " n = len(common)#共同电影数目\n",
414 | "# print(n,common)\n",
415 | " \n",
416 | " ##计算评分和\n",
417 | " try:\n",
418 | " sum1 = sum([float(user1_data[movie]) for movie in common])\n",
419 | " sum2 = sum([float(user2_data[movie]) for movie in common])\n",
420 | "\n",
421 | " ##计算评分平方和\n",
422 | " sum1Sq = sum([pow(float(user1_data[movie]),2) for movie in common])\n",
423 | " sum2Sq = sum([pow(float(user2_data[movie]),2) for movie in common])\n",
424 | "\n",
425 | " ##计算乘积和\n",
426 | " PSum = sum([float(user1_data[it])*float(user2_data[it]) for it in common])\n",
427 | " \n",
428 | " ##计算相关系数\n",
429 | " num = PSum - (sum1*sum2/n)\n",
430 | " den = sqrt((sum1Sq-pow(sum1,2)/n)*(sum2Sq-pow(sum2,2)/n))\n",
431 | " except:\n",
432 | " den = 999\n",
433 | " num = 0\n",
434 | " print('error:') \n",
435 | " if den == 0:\n",
436 | " return 0\n",
437 | " r = num/den\n",
438 | " return r\n",
439 | " \n",
440 | "R = pearson_sim('1.0','3.0')\n",
441 | "print(R)"
442 | ]
443 | },
444 | {
445 | "cell_type": "code",
446 | "execution_count": 63,
447 | "metadata": {},
448 | "outputs": [],
449 | "source": [
450 | "# 计算某个用户与其他用的相似度\n",
451 | "def top10_simliar(userID):\n",
452 | " res = []\n",
453 | " for userid in data.keys():\n",
454 | " # 排除自己计算相似度\n",
455 | " if not userid == userID:\n",
456 | " simliar = pearson_sim(userID,userid)\n",
457 | " res.append((userid,simliar))\n",
458 | " res.sort(key=lambda val:val[1])# 按照相似度最大顺序排序\n",
459 | " return res[-4:]"
460 | ]
461 | },
462 | {
463 | "cell_type": "code",
464 | "execution_count": 64,
465 | "metadata": {},
466 | "outputs": [
467 | {
468 | "name": "stdout",
469 | "output_type": "stream",
470 | "text": [
471 | "error:\n",
472 | "[('79721.0', 1.000000000000017), ('60581.0', 1.0000000000000187), ('83906.0', 1.0000000000000213), ('103682.0', 1.0000000000000255)]\n"
473 | ]
474 | }
475 | ],
476 | "source": [
477 | "RES = top10_simliar('1.0')\n",
478 | "print(RES)"
479 | ]
480 | },
481 | {
482 | "cell_type": "code",
483 | "execution_count": 65,
484 | "metadata": {},
485 | "outputs": [
486 | {
487 | "name": "stdout",
488 | "output_type": "stream",
489 | "text": [
490 | "error:\n",
491 | "[('\"Italian Job', '5.0'), ('\"Clockwork Orange', '5.0'), ('RocknRolla (2008)', '5.0'), ('No Country for Old Men (2007)', '5.0'), ('21 Grams (2003)', '5.0'), ('Layer Cake (2004)', '5.0'), ('Seven Pounds (2008)', '5.0'), ('Trainspotting (1996)', '5.0'), (\"Carlito's Way (1993)\", '5.0'), ('Crash (2004)', '5.0')]\n"
492 | ]
493 | }
494 | ],
495 | "source": [
496 | "def recommend(user):\n",
497 | " # 相似度最高用户\n",
498 | " top_sim_user = top10_simliar(user)[0][0]\n",
499 | " # 相似度最高用户的观影记录\n",
500 | " items = data[top_sim_user]\n",
501 | " recommendations = []\n",
502 | " # 筛选出该用户未观看的电影病添加到列表中\n",
503 | " for item in items.keys():\n",
504 | " if item not in data[user].keys():\n",
505 | " recommendations.append((item,items[item]))\n",
506 | " recommendations.sort(key=lambda val :val[1],reverse=True) # 按照评分排序\n",
507 | " # 返回评分最高的10部电影\n",
508 | " return recommendations[:10]\n",
509 | "Recommend = recommend('1.0')\n",
510 | "print(Recommend)"
511 | ]
512 | },
513 | {
514 | "cell_type": "code",
515 | "execution_count": null,
516 | "metadata": {},
517 | "outputs": [],
518 | "source": []
519 | }
520 | ],
521 | "metadata": {
522 | "kernelspec": {
523 | "display_name": "Python 3",
524 | "language": "python",
525 | "name": "python3"
526 | },
527 | "language_info": {
528 | "codemirror_mode": {
529 | "name": "ipython",
530 | "version": 3
531 | },
532 | "file_extension": ".py",
533 | "mimetype": "text/x-python",
534 | "name": "python",
535 | "nbconvert_exporter": "python",
536 | "pygments_lexer": "ipython3",
537 | "version": "3.6.5"
538 | }
539 | },
540 | "nbformat": 4,
541 | "nbformat_minor": 1
542 | }
543 |
--------------------------------------------------------------------------------
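As a quick, self-contained check of the Pearson similarity used in the notebook above, the sketch below repeats pearson_sim's arithmetic on three co-rated movies with made-up ratings (no MovieLens files needed):

from math import sqrt

u1 = [5.0, 3.0, 4.0]   # hypothetical ratings of user 1 on the common movies
u2 = [4.0, 2.0, 5.0]   # hypothetical ratings of user 2 on the same movies
n = len(u1)

sum1, sum2 = sum(u1), sum(u2)
sum1_sq, sum2_sq = sum(x * x for x in u1), sum(x * x for x in u2)
p_sum = sum(x * y for x, y in zip(u1, u2))

num = p_sum - sum1 * sum2 / n
den = sqrt((sum1_sq - sum1 ** 2 / n) * (sum2_sq - sum2 ** 2 / n))
print(num / den)   # Pearson correlation of the two rating vectors, about 0.65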
/DCN/DCN-tf2.0.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stderr",
10 | "output_type": "stream",
11 | "text": [
12 | "D:\\anaconda3\\julianxu\\lib\\site-packages\\h5py\\__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
13 | " from ._conv import register_converters as _register_converters\n"
14 | ]
15 | }
16 | ],
17 | "source": [
18 | "import numpy as np\n",
19 | "import pandas as pd\n",
20 | "import tensorflow as tf"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 2,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "from collections import Counter"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 3,
35 | "metadata": {},
36 | "outputs": [],
37 | "source": [
38 | "class CrossLayer(tf.keras.layers.Layer):\n",
39 | " def __init__(self,output_dim,num_layer,**kwargs):\n",
40 | " self.output_dim = output_dim\n",
41 | " self.num_layer = num_layer\n",
42 | " super(CrossLayer,self).__init__(**kwargs)\n",
43 | " \n",
44 | " def build(self,input_shape):\n",
45 | " self.input_dim = input_shape[1]\n",
46 | " # print(self.input_dim)\n",
47 | " self.W = []\n",
48 | " self.bias = []\n",
49 | " for i in range(self.num_layer):\n",
50 | " self.W.append(self.add_weight(shape=[self.input_dim,1],initializer = 'glorot_uniform',name='w_{}'.format(i),trainable=True))\n",
51 | " self.bias.append(self.add_weight(shape=[self.input_dim,1],initializer = 'zeros',name='b_{}'.format(i),trainable=True))\n",
52 | " self.built = True\n",
53 | " def call(self,input):\n",
54 | "\n",
55 | " x0 = tf.einsum('bij->bji',input) # output[j][i] = m[i][j]\n",
56 | " # print(\"x0_shape\",x0.get_shape())\n",
57 | " x1 = tf.einsum('bmn,bnk->bmk',input,x0)\n",
58 | " cross = tf.einsum('bmn,nk->bmk',x1,self.W[0]) + self.bias[0] + input\n",
59 | " \n",
60 | " for i in range(1,self.num_layer):\n",
61 | " x0 = tf.einsum('bij->bji',cross) # output[j][i] = m[i][j]\n",
62 | " x1 = tf.einsum('bmn,bnk->bmk',input,x0)\n",
63 | " cross = tf.einsum('bmn,nk->bmk',x1,self.W[i]) + self.bias[i] + cross\n",
64 | " return cross\n",
65 | " \n",
66 | "class Deep(tf.keras.layers.Layer):\n",
67 | " def __init__(self,dropout_deep,deep_layer_sizes):\n",
68 | " # input_dim = num_size + embed_size = input_size\n",
69 | " super(Deep, self).__init__()\n",
70 | " self.dropout_deep = dropout_deep\n",
71 | " # fc layer\n",
72 | " self.deep_layer_sizes = deep_layer_sizes\n",
73 | " # 神经网络方面的参数\n",
74 | " for i in range(len(deep_layer_sizes)):\n",
75 | " setattr(self, 'dense_' + str(i),tf.keras.layers.Dense(deep_layer_sizes[i]))\n",
76 | " setattr(self, 'batchNorm_' + str(i),tf.keras.layers.BatchNormalization())\n",
77 | " setattr(self, 'activation_' + str(i),tf.keras.layers.Activation('relu'))\n",
78 | " setattr(self, 'dropout_' + str(i),tf.keras.layers.Dropout(dropout_deep[i]))\n",
79 | " # last layer\n",
80 | " self.fc = tf.keras.layers.Dense(128,activation=None,use_bias=True)\n",
81 | " \n",
82 | " def call(self,input):\n",
83 | " y_deep = getattr(self,'dense_' + str(0))(input)\n",
84 | " y_deep = getattr(self,'batchNorm_' + str(0))(y_deep)\n",
85 | " y_deep = getattr(self,'activation_' + str(0))(y_deep)\n",
86 | " y_deep = getattr(self,'dropout_' + str(0))(y_deep)\n",
87 | " \n",
88 | " for i in range(1,len(self.deep_layer_sizes)):\n",
89 | " y_deep = getattr(self,'dense_' + str(i))(y_deep)\n",
90 | " y_deep = getattr(self,'batchNorm_' + str(i))(y_deep)\n",
91 | " y_deep = getattr(self,'activation_' + str(i))(y_deep)\n",
92 | " y_deep = getattr(self,'dropout_' + str(i))(y_deep)\n",
93 | " \n",
94 | " output = self.fc(y_deep)\n",
95 | " return output\n",
96 | " \n",
97 | "class DCN(tf.keras.Model):\n",
98 | " def __init__(self,num_feat,num_field,dropout_deep,deep_layer_sizes,embedding_size=10):\n",
99 | " super().__init__()\n",
100 | " self.num_feat = num_feat # F =features nums\n",
101 | " self.num_field = num_field # N =fields of a feature \n",
102 | " self.dropout_deep = dropout_deep\n",
103 | " \n",
104 | " # Embedding 这里采用embeddings层因此大小为F* M F为特征数量,M为embedding的维度\n",
105 | " feat_embeddings = tf.keras.layers.Embedding(num_feat, embedding_size, embeddings_initializer='uniform') # F * M \n",
106 | " self.feat_embeddings = feat_embeddings\n",
107 | " \n",
108 | " self.crosslayer = CrossLayer(output_dim = 128,num_layer=8)\n",
109 | " \n",
110 | " self.deep = Deep(dropout_deep,deep_layer_sizes)\n",
111 | " self.fc = tf.keras.layers.Dense(1,activation='sigmoid',use_bias=True)\n",
112 | " \n",
113 | " def call(self,feat_index,feat_value):\n",
114 | " \n",
115 | " # embedding part feat_index = inputs为输入 feat_embeddings为一个layer。\n",
116 | " feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M \n",
117 | "# print(feat_value.get_shape())\n",
118 | " feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value)\n",
119 | " # print(\"feat_embedding:\",feat_embedding.get_shape()) # 32 * 39 * 10\n",
120 | " stack_input = tf.keras.layers.Reshape((1,-1))(feat_embedding)\n",
121 | " # print(\"stack_input:\",stack_input.get_shape()) # 32 * 1 * 390\n",
122 | " \n",
123 | " x1 = self.crosslayer(stack_input)\n",
124 | " x2 = self.deep(stack_input)\n",
125 | " \n",
126 | " x3 = tf.keras.layers.concatenate([x1,x2],axis=-1)\n",
127 | " output = self.fc(x3)\n",
128 | " return output"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": 4,
134 | "metadata": {},
135 | "outputs": [],
136 | "source": [
137 | "train = pd.read_table('../data/Criteo/train.txt')\n",
138 | "train.columns=['label','I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9',\n",
139 | " 'I10', 'I11', 'I12', 'I13','C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7',\n",
140 | " 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17',\n",
141 | " 'C18', 'C19', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26']"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": 5,
147 | "metadata": {},
148 | "outputs": [],
149 | "source": [
150 | "cont_features=['I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9',\n",
151 | " 'I10', 'I11', 'I12', 'I13']\n",
152 | "dist_features = ['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7',\n",
153 | " 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17',\n",
154 | " 'C18', 'C19', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26']"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": 6,
160 | "metadata": {},
161 | "outputs": [],
162 | "source": [
163 | "freq_ = 10\n",
164 | "# dir_feat_dict_ = 'feat_dict_' + str(freq_) + '.pkl2'\n",
165 | "continuous_range_ = range(1, 14)\n",
166 | "categorical_range_ = range(14, 40)\n",
167 | "\n",
168 | "# 统计离散特征每个离散值出现的次数组成字典\n",
169 | "feat_cnt = Counter()\n",
170 | "with open('../data/Criteo/train.txt', 'r') as fin:\n",
171 | " for line_idx, line in enumerate(fin):\n",
172 | " features = line.rstrip('\\n').split('\\t')\n",
173 | " for idx in categorical_range_:\n",
174 | " if features[idx] == '': continue\n",
175 | " feat_cnt.update([features[idx]])\n",
176 | "# Only retain discrete features with high frequency\n",
177 | "dis_feat_set = set() # 高频段的离散字符\n",
178 | "for feat, ot in feat_cnt.items():\n",
179 | " if ot >= freq_:\n",
180 | " dis_feat_set.add(feat)"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": 7,
186 | "metadata": {},
187 | "outputs": [],
188 | "source": [
189 | "# Create a dictionary for continuous and discrete features\n",
190 | "feat_dict = {}\n",
191 | "tc = 1\n",
192 | "# Continuous features\n",
193 | "for idx in continuous_range_:\n",
194 | " feat_dict[idx] = tc\n",
195 | " tc += 1 # 代表占据一列\n",
196 | "\n",
197 | "# Discrete features\n",
198 | "cnt_feat_set = set()\n",
199 | "with open('../data/Criteo/train.txt', 'r') as fin:\n",
200 | " for line_idx, line in enumerate(fin):\n",
201 | " features = line.rstrip('\\n').split('\\t')\n",
202 | " for idx in categorical_range_:\n",
203 | " # 排除空字符和低频离散字符\n",
204 | " if features[idx] == '' or features[idx] not in dis_feat_set:\n",
205 | " continue\n",
206 | " # 排除连续性数值\n",
207 | " if features[idx] not in cnt_feat_set:\n",
208 | " cnt_feat_set.add(features[idx])\n",
209 | " # 获取种类数\n",
210 | " feat_dict[features[idx]] = tc\n",
211 | " tc += 1"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": 8,
217 | "metadata": {},
218 | "outputs": [],
219 | "source": [
220 | "train_label = []\n",
221 | "train_value = []\n",
222 | "train_idx = []\n",
223 | "\n",
224 | "continuous_range_ = range(1, 14)\n",
225 | "categorical_range_ = range(14, 40)\n",
226 | "cont_max_=[]\n",
227 | "cont_min_=[]\n",
228 | "for cf in cont_features:\n",
229 | " cont_max_.append(max(train[cf]))\n",
230 | " cont_min_.append(min(train[cf]))\n",
231 | "cont_diff_ = [cont_max_[i] - cont_min_[i] for i in range(len(cont_min_))]\n",
232 | "\n",
233 | "def process_line_(line):\n",
234 | " features = line.rstrip('\\n').split('\\t')\n",
235 | " feat_idx, feat_value, label = [], [], []\n",
236 | "\n",
237 | " # MinMax Normalization\n",
238 | " for idx in continuous_range_:\n",
239 | " if features[idx] == '':\n",
240 | " feat_idx.append(0)\n",
241 | " feat_value.append(0.0)\n",
242 | " else:\n",
243 | " feat_idx.append(feat_dict[idx])\n",
244 | " # 归一化\n",
245 | " feat_value.append(round((float(features[idx]) - cont_min_[idx - 1]) / cont_diff_[idx - 1], 6))\n",
246 | "\n",
247 | " # 处理离散型数据\n",
248 | " for idx in categorical_range_:\n",
249 | " if features[idx] == '' or features[idx] not in feat_dict:\n",
250 | " feat_idx.append(0)\n",
251 | " feat_value.append(0.0)\n",
252 | " else:\n",
253 | " feat_idx.append(feat_dict[features[idx]])\n",
254 | " feat_value.append(1.0)\n",
255 | " return feat_idx, feat_value, [int(features[0])]\n",
256 | "\n",
257 | "with open('../data/Criteo/train.txt', 'r') as fin:\n",
258 | " for line_idx, line in enumerate(fin):\n",
259 | "\n",
260 | " feat_idx, feat_value, label = process_line_(line)\n",
261 | " train_label.append(label)\n",
262 | " train_idx.append(feat_idx)\n",
263 | " train_value.append(feat_value)"
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": 9,
269 | "metadata": {},
270 | "outputs": [],
271 | "source": [
272 | "dcn= DCN(num_feat=len(feat_dict) + 1, num_field=39, dropout_deep=[0.5, 0.5, 0.5],\n",
273 | " deep_layer_sizes=[400, 400])"
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": 10,
279 | "metadata": {},
280 | "outputs": [],
281 | "source": [
282 | "train_ds = tf.data.Dataset.from_tensor_slices(\n",
283 | " (train_label,train_idx,train_value)).shuffle(10000).batch(32)"
284 | ]
285 | },
286 | {
287 | "cell_type": "code",
288 | "execution_count": 11,
289 | "metadata": {},
290 | "outputs": [],
291 | "source": [
292 | "@tf.function\n",
293 | "def train_one_step(model, optimizer, idx, value, label):\n",
294 | " with tf.GradientTape() as tape:\n",
295 | " output = model(idx,value)\n",
296 | " loss = loss_object(y_true=label, y_pred=output)\n",
297 | " grads = tape.gradient(loss, model.trainable_variables)\n",
298 | " grads = [tf.clip_by_norm(g, 100) for g in grads]\n",
299 | " optimizer.apply_gradients(grads_and_vars=zip(grads, model.trainable_variables))\n",
300 | " \n",
301 | " train_loss(loss)\n",
302 | " train_accuracy(label,output)"
303 | ]
304 | },
305 | {
306 | "cell_type": "code",
307 | "execution_count": 12,
308 | "metadata": {},
309 | "outputs": [],
310 | "source": [
311 | "train_loss = tf.keras.metrics.Mean(name='train_loss')\n",
312 | "train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_acc')\n",
313 | "\n",
314 | "loss_object = tf.keras.losses.BinaryCrossentropy()\n",
315 | "\n",
316 | "optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)"
317 | ]
318 | },
319 | {
320 | "cell_type": "code",
321 | "execution_count": 13,
322 | "metadata": {
323 | "scrolled": true
324 | },
325 | "outputs": [
326 | {
327 | "name": "stdout",
328 | "output_type": "stream",
329 | "text": [
330 | "WARNING:tensorflow:Entity > could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: Failed to parse source code of >, which Python reported as:\n",
331 | " def call(self,feat_index,feat_value):\n",
332 | " \n",
333 | " # embedding part feat_index = inputs为输入 feat_embeddings为一个layer。\n",
334 | " feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M \n",
335 | "# print(feat_value.get_shape())\n",
336 | " feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value)\n",
337 | " # print(\"feat_embedding:\",feat_embedding.get_shape()) # 32 * 39 * 10\n",
338 | " stack_input = tf.keras.layers.Reshape((1,-1))(feat_embedding)\n",
339 | " # print(\"stack_input:\",stack_input.get_shape()) # 32 * 1 * 390\n",
340 | " \n",
341 | " x1 = self.crosslayer(stack_input)\n",
342 | " x2 = self.deep(stack_input)\n",
343 | " \n",
344 | " x3 = tf.keras.layers.concatenate([x1,x2],axis=-1)\n",
345 | " output = self.fc(x3)\n",
346 | " return output\n",
347 | "\n",
348 | "This may be caused by multiline strings or comments not indented at the same level as the code.\n",
349 | "WARNING: Entity > could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: Failed to parse source code of >, which Python reported as:\n",
350 | " def call(self,feat_index,feat_value):\n",
351 | " \n",
352 | " # embedding part feat_index = inputs为输入 feat_embeddings为一个layer。\n",
353 | " feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M \n",
354 | "# print(feat_value.get_shape())\n",
355 | " feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value)\n",
356 | " # print(\"feat_embedding:\",feat_embedding.get_shape()) # 32 * 39 * 10\n",
357 | " stack_input = tf.keras.layers.Reshape((1,-1))(feat_embedding)\n",
358 | " # print(\"stack_input:\",stack_input.get_shape()) # 32 * 1 * 390\n",
359 | " \n",
360 | " x1 = self.crosslayer(stack_input)\n",
361 | " x2 = self.deep(stack_input)\n",
362 | " \n",
363 | " x3 = tf.keras.layers.concatenate([x1,x2],axis=-1)\n",
364 | " output = self.fc(x3)\n",
365 | " return output\n",
366 | "\n",
367 | "This may be caused by multiline strings or comments not indented at the same level as the code.\n",
368 | "WARNING:tensorflow:Entity > could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: Failed to parse source code of >, which Python reported as:\n",
369 | " def call(self,feat_index,feat_value):\n",
370 | " \n",
371 | " # embedding part feat_index = inputs为输入 feat_embeddings为一个layer。\n",
372 | " feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M \n",
373 | "# print(feat_value.get_shape())\n",
374 | " feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value)\n",
375 | " # print(\"feat_embedding:\",feat_embedding.get_shape()) # 32 * 39 * 10\n",
376 | " stack_input = tf.keras.layers.Reshape((1,-1))(feat_embedding)\n",
377 | " # print(\"stack_input:\",stack_input.get_shape()) # 32 * 1 * 390\n",
378 | " \n",
379 | " x1 = self.crosslayer(stack_input)\n",
380 | " x2 = self.deep(stack_input)\n",
381 | " \n",
382 | " x3 = tf.keras.layers.concatenate([x1,x2],axis=-1)\n",
383 | " output = self.fc(x3)\n",
384 | " return output\n",
385 | "\n",
386 | "This may be caused by multiline strings or comments not indented at the same level as the code.\n",
387 | "WARNING: Entity > could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: Failed to parse source code of >, which Python reported as:\n",
388 | " def call(self,feat_index,feat_value):\n",
389 | " \n",
390 | " # embedding part feat_index = inputs为输入 feat_embeddings为一个layer。\n",
391 | " feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M \n",
392 | "# print(feat_value.get_shape())\n",
393 | " feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value)\n",
394 | " # print(\"feat_embedding:\",feat_embedding.get_shape()) # 32 * 39 * 10\n",
395 | " stack_input = tf.keras.layers.Reshape((1,-1))(feat_embedding)\n",
396 | " # print(\"stack_input:\",stack_input.get_shape()) # 32 * 1 * 390\n",
397 | " \n",
398 | " x1 = self.crosslayer(stack_input)\n",
399 | " x2 = self.deep(stack_input)\n",
400 | " \n",
401 | " x3 = tf.keras.layers.concatenate([x1,x2],axis=-1)\n",
402 | " output = self.fc(x3)\n",
403 | " return output\n",
404 | "\n",
405 | "This may be caused by multiline strings or comments not indented at the same level as the code.\n",
406 | "WARNING:tensorflow:Entity > could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: Failed to parse source code of >, which Python reported as:\n",
407 | " def call(self,feat_index,feat_value):\n",
408 | " \n",
409 | " # embedding part feat_index = inputs为输入 feat_embeddings为一个layer。\n",
410 | " feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M \n",
411 | "# print(feat_value.get_shape())\n",
412 | " feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value)\n",
413 | " # print(\"feat_embedding:\",feat_embedding.get_shape()) # 32 * 39 * 10\n",
414 | " stack_input = tf.keras.layers.Reshape((1,-1))(feat_embedding)\n",
415 | " # print(\"stack_input:\",stack_input.get_shape()) # 32 * 1 * 390\n",
416 | " \n",
417 | " x1 = self.crosslayer(stack_input)\n",
418 | " x2 = self.deep(stack_input)\n",
419 | " \n",
420 | " x3 = tf.keras.layers.concatenate([x1,x2],axis=-1)\n",
421 | " output = self.fc(x3)\n",
422 | " return output\n",
423 | "\n",
424 | "This may be caused by multiline strings or comments not indented at the same level as the code.\n",
425 | "WARNING: Entity > could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: Failed to parse source code of >, which Python reported as:\n",
426 | " def call(self,feat_index,feat_value):\n",
427 | " \n",
428 | " # embedding part feat_index = inputs为输入 feat_embeddings为一个layer。\n",
429 | " feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M \n",
430 | "# print(feat_value.get_shape())\n",
431 | " feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value)\n",
432 | " # print(\"feat_embedding:\",feat_embedding.get_shape()) # 32 * 39 * 10\n",
433 | " stack_input = tf.keras.layers.Reshape((1,-1))(feat_embedding)\n",
434 | " # print(\"stack_input:\",stack_input.get_shape()) # 32 * 1 * 390\n",
435 | " \n",
436 | " x1 = self.crosslayer(stack_input)\n",
437 | " x2 = self.deep(stack_input)\n",
438 | " \n",
439 | " x3 = tf.keras.layers.concatenate([x1,x2],axis=-1)\n",
440 | " output = self.fc(x3)\n",
441 | " return output\n",
442 | "\n",
443 | "This may be caused by multiline strings or comments not indented at the same level as the code.\n",
444 | "Epoch 1, Loss: 0.565358579158783, Accuracy: 0.790395200252533\n",
445 | "Epoch 2, Loss: 0.5333142280578613, Accuracy: 0.7906453013420105\n",
446 | "Epoch 3, Loss: 0.5188921093940735, Accuracy: 0.7907286882400513\n",
447 | "Epoch 4, Loss: 0.5085805654525757, Accuracy: 0.790770411491394\n",
448 | "Epoch 5, Loss: 0.5001382231712341, Accuracy: 0.7907953858375549\n",
449 | "Epoch 6, Loss: 0.49196508526802063, Accuracy: 0.790812075138092\n",
450 | "Epoch 7, Loss: 0.4845847487449646, Accuracy: 0.791538655757904\n",
451 | "Epoch 8, Loss: 0.4777772128582001, Accuracy: 0.7933967113494873\n",
452 | "Epoch 9, Loss: 0.4712851643562317, Accuracy: 0.7953976988792419\n"
453 | ]
454 | },
455 | {
456 | "name": "stdout",
457 | "output_type": "stream",
458 | "text": [
459 | "Epoch 10, Loss: 0.46522337198257446, Accuracy: 0.797548770904541\n",
460 | "Epoch 11, Loss: 0.4593830108642578, Accuracy: 0.799308717250824\n",
461 | "Epoch 12, Loss: 0.4535185396671295, Accuracy: 0.8014007210731506\n",
462 | "Epoch 13, Loss: 0.4476926326751709, Accuracy: 0.8034401535987854\n",
463 | "Epoch 14, Loss: 0.4420176148414612, Accuracy: 0.8057957291603088\n",
464 | "Epoch 15, Loss: 0.43604835867881775, Accuracy: 0.8078039288520813\n",
465 | "Epoch 16, Loss: 0.430029958486557, Accuracy: 0.8101238012313843\n",
466 | "Epoch 17, Loss: 0.4236184060573578, Accuracy: 0.8130241632461548\n",
467 | "Epoch 18, Loss: 0.41711094975471497, Accuracy: 0.8157690167427063\n",
468 | "Epoch 19, Loss: 0.410213828086853, Accuracy: 0.8188567757606506\n",
469 | "Epoch 20, Loss: 0.40275657176971436, Accuracy: 0.8226613402366638\n",
470 | "Epoch 21, Loss: 0.3947707712650299, Accuracy: 0.8265085220336914\n",
471 | "Epoch 22, Loss: 0.3864079415798187, Accuracy: 0.8308699727058411\n",
472 | "Epoch 23, Loss: 0.37755030393600464, Accuracy: 0.8352872133255005\n",
473 | "Epoch 24, Loss: 0.3682657480239868, Accuracy: 0.8399407863616943\n",
474 | "Epoch 25, Loss: 0.3589519262313843, Accuracy: 0.8447423577308655\n",
475 | "Epoch 26, Loss: 0.3493313491344452, Accuracy: 0.8495401740074158\n",
476 | "Epoch 27, Loss: 0.33972665667533875, Accuracy: 0.8542419075965881\n",
477 | "Epoch 28, Loss: 0.33029282093048096, Accuracy: 0.8588579893112183\n",
478 | "Epoch 29, Loss: 0.3210965692996979, Accuracy: 0.8632591962814331\n",
479 | "Epoch 30, Loss: 0.3121466338634491, Accuracy: 0.8674670457839966\n",
480 | "Epoch 31, Loss: 0.3034890294075012, Accuracy: 0.8714196085929871\n",
481 | "Epoch 32, Loss: 0.2950327396392822, Accuracy: 0.8753126859664917\n",
482 | "Epoch 33, Loss: 0.2869029939174652, Accuracy: 0.8790152668952942\n",
483 | "Epoch 34, Loss: 0.27917614579200745, Accuracy: 0.8824853897094727\n",
484 | "Epoch 35, Loss: 0.27175894379615784, Accuracy: 0.8858000636100769\n",
485 | "Epoch 36, Loss: 0.2646080255508423, Accuracy: 0.8889583945274353\n",
486 | "Epoch 37, Loss: 0.2577793300151825, Accuracy: 0.8919594883918762\n",
487 | "Epoch 38, Loss: 0.2512573003768921, Accuracy: 0.8948026895523071\n",
488 | "Epoch 39, Loss: 0.24505917727947235, Accuracy: 0.897487223148346\n",
489 | "Epoch 40, Loss: 0.23911045491695404, Accuracy: 0.9000500440597534\n",
490 | "Epoch 41, Loss: 0.23342998325824738, Accuracy: 0.9024878144264221\n",
491 | "Epoch 42, Loss: 0.22800736129283905, Accuracy: 0.9048095345497131\n",
492 | "Epoch 43, Loss: 0.22281348705291748, Accuracy: 0.9070232510566711\n",
493 | "Epoch 44, Loss: 0.21784667670726776, Accuracy: 0.9091364145278931\n",
494 | "Epoch 45, Loss: 0.2130877673625946, Accuracy: 0.9111555814743042\n",
495 | "Epoch 46, Loss: 0.20853079855442047, Accuracy: 0.9130869507789612\n",
496 | "Epoch 47, Loss: 0.2041596919298172, Accuracy: 0.9149361848831177\n",
497 | "Epoch 48, Loss: 0.19996346533298492, Accuracy: 0.9167083501815796\n",
498 | "Epoch 49, Loss: 0.19593490660190582, Accuracy: 0.9184081554412842\n",
499 | "Epoch 50, Loss: 0.19206039607524872, Accuracy: 0.9200400114059448\n"
500 | ]
501 | }
502 | ],
503 | "source": [
504 | "EPOCHS = 50\n",
505 | "for epoch in range(EPOCHS):\n",
506 | " for label, idx, value in train_ds:\n",
507 | " train_one_step(dcn,optimizer,idx, value,label)\n",
508 | " template = 'Epoch {}, Loss: {}, Accuracy: {}'\n",
509 | " print (template.format(epoch+1,\n",
510 | " train_loss.result(),train_accuracy.result()))"
511 | ]
512 | },
513 | {
514 | "cell_type": "code",
515 | "execution_count": null,
516 | "metadata": {},
517 | "outputs": [],
518 | "source": []
519 | }
520 | ],
521 | "metadata": {
522 | "kernelspec": {
523 | "display_name": "Python 3",
524 | "language": "python",
525 | "name": "python3"
526 | },
527 | "language_info": {
528 | "codemirror_mode": {
529 | "name": "ipython",
530 | "version": 3
531 | },
532 | "file_extension": ".py",
533 | "mimetype": "text/x-python",
534 | "name": "python",
535 | "nbconvert_exporter": "python",
536 | "pygments_lexer": "ipython3",
537 | "version": "3.6.5"
538 | }
539 | },
540 | "nbformat": 4,
541 | "nbformat_minor": 2
542 | }
543 |
--------------------------------------------------------------------------------
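Both the AFM and DCN models above combine a looked-up embedding with the per-field feat_value via tf.einsum('bnm,bn->bnm', ...): categorical fields carry a value of 1.0, continuous fields carry their min-max-scaled value, and a missing field carries 0.0, which zeroes out that field's embedding. A small hedged illustration with random tensors (TensorFlow 2.x assumed):

import tensorflow as tf

emb = tf.random.normal((2, 3, 4))            # 2 samples, 3 fields, embedding size 4
val = tf.constant([[1.0, 0.5, 0.0],          # categorical = 1.0, continuous = scaled value, missing = 0.0
                   [0.2, 1.0, 1.0]])
scaled = tf.einsum('bnm,bn->bnm', emb, val)  # each field's embedding scaled by its value
print(scaled.shape)                          # (2, 3, 4)
print(tf.reduce_all(scaled[0, 2] == 0.0))    # True: a zero value wipes out that field's embedding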
/DCN/DCN-tf2.0.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import numpy as np
3 | import pandas as pd
4 | import tensorflow as tf
5 | from collections import Counter
6 | import pickle
7 | from util.train_model import train_test_model_demo
8 |
9 |
10 | class CrossLayer(tf.keras.layers.Layer):
11 | def __init__(self,output_dim,num_layer,**kwargs):
12 | self.output_dim = output_dim
13 | self.num_layer = num_layer
14 | super(CrossLayer,self).__init__(**kwargs)
15 |
16 | def build(self,input_shape):
17 | self.input_dim = input_shape[2]
18 | # print(self.input_dim)
19 | self.W = []
20 | self.bias = []
21 | for i in range(self.num_layer):
22 | self.W.append(self.add_weight(shape=[1,self.input_dim],initializer = 'glorot_uniform',name='w_{}'.format(i),trainable=True))
23 | self.bias.append(self.add_weight(shape=[1,self.input_dim],initializer = 'zeros',name='b_{}'.format(i),trainable=True))
24 | self.built = True
25 |
26 | def call(self,input):
27 | # version that follows the paper's formula literally (kept commented out for reference)
28 | # x0 = tf.einsum('bij->bji',input) # output[j][i] = m[i][j]
29 | # print("x0_shape",x0.get_shape())# (9, 390, 1)
30 | # x1 = tf.einsum('bmn,bkm->bnk', input, x0)
31 | # print("x1_shape", x1.get_shape()) # (9, 390, 390)
32 | # print("self.W[0]_shape", self.W[0].get_shape())
33 | # cross = tf.einsum('bmn,kn->bkm',x1,self.W[0]) + self.bias[0] + input
34 | # print("cross0", cross.get_shape())# (9, 1, 390)
35 | # for i in range(1,self.num_layer):
36 | # x0 = tf.einsum('bij->bji',cross) # output[j][i] = m[i][j]
37 | # x1 = tf.einsum('bmn,bkm->bnk',input,x0)
38 | # cross = tf.einsum('bmn,kn->bkm',x1,self.W[i]) + self.bias[i] + cross
39 |
40 | # optimized form of the paper's formula: re-associating the product avoids materializing the d x d outer product (a numerical check of this identity follows this file listing)
41 | x0 = tf.einsum('bij->bji',input) # output[j][i] = m[i][j]
42 | x1 = tf.einsum('bmn,km->bnk', x0, self.W[0])
43 | cross = tf.einsum('bkm,bnk->bnm',input,x1) + self.bias[0] + input
44 | for i in range(1,self.num_layer):
45 | x0 = tf.einsum('bij->bji',cross) # output[j][i] = m[i][j]
46 | x1 = tf.einsum('bmn,km->bnk', x0, self.W[i])
47 | cross = tf.einsum('bkm,bnk->bnm', cross,x1) + self.bias[i] + cross
48 | return cross
49 |
50 | class Deep(tf.keras.layers.Layer):
51 | def __init__(self,dropout_deep,deep_layer_sizes):
52 | # input_dim = num_size + embed_size = input_size
53 | super(Deep, self).__init__()
54 | self.dropout_deep = dropout_deep
55 | # fc layer
56 | self.deep_layer_sizes = deep_layer_sizes
57 | # layers of the deep (MLP) part
58 | for i in range(len(deep_layer_sizes)):
59 | setattr(self, 'dense_' + str(i),tf.keras.layers.Dense(deep_layer_sizes[i]))
60 | setattr(self, 'batchNorm_' + str(i),tf.keras.layers.BatchNormalization())
61 | setattr(self, 'activation_' + str(i),tf.keras.layers.Activation('relu'))
62 | setattr(self, 'dropout_' + str(i),tf.keras.layers.Dropout(dropout_deep[i]))
63 | # last layer
64 | self.fc = tf.keras.layers.Dense(128,activation=None,use_bias=True)
65 |
66 | def call(self,input):
67 | y_deep = getattr(self,'dense_' + str(0))(input)
68 | y_deep = getattr(self,'batchNorm_' + str(0))(y_deep)
69 | y_deep = getattr(self,'activation_' + str(0))(y_deep)
70 | y_deep = getattr(self,'dropout_' + str(0))(y_deep)
71 |
72 | for i in range(1,len(self.deep_layer_sizes)):
73 | y_deep = getattr(self,'dense_' + str(i))(y_deep)
74 | y_deep = getattr(self,'batchNorm_' + str(i))(y_deep)
75 | y_deep = getattr(self,'activation_' + str(i))(y_deep)
76 | y_deep = getattr(self,'dropout_' + str(i))(y_deep)
77 |
78 | output = self.fc(y_deep)
79 | return output
80 |
81 | class DCN(tf.keras.Model):
82 | def __init__(self,num_feat,num_field,dropout_deep,deep_layer_sizes,embedding_size=10):
83 | super().__init__()
84 | self.num_feat = num_feat # F =features nums
85 | self.num_field = num_field # N =fields of a feature
86 | self.dropout_deep = dropout_deep
87 |
88 | # Embedding table of shape F * M, where F is the number of distinct feature values and M is the embedding dimension
89 | feat_embeddings = tf.keras.layers.Embedding(num_feat, embedding_size, embeddings_initializer='uniform') # F * M
90 | self.feat_embeddings = feat_embeddings
91 |
92 | self.crosslayer = CrossLayer(output_dim = 128,num_layer=8)
93 |
94 | self.deep = Deep(dropout_deep,deep_layer_sizes)
95 | self.fc = tf.keras.layers.Dense(1,activation='sigmoid',use_bias=True)
96 |
97 | def call(self,feat_index,feat_value):
98 |
99 | # embedding part: feat_index is the input; feat_embeddings is an Embedding layer
100 | feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M
101 | # print(feat_value.get_shape())
102 | feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value)
103 | # print("feat_embedding:",feat_embedding.get_shape()) # 32 * 39 * 10
104 | stack_input = tf.keras.layers.Reshape((1,-1))(feat_embedding)
105 | # print("stack_input:",stack_input.get_shape()) # 32 * 1 * 390
106 |
107 | x1 = self.crosslayer(stack_input)
108 | x2 = self.deep(stack_input)
109 |
110 | x3 = tf.keras.layers.concatenate([x1,x2],axis=-1)
111 | output = self.fc(x3)
112 | return output
113 |
114 | if __name__ == '__main__':
115 | AID_DATA_DIR = "../data/Criteo/"
116 | feat_dict_ = pickle.load(open(AID_DATA_DIR + '/feat_dict_10.pkl2', 'rb'))
117 |
118 | dcn = DCN(num_feat=len(feat_dict_) + 1, num_field=39, dropout_deep=[0.5, 0.5, 0.5],
119 | deep_layer_sizes=[400, 400])
120 |
121 | train_label_path = AID_DATA_DIR + 'train_label'
122 | train_idx_path = AID_DATA_DIR + 'train_idx'
123 | train_value_path = AID_DATA_DIR + 'train_value'
124 |
125 | train_test_model_demo(dcn,train_label_path, train_idx_path, train_value_path)
--------------------------------------------------------------------------------
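The active branch of CrossLayer.call (commented above as the optimized form of the paper's formula) re-associates the cross term x_0 (x_l^T w) so that the d x d outer product x_0 x_l^T from the DCN paper is never materialized. A small numerical check of that identity on random vectors, a hedged sketch using NumPy only and independent of the classes above:

import numpy as np

d = 6
x0 = np.random.randn(d)   # x_0, the stacked embedding vector
xl = np.random.randn(d)   # x_l, output of the previous cross layer
w = np.random.randn(d)    # cross weight w_l

naive = np.outer(x0, xl) @ w   # paper's form: build the d x d outer product, then multiply by w
cheap = x0 * (xl @ w)          # re-associated form: scale x_0 by the scalar x_l . w
print(np.allclose(naive, cheap))   # True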
/GBDT_LR.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "collapsed": true
7 | },
8 | "source": [
9 | "## GBDT+LR代码分析"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "# Scikit-learn实现"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 1,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "import lightgbm as lgb\n",
28 | "import numpy as np\n",
29 | "import pandas as pd\n",
30 | "from sklearn.metrics import mean_squared_error\n",
31 | "from sklearn.linear_model import LogisticRegression"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 10,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "from sklearn.preprocessing import OneHotEncoder\n",
41 | "from sklearn.ensemble import GradientBoostingClassifier"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": 2,
47 | "metadata": {},
48 | "outputs": [],
49 | "source": [
50 | "df_train = pd.read_csv(r'F:\\Data\\recsys-data\\gbdt+lr/train.csv')\n",
51 | "df_test = pd.read_csv(r'F:\\Data\\recsys-data\\gbdt+lr/test.csv')"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 3,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "NUMERIC_COLS = [\n",
61 | " \"ps_reg_01\", \"ps_reg_02\", \"ps_reg_03\",\n",
62 | " \"ps_car_12\", \"ps_car_13\", \"ps_car_14\", \"ps_car_15\",\n",
63 | "]"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": 11,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "gbdt = GradientBoostingClassifier(n_estimators=50,random_state=10,subsample = 0.6,max_depth=7,min_samples_split=900)"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": 4,
78 | "metadata": {},
79 | "outputs": [
80 | {
81 | "data": {
82 | "text/html": [
83 | "\n",
84 | "\n",
97 | "
\n",
98 | " \n",
99 | " \n",
100 | " | \n",
101 | " Unnamed: 0 | \n",
102 | " id | \n",
103 | " target | \n",
104 | " ps_ind_01 | \n",
105 | " ps_ind_02_cat | \n",
106 | " ps_ind_03 | \n",
107 | " ps_ind_04_cat | \n",
108 | " ps_ind_05_cat | \n",
109 | " ps_ind_06_bin | \n",
110 | " ps_ind_07_bin | \n",
111 | " ... | \n",
112 | " ps_calc_11 | \n",
113 | " ps_calc_12 | \n",
114 | " ps_calc_13 | \n",
115 | " ps_calc_14 | \n",
116 | " ps_calc_15_bin | \n",
117 | " ps_calc_16_bin | \n",
118 | " ps_calc_17_bin | \n",
119 | " ps_calc_18_bin | \n",
120 | " ps_calc_19_bin | \n",
121 | " ps_calc_20_bin | \n",
122 | "
\n",
123 | " \n",
124 | " \n",
125 | " \n",
126 | " 0 | \n",
127 | " 8000 | \n",
128 | " 20227 | \n",
129 | " 1 | \n",
130 | " 7 | \n",
131 | " 1 | \n",
132 | " 5 | \n",
133 | " 1 | \n",
134 | " 0 | \n",
135 | " 0 | \n",
136 | " 1 | \n",
137 | " ... | \n",
138 | " 4 | \n",
139 | " 2 | \n",
140 | " 6 | \n",
141 | " 5 | \n",
142 | " 0 | \n",
143 | " 0 | \n",
144 | " 1 | \n",
145 | " 1 | \n",
146 | " 1 | \n",
147 | " 0 | \n",
148 | "
\n",
149 | " \n",
150 | " 1 | \n",
151 | " 8001 | \n",
152 | " 20228 | \n",
153 | " 1 | \n",
154 | " 0 | \n",
155 | " 1 | \n",
156 | " 6 | \n",
157 | " 1 | \n",
158 | " 0 | \n",
159 | " 1 | \n",
160 | " 0 | \n",
161 | " ... | \n",
162 | " 5 | \n",
163 | " 2 | \n",
164 | " 4 | \n",
165 | " 10 | \n",
166 | " 0 | \n",
167 | " 0 | \n",
168 | " 0 | \n",
169 | " 0 | \n",
170 | " 0 | \n",
171 | " 1 | \n",
172 | "
\n",
173 | " \n",
174 | " 2 | \n",
175 | " 8002 | \n",
176 | " 20229 | \n",
177 | " 0 | \n",
178 | " 3 | \n",
179 | " 1 | \n",
180 | " 8 | \n",
181 | " 0 | \n",
182 | " 0 | \n",
183 | " 0 | \n",
184 | " 0 | \n",
185 | " ... | \n",
186 | " 10 | \n",
187 | " 1 | \n",
188 | " 3 | \n",
189 | " 5 | \n",
190 | " 0 | \n",
191 | " 0 | \n",
192 | " 1 | \n",
193 | " 1 | \n",
194 | " 1 | \n",
195 | " 0 | \n",
196 | "
\n",
197 | " \n",
198 | " 3 | \n",
199 | " 8003 | \n",
200 | " 20235 | \n",
201 | " 0 | \n",
202 | " 2 | \n",
203 | " 1 | \n",
204 | " 8 | \n",
205 | " 0 | \n",
206 | " 0 | \n",
207 | " 0 | \n",
208 | " 0 | \n",
209 | " ... | \n",
210 | " 2 | \n",
211 | " 2 | \n",
212 | " 2 | \n",
213 | " 9 | \n",
214 | " 0 | \n",
215 | " 0 | \n",
216 | " 0 | \n",
217 | " 1 | \n",
218 | " 1 | \n",
219 | " 0 | \n",
220 | "
\n",
221 | " \n",
222 | " 4 | \n",
223 | " 8004 | \n",
224 | " 20236 | \n",
225 | " 0 | \n",
226 | " 0 | \n",
227 | " 1 | \n",
228 | " 2 | \n",
229 | " 1 | \n",
230 | " 0 | \n",
231 | " 0 | \n",
232 | " 0 | \n",
233 | " ... | \n",
234 | " 3 | \n",
235 | " 2 | \n",
236 | " 5 | \n",
237 | " 5 | \n",
238 | " 0 | \n",
239 | " 0 | \n",
240 | " 1 | \n",
241 | " 0 | \n",
242 | " 1 | \n",
243 | " 0 | \n",
244 | "
\n",
245 | " \n",
246 | "
\n",
247 | "
5 rows × 60 columns
\n",
248 | "
"
249 | ],
250 | "text/plain": [
251 | " Unnamed: 0 id target ps_ind_01 ps_ind_02_cat ps_ind_03 \\\n",
252 | "0 8000 20227 1 7 1 5 \n",
253 | "1 8001 20228 1 0 1 6 \n",
254 | "2 8002 20229 0 3 1 8 \n",
255 | "3 8003 20235 0 2 1 8 \n",
256 | "4 8004 20236 0 0 1 2 \n",
257 | "\n",
258 | " ps_ind_04_cat ps_ind_05_cat ps_ind_06_bin ps_ind_07_bin ... \\\n",
259 | "0 1 0 0 1 ... \n",
260 | "1 1 0 1 0 ... \n",
261 | "2 0 0 0 0 ... \n",
262 | "3 0 0 0 0 ... \n",
263 | "4 1 0 0 0 ... \n",
264 | "\n",
265 | " ps_calc_11 ps_calc_12 ps_calc_13 ps_calc_14 ps_calc_15_bin \\\n",
266 | "0 4 2 6 5 0 \n",
267 | "1 5 2 4 10 0 \n",
268 | "2 10 1 3 5 0 \n",
269 | "3 2 2 2 9 0 \n",
270 | "4 3 2 5 5 0 \n",
271 | "\n",
272 | " ps_calc_16_bin ps_calc_17_bin ps_calc_18_bin ps_calc_19_bin \\\n",
273 | "0 0 1 1 1 \n",
274 | "1 0 0 0 0 \n",
275 | "2 0 1 1 1 \n",
276 | "3 0 0 1 1 \n",
277 | "4 0 1 0 1 \n",
278 | "\n",
279 | " ps_calc_20_bin \n",
280 | "0 0 \n",
281 | "1 1 \n",
282 | "2 0 \n",
283 | "3 0 \n",
284 | "4 0 \n",
285 | "\n",
286 | "[5 rows x 60 columns]"
287 | ]
288 | },
289 | "execution_count": 4,
290 | "metadata": {},
291 | "output_type": "execute_result"
292 | }
293 | ],
294 | "source": [
295 | "df_test.head()"
296 | ]
297 | },
298 | {
299 | "cell_type": "code",
300 | "execution_count": 100,
301 | "metadata": {},
302 | "outputs": [],
303 | "source": [
304 | "y_train = df_train['target']\n",
305 | "y_test = df_test['target']\n",
306 | "X_train = df_train[NUMERIC_COLS]\n",
307 | "X_test = df_test[NUMERIC_COLS]"
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": 6,
313 | "metadata": {},
314 | "outputs": [
315 | {
316 | "data": {
317 | "text/plain": [
318 | "(8001, 7)"
319 | ]
320 | },
321 | "execution_count": 6,
322 | "metadata": {},
323 | "output_type": "execute_result"
324 | }
325 | ],
326 | "source": [
327 | "X_train.shape"
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": 23,
333 | "metadata": {},
334 | "outputs": [],
335 | "source": [
336 | "lgb_train = lgb.Dataset(X_train,y_train)\n",
337 | "lgb_eval = lgb.Dataset(X_test,y_test,reference=lgb_train)"
338 | ]
339 | },
340 | {
341 | "cell_type": "markdown",
342 | "metadata": {},
343 | "source": [
344 | "## 设置子树为100颗,每颗树包含64支叶子的树模型。那么形成的中间特征向量为100*64"
345 | ]
346 | },
347 | {
348 | "cell_type": "code",
349 | "execution_count": 24,
350 | "metadata": {},
351 | "outputs": [],
352 | "source": [
353 | "params = {\n",
354 | " 'task': 'train',\n",
355 | " 'boosting_type': 'gbdt',\n",
356 | " 'objective': 'binary',\n",
357 | " 'metric': {'binary_logloss'},\n",
358 | " 'num_leaves': 64,\n",
359 | " 'num_trees': 100,\n",
360 | " 'learning_rate': 0.01,\n",
361 | " 'feature_fraction': 0.9,\n",
362 | " 'bagging_fraction': 0.8,\n",
363 | " 'bagging_freq': 5,\n",
364 | " 'verbose': 0\n",
365 | "}"
366 | ]
367 | },
368 | {
369 | "cell_type": "code",
370 | "execution_count": 25,
371 | "metadata": {},
372 | "outputs": [],
373 | "source": [
374 | "# 叶子节点数,用来进行特征转换使用\n",
375 | "num_leaf = 64"
376 | ]
377 | },
378 | {
379 | "cell_type": "code",
380 | "execution_count": 26,
381 | "metadata": {
382 | "scrolled": true
383 | },
384 | "outputs": [
385 | {
386 | "name": "stderr",
387 | "output_type": "stream",
388 | "text": [
389 | "D:\\anaconda3\\julianxu\\lib\\site-packages\\lightgbm\\engine.py:148: UserWarning: Found `num_trees` in params. Will use it instead of argument\n",
390 | " warnings.warn(\"Found `{}` in params. Will use it instead of argument\".format(alias))\n"
391 | ]
392 | },
393 | {
394 | "name": "stdout",
395 | "output_type": "stream",
396 | "text": [
397 | "[1]\ttraining's binary_logloss: 0.155602\n",
398 | "[2]\ttraining's binary_logloss: 0.155022\n",
399 | "[3]\ttraining's binary_logloss: 0.15441\n",
400 | "[4]\ttraining's binary_logloss: 0.153819\n",
401 | "[5]\ttraining's binary_logloss: 0.153267\n",
402 | "[6]\ttraining's binary_logloss: 0.152685\n",
403 | "[7]\ttraining's binary_logloss: 0.152144\n",
404 | "[8]\ttraining's binary_logloss: 0.151545\n",
405 | "[9]\ttraining's binary_logloss: 0.151029\n",
406 | "[10]\ttraining's binary_logloss: 0.15049\n",
407 | "[11]\ttraining's binary_logloss: 0.150069\n",
408 | "[12]\ttraining's binary_logloss: 0.149553\n",
409 | "[13]\ttraining's binary_logloss: 0.149064\n",
410 | "[14]\ttraining's binary_logloss: 0.148592\n",
411 | "[15]\ttraining's binary_logloss: 0.148111\n",
412 | "[16]\ttraining's binary_logloss: 0.147618\n",
413 | "[17]\ttraining's binary_logloss: 0.147086\n",
414 | "[18]\ttraining's binary_logloss: 0.146624\n",
415 | "[19]\ttraining's binary_logloss: 0.146184\n",
416 | "[20]\ttraining's binary_logloss: 0.145696\n",
417 | "[21]\ttraining's binary_logloss: 0.145182\n",
418 | "[22]\ttraining's binary_logloss: 0.144704\n",
419 | "[23]\ttraining's binary_logloss: 0.144244\n",
420 | "[24]\ttraining's binary_logloss: 0.143804\n",
421 | "[25]\ttraining's binary_logloss: 0.14335\n",
422 | "[26]\ttraining's binary_logloss: 0.142893\n",
423 | "[27]\ttraining's binary_logloss: 0.142461\n",
424 | "[28]\ttraining's binary_logloss: 0.141992\n",
425 | "[29]\ttraining's binary_logloss: 0.14154\n",
426 | "[30]\ttraining's binary_logloss: 0.141097\n",
427 | "[31]\ttraining's binary_logloss: 0.14065\n",
428 | "[32]\ttraining's binary_logloss: 0.14021\n",
429 | "[33]\ttraining's binary_logloss: 0.139826\n",
430 | "[34]\ttraining's binary_logloss: 0.139455\n",
431 | "[35]\ttraining's binary_logloss: 0.139101\n",
432 | "[36]\ttraining's binary_logloss: 0.138699\n",
433 | "[37]\ttraining's binary_logloss: 0.138313\n",
434 | "[38]\ttraining's binary_logloss: 0.137922\n",
435 | "[39]\ttraining's binary_logloss: 0.13748\n",
436 | "[40]\ttraining's binary_logloss: 0.13711\n",
437 | "[41]\ttraining's binary_logloss: 0.136669\n",
438 | "[42]\ttraining's binary_logloss: 0.136245\n",
439 | "[43]\ttraining's binary_logloss: 0.135825\n",
440 | "[44]\ttraining's binary_logloss: 0.135446\n",
441 | "[45]\ttraining's binary_logloss: 0.135044\n",
442 | "[46]\ttraining's binary_logloss: 0.134611\n",
443 | "[47]\ttraining's binary_logloss: 0.134199\n",
444 | "[48]\ttraining's binary_logloss: 0.133789\n",
445 | "[49]\ttraining's binary_logloss: 0.133391\n",
446 | "[50]\ttraining's binary_logloss: 0.133004\n",
447 | "[51]\ttraining's binary_logloss: 0.132586\n",
448 | "[52]\ttraining's binary_logloss: 0.132205\n",
449 | "[53]\ttraining's binary_logloss: 0.131787\n",
450 | "[54]\ttraining's binary_logloss: 0.131378\n",
451 | "[55]\ttraining's binary_logloss: 0.131014\n",
452 | "[56]\ttraining's binary_logloss: 0.130628\n",
453 | "[57]\ttraining's binary_logloss: 0.130253\n",
454 | "[58]\ttraining's binary_logloss: 0.129902\n",
455 | "[59]\ttraining's binary_logloss: 0.12956\n",
456 | "[60]\ttraining's binary_logloss: 0.129185\n",
457 | "[61]\ttraining's binary_logloss: 0.128838\n",
458 | "[62]\ttraining's binary_logloss: 0.128492\n",
459 | "[63]\ttraining's binary_logloss: 0.128169\n",
460 | "[64]\ttraining's binary_logloss: 0.127838\n",
461 | "[65]\ttraining's binary_logloss: 0.12748\n",
462 | "[66]\ttraining's binary_logloss: 0.127149\n",
463 | "[67]\ttraining's binary_logloss: 0.126845\n",
464 | "[68]\ttraining's binary_logloss: 0.126493\n",
465 | "[69]\ttraining's binary_logloss: 0.126139\n",
466 | "[70]\ttraining's binary_logloss: 0.125797\n",
467 | "[71]\ttraining's binary_logloss: 0.125492\n",
468 | "[72]\ttraining's binary_logloss: 0.125175\n",
469 | "[73]\ttraining's binary_logloss: 0.12489\n",
470 | "[74]\ttraining's binary_logloss: 0.124602\n",
471 | "[75]\ttraining's binary_logloss: 0.124281\n",
472 | "[76]\ttraining's binary_logloss: 0.123981\n",
473 | "[77]\ttraining's binary_logloss: 0.123696\n",
474 | "[78]\ttraining's binary_logloss: 0.123414\n",
475 | "[79]\ttraining's binary_logloss: 0.123113\n",
476 | "[80]\ttraining's binary_logloss: 0.122799\n",
477 | "[81]\ttraining's binary_logloss: 0.122486\n",
478 | "[82]\ttraining's binary_logloss: 0.122147\n",
479 | "[83]\ttraining's binary_logloss: 0.121818\n",
480 | "[84]\ttraining's binary_logloss: 0.121483\n",
481 | "[85]\ttraining's binary_logloss: 0.12115\n",
482 | "[86]\ttraining's binary_logloss: 0.120842\n",
483 | "[87]\ttraining's binary_logloss: 0.120546\n",
484 | "[88]\ttraining's binary_logloss: 0.12025\n",
485 | "[89]\ttraining's binary_logloss: 0.119959\n",
486 | "[90]\ttraining's binary_logloss: 0.119682\n",
487 | "[91]\ttraining's binary_logloss: 0.11935\n",
488 | "[92]\ttraining's binary_logloss: 0.119037\n",
489 | "[93]\ttraining's binary_logloss: 0.118712\n",
490 | "[94]\ttraining's binary_logloss: 0.118397\n",
491 | "[95]\ttraining's binary_logloss: 0.118085\n",
492 | "[96]\ttraining's binary_logloss: 0.117773\n",
493 | "[97]\ttraining's binary_logloss: 0.117491\n",
494 | "[98]\ttraining's binary_logloss: 0.117192\n",
495 | "[99]\ttraining's binary_logloss: 0.116892\n",
496 | "[100]\ttraining's binary_logloss: 0.116629\n"
497 | ]
498 | }
499 | ],
500 | "source": [
501 | "# train\n",
502 | "gbm = lgb.train(params,\n",
503 | " lgb_train,\n",
504 | " num_boost_round=100,\n",
505 | " valid_sets=lgb_train)"
506 | ]
507 | },
508 | {
509 | "cell_type": "code",
510 | "execution_count": 27,
511 | "metadata": {},
512 | "outputs": [
513 | {
514 | "name": "stdout",
515 | "output_type": "stream",
516 | "text": [
517 | "Save model...\n"
518 | ]
519 | },
520 | {
521 | "data": {
522 | "text/plain": [
523 | ""
524 | ]
525 | },
526 | "execution_count": 27,
527 | "metadata": {},
528 | "output_type": "execute_result"
529 | }
530 | ],
531 | "source": [
532 | "print('Save model...')\n",
533 | "# save model to file\n",
534 | "gbm.save_model(r'F:\\Data\\recsys-data\\gbdt+lr/model.txt')"
535 | ]
536 | },
537 | {
538 | "cell_type": "code",
539 | "execution_count": 54,
540 | "metadata": {},
541 | "outputs": [
542 | {
543 | "name": "stdout",
544 | "output_type": "stream",
545 | "text": [
546 | "Start predicting...\n"
547 | ]
548 | }
549 | ],
550 | "source": [
551 | "print('Start predicting...')\n",
552 | "# predict and get data on leaves, training data\n",
553 | "y_pred = gbm.predict(X_train, pred_leaf=True)"
554 | ]
555 | },
556 | {
557 | "cell_type": "code",
558 | "execution_count": 56,
559 | "metadata": {},
560 | "outputs": [
561 | {
562 | "data": {
563 | "text/plain": [
564 | "(8001, 7)"
565 | ]
566 | },
567 | "execution_count": 56,
568 | "metadata": {},
569 | "output_type": "execute_result"
570 | }
571 | ],
572 | "source": [
573 | "X_train.shape"
574 | ]
575 | },
576 | {
577 | "cell_type": "code",
578 | "execution_count": 55,
579 | "metadata": {},
580 | "outputs": [
581 | {
582 | "data": {
583 | "text/plain": [
584 | "array([[17, 0, 55, ..., 4, 63, 63],\n",
585 | " [62, 8, 58, ..., 47, 9, 57],\n",
586 | " [44, 0, 58, ..., 34, 62, 45],\n",
587 | " ...,\n",
588 | " [51, 19, 16, ..., 23, 33, 56],\n",
589 | " [61, 28, 58, ..., 53, 28, 18],\n",
590 | " [53, 29, 54, ..., 4, 63, 63]])"
591 | ]
592 | },
593 | "execution_count": 55,
594 | "metadata": {},
595 | "output_type": "execute_result"
596 | }
597 | ],
598 | "source": [
599 | "y_pred"
600 | ]
601 | },
602 | {
603 | "cell_type": "code",
604 | "execution_count": 29,
605 | "metadata": {},
606 | "outputs": [
607 | {
608 | "data": {
609 | "text/plain": [
610 | "(8001, 100)"
611 | ]
612 | },
613 | "execution_count": 29,
614 | "metadata": {},
615 | "output_type": "execute_result"
616 | }
617 | ],
618 | "source": [
619 | "np.array(y_pred).shape"
620 | ]
621 | },
622 | {
623 | "cell_type": "code",
624 | "execution_count": 32,
625 | "metadata": {},
626 | "outputs": [
627 | {
628 | "data": {
629 | "text/plain": [
630 | "array([17, 0, 55, 44, 47, 8, 8, 39, 8, 8, 0, 0, 0, 0, 0, 0, 38,\n",
631 | " 36, 36, 26, 15, 13, 38, 18, 41, 54, 45, 51, 55, 59, 15, 20, 2, 2,\n",
632 | " 2, 63, 56, 26, 7, 25, 46, 58, 62, 26, 19, 48, 6, 51, 5, 45, 44,\n",
633 | " 1, 44, 14, 33, 41, 10, 39, 49, 63, 51, 63, 20, 48, 52, 47, 8, 36,\n",
634 | " 8, 8, 50, 0, 32, 21, 8, 23, 48, 48, 17, 49, 46, 10, 28, 12, 59,\n",
635 | " 22, 12, 51, 34, 32, 15, 15, 53, 29, 29, 59, 59, 4, 63, 63])"
636 | ]
637 | },
638 | "execution_count": 32,
639 | "metadata": {},
640 | "output_type": "execute_result"
641 | }
642 | ],
643 | "source": [
644 | "y_pred[0]\n",
645 | "# 17,0每个数字代表每颗树的叶子节点索引"
646 | ]
647 | },
648 | {
649 | "cell_type": "code",
650 | "execution_count": 36,
651 | "metadata": {},
652 | "outputs": [],
653 | "source": [
654 | "transform_training_matrix = np.zeros([len(y_pred),len(y_pred[0])*num_leaf],dtype=np.int64) # N**num_tress*num_leaf"
655 | ]
656 | },
657 | {
658 | "cell_type": "code",
659 | "execution_count": 47,
660 | "metadata": {},
661 | "outputs": [],
662 | "source": [
663 | "for i in range(0,len(y_pred)):\n",
664 | " temp = np.arange(len(y_pred[0]))*num_leaf + np.array(y_pred[i]) # 以64为一个周期,然后加上相应的节点位置\n",
665 | " transform_training_matrix[i][temp] += 1 # 找出索引对应的值,然后加1"
666 | ]
667 | },
668 | {
669 | "cell_type": "code",
670 | "execution_count": 83,
671 | "metadata": {},
672 | "outputs": [
673 | {
674 | "data": {
675 | "text/plain": [
676 | "(8001, 6400)"
677 | ]
678 | },
679 | "execution_count": 83,
680 | "metadata": {},
681 | "output_type": "execute_result"
682 | }
683 | ],
684 | "source": [
685 | "transform_training_matrix.shape"
686 | ]
687 | },
688 | {
689 | "cell_type": "code",
690 | "execution_count": 95,
691 | "metadata": {},
692 | "outputs": [],
693 | "source": [
694 | "y_test_lgb = gbm.predict(X_test,pred_leaf=True)"
695 | ]
696 | },
697 | {
698 | "cell_type": "code",
699 | "execution_count": 65,
700 | "metadata": {},
701 | "outputs": [],
702 | "source": [
703 | "# 将预测集进行onehot转换"
704 | ]
705 | },
706 | {
707 | "cell_type": "code",
708 | "execution_count": 86,
709 | "metadata": {},
710 | "outputs": [
711 | {
712 | "data": {
713 | "text/plain": [
714 | "2000"
715 | ]
716 | },
717 | "execution_count": 86,
718 | "metadata": {},
719 | "output_type": "execute_result"
720 | }
721 | ],
722 | "source": [
723 | "len(y_test)"
724 | ]
725 | },
726 | {
727 | "cell_type": "code",
728 | "execution_count": 96,
729 | "metadata": {},
730 | "outputs": [],
731 | "source": [
732 | "transform_test_matrix = np.zeros([len(y_test_lgb),len(y_test_lgb[0])*num_leaf],dtype=np.int64)"
733 | ]
734 | },
735 | {
736 | "cell_type": "code",
737 | "execution_count": 88,
738 | "metadata": {},
739 | "outputs": [
740 | {
741 | "data": {
742 | "text/plain": [
743 | "(2000, 6400)"
744 | ]
745 | },
746 | "execution_count": 88,
747 | "metadata": {},
748 | "output_type": "execute_result"
749 | }
750 | ],
751 | "source": [
752 | "transform_test_matrix.shape"
753 | ]
754 | },
755 | {
756 | "cell_type": "code",
757 | "execution_count": 97,
758 | "metadata": {},
759 | "outputs": [],
760 | "source": [
761 | "for i in range(len(y_test_lgb)):\n",
762 | " temp = np.arange(len(y_test[0]))*num_leaf + np.array(y_test_lgb[i])\n",
763 | " transform_test_matrix[i][temp] += 1"
764 | ]
765 | },
766 | {
767 | "cell_type": "code",
768 | "execution_count": 98,
769 | "metadata": {},
770 | "outputs": [],
771 | "source": [
772 | "lm = LogisticRegression(penalty='l2',C=0.05)\n",
773 | "lm.fit(transform_training_matrix,y_train)\n",
774 | "y_pred_test = lm.predict_proba(transform_test_matrix)"
775 | ]
776 | },
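Besides the normalized-entropy number computed two cells below, a quick sanity check of the stacked GBDT→LR model is its held-out AUC. This is a hedged addition, not in the original notebook; it only reuses names already defined above (y_test, y_pred_test).

# Optional check (not in the original notebook): held-out AUC of the GBDT->LR stack,
# using the positive-class probability column from predict_proba.
from sklearn.metrics import roc_auc_score
print('test AUC:', roc_auc_score(y_test, y_pred_test[:, 1]))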
777 | {
778 | "cell_type": "code",
779 | "execution_count": 99,
780 | "metadata": {},
781 | "outputs": [
782 | {
783 | "data": {
784 | "text/plain": [
785 | "(2000, 2)"
786 | ]
787 | },
788 | "execution_count": 99,
789 | "metadata": {},
790 | "output_type": "execute_result"
791 | }
792 | ],
793 | "source": [
794 | "y_pred_test.shape"
795 | ]
796 | },
797 | {
798 | "cell_type": "code",
799 | "execution_count": 101,
800 | "metadata": {},
801 | "outputs": [
802 | {
803 | "name": "stdout",
804 | "output_type": "stream",
805 | "text": [
806 | "Normalized Cross Entropy 2.213280152050503\n"
807 | ]
808 | }
809 | ],
810 | "source": [
811 | "NE = (-1) / len(y_pred_test) * sum(((1+y_test)/2 * np.log(y_pred_test[:,1]) + (1-y_test)/2 * np.log(1 - y_pred_test[:,1])))\n",
812 | "print(\"Normalized Cross Entropy \" + str(NE))"
813 | ]
814 | },
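The formula in the cell above follows the normalized-entropy definition from the Facebook GBDT+LR paper, which assumes labels in {-1, +1} and divides by the entropy of the empirical positive rate; here y_test holds 0/1 labels and no such division is done, which is why the printed value exceeds 1. A hedged 0/1-label variant is sketched below; normalized_entropy is a hypothetical helper, not part of the notebook.

# Minimal sketch, assuming 0/1 labels: average log loss divided by the entropy
# of the empirical positive rate.
import numpy as np
from sklearn.metrics import log_loss

def normalized_entropy(y_true, p_pos):
    p = np.clip(np.mean(y_true), 1e-12, 1 - 1e-12)      # empirical positive rate
    base = -(p * np.log(p) + (1 - p) * np.log(1 - p))   # entropy of the base rate
    return log_loss(y_true, p_pos) / base

# e.g. normalized_entropy(y_test, y_pred_test[:, 1])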
815 | {
816 | "cell_type": "code",
817 | "execution_count": null,
818 | "metadata": {},
819 | "outputs": [],
820 | "source": []
821 | }
822 | ],
823 | "metadata": {
824 | "kernelspec": {
825 | "display_name": "Python 3",
826 | "language": "python",
827 | "name": "python3"
828 | },
829 | "language_info": {
830 | "codemirror_mode": {
831 | "name": "ipython",
832 | "version": 3
833 | },
834 | "file_extension": ".py",
835 | "mimetype": "text/x-python",
836 | "name": "python",
837 | "nbconvert_exporter": "python",
838 | "pygments_lexer": "ipython3",
839 | "version": "3.6.5"
840 | }
841 | },
842 | "nbformat": 4,
843 | "nbformat_minor": 1
844 | }
845 |
--------------------------------------------------------------------------------
/MLR.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 1.处理数据过程"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 3,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import pandas as pd\n",
17 | "from sklearn.preprocessing import StandardScaler"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": 81,
23 | "metadata": {},
24 | "outputs": [],
25 | "source": [
26 | "# get_data\n",
27 | "train_data = pd.read_table(r'F:\\Data\\recsys-data\\mlr\\adult.data',header=None,delimiter=',')"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": null,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "test_data = pd.read_table(r'F:\\Data\\recsys-data\\mlr\\adult.test',header=None,delimiter=',')"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": null,
42 | "metadata": {},
43 | "outputs": [
44 | {
45 | "data": {
46 | "text/html": [
47 | "\n",
48 | "\n",
61 | "
\n",
62 | " \n",
63 | " \n",
64 | " | \n",
65 | " 0 | \n",
66 | " 1 | \n",
67 | " 2 | \n",
68 | " 3 | \n",
69 | " 4 | \n",
70 | " 5 | \n",
71 | " 6 | \n",
72 | " 7 | \n",
73 | " 8 | \n",
74 | " 9 | \n",
75 | " 10 | \n",
76 | " 11 | \n",
77 | " 12 | \n",
78 | " 13 | \n",
79 | " 14 | \n",
80 | "
\n",
81 | " \n",
82 | " \n",
83 | " \n",
84 | " 0 | \n",
85 | " 39 | \n",
86 | " State-gov | \n",
87 | " 77516 | \n",
88 | " Bachelors | \n",
89 | " 13 | \n",
90 | " Never-married | \n",
91 | " Adm-clerical | \n",
92 | " Not-in-family | \n",
93 | " White | \n",
94 | " Male | \n",
95 | " 2174 | \n",
96 | " 0 | \n",
97 | " 40 | \n",
98 | " United-States | \n",
99 | " <=50K | \n",
100 | "
\n",
101 | " \n",
102 | " 1 | \n",
103 | " 50 | \n",
104 | " Self-emp-not-inc | \n",
105 | " 83311 | \n",
106 | " Bachelors | \n",
107 | " 13 | \n",
108 | " Married-civ-spouse | \n",
109 | " Exec-managerial | \n",
110 | " Husband | \n",
111 | " White | \n",
112 | " Male | \n",
113 | " 0 | \n",
114 | " 0 | \n",
115 | " 13 | \n",
116 | " United-States | \n",
117 | " <=50K | \n",
118 | "
\n",
119 | " \n",
120 | " 2 | \n",
121 | " 38 | \n",
122 | " Private | \n",
123 | " 215646 | \n",
124 | " HS-grad | \n",
125 | " 9 | \n",
126 | " Divorced | \n",
127 | " Handlers-cleaners | \n",
128 | " Not-in-family | \n",
129 | " White | \n",
130 | " Male | \n",
131 | " 0 | \n",
132 | " 0 | \n",
133 | " 40 | \n",
134 | " United-States | \n",
135 | " <=50K | \n",
136 | "
\n",
137 | " \n",
138 | " 3 | \n",
139 | " 53 | \n",
140 | " Private | \n",
141 | " 234721 | \n",
142 | " 11th | \n",
143 | " 7 | \n",
144 | " Married-civ-spouse | \n",
145 | " Handlers-cleaners | \n",
146 | " Husband | \n",
147 | " Black | \n",
148 | " Male | \n",
149 | " 0 | \n",
150 | " 0 | \n",
151 | " 40 | \n",
152 | " United-States | \n",
153 | " <=50K | \n",
154 | "
\n",
155 | " \n",
156 | " 4 | \n",
157 | " 28 | \n",
158 | " Private | \n",
159 | " 338409 | \n",
160 | " Bachelors | \n",
161 | " 13 | \n",
162 | " Married-civ-spouse | \n",
163 | " Prof-specialty | \n",
164 | " Wife | \n",
165 | " Black | \n",
166 | " Female | \n",
167 | " 0 | \n",
168 | " 0 | \n",
169 | " 40 | \n",
170 | " Cuba | \n",
171 | " <=50K | \n",
172 | "
\n",
173 | " \n",
174 | "
\n",
175 | "
"
176 | ],
177 | "text/plain": [
178 | " 0 1 2 3 4 5 \\\n",
179 | "0 39 State-gov 77516 Bachelors 13 Never-married \n",
180 | "1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse \n",
181 | "2 38 Private 215646 HS-grad 9 Divorced \n",
182 | "3 53 Private 234721 11th 7 Married-civ-spouse \n",
183 | "4 28 Private 338409 Bachelors 13 Married-civ-spouse \n",
184 | "\n",
185 | " 6 7 8 9 10 11 12 \\\n",
186 | "0 Adm-clerical Not-in-family White Male 2174 0 40 \n",
187 | "1 Exec-managerial Husband White Male 0 0 13 \n",
188 | "2 Handlers-cleaners Not-in-family White Male 0 0 40 \n",
189 | "3 Handlers-cleaners Husband Black Male 0 0 40 \n",
190 | "4 Prof-specialty Wife Black Female 0 0 40 \n",
191 | "\n",
192 | " 13 14 \n",
193 | "0 United-States <=50K \n",
194 | "1 United-States <=50K \n",
195 | "2 United-States <=50K \n",
196 | "3 United-States <=50K \n",
197 | "4 Cuba <=50K "
198 | ]
199 | },
200 | "execution_count": 83,
201 | "metadata": {},
202 | "output_type": "execute_result"
203 | }
204 | ],
205 | "source": [
206 | "train_data.head()"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": 98,
212 | "metadata": {},
213 | "outputs": [],
214 | "source": [
215 | "test_data[14] = test_data[14].apply(lambda x: x[:-1])"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": 85,
221 | "metadata": {},
222 | "outputs": [
223 | {
224 | "name": "stdout",
225 | "output_type": "stream",
226 | "text": [
227 | "\n",
228 | "RangeIndex: 32561 entries, 0 to 32560\n",
229 | "Data columns (total 15 columns):\n",
230 | "0 32561 non-null int64\n",
231 | "1 32561 non-null object\n",
232 | "2 32561 non-null int64\n",
233 | "3 32561 non-null object\n",
234 | "4 32561 non-null int64\n",
235 | "5 32561 non-null object\n",
236 | "6 32561 non-null object\n",
237 | "7 32561 non-null object\n",
238 | "8 32561 non-null object\n",
239 | "9 32561 non-null object\n",
240 | "10 32561 non-null int64\n",
241 | "11 32561 non-null int64\n",
242 | "12 32561 non-null int64\n",
243 | "13 32561 non-null object\n",
244 | "14 32561 non-null object\n",
245 | "dtypes: int64(6), object(9)\n",
246 | "memory usage: 3.7+ MB\n"
247 | ]
248 | }
249 | ],
250 | "source": [
251 | "train_data.info()"
252 | ]
253 | },
254 | {
255 | "cell_type": "code",
256 | "execution_count": 86,
257 | "metadata": {},
258 | "outputs": [],
259 | "source": [
260 | "all_columns = ['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','label','type']\n",
261 | "continus_columns =['age','fnlwgt','education-num','capital-gain','capital-loss','hours-per-week']\n",
262 | "dummy_columns = ['workclass','education','marital-status','occupation','relationship','race','sex','native-country']"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": 87,
268 | "metadata": {},
269 | "outputs": [],
270 | "source": [
271 | "train_data['type'] = 1\n",
272 | "test_data['type'] = 2"
273 | ]
274 | },
275 | {
276 | "cell_type": "code",
277 | "execution_count": 88,
278 | "metadata": {},
279 | "outputs": [
280 | {
281 | "data": {
282 | "text/html": [
283 | "\n",
284 | "\n",
297 | "
\n",
298 | " \n",
299 | " \n",
300 | " | \n",
301 | " 0 | \n",
302 | " 1 | \n",
303 | " 2 | \n",
304 | " 3 | \n",
305 | " 4 | \n",
306 | " 5 | \n",
307 | " 6 | \n",
308 | " 7 | \n",
309 | " 8 | \n",
310 | " 9 | \n",
311 | " 10 | \n",
312 | " 11 | \n",
313 | " 12 | \n",
314 | " 13 | \n",
315 | " 14 | \n",
316 | " type | \n",
317 | "
\n",
318 | " \n",
319 | " \n",
320 | " \n",
321 | " 0 | \n",
322 | " 39 | \n",
323 | " State-gov | \n",
324 | " 77516 | \n",
325 | " Bachelors | \n",
326 | " 13 | \n",
327 | " Never-married | \n",
328 | " Adm-clerical | \n",
329 | " Not-in-family | \n",
330 | " White | \n",
331 | " Male | \n",
332 | " 2174 | \n",
333 | " 0 | \n",
334 | " 40 | \n",
335 | " United-States | \n",
336 | " <=50K | \n",
337 | " 1 | \n",
338 | "
\n",
339 | " \n",
340 | " 1 | \n",
341 | " 50 | \n",
342 | " Self-emp-not-inc | \n",
343 | " 83311 | \n",
344 | " Bachelors | \n",
345 | " 13 | \n",
346 | " Married-civ-spouse | \n",
347 | " Exec-managerial | \n",
348 | " Husband | \n",
349 | " White | \n",
350 | " Male | \n",
351 | " 0 | \n",
352 | " 0 | \n",
353 | " 13 | \n",
354 | " United-States | \n",
355 | " <=50K | \n",
356 | " 1 | \n",
357 | "
\n",
358 | " \n",
359 | " 2 | \n",
360 | " 38 | \n",
361 | " Private | \n",
362 | " 215646 | \n",
363 | " HS-grad | \n",
364 | " 9 | \n",
365 | " Divorced | \n",
366 | " Handlers-cleaners | \n",
367 | " Not-in-family | \n",
368 | " White | \n",
369 | " Male | \n",
370 | " 0 | \n",
371 | " 0 | \n",
372 | " 40 | \n",
373 | " United-States | \n",
374 | " <=50K | \n",
375 | " 1 | \n",
376 | "
\n",
377 | " \n",
378 | " 3 | \n",
379 | " 53 | \n",
380 | " Private | \n",
381 | " 234721 | \n",
382 | " 11th | \n",
383 | " 7 | \n",
384 | " Married-civ-spouse | \n",
385 | " Handlers-cleaners | \n",
386 | " Husband | \n",
387 | " Black | \n",
388 | " Male | \n",
389 | " 0 | \n",
390 | " 0 | \n",
391 | " 40 | \n",
392 | " United-States | \n",
393 | " <=50K | \n",
394 | " 1 | \n",
395 | "
\n",
396 | " \n",
397 | " 4 | \n",
398 | " 28 | \n",
399 | " Private | \n",
400 | " 338409 | \n",
401 | " Bachelors | \n",
402 | " 13 | \n",
403 | " Married-civ-spouse | \n",
404 | " Prof-specialty | \n",
405 | " Wife | \n",
406 | " Black | \n",
407 | " Female | \n",
408 | " 0 | \n",
409 | " 0 | \n",
410 | " 40 | \n",
411 | " Cuba | \n",
412 | " <=50K | \n",
413 | " 1 | \n",
414 | "
\n",
415 | " \n",
416 | "
\n",
417 | "
"
418 | ],
419 | "text/plain": [
420 | " 0 1 2 3 4 5 \\\n",
421 | "0 39 State-gov 77516 Bachelors 13 Never-married \n",
422 | "1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse \n",
423 | "2 38 Private 215646 HS-grad 9 Divorced \n",
424 | "3 53 Private 234721 11th 7 Married-civ-spouse \n",
425 | "4 28 Private 338409 Bachelors 13 Married-civ-spouse \n",
426 | "\n",
427 | " 6 7 8 9 10 11 12 \\\n",
428 | "0 Adm-clerical Not-in-family White Male 2174 0 40 \n",
429 | "1 Exec-managerial Husband White Male 0 0 13 \n",
430 | "2 Handlers-cleaners Not-in-family White Male 0 0 40 \n",
431 | "3 Handlers-cleaners Husband Black Male 0 0 40 \n",
432 | "4 Prof-specialty Wife Black Female 0 0 40 \n",
433 | "\n",
434 | " 13 14 type \n",
435 | "0 United-States <=50K 1 \n",
436 | "1 United-States <=50K 1 \n",
437 | "2 United-States <=50K 1 \n",
438 | "3 United-States <=50K 1 \n",
439 | "4 Cuba <=50K 1 "
440 | ]
441 | },
442 | "execution_count": 88,
443 | "metadata": {},
444 | "output_type": "execute_result"
445 | }
446 | ],
447 | "source": [
448 | "train_data.head()"
449 | ]
450 | },
451 | {
452 | "cell_type": "code",
453 | "execution_count": 89,
454 | "metadata": {},
455 | "outputs": [
456 | {
457 | "data": {
458 | "text/html": [
459 | "\n",
460 | "\n",
473 | "
\n",
474 | " \n",
475 | " \n",
476 | " | \n",
477 | " 0 | \n",
478 | " 1 | \n",
479 | " 2 | \n",
480 | " 3 | \n",
481 | " 4 | \n",
482 | " 5 | \n",
483 | " 6 | \n",
484 | " 7 | \n",
485 | " 8 | \n",
486 | " 9 | \n",
487 | " 10 | \n",
488 | " 11 | \n",
489 | " 12 | \n",
490 | " 13 | \n",
491 | " 14 | \n",
492 | " type | \n",
493 | "
\n",
494 | " \n",
495 | " \n",
496 | " \n",
497 | " 0 | \n",
498 | " 25 | \n",
499 | " Private | \n",
500 | " 226802 | \n",
501 | " 11th | \n",
502 | " 7 | \n",
503 | " Never-married | \n",
504 | " Machine-op-inspct | \n",
505 | " Own-child | \n",
506 | " Black | \n",
507 | " Male | \n",
508 | " 0 | \n",
509 | " 0 | \n",
510 | " 40 | \n",
511 | " United-States | \n",
512 | " <=50K. | \n",
513 | " 2 | \n",
514 | "
\n",
515 | " \n",
516 | " 1 | \n",
517 | " 38 | \n",
518 | " Private | \n",
519 | " 89814 | \n",
520 | " HS-grad | \n",
521 | " 9 | \n",
522 | " Married-civ-spouse | \n",
523 | " Farming-fishing | \n",
524 | " Husband | \n",
525 | " White | \n",
526 | " Male | \n",
527 | " 0 | \n",
528 | " 0 | \n",
529 | " 50 | \n",
530 | " United-States | \n",
531 | " <=50K. | \n",
532 | " 2 | \n",
533 | "
\n",
534 | " \n",
535 | " 2 | \n",
536 | " 28 | \n",
537 | " Local-gov | \n",
538 | " 336951 | \n",
539 | " Assoc-acdm | \n",
540 | " 12 | \n",
541 | " Married-civ-spouse | \n",
542 | " Protective-serv | \n",
543 | " Husband | \n",
544 | " White | \n",
545 | " Male | \n",
546 | " 0 | \n",
547 | " 0 | \n",
548 | " 40 | \n",
549 | " United-States | \n",
550 | " >50K. | \n",
551 | " 2 | \n",
552 | "
\n",
553 | " \n",
554 | " 3 | \n",
555 | " 44 | \n",
556 | " Private | \n",
557 | " 160323 | \n",
558 | " Some-college | \n",
559 | " 10 | \n",
560 | " Married-civ-spouse | \n",
561 | " Machine-op-inspct | \n",
562 | " Husband | \n",
563 | " Black | \n",
564 | " Male | \n",
565 | " 7688 | \n",
566 | " 0 | \n",
567 | " 40 | \n",
568 | " United-States | \n",
569 | " >50K. | \n",
570 | " 2 | \n",
571 | "
\n",
572 | " \n",
573 | " 4 | \n",
574 | " 18 | \n",
575 | " ? | \n",
576 | " 103497 | \n",
577 | " Some-college | \n",
578 | " 10 | \n",
579 | " Never-married | \n",
580 | " ? | \n",
581 | " Own-child | \n",
582 | " White | \n",
583 | " Female | \n",
584 | " 0 | \n",
585 | " 0 | \n",
586 | " 30 | \n",
587 | " United-States | \n",
588 | " <=50K. | \n",
589 | " 2 | \n",
590 | "
\n",
591 | " \n",
592 | "
\n",
593 | "
"
594 | ],
595 | "text/plain": [
596 | " 0 1 2 3 4 5 \\\n",
597 | "0 25 Private 226802 11th 7 Never-married \n",
598 | "1 38 Private 89814 HS-grad 9 Married-civ-spouse \n",
599 | "2 28 Local-gov 336951 Assoc-acdm 12 Married-civ-spouse \n",
600 | "3 44 Private 160323 Some-college 10 Married-civ-spouse \n",
601 | "4 18 ? 103497 Some-college 10 Never-married \n",
602 | "\n",
603 | " 6 7 8 9 10 11 12 \\\n",
604 | "0 Machine-op-inspct Own-child Black Male 0 0 40 \n",
605 | "1 Farming-fishing Husband White Male 0 0 50 \n",
606 | "2 Protective-serv Husband White Male 0 0 40 \n",
607 | "3 Machine-op-inspct Husband Black Male 7688 0 40 \n",
608 | "4 ? Own-child White Female 0 0 30 \n",
609 | "\n",
610 | " 13 14 type \n",
611 | "0 United-States <=50K. 2 \n",
612 | "1 United-States <=50K. 2 \n",
613 | "2 United-States >50K. 2 \n",
614 | "3 United-States >50K. 2 \n",
615 | "4 United-States <=50K. 2 "
616 | ]
617 | },
618 | "execution_count": 89,
619 | "metadata": {},
620 | "output_type": "execute_result"
621 | }
622 | ],
623 | "source": [
624 | "test_data.head()"
625 | ]
626 | },
627 | {
628 | "cell_type": "code",
629 | "execution_count": 99,
630 | "metadata": {},
631 | "outputs": [],
632 | "source": [
633 | "all_data = pd.concat([train_data,test_data],axis = 0)\n",
634 | "all_data.columns = all_columns"
635 | ]
636 | },
637 | {
638 | "cell_type": "code",
639 | "execution_count": 101,
640 | "metadata": {},
641 | "outputs": [],
642 | "source": [
643 | "all_data = pd.get_dummies(all_data,columns=dummy_columns)"
644 | ]
645 | },
646 | {
647 | "cell_type": "code",
648 | "execution_count": 102,
649 | "metadata": {},
650 | "outputs": [],
651 | "source": [
652 | "all_data['label'] = all_data['label'].map(lambda x:1 if x.strip()=='>50K' else 0)"
653 | ]
654 | },
655 | {
656 | "cell_type": "code",
657 | "execution_count": 103,
658 | "metadata": {},
659 | "outputs": [],
660 | "source": [
661 | "for col in continus_columns:\n",
662 | " ss = StandardScaler()\n",
663 | " all_data[col] = ss.fit_transform(all_data[[col]])"
664 | ]
665 | },
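Note that the loop above fits StandardScaler on the concatenated train+test frame, so test-set statistics leak into the scaling. A leak-free variant (hedged sketch, reusing the already-defined continus_columns and the type marker) fits the scaler on the training rows only and applies the same transform to both splits.

# Leak-free alternative (sketch, not what the notebook runs): fit on training rows only.
from sklearn.preprocessing import StandardScaler

train_mask = all_data['type'] == 1
ss = StandardScaler().fit(all_data.loc[train_mask, continus_columns])
all_data.loc[train_mask, continus_columns] = ss.transform(all_data.loc[train_mask, continus_columns])
all_data.loc[~train_mask, continus_columns] = ss.transform(all_data.loc[~train_mask, continus_columns])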
666 | {
667 | "cell_type": "code",
668 | "execution_count": 104,
669 | "metadata": {},
670 | "outputs": [],
671 | "source": [
672 | "test_data = all_data[all_data['type']==2].drop(['type'],axis=1)\n",
673 | "train_data = all_data[all_data['type']==1].drop(['type'],axis=1)"
674 | ]
675 | },
676 | {
677 | "cell_type": "code",
678 | "execution_count": 105,
679 | "metadata": {},
680 | "outputs": [
681 | {
682 | "data": {
683 | "text/html": [
684 | "\n",
685 | "\n",
698 | "
\n",
699 | " \n",
700 | " \n",
701 | " | \n",
702 | " age | \n",
703 | " fnlwgt | \n",
704 | " education-num | \n",
705 | " capital-gain | \n",
706 | " capital-loss | \n",
707 | " hours-per-week | \n",
708 | " label | \n",
709 | " workclass_ ? | \n",
710 | " workclass_ Federal-gov | \n",
711 | " workclass_ Local-gov | \n",
712 | " ... | \n",
713 | " native-country_ Portugal | \n",
714 | " native-country_ Puerto-Rico | \n",
715 | " native-country_ Scotland | \n",
716 | " native-country_ South | \n",
717 | " native-country_ Taiwan | \n",
718 | " native-country_ Thailand | \n",
719 | " native-country_ Trinadad&Tobago | \n",
720 | " native-country_ United-States | \n",
721 | " native-country_ Vietnam | \n",
722 | " native-country_ Yugoslavia | \n",
723 | "
\n",
724 | " \n",
725 | " \n",
726 | " \n",
727 | " 0 | \n",
728 | " 0.025996 | \n",
729 | " -1.061979 | \n",
730 | " 1.136512 | \n",
731 | " 0.146932 | \n",
732 | " -0.217127 | \n",
733 | " -0.034087 | \n",
734 | " 0 | \n",
735 | " 0 | \n",
736 | " 0 | \n",
737 | " 0 | \n",
738 | " ... | \n",
739 | " 0 | \n",
740 | " 0 | \n",
741 | " 0 | \n",
742 | " 0 | \n",
743 | " 0 | \n",
744 | " 0 | \n",
745 | " 0 | \n",
746 | " 1 | \n",
747 | " 0 | \n",
748 | " 0 | \n",
749 | "
\n",
750 | " \n",
751 | " 1 | \n",
752 | " 0.828308 | \n",
753 | " -1.007104 | \n",
754 | " 1.136512 | \n",
755 | " -0.144804 | \n",
756 | " -0.217127 | \n",
757 | " -2.213032 | \n",
758 | " 0 | \n",
759 | " 0 | \n",
760 | " 0 | \n",
761 | " 0 | \n",
762 | " ... | \n",
763 | " 0 | \n",
764 | " 0 | \n",
765 | " 0 | \n",
766 | " 0 | \n",
767 | " 0 | \n",
768 | " 0 | \n",
769 | " 0 | \n",
770 | " 1 | \n",
771 | " 0 | \n",
772 | " 0 | \n",
773 | "
\n",
774 | " \n",
775 | " 2 | \n",
776 | " -0.046942 | \n",
777 | " 0.246034 | \n",
778 | " -0.419335 | \n",
779 | " -0.144804 | \n",
780 | " -0.217127 | \n",
781 | " -0.034087 | \n",
782 | " 0 | \n",
783 | " 0 | \n",
784 | " 0 | \n",
785 | " 0 | \n",
786 | " ... | \n",
787 | " 0 | \n",
788 | " 0 | \n",
789 | " 0 | \n",
790 | " 0 | \n",
791 | " 0 | \n",
792 | " 0 | \n",
793 | " 0 | \n",
794 | " 1 | \n",
795 | " 0 | \n",
796 | " 0 | \n",
797 | "
\n",
798 | " \n",
799 | " 3 | \n",
800 | " 1.047121 | \n",
801 | " 0.426663 | \n",
802 | " -1.197259 | \n",
803 | " -0.144804 | \n",
804 | " -0.217127 | \n",
805 | " -0.034087 | \n",
806 | " 0 | \n",
807 | " 0 | \n",
808 | " 0 | \n",
809 | " 0 | \n",
810 | " ... | \n",
811 | " 0 | \n",
812 | " 0 | \n",
813 | " 0 | \n",
814 | " 0 | \n",
815 | " 0 | \n",
816 | " 0 | \n",
817 | " 0 | \n",
818 | " 1 | \n",
819 | " 0 | \n",
820 | " 0 | \n",
821 | "
\n",
822 | " \n",
823 | " 4 | \n",
824 | " -0.776316 | \n",
825 | " 1.408530 | \n",
826 | " 1.136512 | \n",
827 | " -0.144804 | \n",
828 | " -0.217127 | \n",
829 | " -0.034087 | \n",
830 | " 0 | \n",
831 | " 0 | \n",
832 | " 0 | \n",
833 | " 0 | \n",
834 | " ... | \n",
835 | " 0 | \n",
836 | " 0 | \n",
837 | " 0 | \n",
838 | " 0 | \n",
839 | " 0 | \n",
840 | " 0 | \n",
841 | " 0 | \n",
842 | " 0 | \n",
843 | " 0 | \n",
844 | " 0 | \n",
845 | "
\n",
846 | " \n",
847 | "
\n",
848 | "
5 rows × 109 columns
\n",
849 | "
"
850 | ],
851 | "text/plain": [
852 | " age fnlwgt education-num capital-gain capital-loss \\\n",
853 | "0 0.025996 -1.061979 1.136512 0.146932 -0.217127 \n",
854 | "1 0.828308 -1.007104 1.136512 -0.144804 -0.217127 \n",
855 | "2 -0.046942 0.246034 -0.419335 -0.144804 -0.217127 \n",
856 | "3 1.047121 0.426663 -1.197259 -0.144804 -0.217127 \n",
857 | "4 -0.776316 1.408530 1.136512 -0.144804 -0.217127 \n",
858 | "\n",
859 | " hours-per-week label workclass_ ? workclass_ Federal-gov \\\n",
860 | "0 -0.034087 0 0 0 \n",
861 | "1 -2.213032 0 0 0 \n",
862 | "2 -0.034087 0 0 0 \n",
863 | "3 -0.034087 0 0 0 \n",
864 | "4 -0.034087 0 0 0 \n",
865 | "\n",
866 | " workclass_ Local-gov ... native-country_ Portugal \\\n",
867 | "0 0 ... 0 \n",
868 | "1 0 ... 0 \n",
869 | "2 0 ... 0 \n",
870 | "3 0 ... 0 \n",
871 | "4 0 ... 0 \n",
872 | "\n",
873 | " native-country_ Puerto-Rico native-country_ Scotland \\\n",
874 | "0 0 0 \n",
875 | "1 0 0 \n",
876 | "2 0 0 \n",
877 | "3 0 0 \n",
878 | "4 0 0 \n",
879 | "\n",
880 | " native-country_ South native-country_ Taiwan native-country_ Thailand \\\n",
881 | "0 0 0 0 \n",
882 | "1 0 0 0 \n",
883 | "2 0 0 0 \n",
884 | "3 0 0 0 \n",
885 | "4 0 0 0 \n",
886 | "\n",
887 | " native-country_ Trinadad&Tobago native-country_ United-States \\\n",
888 | "0 0 1 \n",
889 | "1 0 1 \n",
890 | "2 0 1 \n",
891 | "3 0 1 \n",
892 | "4 0 0 \n",
893 | "\n",
894 | " native-country_ Vietnam native-country_ Yugoslavia \n",
895 | "0 0 0 \n",
896 | "1 0 0 \n",
897 | "2 0 0 \n",
898 | "3 0 0 \n",
899 | "4 0 0 \n",
900 | "\n",
901 | "[5 rows x 109 columns]"
902 | ]
903 | },
904 | "execution_count": 105,
905 | "metadata": {},
906 | "output_type": "execute_result"
907 | }
908 | ],
909 | "source": [
910 | "train_data.head()"
911 | ]
912 | },
913 | {
914 | "cell_type": "code",
915 | "execution_count": 106,
916 | "metadata": {},
917 | "outputs": [],
918 | "source": [
919 | "train_y = train_data['label']\n",
920 | "train_x = train_data.drop(['label'],axis = 1)\n",
921 | "test_y = test_data['label']\n",
922 | "test_x = test_data.drop(['label'],axis = 1)"
923 | ]
924 | },
925 | {
926 | "cell_type": "code",
927 | "execution_count": 107,
928 | "metadata": {},
929 | "outputs": [
930 | {
931 | "data": {
932 | "text/plain": [
933 | "((24720, 109), (7841, 109))"
934 | ]
935 | },
936 | "execution_count": 107,
937 | "metadata": {},
938 | "output_type": "execute_result"
939 | }
940 | ],
941 | "source": [
942 | "train_data[train_data['label']==0].shape,train_data[train_data['label']==1].shape"
943 | ]
944 | },
945 | {
946 | "cell_type": "code",
947 | "execution_count": 108,
948 | "metadata": {},
949 | "outputs": [
950 | {
951 | "data": {
952 | "text/plain": [
953 | "((12435, 109), (3846, 109))"
954 | ]
955 | },
956 | "execution_count": 108,
957 | "metadata": {},
958 | "output_type": "execute_result"
959 | }
960 | ],
961 | "source": [
962 | "test_data[test_data['label']==0].shape,test_data[test_data['label']==1].shape"
963 | ]
964 | },
965 | {
966 | "cell_type": "markdown",
967 | "metadata": {},
968 | "source": [
969 | "# 数据处理完后,特征的维度是108维。"
970 | ]
971 | },
972 | {
973 | "cell_type": "code",
974 | "execution_count": 36,
975 | "metadata": {},
976 | "outputs": [],
977 | "source": [
978 | "import tensorflow as tf\n",
979 | "import time\n",
980 | "from sklearn.metrics import roc_auc_score"
981 | ]
982 | },
983 | {
984 | "cell_type": "code",
985 | "execution_count": 37,
986 | "metadata": {},
987 | "outputs": [],
988 | "source": [
989 | "x = tf.placeholder(tf.float32,shape=[None,108])\n",
990 | "y = tf.placeholder(tf.float32,shape=[None])"
991 | ]
992 | },
993 | {
994 | "cell_type": "code",
995 | "execution_count": 38,
996 | "metadata": {},
997 | "outputs": [],
998 | "source": [
999 | "m = 2\n",
1000 | "learning_rate = 0.3\n",
1001 | "# 聚类参数\n",
1002 | "u = tf.Variable(tf.random_normal([108,m],0.0,0.5),name='u')\n",
1003 | "w = tf.Variable(tf.random_normal([108,m],0.0,0.5),name='w')"
1004 | ]
1005 | },
1006 | {
1007 | "cell_type": "code",
1008 | "execution_count": 39,
1009 | "metadata": {},
1010 | "outputs": [],
1011 | "source": [
1012 | "U = tf.matmul(x,u)\n",
1013 | "p1 = tf.nn.softmax(U)"
1014 | ]
1015 | },
1016 | {
1017 | "cell_type": "code",
1018 | "execution_count": 40,
1019 | "metadata": {},
1020 | "outputs": [],
1021 | "source": [
1022 | "W = tf.matmul(x,w)\n",
1023 | "p2 = tf.nn.softmax(W)"
1024 | ]
1025 | },
1026 | {
1027 | "cell_type": "code",
1028 | "execution_count": 43,
1029 | "metadata": {},
1030 | "outputs": [],
1031 | "source": [
1032 | "pred = tf.reduce_sum(tf.multiply(p1,p2),1)"
1033 | ]
1034 | },
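The cells above implement MLR (LS-PLM) with m = 2 regions: p1 = softmax(x·u) plays the role of the region gate and p2 the per-region response, and the prediction is their elementwise product summed over regions. In the original LS-PLM formulation the per-region response is a sigmoid rather than a second softmax; a NumPy sketch of that standard form is given here for reference (hedged, not what this notebook runs).

# Standard MLR / LS-PLM prediction (sketch): softmax gate over m regions times a
# per-region sigmoid response.
import numpy as np

def mlr_predict(X, u, w):
    # X: (N, d); u, w: (d, m)
    gate_logits = X @ u
    gate = np.exp(gate_logits - gate_logits.max(axis=1, keepdims=True))
    gate /= gate.sum(axis=1, keepdims=True)          # softmax over regions
    response = 1.0 / (1.0 + np.exp(-(X @ w)))        # per-region sigmoid
    return (gate * response).sum(axis=1)             # (N,) predicted probabilities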
1035 | {
1036 | "cell_type": "code",
1037 | "execution_count": 111,
1038 | "metadata": {},
1039 | "outputs": [
1040 | {
1041 | "name": "stdout",
1042 | "output_type": "stream",
1043 | "text": [
1044 | "0 0 cost:11242.442383,train_auc:0.712778,test_auc:0.820132\n",
1045 | "100 6 cost:10882.122070,train_auc:0.899455,test_auc:0.896667\n",
1046 | "200 13 cost:10876.142578,train_auc:0.899216,test_auc:0.896111\n",
1047 | "300 20 cost:10874.057617,train_auc:0.898944,test_auc:0.895544\n",
1048 | "400 27 cost:10873.227539,train_auc:0.898751,test_auc:0.895171\n",
1049 | "500 34 cost:10872.964844,train_auc:0.898600,test_auc:0.894889\n",
1050 | "600 41 cost:10872.932617,train_auc:0.898473,test_auc:0.894680\n",
1051 | "700 47 cost:10873.029297,train_auc:0.898341,test_auc:0.894479\n",
1052 | "800 54 cost:10873.178711,train_auc:0.898198,test_auc:0.894281\n",
1053 | "900 61 cost:10873.346680,train_auc:0.898057,test_auc:0.894087\n",
1054 | "1000 67 cost:10873.514648,train_auc:0.897925,test_auc:0.893923\n",
1055 | "1100 74 cost:10873.672852,train_auc:0.897806,test_auc:0.893769\n",
1056 | "1200 81 cost:10873.840820,train_auc:0.897707,test_auc:0.893632\n",
1057 | "1300 88 cost:10874.041992,train_auc:0.897628,test_auc:0.893517\n",
1058 | "1400 94 cost:10874.262695,train_auc:0.897528,test_auc:0.893393\n",
1059 | "1500 101 cost:10874.436523,train_auc:0.897415,test_auc:0.893272\n",
1060 | "1600 108 cost:10874.587891,train_auc:0.897307,test_auc:0.893159\n",
1061 | "1700 114 cost:10874.765625,train_auc:0.897202,test_auc:0.893054\n",
1062 | "1800 121 cost:10874.981445,train_auc:0.897095,test_auc:0.892930\n",
1063 | "1900 128 cost:10875.167969,train_auc:0.896978,test_auc:0.892802\n",
1064 | "2000 134 cost:10875.333008,train_auc:0.896860,test_auc:0.892688\n",
1065 | "2100 141 cost:10875.524414,train_auc:0.896738,test_auc:0.892574\n",
1066 | "2200 148 cost:10875.757812,train_auc:0.896615,test_auc:0.892461\n",
1067 | "2300 155 cost:10875.997070,train_auc:0.896485,test_auc:0.892339\n",
1068 | "2400 161 cost:10876.208984,train_auc:0.896360,test_auc:0.892227\n",
1069 | "2500 168 cost:10876.394531,train_auc:0.896243,test_auc:0.892121\n",
1070 | "2600 175 cost:10876.570312,train_auc:0.896136,test_auc:0.892023\n",
1071 | "2700 181 cost:10876.739258,train_auc:0.896034,test_auc:0.891935\n",
1072 | "2800 188 cost:10876.905273,train_auc:0.895939,test_auc:0.891854\n",
1073 | "2900 195 cost:10877.067383,train_auc:0.895846,test_auc:0.891770\n",
1074 | "3000 202 cost:10877.227539,train_auc:0.895758,test_auc:0.891693\n",
1075 | "3100 208 cost:10877.381836,train_auc:0.895672,test_auc:0.891614\n",
1076 | "3200 215 cost:10877.538086,train_auc:0.895587,test_auc:0.891543\n",
1077 | "3300 222 cost:10877.686523,train_auc:0.895506,test_auc:0.891467\n",
1078 | "3400 228 cost:10877.834961,train_auc:0.895428,test_auc:0.891393\n",
1079 | "3500 235 cost:10877.979492,train_auc:0.895351,test_auc:0.891326\n",
1080 | "3600 242 cost:10878.118164,train_auc:0.895276,test_auc:0.891256\n",
1081 | "3700 249 cost:10878.254883,train_auc:0.895202,test_auc:0.891195\n",
1082 | "3800 255 cost:10878.383789,train_auc:0.895131,test_auc:0.891134\n",
1083 | "3900 262 cost:10878.516602,train_auc:0.895061,test_auc:0.891071\n",
1084 | "4000 269 cost:10878.642578,train_auc:0.894996,test_auc:0.891007\n",
1085 | "4100 275 cost:10878.762695,train_auc:0.894933,test_auc:0.890941\n",
1086 | "4200 282 cost:10878.880859,train_auc:0.894870,test_auc:0.890885\n",
1087 | "4300 289 cost:10878.995117,train_auc:0.894808,test_auc:0.890830\n",
1088 | "4400 296 cost:10879.104492,train_auc:0.894749,test_auc:0.890773\n",
1089 | "4500 302 cost:10879.211914,train_auc:0.894694,test_auc:0.890719\n",
1090 | "4600 309 cost:10879.315430,train_auc:0.894638,test_auc:0.890661\n",
1091 | "4700 316 cost:10879.413086,train_auc:0.894587,test_auc:0.890599\n",
1092 | "4800 322 cost:10879.512695,train_auc:0.894538,test_auc:0.890540\n",
1093 | "4900 329 cost:10879.610352,train_auc:0.894489,test_auc:0.890475\n",
1094 | "5000 336 cost:10879.701172,train_auc:0.894441,test_auc:0.890410\n",
1095 | "5100 343 cost:10879.782227,train_auc:0.894394,test_auc:0.890351\n",
1096 | "5200 349 cost:10879.862305,train_auc:0.894349,test_auc:0.890298\n",
1097 | "5300 356 cost:10879.940430,train_auc:0.894302,test_auc:0.890250\n",
1098 | "5400 363 cost:10880.012695,train_auc:0.894259,test_auc:0.890202\n",
1099 | "5500 369 cost:10880.083984,train_auc:0.894218,test_auc:0.890157\n",
1100 | "5600 376 cost:10880.151367,train_auc:0.894178,test_auc:0.890111\n",
1101 | "5700 383 cost:10880.219727,train_auc:0.894140,test_auc:0.890069\n",
1102 | "5800 390 cost:10880.284180,train_auc:0.894102,test_auc:0.890030\n",
1103 | "5900 396 cost:10880.351562,train_auc:0.894066,test_auc:0.889992\n",
1104 | "6000 403 cost:10880.413086,train_auc:0.894032,test_auc:0.889956\n",
1105 | "6100 410 cost:10880.476562,train_auc:0.893997,test_auc:0.889925\n",
1106 | "6200 416 cost:10880.538086,train_auc:0.893965,test_auc:0.889892\n",
1107 | "6300 423 cost:10880.598633,train_auc:0.893930,test_auc:0.889859\n",
1108 | "6400 430 cost:10880.655273,train_auc:0.893898,test_auc:0.889828\n",
1109 | "6500 436 cost:10880.713867,train_auc:0.893868,test_auc:0.889799\n",
1110 | "6600 443 cost:10880.770508,train_auc:0.893839,test_auc:0.889774\n",
1111 | "6700 450 cost:10880.827148,train_auc:0.893811,test_auc:0.889746\n",
1112 | "6800 456 cost:10880.886719,train_auc:0.893784,test_auc:0.889719\n",
1113 | "6900 463 cost:10880.939453,train_auc:0.893758,test_auc:0.889696\n",
1114 | "7000 470 cost:10880.992188,train_auc:0.893733,test_auc:0.889676\n",
1115 | "7100 476 cost:10881.047852,train_auc:0.893709,test_auc:0.889654\n",
1116 | "7200 483 cost:10881.102539,train_auc:0.893686,test_auc:0.889627\n",
1117 | "7300 490 cost:10881.153320,train_auc:0.893661,test_auc:0.889606\n",
1118 | "7400 496 cost:10881.208008,train_auc:0.893640,test_auc:0.889589\n",
1119 | "7500 503 cost:10881.259766,train_auc:0.893618,test_auc:0.889570\n",
1120 | "7600 510 cost:10881.309570,train_auc:0.893597,test_auc:0.889553\n",
1121 | "7700 516 cost:10881.361328,train_auc:0.893579,test_auc:0.889539\n",
1122 | "7800 523 cost:10881.413086,train_auc:0.893559,test_auc:0.889522\n",
1123 | "7900 530 cost:10881.461914,train_auc:0.893539,test_auc:0.889508\n",
1124 | "8000 536 cost:10881.512695,train_auc:0.893519,test_auc:0.889491\n",
1125 | "8100 543 cost:10881.560547,train_auc:0.893500,test_auc:0.889474\n",
1126 | "8200 550 cost:10881.610352,train_auc:0.893482,test_auc:0.889456\n",
1127 | "8300 556 cost:10881.657227,train_auc:0.893465,test_auc:0.889440\n",
1128 | "8400 563 cost:10881.704102,train_auc:0.893447,test_auc:0.889421\n",
1129 | "8500 570 cost:10881.750977,train_auc:0.893431,test_auc:0.889410\n",
1130 | "8600 576 cost:10881.797852,train_auc:0.893416,test_auc:0.889400\n",
1131 | "8700 583 cost:10881.842773,train_auc:0.893401,test_auc:0.889388\n",
1132 | "8800 590 cost:10881.889648,train_auc:0.893389,test_auc:0.889375\n",
1133 | "8900 596 cost:10881.931641,train_auc:0.893376,test_auc:0.889363\n",
1134 | "9000 603 cost:10881.979492,train_auc:0.893363,test_auc:0.889353\n",
1135 | "9100 610 cost:10882.022461,train_auc:0.893352,test_auc:0.889344\n",
1136 | "9200 616 cost:10882.066406,train_auc:0.893340,test_auc:0.889334\n",
1137 | "9300 623 cost:10882.108398,train_auc:0.893328,test_auc:0.889323\n",
1138 | "9400 629 cost:10882.151367,train_auc:0.893317,test_auc:0.889311\n",
1139 | "9500 636 cost:10882.194336,train_auc:0.893307,test_auc:0.889305\n",
1140 | "9600 643 cost:10882.236328,train_auc:0.893295,test_auc:0.889294\n",
1141 | "9700 649 cost:10882.278320,train_auc:0.893284,test_auc:0.889286\n",
1142 | "9800 656 cost:10882.318359,train_auc:0.893269,test_auc:0.889278\n",
1143 | "9900 663 cost:10882.359375,train_auc:0.893260,test_auc:0.889270\n"
1144 | ]
1145 | }
1146 | ],
1147 | "source": [
1148 | "cost1 = tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=pred,labels=y))\n",
1149 | "cost = tf.add_n([cost1])\n",
1150 | "train_op = tf.train.FtrlOptimizer(learning_rate).minimize(cost)\n",
1151 | "time_s = time.time()\n",
1152 | "result=[]\n",
1153 | "with tf.Session() as sess:\n",
1154 | " sess.run(tf.global_variables_initializer())# 初始化\n",
1155 | " for epoch in range(0,10000):\n",
1156 | " f_dict ={x:train_x,y:train_y}\n",
1157 | " \n",
1158 | " _,cost_,predict_ = sess.run([train_op,cost,pred],feed_dict=f_dict)\n",
1159 | " \n",
1160 | " auc = roc_auc_score(train_y,predict_)\n",
1161 | " time_t =time.time()\n",
1162 | " # 测试集\n",
1163 | " if epoch % 100 ==0:\n",
1164 | " f_dict ={x:test_x,y:test_y}\n",
1165 | " _,cost_,predict_test =sess.run([train_op,cost,pred],feed_dict=f_dict)\n",
1166 | " test_auc = roc_auc_score(test_y,predict_test)\n",
1167 | " print(\"%d %1d cost:%f,train_auc:%f,test_auc:%f\"%(epoch,(time_t-time_s),cost_,auc,test_auc))\n",
1168 | " result.append([epoch,(time_t-time_s),auc,test_auc])"
1169 | ]
1170 | },
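One caveat about the training cell above: the test-set branch also passes train_op to sess.run, so the FTRL optimizer takes an update step on the test batch every 100 epochs. A leak-free evaluation step would run only cost and pred on the test feed, roughly as sketched here (hedged, shown for reference only).

# Leak-free evaluation step (sketch): omit train_op from the run list for the test feed.
if epoch % 100 == 0:
    cost_test, predict_test = sess.run([cost, pred], feed_dict={x: test_x, y: test_y})
    test_auc = roc_auc_score(test_y, predict_test)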
1171 | {
1172 | "cell_type": "code",
1173 | "execution_count": null,
1174 | "metadata": {},
1175 | "outputs": [],
1176 | "source": []
1177 | }
1178 | ],
1179 | "metadata": {
1180 | "kernelspec": {
1181 | "display_name": "Python 3",
1182 | "language": "python",
1183 | "name": "python3"
1184 | },
1185 | "language_info": {
1186 | "codemirror_mode": {
1187 | "name": "ipython",
1188 | "version": 3
1189 | },
1190 | "file_extension": ".py",
1191 | "mimetype": "text/x-python",
1192 | "name": "python",
1193 | "nbconvert_exporter": "python",
1194 | "pygments_lexer": "ipython3",
1195 | "version": "3.6.5"
1196 | }
1197 | },
1198 | "nbformat": 4,
1199 | "nbformat_minor": 2
1200 | }
1201 |
--------------------------------------------------------------------------------
/NFM/NFM.py:
--------------------------------------------------------------------------------
1 |
2 | """
3 | TensorFlow 2.0 implementation of NFM (Neural Factorization Machines)
4 | Reference:
5 | https://zhuanlan.zhihu.com/p/37522285
6 | Neural Factorization Machines for Sparse Predictive Analytics
7 | """
8 | import tensorflow as tf
9 |
10 | import pickle
11 | from util.train_model import train_test_model_demo
12 |
13 |
14 | class BiInteraction(tf.keras.layers.Layer):
15 | def __init__(self, Units=1, **kwargs):
16 | self.units = Units
17 | super(BiInteraction, self).__init__(**kwargs)
18 |
19 | def build(self, input_shape):
20 | input_dim = input_shape[2]
21 | # self.W = self.add_weight(shape=(input_dim, self.units), initializer='random_normal', trainable=True)
22 | # self.b = self.add_weight(shape=(input_dim, self.units), initializer='random_normal', trainable=True)
23 | self.linearlayer = tf.keras.layers.Dense(input_dim, activation='relu', use_bias=True)
24 |
25 | def call(self, input):
26 | # sum-square-part
27 | self.summed_features_emb = tf.reduce_sum(input,1) # None * K
28 | # print("self.summed_features_emb:",self.summed_features_emb.get_shape())
29 | self.summed_features_emb_square = tf.square(self.summed_features_emb) # None * K
30 | # square-sum-part
31 | self.squared_features_emb = tf.square(input)
32 | self.squared_sum_features_emb = tf.reduce_sum(self.squared_features_emb,1) # None * K
33 |
34 | # second order
35 | self.y_second_order = 0.5 * tf.subtract(self.summed_features_emb_square,self.squared_sum_features_emb) # None * K
36 | print("y_second_order:",self.y_second_order.get_shape()) # 128 * 10
37 | output = self.linearlayer(self.y_second_order)
38 | return output
39 |
40 | class NFM(tf.keras.Model):
41 | def __init__(self, num_feat, num_field, dropout_deep, deep_layer_sizes, embedding_size=10):
42 | super().__init__()
43 | self.num_feat = num_feat # F = total number of distinct features
44 | self.num_field = num_field # N = number of fields per sample
45 | self.dropout_deep = dropout_deep
46 |
47 | # Embedding layer of size F * M, where F is the number of features and M the embedding dimension
48 | feat_embeddings = tf.keras.layers.Embedding(num_feat, embedding_size,
49 | embeddings_initializer='uniform') # F * M
50 | self.feat_embeddings = feat_embeddings
51 |
52 | # fc layer
53 | self.deep_layer_sizes = deep_layer_sizes
54 | # layers of the deep component
55 | for i in range(len(deep_layer_sizes)):
56 | setattr(self, 'dense_' + str(i), tf.keras.layers.Dense(deep_layer_sizes[i]))
57 | setattr(self, 'batchNorm_' + str(i), tf.keras.layers.BatchNormalization())
58 | setattr(self, 'activation_' + str(i), tf.keras.layers.Activation('relu'))
59 | setattr(self, 'dropout_' + str(i), tf.keras.layers.Dropout(dropout_deep[i]))
60 | self.bilayer = BiInteraction(1)
61 | # last layer
62 | self.fc = tf.keras.layers.Dense(1, activation=None, use_bias=True)
63 |
64 | self.linearlayer = tf.keras.layers.Dense(deep_layer_sizes[-1], activation='relu', use_bias=True)
65 |
66 | def call(self, feat_index, feat_value):
67 | # call() receives the input tensors
68 | # embedding part: feat_index is the input, feat_embeddings is the embedding layer
69 | feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M
70 | # print(feat_value.get_shape())
71 | feat_embedding = tf.einsum('bnm,bn->bnm', feat_embedding_0, feat_value)
72 |
73 | y_deep = self.bilayer(feat_embedding)
74 | y_linear = self.linearlayer(tf.reduce_sum(feat_embedding,1))
75 |
76 | for i in range(len(self.deep_layer_sizes)):
77 | y_deep = getattr(self, 'dense_' + str(i))(y_deep)
78 | y_deep = getattr(self, 'batchNorm_' + str(i))(y_deep)
79 | y_deep = getattr(self, 'activation_' + str(i))(y_deep)
80 | y_deep = getattr(self, 'dropout_' + str(i))(y_deep)
81 | y = y_deep + y_linear
82 | output = self.fc(y)
83 |
84 | return output
85 | if __name__ == '__main__':
86 | AID_DATA_DIR = "../data/Criteo/"
87 | feat_dict_ = pickle.load(open(AID_DATA_DIR + '/feat_dict_10.pkl2', 'rb'))
88 |
89 | nfm = NFM(num_feat=len(feat_dict_) + 1, num_field=39, dropout_deep=[0.5, 0.5, 0.5],
90 | deep_layer_sizes=[400, 400], embedding_size=10)
91 |
92 | train_label_path = AID_DATA_DIR + 'train_label'
93 | train_idx_path = AID_DATA_DIR + 'train_idx'
94 | train_value_path = AID_DATA_DIR + 'train_value'
95 |
96 | test_label_path = AID_DATA_DIR + 'test_label'
97 | test_idx_path = AID_DATA_DIR + 'test_idx'
98 | test_value_path = AID_DATA_DIR + 'test_value'
99 |
100 | train_test_model_demo(nfm,train_label_path, train_idx_path, train_value_path)
101 |
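The BiInteraction layer above relies on the FM identity that the pairwise interaction pooling sum_{i<j} v_i ⊙ v_j equals 0.5 * ((sum_i v_i)^2 - sum_i v_i^2), computed elementwise, which is exactly the sum-square / square-sum pair in call(). A quick numerical check of that identity (illustration only, not part of NFM.py):

# Numerical check of the bi-interaction identity used in BiInteraction.call().
import numpy as np

rng = np.random.default_rng(0)
v = rng.normal(size=(5, 8))                                  # 5 fields, embedding size 8

pairwise = sum(v[i] * v[j] for i in range(5) for j in range(i + 1, 5))
pooled = 0.5 * (v.sum(axis=0) ** 2 - (v ** 2).sum(axis=0))
assert np.allclose(pairwise, pooled)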
--------------------------------------------------------------------------------
/PNN/PNN-tf2.0.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stderr",
10 | "output_type": "stream",
11 | "text": [
12 | "D:\\anaconda3\\julianxu\\lib\\site-packages\\h5py\\__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
13 | " from ._conv import register_converters as _register_converters\n",
14 | "Using TensorFlow backend.\n"
15 | ]
16 | }
17 | ],
18 | "source": [
19 | "import numpy as np\n",
20 | "import pandas as pd\n",
21 | "# from sklearn.preprocessing import OneHotEncoder,StandarScaler\n",
22 | "from sklearn.metrics import accuracy_score\n",
23 | "import random\n",
24 | "from keras.utils import to_categorical\n",
25 | "from sklearn.preprocessing import LabelEncoder\n",
26 | "\n",
27 | "from sklearn.metrics import roc_auc_score\n",
28 | "\n",
29 | "import tensorflow as tf\n",
30 | "\n",
31 | "from collections import Counter\n",
32 | "\n",
33 | "import math"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 23,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "class PNN(tf.keras.Model):\n",
43 | " def __init__(self,num_feat,num_field,dropout_deep,deep_layer_sizes,product_layer_dim=10,reg_l1=0.01,reg_l2=1e-5,embedding_size=10,product_type='outer'):\n",
44 | " super().__init__()\n",
45 | " self.reg_l1 = reg_l1\n",
46 | " self.reg_l2 = reg_l2\n",
47 | " self.num_feat = num_feat # F =features nums\n",
48 | " self.num_field = num_field # N =fields of a feature \n",
49 | " self.product_layer_dim = product_layer_dim # D1 pnn dim\n",
50 | " self.dropout_deep = dropout_deep\n",
51 | " \n",
52 | " # Embedding 这里采用embeddings层因此大小为F* M F为特征数量,M为embedding的维度\n",
53 | " feat_embeddings = tf.keras.layers.Embedding(num_feat, embedding_size, embeddings_initializer='uniform') # F * M \n",
54 | " self.feat_embeddings = feat_embeddings\n",
55 | " \n",
56 | " # 定义随机初始化\n",
57 | " initializer = tf.initializers.GlorotUniform()\n",
58 | " \n",
59 | " # linear part 线性层就是embedding层的复制,因此线性信号权重大小是D1 * N * M,为什么因此是线性层维度为 D1,embedding层维度为N* M\n",
60 | " # 因此权重大小为D1 * N *M\n",
61 | " self.linear_weights = tf.Variable(initializer(shape=(product_layer_dim,num_field,embedding_size))) # D1 * N * M\n",
62 | " \n",
63 | " # quadratic part \n",
64 | " self.product_type = product_type\n",
65 | " if product_type == 'inner':\n",
66 | " self.theta = tf.Variable(initializer(shape=(product_layer_dim,num_field))) # D1 * N\n",
67 | "\n",
68 | " else:\n",
69 | " self.quadratic_weights = tf.Variable(initializer(shape=(product_layer_dim,embedding_size, embedding_size)))# D1 * M * M\n",
70 | " \n",
71 | " # fc layer\n",
72 | " self.deep_layer_sizes = deep_layer_sizes\n",
73 | " #神经网络方面的参数\n",
74 | " for i in range(len(deep_layer_sizes)):\n",
75 | " setattr(self, 'dense_' + str(i),tf.keras.layers.Dense(deep_layer_sizes[i]))\n",
76 | " setattr(self, 'batchNorm_' + str(i),tf.keras.layers.BatchNormalization())\n",
77 | " setattr(self, 'activation_' + str(i),tf.keras.layers.Activation('relu'))\n",
78 | " setattr(self, 'dropout_' + str(i),tf.keras.layers.Dropout(dropout_deep[i]))\n",
79 | " \n",
80 | " # last layer\n",
81 | " self.fc = tf.keras.layers.Dense(1,activation=None,use_bias=True)\n",
82 | " \n",
83 | " def call(self,feat_index,feat_value):\n",
84 | " # call函数接收输入变量\n",
85 | " # embedding part feat_index = inputs为输入 feat_embeddings为一个layer。\n",
86 | " feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M \n",
87 | "# print(feat_value.get_shape())\n",
88 | " feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value)\n",
89 | " # linear part \n",
90 | " lz = tf.einsum('bnm,dnm->bd',feat_embedding,self.linear_weights) # Batch * D1\n",
91 | " \n",
92 | " # quadratic part\n",
93 | " if self.product_type == 'inner':\n",
94 | " theta = tf.einsum('bnm,dn->bdnm',feat_embedding,self.theta) # Batch * D1 * N * M \n",
95 | " lp = tf.einsum('bdnm,bdnm->bd',theta,theta) # Batch * D1\n",
96 | " else:\n",
97 | " embed_sum = tf.reduce_sum(feat_embedding,axis=1) # Batch * M\n",
98 | " p = tf.einsum('bm,bn->bmn',embed_sum,embed_sum)\n",
99 | " lp = tf.einsum('bmn,dmn->bd',p,self.quadratic_weights) # Batch * D1\n",
100 | " \n",
101 | " y_deep = tf.concat((lz,lp),axis=1)\n",
102 | " y_deep = tf.keras.layers.Dropout(self.dropout_deep[0])(y_deep)\n",
103 | " \n",
104 | " for i in range(len(self.deep_layer_sizes)):\n",
105 | " y_deep = getattr(self,'dense_' + str(i))(y_deep)\n",
106 | " y_deep = getattr(self,'batchNorm_' + str(i))(y_deep)\n",
107 | " y_deep = getattr(self,'activation_' + str(i))(y_deep)\n",
108 | " y_deep = getattr(self,'dropout_' + str(i))(y_deep)\n",
109 | " \n",
110 | " output = self.fc(y_deep)\n",
111 | " \n",
112 | " return output "
113 | ]
114 | },
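In the class above, lz is the linear signal and lp the product signal. For product_type='outer' the code uses the common sum-pooling approximation p = (Σ_n f_n)(Σ_n f_n)^T instead of materializing all N² pairwise outer products, which is why the quadratic weights have shape D1 × M × M. A short shape walk-through of that branch (hedged, illustrative sizes only):

# Shape walk-through of the outer-product branch (illustrative sizes only).
import tensorflow as tf

B, N, M, D1 = 4, 39, 10, 10                                  # batch, fields, embedding dim, product dim
feat_embedding = tf.random.normal((B, N, M))
quadratic_weights = tf.random.normal((D1, M, M))

embed_sum = tf.reduce_sum(feat_embedding, axis=1)            # (B, M) sum-pooled embedding
p = tf.einsum('bm,bn->bmn', embed_sum, embed_sum)            # (B, M, M) outer product of the pooled embedding
lp = tf.einsum('bmn,dmn->bd', p, quadratic_weights)          # (B, D1) product signal
print(lp.shape)                                              # (4, 10)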
115 | {
116 | "cell_type": "code",
117 | "execution_count": 7,
118 | "metadata": {},
119 | "outputs": [],
120 | "source": [
121 | "train = pd.read_csv(r'F:\\baidudownload\\kaggle-2014-criteo-master\\kaggle-2014-criteo-master\\train.tiny.csv')\n",
122 | "\n",
123 | "train = train.fillna(0)\n",
124 | "\n",
125 | "traindrop = train.drop(columns = ['Id'])\n",
126 | "\n",
127 | "traindrop.to_csv(r'F:\\baidudownload\\kaggle-2014-criteo-master\\kaggle-2014-criteo-master\\train.txt',sep='\\t', index=False,header=None)"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": 11,
133 | "metadata": {},
134 | "outputs": [],
135 | "source": [
136 | "freq_ = 10\n",
137 | "# dir_feat_dict_ = 'feat_dict_' + str(freq_) + '.pkl2'\n",
138 | "continuous_range_ = range(1, 14)\n",
139 | "categorical_range_ = range(14, 40)\n",
140 | "\n",
141 | "# 统计离散特征每个离散值出现的次数组成字典\n",
142 | "feat_cnt = Counter()\n",
143 | "with open(r'F:\\baidudownload\\kaggle-2014-criteo-master\\kaggle-2014-criteo-master\\train.txt', 'r') as fin:\n",
144 | " for line_idx, line in enumerate(fin):\n",
145 | " features = line.rstrip('\\n').split('\\t')\n",
146 | " for idx in categorical_range_:\n",
147 | " if features[idx] == '': continue\n",
148 | " feat_cnt.update([features[idx]])"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": 13,
154 | "metadata": {},
155 | "outputs": [],
156 | "source": [
157 | "# Only retain discrete features with high frequency\n",
158 | "dis_feat_set = set() # 高频段的离散字符\n",
159 | "for feat, ot in feat_cnt.items():\n",
160 | " if ot >= freq_:\n",
161 | " dis_feat_set.add(feat)"
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": 14,
167 | "metadata": {},
168 | "outputs": [],
169 | "source": [
170 | "# Create a dictionary for continuous and discrete features\n",
171 | "feat_dict = {}\n",
172 | "tc = 1\n",
173 | "# Continuous features\n",
174 | "for idx in continuous_range_:\n",
175 | " feat_dict[idx] = tc\n",
176 | " tc += 1 # 代表占据一列\n",
177 | "\n",
178 | "# Discrete features\n",
179 | "cnt_feat_set = set()\n",
180 | "with open(r'F:\\baidudownload\\kaggle-2014-criteo-master\\kaggle-2014-criteo-master\\train.txt', 'r') as fin:\n",
181 | " for line_idx, line in enumerate(fin):\n",
182 | " features = line.rstrip('\\n').split('\\t')\n",
183 | " for idx in categorical_range_:\n",
184 | " # 排除空字符和低频离散字符\n",
185 | " if features[idx] == '' or features[idx] not in dis_feat_set:\n",
186 | " continue\n",
187 | " # 排除连续性数值\n",
188 | " if features[idx] not in cnt_feat_set:\n",
189 | " cnt_feat_set.add(features[idx])\n",
190 | " # 获取种类数\n",
191 | " feat_dict[features[idx]] = tc\n",
192 | " tc += 1"
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": 16,
198 | "metadata": {},
199 | "outputs": [],
200 | "source": [
201 | "file_path = \"F:\\\\baidudownload\\\\kaggle-2014-criteo-master\\\\kaggle-2014-criteo-master\\\\\""
202 | ]
203 | },
204 | {
205 | "cell_type": "code",
206 | "execution_count": 18,
207 | "metadata": {},
208 | "outputs": [],
209 | "source": [
210 | "cont_features=['I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9',\n",
211 | " 'I10', 'I11', 'I12', 'I13']\n",
212 | "dist_features = ['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7',\n",
213 | " 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17',\n",
214 | " 'C18', 'C19', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26']"
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": 21,
220 | "metadata": {},
221 | "outputs": [],
222 | "source": [
223 | "train_label = []\n",
224 | "train_value = []\n",
225 | "train_idx = []\n",
226 | "test_label = []\n",
227 | "test_value = []\n",
228 | "test_idx = []\n",
229 | "\n",
230 | "continuous_range_ = range(1, 14)\n",
231 | "categorical_range_ = range(14, 40)\n",
232 | "cont_max_=[]\n",
233 | "cont_min_=[]\n",
234 | "for cf in cont_features:\n",
235 | " cont_max_.append(max(train[cf]))\n",
236 | " cont_min_.append(min(train[cf]))\n",
237 | "cont_diff_ = [cont_max_[i] - cont_min_[i] for i in range(len(cont_min_))]\n",
238 | "\n",
239 | "def process_line_(line):\n",
240 | " features = line.rstrip('\\n').split('\\t')\n",
241 | " feat_idx, feat_value, label = [], [], []\n",
242 | "\n",
243 | " # MinMax Normalization\n",
244 | " for idx in continuous_range_:\n",
245 | " if features[idx] == '':\n",
246 | " feat_idx.append(0)\n",
247 | " feat_value.append(0.0)\n",
248 | " else:\n",
249 | " feat_idx.append(feat_dict[idx])\n",
250 | " # 归一化\n",
251 | " feat_value.append(round((float(features[idx]) - cont_min_[idx - 1]) / cont_diff_[idx - 1], 6))\n",
252 | "\n",
253 | " # 处理离散型数据\n",
254 | " for idx in categorical_range_:\n",
255 | " if features[idx] == '' or features[idx] not in feat_dict:\n",
256 | " feat_idx.append(0)\n",
257 | " feat_value.append(0.0)\n",
258 | " else:\n",
259 | " feat_idx.append(feat_dict[features[idx]])\n",
260 | " feat_value.append(1.0)\n",
261 | " return feat_idx, feat_value, [int(features[0])]\n",
262 | "split_ratio = 0.9\n",
263 | "with open(file_path + 'train.txt', 'r') as fin:\n",
264 | " for line_idx, line in enumerate(fin):\n",
265 | "\n",
266 | " feat_idx, feat_value, label = process_line_(line)\n",
267 | " if np.random.random() <= split_ratio:\n",
268 | " train_label.append(label)\n",
269 | " train_idx.append(feat_idx)\n",
270 | " train_value.append(feat_value)\n",
271 | " else:\n",
272 | " test_label.append(label)\n",
273 | " test_idx.append(feat_idx)\n",
274 | " test_value.append(feat_value)"
275 | ]
276 | },
277 | {
278 | "cell_type": "code",
279 | "execution_count": 24,
280 | "metadata": {},
281 | "outputs": [],
282 | "source": [
283 | "pnn = PNN(num_feat=len(feat_dict) + 1, num_field=39, dropout_deep=[0.5, 0.5, 0.5],\n",
284 | " deep_layer_sizes=[400, 400], product_layer_dim=10,\n",
285 | " reg_l1=0.01, reg_l2=1e-5, embedding_size=10, product_type='outer')"
286 | ]
287 | },
288 | {
289 | "cell_type": "code",
290 | "execution_count": 25,
291 | "metadata": {},
292 | "outputs": [],
293 | "source": [
294 | "train_ds = tf.data.Dataset.from_tensor_slices(\n",
295 | " (train_label,train_idx,train_value)).shuffle(10000).batch(32)"
296 | ]
297 | },
298 | {
299 | "cell_type": "code",
300 | "execution_count": 26,
301 | "metadata": {},
302 | "outputs": [],
303 | "source": [
304 | "@tf.function\n",
305 | "def train_one_step(model, optimizer, idx, value, label):\n",
306 | " with tf.GradientTape() as tape:\n",
307 | " output = model(idx,value)\n",
308 | " loss = loss_object(y_true=label, y_pred=output)\n",
309 | " grads = tape.gradient(loss, model.trainable_variables)\n",
310 | " grads = [tf.clip_by_norm(g, 100) for g in grads]\n",
311 | " optimizer.apply_gradients(grads_and_vars=zip(grads, model.trainable_variables))\n",
312 | " \n",
313 | " train_loss(loss)\n",
314 | " train_accuracy(label,output)"
315 | ]
316 | },
317 | {
318 | "cell_type": "code",
319 | "execution_count": 27,
320 | "metadata": {},
321 | "outputs": [],
322 | "source": [
323 | "train_loss = tf.keras.metrics.Mean(name='train_loss')\n",
324 | "train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_acc')\n",
325 | "\n",
326 | "loss_object = tf.keras.losses.BinaryCrossentropy()\n",
327 | "\n",
328 | "optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)"
329 | ]
330 | },
331 | {
332 | "cell_type": "code",
333 | "execution_count": 35,
334 | "metadata": {
335 | "scrolled": true
336 | },
337 | "outputs": [
338 | {
339 | "name": "stdout",
340 | "output_type": "stream",
341 | "text": [
342 | "WARNING:tensorflow:Entity > could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: Failed to parse source code of >, which Python reported as:\n",
343 | " def call(self,feat_index,feat_value):\n",
344 | " # call函数接收输入变量\n",
345 | " # embedding part feat_index = inputs为输入 feat_embeddings为一个layer。\n",
346 | " feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M \n",
347 | "# print(feat_value.get_shape())\n",
348 | " feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value)\n",
349 | " # linear part \n",
350 | " lz = tf.einsum('bnm,dnm->bd',feat_embedding,self.linear_weights) # Batch * D1\n",
351 | " \n",
352 | " # quadratic part\n",
353 | " if self.product_type == 'inner':\n",
354 | " theta = tf.einsum('bnm,dn->bdnm',feat_embedding,self.theta) # Batch * D1 * N * M \n",
355 | " lp = tf.einsum('bdnm,bdnm->bd',theta,theta) # Batch * D1\n",
356 | " else:\n",
357 | " embed_sum = tf.reduce_sum(feat_embedding,axis=1) # Batch * M\n",
358 | " p = tf.einsum('bm,bn->bmn',embed_sum,embed_sum)\n",
359 | " lp = tf.einsum('bmn,dmn->bd',p,self.quadratic_weights) # Batch * D1\n",
360 | " \n",
361 | " y_deep = tf.concat((lz,lp),axis=1)\n",
362 | " y_deep = tf.keras.layers.Dropout(self.dropout_deep[0])(y_deep)\n",
363 | " \n",
364 | " for i in range(len(self.deep_layer_sizes)):\n",
365 | " y_deep = getattr(self,'dense_' + str(i))(y_deep)\n",
366 | " y_deep = getattr(self,'batchNorm_' + str(i))(y_deep)\n",
367 | " y_deep = getattr(self,'activation_' + str(i))(y_deep)\n",
368 | " y_deep = getattr(self,'dropout_' + str(i))(y_deep)\n",
369 | " \n",
370 | " output = self.fc(y_deep)\n",
371 | " \n",
372 | " return output \n",
373 | "\n",
374 | "This may be caused by multiline strings or comments not indented at the same level as the code.\n",
375 | "WARNING: Entity > could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: Failed to parse source code of >, which Python reported as:\n",
376 | " def call(self,feat_index,feat_value):\n",
377 | " # call函数接收输入变量\n",
378 | " # embedding part feat_index = inputs为输入 feat_embeddings为一个layer。\n",
379 | " feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M \n",
380 | "# print(feat_value.get_shape())\n",
381 | " feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value)\n",
382 | " # linear part \n",
383 | " lz = tf.einsum('bnm,dnm->bd',feat_embedding,self.linear_weights) # Batch * D1\n",
384 | " \n",
385 | " # quadratic part\n",
386 | " if self.product_type == 'inner':\n",
387 | " theta = tf.einsum('bnm,dn->bdnm',feat_embedding,self.theta) # Batch * D1 * N * M \n",
388 | " lp = tf.einsum('bdnm,bdnm->bd',theta,theta) # Batch * D1\n",
389 | " else:\n",
390 | " embed_sum = tf.reduce_sum(feat_embedding,axis=1) # Batch * M\n",
391 | " p = tf.einsum('bm,bn->bmn',embed_sum,embed_sum)\n",
392 | " lp = tf.einsum('bmn,dmn->bd',p,self.quadratic_weights) # Batch * D1\n",
393 | " \n",
394 | " y_deep = tf.concat((lz,lp),axis=1)\n",
395 | " y_deep = tf.keras.layers.Dropout(self.dropout_deep[0])(y_deep)\n",
396 | " \n",
397 | " for i in range(len(self.deep_layer_sizes)):\n",
398 | " y_deep = getattr(self,'dense_' + str(i))(y_deep)\n",
399 | " y_deep = getattr(self,'batchNorm_' + str(i))(y_deep)\n",
400 | " y_deep = getattr(self,'activation_' + str(i))(y_deep)\n",
401 | " y_deep = getattr(self,'dropout_' + str(i))(y_deep)\n",
402 | " \n",
403 | " output = self.fc(y_deep)\n",
404 | " \n",
405 | " return output \n",
406 | "\n",
407 | "This may be caused by multiline strings or comments not indented at the same level as the code.\n",
408 | "WARNING:tensorflow:Entity > could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: Failed to parse source code of >, which Python reported as:\n",
409 | " def call(self,feat_index,feat_value):\n",
410 | " # call函数接收输入变量\n",
411 | " # embedding part feat_index = inputs为输入 feat_embeddings为一个layer。\n",
412 | " feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M \n",
413 | "# print(feat_value.get_shape())\n",
414 | " feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value)\n",
415 | " # linear part \n",
416 | " lz = tf.einsum('bnm,dnm->bd',feat_embedding,self.linear_weights) # Batch * D1\n",
417 | " \n",
418 | " # quadratic part\n",
419 | " if self.product_type == 'inner':\n",
420 | " theta = tf.einsum('bnm,dn->bdnm',feat_embedding,self.theta) # Batch * D1 * N * M \n",
421 | " lp = tf.einsum('bdnm,bdnm->bd',theta,theta) # Batch * D1\n",
422 | " else:\n",
423 | " embed_sum = tf.reduce_sum(feat_embedding,axis=1) # Batch * M\n",
424 | " p = tf.einsum('bm,bn->bmn',embed_sum,embed_sum)\n",
425 | " lp = tf.einsum('bmn,dmn->bd',p,self.quadratic_weights) # Batch * D1\n",
426 | " \n",
427 | " y_deep = tf.concat((lz,lp),axis=1)\n",
428 | " y_deep = tf.keras.layers.Dropout(self.dropout_deep[0])(y_deep)\n",
429 | " \n",
430 | " for i in range(len(self.deep_layer_sizes)):\n",
431 | " y_deep = getattr(self,'dense_' + str(i))(y_deep)\n",
432 | " y_deep = getattr(self,'batchNorm_' + str(i))(y_deep)\n",
433 | " y_deep = getattr(self,'activation_' + str(i))(y_deep)\n",
434 | " y_deep = getattr(self,'dropout_' + str(i))(y_deep)\n",
435 | " \n",
436 | " output = self.fc(y_deep)\n",
437 | " \n",
438 | " return output \n",
439 | "\n",
440 | "This may be caused by multiline strings or comments not indented at the same level as the code.\n",
441 | "WARNING: Entity > could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: Failed to parse source code of >, which Python reported as:\n",
442 | " def call(self,feat_index,feat_value):\n",
443 | " # call函数接收输入变量\n",
444 | " # embedding part feat_index = inputs为输入 feat_embeddings为一个layer。\n",
445 | " feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M \n",
446 | "# print(feat_value.get_shape())\n",
447 | " feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value)\n",
448 | " # linear part \n",
449 | " lz = tf.einsum('bnm,dnm->bd',feat_embedding,self.linear_weights) # Batch * D1\n",
450 | " \n",
451 | " # quadratic part\n",
452 | " if self.product_type == 'inner':\n",
453 | " theta = tf.einsum('bnm,dn->bdnm',feat_embedding,self.theta) # Batch * D1 * N * M \n",
454 | " lp = tf.einsum('bdnm,bdnm->bd',theta,theta) # Batch * D1\n",
455 | " else:\n",
456 | " embed_sum = tf.reduce_sum(feat_embedding,axis=1) # Batch * M\n",
457 | " p = tf.einsum('bm,bn->bmn',embed_sum,embed_sum)\n",
458 | " lp = tf.einsum('bmn,dmn->bd',p,self.quadratic_weights) # Batch * D1\n",
459 | " \n",
460 | " y_deep = tf.concat((lz,lp),axis=1)\n",
461 | " y_deep = tf.keras.layers.Dropout(self.dropout_deep[0])(y_deep)\n",
462 | " \n",
463 | " for i in range(len(self.deep_layer_sizes)):\n",
464 | " y_deep = getattr(self,'dense_' + str(i))(y_deep)\n",
465 | " y_deep = getattr(self,'batchNorm_' + str(i))(y_deep)\n",
466 | " y_deep = getattr(self,'activation_' + str(i))(y_deep)\n",
467 | " y_deep = getattr(self,'dropout_' + str(i))(y_deep)\n",
468 | " \n",
469 | " output = self.fc(y_deep)\n",
470 | " \n",
471 | " return output \n",
472 | "\n",
473 | "This may be caused by multiline strings or comments not indented at the same level as the code.\n",
474 | "WARNING:tensorflow:Entity > could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: Failed to parse source code of >, which Python reported as:\n",
475 | " def call(self,feat_index,feat_value):\n",
476 | " # call函数接收输入变量\n",
477 | " # embedding part feat_index = inputs为输入 feat_embeddings为一个layer。\n",
478 | " feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M \n",
479 | "# print(feat_value.get_shape())\n",
480 | " feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value)\n",
481 | " # linear part \n",
482 | " lz = tf.einsum('bnm,dnm->bd',feat_embedding,self.linear_weights) # Batch * D1\n",
483 | " \n",
484 | " # quadratic part\n",
485 | " if self.product_type == 'inner':\n",
486 | " theta = tf.einsum('bnm,dn->bdnm',feat_embedding,self.theta) # Batch * D1 * N * M \n",
487 | " lp = tf.einsum('bdnm,bdnm->bd',theta,theta) # Batch * D1\n",
488 | " else:\n",
489 | " embed_sum = tf.reduce_sum(feat_embedding,axis=1) # Batch * M\n",
490 | " p = tf.einsum('bm,bn->bmn',embed_sum,embed_sum)\n",
491 | " lp = tf.einsum('bmn,dmn->bd',p,self.quadratic_weights) # Batch * D1\n",
492 | " \n",
493 | " y_deep = tf.concat((lz,lp),axis=1)\n",
494 | " y_deep = tf.keras.layers.Dropout(self.dropout_deep[0])(y_deep)\n",
495 | " \n",
496 | " for i in range(len(self.deep_layer_sizes)):\n",
497 | " y_deep = getattr(self,'dense_' + str(i))(y_deep)\n",
498 | " y_deep = getattr(self,'batchNorm_' + str(i))(y_deep)\n",
499 | " y_deep = getattr(self,'activation_' + str(i))(y_deep)\n",
500 | " y_deep = getattr(self,'dropout_' + str(i))(y_deep)\n",
501 | " \n",
502 | " output = self.fc(y_deep)\n",
503 | " \n",
504 | " return output \n",
505 | "\n",
506 | "This may be caused by multiline strings or comments not indented at the same level as the code.\n"
507 | ]
508 | },
509 | {
510 | "name": "stdout",
511 | "output_type": "stream",
512 | "text": [
513 | "WARNING: Entity > could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: Failed to parse source code of >, which Python reported as:\n",
514 | " def call(self,feat_index,feat_value):\n",
515 | " # call函数接收输入变量\n",
516 | " # embedding part feat_index = inputs为输入 feat_embeddings为一个layer。\n",
517 | " feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M \n",
518 | "# print(feat_value.get_shape())\n",
519 | " feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value)\n",
520 | " # linear part \n",
521 | " lz = tf.einsum('bnm,dnm->bd',feat_embedding,self.linear_weights) # Batch * D1\n",
522 | " \n",
523 | " # quadratic part\n",
524 | " if self.product_type == 'inner':\n",
525 | " theta = tf.einsum('bnm,dn->bdnm',feat_embedding,self.theta) # Batch * D1 * N * M \n",
526 | " lp = tf.einsum('bdnm,bdnm->bd',theta,theta) # Batch * D1\n",
527 | " else:\n",
528 | " embed_sum = tf.reduce_sum(feat_embedding,axis=1) # Batch * M\n",
529 | " p = tf.einsum('bm,bn->bmn',embed_sum,embed_sum)\n",
530 | " lp = tf.einsum('bmn,dmn->bd',p,self.quadratic_weights) # Batch * D1\n",
531 | " \n",
532 | " y_deep = tf.concat((lz,lp),axis=1)\n",
533 | " y_deep = tf.keras.layers.Dropout(self.dropout_deep[0])(y_deep)\n",
534 | " \n",
535 | " for i in range(len(self.deep_layer_sizes)):\n",
536 | " y_deep = getattr(self,'dense_' + str(i))(y_deep)\n",
537 | " y_deep = getattr(self,'batchNorm_' + str(i))(y_deep)\n",
538 | " y_deep = getattr(self,'activation_' + str(i))(y_deep)\n",
539 | " y_deep = getattr(self,'dropout_' + str(i))(y_deep)\n",
540 | " \n",
541 | " output = self.fc(y_deep)\n",
542 | " \n",
543 | " return output \n",
544 | "\n",
545 | "This may be caused by multiline strings or comments not indented at the same level as the code.\n",
546 | "Epoch 1, Loss: 0.6576068997383118, Accuracy: 0.7935805320739746\n",
547 | "Epoch 2, Loss: 0.5885103344917297, Accuracy: 0.7927504181861877\n",
548 | "Epoch 3, Loss: 0.5613061785697937, Accuracy: 0.7932115793228149\n",
549 | "Epoch 4, Loss: 0.5463097095489502, Accuracy: 0.7933038473129272\n",
550 | "Epoch 5, Loss: 0.5362721681594849, Accuracy: 0.7933591604232788\n",
551 | "Epoch 6, Loss: 0.5272665023803711, Accuracy: 0.7933960556983948\n",
552 | "Epoch 7, Loss: 0.519040048122406, Accuracy: 0.7934224009513855\n",
553 | "Epoch 8, Loss: 0.5109833478927612, Accuracy: 0.7934421896934509\n",
554 | "Epoch 9, Loss: 0.5033180713653564, Accuracy: 0.7937650084495544\n",
555 | "Epoch 10, Loss: 0.4961019456386566, Accuracy: 0.7946319580078125\n",
556 | "Epoch 11, Loss: 0.4890766143798828, Accuracy: 0.7957438230514526\n",
557 | "Epoch 12, Loss: 0.4827176034450531, Accuracy: 0.7968548536300659\n",
558 | "Epoch 13, Loss: 0.47674980759620667, Accuracy: 0.797752320766449\n",
559 | "Epoch 14, Loss: 0.4711345434188843, Accuracy: 0.7988378405570984\n",
560 | "Epoch 15, Loss: 0.4657707214355469, Accuracy: 0.800000011920929\n",
561 | "Epoch 16, Loss: 0.46048682928085327, Accuracy: 0.8010168671607971\n",
562 | "Epoch 17, Loss: 0.45555245876312256, Accuracy: 0.8017839193344116\n",
563 | "Epoch 18, Loss: 0.45082414150238037, Accuracy: 0.8027116656303406\n",
564 | "Epoch 19, Loss: 0.44618847966194153, Accuracy: 0.8038330674171448\n",
565 | "Epoch 20, Loss: 0.4417554438114166, Accuracy: 0.8047869205474854\n",
566 | "Epoch 21, Loss: 0.4372897744178772, Accuracy: 0.8057553768157959\n",
567 | "Epoch 22, Loss: 0.4330126941204071, Accuracy: 0.8065603375434875\n",
568 | "Epoch 23, Loss: 0.4288385808467865, Accuracy: 0.8073434233665466\n",
569 | "Epoch 24, Loss: 0.4247806966304779, Accuracy: 0.8084993362426758\n",
570 | "Epoch 25, Loss: 0.4208265244960785, Accuracy: 0.8095406889915466\n",
571 | "Epoch 26, Loss: 0.4169451892375946, Accuracy: 0.8103954792022705\n",
572 | "Epoch 27, Loss: 0.4130288362503052, Accuracy: 0.8114328980445862\n",
573 | "Epoch 28, Loss: 0.40920114517211914, Accuracy: 0.8125543594360352\n",
574 | "Epoch 29, Loss: 0.40538835525512695, Accuracy: 0.8135412335395813\n",
575 | "Epoch 30, Loss: 0.4015306830406189, Accuracy: 0.8148680925369263\n",
576 | "Epoch 31, Loss: 0.3976829946041107, Accuracy: 0.8161808252334595\n",
577 | "Epoch 32, Loss: 0.3940720558166504, Accuracy: 0.8173941373825073\n",
578 | "Epoch 33, Loss: 0.3902757167816162, Accuracy: 0.8186178207397461\n",
579 | "Epoch 34, Loss: 0.3865205645561218, Accuracy: 0.8198021054267883\n",
580 | "Epoch 35, Loss: 0.3826711177825928, Accuracy: 0.8213613629341125\n",
581 | "Epoch 36, Loss: 0.3788437247276306, Accuracy: 0.8228186964988708\n",
582 | "Epoch 37, Loss: 0.37512916326522827, Accuracy: 0.8244065642356873\n",
583 | "Epoch 38, Loss: 0.37138548493385315, Accuracy: 0.8259109258651733\n",
584 | "Epoch 39, Loss: 0.3679004907608032, Accuracy: 0.8273097276687622\n",
585 | "Epoch 40, Loss: 0.36420953273773193, Accuracy: 0.8287769556045532\n",
586 | "Epoch 41, Loss: 0.3605599105358124, Accuracy: 0.8302671313285828\n",
587 | "Epoch 42, Loss: 0.3569217622280121, Accuracy: 0.8318312168121338\n",
588 | "Epoch 43, Loss: 0.3537658751010895, Accuracy: 0.8332582712173462\n",
589 | "Epoch 44, Loss: 0.35020676255226135, Accuracy: 0.8348342180252075\n",
590 | "Epoch 45, Loss: 0.3465138077735901, Accuracy: 0.8364631533622742\n",
591 | "Epoch 46, Loss: 0.34285783767700195, Accuracy: 0.8380813598632812\n",
592 | "Epoch 47, Loss: 0.3392927646636963, Accuracy: 0.8396543264389038\n",
593 | "Epoch 48, Loss: 0.33572396636009216, Accuracy: 0.8413115739822388\n",
594 | "Epoch 49, Loss: 0.33254799246788025, Accuracy: 0.8427882790565491\n",
595 | "Epoch 50, Loss: 0.3292645514011383, Accuracy: 0.8442501425743103\n"
596 | ]
597 | }
598 | ],
599 | "source": [
600 | "EPOCHS = 50\n",
601 | "for epoch in range(EPOCHS):\n",
602 | " for label, idx, value in train_ds:\n",
603 | " train_one_step(pnn,optimizer,idx, value,label)\n",
604 | " template = 'Epoch {}, Loss: {}, Accuracy: {}'\n",
605 | " print (template.format(epoch+1,\n",
606 | " train_loss.result(),train_accuracy.result()))"
607 | ]
608 | },
609 | {
610 | "cell_type": "code",
611 | "execution_count": null,
612 | "metadata": {},
613 | "outputs": [],
614 | "source": []
615 | }
616 | ],
617 | "metadata": {
618 | "kernelspec": {
619 | "display_name": "Python 3",
620 | "language": "python",
621 | "name": "python3"
622 | },
623 | "language_info": {
624 | "codemirror_mode": {
625 | "name": "ipython",
626 | "version": 3
627 | },
628 | "file_extension": ".py",
629 | "mimetype": "text/x-python",
630 | "name": "python",
631 | "nbconvert_exporter": "python",
632 | "pygments_lexer": "ipython3",
633 | "version": "3.6.5"
634 | }
635 | },
636 | "nbformat": 4,
637 | "nbformat_minor": 2
638 | }
639 |
--------------------------------------------------------------------------------
/PNN/PNN.py:
--------------------------------------------------------------------------------
1 |
2 | """
3 | TensorFlow 2.0 implementation of Product-based Neural Network[1]
4 | Reference:
5 | [1] Product-based Neural Networks for User Response Prediction,
6 | Yanru Qu, Han Cai, Kan Ren, Weinan Zhang, Yong Yu, Ying Wen, Jun Wang
7 | [2] Tensorflow implementation of PNN
8 | https://github.com/Snail110/Awesome-RecSystem-Models/blob/master/Model/PNN_TensorFlow.py
9 | """
10 | import tensorflow as tf
11 |
12 | import pickle
13 | from util.train_model import train_test_model_demo
14 | class PNN(tf.keras.Model):
15 | def __init__(self, num_feat, num_field, dropout_deep, deep_layer_sizes, product_layer_dim=10, reg_l1=0.01,
16 | reg_l2=1e-5, embedding_size=10, product_type='outer'):
17 | super().__init__()
18 | self.reg_l1 = reg_l1
19 | self.reg_l2 = reg_l2
20 |         self.num_feat = num_feat  # F = number of features
21 |         self.num_field = num_field  # N = number of fields per sample
22 |         self.product_layer_dim = product_layer_dim  # D1 = product layer dimension
23 | self.dropout_deep = dropout_deep
24 |
25 |         # Embedding layer of size F * M (F = number of features, M = embedding dimension)
26 | feat_embeddings = tf.keras.layers.Embedding(num_feat, embedding_size,
27 | embeddings_initializer='uniform') # F * M
28 | self.feat_embeddings = feat_embeddings
29 |
30 |         # Glorot-uniform random initializer
31 | initializer = tf.initializers.GlorotUniform()
32 |
33 |         # linear part: the linear signal works directly on the embedding output (N * M)
34 |         # and is projected to D1, so the weight tensor has size D1 * N * M
35 | self.linear_weights = tf.Variable(
36 | initializer(shape=(product_layer_dim, num_field, embedding_size))) # D1 * N * M
37 |
38 | # quadratic part
39 | self.product_type = product_type
40 | if product_type == 'inner':
41 | self.theta = tf.Variable(initializer(shape=(product_layer_dim, num_field))) # D1 * N
42 |
43 | else:
44 | self.quadratic_weights = tf.Variable(
45 | initializer(shape=(product_layer_dim, embedding_size, embedding_size))) # D1 * M * M
46 |
47 | # fc layer
48 | self.deep_layer_sizes = deep_layer_sizes
49 |         # parameters of the deep (MLP) part
50 | for i in range(len(deep_layer_sizes)):
51 | setattr(self, 'dense_' + str(i), tf.keras.layers.Dense(deep_layer_sizes[i]))
52 | setattr(self, 'batchNorm_' + str(i), tf.keras.layers.BatchNormalization())
53 | setattr(self, 'activation_' + str(i), tf.keras.layers.Activation('relu'))
54 | setattr(self, 'dropout_' + str(i), tf.keras.layers.Dropout(dropout_deep[i]))
55 |
56 | # last layer
57 | self.fc = tf.keras.layers.Dense(1, activation=None, use_bias=True)
58 |
59 | def call(self, feat_index, feat_value):
60 |         # call receives feat_index and feat_value as inputs
61 |         # embedding part: feat_index is the input; feat_embeddings is an Embedding layer
62 | feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M
63 | # print(feat_value.get_shape())
64 | feat_embedding = tf.einsum('bnm,bn->bnm', feat_embedding_0, feat_value)
65 | # linear part
66 | lz = tf.einsum('bnm,dnm->bd', feat_embedding, self.linear_weights) # Batch * D1
67 |
68 | # quadratic part
69 | if self.product_type == 'inner':
70 | theta = tf.einsum('bnm,dn->bdnm', feat_embedding, self.theta) # Batch * D1 * N * M
71 | lp = tf.einsum('bdnm,bdnm->bd', theta, theta) # Batch * D1
72 | else:
73 | embed_sum = tf.reduce_sum(feat_embedding, axis=1) # Batch * M
74 | p = tf.einsum('bm,bn->bmn', embed_sum, embed_sum)
75 | lp = tf.einsum('bmn,dmn->bd', p, self.quadratic_weights) # Batch * D1
76 |
77 | y_deep = tf.concat((lz, lp), axis=1)
78 | y_deep = tf.keras.layers.Dropout(self.dropout_deep[0])(y_deep)
79 |
80 | for i in range(len(self.deep_layer_sizes)):
81 | y_deep = getattr(self, 'dense_' + str(i))(y_deep)
82 | y_deep = getattr(self, 'batchNorm_' + str(i))(y_deep)
83 | y_deep = getattr(self, 'activation_' + str(i))(y_deep)
84 | y_deep = getattr(self, 'dropout_' + str(i))(y_deep)
85 |
86 | output = self.fc(y_deep)
87 |
88 | return output
89 | if __name__ == '__main__':
90 | AID_DATA_DIR = "../data/Criteo/"
91 | feat_dict_ = pickle.load(open(AID_DATA_DIR + '/feat_dict_10.pkl2', 'rb'))
92 |
93 | pnn = PNN(num_feat=len(feat_dict_) + 1, num_field=39, dropout_deep=[0.5, 0.5, 0.5],
94 | deep_layer_sizes=[400, 400], product_layer_dim=10,
95 | reg_l1=0.01, reg_l2=1e-5, embedding_size=10, product_type='outer')
96 |
97 | train_label_path = AID_DATA_DIR + 'train_label'
98 | train_idx_path = AID_DATA_DIR + 'train_idx'
99 | train_value_path = AID_DATA_DIR + 'train_value'
100 |
101 | test_label_path = AID_DATA_DIR + 'test_label'
102 | test_idx_path = AID_DATA_DIR + 'test_idx'
103 | test_value_path = AID_DATA_DIR + 'test_value'
104 |
105 | train_test_model_demo(pnn,train_label_path, train_idx_path, train_value_path)
106 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # recsys
2 |
3 | ## 1. Requirements
4 | Note that the models here are built mainly with the TensorFlow 2.0 API.
5 |
6 | TensorFlow 2.0, Keras, Python 3.6, NumPy, scikit-learn, Pandas
7 |
8 | ## 2.Datasets
9 |
10 | ### 2.1 Criteo
11 |
12 | This dataset contains about 45 million records. There are 13 features taking integer values (mostly count features) and 26 categorical features.
13 | The dataset is available at http://labs.criteo.com/2014/02/download-kaggle-display-advertising-challenge-dataset/
14 |
15 | Only a small sample of the data is used for model training here: data = ../data/Criteo/train.txt (a quick way to inspect it is sketched below).
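
A minimal sketch of peeking at the sampled file, assuming pandas; the column names I1-I13 / C1-C26 are just the conventional Criteo names (the raw file has no header):

```python
import pandas as pd

# label + 13 integer features + 26 categorical features, tab-separated, no header
cols = ['label'] + [f'I{i}' for i in range(1, 14)] + [f'C{i}' for i in range(1, 27)]
df = pd.read_csv('data/Criteo/train.txt', sep='\t', names=cols)
print(df.shape)
print(df.head())
```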
16 |
17 | ### 2.2 Seguro-safe-driver
18 |
19 | In the train and test data, features that belong to similar groupings are
20 | tagged as such in the feature names (e.g., ind, reg, car, calc). In addition,
21 | feature names include the postfix bin to indicate binary features and
22 | cat to indicate categorical features. Features without these designations
23 | are either continuous or ordinal. Values of -1 indicate that the feature was
24 | missing from the observation. The target column signifies whether or not a
25 | claim was filed for that policy holder.
26 |
27 | The dataset is available at https://www.kaggle.com/c/porto-seguro-safe-driver-prediction
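
A minimal sketch of how this naming convention can be used, assuming pandas and the local copy at data/Driver/train.csv (the _bin / _cat suffixes and the target column are taken from the description above):

```python
import numpy as np
import pandas as pd

df = pd.read_csv('data/Driver/train.csv')
bin_cols = [c for c in df.columns if c.endswith('_bin')]   # binary features
cat_cols = [c for c in df.columns if c.endswith('_cat')]   # categorical features
df = df.replace(-1, np.nan)                                # -1 marks a missing value
print(len(bin_cols), 'binary columns,', len(cat_cols), 'categorical columns')
print('positive rate:', df['target'].mean())
```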
28 |
29 | ## 3. Recommender Systems in Practice
30 |
31 | 
32 | Source: https://zhuanlan.zhihu.com/p/69050253
33 |
34 | 
35 | Source: https://zhuanlan.zhihu.com/p/53231955
36 | ### 3.1 Chapter 1: Collaborative Filtering
37 |
38 | ### 3.2 Chapter 2: GBDT+LR
39 |
40 | Essentially, GBDT+LR is a binary classifier built on the stacking idea, so it is used for binary classification problems. The method comes from Facebook's 2014 paper Practical Lessons from Predicting Clicks on Ads at Facebook. A minimal sketch of the idea is shown below.
41 | https://zhuanlan.zhihu.com/p/29053940
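
A minimal, hypothetical sketch of the stacking idea with scikit-learn (illustrative only, not the code in GBDT_LR.ipynb): each GBDT tree maps a sample to a leaf, the leaf indices are one-hot encoded, and LR is trained on those encodings.

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

X, y = make_classification(n_samples=2000, n_features=20, random_state=42)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)

# 1. fit the GBDT; apply() returns the leaf index of every sample in every tree
gbdt = GradientBoostingClassifier(n_estimators=50, max_depth=3, random_state=42)
gbdt.fit(X_tr, y_tr)
leaves_tr = gbdt.apply(X_tr)[:, :, 0]   # shape (n_samples, n_estimators)
leaves_te = gbdt.apply(X_te)[:, :, 0]

# 2. one-hot encode the leaf indices and train LR on them
enc = OneHotEncoder(handle_unknown='ignore')
lr = LogisticRegression(max_iter=1000)
lr.fit(enc.fit_transform(leaves_tr), y_tr)
print('test accuracy:', lr.score(enc.transform(leaves_te), y_te))
```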
42 |
43 | ### 3.3 Chapter 3: MLR
44 |
45 | A simple implementation: we only provide a basic TensorFlow version of the MLR model here (a sketch of the idea follows below).
46 | https://www.jianshu.com/p/627fc0d755b2
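
For reference, a minimal TF2 sketch of the MLR (LS-PLM) idea (an illustration only, not the code in MLR.ipynb): m softmax "dividers" softly assign a sample to regions, m sigmoid "fitters" give a per-region probability, and the two are mixed.

```python
import tensorflow as tf

class MLRSketch(tf.keras.Model):
    def __init__(self, m=4):
        super().__init__()
        self.divider = tf.keras.layers.Dense(m, use_bias=False)  # region weights u_1..u_m
        self.fitter = tf.keras.layers.Dense(m, use_bias=False)   # per-region LR weights w_1..w_m

    def call(self, x):
        gate = tf.nn.softmax(self.divider(x), axis=1)  # Batch * m, soft region assignment
        prob = tf.nn.sigmoid(self.fitter(x))           # Batch * m, per-region probability
        return tf.reduce_sum(gate * prob, axis=1, keepdims=True)  # Batch * 1

model = MLRSketch(m=4)
print(model(tf.random.normal([32, 10])).shape)  # (32, 1)
```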
47 |
48 | ### 3.4 Chapter 4: DCN
49 |
50 | The Deep & Cross Network (DCN) model
51 |
52 | https://www.jianshu.com/p/77719fc252fa
53 |
54 | https://github.com/Nirvanada/Deep-and-Cross-Keras
55 |
56 | https://blog.csdn.net/roguesir/article/details/797632
57 |
58 | https://arxiv.org/abs/1708.05123
59 |
60 | ### 3.5 Chapter 5: PNN
61 |
62 | https://github.com/JianzhouZhan/Awesome-RecSystem-Models
63 |
64 | https://github.com/Snail110/tensorflow_practice/blob/master/recommendation/Basic-PNN-Demo/PNN.py
65 |
66 | https://www.jianshu.com/p/be784ab4abc2
67 |
68 |
69 | ### 3.6 Chapter 6: Wide-Deep
70 |
71 | https://zhuanlan.zhihu.com/p/92279796
72 |
73 | https://github.com/busesese/Wide_Deep_Model
74 |
75 | ### 3.7 Chapter 7: NFM
76 |
77 | https://zhuanlan.zhihu.com/p/37522285
78 | Neural Factorization Machines for Sparse Predictive Analytics
--------------------------------------------------------------------------------
/Wide-Deep/Wide-Deep.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | import pickle
4 | from util.train_model import train_test_model_demo
5 |
6 |
7 | class Wide(tf.keras.layers.Layer):
8 | def __init__(self,units=1):
9 | # input_dim = num_size + embed_size = input_size
10 | super(Wide, self).__init__()
11 | # self.units = units
12 | self.linear = tf.keras.layers.Dense(units=units,activation='relu')
13 | def call(self, inputs):
14 | output = self.linear(inputs)
15 | return output
16 |
17 | class Deep(tf.keras.layers.Layer):
18 | def __init__(self,num_feat,num_field,dropout_deep,deep_layer_sizes,embedding_size=10):
19 | # input_dim = num_size + embed_size = input_size
20 | super(Deep, self).__init__()
21 |         self.num_feat = num_feat  # F = number of features
22 |         self.num_field = num_field  # N = number of fields per sample
23 | self.dropout_deep = dropout_deep
24 |
25 |         # Embedding layer of size F * M (F = number of features, M = embedding dimension)
26 | feat_embeddings = tf.keras.layers.Embedding(num_feat, embedding_size, embeddings_initializer='uniform') # F * M
27 | self.feat_embeddings = feat_embeddings
28 |
29 | # fc layer
30 | self.deep_layer_sizes = deep_layer_sizes
31 |         # parameters of the deep (MLP) part
32 | for i in range(len(deep_layer_sizes)):
33 | setattr(self, 'dense_' + str(i),tf.keras.layers.Dense(deep_layer_sizes[i]))
34 | setattr(self, 'batchNorm_' + str(i),tf.keras.layers.BatchNormalization())
35 | setattr(self, 'activation_' + str(i),tf.keras.layers.Activation('relu'))
36 | setattr(self, 'dropout_' + str(i),tf.keras.layers.Dropout(dropout_deep[i]))
37 | # last layer
38 | self.fc = tf.keras.layers.Dense(1,activation=None,use_bias=True)
39 |
40 | def call(self,feat_index,feat_value):
41 |         # embedding part: feat_index is the input; feat_embeddings is an Embedding layer
42 | feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M
43 | # print(feat_value.get_shape())
44 | feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value)
45 |
46 | y_deep = tf.keras.layers.Flatten()(feat_embedding)
47 | for i in range(len(self.deep_layer_sizes)):
48 | y_deep = getattr(self,'dense_' + str(i))(y_deep)
49 | y_deep = getattr(self,'batchNorm_' + str(i))(y_deep)
50 | y_deep = getattr(self,'activation_' + str(i))(y_deep)
51 | y_deep = getattr(self,'dropout_' + str(i))(y_deep)
52 |
53 | output = self.fc(y_deep)
54 | return output
55 |
56 | class WideDeep(tf.keras.Model):
57 | def __init__(self,num_feat,num_field,dropout_deep,deep_layer_sizes,embedding_size=10):
58 | super().__init__()
59 |         self.num_feat = num_feat  # F = number of features
60 |         self.num_field = num_field  # N = number of fields per sample
61 | self.dropout_deep = dropout_deep
62 |
63 | self.wide = Wide(units=1)
64 | self.deep = Deep(num_feat,num_field,dropout_deep,deep_layer_sizes)
65 | self.fc = tf.keras.layers.Dense(1,activation=None,use_bias=True)
66 |
67 | def call(self,num_input,feat_index,feat_value):
68 | x1 = self.wide(num_input)
69 | x2 = self.deep(feat_index,feat_value)
70 |
71 | x3 = tf.keras.layers.concatenate([x1,x2],axis=-1)
72 | output = self.fc(x3)
73 | return output
74 |
75 |
76 | if __name__ == '__main__':
77 | AID_DATA_DIR = "../data/Criteo/"
78 | feat_dict_ = pickle.load(open(AID_DATA_DIR + '/cross_feat_dict_10.pkl2', 'rb'))
79 |
80 | widedeep = WideDeep(num_feat=len(feat_dict_) + 1, num_field=52, dropout_deep=[0.5, 0.5, 0.5],
81 | deep_layer_sizes=[400, 400],embedding_size=10)
82 |
83 | train_label_path = AID_DATA_DIR + 'traincross_label'
84 | train_idx_path = AID_DATA_DIR + 'traincross_idx'
85 | train_value_path = AID_DATA_DIR + 'traincross_value'
86 | train_num_path = AID_DATA_DIR + 'traincross_num'
87 |
88 |     # Reading the data with TextLineDataset streams large files from disk, which saves memory and keeps training efficient
89 | def get_batch_dataset(label_path, idx_path, value_path,num_path):
90 | label = tf.data.TextLineDataset(label_path)
91 | idx = tf.data.TextLineDataset(idx_path)
92 | value = tf.data.TextLineDataset(value_path)
93 | num = tf.data.TextLineDataset(num_path)
94 |
95 | label = label.map(lambda x: tf.strings.to_number(tf.strings.split(x, sep='\t')), num_parallel_calls=12)
96 | idx = idx.map(lambda x: tf.strings.to_number(tf.strings.split(x, sep='\t')), num_parallel_calls=12)
97 | value = value.map(lambda x: tf.strings.to_number(tf.strings.split(x, sep='\t')), num_parallel_calls=12)
98 | num = num.map(lambda x: tf.strings.to_number(tf.strings.split(x, sep='\t')), num_parallel_calls=12)
99 |
100 | batch_dataset = tf.data.Dataset.zip((num,label, idx, value))
101 | batch_dataset = batch_dataset.shuffle(buffer_size=128)
102 | batch_dataset = batch_dataset.batch(128)
103 | batch_dataset = batch_dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
104 | return batch_dataset
105 | train_batch_dataset = get_batch_dataset(train_label_path, train_idx_path, train_value_path,train_num_path)
106 |
107 | train_loss = tf.keras.metrics.Mean(name='train_loss')
108 | train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_acc')
109 | loss_object = tf.keras.losses.BinaryCrossentropy()
110 | optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
111 |
112 |
113 | @tf.function
114 | def train_one_step(model, optimizer, idx, value, label, num):
115 | with tf.GradientTape() as tape:
116 | output = model(num, idx, value)
117 | loss = loss_object(y_true=label, y_pred=output)
118 | grads = tape.gradient(loss, model.trainable_variables)
119 | grads = [tf.clip_by_norm(g, 100) for g in grads]
120 | optimizer.apply_gradients(grads_and_vars=zip(grads, model.trainable_variables))
121 |
122 | train_loss(loss)
123 | train_accuracy(label, output)
124 |
125 | EPOCHS = 50
126 | for epoch in range(EPOCHS):
127 | for num, label, idx, value in train_batch_dataset:
128 | train_one_step(widedeep, optimizer, idx, value, label,num)
129 | template = 'Epoch {}, Loss: {}, Accuracy: {}'
130 | print(template.format(epoch + 1,
131 | train_loss.result(), train_accuracy.result()))
132 |
133 |
--------------------------------------------------------------------------------
/Wide-Deep/data_process.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import os
3 | import pickle
4 | from collections import Counter
5 |
6 | """
7 | Data Process for Wide-Deep network
8 | https://github.com/busesese/Wide_Deep_Model
9 | https://github.com/aviraj-sinha/ML5/blob/master/10.%20Keras%20Wide%20and%20Deep.ipynb
10 | """
11 | def get_train_test_file(file_path, feat_dict_, split_ratio=0.9):
12 |     # output files for the (crossed) training set
13 | train_label_fout = open(file_path+'traincross_label', 'w')
14 | train_value_fout = open(file_path+'traincross_value', 'w')
15 | train_idx_fout = open(file_path+'traincross_idx', 'w')
16 | train_num_fout = open(file_path + 'traincross_num', 'w')
17 |
18 | continuous_range_ = range(1, 14)
19 | categorical_range_ = range(14, 52)
20 |
21 | def process_line_(line):
22 | features = line.rstrip('\n').split('\t')
23 | feat_idx, feat_value, label= [], [], []
24 |         # per-column min / max values, collected offline from the data
25 | cont_min_ = [0.0, -2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
26 | cont_max_ = [95.0,7864,8457.0,87.0,1015215.0,4638.0,1658.0,547.0,5637.0,4.0,37.0,98.0,770.0]
27 | cont_diff_ = [cont_max_[i] - cont_min_[i] for i in range(len(cont_min_))]
28 | # MinMax Normalization
29 | for idx in continuous_range_:
30 | if features[idx] == '':
31 | feat_idx.append(0)
32 | feat_value.append(0.0)
33 | else:
34 | feat_idx.append(feat_dict_[idx])
35 | feat_value.append(round((float(features[idx]) - cont_min_[idx - 1]) / cont_diff_[idx - 1], 6))
36 |         # keep the raw numerical feature values separately
37 | num = feat_value[:]
38 |         # process the categorical features
39 | for idx in categorical_range_:
40 | if features[idx] == '' or features[idx] not in feat_dict_:
41 | feat_idx.append(0)
42 | feat_value.append(0.0)
43 | else:
44 | feat_idx.append(feat_dict_[features[idx]])
45 | feat_value.append(1.0)
46 | return feat_idx, feat_value, [int(features[0])], num
47 |
48 | with open(file_path+'traincross.txt', 'r') as fin:
49 | for line_idx, line in enumerate(fin):
50 | feat_idx, feat_value, label, num = process_line_(line)
51 |
52 | feat_value = '\t'.join([str(v) for v in feat_value]) + '\n'
53 | feat_idx = '\t'.join([str(idx) for idx in feat_idx]) + '\n'
54 | label = '\t'.join([str(idx) for idx in label]) + '\n'
55 | feat_num = '\t'.join([str(idx) for idx in num]) + '\n'
56 |
57 | train_label_fout.write(label)
58 | train_idx_fout.write(feat_idx)
59 | train_value_fout.write(feat_value)
60 | train_num_fout.write(feat_num)
61 |
62 | fin.close()
63 |
64 | train_label_fout.close()
65 | train_idx_fout.close()
66 | train_value_fout.close()
67 | train_num_fout.close()
68 |
69 |
70 | def cross_feature(file_path,cross_range):
71 |     # build the dataset with crossed features
72 | traincross = open(file_path+'traincross.txt', 'w')
73 | with open(file_path+'train.txt', 'r') as fin:
74 | for line_idx, line in enumerate(fin):
75 | features = line.rstrip('\n').split('\t')
76 | for i in cross_range:
77 | features.append('_'.join([features[i[0]], features[i[1]]]))
78 | string_features = '\t'.join(features) + '\n'
79 | traincross.write(string_features)
80 | fin.close()
81 | traincross.close()
82 |
83 | def get_feat_dict(file_path):
84 |
85 | freq_ = 10
86 |     # the .pkl2 file stores the wide-deep feature dictionary as a pickle
87 | dir_feat_dict_ = file_path+'cross_feat_dict_' + str(freq_) + '.pkl2'
88 | continuous_range_ = range(1, 14)
89 | categorical_range_ = range(14, 52)
90 |
91 | if os.path.exists(dir_feat_dict_):
92 | feat_dict = pickle.load(open(dir_feat_dict_, 'rb'))
93 | else:
94 | # print('generate a feature dict')
95 | # Count the number of occurrences of discrete features
96 | feat_cnt = Counter()
97 | with open(file_path+'traincross.txt', 'r') as fin:
98 | for line_idx, line in enumerate(fin):
99 | features = line.rstrip('\n').split('\t')
100 | for idx in categorical_range_:
101 | if features[idx] == '': continue
102 | feat_cnt.update([features[idx]])
103 | fin.close()
104 | # Only retain discrete features with high frequency
105 | dis_feat_set = set()
106 | for feat, ot in feat_cnt.items():
107 | if ot >= freq_:
108 | dis_feat_set.add(feat)
109 |
110 | # Create a dictionary for continuous and discrete features
111 | feat_dict = {}
112 | tc = 1
113 | # Continuous features
114 | for idx in continuous_range_:
115 | feat_dict[idx] = tc
116 | tc += 1
117 | # Discrete features
118 | cnt_feat_set = set()
119 | with open(file_path+'traincross.txt', 'r') as fin:
120 | for line_idx, line in enumerate(fin):
121 | features = line.rstrip('\n').split('\t')
122 |
123 | for idx in categorical_range_:
124 | if features[idx] == '' or features[idx] not in dis_feat_set:
125 | continue
126 | if features[idx] not in cnt_feat_set:
127 | cnt_feat_set.add(features[idx])
128 | feat_dict[features[idx]] = tc
129 | tc += 1
130 | # Save dictionary
131 | fin.close()
132 | with open(dir_feat_dict_, 'wb') as fout:
133 | pickle.dump(feat_dict, fout)
134 | print('args.num_feat ', len(feat_dict) + 1)
135 | return feat_dict
136 |
137 |
138 | if __name__ == '__main__':
139 | file_path = '../data/Criteo/'
140 | # 交叉特征
141 | cross_range = [[14, 15], [16, 17], [18, 19], [20, 21], [22, 23], [24, 25], [26, 27], [28, 29], [30, 31],
142 | [32, 33], [34, 35], [36, 37], [38, 39]]
143 | cross_feature(file_path,cross_range)
144 | feat_dict = get_feat_dict(file_path)
145 | get_train_test_file(file_path, feat_dict)
146 | print('Done!')
--------------------------------------------------------------------------------
/data/Criteo/data_process.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import os
3 | import pickle
4 | from collections import Counter
5 |
6 | """
7 | Data Process for FM, PNN, and DeepFM.
8 | [1] PaddlePaddle implementation of DeepFM for CTR prediction
9 | https://github.com/Snail110/Awesome-RecSystem-Models/blob/master/data/Criteo/forOtherModels/dataPreprocess_TensorFlow.py
10 | """
11 | def get_train_test_file(file_path, feat_dict_, split_ratio=0.9):
12 |     # output files for the training and test sets
13 | train_label_fout = open('train_label', 'w')
14 | train_value_fout = open('train_value', 'w')
15 | train_idx_fout = open('train_idx', 'w')
16 | test_label_fout = open('test_label', 'w')
17 | test_value_fout = open('test_value', 'w')
18 | test_idx_fout = open('test_idx', 'w')
19 |
20 | continuous_range_ = range(1, 14)
21 | categorical_range_ = range(14, 40)
22 |
23 | def process_line_(line):
24 | features = line.rstrip('\n').split('\t')
25 | feat_idx, feat_value, label = [], [], []
26 |         # per-column min / max values, collected offline from the data
27 | cont_min_ = [0.0, -2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
28 | cont_max_ = [95.0,7864,8457.0,87.0,1015215.0,4638.0,1658.0,547.0,5637.0,4.0,37.0,98.0,770.0]
29 | cont_diff_ = [cont_max_[i] - cont_min_[i] for i in range(len(cont_min_))]
30 | # MinMax Normalization
31 | for idx in continuous_range_:
32 | if features[idx] == '':
33 | feat_idx.append(0)
34 | feat_value.append(0.0)
35 | else:
36 | feat_idx.append(feat_dict_[idx])
37 | feat_value.append(round((float(features[idx]) - cont_min_[idx - 1]) / cont_diff_[idx - 1], 6))
38 |
39 |         # process the categorical features
40 | for idx in categorical_range_:
41 | if features[idx] == '' or features[idx] not in feat_dict_:
42 | feat_idx.append(0)
43 | feat_value.append(0.0)
44 | else:
45 | feat_idx.append(feat_dict_[features[idx]])
46 | feat_value.append(1.0)
47 | return feat_idx, feat_value, [int(features[0])]
48 |
49 | with open(file_path, 'r') as fin:
50 | for line_idx, line in enumerate(fin):
51 | feat_idx, feat_value, label = process_line_(line)
52 |
53 | feat_value = '\t'.join([str(v) for v in feat_value]) + '\n'
54 | feat_idx = '\t'.join([str(idx) for idx in feat_idx]) + '\n'
55 | label = '\t'.join([str(idx) for idx in label]) + '\n'
56 |
57 | if np.random.random() <= split_ratio:
58 | train_label_fout.write(label)
59 | train_idx_fout.write(feat_idx)
60 | train_value_fout.write(feat_value)
61 | else:
62 | test_label_fout.write(label)
63 | test_idx_fout.write(feat_idx)
64 | test_value_fout.write(feat_value)
65 |
66 | fin.close()
67 |
68 | train_label_fout.close()
69 | train_idx_fout.close()
70 | train_value_fout.close()
71 | test_label_fout.close()
72 | test_idx_fout.close()
73 | test_value_fout.close()
74 |
75 |
76 | def get_feat_dict(file_path):
77 | freq_ = 10
78 |     # the .pkl2 file stores the feature dictionary as a pickle
79 | dir_feat_dict_ = 'feat_dict_' + str(freq_) + '.pkl2'
80 | continuous_range_ = range(1, 14)
81 | categorical_range_ = range(14, 40)
82 |
83 | if os.path.exists(dir_feat_dict_):
84 | feat_dict = pickle.load(open(dir_feat_dict_, 'rb'))
85 | else:
86 | # print('generate a feature dict')
87 | # Count the number of occurrences of discrete features
88 | feat_cnt = Counter()
89 | with open(file_path, 'r') as fin:
90 | for line_idx, line in enumerate(fin):
91 | features = line.rstrip('\n').split('\t')
92 | for idx in categorical_range_:
93 | if features[idx] == '': continue
94 | feat_cnt.update([features[idx]])
95 |
96 | # Only retain discrete features with high frequency
97 | dis_feat_set = set()
98 | for feat, ot in feat_cnt.items():
99 | if ot >= freq_:
100 | dis_feat_set.add(feat)
101 |
102 | # Create a dictionary for continuous and discrete features
103 | feat_dict = {}
104 | tc = 1
105 | # Continuous features
106 | for idx in continuous_range_:
107 | feat_dict[idx] = tc
108 | tc += 1
109 | # Discrete features
110 | cnt_feat_set = set()
111 | with open(file_path, 'r') as fin:
112 | for line_idx, line in enumerate(fin):
113 | features = line.rstrip('\n').split('\t')
114 | for idx in categorical_range_:
115 | if features[idx] == '' or features[idx] not in dis_feat_set:
116 | continue
117 | if features[idx] not in cnt_feat_set:
118 | cnt_feat_set.add(features[idx])
119 | feat_dict[features[idx]] = tc
120 | tc += 1
121 |
122 | # Save dictionary
123 | with open(dir_feat_dict_, 'wb') as fout:
124 | pickle.dump(feat_dict, fout)
125 | print('args.num_feat ', len(feat_dict) + 1)
126 |
127 | return feat_dict
128 |
129 |
130 | if __name__ == '__main__':
131 | file_path = './train.txt'
132 | feat_dict = get_feat_dict(file_path)
133 | get_train_test_file(file_path, feat_dict)
134 | print('Done!')
--------------------------------------------------------------------------------
/embedding.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stderr",
10 | "output_type": "stream",
11 | "text": [
12 | "D:\\anaconda3\\julianxu\\lib\\site-packages\\h5py\\__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
13 | " from ._conv import register_converters as _register_converters\n"
14 | ]
15 | }
16 | ],
17 | "source": [
18 | "import tensorflow as tf"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 2,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "csv = [\n",
28 | " \"1,harden|james|curry\",\n",
29 | " \"2,wrestbrook|harden|durant\",\n",
30 | " \"3,|paul|towns\",\n",
31 | "]"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 3,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "TAG_SET = [\"harden\", \"james\", \"curry\", \"durant\", \"paul\",\"towns\",\"wrestbrook\"]"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 4,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "# 处理得到SpareTensor\n",
50 | "ids,post_tags_str = tf.decode_csv(csv,[[-1],[\"\"]])"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 7,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "table = tf.contrib.lookup.index_table_from_tensor(\n",
60 | "mapping=TAG_SET,default_value=-1) # 构造一个查找表"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 9,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "split_tags = tf.string_split(post_tags_str,\"|\")"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 12,
75 | "metadata": {},
76 | "outputs": [
77 | {
78 | "data": {
79 | "text/plain": [
80 | ""
81 | ]
82 | },
83 | "execution_count": 12,
84 | "metadata": {},
85 | "output_type": "execute_result"
86 | }
87 | ],
88 | "source": [
89 | "split_tags.indices"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": 14,
95 | "metadata": {},
96 | "outputs": [],
97 | "source": [
98 | "tags = tf.SparseTensor(\n",
99 | "indices = split_tags.indices,\n",
100 | "values = table.lookup(split_tags.values),\n",
101 | " dense_shape=split_tags.dense_shape)"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": 15,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "# 定义embedding变量\n",
111 | "# 大小为3 因为 只有7个类型\n",
112 | "TAG_EMBEDDING_DIM = 3\n",
113 | "embedding_params = tf.Variable(tf.truncated_normal([len(TAG_SET),TAG_EMBEDDING_DIM]))"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": 16,
119 | "metadata": {},
120 | "outputs": [
121 | {
122 | "name": "stdout",
123 | "output_type": "stream",
124 | "text": [
125 | "WARNING:tensorflow:The default value of combiner will change from \"mean\" to \"sqrtn\" after 2016/11/01.\n"
126 | ]
127 | }
128 | ],
129 | "source": [
130 | "embedding_tags = tf.nn.embedding_lookup_sparse(embedding_params,sp_ids=tags,sp_weights=None)\n",
131 | "# sp_ids就是我们刚刚得到的SparseTensor,而sp_weights=None代表的每一个取值的权重,如果是None的话,所有权重都是1,也就是相当于取了平均\n",
132 | "# 如果不是None的话,我们需要同样传入一个SparseTensor,代表不同球员的喜欢权重。大家感兴趣可以自己去尝试"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": 18,
138 | "metadata": {},
139 | "outputs": [
140 | {
141 | "name": "stdout",
142 | "output_type": "stream",
143 | "text": [
144 | "[SparseTensorValue(indices=array([[0, 0],\n",
145 | " [0, 1],\n",
146 | " [0, 2],\n",
147 | " [1, 0],\n",
148 | " [1, 1],\n",
149 | " [1, 2],\n",
150 | " [2, 0],\n",
151 | " [2, 1]], dtype=int64), values=array([0, 1, 2, 6, 0, 3, 4, 5], dtype=int64), dense_shape=array([3, 3], dtype=int64)), array([[ 0.06023904, 1.0575624 , -0.9093878 ],\n",
152 | " [-0.42566654, 0.26845995, -0.6602178 ],\n",
153 | " [-0.6277443 , 0.28916246, -0.15512544]], dtype=float32), array([b'harden|james|curry', b'wrestbrook|harden|durant', b'|paul|towns'],\n",
154 | " dtype=object)]\n"
155 | ]
156 | }
157 | ],
158 | "source": [
159 | "with tf.Session() as s:\n",
160 | " s.run([tf.global_variables_initializer(),tf.tables_initializer()])\n",
161 | " print(s.run([tags,embedding_tags,post_tags_str]))"
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": null,
167 | "metadata": {},
168 | "outputs": [],
169 | "source": []
170 | }
171 | ],
172 | "metadata": {
173 | "kernelspec": {
174 | "display_name": "Python 3",
175 | "language": "python",
176 | "name": "python3"
177 | },
178 | "language_info": {
179 | "codemirror_mode": {
180 | "name": "ipython",
181 | "version": 3
182 | },
183 | "file_extension": ".py",
184 | "mimetype": "text/x-python",
185 | "name": "python",
186 | "nbconvert_exporter": "python",
187 | "pygments_lexer": "ipython3",
188 | "version": "3.6.5"
189 | }
190 | },
191 | "nbformat": 4,
192 | "nbformat_minor": 2
193 | }
194 |
--------------------------------------------------------------------------------
/util/train_model.py:
--------------------------------------------------------------------------------
1 | """
2 | Training helpers: build tf.data pipelines from the preprocessed label / idx / value text files and run a simple training loop (BinaryCrossentropy loss, BinaryAccuracy metric, gradient clipping).
3 | """
4 | import tensorflow as tf
5 |
6 | def train_test_model_demo(model,train_label_path, train_idx_path, train_value_path):
7 |     # Reading the data with TextLineDataset streams large files from disk, which saves memory and keeps training efficient
8 | def get_batch_dataset(label_path, idx_path, value_path):
9 | label = tf.data.TextLineDataset(label_path)
10 | idx = tf.data.TextLineDataset(idx_path)
11 | value = tf.data.TextLineDataset(value_path)
12 |
13 | label = label.map(lambda x: tf.strings.to_number(tf.strings.split(x, sep='\t')), num_parallel_calls=12)
14 | idx = idx.map(lambda x: tf.strings.to_number(tf.strings.split(x, sep='\t')), num_parallel_calls=12)
15 | value = value.map(lambda x: tf.strings.to_number(tf.strings.split(x, sep='\t')), num_parallel_calls=12)
16 |
17 | batch_dataset = tf.data.Dataset.zip((label, idx, value))
18 | batch_dataset = batch_dataset.shuffle(buffer_size=128)
19 | batch_dataset = batch_dataset.batch(128)
20 | batch_dataset = batch_dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
21 | return batch_dataset
22 | train_batch_dataset = get_batch_dataset(train_label_path, train_idx_path, train_value_path)
23 |
24 | train_loss = tf.keras.metrics.Mean(name='train_loss')
25 | train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_acc')
26 | loss_object = tf.keras.losses.BinaryCrossentropy()
27 | optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
28 |
29 |
30 | @tf.function
31 | def train_one_step(model, optimizer, idx, value, label):
32 | with tf.GradientTape() as tape:
33 | output = model(idx, value)
34 | loss = loss_object(y_true=label, y_pred=output)
35 | grads = tape.gradient(loss, model.trainable_variables)
36 | grads = [tf.clip_by_norm(g, 100) for g in grads]
37 | optimizer.apply_gradients(grads_and_vars=zip(grads, model.trainable_variables))
38 |
39 | train_loss(loss)
40 | train_accuracy(label, output)
41 |
42 | EPOCHS = 50
43 | for epoch in range(EPOCHS):
44 | for label, idx, value in train_batch_dataset:
45 | train_one_step(model, optimizer, idx, value, label)
46 | template = 'Epoch {}, Loss: {}, Accuracy: {}'
47 | print(template.format(epoch + 1,
48 | train_loss.result(), train_accuracy.result()))
49 |
50 | def train_test_model_demo_1(model,train_label, train_idx, train_value):
51 |     # tf.data.Dataset.from_tensor_slices is convenient for small files, e.g. for debugging the model before training on the full data.
52 | def get_dataset(train_label, train_idx, train_value):
53 | train_ds = tf.data.Dataset.from_tensor_slices(
54 | (train_label, train_idx, train_value)).shuffle(10000).batch(32)
55 | return train_ds
56 | train_batch_dataset = get_dataset(train_label, train_idx, train_value)
57 |
58 | train_loss = tf.keras.metrics.Mean(name='train_loss')
59 | train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_acc')
60 |     # binary classification
61 | loss_object = tf.keras.losses.BinaryCrossentropy()
62 | optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
63 |
64 | @tf.function
65 | def train_one_step(model, optimizer, idx, value, label):
66 | with tf.GradientTape() as tape:
67 | output = model(idx, value)
68 | loss = loss_object(y_true=label, y_pred=output)
69 | grads = tape.gradient(loss, model.trainable_variables)
70 | grads = [tf.clip_by_norm(g, 100) for g in grads]
71 | optimizer.apply_gradients(grads_and_vars=zip(grads, model.trainable_variables))
72 |
73 | train_loss(loss)
74 | train_accuracy(label, output)
75 |
76 | EPOCHS = 50
77 | for epoch in range(EPOCHS):
78 | for label, idx, value in train_batch_dataset:
79 | train_one_step(model, optimizer, idx, value, label)
80 | template = 'Epoch {}, Loss: {}, Accuracy: {}'
81 | print(template.format(epoch + 1,
82 | train_loss.result(), train_accuracy.result()))
--------------------------------------------------------------------------------