├── .DS_Store
├── Config.py
├── README.md
├── __pycache__
├── Config.cpython-36.pyc
└── tools.cpython-36.pyc
├── data-pre.py
├── data_conf.txt
├── ex_data
├── dataset1.csv
├── dataset1.txt
├── dataset2.csv
├── dataset2.txt
├── dataset3.csv
└── dataset3.txt
├── img
├── .DS_Store
├── pic1.png
└── pic2.png
├── paper
├── 1.png
├── 10.png
├── 11.png
├── 12.png
├── 13.png
├── 2.png
├── 3.png
├── 4.png
├── 6.png
├── 7.png
├── 8.png
├── 9.png
└── README.md
├── plot.py
├── tools.py
└── xDeepFM.py
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/batch-norm/xDeepFM/62a2f144bcafd467772cc791b8699b714c4b6e50/.DS_Store
--------------------------------------------------------------------------------
/Config.py:
--------------------------------------------------------------------------------
1 | """
2 | author : yang yiqing 2018年07月13日15:57:50
3 | """
4 |
# ---------------------------------------------------------------------------
# Input / output files
# ---------------------------------------------------------------------------
# Raw CSV datasets read by the preprocessing step (data-pre.py).
train_file = 'ex_data/dataset1.csv'
valid_file = 'ex_data/dataset2.csv'
test_file = 'ex_data/dataset3.csv'

# Sparse "label,index:value,..." text files written by the preprocessing step
# and read back by the model.
train_save_file = 'ex_data/dataset1.txt'
valid_save_file = 'ex_data/dataset2.txt'
test_save_file = 'ex_data/dataset3.txt'

# Name of the target column in the CSV files.
label_name = 'label'

# ---------------------------------------------------------------------------
# Feature declaration
# ---------------------------------------------------------------------------
# Continuous columns; they are min-max scaled during preprocessing.
numeric_features = [
    'all_launch_count',
    'last_launch',
    'all_video_count',
    'last_video',
    'all_video_day',
    'all_action_count',
    'last_action',
    'all_action_day',
    'register_day',
]
# Single-valued categorical columns.
single_features = ['register_type', 'device_type']
# Multi-valued categorical columns ("|"-separated values); none by default.
multi_features = []

# Whether preprocessing assigns embedding indices to the numeric features.
num_embedding = True
# A categorical value enters the vocabulary only when it occurs strictly more
# often than these thresholds.
single_feature_frequency = 10
multi_feature_frequency = 0

# ---------------------------------------------------------------------------
# Model
# ---------------------------------------------------------------------------
# Components whose outputs are concatenated before the final layer.
FM_layer = True
DNN_layer = True
CIN_layer = False

# False: raw numeric values are fed to the DNN instead of their embeddings.
use_numerical_embedding = False

embedding_size = 16

# Hidden layer widths of the DNN component.
dnn_net_size = [128, 64, 32]
# Feature-map counts of the CIN layers, and CIN connection/output settings.
cross_layer_size = [10, 10, 10]
cross_direct = False
cross_output_size = 1

# ---------------------------------------------------------------------------
# Training
# ---------------------------------------------------------------------------
batch_size = 4096
# NOTE(review): epochs and learning_rate are presumably consumed by the
# training loop, which is not part of the visible code.
epochs = 4000
learning_rate = 0.01
47 |
48 |
49 |
50 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # xDeepFM
2 |
3 | ## 介绍
4 |
5 | **- 原论文题目:《xDeepFM: Combining Explicit and Implicit Feature Interactions for Recommender Systems》**
6 |
7 | **- 原论文地址: [xDeepFM](https://arxiv.org/pdf/1803.05170.pdf)**
8 |
9 | **- 论文整理: [论文整理](https://github.com/batch-norm/xDeepFM/tree/master/paper)**
10 |
11 | 下面对本论文提出的模型进行了复现。
12 |
13 | ## 模型介绍
14 |
15 | **模型架构 = FM + CIN + DNN**
16 |
17 |
18 |
19 | ## 使用
20 |
21 | **1.数据准备**
22 |
23 | + 训练集和测试集需为.csv文件
24 | + 支持**数值型特征映射为embedding**,也支持**数值型特征直接作为DNN输入**
25 | + 支持**多值离散特征**的处理,可自行配置为sum or mean,分隔符请用"|"
26 | + **cat特征**需要自行先用labelEncoder转换一下
27 |
28 | 具体配置在**Config.py**文件中,也可结合ex_data中的例子作为参考。
29 |
30 | **转换完成后的训练数据示例:**
31 |
32 | ```
33 | 1,18:1,30:1,0:0.25,2:0.8125,4:0.0,6:0.0,8:0.0,10:0.006630292147247738,12:0.8125,14:0.25,16:0.5625,
34 | ```
35 |
36 | **2.模型训练**
37 |
38 | + 先在Config中指定单值离散,多值离散,连续型特征
39 | + 默认激活函数"relu",默认optimizer"Adagrad"
40 | + 默认DNN网络结构 [128,64,32]
41 | + 默认CIN卷积核维度 [10,10,10] ,输出维度 [1]
42 | + 默认使用 DNN + CIN + FM,可在Config中配置
43 | + 默认建立vocabulary的最低词频 10
44 |
45 | **3.模型实验**
46 |
47 | **- Batch_size : 4096 , epochs: 2000**
48 |
49 | **- 指标为"logloss"**
50 |
51 | **- 数据为 “2018中国高校大数据挑战赛” 初赛数据**
52 |
53 |
54 |
55 | ## 小结
56 |
57 | 模型基于DeepFM加入了**CIN component (压缩交互网络)**,对原有的结构进行了vector-wise和边界明确的交互填充
58 |
59 | + **优点**:表达能力更强,可以发掘出vector-wise的交互特征,精度更高
60 | + **缺点**:训练速度变得很缓慢
61 |
62 | yyq 2018年07月21日13:47:42
63 |
64 |
65 |
--------------------------------------------------------------------------------
/__pycache__/Config.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/batch-norm/xDeepFM/62a2f144bcafd467772cc791b8699b714c4b6e50/__pycache__/Config.cpython-36.pyc
--------------------------------------------------------------------------------
/__pycache__/tools.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/batch-norm/xDeepFM/62a2f144bcafd467772cc791b8699b714c4b6e50/__pycache__/tools.cpython-36.pyc
--------------------------------------------------------------------------------
/data-pre.py:
--------------------------------------------------------------------------------
1 | """
2 | author:yang yiqing 2018年07月13日16:04:16
3 | - 数据处理,数值型必须是float,离散型必须是int,多值离散是str中间用|隔开,eg. "1|2|3"
4 | - 暂时不能有缺失值
5 |
6 | """
7 |
8 | import Config
9 | import pandas as pd
10 | import numpy as np
11 | from sklearn.preprocessing import MinMaxScaler
12 |
13 |
class Parse(object):
    """Convert the CSV datasets named in ``Config`` into sparse text files.

    Instantiating the class runs the whole pipeline: the three CSV files are
    loaded, numeric columns are min-max scaled, an embedding index is assigned
    to every numeric feature and to every sufficiently frequent categorical
    value, each dataset is written as ``label,index:value,...`` lines, and a
    summary is saved to ``data_conf.txt``.

    Expected input: numeric columns are floats, single-valued categorical
    columns are ints, multi-valued columns are strings such as ``"1|2|3"``;
    missing values are not supported.
    """

    def __init__(self):
        """Load, scale and convert all datasets, then write ``data_conf.txt``."""
        self.global_emb_idx = 0
        # Counters updated while rows are written; nothing in this class
        # reads them back.
        self.label_num = 0
        self.single_num = 0
        self.multi_num = 0
        self.train = pd.read_csv(Config.train_file, index_col=0)
        self.valid = pd.read_csv(Config.valid_file, index_col=0)
        self.test = pd.read_csv(Config.test_file, index_col=0)
        scalar = MinMaxScaler()
        all_data = pd.concat([self.train, self.valid, self.test])
        print('transform data...')
        for s in Config.numeric_features:
            # The scaler is fitted on train + valid + test together so all
            # three splits share the same [0, 1] range.
            scalar.fit(all_data[s].values.reshape(-1, 1))
            self.train[s] = scalar.transform(self.train[s].values.reshape(-1, 1))
            self.valid[s] = scalar.transform(self.valid[s].values.reshape(-1, 1))
            self.test[s] = scalar.transform(self.test[s].values.reshape(-1, 1))
        self.check()
        self.num_features = Config.numeric_features
        self.single_features = Config.single_features
        self.multi_features = Config.multi_features
        # feature name -> index used for values missing from the vocabulary
        self.backup_dict = {}

        # numeric feature name -> embedding index
        self.num_dict = {}
        # categorical feature name -> {raw value -> embedding index}
        self.single_dict = {}
        self.multi_dict = {}
        self.get_dict()
        self.trans_data(self.train, Config.train_save_file)
        self.trans_data(self.valid, Config.valid_save_file)
        self.trans_data(self.test, Config.test_save_file)
        self.save_conf()

    def _build_vocab(self, frequency_dict, min_count):
        """Assign consecutive global indices to values seen more than ``min_count`` times."""
        vocab = {}
        for value, count in frequency_dict.items():
            if count > min_count:
                vocab[value] = self.global_emb_idx
                self.global_emb_idx += 1
        return vocab

    def get_dict(self):
        """Build the index dictionaries for numeric, single and multi features.

        Indices are handed out from one running counter
        (``self.global_emb_idx``). Every feature additionally reserves one
        "backup" index used for values that are not in its vocabulary.
        """
        print('prepare dict...')
        self.global_emb_idx = 0
        # NOTE(review): ``trans_data`` always reads ``self.num_dict``, so with
        # ``Config.num_embedding`` set to False and numeric features declared
        # it fails with KeyError - confirm the intended meaning of the flag.
        if self.num_features and Config.num_embedding:
            for s in self.num_features:
                self.num_dict[s] = self.global_emb_idx
                self.global_emb_idx += 1
                # Spare index reserved for missing values.
                self.backup_dict[s] = self.global_emb_idx
                self.global_emb_idx += 1

        all_data = None
        if self.single_features or self.multi_features:
            # Concatenate once instead of once per feature.
            all_data = pd.concat([self.train, self.valid, self.test])

        for s in self.single_features:
            frequency_dict = {}
            for v in all_data[s]:
                frequency_dict[v] = frequency_dict.get(v, 0) + 1
            self.single_dict[s] = self._build_vocab(
                frequency_dict, Config.single_feature_frequency)
            # One slot per field for missing / low-frequency values.
            self.backup_dict[s] = self.global_emb_idx
            self.global_emb_idx += 1

        for s in self.multi_features:
            frequency_dict = {}
            for vs in all_data[s]:
                for v in str(vs).split('|'):
                    v = int(v)
                    frequency_dict[v] = frequency_dict.get(v, 0) + 1
            self.multi_dict[s] = self._build_vocab(
                frequency_dict, Config.multi_feature_frequency)
            # One slot per field for missing / low-frequency values. It is
            # counted like the single-feature slot so the total written by
            # ``save_conf`` covers every index that can appear in the output.
            self.backup_dict[s] = self.global_emb_idx
            self.global_emb_idx += 1

    def _format_row(self, line):
        """Return one output line (``label,index:value,...,`` plus newline) for a row."""
        parts = [str(line[Config.label_name])]
        self.label_num += 1
        for s in self.single_features:
            now_v = line[s]
            if now_v in self.single_dict[s]:
                now_idx = self.single_dict[s][now_v]
            else:
                now_idx = self.backup_dict[s]
            parts.append(str(now_idx) + ':' + str(1))
            self.single_num += 1
        for s in self.num_features:
            parts.append(str(self.num_dict[s]) + ':' + str(line[s]))
            self.single_num += 1
        for s in self.multi_features:
            vocab = self.multi_dict[s]
            # Write embedding indices (not raw values) so they live in the
            # same index space as the backup slot used below.
            idxs = [str(vocab[int(x)])
                    for x in str(line[s]).split('|') if int(x) in vocab]
            if idxs:
                parts.append('|'.join(idxs) + ':' + str(1))
            else:
                parts.append(str(self.backup_dict[s]) + ':' + str(1))
            self.multi_num += 1
        # Every field, including the last one, is followed by a comma.
        return ','.join(parts) + ',\n'

    def trans_data(self, data, save_file):
        """Write ``data`` to ``save_file``, one ``label,index:value,...`` line per row."""
        print('trans data...' + save_file)
        with open(save_file, 'w') as f:
            # Iterate explicitly rather than through DataFrame.apply: the row
            # formatter has side effects, and older pandas versions may call
            # an applied function twice on the first row.
            for _, line in data.iterrows():
                f.write(self._format_row(line))

    def check(self):
        """Return True when train, valid and test have the same number of columns.

        Prints an error message and returns False otherwise.
        """
        if self.train.shape[1] == self.valid.shape[1] == self.test.shape[1]:
            return True
        print('error , all dataset must have same shape')
        return False

    def save_conf(self):
        """Save the preprocessing summary to ``data_conf.txt``.

        The file holds four tab-separated integers: total number of embedding
        indices, number of single-valued features, number of numeric features
        and number of multi-valued features.
        """
        with open('data_conf.txt', 'w') as f:
            f.write(str(self.global_emb_idx) + '\t')
            f.write(str(len(self.single_features)) + '\t')
            f.write(str(len(self.num_features)) + '\t')
            f.write(str(len(self.multi_features)))
159 |
160 |
if __name__ == "__main__":
    # Running the module as a script performs the full conversion.
    pa = Parse()
163 |
--------------------------------------------------------------------------------
/data_conf.txt:
--------------------------------------------------------------------------------
1 | 730 2 9 0
--------------------------------------------------------------------------------
/img/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/batch-norm/xDeepFM/62a2f144bcafd467772cc791b8699b714c4b6e50/img/.DS_Store
--------------------------------------------------------------------------------
/img/pic1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/batch-norm/xDeepFM/62a2f144bcafd467772cc791b8699b714c4b6e50/img/pic1.png
--------------------------------------------------------------------------------
/img/pic2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/batch-norm/xDeepFM/62a2f144bcafd467772cc791b8699b714c4b6e50/img/pic2.png
--------------------------------------------------------------------------------
/paper/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/batch-norm/xDeepFM/62a2f144bcafd467772cc791b8699b714c4b6e50/paper/1.png
--------------------------------------------------------------------------------
/paper/10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/batch-norm/xDeepFM/62a2f144bcafd467772cc791b8699b714c4b6e50/paper/10.png
--------------------------------------------------------------------------------
/paper/11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/batch-norm/xDeepFM/62a2f144bcafd467772cc791b8699b714c4b6e50/paper/11.png
--------------------------------------------------------------------------------
/paper/12.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/batch-norm/xDeepFM/62a2f144bcafd467772cc791b8699b714c4b6e50/paper/12.png
--------------------------------------------------------------------------------
/paper/13.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/batch-norm/xDeepFM/62a2f144bcafd467772cc791b8699b714c4b6e50/paper/13.png
--------------------------------------------------------------------------------
/paper/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/batch-norm/xDeepFM/62a2f144bcafd467772cc791b8699b714c4b6e50/paper/2.png
--------------------------------------------------------------------------------
/paper/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/batch-norm/xDeepFM/62a2f144bcafd467772cc791b8699b714c4b6e50/paper/3.png
--------------------------------------------------------------------------------
/paper/4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/batch-norm/xDeepFM/62a2f144bcafd467772cc791b8699b714c4b6e50/paper/4.png
--------------------------------------------------------------------------------
/paper/6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/batch-norm/xDeepFM/62a2f144bcafd467772cc791b8699b714c4b6e50/paper/6.png
--------------------------------------------------------------------------------
/paper/7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/batch-norm/xDeepFM/62a2f144bcafd467772cc791b8699b714c4b6e50/paper/7.png
--------------------------------------------------------------------------------
/paper/8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/batch-norm/xDeepFM/62a2f144bcafd467772cc791b8699b714c4b6e50/paper/8.png
--------------------------------------------------------------------------------
/paper/9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/batch-norm/xDeepFM/62a2f144bcafd467772cc791b8699b714c4b6e50/paper/9.png
--------------------------------------------------------------------------------
/paper/README.md:
--------------------------------------------------------------------------------
1 | # XDeepFM: Combining Explicit and Implicit Feature Interactions for Recommender Systems
2 |
3 | # ABSTRACT
4 |
5 | 1.现在诸如DeepFM和Deep&Wide等模型都可以自动学习隐式的高维交互特征,并结合了低维特征,**但是有一个缺点就是它们的高维特征都是在bit-wise的层面上进行交互的**。本篇论文提出了一种**压缩交互网络(Compressed Interaction Network(CIN))**,能够学习显式的交互特征并且是在vector-wise的级别,CIN带有一些CNN和RNN的特点,最终作者将整个模型命名为"eXtreme Deep Factorization Machine(xDeepFM)"。
6 |
7 | 2.本文提出的模型有两个优点:
8 |
9 | + 能够显式的学习有明确边界的高维交互特征
10 | + 能够学习隐式的低维和高维特征
11 |
12 | 个人理解这里作者对implicit和explicit的理解是交互特征的维度是否明确,在这里翻译为隐式和显式。
13 |
14 | ## INTRODUCTION
15 |
16 | 1.简单介绍了单值离散特征和多值离散特征,然后介绍三个**手动提取交互特征的缺点:**
17 |
18 | + 挖掘出高质量的交互特征需要非常专业的领域知识并且需要做大量的尝试,很耗时间。
19 | + 在大型的推荐系统中,原生特征是海量的,手动挖掘交叉特征几乎不可能。
20 | + 挖掘不出肉眼不可见的交叉特征
21 |
22 | 2.然后介绍了**经典的FM模型**,用提取隐向量然后做内积的形式来提取交叉特征,扩展的FM模型可以提取随机的高维特征,但是**主要的缺陷是:**
23 |
24 | + 会学习所有的交叉特征,其中肯定会包含无用的交叉组合,另外一篇论文指出引入无用的交叉特征会引入噪音并降低模型的表现。
25 |
26 | 3.介绍了引入了DNN的组合模型,"Factorisation-machine supported Neural Network **(FNN)**",它在DNN之前使用了预训练的field embedding。
27 |
28 | 4.介绍了**PNN**(Product-based Neural Network),在embedding layer和DNN Input之间插入了一层product layer,不依赖于pre-trained FM。
29 |
30 | 5.FNN和PNN的缺点都是忽略了低维交互特征,**Wide&Deep和DeepFM**模型通过混合架构解决了这种问题,但是它们同样存在缺点:
31 |
32 | + 它们学习到的高维特征是一种implicit fashion,没有一种公式可以明确推论出最终学习出来的交叉特征到底是多少维的
33 | + 另一方面,其DNN部分是在bit-wise的层面下进行学习的,而经典的FM架构是在vector-wise层面学习的
34 |
35 | 6.本文提出的方法基于**DCN(Deep & Cross Network)**模型,其目标是有效率的捕捉到**边界明确的交叉特征**。
36 |
37 | # PRELIMINARIES
38 |
39 | ## Embedding Layer
40 |
41 | 介绍一些基于"univalent","multivalent"进行embedding的基础知识,这里不介绍了:
42 |
43 |
44 |
45 | ## Implicit High-order Interactions
46 |
47 | 前向传播过程:
48 |
49 |
50 |
51 | 这种架构是bit-wise层面的,意思是说,**即使是同一个field embedding,不同的element之间也会互相影响。**
52 |
53 | PNN和DeepFM基于上面的缺点进行了改进,除了DNN component,**还添加了two-way interaction layer到架构中,这样就既有vector-wise也有bit-wise的component了**。PNN和DeepFM的区别就是DeepFM是把product layer直接作为结果连到输出层,**而PNN是把product layer放在DNN和embedding layer之间**
54 |
55 |
56 |
57 | ## Explicit High-order Interactions
58 |
59 | 这里主要介绍了Cross Network(cross net)也是本文主要借鉴的一种模型,下面是该模型的架构:
60 |
61 |
62 |
63 | 该模型的主要目标是显式地构建高维交互特征,不像DNN前向传播的全连接层那样,每个隐藏层是通过如下公式计算出来的:
64 |
65 |
66 |
67 | 通过推导可以看出其实**每一个隐含层都是x0的一个scalar multiple,**这当然不是代表隐含层是x0的线性表达,只是说因为每一层原生x0都会参与计算,因此对x0非常敏感。**但是其缺点为:**
68 |
69 | + crossnet的输出是一种特殊形式,即x0的scalar multiple
70 | + 交互特征仍然是bit-wise层面的
71 |
72 | # OUR PROPOSED MODEL
73 |
74 | ## Compressed Interaction Network
75 |
76 | 本论文设计了一种新的cross network, 称为**Compressed Interaction Network (CIN)**, 设计的时候主要**考虑了下面三个方面:**
77 |
78 | + 交互特征是在vector-wise层面的(主要基于crossnet改进了这点)
79 | + 高维交互特征是显式的
80 | + 网络的复杂度不会因为交互层级的增加而增加
81 |
82 | **下面介绍了一些在CIN的中的概念:**
83 |
84 | 既然在CIN中是vector-wise层级的,那么每一个unit是一个vector,因此field embedding的输出是一个mxD的矩阵(D:embedding size,m:field size),CIN的第k层是一个Hk x D的矩阵(Hk代表的是CIN中每一层的向量数量,H0=m),**下面是CIN第k层的h-emb的计算公式:**
85 |
86 |
87 |
88 | 还是比较直观的,其中○代表Hadamard product:
89 |
90 |
91 |
92 | **可以发现k-th layer的计算也是和crossnet一样依赖于(k-1)-th layer和 0-th layer,因此交互特征是显式的,而且交互的层级随着网络结构的加深而增加(在这点上和crossnet是一样的),同时通过公式也可以很明显的看出,模型是vector-wise的:**
93 |
94 |
95 |
96 |
97 |
98 | 如果公式不好理解的话,可以通过如下图示来理解:
99 |
100 |
101 |
102 | 图(a)和(b)表示了如何从这一层的隐藏层(Hk x D)和X^0层(m X D)来产生下一层隐藏层的(Hk+1 x D),图示所示计算方法是为了更好的展现为什么模型有CNN的思想,先通过X0和Xk的第i列做一个outer product(matrix multiplication)得到一个Hk x m的矩阵(0<=i<D)。
107 |
108 | ## CIN Analysis
109 |
110 | 本文从空间复杂度,时间复杂度和多项式逼近等方面进行了分析,这里只介绍参数:
111 |
112 | 从公式也可以看出:
113 |
114 |
115 |
116 | 计算第k-th layer的第h(0
125 |
126 | 最终的模型结构如下:
127 |
128 |
129 |
130 | # EXPERIMENTS
131 |
132 | 这部分就不说了,反正就是好多实验,自己的模型就是吊吊吊,有用的信息是:
133 |
134 | + 对于这种高维稀疏特征来说,基于FM思想的模型例如DeepFM,Deep&Wide,PNN等比LR不知道高到哪里去了
135 | + 并不是混合模型就一定好,但是单用DNN component一般效果比较差
136 | + 这种用于高维稀疏特征的混合模型一般在比较浅层的比如2-3层的网络结构下会取得最好的效果
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
--------------------------------------------------------------------------------
/plot.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import pandas as pd
3 | import numpy as np
4 |
5 | # data = pd.read_csv('DNN_loss_result.csv')
6 | # plt.title('DNN')
7 | # plt.xlabel('step')
8 | # plt.ylabel('loss')
9 | #
10 | # plt.plot(data.step,data.valid_auc,'b',label='valid_loss')
11 | # plt.plot(data.step,data.train_auc,'g',label='train_loss')
12 | # plt.legend(bbox_to_anchor=[0.3, 0.4])
13 | # plt.yticks(np.linspace(0.45,0.7,20))
14 | # plt.grid()
15 | # plt.show()
16 |
# One entry per model: (result file, line colour, legend label, rows skipped).
# Only the DeepFM curve drops its first 10 recorded points.
curves = [
    ('DNN_loss_result.csv', 'b', 'DNN loss', 0),
    ('FM_loss_result.csv', 'g', 'FM loss', 0),
    ('xDeepFM_loss_result.csv', 'r', 'xDeepFM loss', 0),
    ('DeepFM_loss_result.csv', 'y', 'DeepFM loss', 10),
]

# Load every result file before any drawing starts.
frames = [pd.read_csv(csv_name) for csv_name, _, _, _ in curves]

plt.title('Model loss')
plt.xlabel('step')
plt.ylabel('loss')

# The plotted series is the "valid_auc" column of each result file.
for frame, (_, colour, legend, skip) in zip(frames, curves):
    steps = frame.step[skip:] if skip else frame.step
    values = frame.valid_auc[skip:] if skip else frame.valid_auc
    plt.plot(steps, values, colour, label=legend)

plt.legend(bbox_to_anchor=[0.3, 0.4])
plt.yticks(np.linspace(0.45, 0.7, 20))
plt.grid()
plt.show()
--------------------------------------------------------------------------------
/tools.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import tensorflow as tf
4 | from sklearn.metrics import roc_auc_score
5 |
def auc_score(preds, labels, label_size):
    """Return the ROC AUC computed on column ``label_size - 1``.

    ``preds`` and ``labels`` are sequences of per-sample rows; the entry at
    position ``label_size - 1`` of every row is used as the score and as the
    target respectively.
    """
    column = label_size - 1
    scores = [row[column] for row in preds]
    targets = [row[column] for row in labels]
    return roc_auc_score(targets, scores)
11 |
12 |
13 | def _get_data(data_dir):
14 | data = []
15 | with open(data_dir, 'r') as f:
16 | line = f.readline()
17 | while line:
18 | data.append(line)
19 | line = f.readline()
20 | return data
21 |
22 |
23 | def _get_conf():
24 | with open('data_conf.txt', 'r') as f:
25 | line = f.readline()
26 | line = line.split('\t')
27 | return int(line[0]), int(line[1]), int(line[2]), int(line[3])
28 |
def get_label(labels, label_size):
    """One-hot encode ``labels`` into rows of length ``label_size``.

    Each label is used as the list position that is set to 1; all other
    positions stay 0.
    """
    def _one_hot(position):
        row = [0] * label_size
        row[position] = 1
        return row

    return [_one_hot(v) for v in labels]
36 |
--------------------------------------------------------------------------------
/xDeepFM.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import tensorflow as tf
4 | import Config
5 | from tools import _get_data, _get_conf, get_label,auc_score
6 | import random
7 | import time
8 |
9 | class xDeepFM(object):
def __init__(self):
    """Build the graph, prepare the validation feed and run training.

    Reads the field counts from the preprocessing summary, loads the data,
    creates placeholders, variables and the model, converts the whole
    validation set into one feed dict, and finally calls ``_train`` and
    ``_save_loss``.
    """
    self.total_emb, self.single_size, self.numerical_size, self.multi_size = _get_conf()
    self.field_size = self.single_size + self.numerical_size + self.multi_size
    self.embedding_length = self.field_size * Config.embedding_size
    self._init_data()
    self._init_placeholder()
    self._init_Variable()
    self._init_Model()
    # idx == -1 asks for the whole dataset as a single batch.
    self.valid_batch = self._get_batch(self.valid, -1)
    self.valid_label = get_label(self.valid_batch[0], 2)
    self.valid_dict = {
        self.ph['single_index']: self.valid_batch[1],
        self.ph['numerical_index']: self.valid_batch[2],
        self.ph['value']: self.valid_batch[-1],
        self.ph['label']: self.valid_label,
        self.train_phase: False,
    }
    # The raw-value placeholder only exists when numeric features bypass the
    # embedding; looking it up unconditionally raised KeyError otherwise.
    if 'numerical_value' in self.ph:
        self.valid_dict[self.ph['numerical_value']] = self.valid_batch[3]
    # NOTE(review): every multi-valued feature is fed the same sparse
    # index/value lists - confirm this is intended for more than one field.
    for s in Config.multi_features:
        self.valid_dict[self.ph['multi_index_%s' % s]] = self.valid_batch[4]
        self.valid_dict[self.ph['multi_value_%s' % s]] = self.valid_batch[5]
    self.global_step = []
    self.global_train_auc = []
    self.global_valid_auc = []

    self._train()
    self._save_loss()
38 |
39 |
40 |
41 |
def _init_data(self):
    """Load the preprocessed train, valid and test files as lists of lines."""
    train_lines = _get_data(Config.train_save_file)
    self.train = train_lines
    valid_lines = _get_data(Config.valid_save_file)
    self.valid = valid_lines
    test_lines = _get_data(Config.test_save_file)
    self.test = test_lines
46 |
47 |
48 |
49 |
50 |
51 |
52 | def _get_batch(self, data, idx):
53 | start = time.time()
54 | if idx == -1:
55 | batch_data = data
56 | elif (idx + 1) * Config.batch_size <= len(data):
57 | batch_data = data[idx*Config.batch_size:(idx+1)*Config.batch_size]
58 | else:
59 | batch_data = data[idx*Config.batch_size:]
60 | final_label = []
61 | final_single_index = []
62 | final_numerical_value = []
63 | final_numerical_index = []
64 | final_multi_sparse_index = []
65 | final_multi_sparse_value = []
66 | final_value = []
67 | for idx, line in enumerate(batch_data):
68 | line_index = []
69 | line_value = []
70 | line_numerical_value = []
71 | line_data = line.split(',')
72 | final_label.append(int(line_data[0]))
73 | if self.single_size:
74 | for i in range(1, 1 + self.single_size):
75 | single_pair = line_data[i].split(':')
76 | line_index.append(int(single_pair[0]))
77 | line_value.append(float(single_pair[1]))
78 | final_single_index.append(line_index)
79 | line_index = []
80 | if self.single_size + self.numerical_size:
81 | for i in range(1 + self.single_size, 1 + self.single_size + self.numerical_size):
82 | single_pair = line_data[i].split(':')
83 | if not Config.use_numerical_embedding:
84 | line_numerical_value.append(float(single_pair[1]))
85 | if float(single_pair[1]) == 0:
86 | line_index.append(int(9999))
87 | line_value.append(float(1))
88 | else:
89 | line_index.append(int(single_pair[0]))
90 | line_value.append(float(single_pair[1]))
91 | final_numerical_value.append(line_numerical_value)
92 | final_numerical_index.append(line_index)
93 | line_index = []
94 | total_length = 1 + self.single_size + self.numerical_size + self.multi_size
95 | if self.multi_size:
96 | for i in range(1 + self.single_size + self.numerical_size, total_length):
97 | single_pair = line_data[i].split(':')
98 | _multi = [int(x) for x in single_pair[0].split('|')]
99 | line_index.append(_multi)
100 | for v in _multi:
101 | final_multi_sparse_index.append([idx, idx])
102 | final_multi_sparse_value.append(v)
103 | line_value.append(float(single_pair[1]))
104 | final_value.append(line_value)
105 | end = time.time()
106 | return [final_label, final_single_index, final_numerical_index,final_numerical_value,final_multi_sparse_index, final_multi_sparse_value, final_value]
107 |
def _init_placeholder(self):
    """Create the input placeholders and store them in ``self.ph``.

    ``self.train_phase`` is kept as a separate attribute. The raw numeric
    value placeholder is only created when numeric features are not embedded.
    """
    n_fields = self.single_size + self.numerical_size + self.multi_size
    placeholders = {}
    self.ph = placeholders

    # Labels arrive as two-column rows.
    placeholders['label'] = tf.placeholder(dtype=tf.int8, shape=[None, 2])
    self.train_phase = tf.placeholder(tf.bool, name="train_phase")
    # One value per feature field.
    placeholders['value'] = tf.placeholder(dtype=tf.float32, shape=[None, n_fields])
    placeholders['single_index'] = tf.placeholder(dtype=tf.int32,
                                                  shape=[None, self.single_size])
    placeholders['numerical_index'] = tf.placeholder(dtype=tf.int32,
                                                     shape=[None, self.numerical_size])

    # Each multi-valued feature gets a sparse (indices, values) pair.
    for feature in Config.multi_features:
        placeholders['multi_index_%s' % feature] = tf.placeholder(dtype=tf.int64,
                                                                  shape=[None, 2])
        placeholders['multi_value_%s' % feature] = tf.placeholder(dtype=tf.int64,
                                                                  shape=[None])

    if not Config.use_numerical_embedding:
        placeholders['numerical_value'] = tf.placeholder(dtype=tf.float32,
                                                         shape=[None, self.numerical_size])
121 |
def _init_Variable(self):
    """Create the embedding tables and the DNN weights in ``self.vr``.

    Second-order tables are ``embedding_size`` wide and first-order tables
    are one column wide; every table has 10000 rows. Variables are created in
    the order: second-order (single, numerical, multi), first-order (single,
    numerical, multi), then ``W_i`` / ``b_i`` for each DNN layer.
    """
    def _table(name, width):
        # All embedding tables share the row count and the Glorot-uniform init.
        return tf.get_variable(name=name,
                               shape=(10000, width),
                               initializer=tf.glorot_uniform_initializer())

    self.vr = {}
    for order, width in (('second', Config.embedding_size), ('first', 1)):
        for kind in ('single', 'numerical'):
            key = '%s_%s_embedding' % (kind, order)
            self.vr[key] = _table(key, width)
        for feature in Config.multi_features:
            key = 'multi_%s_embedding_%s' % (order, feature)
            self.vr[key] = _table(key, width)

    # DNN part: when numeric features are not embedded, each of them
    # contributes one raw value instead of an embedding vector.
    if Config.use_numerical_embedding:
        input_width = self.embedding_length
    else:
        input_width = (self.embedding_length
                       - self.numerical_size * Config.embedding_size
                       + self.numerical_size)
    layer_widths = [input_width] + Config.dnn_net_size
    for i, (fan_in, fan_out) in enumerate(zip(layer_widths, layer_widths[1:])):
        self.vr['W_%d' % i] = tf.get_variable(name='W_%d' % i,
                                              shape=[fan_in, fan_out],
                                              initializer=tf.glorot_uniform_initializer())
        self.vr['b_%d' % i] = tf.get_variable(name='b_%d' % i,
                                              shape=[fan_out],
                                              initializer=tf.zeros_initializer())
155 | # output
156 |
157 | def _init_Model(self):
158 | # first embedding
159 | first_single_result = tf.reshape(tf.nn.embedding_lookup(self.vr['single_first_embedding'],
160 | self.ph['single_index']),
161 | shape=[-1, self.single_size]
162 | )
163 | first_numerical_result = tf.reshape(tf.nn.embedding_lookup(self.vr['numerical_first_embedding'],
164 | self.ph['numerical_index']),
165 | shape=[-1, self.numerical_size]
166 | )
167 | first_multi_result = []
168 | if Config.multi_features:
169 | for s in Config.multi_features:
170 | temp_multi_result = tf.nn.embedding_lookup_sparse(self.vr['multi_first_embedding_%s' % s],
171 | tf.SparseTensor(indices=self.ph['multi_index_%s' % s],
172 | values=self.ph['multi_value_%s' % s],
173 | dense_shape=(Config.batch_size,
174 | Config.embedding_size)),
175 | None,
176 | combiner="sum"
177 | )
178 | first_multi_result.append(temp_multi_result)
179 | first_multi_result = tf.concat(first_multi_result, axis=1)
180 | first_embedding_output = tf.concat([first_single_result, first_numerical_result,first_multi_result], axis=1)
181 | else:
182 | first_embedding_output = tf.concat([first_single_result, first_numerical_result], axis=1)
183 |
184 | y_first_order = tf.multiply(first_embedding_output, self.ph['value'])
185 |
186 | # second embedding
187 | second_single_result = tf.reshape(tf.nn.embedding_lookup(self.vr['single_second_embedding'],
188 | self.ph['single_index']),
189 | shape=[-1, Config.embedding_size * self.single_size]
190 | )
191 | second_numerical_result = tf.reshape(tf.nn.embedding_lookup(self.vr['numerical_second_embedding'],
192 | self.ph['numerical_index']),
193 | shape=[-1, Config.embedding_size * self.numerical_size]
194 | )
195 | if Config.multi_features:
196 | second_multi_result = []
197 | for s in Config.multi_features:
198 | temp_multi_result = tf.nn.embedding_lookup_sparse(self.vr['multi_second_embedding_%s' % s],
199 | tf.SparseTensor(indices=self.ph['multi_index_%s' % s],
200 | values=self.ph['multi_value_%s' % s],
201 | dense_shape=(Config.batch_size,
202 | Config.embedding_size)),
203 | None,
204 | combiner="sum"
205 | )
206 | second_multi_result.append(temp_multi_result)
207 | second_multi_result = tf.concat(second_multi_result, axis=1)
208 | # DNN input
209 | self.DNN_input = tf.concat([second_single_result,second_multi_result], axis=1)
210 | else:
211 | self.DNN_input = tf.concat([second_single_result], axis=1)
212 | self.middle_fm_input = tf.concat([self.DNN_input,second_numerical_result], axis=1)
213 | if Config.use_numerical_embedding:
214 | self.DNN_input = tf.concat([self.DNN_input,second_numerical_result], axis=1)
215 | else:
216 | self.DNN_input = tf.concat([self.DNN_input,self.ph['numerical_value']],axis=1)
217 | self.shape = tf.shape(self.DNN_input)
218 | # second output
219 | second_FM_input = tf.reshape(self.middle_fm_input, shape=[-1, self.single_size + self.numerical_size + self.multi_size,
220 | Config.embedding_size])
221 |
222 | summed_features_emb = tf.reduce_sum(second_FM_input, 1)
223 | summed_features_emb_square = tf.square(summed_features_emb)
224 | squared_features_emb = tf.square(second_FM_input)
225 | squared_sum_features_emb = tf.reduce_sum(squared_features_emb, 1)
226 | y_second_order = 0.5 * tf.subtract(summed_features_emb_square, squared_sum_features_emb)
227 |
228 | dnn_output = self.DNN_input
229 | # DNN output
230 | for i in range(len(Config.dnn_net_size)):
231 | self.DNN_input = tf.add(tf.matmul(self.DNN_input, self.vr['W_%d' % i]), self.vr['b_%d' % i])
232 | self.DNN_input = tf.layers.batch_normalization(self.DNN_input,training=self.train_phase)
233 | dnn_output = tf.nn.relu(self.DNN_input)
234 |
235 | # CIN
236 | D = Config.embedding_size
237 | final_result = []
238 | final_len = 0
239 | field_nums = [self.field_size]
240 | if Config.multi_features:
241 | nn_input = tf.reshape(tf.concat([second_single_result, second_multi_result], axis=1),
242 | shape=[-1, self.field_size, Config.embedding_size])
243 | else:
244 | nn_input = tf.reshape(second_single_result,
245 | shape=[-1, self.field_size, Config.embedding_size])
246 | cin_layers = [nn_input]
247 | split_tensor_0 = tf.split(nn_input, D * [1], 2)
248 | for idx, layer_size in enumerate(Config.cross_layer_size):
249 | now_tensor = tf.split(cin_layers[-1], D * [1], 2)
250 | # Hk x m
251 | dot_result_m = tf.matmul(split_tensor_0, now_tensor, transpose_b=True)
252 | dot_result_o = tf.reshape(dot_result_m, shape=[D, -1, field_nums[0] * field_nums[-1]])
253 | dot_result = tf.transpose(dot_result_o, perm=[1, 0, 2])
254 | filters = tf.get_variable(name="f_" + str(idx), shape=[1, field_nums[-1] * field_nums[0], layer_size],
255 | dtype=tf.float32)
256 | curr_out = tf.nn.conv1d(dot_result, filters=filters, stride=1, padding='VALID')
257 | b = tf.get_variable(name="f_b" + str(idx), shape=[layer_size], dtype=tf.float32,
258 | initializer=tf.zeros_initializer())
259 | curr_out = tf.nn.relu(tf.nn.bias_add(curr_out, b))
260 | curr_out = tf.transpose(curr_out, perm=[0, 2, 1])
261 | if Config.cross_direct:
262 | direct_connect = curr_out
263 | next_hidden = curr_out
264 | final_len += layer_size
265 | field_nums.append(int(layer_size))
266 | else:
267 | if idx != len(Config.cross_layer_size) - 1:
268 | next_hidden, direct_connect = tf.split(curr_out, 2 * [int(layer_size / 2)], 1)
269 | final_len += int(layer_size / 2)
270 | else:
271 | direct_connect = curr_out
272 | next_hidden = 0
273 | final_len += layer_size
274 |
275 | field_nums.append(int(layer_size / 2))
276 | final_result.append(direct_connect)
277 | cin_layers.append(next_hidden)
278 | result = tf.concat(final_result, axis=1)
279 | result = tf.reduce_sum(result, -1)
280 | w_nn_output1 = tf.get_variable(name='w_nn_output1', shape=[final_len, Config.cross_output_size],
281 | dtype=tf.float32)
282 | b_nn_output1 = tf.get_variable(name='b_nn_output1', shape=[Config.cross_output_size], dtype=tf.float32,
283 | initializer=tf.zeros_initializer())
284 | CIN_out = tf.nn.xw_plus_b(result, w_nn_output1, b_nn_output1)
285 |
286 | # final output
287 | output_length = 0
288 | to_concat = []
289 | if Config.FM_layer:
290 | to_concat.append(y_first_order)
291 | to_concat.append(y_second_order)
292 | output_length += self.field_size + Config.embedding_size
293 | if Config.CIN_layer:
294 | to_concat.append(CIN_out)
295 | output_length += Config.cross_output_size
296 | if Config.DNN_layer:
297 | to_concat.append(dnn_output)
298 | output_length += Config.dnn_net_size[-1]
299 |
300 | output = tf.concat(to_concat, axis=1)
301 |
302 | self.vr['final_w'] = tf.get_variable(name='final_w', shape=[output_length, 2],
303 | initializer=tf.glorot_uniform_initializer())
304 | self.vr['final_b'] = tf.get_variable(name='final_b', shape=[2],
305 | initializer=tf.zeros_initializer())
306 | final_logits = tf.add(tf.matmul(output, self.vr['final_w']), self.vr['final_b'])
307 | self.softmax_output = tf.nn.softmax(final_logits)
308 | self.loss = tf.reduce_mean(
309 | tf.nn.softmax_cross_entropy_with_logits(labels=self.ph['label'], logits=final_logits))
310 | self.optimizer = tf.train.AdagradOptimizer(learning_rate=Config.learning_rate).minimize(self.loss)
311 |
def _train(self):
    """Run the mini-batch training loop with periodic validation.

    Opens a ``tf.Session`` (bound to ``self.sess`` while the loop runs),
    initialises all global variables, then makes ``Config.epochs`` passes
    over ``self.train`` in batches of ``Config.batch_size``.  Every 10th
    global step the validation feed ``self.valid_dict`` is evaluated and the
    step number, training loss and validation loss are appended to
    ``self.global_step``, ``self.global_train_auc`` and
    ``self.global_valid_auc``.  Despite their names, the two ``*_auc`` lists
    receive loss values; the names are kept because ``_save_loss`` reads them.

    Fixes relative to the previous implementation:
      * The validation pass no longer fetches ``self.optimizer``; fetching it
        applied a gradient step on the validation data every 10 steps.
      * The number of batches per epoch uses ceiling division, so no extra
        batch is scheduled when the data length is an exact multiple of the
        batch size.
      * Unused timing locals and the unused training softmax fetch are gone.
    """
    print('....')
    with tf.Session() as self.sess:
        self.sess.run(tf.global_variables_initializer())
        allDataLength = len(self.train)
        # Ceiling division: one batch per started chunk of batch_size rows.
        num_batchs = (allDataLength + Config.batch_size - 1) // Config.batch_size
        global_step = 0
        print('total step:%d' % (Config.epochs * num_batchs))
        # NOTE(review): the DNN uses tf.layers.batch_normalization, whose
        # moving averages are only updated when the ops in
        # tf.GraphKeys.UPDATE_OPS are run. No such dependency is visible on
        # the optimizer op - confirm whether that is intended.
        for _ in range(Config.epochs):
            for j in range(num_batchs):
                global_step += 1
                now_batch = self._get_batch(self.train, j)
                batch_dict = {
                    self.ph['single_index']: now_batch[1],
                    self.ph['numerical_index']: now_batch[2],
                    self.ph['value']: now_batch[-1],
                    self.ph['label']: get_label(now_batch[0], 2),
                    self.ph['numerical_value']: now_batch[3],
                    self.train_phase: True,
                }
                if Config.multi_features:
                    # NOTE(review): every multi-valued feature is fed the same
                    # batch slots (4 and 5) - verify against _get_batch.
                    for s in Config.multi_features:
                        batch_dict[self.ph['multi_index_%s' % s]] = now_batch[4]
                        batch_dict[self.ph['multi_value_%s' % s]] = now_batch[5]

                # Training step: the optimizer op is fetched here only.
                _loss, _ = self.sess.run((self.loss, self.optimizer),
                                         feed_dict=batch_dict)

                if global_step % 10 == 0:
                    # Evaluation only: no optimizer op, so the validation data
                    # never updates the weights.
                    __out, __loss = self.sess.run((self.softmax_output, self.loss),
                                                  feed_dict=self.valid_dict)
                    self.global_step.append(global_step)
                    self.global_train_auc.append(_loss)
                    self.global_valid_auc.append(__loss)
                    print('step:',global_step,'train loss:',_loss,'valid loss:',__loss,'valid_auc:',auc_score(__out,get_label(self.valid_batch[0],2),2))
350 |
351 |
def _save_loss(self):
    """Write the recorded training history to ``DeepFM_loss_result.csv``.

    The CSV has one row per recorded step with the columns ``step``,
    ``train_auc`` and ``valid_auc``, taken from ``self.global_step``,
    ``self.global_train_auc`` and ``self.global_valid_auc`` respectively.
    The file is written to the current working directory without an index
    column.
    """
    history = {
        'step': self.global_step,
        'train_auc': self.global_train_auc,
        'valid_auc': self.global_valid_auc,
    }
    pd.DataFrame(history).to_csv('DeepFM_loss_result.csv', index=False)
359 |
if __name__ == '__main__':
    # Script entry point: build the model object when run directly.
    # NOTE(review): nothing else is called here, so presumably
    # xDeepFM.__init__ drives graph construction and training - confirm.
    xdeep = xDeepFM()
362 |
--------------------------------------------------------------------------------