├── .DS_Store
├── Config.py
├── README.md
├── __pycache__
│   ├── Config.cpython-36.pyc
│   └── tools.cpython-36.pyc
├── data-pre.py
├── data_conf.txt
├── ex_data
│   ├── dataset1.csv
│   ├── dataset1.txt
│   ├── dataset2.csv
│   ├── dataset2.txt
│   ├── dataset3.csv
│   └── dataset3.txt
├── img
│   ├── .DS_Store
│   ├── pic1.png
│   └── pic2.png
├── paper
│   ├── 1.png
│   ├── 10.png
│   ├── 11.png
│   ├── 12.png
│   ├── 13.png
│   ├── 2.png
│   ├── 3.png
│   ├── 4.png
│   ├── 6.png
│   ├── 7.png
│   ├── 8.png
│   ├── 9.png
│   └── README.md
├── plot.py
├── tools.py
└── xDeepFM.py
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/batch-norm/xDeepFM/62a2f144bcafd467772cc791b8699b714c4b6e50/.DS_Store
--------------------------------------------------------------------------------
/Config.py:
--------------------------------------------------------------------------------
1 | """
2 | author: yang yiqing, 2018-07-13 15:57:50
3 | """
4 | 
5 | # file
6 | train_file = 'ex_data/dataset1.csv'
7 | valid_file = 'ex_data/dataset2.csv'
8 | test_file = 'ex_data/dataset3.csv'
9 | 
10 | train_save_file = 'ex_data/dataset1.txt'
11 | valid_save_file = 'ex_data/dataset2.txt'
12 | test_save_file = 'ex_data/dataset3.txt'
13 | 
14 | label_name = 'label'
15 | 
16 | # features
17 | numeric_features = ['all_launch_count', 'last_launch', 'all_video_count', 'last_video', 'all_video_day',
18 |                     'all_action_count', 'last_action',
19 |                     'all_action_day', 'register_day']
20 | single_features = ['register_type', 'device_type']
21 | multi_features = []
22 | 
23 | num_embedding = True
24 | single_feature_frequency = 10
25 | multi_feature_frequency = 0
26 | 
27 | # model
28 | 
29 | FM_layer = True
30 | DNN_layer = True
31 | CIN_layer = False
32 | 
33 | use_numerical_embedding = False
34 | 
35 | 
36 | embedding_size = 16
37 | 
38 | dnn_net_size = [128,64,32]
39 | cross_layer_size = [10,10,10]
40 | cross_direct = False
41 | cross_output_size = 1
42 | 
43 | # train
44 | batch_size = 4096
45 | epochs = 4000
46 | learning_rate = 0.01
47 | 
48 | 
49 | 
50 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # xDeepFM
2 | 
3 | ## Introduction
4 | 
5 | **- Original paper: "xDeepFM: Combining Explicit and Implicit Feature Interactions for Recommender Systems"**
6 | 
7 | **- Paper link: [xDeepFM](https://arxiv.org/pdf/1803.05170.pdf.)**
8 | 
9 | **- Paper notes: [paper notes](https://github.com/batch-norm/xDeepFM/tree/master/paper)**
10 | 
11 | This repository reproduces the model proposed in the paper.
12 | 
13 | ## Model
14 | 
15 | **Model architecture = FM + CIN + DNN**
16 | 
17 | ![none](img/pic1.png)
18 | 
19 | ## Usage
20 | 
21 | **1. Data preparation**
22 | 
23 | + The training and test sets must be .csv files
24 | + Numeric features can either be **mapped to embeddings** or **fed to the DNN directly as raw values**
25 | + **Multi-valued categorical features** are supported; pooling can be configured as sum or mean, and the separator must be "|"
26 | + **Categorical features** must be converted with a LabelEncoder beforehand
27 | 
28 | All options live in **Config.py**; the examples under ex_data can be used as a reference.
29 | 
30 | **Example of a converted training record:**
31 | 
32 | ```
33 | 1,18:1,30:1,0:0.25,2:0.8125,4:0.0,6:0.0,8:0.0,10:0.006630292147247738,12:0.8125,14:0.25,16:0.5625,
34 | ```
35 | 
36 | **2. Model training**
37 | 
38 | + First declare the single-valued categorical, multi-valued categorical and numeric features in Config
39 | + Default activation "relu", default optimizer "Adagrad"
40 | + Default DNN layout [128,64,32]
41 | + Default CIN feature-map sizes [10,10,10], output size [1]
42 | + DNN + CIN + FM are used by default; each component can be toggled in Config
43 | + Default minimum frequency for building the vocabulary: 10
44 | 
45 | **3. Experiments**
46 | 
47 | **- Batch_size: 4096, epochs: 2000**
48 | 
49 | **- Metric: "logloss"**
50 | 
51 | **- Data: preliminary-round data of the "2018 China University Big Data Challenge"**
52 | 
53 | ![none](img/pic2.png)
54 | 
55 | ## Summary
56 | 
57 | Starting from DeepFM, the model adds a **CIN component (Compressed Interaction Network)**, which augments the original structure with vector-wise interactions of explicitly bounded degree.
58 | 
59 | + **Pros**: stronger expressive power — it can discover vector-wise interaction features and reaches higher accuracy
60 | + **Cons**: training becomes much slower
61 | 
62 | yyq 2018-07-21 13:47:42
63 | 
64 | 
65 | 
--------------------------------------------------------------------------------
/__pycache__/Config.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/batch-norm/xDeepFM/62a2f144bcafd467772cc791b8699b714c4b6e50/__pycache__/Config.cpython-36.pyc
--------------------------------------------------------------------------------
/__pycache__/tools.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/batch-norm/xDeepFM/62a2f144bcafd467772cc791b8699b714c4b6e50/__pycache__/tools.cpython-36.pyc
--------------------------------------------------------------------------------
/data-pre.py:
--------------------------------------------------------------------------------
1 | """
2 | author: yang yiqing, 2018-07-13 16:04:16
3 | - Data preprocessing: numeric features must be float, categorical features int, and multi-valued categorical features a str joined with "|", e.g. "1|2|3"
4 | - Missing values are not handled yet
5 | 
6 | """
7 | 
8 | import Config
9 | import pandas as pd
10 | import numpy as np
11 | from sklearn.preprocessing import MinMaxScaler
12 | 
13 | 
14 | class Parse(object):
15 |     def __init__(self):
16 |         self.global_emb_idx = 0
17 |         self.label_num = 0
18 |         self.single_num = 0
19 |         self.multi_num = 0
20 |         self.train = pd.read_csv(Config.train_file, index_col=0)
21 |         self.valid = pd.read_csv(Config.valid_file, index_col=0)
22 |         self.test = pd.read_csv(Config.test_file, index_col=0)
23 |         scalar = MinMaxScaler()
24 |         all_data = pd.concat([self.train, self.valid, self.test])
25 |         print('transform data...')
26 |         for s in Config.numeric_features:
27 |             scalar.fit(all_data[s].values.reshape(-1,1))
28 |             self.train[s] = scalar.transform(self.train[s].values.reshape(-1,1))
29 |             self.valid[s] = scalar.transform(self.valid[s].values.reshape(-1,1))
30 |             self.test[s] = scalar.transform(self.test[s].values.reshape(-1,1))
31 |         self.check()
32 |         self.num_features = Config.numeric_features
33 |         self.single_features = Config.single_features
34 |         self.multi_features = Config.multi_features
35 |         self.backup_dict = {}
36 | 
37 |         self.num_dict = {}
38 |         self.single_dict = {}
39 |         self.multi_dict = {}
40 |         self.get_dict()
41 |         self.trans_data(self.train, Config.train_save_file)
42 |         self.trans_data(self.valid, Config.valid_save_file)
43 |         self.trans_data(self.test, Config.test_save_file)
44 |         self.save_conf()
45 | 
46 | 
47 |     def get_dict(self):
48 |         print('prepare dict...')
49 |         self.global_emb_idx = 0
50 |         if self.num_features and Config.num_embedding:
51 |             for s in self.num_features:
52 |                 self.num_dict[s] = self.global_emb_idx
53 |                 self.global_emb_idx += 1
54 |                 # for NaN
55 |                 self.backup_dict[s] = self.global_emb_idx
56 |                 self.global_emb_idx += 1
57 |             #print(self.num_dict)
58 | 
59 |         if self.single_features:
60 |             for s in self.single_features:
61 |                 # every field
62 |                 frequency_dict = {}
63 |                 current_dict = {}
64 |                 values = pd.concat([self.train, self.valid, self.test])[s]
65 |                 for v in values:
66 |                     if v in frequency_dict:
67 |                         frequency_dict[v] += 1
68 |                     else:
69 |                         frequency_dict[v] = 1
70 |                 for k, v in frequency_dict.items():
71 |                     if v > Config.single_feature_frequency:
72 |                         current_dict[k] = self.global_emb_idx
73 |                         self.global_emb_idx += 1
74 |                 self.single_dict[s] = current_dict
75 |                 self.backup_dict[s] = self.global_emb_idx
76 |                 # for NaN and low frequency word
77 |                 # reserve 2 embedding slots per field to handle out-of-vocabulary and missing values
78 |                 self.global_emb_idx += 1
79 |             #print(self.single_dict)
80 | 
81 |         if self.multi_features:
82 |             for s in self.multi_features:
83 |                 # every field
84 |                 frequency_dict = {}
85 |                 current_dict = {}
86 |                 values = pd.concat([self.train, self.valid, self.test])[s]
87 |                 for vs in values:
88 |                     for v in vs.split('|'):
89 |                         v = int(v)
90 |                         if v in frequency_dict:
91 |                             frequency_dict[v] += 1
92 |                         else:
93 |                             frequency_dict[v] = 1
94 |                 for k, v in frequency_dict.items():
95 |                     if v > Config.multi_feature_frequency:
96 |                         current_dict[k] = self.global_emb_idx
97 |                         self.global_emb_idx += 1
98 |                 self.multi_dict[s] = current_dict
99 |                 self.backup_dict[s] = self.global_emb_idx
100 |                 # for NaN and low frequency word
101 |                 # reserve 2 embedding slots per field to handle out-of-vocabulary and missing values
102 |                 # self.global_emb_idx += 1
103 |             #print(self.multi_dict)
104 | 
105 |     def trans_data(self, data, save_file):
106 |         print('trans data...' + save_file)
107 |         # label index1:value1 index2:value2
108 | 
109 |         with open(save_file, 'w') as f:
110 |             # label, index : value
111 |             def write_to_file(line):
112 |                 label = line[Config.label_name]
113 |                 f.write(str(label) + ',')
114 |                 self.label_num += 1
115 |                 for s in self.single_features:
116 |                     now_v = line[s]
117 |                     if now_v in self.single_dict[s]:
118 |                         now_idx = self.single_dict[s][now_v]
119 |                     else:
120 |                         now_idx = self.backup_dict[s]
121 |                     f.write(str(now_idx) + ':' + str(1) + ',')
122 |                     self.single_num += 1
123 |                 for s in self.num_features:
124 |                     now_v = line[s]
125 |                     f.write(str(self.num_dict[s]) + ':' + str(now_v) + ',')
126 |                     self.single_num += 1
127 |                 for s in self.multi_features:
128 |                     now_v = line[s]
129 |                     if '|' not in now_v:
130 |                         idxs = [now_v]
131 |                     else:
132 |                         idxs = now_v.split('|')
133 |                     idxs = [x for x in idxs if int(x) in self.multi_dict[s]]
134 |                     if idxs:
135 |                         f.write(str('|'.join(idxs)) + ':' + str(1) + ',')
136 |                     else:
137 |                         f.write(str(self.backup_dict[s]) + ':' + str(1) + ',')
138 |                     self.multi_num += 1
139 | 
140 |                 f.write('\n')
141 | 
142 | 
143 |             data.apply(lambda x: write_to_file(x), axis=1)
144 | 
145 |     def check(self):
146 |         if self.train.shape[1] == self.valid.shape[1] == self.test.shape[1]:
147 |             return True
148 |         else:
149 |             print('error , all dataset must have same shape')
150 | 
151 |     # save preprocessing metadata: total embedding size, number of single-valued, numeric and multi-valued features
152 |     def save_conf(self):
153 |         with open('data_conf.txt', 'w') as f:
154 |             f.write(str(self.global_emb_idx) + '\t')
155 |             f.write(str(len(self.single_features)) + '\t')
156 |             f.write(str(len(self.num_features)) + '\t')
157 |             f.write(str(len(self.multi_features)))
158 | 
159 | 
160 | 
161 | if __name__ == '__main__':
162 |     pa = Parse()
163 | 
--------------------------------------------------------------------------------
/data_conf.txt:
--------------------------------------------------------------------------------
1 | 730 2 9 0
--------------------------------------------------------------------------------
/img/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/batch-norm/xDeepFM/62a2f144bcafd467772cc791b8699b714c4b6e50/img/.DS_Store
--------------------------------------------------------------------------------
/img/pic1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/batch-norm/xDeepFM/62a2f144bcafd467772cc791b8699b714c4b6e50/img/pic1.png
--------------------------------------------------------------------------------
/img/pic2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/batch-norm/xDeepFM/62a2f144bcafd467772cc791b8699b714c4b6e50/img/pic2.png
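
For reference, the converted .txt files written by data-pre.py follow the `label,index:value,...` layout shown in the README, and data_conf.txt stores the four numbers that xDeepFM.py later reads back (total embedding size, then the counts of single-valued, numeric and multi-valued fields — here `730 2 9 0`). Below is a minimal, hypothetical parsing sketch — it is not part of the repository — for inspecting such a record outside the model code:

```python
# Illustrative sketch only (not repository code): read one converted record back.
def parse_record(line):
    """Split a 'label,index:value,...' line into label, embedding indices and feature values."""
    parts = line.strip().strip(',').split(',')
    label = int(parts[0])
    indices, values = [], []
    for pair in parts[1:]:
        idx, value = pair.split(':')
        indices.append(idx)           # embedding index ('|'-joined for multi-valued fields)
        values.append(float(value))   # 1 for categorical fields, min-max scaled value for numeric ones
    return label, indices, values

label, idx, val = parse_record(
    '1,18:1,30:1,0:0.25,2:0.8125,4:0.0,6:0.0,8:0.0,10:0.006630292147247738,12:0.8125,14:0.25,16:0.5625,')
print(label, idx[:3], val[:3])        # 1 ['18', '30', '0'] [1.0, 1.0, 0.25]
```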
-------------------------------------------------------------------------------- /paper/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/batch-norm/xDeepFM/62a2f144bcafd467772cc791b8699b714c4b6e50/paper/1.png -------------------------------------------------------------------------------- /paper/10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/batch-norm/xDeepFM/62a2f144bcafd467772cc791b8699b714c4b6e50/paper/10.png -------------------------------------------------------------------------------- /paper/11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/batch-norm/xDeepFM/62a2f144bcafd467772cc791b8699b714c4b6e50/paper/11.png -------------------------------------------------------------------------------- /paper/12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/batch-norm/xDeepFM/62a2f144bcafd467772cc791b8699b714c4b6e50/paper/12.png -------------------------------------------------------------------------------- /paper/13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/batch-norm/xDeepFM/62a2f144bcafd467772cc791b8699b714c4b6e50/paper/13.png -------------------------------------------------------------------------------- /paper/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/batch-norm/xDeepFM/62a2f144bcafd467772cc791b8699b714c4b6e50/paper/2.png -------------------------------------------------------------------------------- /paper/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/batch-norm/xDeepFM/62a2f144bcafd467772cc791b8699b714c4b6e50/paper/3.png -------------------------------------------------------------------------------- /paper/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/batch-norm/xDeepFM/62a2f144bcafd467772cc791b8699b714c4b6e50/paper/4.png -------------------------------------------------------------------------------- /paper/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/batch-norm/xDeepFM/62a2f144bcafd467772cc791b8699b714c4b6e50/paper/6.png -------------------------------------------------------------------------------- /paper/7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/batch-norm/xDeepFM/62a2f144bcafd467772cc791b8699b714c4b6e50/paper/7.png -------------------------------------------------------------------------------- /paper/8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/batch-norm/xDeepFM/62a2f144bcafd467772cc791b8699b714c4b6e50/paper/8.png -------------------------------------------------------------------------------- /paper/9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/batch-norm/xDeepFM/62a2f144bcafd467772cc791b8699b714c4b6e50/paper/9.png -------------------------------------------------------------------------------- /paper/README.md: 
--------------------------------------------------------------------------------
1 | # XDeepFM: Combining Explicit and Implicit Feature Interactions for Recommender Systems
2 | 
3 | # ABSTRACT
4 | 
5 | 1. Models such as DeepFM and Wide&Deep can automatically learn implicit high-order feature interactions and combine them with low-order ones, **but their high-order interactions are learned at the bit-wise level**. This paper proposes a **Compressed Interaction Network (CIN)** that learns explicit feature interactions at the vector-wise level; CIN shares some characteristics with CNNs and RNNs, and the full model is named "eXtreme Deep Factorization Machine (xDeepFM)".
6 | 
7 | 2. The proposed model has two advantages:
8 | 
9 | + it learns explicit high-order interactions with a clearly bounded degree
10 | + it also learns implicit low- and high-order interactions
11 | 
12 | My reading is that "implicit" vs. "explicit" here refers to whether the degree of the learned interactions is well defined; the notes below use the terms in that sense.
13 | 
14 | ## INTRODUCTION
15 | 
16 | 1. After a brief review of single-valued and multi-valued categorical features, the paper lists three **drawbacks of hand-crafted cross features:**
17 | 
18 | + mining high-quality interaction features requires deep domain knowledge and a lot of trial and error, which is time-consuming
19 | + in large recommender systems the raw features are massive, so crafting cross features by hand is practically impossible
20 | + hand-crafting cannot discover interactions that are invisible to the human eye
21 | 
22 | 2. It then reviews the **classic FM model**, which extracts cross features as inner products of latent vectors; extended FMs can model arbitrary high-order interactions, but their **main drawback is:**
23 | 
24 | + they learn all cross features, which inevitably includes useless combinations; another paper points out that useless cross features introduce noise and degrade performance.
25 | 
26 | 3. It introduces the DNN-based hybrid "Factorisation-machine supported Neural Network **(FNN)**", which feeds pre-trained field embeddings into a DNN.
27 | 
28 | 4. It introduces **PNN** (Product-based Neural Network), which inserts a product layer between the embedding layer and the DNN input and does not depend on a pre-trained FM.
29 | 
30 | 5. FNN and PNN both ignore low-order interactions; **Wide&Deep and DeepFM** address this with hybrid architectures, but they still have drawbacks:
31 | 
32 | + the high-order features they learn are implicit — there is no formula from which the degree of the learned cross features can be deduced
33 | + moreover, their DNN part learns at the bit-wise level, whereas the classic FM framework works at the vector-wise level
34 | 
35 | 6. The method proposed here builds on the **DCN (Deep & Cross Network)** model and aims to capture **cross features of bounded degree** efficiently.
36 | 
37 | # PRELIMINARIES
38 | 
39 | ## Embedding Layer
40 | 
41 | Covers the basics of embedding "univalent" and "multivalent" fields; not summarised here:
42 | 
43 | 
44 | 
45 | ## Implicit High-order Interactions
46 | 
47 | Forward propagation:
48 | 
49 | 
50 | 
51 | This architecture works at the bit-wise level, meaning that **even within the same field embedding, different elements influence each other.**
52 | 
53 | PNN and DeepFM improve on this: besides the DNN component, **they add a two-way interaction layer, so the architecture has both vector-wise and bit-wise components**. The difference between them is that DeepFM connects the product layer directly to the output, **while PNN places the product layer between the embedding layer and the DNN**.
54 | 
55 | 
56 | 
57 | ## Explicit High-order Interactions
58 | 
59 | This part describes the Cross Network (CrossNet), the model this paper mainly builds on; its architecture is shown below:
60 | 
61 | 
62 | 
63 | Its goal is to construct high-order interactions explicitly. Unlike the fully connected layers of a feed-forward DNN, each hidden layer is computed as:
64 | 
65 | 
66 | 
67 | The derivation shows that **every hidden layer is a scalar multiple of x0.** This does not mean the hidden layers are linear in x0; it means that because the raw x0 enters the computation at every layer, the output is very sensitive to x0. **Its drawbacks are:**
68 | 
69 | + the output of CrossNet has a restricted form, namely a scalar multiple of x0
70 | + the interactions are still bit-wise
71 | 
72 | # OUR PROPOSED MODEL
73 | 
74 | ## Compressed Interaction Network
75 | 
76 | The paper designs a new cross network, the **Compressed Interaction Network (CIN)**, with **three design goals:**
77 | 
78 | + interactions are applied at the vector-wise level (the main improvement over CrossNet)
79 | + high-order interactions are explicit
80 | + the network complexity does not grow out of hand as the interaction degree increases
81 | 
82 | **Some notation used in CIN:**
83 | 
84 | Since CIN works at the vector-wise level, each unit is a vector: the field embedding output is an m x D matrix (D: embedding size, m: number of fields), and the k-th CIN layer is an Hk x D matrix (Hk is the number of feature-map vectors in that layer, with H0 = m). **The h-th row of the k-th CIN layer is computed as:**
85 | 
86 | 
87 | 
88 | This is fairly intuitive; ○ denotes the Hadamard product:
89 | 
90 | 
91 | 
92 | **Like CrossNet, the k-th layer depends only on the (k-1)-th layer and the 0-th layer, so the interactions are explicit and their degree grows with network depth (the same as CrossNet); the formula also makes it clear that the model is vector-wise:**
93 | 
94 | 
95 | 
96 | 
97 | 
98 | If the formula is hard to digest, the following figures help:
99 | 
100 | 
101 | 
102 | Figures (a) and (b) show how the next hidden layer (Hk+1 x D) is produced from the current hidden layer (Hk x D) and X^0 (m x D). The figures are drawn this way to show why the model has a CNN flavour: for each embedding dimension i (0 <= i < D), the i-th columns of X^0 and X^k form an outer product (a matrix multiplication) giving an Hk x m matrix, and each filter of the next layer then compresses these D matrices into one feature map.
103 | 
104 | 
105 | 
106 | 
107 | 
108 | ## CIN Analysis
109 | 
110 | The paper analyses CIN in terms of space complexity, time complexity and polynomial approximation; only the parameter count is summarised here.
111 | 
112 | As the formula shows:
113 | 
114 | 
115 | 
116 | computing the h-th feature map of the k-th layer takes an Hk-1 x m weight matrix, so the k-th layer has Hk x Hk-1 x m parameters in total, independent of the embedding size D.
117 | 
118 | 
119 | 
120 | 
121 | 
122 | 
123 | 
124 | 
125 | 
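
To make the vector-wise computation above concrete, here is a small illustrative NumPy sketch of a single CIN layer (my own addition to these notes, not code from the paper or from xDeepFM.py); shapes follow the notation above: m fields, embedding size D, Hk feature maps in the previous layer.

```python
import numpy as np

# One CIN layer, written naively for clarity (no batch dimension).
# x0: (m, D) field embeddings, xk: (Hk, D) previous CIN layer,
# W:  (H_next, Hk, m) weights. Returns the next layer, shape (H_next, D).
def cin_layer(x0, xk, W):
    H_next = W.shape[0]
    D = x0.shape[1]
    out = np.zeros((H_next, D))
    for d in range(D):                        # vector-wise: one embedding dimension at a time
        z = np.outer(xk[:, d], x0[:, d])      # (Hk, m) interaction map for dimension d
        for h in range(H_next):
            out[h, d] = np.sum(W[h] * z)      # "compress" the map with the h-th filter
    return out

m, D, H1 = 4, 3, 5
x0 = np.random.randn(m, D)
x1 = cin_layer(x0, x0, np.random.randn(H1, m, m))  # first layer: H0 = m
print(x1.shape)                                    # (5, 3)
```

The TensorFlow code in xDeepFM.py performs the equivalent computation for a whole batch: it splits the stacked embeddings along the embedding dimension, builds the outer products with a matmul, flattens each Hk x m map, and applies a width-1 conv1d as the compression step.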
126 | The final model structure is as follows:
127 | 
128 | 
129 | 
130 | # EXPERIMENTS
131 | 
132 | This section is mostly a long series of experiments in which the proposed model comes out on top; the useful takeaways are:
133 | 
134 | + for this kind of high-dimensional sparse data, FM-based models such as DeepFM, Wide&Deep and PNN beat plain LR by a large margin
135 | + a hybrid model is not automatically better, but a DNN component on its own usually performs poorly
136 | + these hybrid models for high-dimensional sparse features usually reach their best results with fairly shallow networks, e.g. 2-3 layers
137 | 
138 | 
139 | 
140 | 
141 | 
142 | 
143 | 
144 | 
145 | 
146 | 
--------------------------------------------------------------------------------
/plot.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import pandas as pd
3 | import numpy as np
4 | 
5 | # data = pd.read_csv('DNN_loss_result.csv')
6 | # plt.title('DNN')
7 | # plt.xlabel('step')
8 | # plt.ylabel('loss')
9 | #
10 | # plt.plot(data.step,data.valid_auc,'b',label='valid_loss')
11 | # plt.plot(data.step,data.train_auc,'g',label='train_loss')
12 | # plt.legend(bbox_to_anchor=[0.3, 0.4])
13 | # plt.yticks(np.linspace(0.45,0.7,20))
14 | # plt.grid()
15 | # plt.show()
16 | 
17 | data1 = pd.read_csv('DNN_loss_result.csv')
18 | data2 = pd.read_csv('FM_loss_result.csv')
19 | data3 = pd.read_csv('xDeepFM_loss_result.csv')
20 | data4 = pd.read_csv('DeepFM_loss_result.csv')
21 | 
22 | plt.title('Model loss')
23 | plt.xlabel('step')
24 | plt.ylabel('loss')
25 | 
26 | plt.plot(data1.step,data1.valid_auc,'b',label='DNN loss')
27 | plt.plot(data2.step,data2.valid_auc,'g',label='FM loss')
28 | plt.plot(data3.step,data3.valid_auc,'r',label='xDeepFM loss')
29 | plt.plot(data4.step[10:],data4.valid_auc[10:],'y',label='DeepFM loss')
30 | plt.legend(bbox_to_anchor=[0.3, 0.4])
31 | plt.yticks(np.linspace(0.45,0.7,20))
32 | plt.grid()
33 | plt.show()
--------------------------------------------------------------------------------
/tools.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import tensorflow as tf
4 | from sklearn.metrics import roc_auc_score
5 | 
6 | def auc_score(preds, labels, label_size):
7 |     preds = [x[label_size - 1] for x in preds]
8 |     labels = [x[label_size - 1] for x in labels]
9 |     roc_score = roc_auc_score(labels, preds)
10 |     return roc_score
11 | 
12 | 
13 | def _get_data(data_dir):
14 |     data = []
15 |     with open(data_dir, 'r') as f:
16 |         line = f.readline()
17 |         while line:
18 |             data.append(line)
19 |             line = f.readline()
20 |     return data
21 | 
22 | 
23 | def _get_conf():
24 |     with open('data_conf.txt', 'r') as f:
25 |         line = f.readline()
26 |         line = line.split('\t')
27 |     return int(line[0]), int(line[1]), int(line[2]), int(line[3])
28 | 
29 | def get_label(labels, label_size):
30 |     final_label = []
31 |     for v in labels:
32 |         temp_label = [0] * label_size
33 |         temp_label[v] = 1
34 |         final_label.append(temp_label)
35 |     return final_label
36 | 
--------------------------------------------------------------------------------
/xDeepFM.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import tensorflow as tf
4 | import Config
5 | from tools import _get_data, _get_conf, get_label,auc_score
6 | import random
7 | import time
8 | 
9 | class xDeepFM(object):
10 |     def __init__(self):
11 |         self.total_emb, self.single_size, self.numerical_size, self.multi_size = _get_conf()
12 |         self.field_size = self.single_size + self.numerical_size + self.multi_size
13 |         self.embedding_length = self.field_size * Config.embedding_size
14 |         self._init_data()
15 |         self._init_placeholder()
16 |         self._init_Variable()
17 |         self._init_Model()
18 |         self.valid_batch = self._get_batch(self.valid, -1)
19 |         self.valid_label = 
get_label(self.valid_batch[0], 2) 20 | self.valid_dict = { 21 | self.ph['single_index']: self.valid_batch[1], 22 | self.ph['numerical_index']: self.valid_batch[2], 23 | self.ph['numerical_value']: self.valid_batch[3], 24 | self.ph['value']: self.valid_batch[-1], 25 | self.ph['label']: self.valid_label, 26 | self.train_phase: False 27 | } 28 | if Config.multi_features: 29 | for idx, s in enumerate(Config.multi_features): 30 | self.valid_dict[self.ph['multi_index_%s' % s]] = self.valid_batch[4] 31 | self.valid_dict[self.ph['multi_value_%s' % s]] = self.valid_batch[5] 32 | self.global_step = [] 33 | self.global_train_auc = [] 34 | self.global_valid_auc = [] 35 | 36 | self._train() 37 | self._save_loss() 38 | 39 | 40 | 41 | 42 | def _init_data(self): 43 | self.train = _get_data(Config.train_save_file) 44 | self.valid = _get_data(Config.valid_save_file) 45 | self.test = _get_data(Config.test_save_file) 46 | 47 | 48 | 49 | 50 | 51 | 52 | def _get_batch(self, data, idx): 53 | start = time.time() 54 | if idx == -1: 55 | batch_data = data 56 | elif (idx + 1) * Config.batch_size <= len(data): 57 | batch_data = data[idx*Config.batch_size:(idx+1)*Config.batch_size] 58 | else: 59 | batch_data = data[idx*Config.batch_size:] 60 | final_label = [] 61 | final_single_index = [] 62 | final_numerical_value = [] 63 | final_numerical_index = [] 64 | final_multi_sparse_index = [] 65 | final_multi_sparse_value = [] 66 | final_value = [] 67 | for idx, line in enumerate(batch_data): 68 | line_index = [] 69 | line_value = [] 70 | line_numerical_value = [] 71 | line_data = line.split(',') 72 | final_label.append(int(line_data[0])) 73 | if self.single_size: 74 | for i in range(1, 1 + self.single_size): 75 | single_pair = line_data[i].split(':') 76 | line_index.append(int(single_pair[0])) 77 | line_value.append(float(single_pair[1])) 78 | final_single_index.append(line_index) 79 | line_index = [] 80 | if self.single_size + self.numerical_size: 81 | for i in range(1 + self.single_size, 1 + self.single_size + self.numerical_size): 82 | single_pair = line_data[i].split(':') 83 | if not Config.use_numerical_embedding: 84 | line_numerical_value.append(float(single_pair[1])) 85 | if float(single_pair[1]) == 0: 86 | line_index.append(int(9999)) 87 | line_value.append(float(1)) 88 | else: 89 | line_index.append(int(single_pair[0])) 90 | line_value.append(float(single_pair[1])) 91 | final_numerical_value.append(line_numerical_value) 92 | final_numerical_index.append(line_index) 93 | line_index = [] 94 | total_length = 1 + self.single_size + self.numerical_size + self.multi_size 95 | if self.multi_size: 96 | for i in range(1 + self.single_size + self.numerical_size, total_length): 97 | single_pair = line_data[i].split(':') 98 | _multi = [int(x) for x in single_pair[0].split('|')] 99 | line_index.append(_multi) 100 | for v in _multi: 101 | final_multi_sparse_index.append([idx, idx]) 102 | final_multi_sparse_value.append(v) 103 | line_value.append(float(single_pair[1])) 104 | final_value.append(line_value) 105 | end = time.time() 106 | return [final_label, final_single_index, final_numerical_index,final_numerical_value,final_multi_sparse_index, final_multi_sparse_value, final_value] 107 | 108 | def _init_placeholder(self): 109 | self.ph = {} 110 | self.ph['label'] = tf.placeholder(dtype=tf.int8, shape=[None, 2]) 111 | self.train_phase = tf.placeholder(tf.bool, name="train_phase") 112 | self.ph['value'] = tf.placeholder(dtype=tf.float32, 113 | shape=[None, self.single_size + self.numerical_size + self.multi_size]) 114 | 
self.ph['single_index'] = tf.placeholder(dtype=tf.int32, shape=[None, self.single_size]) 115 | self.ph['numerical_index'] = tf.placeholder(dtype=tf.int32, shape=[None, self.numerical_size]) 116 | for s in Config.multi_features: 117 | self.ph['multi_index_%s' % s] = tf.placeholder(dtype=tf.int64, shape=[None, 2]) 118 | self.ph['multi_value_%s' % s] = tf.placeholder(dtype=tf.int64, shape=[None]) 119 | if not Config.use_numerical_embedding: 120 | self.ph['numerical_value'] = tf.placeholder(dtype=tf.float32,shape=[None,self.numerical_size]) 121 | 122 | def _init_Variable(self): 123 | self.vr = {} 124 | self.vr['single_second_embedding'] = tf.get_variable(name='single_second_embedding', 125 | shape=(10000, Config.embedding_size), 126 | initializer=tf.glorot_uniform_initializer()) 127 | self.vr['numerical_second_embedding'] = tf.get_variable(name='numerical_second_embedding', 128 | shape=(10000, Config.embedding_size), 129 | initializer=tf.glorot_uniform_initializer()) 130 | for s in Config.multi_features: 131 | self.vr['multi_second_embedding_%s' % s] = tf.get_variable(name='multi_second_embedding_%s' % s, 132 | shape=(10000, Config.embedding_size), 133 | initializer=tf.glorot_uniform_initializer()) 134 | 135 | self.vr['single_first_embedding'] = tf.get_variable(name='single_first_embedding', 136 | shape=(10000, 1), 137 | initializer=tf.glorot_uniform_initializer()) 138 | self.vr['numerical_first_embedding'] = tf.get_variable(name='numerical_first_embedding', 139 | shape=(10000, 1), 140 | initializer=tf.glorot_uniform_initializer()) 141 | for s in Config.multi_features: 142 | self.vr['multi_first_embedding_%s' % s] = tf.get_variable(name='multi_first_embedding_%s' % s, 143 | shape=(10000, 1), 144 | initializer=tf.glorot_uniform_initializer()) 145 | # DNN part 146 | if Config.use_numerical_embedding: 147 | dnn_net = [self.embedding_length] + Config.dnn_net_size 148 | else: 149 | dnn_net = [self.embedding_length - self.numerical_size * Config.embedding_size + self.numerical_size] + Config.dnn_net_size 150 | for i in range(len(Config.dnn_net_size)): 151 | self.vr['W_%d' % i] = tf.get_variable(name='W_%d' % i, shape=[dnn_net[i], dnn_net[i + 1]], 152 | initializer=tf.glorot_uniform_initializer()) 153 | self.vr['b_%d' % i] = tf.get_variable(name='b_%d' % i, shape=[dnn_net[i + 1]], 154 | initializer=tf.zeros_initializer()) 155 | # output 156 | 157 | def _init_Model(self): 158 | # first embedding 159 | first_single_result = tf.reshape(tf.nn.embedding_lookup(self.vr['single_first_embedding'], 160 | self.ph['single_index']), 161 | shape=[-1, self.single_size] 162 | ) 163 | first_numerical_result = tf.reshape(tf.nn.embedding_lookup(self.vr['numerical_first_embedding'], 164 | self.ph['numerical_index']), 165 | shape=[-1, self.numerical_size] 166 | ) 167 | first_multi_result = [] 168 | if Config.multi_features: 169 | for s in Config.multi_features: 170 | temp_multi_result = tf.nn.embedding_lookup_sparse(self.vr['multi_first_embedding_%s' % s], 171 | tf.SparseTensor(indices=self.ph['multi_index_%s' % s], 172 | values=self.ph['multi_value_%s' % s], 173 | dense_shape=(Config.batch_size, 174 | Config.embedding_size)), 175 | None, 176 | combiner="sum" 177 | ) 178 | first_multi_result.append(temp_multi_result) 179 | first_multi_result = tf.concat(first_multi_result, axis=1) 180 | first_embedding_output = tf.concat([first_single_result, first_numerical_result,first_multi_result], axis=1) 181 | else: 182 | first_embedding_output = tf.concat([first_single_result, first_numerical_result], axis=1) 183 | 184 | 
y_first_order = tf.multiply(first_embedding_output, self.ph['value']) 185 | 186 | # second embedding 187 | second_single_result = tf.reshape(tf.nn.embedding_lookup(self.vr['single_second_embedding'], 188 | self.ph['single_index']), 189 | shape=[-1, Config.embedding_size * self.single_size] 190 | ) 191 | second_numerical_result = tf.reshape(tf.nn.embedding_lookup(self.vr['numerical_second_embedding'], 192 | self.ph['numerical_index']), 193 | shape=[-1, Config.embedding_size * self.numerical_size] 194 | ) 195 | if Config.multi_features: 196 | second_multi_result = [] 197 | for s in Config.multi_features: 198 | temp_multi_result = tf.nn.embedding_lookup_sparse(self.vr['multi_second_embedding_%s' % s], 199 | tf.SparseTensor(indices=self.ph['multi_index_%s' % s], 200 | values=self.ph['multi_value_%s' % s], 201 | dense_shape=(Config.batch_size, 202 | Config.embedding_size)), 203 | None, 204 | combiner="sum" 205 | ) 206 | second_multi_result.append(temp_multi_result) 207 | second_multi_result = tf.concat(second_multi_result, axis=1) 208 | # DNN input 209 | self.DNN_input = tf.concat([second_single_result,second_multi_result], axis=1) 210 | else: 211 | self.DNN_input = tf.concat([second_single_result], axis=1) 212 | self.middle_fm_input = tf.concat([self.DNN_input,second_numerical_result], axis=1) 213 | if Config.use_numerical_embedding: 214 | self.DNN_input = tf.concat([self.DNN_input,second_numerical_result], axis=1) 215 | else: 216 | self.DNN_input = tf.concat([self.DNN_input,self.ph['numerical_value']],axis=1) 217 | self.shape = tf.shape(self.DNN_input) 218 | # second output 219 | second_FM_input = tf.reshape(self.middle_fm_input, shape=[-1, self.single_size + self.numerical_size + self.multi_size, 220 | Config.embedding_size]) 221 | 222 | summed_features_emb = tf.reduce_sum(second_FM_input, 1) 223 | summed_features_emb_square = tf.square(summed_features_emb) 224 | squared_features_emb = tf.square(second_FM_input) 225 | squared_sum_features_emb = tf.reduce_sum(squared_features_emb, 1) 226 | y_second_order = 0.5 * tf.subtract(summed_features_emb_square, squared_sum_features_emb) 227 | 228 | dnn_output = self.DNN_input 229 | # DNN output 230 | for i in range(len(Config.dnn_net_size)): 231 | self.DNN_input = tf.add(tf.matmul(self.DNN_input, self.vr['W_%d' % i]), self.vr['b_%d' % i]) 232 | self.DNN_input = tf.layers.batch_normalization(self.DNN_input,training=self.train_phase) 233 | dnn_output = tf.nn.relu(self.DNN_input) 234 | 235 | # CIN 236 | D = Config.embedding_size 237 | final_result = [] 238 | final_len = 0 239 | field_nums = [self.field_size] 240 | if Config.multi_features: 241 | nn_input = tf.reshape(tf.concat([second_single_result, second_multi_result], axis=1), 242 | shape=[-1, self.field_size, Config.embedding_size]) 243 | else: 244 | nn_input = tf.reshape(second_single_result, 245 | shape=[-1, self.field_size, Config.embedding_size]) 246 | cin_layers = [nn_input] 247 | split_tensor_0 = tf.split(nn_input, D * [1], 2) 248 | for idx, layer_size in enumerate(Config.cross_layer_size): 249 | now_tensor = tf.split(cin_layers[-1], D * [1], 2) 250 | # Hk x m 251 | dot_result_m = tf.matmul(split_tensor_0, now_tensor, transpose_b=True) 252 | dot_result_o = tf.reshape(dot_result_m, shape=[D, -1, field_nums[0] * field_nums[-1]]) 253 | dot_result = tf.transpose(dot_result_o, perm=[1, 0, 2]) 254 | filters = tf.get_variable(name="f_" + str(idx), shape=[1, field_nums[-1] * field_nums[0], layer_size], 255 | dtype=tf.float32) 256 | curr_out = tf.nn.conv1d(dot_result, filters=filters, stride=1, 
padding='VALID') 257 | b = tf.get_variable(name="f_b" + str(idx), shape=[layer_size], dtype=tf.float32, 258 | initializer=tf.zeros_initializer()) 259 | curr_out = tf.nn.relu(tf.nn.bias_add(curr_out, b)) 260 | curr_out = tf.transpose(curr_out, perm=[0, 2, 1]) 261 | if Config.cross_direct: 262 | direct_connect = curr_out 263 | next_hidden = curr_out 264 | final_len += layer_size 265 | field_nums.append(int(layer_size)) 266 | else: 267 | if idx != len(Config.cross_layer_size) - 1: 268 | next_hidden, direct_connect = tf.split(curr_out, 2 * [int(layer_size / 2)], 1) 269 | final_len += int(layer_size / 2) 270 | else: 271 | direct_connect = curr_out 272 | next_hidden = 0 273 | final_len += layer_size 274 | 275 | field_nums.append(int(layer_size / 2)) 276 | final_result.append(direct_connect) 277 | cin_layers.append(next_hidden) 278 | result = tf.concat(final_result, axis=1) 279 | result = tf.reduce_sum(result, -1) 280 | w_nn_output1 = tf.get_variable(name='w_nn_output1', shape=[final_len, Config.cross_output_size], 281 | dtype=tf.float32) 282 | b_nn_output1 = tf.get_variable(name='b_nn_output1', shape=[Config.cross_output_size], dtype=tf.float32, 283 | initializer=tf.zeros_initializer()) 284 | CIN_out = tf.nn.xw_plus_b(result, w_nn_output1, b_nn_output1) 285 | 286 | # final output 287 | output_length = 0 288 | to_concat = [] 289 | if Config.FM_layer: 290 | to_concat.append(y_first_order) 291 | to_concat.append(y_second_order) 292 | output_length += self.field_size + Config.embedding_size 293 | if Config.CIN_layer: 294 | to_concat.append(CIN_out) 295 | output_length += Config.cross_output_size 296 | if Config.DNN_layer: 297 | to_concat.append(dnn_output) 298 | output_length += Config.dnn_net_size[-1] 299 | 300 | output = tf.concat(to_concat, axis=1) 301 | 302 | self.vr['final_w'] = tf.get_variable(name='final_w', shape=[output_length, 2], 303 | initializer=tf.glorot_uniform_initializer()) 304 | self.vr['final_b'] = tf.get_variable(name='final_b', shape=[2], 305 | initializer=tf.zeros_initializer()) 306 | final_logits = tf.add(tf.matmul(output, self.vr['final_w']), self.vr['final_b']) 307 | self.softmax_output = tf.nn.softmax(final_logits) 308 | self.loss = tf.reduce_mean( 309 | tf.nn.softmax_cross_entropy_with_logits(labels=self.ph['label'], logits=final_logits)) 310 | self.optimizer = tf.train.AdagradOptimizer(learning_rate=Config.learning_rate).minimize(self.loss) 311 | 312 | def _train(self): 313 | print('....') 314 | with tf.Session() as self.sess: 315 | self.sess.run(tf.global_variables_initializer()) 316 | allDataLength = len(self.train) 317 | global_step = 0 318 | print('total step:%d'%(Config.epochs * (int(allDataLength / Config.batch_size) + 1))) 319 | for i in range(Config.epochs): 320 | num_batchs = int(allDataLength / Config.batch_size) + 1 321 | for j in range(num_batchs): 322 | global_step += 1 323 | now_batch = self._get_batch(self.train,j) 324 | start = time.time() 325 | batch_dict = { 326 | self.ph['single_index']: now_batch[1], 327 | self.ph['numerical_index']: now_batch[2], 328 | self.ph['value']: now_batch[-1], 329 | self.ph['label']: get_label(now_batch[0], 2), 330 | self.ph['numerical_value']:now_batch[3], 331 | self.train_phase:True 332 | } 333 | if Config.multi_features: 334 | for idx,s in enumerate(Config.multi_features): 335 | batch_dict[self.ph['multi_index_%s'%s]]= now_batch[4] 336 | batch_dict[self.ph['multi_value_%s'%s]] = now_batch[5] 337 | end = time.time() 338 | start = time.time() 339 | _out, _loss, _ = self.sess.run((self.softmax_output, self.loss, 
self.optimizer),
340 |                                                    feed_dict=batch_dict)
341 |                 end = time.time()
342 | 
343 |                 if global_step % 10 == 0:
344 |                     __out, __loss = self.sess.run((self.softmax_output, self.loss),  # evaluate only: no optimizer step on validation data
345 |                                                   feed_dict=self.valid_dict)
346 |                     self.global_step.append(global_step)
347 |                     self.global_train_auc.append(_loss)
348 |                     self.global_valid_auc.append(__loss)
349 |                     print('step:',global_step,'train loss:',_loss,'valid loss:',__loss,'valid_auc:',auc_score(__out,get_label(self.valid_batch[0],2),2))
350 | 
351 | 
352 |     def _save_loss(self):
353 |         loss_result = pd.DataFrame({
354 |             'step':self.global_step,
355 |             'train_auc':self.global_train_auc,
356 |             'valid_auc':self.global_valid_auc
357 |         })
358 |         loss_result.to_csv('DeepFM_loss_result.csv',index=False)
359 | 
360 | if __name__ == '__main__':
361 |     xdeep = xDeepFM()
362 | 
--------------------------------------------------------------------------------
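
A closing note on running the code and on the FM term in `_init_Model`: the intended order, judging from the `__main__` blocks, is `python data-pre.py` to produce the converted .txt files and data_conf.txt, then `python xDeepFM.py` to train. The `y_second_order` tensor uses the standard FM reformulation 0.5 * ((Σ_i v_i)² − Σ_i v_i²), applied element-wise over the field embeddings, which equals the sum of pairwise Hadamard products of all fields. A small self-contained NumPy check of that identity (illustrative only, not part of the repository):

```python
import numpy as np

rng = np.random.default_rng(0)
emb = rng.normal(size=(11, 16))   # 11 fields (2 single + 9 numeric here) x embedding_size 16

# Direct O(n^2) computation: sum of element-wise products over all field pairs i < j
pairwise = np.zeros(16)
for i in range(11):
    for j in range(i + 1, 11):
        pairwise += emb[i] * emb[j]

# The O(n) trick used for y_second_order in _init_Model
fm_trick = 0.5 * (emb.sum(axis=0) ** 2 - (emb ** 2).sum(axis=0))

print(np.allclose(pairwise, fm_trick))   # True
```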