├── README.md
├── run_deepfm.py
└── deepctrmodels
    └── deepfm.py


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

# DeepFM_torch

This project implements the **DeepFM** algorithm in **PyTorch**, using the Criteo dataset as the benchmark; the AUC currently reaches 79.15%.

Implementation details are described at: https://blog.csdn.net/springtostring/article/details/108157070

Reference implementation (DeepCTR): https://github.com/shenweichen/DeepCTR-Torch
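
A minimal usage sketch, condensed from `run_deepfm.py` below (it assumes the tab-separated Criteo file `dac/train.txt`; the hyperparameters shown are illustrative):

```python
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from deepctrmodels.deepfm import Deepfm

sparse_features = ['C%d' % i for i in range(1, 27)]  # categorical fields
dense_features = ['I%d' % i for i in range(1, 14)]   # numeric fields
data = pd.read_csv('dac/train.txt', sep='\t',
                   names=['label'] + dense_features + sparse_features)
data[sparse_features] = data[sparse_features].fillna('-1')
data[dense_features] = MinMaxScaler().fit_transform(data[dense_features].fillna(0))
for feat in sparse_features:
    data[feat] = LabelEncoder().fit_transform(data[feat])

# field name -> vocabulary size (1 for numeric fields)
feat_sizes = {feat: 1 for feat in dense_features}
feat_sizes.update({feat: data[feat].nunique() for feat in sparse_features})

model = Deepfm(feat_sizes, sparse_feature_columns=sparse_features,
               dense_feature_columns=dense_features, embedding_size=8, device='cpu')
model_input = {name: data[name] for name in sparse_features + dense_features}
model.fit(model_input, data[['label']].values,  # training split
          model_input, data[['label']].values,  # validation split (use a real split)
          batch_size=50000, epochs=15)
preds = model.predict(model_input, batch_size=50000)
```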
--------------------------------------------------------------------------------
/run_deepfm.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import random

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from deepctrmodels.deepfm import Deepfm


if __name__ == "__main__":

    seed = 1024
    torch.manual_seed(seed)           # seed the CPU RNG
    torch.cuda.manual_seed(seed)      # seed the current GPU
    torch.cuda.manual_seed_all(seed)  # seed all GPUs
    np.random.seed(seed)
    random.seed(seed)

    sparse_features = ['C' + str(i) for i in range(1, 27)]  # C = categorical features
    dense_features = ['I' + str(i) for i in range(1, 14)]   # I = integer (numeric) features
    col_names = ['label'] + dense_features + sparse_features
    data = pd.read_csv('dac/train.txt', names=col_names, sep='\t')
    # data = pd.read_csv('criteo_train_1m.txt', names=col_names, sep='\t')
    # data = pd.read_csv('total.txt')
    feature_names = sparse_features + dense_features  # all feature names
    data[sparse_features] = data[sparse_features].fillna('-1')  # missing categorical values -> '-1'
    data[dense_features] = data[dense_features].fillna(0)       # missing numeric values -> 0
    target = ['label']

    # 1. Label Encoding for sparse features, and a simple transformation for dense features.
    # LabelEncoder assigns an integer id to every distinct value of a categorical feature.
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])

    # min-max normalize the numeric features to [0, 1]
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2. Count #unique values for each sparse field and record the dense field names.
    feat_sizes1 = {feat: 1 for feat in dense_features}
    feat_sizes2 = {feat: len(data[feat].unique()) for feat in sparse_features}
    feat_sizes = {}
    feat_sizes.update(feat_sizes1)
    feat_sizes.update(feat_sizes2)
    # print(feat_sizes)
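    # Illustrative sanity check (editor's addition, not in the original script):
    # every dense field maps to size 1 and every sparse field to its vocabulary
    # size, e.g. feat_sizes['I1'] == 1, feat_sizes['C1'] == data['C1'].nunique().
    assert all(feat_sizes[f] == 1 for f in dense_features)
    assert all(feat_sizes[f] == data[f].nunique() for f in sparse_features)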
    # 3. Generate the input data for the model.
    train, test = train_test_split(data, test_size=0.2, random_state=2020)
    # print(train.head(5))
    # print(test.head(5))
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    # 4. Define the model, then train, predict and evaluate.
    device = 'cpu'
    use_cuda = True
    if use_cuda and torch.cuda.is_available():
        print('cuda ready...')
        device = 'cuda:0'

    model = Deepfm(feat_sizes, sparse_feature_columns=sparse_features, dense_feature_columns=dense_features,
                   dnn_hidden_units=[400, 400, 400], dnn_dropout=0.9, embedding_size=8,
                   l2_reg_linear=1e-3, device=device)

    model.fit(train_model_input, train[target].values, test_model_input, test[target].values,
              batch_size=50000, epochs=150, verbose=1)

    pred_ans = model.predict(test_model_input, 50000)

    print("final test")
    print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
    print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))


--------------------------------------------------------------------------------
/deepctrmodels/deepfm.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from collections import OrderedDict

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as Data
from torch.utils.data import DataLoader
from sklearn.metrics import log_loss, roc_auc_score


# Dropout and an L2 regularization term have been added; results still differ from
# DeepCTR, and the L2 term needs further improvement.
# TODO: __init__() is too verbose; refactor it into a few helper functions.

class Deepfm(nn.Module):

    def __init__(self, feat_sizes, sparse_feature_columns, dense_feature_columns,
                 dnn_hidden_units=[400, 400, 400], dnn_dropout=0.0, embedding_size=4,
                 l2_reg_linear=0.00001, l2_reg_embedding=0.00001, l2_reg_dnn=0,
                 init_std=0.0001, seed=1024, device='cpu'):
        super(Deepfm, self).__init__()
        self.feat_sizes = feat_sizes
        self.device = device
        self.sparse_feature_columns = sparse_feature_columns
        self.dense_feature_columns = dense_feature_columns
        self.embedding_size = embedding_size
        self.l2_reg_linear = l2_reg_linear

        # self.feature_index maps each feature name to its column range in the input X
        self.feature_index = self.build_input_features(self.feat_sizes)

        self.bias = nn.Parameter(torch.zeros((1,)))
        # Linear weights for the dense features. Keep this a registered Parameter
        # (no .to(device) here, which would replace it with a plain tensor and hide
        # it from the optimizer); self.to(device) below moves it.
        self.weight = nn.Parameter(torch.Tensor(len(self.dense_feature_columns), 1))
        torch.nn.init.normal_(self.weight, mean=0, std=init_std)

        # 1-dimensional embeddings for the FM linear part ...
        self.embedding_dict1 = self.create_embedding_matrix(self.sparse_feature_columns, feat_sizes, 1,
                                                            sparse=False, device=self.device)
        # ... and embedding_size-dimensional embeddings shared by the FM second-order part and the DNN
        self.embedding_dict2 = self.create_embedding_matrix(self.sparse_feature_columns, feat_sizes,
                                                            self.embedding_size,
                                                            sparse=False, device=self.device)
        # DNN
        self.dropout = nn.Dropout(dnn_dropout)
        self.dnn_input_size = self.embedding_size * len(self.sparse_feature_columns) + len(self.dense_feature_columns)
        hidden_units = [self.dnn_input_size] + dnn_hidden_units
        self.linears = nn.ModuleList(
            [nn.Linear(hidden_units[i], hidden_units[i + 1]) for i in range(len(hidden_units) - 1)])
        self.relus = nn.ModuleList(
            [nn.ReLU() for i in range(len(hidden_units) - 1)])
        for name, tensor in self.linears.named_parameters():
            if 'weight' in name:
                nn.init.normal_(tensor, mean=0, std=init_std)
        self.dnn_linear = nn.Linear(dnn_hidden_units[-1], 1, bias=False)

        self.to(device)

    def forward(self, X):
        '''
        :param X: FloatTensor of shape (batch_size, num_features), columns laid out
                  according to self.feature_index
        :return: y_pred, shape (batch_size, 1)
        '''

        '''
        FM linear part
        '''
        sparse_embedding_list1 = [self.embedding_dict1[feat](
            X[:, self.feature_index[feat][0]:self.feature_index[feat][1]].long())
            for feat in self.sparse_feature_columns]

        dense_value_list2 = [X[:, self.feature_index[feat][0]:self.feature_index[feat][1]]
                             for feat in self.dense_feature_columns]
        linear_sparse_logit = torch.sum(
            torch.cat(sparse_embedding_list1, dim=-1), dim=-1, keepdim=False)
        linear_dense_logit = torch.cat(
            dense_value_list2, dim=-1).matmul(self.weight)
        logit = linear_sparse_logit + linear_dense_logit

        sparse_embedding_list = [self.embedding_dict2[feat](
            X[:, self.feature_index[feat][0]:self.feature_index[feat][1]].long())
            for feat in self.sparse_feature_columns]
        '''
        FM second-order part
        '''
        fm_input = torch.cat(sparse_embedding_list, dim=1)  # shape: (batch_size, field_size, embedding_size)
        square_of_sum = torch.pow(torch.sum(fm_input, dim=1, keepdim=True), 2)  # (batch_size, 1, embedding_size)
        sum_of_square = torch.sum(torch.pow(fm_input, 2), dim=1, keepdim=True)  # (batch_size, 1, embedding_size)
        cross_term = square_of_sum - sum_of_square
        cross_term = 0.5 * torch.sum(cross_term, dim=2, keepdim=False)  # (batch_size, 1)
        logit += cross_term

        '''
        DNN part
        '''
        # inputs: sparse_embedding_list and dense_value_list2
        dnn_sparse_input = torch.cat(sparse_embedding_list, dim=1)
        batch_size = dnn_sparse_input.shape[0]
        # flatten to shape [batch_size, len(sparse_features) * embedding_size]
        dnn_sparse_input = dnn_sparse_input.reshape(batch_size, -1)
        # dnn_dense_input shape: [batch_size, len(dense_features)]
        dnn_dense_input = torch.cat(dense_value_list2, dim=-1)
        dnn_total_input = torch.cat([dnn_sparse_input, dnn_dense_input], dim=-1)
        deep_input = dnn_total_input

        for i in range(len(self.linears)):
            fc = self.linears[i](deep_input)
            fc = self.relus[i](fc)
            fc = self.dropout(fc)
            deep_input = fc
        dnn_output = self.dnn_linear(deep_input)
        logit += dnn_output

        '''
        output
        '''
        y_pred = torch.sigmoid(logit + self.bias)
        return y_pred
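    @staticmethod
    def fm_cross_term_reference(fm_input):
        # Editor's illustrative sketch (not part of the original model, and not
        # called by it): the FM second-order term computed naively as the sum of
        # pairwise dot products <v_i, v_j> over all field pairs. It should match
        # the square-of-sum / sum-of-square shortcut used in forward(), i.e.
        # 0.5 * ((sum_i v_i)^2 - sum_i v_i^2) summed over the embedding dimension.
        # fm_input shape: (batch_size, field_size, embedding_size)
        batch_size, field_size, _ = fm_input.shape
        cross = torch.zeros(batch_size, 1, device=fm_input.device)
        for i in range(field_size):
            for j in range(i + 1, field_size):
                cross = cross + torch.sum(fm_input[:, i, :] * fm_input[:, j, :],
                                          dim=1, keepdim=True)
        return cross  # shape: (batch_size, 1)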
    def fit(self, train_input, y_label, val_input, y_val, batch_size=5000, epochs=15, verbose=5):
        x = [train_input[feature] for feature in self.feature_index]
        for i in range(len(x)):
            if len(x[i].shape) == 1:
                x[i] = np.expand_dims(x[i], axis=1)  # promote to 2-D so the columns can be concatenated

        train_tensor_data = Data.TensorDataset(torch.from_numpy(np.concatenate(x, axis=-1)),
                                               torch.from_numpy(y_label))
        train_loader = DataLoader(dataset=train_tensor_data, shuffle=True, batch_size=batch_size)

        print(self.device)
        model = self.train()  # train() returns self, now in training mode
        loss_func = F.binary_cross_entropy
        # loss_func = F.binary_cross_entropy_with_logits
        optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.0)
        # optimizer = optim.Adagrad(model.parameters(), lr=0.01)

        # report how many steps one epoch takes
        sample_num = len(train_tensor_data)
        steps_per_epoch = (sample_num - 1) // batch_size + 1
        print("Train on {0} samples, {1} steps per epoch".format(
            len(train_tensor_data), steps_per_epoch))

        for epoch in range(epochs):
            total_loss_epoch = 0.0
            pred_ans = []
            true_ans = []
            for index, (x_train, y_train) in enumerate(train_loader):
                x = x_train.to(self.device).float()
                y = y_train.to(self.device).float()

                y_pred = model(x).squeeze()

                optimizer.zero_grad()
                loss = loss_func(y_pred, y.squeeze(), reduction='mean')
                # L2 regularization
                loss = loss + self.l2_reg_linear * self.get_L2_Norm()
                loss.backward()
                optimizer.step()

                total_loss_epoch += loss.item()
                pred_ans.append(y_pred.cpu().data.numpy())
                true_ans.append(y.squeeze().cpu().data.numpy())

            if epoch % verbose == 0:
                print('epoch %d train loss is %.4f train AUC is %.4f' %
                      (epoch, total_loss_epoch / steps_per_epoch,
                       roc_auc_score(np.concatenate(true_ans), np.concatenate(pred_ans))))
                self.val_auc_logloss(val_input, y_val, batch_size=50000)
                print(" ")

    def predict(self, test_input, batch_size=256, use_double=False):
        """
        :param test_input: dict mapping feature name -> Numpy array (or pd.Series)
        :param batch_size: Integer. If unspecified, it will default to 256.
        :return: Numpy array of predictions.
        """
        model = self.eval()  # eval() returns self, now in inference mode
        x = [test_input[feature] for feature in self.feature_index]
        for i in range(len(x)):
            if len(x[i].shape) == 1:
                x[i] = np.expand_dims(x[i], axis=1)  # promote to 2-D so the columns can be concatenated

        tensor_data = Data.TensorDataset(
            torch.from_numpy(np.concatenate(x, axis=-1)))
        test_loader = DataLoader(
            dataset=tensor_data, shuffle=False, batch_size=batch_size)

        pred_ans = []
        with torch.no_grad():
            for index, x_test in enumerate(test_loader):
                x = x_test[0].to(self.device).float()
                y_pred = model(x).cpu().data.numpy()
                pred_ans.append(y_pred)

        if use_double:
            return np.concatenate(pred_ans).astype("float64")
        else:
            return np.concatenate(pred_ans)

    def val_auc_logloss(self, val_input, y_val, batch_size=50000, use_double=False):
        pred_ans = self.predict(val_input, batch_size)
        print("val LogLoss is %.4f val AUC is %.4f" % (log_loss(y_val, pred_ans), roc_auc_score(y_val, pred_ans)))

    def get_L2_Norm(self):
        # L2 norm (not squared) of the dense linear weights and both embedding tables
        loss = torch.zeros((1,), device=self.device)
        loss = loss + torch.norm(self.weight)
        for t in self.embedding_dict1.parameters():
            loss = loss + torch.norm(t)
        for t in self.embedding_dict2.parameters():
            loss = loss + torch.norm(t)
        return loss

    def build_input_features(self, feat_sizes):
        # Return OrderedDict: {feature_name: (start, start + dimension)}
        features = OrderedDict()
        start = 0
        for feat in feat_sizes:
            feat_name = feat
            if feat_name in features:
                continue
            features[feat_name] = (start, start + 1)
            start += 1
        return features

    def create_embedding_matrix(self, sparse_feature_columns, feat_sizes, embedding_size,
                                init_std=0.0001, sparse=False, device='cpu'):
        embedding_dict = nn.ModuleDict(
            {feat: nn.Embedding(feat_sizes[feat], embedding_size, sparse=sparse)
             for feat in sparse_feature_columns}
        )
        for tensor in embedding_dict.values():
            nn.init.normal_(tensor.weight, mean=0, std=init_std)
        return embedding_dict.to(device)
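
if __name__ == "__main__":
    # Editor's smoke test on synthetic toy data (hypothetical feature names, not
    # part of the original repo): build a tiny model and check shapes end to end.
    feat_sizes = {'I1': 1, 'I2': 1, 'C1': 10, 'C2': 20}
    model = Deepfm(feat_sizes, sparse_feature_columns=['C1', 'C2'],
                   dense_feature_columns=['I1', 'I2'],
                   dnn_hidden_units=[16, 16], embedding_size=4, device='cpu')
    X = torch.cat([torch.rand(8, 2),                       # I1, I2 in [0, 1)
                   torch.randint(0, 10, (8, 1)).float(),   # C1 ids
                   torch.randint(0, 20, (8, 1)).float()],  # C2 ids
                  dim=1)
    print(model(X).shape)  # expected: torch.Size([8, 1])

    # The naive pairwise reference should agree with the square-of-sum shortcut.
    emb = torch.randn(8, 26, 4)
    fast = 0.5 * torch.sum(torch.sum(emb, dim=1, keepdim=True) ** 2
                           - torch.sum(emb ** 2, dim=1, keepdim=True), dim=2)
    assert torch.allclose(fast, Deepfm.fm_cross_term_reference(emb), atol=1e-4)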
--------------------------------------------------------------------------------