├── README.md
├── run_deepfm.py
└── deepctrmodels
    └── deepfm.py


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

# DeepFM_torch

This project implements the **DeepFM** algorithm in **PyTorch**, using the Criteo dataset as the benchmark; the AUC currently reaches 79.15%.

Implementation details are described at: https://blog.csdn.net/springtostring/article/details/108157070

Reference implementation (DeepCTR): https://github.com/shenweichen/DeepCTR-Torch
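
A minimal usage sketch, condensed from `run_deepfm.py` below (it assumes the tab-separated Criteo file `dac/train.txt`; the hyperparameters shown are illustrative):

```python
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from deepctrmodels.deepfm import Deepfm

sparse_features = ['C%d' % i for i in range(1, 27)]  # categorical fields
dense_features = ['I%d' % i for i in range(1, 14)]   # numeric fields
data = pd.read_csv('dac/train.txt', sep='\t',
                   names=['label'] + dense_features + sparse_features)
data[sparse_features] = data[sparse_features].fillna('-1')
data[dense_features] = MinMaxScaler().fit_transform(data[dense_features].fillna(0))
for feat in sparse_features:
    data[feat] = LabelEncoder().fit_transform(data[feat])

# field name -> vocabulary size (1 for numeric fields)
feat_sizes = {feat: 1 for feat in dense_features}
feat_sizes.update({feat: data[feat].nunique() for feat in sparse_features})

model = Deepfm(feat_sizes, sparse_feature_columns=sparse_features,
               dense_feature_columns=dense_features, embedding_size=8, device='cpu')
model_input = {name: data[name] for name in sparse_features + dense_features}
model.fit(model_input, data[['label']].values,  # training split
          model_input, data[['label']].values,  # validation split (use a real split)
          batch_size=50000, epochs=15)
preds = model.predict(model_input, batch_size=50000)
```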
--------------------------------------------------------------------------------
/run_deepfm.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import random

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from deepctrmodels.deepfm import Deepfm


if __name__ == "__main__":

    seed = 1024
    torch.manual_seed(seed)           # seed the CPU RNG
    torch.cuda.manual_seed(seed)      # seed the current GPU
    torch.cuda.manual_seed_all(seed)  # seed all GPUs
    np.random.seed(seed)
    random.seed(seed)

    sparse_features = ['C' + str(i) for i in range(1, 27)]  # C = categorical features
    dense_features = ['I' + str(i) for i in range(1, 14)]   # I = integer (numeric) features
    col_names = ['label'] + dense_features + sparse_features
    data = pd.read_csv('dac/train.txt', names=col_names, sep='\t')
    # data = pd.read_csv('criteo_train_1m.txt', names=col_names, sep='\t')
    # data = pd.read_csv('total.txt')
    feature_names = sparse_features + dense_features  # all feature names
    data[sparse_features] = data[sparse_features].fillna('-1')  # missing categorical values -> '-1'
    data[dense_features] = data[dense_features].fillna(0)       # missing numeric values -> 0
    target = ['label']

    # 1. Label Encoding for sparse features, and a simple transformation for dense features.
    # LabelEncoder assigns an integer id to every distinct value of a categorical feature.
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])

    # min-max normalize the numeric features to [0, 1]
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2. Count #unique values for each sparse field and record the dense field names.
    feat_sizes1 = {feat: 1 for feat in dense_features}
    feat_sizes2 = {feat: len(data[feat].unique()) for feat in sparse_features}
    feat_sizes = {}
    feat_sizes.update(feat_sizes1)
    feat_sizes.update(feat_sizes2)
    # print(feat_sizes)
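    # Illustrative sanity check (editor's addition, not in the original script):
    # every dense field maps to size 1 and every sparse field to its vocabulary
    # size, e.g. feat_sizes['I1'] == 1, feat_sizes['C1'] == data['C1'].nunique().
    assert all(feat_sizes[f] == 1 for f in dense_features)
    assert all(feat_sizes[f] == data[f].nunique() for f in sparse_features)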
    # 3. Generate the input data for the model.
    train, test = train_test_split(data, test_size=0.2, random_state=2020)
    # print(train.head(5))
    # print(test.head(5))
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    # 4. Define the model, then train, predict and evaluate.
    device = 'cpu'
    use_cuda = True
    if use_cuda and torch.cuda.is_available():
        print('cuda ready...')
        device = 'cuda:0'

    model = Deepfm(feat_sizes, sparse_feature_columns=sparse_features, dense_feature_columns=dense_features,
                   dnn_hidden_units=[400, 400, 400], dnn_dropout=0.9, embedding_size=8,
                   l2_reg_linear=1e-3, device=device)

    model.fit(train_model_input, train[target].values, test_model_input, test[target].values,
              batch_size=50000, epochs=150, verbose=1)

    pred_ans = model.predict(test_model_input, 50000)

    print("final test")
    print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
    print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))


--------------------------------------------------------------------------------
/deepctrmodels/deepfm.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from collections import OrderedDict

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as Data
from torch.utils.data import DataLoader
from sklearn.metrics import log_loss, roc_auc_score


# Dropout and an L2 regularization term have been added; results still differ from
# DeepCTR, and the L2 term needs further improvement.
# TODO: __init__() is too verbose; refactor it into a few helper functions.

class Deepfm(nn.Module):

    def __init__(self, feat_sizes, sparse_feature_columns, dense_feature_columns,
                 dnn_hidden_units=[400, 400, 400], dnn_dropout=0.0, embedding_size=4,
                 l2_reg_linear=0.00001, l2_reg_embedding=0.00001, l2_reg_dnn=0,
                 init_std=0.0001, seed=1024, device='cpu'):
        super(Deepfm, self).__init__()
        self.feat_sizes = feat_sizes
        self.device = device
        self.sparse_feature_columns = sparse_feature_columns
        self.dense_feature_columns = dense_feature_columns
        self.embedding_size = embedding_size
        self.l2_reg_linear = l2_reg_linear

        # self.feature_index maps each feature name to its column range in the input X
        self.feature_index = self.build_input_features(self.feat_sizes)

        self.bias = nn.Parameter(torch.zeros((1,)))
        # Linear weights for the dense features. Keep this a registered Parameter
        # (no .to(device) here, which would replace it with a plain tensor and hide
        # it from the optimizer); self.to(device) below moves it.
        self.weight = nn.Parameter(torch.Tensor(len(self.dense_feature_columns), 1))
        torch.nn.init.normal_(self.weight, mean=0, std=init_std)

        # 1-dimensional embeddings for the FM linear part ...
        self.embedding_dict1 = self.create_embedding_matrix(self.sparse_feature_columns, feat_sizes, 1,
                                                            sparse=False, device=self.device)
        # ... and embedding_size-dimensional embeddings shared by the FM second-order part and the DNN
        self.embedding_dict2 = self.create_embedding_matrix(self.sparse_feature_columns, feat_sizes,
                                                            self.embedding_size,
                                                            sparse=False, device=self.device)
        # DNN
        self.dropout = nn.Dropout(dnn_dropout)
        self.dnn_input_size = self.embedding_size * len(self.sparse_feature_columns) + len(self.dense_feature_columns)
        hidden_units = [self.dnn_input_size] + dnn_hidden_units
        self.linears = nn.ModuleList(
            [nn.Linear(hidden_units[i], hidden_units[i + 1]) for i in range(len(hidden_units) - 1)])
        self.relus = nn.ModuleList(
            [nn.ReLU() for i in range(len(hidden_units) - 1)])
        for name, tensor in self.linears.named_parameters():
            if 'weight' in name:
                nn.init.normal_(tensor, mean=0, std=init_std)
        self.dnn_linear = nn.Linear(dnn_hidden_units[-1], 1, bias=False)

        self.to(device)

    def forward(self, X):
        '''
        :param X: FloatTensor of shape (batch_size, num_features), columns laid out
                  according to self.feature_index
        :return: y_pred, shape (batch_size, 1)
        '''

        '''
        FM linear part
        '''
        sparse_embedding_list1 = [self.embedding_dict1[feat](
            X[:, self.feature_index[feat][0]:self.feature_index[feat][1]].long())
            for feat in self.sparse_feature_columns]

        dense_value_list2 = [X[:, self.feature_index[feat][0]:self.feature_index[feat][1]]
                             for feat in self.dense_feature_columns]
        linear_sparse_logit = torch.sum(
            torch.cat(sparse_embedding_list1, dim=-1), dim=-1, keepdim=False)
        linear_dense_logit = torch.cat(
            dense_value_list2, dim=-1).matmul(self.weight)
        logit = linear_sparse_logit + linear_dense_logit

        sparse_embedding_list = [self.embedding_dict2[feat](
            X[:, self.feature_index[feat][0]:self.feature_index[feat][1]].long())
            for feat in self.sparse_feature_columns]
        '''
        FM second-order part
        '''
        fm_input = torch.cat(sparse_embedding_list, dim=1)  # shape: (batch_size, field_size, embedding_size)
        square_of_sum = torch.pow(torch.sum(fm_input, dim=1, keepdim=True), 2)  # (batch_size, 1, embedding_size)
        sum_of_square = torch.sum(torch.pow(fm_input, 2), dim=1, keepdim=True)  # (batch_size, 1, embedding_size)
        cross_term = square_of_sum - sum_of_square
        cross_term = 0.5 * torch.sum(cross_term, dim=2, keepdim=False)  # (batch_size, 1)
        logit += cross_term

        '''
        DNN part
        '''
        # inputs: sparse_embedding_list and dense_value_list2
        dnn_sparse_input = torch.cat(sparse_embedding_list, dim=1)
        batch_size = dnn_sparse_input.shape[0]
        # flatten to shape [batch_size, len(sparse_features) * embedding_size]
        dnn_sparse_input = dnn_sparse_input.reshape(batch_size, -1)
        # dnn_dense_input shape: [batch_size, len(dense_features)]
        dnn_dense_input = torch.cat(dense_value_list2, dim=-1)
        dnn_total_input = torch.cat([dnn_sparse_input, dnn_dense_input], dim=-1)
        deep_input = dnn_total_input

        for i in range(len(self.linears)):
            fc = self.linears[i](deep_input)
            fc = self.relus[i](fc)
            fc = self.dropout(fc)
            deep_input = fc
        dnn_output = self.dnn_linear(deep_input)
        logit += dnn_output

        '''
        output
        '''
        y_pred = torch.sigmoid(logit + self.bias)
        return y_pred
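    @staticmethod
    def fm_cross_term_reference(fm_input):
        # Editor's illustrative sketch (not part of the original model, and not
        # called by it): the FM second-order term computed naively as the sum of
        # pairwise dot products <v_i, v_j> over all field pairs. It should match
        # the square-of-sum / sum-of-square shortcut used in forward(), i.e.
        # 0.5 * ((sum_i v_i)^2 - sum_i v_i^2) summed over the embedding dimension.
        # fm_input shape: (batch_size, field_size, embedding_size)
        batch_size, field_size, _ = fm_input.shape
        cross = torch.zeros(batch_size, 1, device=fm_input.device)
        for i in range(field_size):
            for j in range(i + 1, field_size):
                cross = cross + torch.sum(fm_input[:, i, :] * fm_input[:, j, :],
                                          dim=1, keepdim=True)
        return cross  # shape: (batch_size, 1)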
    def fit(self, train_input, y_label, val_input, y_val, batch_size=5000, epochs=15, verbose=5):
        x = [train_input[feature] for feature in self.feature_index]
        for i in range(len(x)):
            if len(x[i].shape) == 1:
                x[i] = np.expand_dims(x[i], axis=1)  # promote to 2-D so the columns can be concatenated

        train_tensor_data = Data.TensorDataset(torch.from_numpy(np.concatenate(x, axis=-1)),
                                               torch.from_numpy(y_label))
        train_loader = DataLoader(dataset=train_tensor_data, shuffle=True, batch_size=batch_size)

        print(self.device)
        model = self.train()  # train() returns self, now in training mode
        loss_func = F.binary_cross_entropy
        # loss_func = F.binary_cross_entropy_with_logits
        optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.0)
        # optimizer = optim.Adagrad(model.parameters(), lr=0.01)

        # report how many steps one epoch takes
        sample_num = len(train_tensor_data)
        steps_per_epoch = (sample_num - 1) // batch_size + 1
        print("Train on {0} samples, {1} steps per epoch".format(
            len(train_tensor_data), steps_per_epoch))

        for epoch in range(epochs):
            total_loss_epoch = 0.0
            pred_ans = []
            true_ans = []
            for index, (x_train, y_train) in enumerate(train_loader):
                x = x_train.to(self.device).float()
                y = y_train.to(self.device).float()

                y_pred = model(x).squeeze()

                optimizer.zero_grad()
                loss = loss_func(y_pred, y.squeeze(), reduction='mean')
                # L2 regularization
                loss = loss + self.l2_reg_linear * self.get_L2_Norm()
                loss.backward()
                optimizer.step()

                total_loss_epoch += loss.item()
                pred_ans.append(y_pred.cpu().data.numpy())
                true_ans.append(y.squeeze().cpu().data.numpy())

            if epoch % verbose == 0:
                print('epoch %d train loss is %.4f train AUC is %.4f' %
                      (epoch, total_loss_epoch / steps_per_epoch,
                       roc_auc_score(np.concatenate(true_ans), np.concatenate(pred_ans))))
                self.val_auc_logloss(val_input, y_val, batch_size=50000)
                print(" ")

    def predict(self, test_input, batch_size=256, use_double=False):
        """
        :param test_input: dict mapping feature name -> Numpy array (or pd.Series)
        :param batch_size: Integer. If unspecified, it will default to 256.
        :return: Numpy array of predictions.
        """
        model = self.eval()  # eval() returns self, now in inference mode
        x = [test_input[feature] for feature in self.feature_index]
        for i in range(len(x)):
            if len(x[i].shape) == 1:
                x[i] = np.expand_dims(x[i], axis=1)  # promote to 2-D so the columns can be concatenated

        tensor_data = Data.TensorDataset(
            torch.from_numpy(np.concatenate(x, axis=-1)))
        test_loader = DataLoader(
            dataset=tensor_data, shuffle=False, batch_size=batch_size)

        pred_ans = []
        with torch.no_grad():
            for index, x_test in enumerate(test_loader):
                x = x_test[0].to(self.device).float()
                y_pred = model(x).cpu().data.numpy()
                pred_ans.append(y_pred)

        if use_double:
            return np.concatenate(pred_ans).astype("float64")
        else:
            return np.concatenate(pred_ans)

    def val_auc_logloss(self, val_input, y_val, batch_size=50000, use_double=False):
        pred_ans = self.predict(val_input, batch_size)
        print("val LogLoss is %.4f val AUC is %.4f" % (log_loss(y_val, pred_ans), roc_auc_score(y_val, pred_ans)))

    def get_L2_Norm(self):
        # L2 norm (not squared) of the dense linear weights and both embedding tables
        loss = torch.zeros((1,), device=self.device)
        loss = loss + torch.norm(self.weight)
        for t in self.embedding_dict1.parameters():
            loss = loss + torch.norm(t)
        for t in self.embedding_dict2.parameters():
            loss = loss + torch.norm(t)
        return loss

    def build_input_features(self, feat_sizes):
        # Return OrderedDict: {feature_name: (start, start + dimension)}
        features = OrderedDict()
        start = 0
        for feat in feat_sizes:
            feat_name = feat
            if feat_name in features:
                continue
            features[feat_name] = (start, start + 1)
            start += 1
        return features

    def create_embedding_matrix(self, sparse_feature_columns, feat_sizes, embedding_size,
                                init_std=0.0001, sparse=False, device='cpu'):
        embedding_dict = nn.ModuleDict(
            {feat: nn.Embedding(feat_sizes[feat], embedding_size, sparse=sparse)
             for feat in sparse_feature_columns}
        )
        for tensor in embedding_dict.values():
            nn.init.normal_(tensor.weight, mean=0, std=init_std)
        return embedding_dict.to(device)
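
if __name__ == "__main__":
    # Editor's smoke test on synthetic toy data (hypothetical feature names, not
    # part of the original repo): build a tiny model and check shapes end to end.
    feat_sizes = {'I1': 1, 'I2': 1, 'C1': 10, 'C2': 20}
    model = Deepfm(feat_sizes, sparse_feature_columns=['C1', 'C2'],
                   dense_feature_columns=['I1', 'I2'],
                   dnn_hidden_units=[16, 16], embedding_size=4, device='cpu')
    X = torch.cat([torch.rand(8, 2),                       # I1, I2 in [0, 1)
                   torch.randint(0, 10, (8, 1)).float(),   # C1 ids
                   torch.randint(0, 20, (8, 1)).float()],  # C2 ids
                  dim=1)
    print(model(X).shape)  # expected: torch.Size([8, 1])

    # The naive pairwise reference should agree with the square-of-sum shortcut.
    emb = torch.randn(8, 26, 4)
    fast = 0.5 * torch.sum(torch.sum(emb, dim=1, keepdim=True) ** 2
                           - torch.sum(emb ** 2, dim=1, keepdim=True), dim=2)
    assert torch.allclose(fast, Deepfm.fm_cross_term_reference(emb), atol=1e-4)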
--------------------------------------------------------------------------------