├── README.md ├── main.py ├── model ├── AFM.py ├── DCN.py ├── DIN.py ├── DeepFM.py ├── FNN.py ├── NFM.py ├── PNN.py └── __init__.py └── utils ├── __init__.py ├── __pycache__ ├── __init__.cpython-36.pyc └── data_preprocess.cpython-36.pyc ├── common.py ├── data_preprocess.py ├── sample.py └── split_train.py /README.md: -------------------------------------------------------------------------------- 1 | # dnn_ctr 2 | The framework to deal with ctr problem 3 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from utils import data_preprocess 4 | from model import DeepFM 5 | import torch 6 | import pickle 7 | 8 | train_dict,test_dict = data_preprocess.read_csv_dataset('./data/final_track2_train_new.csv',task='finish') 9 | #pred_dict=data_preprocess.read_csv_dataset_pred('./data/small_test.csv',task='like') 10 | #train_dict = data_preprocess.read_criteo_data('./data/tiny_train_input.csv', './data/category_emb.csv') 11 | #test_dict = data_preprocess.read_criteo_data('./data/tiny_test_input.csv', './data/category_emb.csv') 12 | 13 | deepfm = DeepFM.DeepFM(8,train_dict['feature_sizes'],verbose=True,use_cuda=True, weight_decay=0.0001,use_fm=True,use_ffm=False,use_deep=False) 14 | #pred=deepfm.predict_from_model_file(pred_dict['index'], pred_dict['value'],deepfm,'./saved_model') 15 | #pred=deepfm.predict(pred_dict['index'], pred_dict['value']) 16 | #pickle.dump(pred,open('like_pre','wb')) 17 | deepfm.fit(train_dict['index'], train_dict['value'], train_dict['lable'],test_dict['index'], test_dict['value'], test_dict['lable'],ealry_stopping=True,refit=False,save_path='./saved_model') 18 | -------------------------------------------------------------------------------- /model/AFM.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | Created on Dec 10, 2017 5 | @author: jachin,Nie 6 | 7 | A pytorch implementation of AFM 8 | 9 | Reference: 10 | [1] Attentional Factorization Machines:Learning theWeight of Feature Interactions via Attention Networks 11 | 12 | """ 13 | 14 | import os 15 | import numpy as np 16 | from sklearn.base import BaseEstimator, TransformerMixin 17 | from sklearn.metrics import roc_auc_score 18 | from time import time 19 | 20 | import torch 21 | import torch.autograd as autograd 22 | import torch.nn as nn 23 | import torch.nn.functional as F 24 | import torch.optim as optim 25 | from torch.autograd import Variable 26 | 27 | import torch.backends.cudnn 28 | 29 | 30 | """ 31 | 网络结构部分 32 | """ 33 | 34 | class AFM(torch.nn.Module): 35 | """ 36 | :parameter 37 | ------------- 38 | field_size: size of the feature fields 39 | feature_sizes: a field_size-dim array, sizes of the feature dictionary 40 | embedding_size: size of the feature embedding 41 | attention_size: The attention netwotk's parameter 42 | is_shallow_dropout: bool, shallow part(fm or ffm part) uses dropout or not? 43 | dropout_shallow: an array of the size of 1, example:[0.5], the element is for the-first order part 44 | h_depth: deep network's hidden layers' depth 45 | deep_layers: a h_depth-dim array, each element is the size of corresponding hidden layers. example:[32,32] h_depth = 2 46 | is_deep_dropout: bool, deep part uses dropout or not? 
47 | dropout_deep: an array of dropout factors,example:[0.5,0.5,0.5] h_depth=2 48 | deep_layers_activation: relu or sigmoid etc 49 | n_epochs: epochs 50 | batch_size: batch_size 51 | learning_rate: learning_rate 52 | optimizer_type: optimizer_type, 'adam', 'rmsp', 'sgd', 'adag' 53 | is_batch_norm:bool, use batch_norm or not ? 54 | verbose: verbose 55 | weight_decay: weight decay (L2 penalty) 56 | random_seed: random_seed=950104 someone's birthday, my lukcy number 57 | use_fm: bool 58 | use_ffm: bool 59 | loss_type: "logloss", only 60 | eval_metric: roc_auc_score 61 | use_cuda: bool use gpu or cpu? 62 | n_class: number of classes. is bounded to 1 63 | greater_is_better: bool. Is the greater eval better? 64 | 65 | 66 | Attention: only support logsitcs regression 67 | """ 68 | def __init__(self,field_size, feature_sizes, embedding_size = 4, attention_size = 4,is_shallow_dropout = True, dropout_shallow = [0.5], 69 | is_attention_dropout = True, dropout_attention=[0.5], 70 | attention_layers_activation = 'relu', n_epochs = 64, batch_size = 256, learning_rate = 0.003, 71 | optimizer_type = 'adam', is_batch_norm = False, verbose = False, random_seed = 950104, weight_decay = 0.0, 72 | use_fm = True, use_ffm = False,loss_type = 'logloss', eval_metric = roc_auc_score, 73 | use_cuda = True, n_class = 1, greater_is_better = True 74 | ): 75 | super(AFM, self).__init__() 76 | self.field_size = field_size 77 | self.feature_sizes = feature_sizes 78 | self.embedding_size = embedding_size 79 | self.attention_size = attention_size 80 | self.is_shallow_dropout = is_shallow_dropout 81 | self.dropout_shallow = dropout_shallow 82 | self.is_attention_dropout = is_attention_dropout 83 | self.dropout_attention = dropout_attention 84 | self.attention_layers_activation = attention_layers_activation 85 | self.n_epochs = n_epochs 86 | self.batch_size = batch_size 87 | self.learning_rate = learning_rate 88 | self.optimizer_type = optimizer_type 89 | self.is_batch_norm = is_batch_norm 90 | self.verbose = verbose 91 | self.weight_decay = weight_decay 92 | self.random_seed = random_seed 93 | self.use_fm = use_fm 94 | self.use_ffm = use_ffm 95 | self.loss_type = loss_type 96 | self.eval_metric = eval_metric 97 | self.use_cuda = use_cuda 98 | self.n_class = n_class 99 | self.greater_is_better = greater_is_better 100 | 101 | torch.manual_seed(self.random_seed) 102 | 103 | """ 104 | check cuda 105 | """ 106 | if self.use_cuda and not torch.cuda.is_available(): 107 | self.use_cuda = False 108 | print("Cuda is not available, automatically changed into cpu model") 109 | 110 | """ 111 | check use fm or ffm 112 | """ 113 | if self.use_fm and self.use_ffm: 114 | print("only support one type only, please make sure to choose only fm or ffm part") 115 | exit(1) 116 | elif self.use_fm: 117 | print("The model is afm(fm+attention layers)") 118 | elif self.use_ffm: 119 | print("The model is affm(ffm+attention layers)") 120 | else: 121 | print("You have to choose more than one of (fm, ffm) models to use") 122 | exit(1) 123 | """ 124 | bias 125 | """ 126 | self.bias = torch.nn.Parameter(torch.randn(1)) 127 | 128 | """ 129 | fm part 130 | """ 131 | if self.use_fm: 132 | print("Init fm part") 133 | self.fm_first_order_embeddings = nn.ModuleList([nn.Embedding(feature_size,1) for feature_size in self.feature_sizes]) 134 | if self.dropout_shallow: 135 | self.fm_first_order_dropout = nn.Dropout(self.dropout_shallow[0]) 136 | self.fm_second_order_embeddings = nn.ModuleList([nn.Embedding(feature_size, self.embedding_size) for feature_size in 
self.feature_sizes]) 137 | print("Init fm part succeed") 138 | 139 | """ 140 | ffm part 141 | """ 142 | if self.use_ffm: 143 | print("Init ffm part") 144 | self.ffm_first_order_embeddings = nn.ModuleList([nn.Embedding(feature_size,1) for feature_size in self.feature_sizes]) 145 | if self.dropout_shallow: 146 | self.ffm_first_order_dropout = nn.Dropout(self.dropout_shallow[0]) 147 | self.ffm_second_order_embeddings = nn.ModuleList([nn.ModuleList([nn.Embedding(feature_size, self.embedding_size) for i in range(self.field_size)]) for feature_size in self.feature_sizes]) 148 | print("Init ffm part succeed") 149 | 150 | """ 151 | attention part 152 | """ 153 | print("Init attention part") 154 | 155 | if self.is_attention_dropout: 156 | self.attention_linear_0_dropout = nn.Dropout(self.dropout_attention[0]) 157 | self.attention_linear_1 = nn.Linear(self.embedding_size, self.attention_size) 158 | self.H = torch.nn.Parameter(torch.randn(self.attention_size)) 159 | self.P = torch.nn.Parameter(torch.randn(self.embedding_size)) 160 | print("Init attention part succeed") 161 | 162 | print "Init succeed" 163 | 164 | def forward(self, Xi, Xv): 165 | """ 166 | :param Xi_train: index input tensor, batch_size * k * 1 167 | :param Xv_train: value input tensor, batch_size * k * 1 168 | :return: the last output 169 | """ 170 | """ 171 | fm part 172 | """ 173 | if self.use_fm: 174 | fm_first_order_emb_arr = [(torch.sum(emb(Xi[:,i,:]),1).t()*Xv[:,i]).t() for i, emb in enumerate(self.fm_first_order_embeddings)] 175 | fm_first_order = torch.cat(fm_first_order_emb_arr,1) 176 | if self.is_shallow_dropout: 177 | fm_first_order = self.fm_first_order_dropout(fm_first_order) 178 | 179 | 180 | fm_second_order_emb_arr = [(torch.sum(emb(Xi[:, i, :]), 1).t() * Xv[:, i]).t() for i, emb in 181 | enumerate(self.fm_second_order_embeddings)] 182 | fm_wij_arr = [] 183 | for i in range(self.field_size): 184 | for j in range(i + 1, self.field_size): 185 | fm_wij_arr.append(fm_second_order_emb_arr[i] * fm_second_order_emb_arr[j]) 186 | 187 | 188 | """ 189 | ffm part 190 | """ 191 | if self.use_ffm: 192 | ffm_first_order_emb_arr = [(torch.sum(emb(Xi[:,i,:]),1).t()*Xv[:,i]).t() for i, emb in enumerate(self.ffm_first_order_embeddings)] 193 | ffm_first_order = torch.cat(ffm_first_order_emb_arr,1) 194 | if self.is_shallow_dropout: 195 | ffm_first_order = self.ffm_first_order_dropout(ffm_first_order) 196 | ffm_second_order_emb_arr = [[(torch.sum(emb(Xi[:,i,:]), 1).t() * Xv[:,i]).t() for emb in f_embs] for i, f_embs in enumerate(self.ffm_second_order_embeddings)] 197 | ffm_wij_arr = [] 198 | for i in range(self.field_size): 199 | for j in range(i+1, self.field_size): 200 | ffm_wij_arr.append(ffm_second_order_emb_arr[i][j]*ffm_second_order_emb_arr[j][i]) 201 | 202 | """ 203 | attention part 204 | """ 205 | if self.use_fm: 206 | interaction_layer = torch.cat(fm_wij_arr, 1) 207 | else: 208 | interaction_layer = torch.cat(ffm_wij_arr,1) 209 | 210 | if self.attention_layers_activation == 'sigmoid': 211 | activation = F.sigmoid 212 | elif self.attention_layers_activation == 'tanh': 213 | activation = F.tanh 214 | else: 215 | activation = F.relu 216 | 217 | if self.is_attention_dropout: 218 | interaction_layer = self.attention_linear_0_dropout(interaction_layer) 219 | attention_tmp = self.attention_linear_1(interaction_layer.view([-1,self.embedding_size])) 220 | attention_tmp = attention_tmp * self.H 221 | attention_tmp = torch.sum(attention_tmp,1).view([-1,self.field_size*(self.field_size-1)/2]) 222 | attention_weight = 
torch.nn.Softmax()(attention_tmp) 223 | attention_output = torch.sum(interaction_layer.view([-1,self.embedding_size])* self.P,1).view([-1,self.field_size*(self.field_size-1)/2]) 224 | attention_output = attention_output * attention_weight 225 | 226 | 227 | """ 228 | sum 229 | """ 230 | if self.use_fm: 231 | total_sum = self.bias+ torch.sum(fm_first_order,1) + torch.sum(attention_output,1) 232 | elif self.use_ffm: 233 | total_sum = self.bias + torch.sum(ffm_first_order, 1) + torch.sum(attention_output, 1) 234 | return total_sum 235 | 236 | 237 | def fit(self, Xi_train, Xv_train, y_train, Xi_valid=None, Xv_valid=None, 238 | y_valid = None, ealry_stopping=False, refit=False, save_path = None): 239 | """ 240 | :param Xi_train: [[ind1_1, ind1_2, ...], [ind2_1, ind2_2, ...], ..., [indi_1, indi_2, ..., indi_j, ...], ...] 241 | indi_j is the feature index of feature field j of sample i in the training set 242 | :param Xv_train: [[val1_1, val1_2, ...], [val2_1, val2_2, ...], ..., [vali_1, vali_2, ..., vali_j, ...], ...] 243 | vali_j is the feature value of feature field j of sample i in the training set 244 | vali_j can be either binary (1/0, for binary/categorical features) or float (e.g., 10.24, for numerical features) 245 | :param y_train: label of each sample in the training set 246 | :param Xi_valid: list of list of feature indices of each sample in the validation set 247 | :param Xv_valid: list of list of feature values of each sample in the validation set 248 | :param y_valid: label of each sample in the validation set 249 | :param ealry_stopping: perform early stopping or not 250 | :param refit: refit the model on the train+valid dataset or not 251 | :param save_path: the path to save the model 252 | :return: 253 | """ 254 | """ 255 | pre_process 256 | """ 257 | if save_path and not os.path.exists('/'.join(save_path.split('/')[0:-1])): 258 | print("Save path is not existed!") 259 | return 260 | 261 | if self.verbose: 262 | print("pre_process data ing...") 263 | is_valid = False 264 | Xi_train = np.array(Xi_train).reshape((-1,self.field_size,1)) 265 | Xv_train = np.array(Xv_train) 266 | y_train = np.array(y_train) 267 | x_size = Xi_train.shape[0] 268 | if Xi_valid: 269 | Xi_valid = np.array(Xi_valid).reshape((-1,self.field_size,1)) 270 | Xv_valid = np.array(Xv_valid) 271 | y_valid = np.array(y_valid) 272 | x_valid_size = Xi_valid.shape[0] 273 | is_valid = True 274 | if self.verbose: 275 | print("pre_process data finished") 276 | 277 | """ 278 | train model 279 | """ 280 | model = self.train() 281 | 282 | optimizer = torch.optim.SGD(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 283 | if self.optimizer_type == 'adam': 284 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 285 | elif self.optimizer_type == 'rmsp': 286 | optimizer = torch.optim.RMSprop(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 287 | elif self.optimizer_type == 'adag': 288 | optimizer = torch.optim.Adagrad(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 289 | 290 | criterion = F.binary_cross_entropy_with_logits 291 | 292 | train_result = [] 293 | valid_result = [] 294 | for epoch in range(self.n_epochs): 295 | total_loss = 0.0 296 | batch_iter = x_size // self.batch_size 297 | epoch_begin_time = time() 298 | batch_begin_time = time() 299 | for i in range(batch_iter+1): 300 | offset = i*self.batch_size 301 | end = min(x_size, offset+self.batch_size) 302 | if offset == end: 303 | break 304 | 
batch_xi = Variable(torch.LongTensor(Xi_train[offset:end])) 305 | batch_xv = Variable(torch.FloatTensor(Xv_train[offset:end])) 306 | batch_y = Variable(torch.FloatTensor(y_train[offset:end])) 307 | if self.use_cuda: 308 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 309 | optimizer.zero_grad() 310 | outputs = model(batch_xi, batch_xv) 311 | loss = criterion(outputs, batch_y) 312 | loss.backward() 313 | optimizer.step() 314 | 315 | total_loss += loss.data[0] 316 | if self.verbose: 317 | if i % 100 == 99: # print every 100 mini-batches 318 | eval = self.evaluate(batch_xi, batch_xv, batch_y) 319 | print('[%d, %5d] loss: %.6f metric: %.6f time: %.1f s' % 320 | (epoch + 1, i + 1, total_loss/100.0, eval, time()-batch_begin_time)) 321 | total_loss = 0.0 322 | batch_begin_time = time() 323 | 324 | train_loss, train_eval = self.eval_by_batch(Xi_train,Xv_train,y_train,x_size) 325 | train_result.append(train_eval) 326 | print('*'*50) 327 | print('[%d] loss: %.6f metric: %.6f time: %.1f s' % 328 | (epoch + 1, train_loss, train_eval, time()-epoch_begin_time)) 329 | print('*'*50) 330 | 331 | if is_valid: 332 | valid_loss, valid_eval = self.eval_by_batch(Xi_valid, Xv_valid, y_valid, x_valid_size) 333 | valid_result.append(valid_eval) 334 | print('*' * 50) 335 | print('[%d] loss: %.6f metric: %.6f time: %.1f s' % 336 | (epoch + 1, valid_loss, valid_eval,time()-epoch_begin_time)) 337 | print('*' * 50) 338 | if save_path: 339 | torch.save(self.state_dict(),save_path) 340 | if is_valid and ealry_stopping and self.training_termination(valid_result): 341 | print("early stop at [%d] epoch!" % (epoch+1)) 342 | break 343 | 344 | # fit a few more epoch on train+valid until result reaches the best_train_score 345 | if is_valid and refit: 346 | if self.verbose: 347 | print("refitting the model") 348 | if self.greater_is_better: 349 | best_epoch = np.argmax(valid_result) 350 | else: 351 | best_epoch = np.argmin(valid_result) 352 | best_train_score = train_result[best_epoch] 353 | Xi_train = np.concatenate((Xi_train,Xi_valid)) 354 | Xv_train = np.concatenate((Xv_train,Xv_valid)) 355 | y_train = np.concatenate((y_train,y_valid)) 356 | x_size = x_size + x_valid_size 357 | self.shuffle_in_unison_scary(Xi_train,Xv_train,y_train) 358 | for epoch in range(64): 359 | batch_iter = x_size // self.batch_size 360 | for i in range(batch_iter + 1): 361 | offset = i * self.batch_size 362 | end = min(x_size, offset + self.batch_size) 363 | if offset == end: 364 | break 365 | batch_xi = Variable(torch.LongTensor(Xi_train[offset:end])) 366 | batch_xv = Variable(torch.FloatTensor(Xv_train[offset:end])) 367 | batch_y = Variable(torch.FloatTensor(y_train[offset:end])) 368 | if self.use_cuda: 369 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 370 | optimizer.zero_grad() 371 | outputs = model(batch_xi, batch_xv) 372 | loss = criterion(outputs, batch_y) 373 | loss.backward() 374 | optimizer.step() 375 | train_loss, train_eval = self.eval_by_batch(Xi_train, Xv_train, y_train, x_size) 376 | if save_path: 377 | torch.save(self.state_dict(), save_path) 378 | if abs(best_train_score-train_eval) < 0.001 or \ 379 | (self.greater_is_better and train_eval > best_train_score) or \ 380 | ((not self.greater_is_better) and train_result < best_train_score): 381 | break 382 | if self.verbose: 383 | print("refit finished") 384 | 385 | def eval_by_batch(self,Xi, Xv, y, x_size): 386 | total_loss = 0.0 387 | y_pred = [] 388 | if self.use_ffm: 389 | batch_size = 16384*2 390 | else: 391 | 
batch_size = 16384 392 | batch_iter = x_size // batch_size 393 | criterion = F.binary_cross_entropy_with_logits 394 | model = self.eval() 395 | for i in range(batch_iter+1): 396 | offset = i * batch_size 397 | end = min(x_size, offset + batch_size) 398 | if offset == end: 399 | break 400 | batch_xi = Variable(torch.LongTensor(Xi[offset:end])) 401 | batch_xv = Variable(torch.FloatTensor(Xv[offset:end])) 402 | batch_y = Variable(torch.FloatTensor(y[offset:end])) 403 | if self.use_cuda: 404 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 405 | outputs = model(batch_xi, batch_xv) 406 | pred = F.sigmoid(outputs).cpu() 407 | y_pred.extend(pred.data.numpy()) 408 | loss = criterion(outputs, batch_y) 409 | total_loss += loss.data[0]*(end-offset) 410 | total_metric = self.eval_metric(y,y_pred) 411 | return total_loss/x_size, total_metric 412 | 413 | # shuffle three lists simutaneously 414 | def shuffle_in_unison_scary(self, a, b, c): 415 | rng_state = np.random.get_state() 416 | np.random.shuffle(a) 417 | np.random.set_state(rng_state) 418 | np.random.shuffle(b) 419 | np.random.set_state(rng_state) 420 | np.random.shuffle(c) 421 | 422 | def training_termination(self, valid_result): 423 | if len(valid_result) > 4: 424 | if self.greater_is_better: 425 | if valid_result[-1] < valid_result[-2] and \ 426 | valid_result[-2] < valid_result[-3] and \ 427 | valid_result[-3] < valid_result[-4]: 428 | return True 429 | else: 430 | if valid_result[-1] > valid_result[-2] and \ 431 | valid_result[-2] > valid_result[-3] and \ 432 | valid_result[-3] > valid_result[-4]: 433 | return True 434 | return False 435 | 436 | def predict(self, Xi, Xv): 437 | """ 438 | :param Xi: the same as fit function 439 | :param Xv: the same as fit function 440 | :return: output, ont-dim array 441 | """ 442 | Xi = np.array(Xi).reshape((-1,self.field_size,1)) 443 | Xi = Variable(torch.LongTensor(Xi)) 444 | Xv = Variable(torch.FloatTensor(Xv)) 445 | if self.use_cuda and torch.cuda.is_available(): 446 | Xi, Xv = Xi.cuda(), Xv.cuda() 447 | 448 | model = self.eval() 449 | pred = F.sigmoid(model(Xi, Xv)).cpu() 450 | return (pred.data.numpy() > 0.5) 451 | 452 | def predict_proba(self, Xi, Xv): 453 | Xi = np.array(Xi).reshape((-1, self.field_size, 1)) 454 | Xi = Variable(torch.LongTensor(Xi)) 455 | Xv = Variable(torch.FloatTensor(Xv)) 456 | if self.use_cuda and torch.cuda.is_available(): 457 | Xi, Xv = Xi.cuda(), Xv.cuda() 458 | 459 | model = self.eval() 460 | pred = F.sigmoid(model(Xi, Xv)).cpu() 461 | return pred.data.numpy() 462 | 463 | def inner_predict(self, Xi, Xv): 464 | """ 465 | :param Xi: tensor of feature index 466 | :param Xv: tensor of feature value 467 | :return: output, numpy 468 | """ 469 | model = self.eval() 470 | pred = F.sigmoid(model(Xi, Xv)).cpu() 471 | return (pred.data.numpy() > 0.5) 472 | 473 | def inner_predict_proba(self, Xi, Xv): 474 | """ 475 | :param Xi: tensor of feature index 476 | :param Xv: tensor of feature value 477 | :return: output, numpy 478 | """ 479 | model = self.eval() 480 | pred = F.sigmoid(model(Xi, Xv)).cpu() 481 | return pred.data.numpy() 482 | 483 | 484 | def evaluate(self, Xi, Xv, y): 485 | """ 486 | :param Xi: tensor of feature index 487 | :param Xv: tensor of feature value 488 | :param y: tensor of labels 489 | :return: metric of the evaluation 490 | """ 491 | y_pred = self.inner_predict_proba(Xi, Xv) 492 | return self.eval_metric(y.cpu().data.numpy(), y_pred) 493 | 494 | """ 495 | test part 496 | """ 497 | import sys 498 | sys.path.append('../') 499 | from 
utils import data_preprocess 500 | 501 | result_dict = data_preprocess.read_criteo_data('../data/train.csv', '../data/category_emb.csv') 502 | test_dict = data_preprocess.read_criteo_data('../data/test.csv', '../data/category_emb.csv') 503 | with torch.cuda.device(0): 504 | afm = AFM(39, result_dict['feature_sizes'], batch_size=128 * 64, is_shallow_dropout=False, verbose=True, use_cuda=True, 505 | weight_decay=0.00002, use_fm=True, use_ffm=False).cuda() 506 | afm.fit(result_dict['index'], result_dict['value'], result_dict['label'], 507 | test_dict['index'], test_dict['value'], test_dict['label'], ealry_stopping=True, refit=False, 508 | save_path='../data/model/afm.pkl') 509 | -------------------------------------------------------------------------------- /model/DCN.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | Created on Dec 10, 2017 5 | @author: jachin,Nie 6 | 7 | A pytorch implementation of NFM 8 | 9 | Reference: 10 | [1] Deep & Cross Network for Ad Click Predictions 11 | Ruoxi Wang,Stanford University,Stanford, CA,ruoxi@stanford.edu 12 | Bin Fu,Google Inc.,New York, NY,binfu@google.com 13 | Gang Fu,Google Inc.,New York, NY,thomasfu@google.com 14 | Mingliang Wang,Google Inc.,New York, NY,mlwang@google.com 15 | 16 | """ 17 | 18 | import os 19 | import numpy as np 20 | from sklearn.base import BaseEstimator, TransformerMixin 21 | from sklearn.metrics import roc_auc_score 22 | from time import time 23 | 24 | import torch 25 | import torch.autograd as autograd 26 | import torch.nn as nn 27 | import torch.nn.functional as F 28 | import torch.optim as optim 29 | from torch.autograd import Variable 30 | 31 | import torch.backends.cudnn 32 | 33 | 34 | """ 35 | 网络结构部分 36 | """ 37 | 38 | class DCN(torch.nn.Module): 39 | """ 40 | :parameter 41 | ------------- 42 | field_size: size of the feature fields 43 | feature_sizes: a field_size-dim array, sizes of the feature dictionary 44 | embedding_size: size of the feature embedding 45 | h_depth: deep network's hidden layers' depth 46 | deep_layers: a h_depth-dim array, each element is the size of corresponding hidden layers. example:[32,32] h_depth = 2 47 | is_deep_dropout: bool, deep part uses dropout or not? 48 | dropout_deep: an array of dropout factors,example:[0.5,0.5,0.5] h_depth=2 49 | deep_layers_activation: relu or sigmoid etc 50 | n_epochs: epochs 51 | batch_size: batch_size 52 | learning_rate: learning_rate 53 | optimizer_type: optimizer_type, 'adam', 'rmsp', 'sgd', 'adag' 54 | is_batch_norm:bool, use batch_norm or not ? 55 | verbose: verbose 56 | weight_decay: weight decay (L2 penalty) 57 | random_seed: random_seed=950104 someone's birthday, my lukcy number 58 | use_cross: bool 59 | use_inner_prodcut: bool 60 | use_depp:bool 61 | loss_type: "logloss", only 62 | eval_metric: roc_auc_score 63 | use_cuda: bool use gpu or cpu? 64 | n_class: number of classes. is bounded to 1 65 | greater_is_better: bool. Is the greater eval better? 
66 | 67 | 68 | Attention: only support logsitcs regression 69 | """ 70 | def __init__(self,field_size, feature_sizes, embedding_size = 4, 71 | h_depth = 2, deep_layers = [32, 32], is_deep_dropout = True, dropout_deep=[0.0, 0.5, 0.5], 72 | h_cross_depth = 3, 73 | h_inner_product_depth = 2, inner_product_layers = [32, 32], is_inner_product_dropout = True, dropout_inner_product_deep = [0.0, 0.5, 0.5], 74 | deep_layers_activation = 'relu', n_epochs = 64, batch_size = 256, learning_rate = 0.003, 75 | optimizer_type = 'adam', is_batch_norm = False, verbose = False, random_seed = 950104, 76 | use_cross = True, use_inner_product = False, use_deep = True,weight_decay = 0.0,loss_type = 'logloss', eval_metric = roc_auc_score, 77 | use_cuda = True, n_class = 1, greater_is_better = True 78 | ): 79 | super(DCN, self).__init__() 80 | self.field_size = field_size 81 | self.feature_sizes = feature_sizes 82 | self.embedding_size = embedding_size 83 | self.h_depth = h_depth 84 | self.deep_layers = deep_layers 85 | self.is_deep_dropout = is_deep_dropout 86 | self.dropout_deep = dropout_deep 87 | self.h_cross_depth = h_cross_depth 88 | self.h_inner_product_depth = h_inner_product_depth 89 | self.inner_product_layers = inner_product_layers 90 | self.is_inner_product_dropout = is_inner_product_dropout 91 | self.dropout_inner_product_deep = dropout_inner_product_deep 92 | self.deep_layers_activation = deep_layers_activation 93 | self.n_epochs = n_epochs 94 | self.batch_size = batch_size 95 | self.learning_rate = learning_rate 96 | self.optimizer_type = optimizer_type 97 | self.is_batch_norm = is_batch_norm 98 | self.verbose = verbose 99 | self.weight_decay = weight_decay 100 | self.random_seed = random_seed 101 | self.use_cross = use_cross 102 | self.use_inner_product = use_inner_product 103 | self.use_deep = use_deep 104 | self.loss_type = loss_type 105 | self.eval_metric = eval_metric 106 | self.use_cuda = use_cuda 107 | self.n_class = n_class 108 | self.greater_is_better = greater_is_better 109 | 110 | torch.manual_seed(self.random_seed) 111 | 112 | """ 113 | check cuda 114 | """ 115 | if self.use_cuda and not torch.cuda.is_available(): 116 | self.use_cuda = False 117 | print("Cuda is not available, automatically changed into cpu model") 118 | 119 | """ 120 | check model type 121 | """ 122 | if self.use_cross and self.use_deep and self.use_inner_product: 123 | print("The model is (cross network + deep network + inner_product network)") 124 | elif self.use_cross and self.use_deep: 125 | print("The model is (cross network + deep network)") 126 | elif self.use_cross and self.use_inner_product: 127 | print("The model is (cross network + inner product network)") 128 | elif self.use_inner_product and self.use_deep: 129 | print("The model is (inner product network + deep network)") 130 | elif self.use_cross: 131 | print("The model is a cross network only") 132 | elif self.use_deep: 133 | print("The model is a deep network only") 134 | elif self.use_inner_product: 135 | print("The model is an inner product network only") 136 | else: 137 | print("You have to choose more than one of (cross network, deep network, inner product network) models to use") 138 | exit(1) 139 | 140 | """ 141 | embeddings 142 | """ 143 | self.embeddings = nn.ModuleList([nn.Embedding(feature_size, self.embedding_size) for feature_size in self.feature_sizes]) 144 | 145 | cat_size = 0 146 | """ 147 | cross part 148 | """ 149 | if self.use_cross: 150 | print("Init cross network") 151 | for i in range(self.h_cross_depth): 152 | setattr(self, 
'cross_weight_' + str(i+1), 153 | torch.nn.Parameter(torch.randn(self.field_size*self.embedding_size))) 154 | setattr(self, 'cross_bias_' + str(i + 1), 155 | torch.nn.Parameter(torch.randn(self.field_size * self.embedding_size))) 156 | print("Cross network finished") 157 | cat_size += self.field_size * self.embedding_size 158 | 159 | """ 160 | inner prodcut part 161 | """ 162 | if self.use_inner_product: 163 | print("Init inner product network") 164 | if self.is_inner_product_dropout: 165 | self.inner_product_0_dropout = nn.Dropout(self.dropout_inner_product_deep[0]) 166 | self.inner_product_linear_1 = nn.Linear(self.field_size*(self.field_size-1)/2, self.inner_product_layers[0]) 167 | if self.is_inner_product_dropout: 168 | self.inner_product_1_dropout = nn.Dropout(self.dropout_inner_product_deep[1]) 169 | if self.is_batch_norm: 170 | self.inner_product_batch_norm_1 = nn.BatchNorm1d(self.inner_product_layers[0]) 171 | 172 | for i, h in enumerate(self.inner_product_layers[1:], 1): 173 | setattr(self, 'inner_product_linear_' + str(i + 1), nn.Linear(self.inner_product_layers[i - 1], self.inner_product_layers[i])) 174 | if self.is_batch_norm: 175 | setattr(self, 'inner_product_batch_norm_' + str(i + 1), nn.BatchNorm1d(self.inner_product_layers[i])) 176 | if self.is_deep_dropout: 177 | setattr(self, 'inner_product_' + str(i + 1) + '_dropout', nn.Dropout(self.dropout_inner_product_deep[i + 1])) 178 | cat_size += inner_product_layers[-1] 179 | print("Inner product network finished") 180 | 181 | """ 182 | deep part 183 | """ 184 | if self.use_deep: 185 | print("Init deep part") 186 | 187 | if self.is_deep_dropout: 188 | self.linear_0_dropout = nn.Dropout(self.dropout_deep[0]) 189 | self.linear_1 = nn.Linear(self.embedding_size*self.field_size, deep_layers[0]) 190 | if self.is_batch_norm: 191 | self.batch_norm_1 = nn.BatchNorm1d(deep_layers[0]) 192 | if self.is_deep_dropout: 193 | self.linear_1_dropout = nn.Dropout(self.dropout_deep[1]) 194 | for i, h in enumerate(self.deep_layers[1:], 1): 195 | setattr(self, 'linear_' + str(i + 1), nn.Linear(self.deep_layers[i - 1], self.deep_layers[i])) 196 | if self.is_batch_norm: 197 | setattr(self, 'batch_norm_' + str(i + 1), nn.BatchNorm1d(deep_layers[i])) 198 | if self.is_deep_dropout: 199 | setattr(self, 'linear_' + str(i + 1) + '_dropout', nn.Dropout(self.dropout_deep[i + 1])) 200 | cat_size += deep_layers[-1] 201 | print("Init deep part succeed") 202 | 203 | self.last_layer = nn.Linear(cat_size,1) 204 | print "Init succeed" 205 | 206 | def forward(self, Xi, Xv): 207 | """ 208 | :param Xi_train: index input tensor, batch_size * k * 1 209 | :param Xv_train: value input tensor, batch_size * k * 1 210 | :return: the last output 211 | """ 212 | 213 | if self.deep_layers_activation == 'sigmoid': 214 | activation = F.sigmoid 215 | elif self.deep_layers_activation == 'tanh': 216 | activation = F.tanh 217 | else: 218 | activation = F.relu 219 | 220 | """ 221 | embeddings 222 | """ 223 | emb_arr = [(torch.sum(emb(Xi[:,i,:]),1).t()*Xv[:,i]).t() for i, emb in enumerate(self.embeddings)] 224 | outputs = [] 225 | """ 226 | cross part 227 | """ 228 | if self.use_cross: 229 | x_0 = torch.cat(emb_arr,1) 230 | x_l = x_0 231 | for i in range(self.h_cross_depth): 232 | x_l = torch.sum(x_0 * x_l, 1).view([-1,1]) * getattr(self,'cross_weight_'+str(i+1)).view([1,-1]) + getattr(self,'cross_bias_'+str(i+1)) + x_l 233 | outputs.append(x_l) 234 | 235 | """ 236 | inner product part 237 | """ 238 | if self.use_inner_product: 239 | fm_wij_arr = [] 240 | for i in 
range(self.field_size): 241 | for j in range(i + 1, self.field_size): 242 | fm_wij_arr.append(torch.sum(emb_arr[i] * emb_arr[j],1).view([-1,1])) 243 | inner_output = torch.cat(fm_wij_arr,1) 244 | 245 | if self.is_inner_product_dropout: 246 | deep_emb = self.inner_product_0_dropout(inner_output) 247 | x_deep = self.inner_product_linear_1(deep_emb) 248 | if self.is_batch_norm: 249 | x_deep = self.inner_product_batch_norm_1(x_deep) 250 | x_deep = activation(x_deep) 251 | if self.is_inner_product_dropout: 252 | x_deep = self.inner_product_1_dropout(x_deep) 253 | for i in range(1, len(self.deep_layers)): 254 | x_deep = getattr(self, 'inner_product_linear_' + str(i + 1))(x_deep) 255 | if self.is_batch_norm: 256 | x_deep = getattr(self, 'inner_product_batch_norm_' + str(i + 1))(x_deep) 257 | x_deep = activation(x_deep) 258 | if self.is_deep_dropout: 259 | x_deep = getattr(self, 'inner_product_' + str(i + 1) + '_dropout')(x_deep) 260 | outputs.append(x_deep) 261 | 262 | """ 263 | deep part 264 | """ 265 | if self.use_deep: 266 | deep_emb = torch.cat(emb_arr,1) 267 | 268 | if self.is_deep_dropout: 269 | deep_emb = self.linear_0_dropout(deep_emb) 270 | x_deep = self.linear_1(deep_emb) 271 | if self.is_batch_norm: 272 | x_deep = self.batch_norm_1(x_deep) 273 | x_deep = activation(x_deep) 274 | if self.is_deep_dropout: 275 | x_deep = self.linear_1_dropout(x_deep) 276 | for i in range(1, len(self.deep_layers)): 277 | x_deep = getattr(self, 'linear_' + str(i + 1))(x_deep) 278 | if self.is_batch_norm: 279 | x_deep = getattr(self, 'batch_norm_' + str(i + 1))(x_deep) 280 | x_deep = activation(x_deep) 281 | if self.is_deep_dropout: 282 | x_deep = getattr(self, 'linear_' + str(i + 1) + '_dropout')(x_deep) 283 | outputs.append(x_deep) 284 | 285 | """ 286 | total 287 | """ 288 | output = self.last_layer(torch.cat(outputs,1)) 289 | return torch.sum(output,1) 290 | 291 | 292 | def fit(self, Xi_train, Xv_train, y_train, Xi_valid=None, Xv_valid=None, 293 | y_valid = None, ealry_stopping=False, refit=False, save_path = None): 294 | """ 295 | :param Xi_train: [[ind1_1, ind1_2, ...], [ind2_1, ind2_2, ...], ..., [indi_1, indi_2, ..., indi_j, ...], ...] 296 | indi_j is the feature index of feature field j of sample i in the training set 297 | :param Xv_train: [[val1_1, val1_2, ...], [val2_1, val2_2, ...], ..., [vali_1, vali_2, ..., vali_j, ...], ...] 
298 | vali_j is the feature value of feature field j of sample i in the training set 299 | vali_j can be either binary (1/0, for binary/categorical features) or float (e.g., 10.24, for numerical features) 300 | :param y_train: label of each sample in the training set 301 | :param Xi_valid: list of list of feature indices of each sample in the validation set 302 | :param Xv_valid: list of list of feature values of each sample in the validation set 303 | :param y_valid: label of each sample in the validation set 304 | :param ealry_stopping: perform early stopping or not 305 | :param refit: refit the model on the train+valid dataset or not 306 | :param save_path: the path to save the model 307 | :return: 308 | """ 309 | """ 310 | pre_process 311 | """ 312 | if save_path and not os.path.exists('/'.join(save_path.split('/')[0:-1])): 313 | print("Save path is not existed!") 314 | return 315 | 316 | if self.verbose: 317 | print("pre_process data ing...") 318 | is_valid = False 319 | Xi_train = np.array(Xi_train).reshape((-1,self.field_size,1)) 320 | Xv_train = np.array(Xv_train) 321 | y_train = np.array(y_train) 322 | x_size = Xi_train.shape[0] 323 | if Xi_valid: 324 | Xi_valid = np.array(Xi_valid).reshape((-1,self.field_size,1)) 325 | Xv_valid = np.array(Xv_valid) 326 | y_valid = np.array(y_valid) 327 | x_valid_size = Xi_valid.shape[0] 328 | is_valid = True 329 | if self.verbose: 330 | print("pre_process data finished") 331 | 332 | """ 333 | train model 334 | """ 335 | model = self.train() 336 | 337 | optimizer = torch.optim.SGD(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 338 | if self.optimizer_type == 'adam': 339 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 340 | elif self.optimizer_type == 'rmsp': 341 | optimizer = torch.optim.RMSprop(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 342 | elif self.optimizer_type == 'adag': 343 | optimizer = torch.optim.Adagrad(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 344 | 345 | criterion = F.binary_cross_entropy_with_logits 346 | 347 | train_result = [] 348 | valid_result = [] 349 | for epoch in range(self.n_epochs): 350 | total_loss = 0.0 351 | batch_iter = x_size // self.batch_size 352 | epoch_begin_time = time() 353 | batch_begin_time = time() 354 | for i in range(batch_iter+1): 355 | offset = i*self.batch_size 356 | end = min(x_size, offset+self.batch_size) 357 | if offset == end: 358 | break 359 | batch_xi = Variable(torch.LongTensor(Xi_train[offset:end])) 360 | batch_xv = Variable(torch.FloatTensor(Xv_train[offset:end])) 361 | batch_y = Variable(torch.FloatTensor(y_train[offset:end])) 362 | if self.use_cuda: 363 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 364 | optimizer.zero_grad() 365 | outputs = model(batch_xi, batch_xv) 366 | loss = criterion(outputs, batch_y) 367 | loss.backward() 368 | optimizer.step() 369 | 370 | total_loss += loss.data[0] 371 | if self.verbose: 372 | if i % 100 == 99: # print every 100 mini-batches 373 | eval = self.evaluate(batch_xi, batch_xv, batch_y) 374 | print('[%d, %5d] loss: %.6f metric: %.6f time: %.1f s' % 375 | (epoch + 1, i + 1, total_loss/100.0, eval, time()-batch_begin_time)) 376 | total_loss = 0.0 377 | batch_begin_time = time() 378 | 379 | train_loss, train_eval = self.eval_by_batch(Xi_train,Xv_train,y_train,x_size) 380 | train_result.append(train_eval) 381 | print('*'*50) 382 | print('[%d] loss: %.6f metric: %.6f time: %.1f s' % 
383 | (epoch + 1, train_loss, train_eval, time()-epoch_begin_time)) 384 | print('*'*50) 385 | 386 | if is_valid: 387 | valid_loss, valid_eval = self.eval_by_batch(Xi_valid, Xv_valid, y_valid, x_valid_size) 388 | valid_result.append(valid_eval) 389 | print('*' * 50) 390 | print('[%d] loss: %.6f metric: %.6f time: %.1f s' % 391 | (epoch + 1, valid_loss, valid_eval,time()-epoch_begin_time)) 392 | print('*' * 50) 393 | if save_path: 394 | torch.save(self.state_dict(),save_path) 395 | if is_valid and ealry_stopping and self.training_termination(valid_result): 396 | print("early stop at [%d] epoch!" % (epoch+1)) 397 | break 398 | 399 | # fit a few more epoch on train+valid until result reaches the best_train_score 400 | if is_valid and refit: 401 | if self.verbose: 402 | print("refitting the model") 403 | if self.greater_is_better: 404 | best_epoch = np.argmax(valid_result) 405 | else: 406 | best_epoch = np.argmin(valid_result) 407 | best_train_score = train_result[best_epoch] 408 | Xi_train = np.concatenate((Xi_train,Xi_valid)) 409 | Xv_train = np.concatenate((Xv_train,Xv_valid)) 410 | y_train = np.concatenate((y_train,y_valid)) 411 | x_size = x_size + x_valid_size 412 | self.shuffle_in_unison_scary(Xi_train,Xv_train,y_train) 413 | for epoch in range(64): 414 | batch_iter = x_size // self.batch_size 415 | for i in range(batch_iter + 1): 416 | offset = i * self.batch_size 417 | end = min(x_size, offset + self.batch_size) 418 | if offset == end: 419 | break 420 | batch_xi = Variable(torch.LongTensor(Xi_train[offset:end])) 421 | batch_xv = Variable(torch.FloatTensor(Xv_train[offset:end])) 422 | batch_y = Variable(torch.FloatTensor(y_train[offset:end])) 423 | if self.use_cuda: 424 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 425 | optimizer.zero_grad() 426 | outputs = model(batch_xi, batch_xv) 427 | loss = criterion(outputs, batch_y) 428 | loss.backward() 429 | optimizer.step() 430 | train_loss, train_eval = self.eval_by_batch(Xi_train, Xv_train, y_train, x_size) 431 | if save_path: 432 | torch.save(self.state_dict(), save_path) 433 | if abs(best_train_score-train_eval) < 0.001 or \ 434 | (self.greater_is_better and train_eval > best_train_score) or \ 435 | ((not self.greater_is_better) and train_result < best_train_score): 436 | break 437 | if self.verbose: 438 | print("refit finished") 439 | 440 | def eval_by_batch(self,Xi, Xv, y, x_size): 441 | total_loss = 0.0 442 | y_pred = [] 443 | batch_size = 16384 444 | batch_iter = x_size // batch_size 445 | criterion = F.binary_cross_entropy_with_logits 446 | model = self.eval() 447 | for i in range(batch_iter+1): 448 | offset = i * batch_size 449 | end = min(x_size, offset + batch_size) 450 | if offset == end: 451 | break 452 | batch_xi = Variable(torch.LongTensor(Xi[offset:end])) 453 | batch_xv = Variable(torch.FloatTensor(Xv[offset:end])) 454 | batch_y = Variable(torch.FloatTensor(y[offset:end])) 455 | if self.use_cuda: 456 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 457 | outputs = model(batch_xi, batch_xv) 458 | pred = F.sigmoid(outputs).cpu() 459 | y_pred.extend(pred.data.numpy()) 460 | loss = criterion(outputs, batch_y) 461 | total_loss += loss.data[0]*(end-offset) 462 | total_metric = self.eval_metric(y,y_pred) 463 | return total_loss/x_size, total_metric 464 | 465 | # shuffle three lists simutaneously 466 | def shuffle_in_unison_scary(self, a, b, c): 467 | rng_state = np.random.get_state() 468 | np.random.shuffle(a) 469 | np.random.set_state(rng_state) 470 | 
np.random.shuffle(b) 471 | np.random.set_state(rng_state) 472 | np.random.shuffle(c) 473 | 474 | def training_termination(self, valid_result): 475 | if len(valid_result) > 4: 476 | if self.greater_is_better: 477 | if valid_result[-1] < valid_result[-2] and \ 478 | valid_result[-2] < valid_result[-3] and \ 479 | valid_result[-3] < valid_result[-4]: 480 | return True 481 | else: 482 | if valid_result[-1] > valid_result[-2] and \ 483 | valid_result[-2] > valid_result[-3] and \ 484 | valid_result[-3] > valid_result[-4]: 485 | return True 486 | return False 487 | 488 | def predict(self, Xi, Xv): 489 | """ 490 | :param Xi: the same as fit function 491 | :param Xv: the same as fit function 492 | :return: output, ont-dim array 493 | """ 494 | Xi = np.array(Xi).reshape((-1,self.field_size,1)) 495 | Xi = Variable(torch.LongTensor(Xi)) 496 | Xv = Variable(torch.FloatTensor(Xv)) 497 | if self.use_cuda and torch.cuda.is_available(): 498 | Xi, Xv = Xi.cuda(), Xv.cuda() 499 | 500 | model = self.eval() 501 | pred = F.sigmoid(model(Xi, Xv)).cpu() 502 | return (pred.data.numpy() > 0.5) 503 | 504 | def predict_proba(self, Xi, Xv): 505 | Xi = np.array(Xi).reshape((-1, self.field_size, 1)) 506 | Xi = Variable(torch.LongTensor(Xi)) 507 | Xv = Variable(torch.FloatTensor(Xv)) 508 | if self.use_cuda and torch.cuda.is_available(): 509 | Xi, Xv = Xi.cuda(), Xv.cuda() 510 | 511 | model = self.eval() 512 | pred = F.sigmoid(model(Xi, Xv)).cpu() 513 | return pred.data.numpy() 514 | 515 | def inner_predict(self, Xi, Xv): 516 | """ 517 | :param Xi: tensor of feature index 518 | :param Xv: tensor of feature value 519 | :return: output, numpy 520 | """ 521 | model = self.eval() 522 | pred = F.sigmoid(model(Xi, Xv)).cpu() 523 | return (pred.data.numpy() > 0.5) 524 | 525 | def inner_predict_proba(self, Xi, Xv): 526 | """ 527 | :param Xi: tensor of feature index 528 | :param Xv: tensor of feature value 529 | :return: output, numpy 530 | """ 531 | model = self.eval() 532 | pred = F.sigmoid(model(Xi, Xv)).cpu() 533 | return pred.data.numpy() 534 | 535 | 536 | def evaluate(self, Xi, Xv, y): 537 | """ 538 | :param Xi: tensor of feature index 539 | :param Xv: tensor of feature value 540 | :param y: tensor of labels 541 | :return: metric of the evaluation 542 | """ 543 | y_pred = self.inner_predict_proba(Xi, Xv) 544 | return self.eval_metric(y.cpu().data.numpy(), y_pred) 545 | 546 | """ 547 | test part 548 | """ 549 | import sys 550 | sys.path.append('../') 551 | from utils import data_preprocess 552 | 553 | result_dict = data_preprocess.read_criteo_data('../data/train.csv', '../data/category_emb.csv') 554 | test_dict = data_preprocess.read_criteo_data('../data/test.csv', '../data/category_emb.csv') 555 | with torch.cuda.device(0): 556 | dcn = DCN(39, result_dict['feature_sizes'], batch_size=128 * 32, verbose=True, use_cuda=True, 557 | weight_decay=0.00002, use_inner_product=True).cuda() 558 | dcn.fit(result_dict['index'], result_dict['value'], result_dict['label'], 559 | test_dict['index'], test_dict['value'], test_dict['label'], ealry_stopping=True, refit=False, 560 | save_path='../data/model/dcn.pkl') 561 | -------------------------------------------------------------------------------- /model/DIN.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | Created on Dec 10, 2017 5 | @author: jachin,Nie 6 | 7 | A pytorch implementation of NFM 8 | 9 | Reference: 10 | [1] Neural Factorization Machines for Sparse Predictive Analytics 11 | Xiangnan He,School of 
Computing,National University of Singapore,Singapore 117417,dcshex@nus.edu.sg 12 | Tat-Seng Chua,School of Computing,National University of Singapore,Singapore 117417,dcscts@nus.edu.sg 13 | 14 | """ 15 | 16 | import os 17 | import numpy as np 18 | from sklearn.base import BaseEstimator, TransformerMixin 19 | from sklearn.metrics import roc_auc_score 20 | from time import time 21 | 22 | import torch 23 | import torch.autograd as autograd 24 | import torch.nn as nn 25 | import torch.nn.functional as F 26 | import torch.optim as optim 27 | from torch.autograd import Variable 28 | 29 | import torch.backends.cudnn 30 | 31 | 32 | """ 33 | 网络结构部分 34 | """ 35 | 36 | class DIN(torch.nn.Module): 37 | """ 38 | :parameter 39 | ------------- 40 | field_size: size of the feature fields 41 | feature_sizes: a field_size-dim array, sizes of the feature dictionary 42 | embedding_size: size of the feature embedding 43 | is_shallow_dropout: bool, shallow part(fm or ffm part) uses dropout or not? 44 | dropout_shallow: an array of the size of 1, example:[0.5], the element is for the-first order part 45 | h_depth: deep network's hidden layers' depth 46 | deep_layers: a h_depth-dim array, each element is the size of corresponding hidden layers. example:[32,32] h_depth = 2 47 | is_deep_dropout: bool, deep part uses dropout or not? 48 | dropout_deep: an array of dropout factors,example:[0.5,0.5,0.5] h_depth=2 49 | deep_layers_activation: relu or sigmoid etc 50 | n_epochs: epochs 51 | batch_size: batch_size 52 | learning_rate: learning_rate 53 | optimizer_type: optimizer_type, 'adam', 'rmsp', 'sgd', 'adag' 54 | is_batch_norm:bool, use batch_norm or not ? 55 | verbose: verbose 56 | weight_decay: weight decay (L2 penalty) 57 | random_seed: random_seed=950104 someone's birthday, my lukcy number 58 | use_fm: bool 59 | use_ffm: bool 60 | interation_type: bool, When it's true, the element-wise product of the fm or ffm embeddings will be added together, otherwise, the element-wise prodcut of embeddings will be concatenated. 61 | loss_type: "logloss", only 62 | eval_metric: roc_auc_score 63 | use_cuda: bool use gpu or cpu? 64 | n_class: number of classes. is bounded to 1 65 | greater_is_better: bool. Is the greater eval better? 
66 | 67 | 68 | Attention: only support logsitcs regression 69 | """ 70 | def __init__(self,field_size, feature_sizes, embedding_size = 4, is_shallow_dropout = True, dropout_shallow = [0.5], 71 | h_depth = 2, deep_layers = [32, 32], is_deep_dropout = True, dropout_deep=[0.0, 0.5, 0.5], 72 | deep_layers_activation = 'relu', n_epochs = 64, batch_size = 256, learning_rate = 0.003, 73 | optimizer_type = 'adam', is_batch_norm = False, verbose = False, random_seed = 950104, weight_decay = 0.0, 74 | use_fm = True, use_ffm = False, use_high_interaction = True,interation_type = True,loss_type = 'logloss', eval_metric = roc_auc_score, 75 | use_cuda = True, n_class = 1, greater_is_better = True 76 | ): 77 | super(DIN, self).__init__() 78 | self.field_size = field_size 79 | self.feature_sizes = feature_sizes 80 | self.embedding_size = embedding_size 81 | self.is_shallow_dropout = is_shallow_dropout 82 | self.dropout_shallow = dropout_shallow 83 | self.h_depth = h_depth 84 | self.deep_layers = deep_layers 85 | self.is_deep_dropout = is_deep_dropout 86 | self.dropout_deep = dropout_deep 87 | self.deep_layers_activation = deep_layers_activation 88 | self.n_epochs = n_epochs 89 | self.batch_size = batch_size 90 | self.learning_rate = learning_rate 91 | self.optimizer_type = optimizer_type 92 | self.is_batch_norm = is_batch_norm 93 | self.verbose = verbose 94 | self.weight_decay = weight_decay 95 | self.random_seed = random_seed 96 | self.use_fm = use_fm 97 | self.use_ffm = use_ffm 98 | self.use_high_interaction = use_high_interaction 99 | self.interation_type = interation_type 100 | self.loss_type = loss_type 101 | self.eval_metric = eval_metric 102 | self.use_cuda = use_cuda 103 | self.n_class = n_class 104 | self.greater_is_better = greater_is_better 105 | self.pre_train = False 106 | 107 | torch.manual_seed(self.random_seed) 108 | 109 | """ 110 | check cuda 111 | """ 112 | if self.use_cuda and not torch.cuda.is_available(): 113 | self.use_cuda = False 114 | print("Cuda is not available, automatically changed into cpu model") 115 | 116 | """ 117 | check use fm or ffm 118 | """ 119 | if self.use_fm and self.use_ffm: 120 | print("only support one type only, please make sure to choose only fm or ffm part") 121 | exit(1) 122 | elif self.use_fm: 123 | print("The model is nfm(fm+nn layers)") 124 | elif self.use_ffm: 125 | print("The model is nffm(ffm+nn layers)") 126 | else: 127 | print("You have to choose more than one of (fm, ffm) models to use") 128 | exit(1) 129 | """ 130 | bias 131 | """ 132 | self.bias = torch.nn.Parameter(torch.randn(1)) 133 | 134 | """ 135 | fm part 136 | """ 137 | if self.use_fm: 138 | print("Init fm part") 139 | self.fm_first_order_embeddings = nn.ModuleList([nn.Embedding(feature_size,1) for feature_size in self.feature_sizes]) 140 | if self.dropout_shallow: 141 | self.fm_first_order_dropout = nn.Dropout(self.dropout_shallow[0]) 142 | self.fm_second_order_embeddings = nn.ModuleList([nn.Embedding(feature_size, self.embedding_size) for feature_size in self.feature_sizes]) 143 | print("Init fm part succeed") 144 | 145 | """ 146 | ffm part 147 | """ 148 | if self.use_ffm: 149 | print("Init ffm part") 150 | self.ffm_first_order_embeddings = nn.ModuleList([nn.Embedding(feature_size,1) for feature_size in self.feature_sizes]) 151 | if self.dropout_shallow: 152 | self.ffm_first_order_dropout = nn.Dropout(self.dropout_shallow[0]) 153 | self.ffm_second_order_embeddings = nn.ModuleList([nn.ModuleList([nn.Embedding(feature_size, self.embedding_size) for i in range(self.field_size)]) for 
feature_size in self.feature_sizes]) 154 | print("Init ffm part succeed") 155 | 156 | """ 157 | high interaction part 158 | """ 159 | if self.use_high_interaction and self.use_fm: 160 | self.h_weights = nn.ParameterList([torch.nn.Parameter(torch.ones(self.embedding_size)) for i in range(self.field_size)]) 161 | self.h_bias = nn.ParameterList([torch.nn.Parameter(torch.ones(1)) for i in range(self.field_size)]) 162 | self.h_batch_norm = nn.BatchNorm1d(self.field_size) 163 | 164 | """ 165 | deep part 166 | """ 167 | print("Init deep part") 168 | 169 | if self.is_deep_dropout: 170 | self.linear_0_dropout = nn.Dropout(self.dropout_deep[0]) 171 | if self.interation_type: 172 | self.linear_1 = nn.Linear(self.embedding_size, deep_layers[0]) 173 | else: 174 | self.linear_1 = nn.Linear(self.field_size*(self.field_size-1)//2, deep_layers[0]) 175 | if self.is_batch_norm: 176 | self.batch_norm_1 = nn.BatchNorm1d(deep_layers[0]) 177 | if self.is_deep_dropout: 178 | self.linear_1_dropout = nn.Dropout(self.dropout_deep[1]) 179 | for i, h in enumerate(self.deep_layers[1:], 1): 180 | setattr(self, 'linear_' + str(i + 1), nn.Linear(self.deep_layers[i - 1], self.deep_layers[i])) 181 | if self.is_batch_norm: 182 | setattr(self, 'batch_norm_' + str(i + 1), nn.BatchNorm1d(deep_layers[i])) 183 | if self.is_deep_dropout: 184 | setattr(self, 'linear_' + str(i + 1) + '_dropout', nn.Dropout(self.dropout_deep[i + 1])) 185 | 186 | print("Init deep part succeed") 187 | 188 | print("Init succeed") 189 | 190 | def forward(self, Xi, Xv): 191 | """ 192 | :param Xi_train: index input tensor, batch_size * k * 1 193 | :param Xv_train: value input tensor, batch_size * k * 1 194 | :return: the last output 195 | """ 196 | """ 197 | fm part 198 | """ 199 | if self.use_fm: 200 | fm_first_order_emb_arr = [(torch.sum(emb(Xi[:,i,:]),1).t()*Xv[:,i]).t() for i, emb in enumerate(self.fm_first_order_embeddings)] 201 | fm_first_order = torch.cat(fm_first_order_emb_arr,1) 202 | if self.is_shallow_dropout: 203 | fm_first_order = self.fm_first_order_dropout(fm_first_order) 204 | 205 | if self.interation_type: 206 | # use 2xy = (x+y)^2 - x^2 - y^2 to reduce calculation 207 | fm_second_order_emb_arr = [(torch.sum(emb(Xi[:,i,:]),1).t()*Xv[:,i]).t() for i, emb in enumerate(self.fm_second_order_embeddings)] 208 | fm_sum_second_order_emb = sum(fm_second_order_emb_arr) 209 | fm_sum_second_order_emb_square = fm_sum_second_order_emb*fm_sum_second_order_emb # (x+y)^2 210 | fm_second_order_emb_square = [item*item for item in fm_second_order_emb_arr] 211 | fm_second_order_emb_square_sum = sum(fm_second_order_emb_square) #x^2+y^2 212 | fm_second_order = (fm_sum_second_order_emb_square - fm_second_order_emb_square_sum) * 0.5 213 | else: 214 | fm_second_order_emb_arr = [(torch.sum(emb(Xi[:, i, :]), 1).t() * Xv[:, i]).t() for i, emb in 215 | enumerate(self.fm_second_order_embeddings)] 216 | fm_wij_arr = [] 217 | for i in range(self.field_size): 218 | for j in range(i + 1, self.field_size): 219 | fm_wij_arr.append(fm_second_order_emb_arr[i] * fm_second_order_emb_arr[j]) 220 | 221 | 222 | """ 223 | ffm part 224 | """ 225 | if self.use_ffm: 226 | ffm_first_order_emb_arr = [(torch.sum(emb(Xi[:,i,:]),1).t()*Xv[:,i]).t() for i, emb in enumerate(self.ffm_first_order_embeddings)] 227 | ffm_first_order = torch.cat(ffm_first_order_emb_arr,1) 228 | if self.is_shallow_dropout: 229 | ffm_first_order = self.ffm_first_order_dropout(ffm_first_order) 230 | ffm_second_order_emb_arr = [[(torch.sum(emb(Xi[:,i,:]), 1).t() * Xv[:,i]).t() for emb in f_embs] for i, f_embs in 
enumerate(self.ffm_second_order_embeddings)] 231 | ffm_wij_arr = [] 232 | for i in range(self.field_size): 233 | for j in range(i+1, self.field_size): 234 | ffm_wij_arr.append(ffm_second_order_emb_arr[i][j]*ffm_second_order_emb_arr[j][i]) 235 | ffm_second_order = sum(ffm_wij_arr) 236 | 237 | """ 238 | high interaction part 239 | """ 240 | if self.use_high_interaction and self.use_fm: 241 | total_prod = 1.0 242 | for i, h_weight in enumerate(self.h_weights): 243 | total_prod = total_prod * (fm_second_order_emb_arr[i]*h_weight+self.h_bias[i]) 244 | high_output = total_prod 245 | 246 | 247 | """ 248 | deep part 249 | """ 250 | if self.use_fm and self.interation_type: 251 | deep_emb = fm_second_order 252 | elif self.use_ffm and self.interation_type: 253 | deep_emb = ffm_second_order 254 | elif self.use_fm: 255 | deep_emb = torch.cat([torch.sum(fm_wij,1).view([-1,1]) for fm_wij in fm_wij_arr], 1) 256 | else: 257 | deep_emb = torch.cat([torch.sum(ffm_wij,1).view([-1,1]) for ffm_wij in ffm_wij_arr],1) 258 | 259 | if self.deep_layers_activation == 'sigmoid': 260 | activation = F.sigmoid 261 | elif self.deep_layers_activation == 'tanh': 262 | activation = F.tanh 263 | else: 264 | activation = F.relu 265 | 266 | if self.is_deep_dropout: 267 | deep_emb = self.linear_0_dropout(deep_emb) 268 | x_deep = self.linear_1(deep_emb) 269 | if self.is_batch_norm: 270 | x_deep = self.batch_norm_1(x_deep) 271 | x_deep = activation(x_deep) 272 | if self.is_deep_dropout: 273 | x_deep = self.linear_1_dropout(x_deep) 274 | for i in range(1, len(self.deep_layers)): 275 | x_deep = getattr(self, 'linear_' + str(i + 1))(x_deep) 276 | if self.is_batch_norm: 277 | x_deep = getattr(self, 'batch_norm_' + str(i + 1))(x_deep) 278 | x_deep = activation(x_deep) 279 | if self.is_deep_dropout: 280 | x_deep = getattr(self, 'linear_' + str(i + 1) + '_dropout')(x_deep) 281 | 282 | """ 283 | sum 284 | """ 285 | if self.use_fm: 286 | if self.use_high_interaction and not self.pre_train: 287 | total_sum = self.bias+ torch.sum(fm_first_order,1) + torch.sum(x_deep, 1) + torch.sum(high_output,1) 288 | else: 289 | total_sum = self.bias + torch.sum(fm_first_order, 1) + torch.sum(x_deep, 1) 290 | elif self.use_ffm: 291 | total_sum = self.bias + torch.sum(ffm_first_order, 1) + torch.sum(x_deep, 1) 292 | return total_sum 293 | 294 | 295 | def fit(self, Xi_train, Xv_train, y_train, Xi_valid=None, Xv_valid=None, 296 | y_valid = None, ealry_stopping=False, pre_train = False, n_epochs = 64,refit=False, save_path = None): 297 | """ 298 | :param Xi_train: [[ind1_1, ind1_2, ...], [ind2_1, ind2_2, ...], ..., [indi_1, indi_2, ..., indi_j, ...], ...] 299 | indi_j is the feature index of feature field j of sample i in the training set 300 | :param Xv_train: [[val1_1, val1_2, ...], [val2_1, val2_2, ...], ..., [vali_1, vali_2, ..., vali_j, ...], ...] 
301 | vali_j is the feature value of feature field j of sample i in the training set 302 | vali_j can be either binary (1/0, for binary/categorical features) or float (e.g., 10.24, for numerical features) 303 | :param y_train: label of each sample in the training set 304 | :param Xi_valid: list of list of feature indices of each sample in the validation set 305 | :param Xv_valid: list of list of feature values of each sample in the validation set 306 | :param y_valid: label of each sample in the validation set 307 | :param ealry_stopping: perform early stopping or not 308 | :param pre_train: if True, train without adding the high interaction part to the output 309 | :param n_epochs: number of epochs 310 | :param refit: refit the model on the train+valid dataset or not 311 | :param save_path: the path to save the model 312 | :return: 313 | """ 314 | """ 315 | pre_process 316 | """ 317 | if save_path and not os.path.exists('/'.join(save_path.split('/')[0:-1])): 318 | print("Save path does not exist!") 319 | return 320 | 321 | if self.verbose: 322 | print("pre-processing data...") 323 | 324 | self.pre_train = pre_train 325 | self.n_epochs = n_epochs 326 | is_valid = False 327 | Xi_train = np.array(Xi_train).reshape((-1,self.field_size,1)) 328 | Xv_train = np.array(Xv_train) 329 | y_train = np.array(y_train) 330 | x_size = Xi_train.shape[0] 331 | if Xi_valid: 332 | Xi_valid = np.array(Xi_valid).reshape((-1,self.field_size,1)) 333 | Xv_valid = np.array(Xv_valid) 334 | y_valid = np.array(y_valid) 335 | x_valid_size = Xi_valid.shape[0] 336 | is_valid = True 337 | if self.verbose: 338 | print("pre-processing data finished") 339 | 340 | """ 341 | train model 342 | """ 343 | model = self.train() 344 | 345 | optimizer = torch.optim.SGD(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 346 | if self.optimizer_type == 'adam': 347 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 348 | elif self.optimizer_type == 'rmsp': 349 | optimizer = torch.optim.RMSprop(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 350 | elif self.optimizer_type == 'adag': 351 | optimizer = torch.optim.Adagrad(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 352 | 353 | criterion = F.binary_cross_entropy_with_logits 354 | 355 | train_result = [] 356 | valid_result = [] 357 | for epoch in range(self.n_epochs): 358 | total_loss = 0.0 359 | batch_iter = x_size // self.batch_size 360 | epoch_begin_time = time() 361 | batch_begin_time = time() 362 | for i in range(batch_iter+1): 363 | offset = i*self.batch_size 364 | end = min(x_size, offset+self.batch_size) 365 | if offset == end: 366 | break 367 | batch_xi = Variable(torch.LongTensor(Xi_train[offset:end])) 368 | batch_xv = Variable(torch.FloatTensor(Xv_train[offset:end])) 369 | batch_y = Variable(torch.FloatTensor(y_train[offset:end])) 370 | if self.use_cuda: 371 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 372 | optimizer.zero_grad() 373 | outputs = model(batch_xi, batch_xv) 374 | loss = criterion(outputs, batch_y) 375 | try: 376 | loss.backward() 377 | except: 378 | print(batch_xi.is_cuda, batch_xv.is_cuda, batch_y.is_cuda) 379 | print(batch_xi) 380 | print(batch_xv) 381 | print(batch_y) 382 | optimizer.step() 383 | 384 | total_loss += loss.data[0] 385 | if self.verbose: 386 | if i % 100 == 99: # print every 100 mini-batches 387 | eval = self.evaluate(batch_xi, batch_xv, batch_y) 388 | print('[%d, %5d] loss: %.6f metric: %.6f time: %.1f s' % 389 | (epoch + 1, i + 1, 
total_loss/100.0, eval, time()-batch_begin_time)) 390 | total_loss = 0.0 391 | batch_begin_time = time() 392 | 393 | train_loss, train_eval = self.eval_by_batch(Xi_train,Xv_train,y_train,x_size) 394 | train_result.append(train_eval) 395 | print('*'*50) 396 | print('[%d] loss: %.6f metric: %.6f time: %.1f s' % 397 | (epoch + 1, train_loss, train_eval, time()-epoch_begin_time)) 398 | print('*'*50) 399 | 400 | if is_valid: 401 | valid_loss, valid_eval = self.eval_by_batch(Xi_valid, Xv_valid, y_valid, x_valid_size) 402 | valid_result.append(valid_eval) 403 | print('*' * 50) 404 | print('[%d] loss: %.6f metric: %.6f time: %.1f s' % 405 | (epoch + 1, valid_loss, valid_eval,time()-epoch_begin_time)) 406 | print('*' * 50) 407 | if save_path: 408 | torch.save(self.state_dict(),save_path) 409 | if is_valid and ealry_stopping and self.training_termination(valid_result): 410 | print("early stop at [%d] epoch!" % (epoch+1)) 411 | break 412 | 413 | # fit a few more epoch on train+valid until result reaches the best_train_score 414 | if is_valid and refit: 415 | if self.verbose: 416 | print("refitting the model") 417 | if self.greater_is_better: 418 | best_epoch = np.argmax(valid_result) 419 | else: 420 | best_epoch = np.argmin(valid_result) 421 | best_train_score = train_result[best_epoch] 422 | Xi_train = np.concatenate((Xi_train,Xi_valid)) 423 | Xv_train = np.concatenate((Xv_train,Xv_valid)) 424 | y_train = np.concatenate((y_train,y_valid)) 425 | x_size = x_size + x_valid_size 426 | self.shuffle_in_unison_scary(Xi_train,Xv_train,y_train) 427 | for epoch in range(64): 428 | batch_iter = x_size // self.batch_size 429 | for i in range(batch_iter + 1): 430 | offset = i * self.batch_size 431 | end = min(x_size, offset + self.batch_size) 432 | if offset == end: 433 | break 434 | batch_xi = Variable(torch.LongTensor(Xi_train[offset:end])) 435 | batch_xv = Variable(torch.FloatTensor(Xv_train[offset:end])) 436 | batch_y = Variable(torch.FloatTensor(y_train[offset:end])) 437 | if self.use_cuda: 438 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 439 | optimizer.zero_grad() 440 | outputs = model(batch_xi, batch_xv) 441 | loss = criterion(outputs, batch_y) 442 | loss.backward() 443 | optimizer.step() 444 | train_loss, train_eval = self.eval_by_batch(Xi_train, Xv_train, y_train, x_size) 445 | if save_path: 446 | torch.save(self.state_dict(), save_path) 447 | if abs(best_train_score-train_eval) < 0.001 or \ 448 | (self.greater_is_better and train_eval > best_train_score) or \ 449 | ((not self.greater_is_better) and train_result < best_train_score): 450 | break 451 | if self.verbose: 452 | print("refit finished") 453 | 454 | def eval_by_batch(self,Xi, Xv, y, x_size): 455 | total_loss = 0.0 456 | y_pred = [] 457 | if self.use_ffm: 458 | batch_size = 16384*2 459 | else: 460 | batch_size = 16384 461 | batch_iter = x_size // batch_size 462 | criterion = F.binary_cross_entropy_with_logits 463 | model = self.eval() 464 | for i in range(batch_iter+1): 465 | offset = i * batch_size 466 | end = min(x_size, offset + batch_size) 467 | if offset == end: 468 | break 469 | batch_xi = Variable(torch.LongTensor(Xi[offset:end])) 470 | batch_xv = Variable(torch.FloatTensor(Xv[offset:end])) 471 | batch_y = Variable(torch.FloatTensor(y[offset:end])) 472 | if self.use_cuda: 473 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 474 | outputs = model(batch_xi, batch_xv) 475 | pred = F.sigmoid(outputs).cpu() 476 | y_pred.extend(pred.data.numpy()) 477 | loss = 
criterion(outputs, batch_y) 478 | total_loss += loss.data[0]*(end-offset) 479 | total_metric = self.eval_metric(y,y_pred) 480 | return total_loss/x_size, total_metric 481 | 482 | # shuffle three lists simutaneously 483 | def shuffle_in_unison_scary(self, a, b, c): 484 | rng_state = np.random.get_state() 485 | np.random.shuffle(a) 486 | np.random.set_state(rng_state) 487 | np.random.shuffle(b) 488 | np.random.set_state(rng_state) 489 | np.random.shuffle(c) 490 | 491 | def training_termination(self, valid_result): 492 | if len(valid_result) > 4: 493 | if self.greater_is_better: 494 | if valid_result[-1] < valid_result[-2] and \ 495 | valid_result[-2] < valid_result[-3] and \ 496 | valid_result[-3] < valid_result[-4]: 497 | return True 498 | else: 499 | if valid_result[-1] > valid_result[-2] and \ 500 | valid_result[-2] > valid_result[-3] and \ 501 | valid_result[-3] > valid_result[-4]: 502 | return True 503 | return False 504 | 505 | def predict(self, Xi, Xv): 506 | """ 507 | :param Xi: the same as fit function 508 | :param Xv: the same as fit function 509 | :return: output, ont-dim array 510 | """ 511 | Xi = np.array(Xi).reshape((-1,self.field_size,1)) 512 | Xi = Variable(torch.LongTensor(Xi)) 513 | Xv = Variable(torch.FloatTensor(Xv)) 514 | if self.use_cuda and torch.cuda.is_available(): 515 | Xi, Xv = Xi.cuda(), Xv.cuda() 516 | 517 | model = self.eval() 518 | pred = F.sigmoid(model(Xi, Xv)).cpu() 519 | return (pred.data.numpy() > 0.5) 520 | 521 | def predict_proba(self, Xi, Xv): 522 | Xi = np.array(Xi).reshape((-1, self.field_size, 1)) 523 | Xi = Variable(torch.LongTensor(Xi)) 524 | Xv = Variable(torch.FloatTensor(Xv)) 525 | if self.use_cuda and torch.cuda.is_available(): 526 | Xi, Xv = Xi.cuda(), Xv.cuda() 527 | 528 | model = self.eval() 529 | pred = F.sigmoid(model(Xi, Xv)).cpu() 530 | return pred.data.numpy() 531 | 532 | def inner_predict(self, Xi, Xv): 533 | """ 534 | :param Xi: tensor of feature index 535 | :param Xv: tensor of feature value 536 | :return: output, numpy 537 | """ 538 | model = self.eval() 539 | pred = F.sigmoid(model(Xi, Xv)).cpu() 540 | return (pred.data.numpy() > 0.5) 541 | 542 | def inner_predict_proba(self, Xi, Xv): 543 | """ 544 | :param Xi: tensor of feature index 545 | :param Xv: tensor of feature value 546 | :return: output, numpy 547 | """ 548 | model = self.eval() 549 | pred = F.sigmoid(model(Xi, Xv)).cpu() 550 | return pred.data.numpy() 551 | 552 | 553 | def evaluate(self, Xi, Xv, y): 554 | """ 555 | :param Xi: tensor of feature index 556 | :param Xv: tensor of feature value 557 | :param y: tensor of labels 558 | :return: metric of the evaluation 559 | """ 560 | y_pred = self.inner_predict_proba(Xi, Xv) 561 | return self.eval_metric(y.cpu().data.numpy(), y_pred) 562 | 563 | """ 564 | test part 565 | """ 566 | import sys 567 | sys.path.append('../') 568 | from utils import data_preprocess 569 | 570 | result_dict = data_preprocess.read_criteo_data('../data/train.csv', '../data/category_emb.csv') 571 | test_dict = data_preprocess.read_criteo_data('../data/test.csv', '../data/category_emb.csv') 572 | with torch.cuda.device(0): 573 | din = DIN(39, result_dict['feature_sizes'], batch_size=128 * 64, is_shallow_dropout=False, verbose=True, use_cuda=True, 574 | weight_decay=0.0000002, use_fm=True, use_ffm=False, use_high_interaction=True,interation_type=False).cuda() 575 | # din.fit(result_dict['index'], result_dict['value'], result_dict['label'], 576 | # test_dict['index'], test_dict['value'], test_dict['label'], ealry_stopping=True, pre_train=True, 577 | # 
n_epochs=32,refit=False, 578 | # save_path='../data/model/din.pkl') 579 | din.load_state_dict(torch.load('../data/model/din.pkl')) 580 | din.fit(result_dict['index'], result_dict['value'], result_dict['label'], 581 | test_dict['index'], test_dict['value'], test_dict['label'], ealry_stopping=True, pre_train=False, 582 | n_epochs=64, refit=False, 583 | save_path='../data/model/din.pkl') 584 | -------------------------------------------------------------------------------- /model/DeepFM.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | Created on Dec 10, 2017 5 | @author: jachin,Nie 6 | 7 | A pytorch implementation of deepfm 8 | 9 | Reference: 10 | [1] DeepFM: A Factorization-Machine based Neural Network for CTR Prediction, 11 | Huifeng Guo, Ruiming Tang, Yunming Yey, Zhenguo Li, Xiuqiang He. 12 | 13 | """ 14 | 15 | import os 16 | import numpy as np 17 | from sklearn.base import BaseEstimator, TransformerMixin 18 | from sklearn.metrics import roc_auc_score 19 | from time import time 20 | 21 | import torch 22 | import torch.autograd as autograd 23 | import torch.nn as nn 24 | import torch.nn.functional as F 25 | import torch.optim as optim 26 | from torch.autograd import Variable 27 | 28 | import torch.backends.cudnn 29 | 30 | """ 31 | 缃戠粶缁撴瀯閮ㄥ垎 32 | """ 33 | 34 | 35 | class DeepFM(torch.nn.Module): 36 | """ 37 | :parameter 38 | ------------- 39 | field_size: size of the feature fields 40 | feature_sizes: a field_size-dim array, sizes of the feature dictionary 41 | embedding_size: size of the feature embedding 42 | is_shallow_dropout: bool, shallow part(fm or ffm part) uses dropout or not? 43 | dropout_shallow: an array of the size of 2, example:[0.5,0.5], the first element is for the-first order part and the second element is for the second-order part 44 | h_depth: deep network's hidden layers' depth 45 | deep_layers: a h_depth-dim array, each element is the size of corresponding hidden layers. example:[32,32] h_depth = 2 46 | is_deep_dropout: bool, deep part uses dropout or not? 47 | dropout_deep: an array of dropout factors,example:[0.5,0.5,0.5] h_depth=2 48 | deep_layers_activation: relu or sigmoid etc 49 | n_epochs: epochs 50 | batch_size: batch_size 51 | learning_rate: learning_rate 52 | optimizer_type: optimizer_type, 'adam', 'rmsp', 'sgd', 'adag' 53 | is_batch_norm锛歜ool, use batch_norm or not ? 54 | verbose: verbose 55 | weight_decay: weight decay (L2 penalty) 56 | random_seed: random_seed=950104 someone's birthday, my lukcy number 57 | use_fm: bool 58 | use_ffm: bool 59 | use_deep: bool 60 | loss_type: "logloss", only 61 | eval_metric: roc_auc_score 62 | use_cuda: bool use gpu or cpu? 63 | n_class: number of classes. is bounded to 1 64 | greater_is_better: bool. Is the greater eval better? 
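Example (a minimal, illustrative sketch, not taken from this repo: the toy indices/values below follow the Xi/Xv/y formats documented under fit(), and the feature_sizes values are made up; note that the early-stopping flag is spelled 'ealry_stopping' throughout this code base):

    Xi = [[0, 1, 2], [1, 0, 2]]               # per-sample feature indices, one per field
    Xv = [[1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]   # per-sample feature values (1.0 for categorical fields)
    y  = [1, 0]                               # binary labels
    model = DeepFM(field_size=3, feature_sizes=[2, 2, 3], use_fm=True, use_deep=True, use_cuda=False, verbose=True)
    model.fit(Xi, Xv, y, ealry_stopping=False, refit=False)
    probs = model.predict(Xi, Xv)             # sigmoid scores in [0, 1]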
65 | 66 | 67 | Attention: only support logsitcs regression 68 | """ 69 | 70 | def __init__(self, field_size, feature_sizes, embedding_size=4, is_shallow_dropout=True, dropout_shallow=[0.5, 0.5], 71 | h_depth=2, deep_layers=[64, 64], is_deep_dropout=True, dropout_deep=[0.5, 0.5, 0.5], 72 | deep_layers_activation='relu', n_epochs=24, batch_size=256, learning_rate=0.003, 73 | optimizer_type='adam', is_batch_norm=False, verbose=False, random_seed=950104, weight_decay=0.0, 74 | use_fm=True, use_ffm=False, use_deep=True, loss_type='logloss', eval_metric=roc_auc_score, 75 | use_cuda=True, n_class=1, greater_is_better=True 76 | ): 77 | super(DeepFM, self).__init__() 78 | self.field_size = field_size 79 | self.feature_sizes = feature_sizes 80 | self.embedding_size = embedding_size 81 | self.is_shallow_dropout = is_shallow_dropout 82 | self.dropout_shallow = dropout_shallow 83 | self.h_depth = h_depth 84 | self.deep_layers = deep_layers 85 | self.is_deep_dropout = is_deep_dropout 86 | self.dropout_deep = dropout_deep 87 | self.deep_layers_activation = deep_layers_activation 88 | self.n_epochs = n_epochs 89 | self.batch_size = batch_size 90 | self.learning_rate = learning_rate 91 | self.optimizer_type = optimizer_type 92 | self.is_batch_norm = is_batch_norm 93 | self.verbose = verbose 94 | self.weight_decay = weight_decay 95 | self.random_seed = random_seed 96 | self.use_fm = use_fm 97 | self.use_ffm = use_ffm 98 | self.use_deep = use_deep 99 | self.loss_type = loss_type 100 | self.eval_metric = eval_metric 101 | self.use_cuda = use_cuda 102 | self.n_class = n_class 103 | self.greater_is_better = greater_is_better 104 | 105 | torch.manual_seed(self.random_seed) 106 | 107 | """ 108 | check cuda 109 | """ 110 | if self.use_cuda and not torch.cuda.is_available(): 111 | self.use_cuda = False 112 | print("Cuda is not available, automatically changed into cpu model") 113 | 114 | """ 115 | check use fm or ffm 116 | """ 117 | if self.use_fm and self.use_ffm: 118 | print("only support one type only, please make sure to choose only fm or ffm part") 119 | exit(1) 120 | elif self.use_fm and self.use_deep: 121 | print("The model is deepfm(fm+deep layers)") 122 | elif self.use_ffm and self.use_deep: 123 | print("The model is deepffm(ffm+deep layers)") 124 | elif self.use_fm: 125 | print("The model is fm only") 126 | elif self.use_ffm: 127 | print("The model is ffm only") 128 | elif self.use_deep: 129 | print("The model is deep layers only") 130 | else: 131 | print("You have to choose more than one of (fm, ffm, deep) models to use") 132 | exit(1) 133 | 134 | """ 135 | bias 136 | """ 137 | if self.use_fm or self.use_ffm: 138 | self.bias = torch.nn.Parameter(torch.randn(1)) 139 | """ 140 | fm part 141 | """ 142 | if self.use_fm: 143 | print("Init fm part") 144 | self.fm_first_order_embeddings = nn.ModuleList( 145 | [nn.Embedding(feature_size, 1) for feature_size in self.feature_sizes]) 146 | if self.dropout_shallow: 147 | self.fm_first_order_dropout = nn.Dropout(self.dropout_shallow[0]) 148 | self.fm_second_order_embeddings = nn.ModuleList( 149 | [nn.Embedding(feature_size, self.embedding_size) for feature_size in self.feature_sizes]) 150 | if self.dropout_shallow: 151 | self.fm_second_order_dropout = nn.Dropout(self.dropout_shallow[1]) 152 | print("Init fm part succeed") 153 | 154 | """ 155 | ffm part 156 | """ 157 | if self.use_ffm: 158 | print("Init ffm part") 159 | self.ffm_first_order_embeddings = nn.ModuleList( 160 | [nn.Embedding(feature_size, 1) for feature_size in self.feature_sizes]) 161 | if 
self.dropout_shallow: 162 | self.ffm_first_order_dropout = nn.Dropout(self.dropout_shallow[0]) 163 | self.ffm_second_order_embeddings = nn.ModuleList( 164 | [nn.ModuleList([nn.Embedding(feature_size, self.embedding_size) for i in range(self.field_size)]) for 165 | feature_size in self.feature_sizes]) 166 | if self.dropout_shallow: 167 | self.ffm_second_order_dropout = nn.Dropout(self.dropout_shallow[1]) 168 | print("Init ffm part succeed") 169 | 170 | """ 171 | deep part 172 | """ 173 | if self.use_deep: 174 | print("Init deep part") 175 | if not self.use_fm and not self.use_ffm: 176 | self.fm_second_order_embeddings = nn.ModuleList( 177 | [nn.Embedding(feature_size, self.embedding_size) for feature_size in self.feature_sizes]) 178 | 179 | if self.is_deep_dropout: 180 | self.linear_0_dropout = nn.Dropout(self.dropout_deep[0]) 181 | 182 | self.linear_1 = nn.Linear(self.field_size * self.embedding_size, deep_layers[0]) 183 | if self.is_batch_norm: 184 | self.batch_norm_1 = nn.BatchNorm1d(deep_layers[0]) 185 | if self.is_deep_dropout: 186 | self.linear_1_dropout = nn.Dropout(self.dropout_deep[1]) 187 | for i, h in enumerate(self.deep_layers[1:], 1): 188 | setattr(self, 'linear_' + str(i + 1), nn.Linear(self.deep_layers[i - 1], self.deep_layers[i])) 189 | if self.is_batch_norm: 190 | setattr(self, 'batch_norm_' + str(i + 1), nn.BatchNorm1d(deep_layers[i])) 191 | if self.is_deep_dropout: 192 | setattr(self, 'linear_' + str(i + 1) + '_dropout', nn.Dropout(self.dropout_deep[i + 1])) 193 | 194 | print("Init deep part succeed") 195 | 196 | print("Init succeed") 197 | 198 | def forward(self, Xi, Xv): 199 | """ 200 | :param Xi_train: index input tensor, batch_size * k * 1 201 | :param Xv_train: value input tensor, batch_size * k * 1 202 | :return: the last output 203 | """ 204 | """ 205 | fm part 206 | """ 207 | if self.use_fm: 208 | fm_first_order_emb_arr = [(torch.sum(emb(Xi[:, i, :]), 1).t() * Xv[:, i]).t() for i, emb in 209 | enumerate(self.fm_first_order_embeddings)] 210 | fm_first_order = torch.cat(fm_first_order_emb_arr, 1) 211 | if self.is_shallow_dropout: 212 | fm_first_order = self.fm_first_order_dropout(fm_first_order) 213 | 214 | # use 2xy = (x+y)^2 - x^2 - y^2 reduce calculation 215 | fm_second_order_emb_arr = [(torch.sum(emb(Xi[:, i, :]), 1).t() * Xv[:, i]).t() for i, emb in 216 | enumerate(self.fm_second_order_embeddings)] 217 | fm_sum_second_order_emb = sum(fm_second_order_emb_arr) 218 | fm_sum_second_order_emb_square = fm_sum_second_order_emb * fm_sum_second_order_emb # (x+y)^2 219 | fm_second_order_emb_square = [item * item for item in fm_second_order_emb_arr] 220 | fm_second_order_emb_square_sum = sum(fm_second_order_emb_square) # x^2+y^2 221 | fm_second_order = (fm_sum_second_order_emb_square - fm_second_order_emb_square_sum) * 0.5 222 | if self.is_shallow_dropout: 223 | fm_second_order = self.fm_second_order_dropout(fm_second_order) 224 | 225 | """ 226 | ffm part 227 | """ 228 | if self.use_ffm: 229 | ffm_first_order_emb_arr = [(torch.sum(emb(Xi[:, i, :]), 1).t() * Xv[:, i]).t() for i, emb in 230 | enumerate(self.ffm_first_order_embeddings)] 231 | ffm_first_order = torch.cat(ffm_first_order_emb_arr, 1) 232 | if self.is_shallow_dropout: 233 | ffm_first_order = self.ffm_first_order_dropout(ffm_first_order) 234 | ffm_second_order_emb_arr = [[(torch.sum(emb(Xi[:, i, :]), 1).t() * Xv[:, i]).t() for emb in f_embs] for 235 | i, f_embs in enumerate(self.ffm_second_order_embeddings)] 236 | ffm_wij_arr = [] 237 | for i in range(self.field_size): 238 | for j in range(i + 1, 
self.field_size): 239 | ffm_wij_arr.append(ffm_second_order_emb_arr[i][j] * ffm_second_order_emb_arr[j][i]) 240 | ffm_second_order = sum(ffm_wij_arr) 241 | if self.is_shallow_dropout: 242 | ffm_second_order = self.ffm_second_order_dropout(ffm_second_order) 243 | 244 | """ 245 | deep part 246 | """ 247 | if self.use_deep: 248 | if self.use_fm: 249 | deep_emb = torch.cat(fm_second_order_emb_arr, 1) 250 | elif self.use_ffm: 251 | deep_emb = torch.cat([sum(ffm_second_order_embs) for ffm_second_order_embs in ffm_second_order_emb_arr], 252 | 1) 253 | else: 254 | deep_emb = torch.cat([(torch.sum(emb(Xi[:, i, :]), 1).t() * Xv[:, i]).t() for i, emb in 255 | enumerate(self.fm_second_order_embeddings)], 1) 256 | 257 | if self.deep_layers_activation == 'sigmoid': 258 | activation = F.sigmoid 259 | elif self.deep_layers_activation == 'tanh': 260 | activation = F.tanh 261 | else: 262 | activation = F.relu 263 | if self.is_deep_dropout: 264 | deep_emb = self.linear_0_dropout(deep_emb) 265 | x_deep = self.linear_1(deep_emb) 266 | if self.is_batch_norm: 267 | x_deep = self.batch_norm_1(x_deep) 268 | x_deep = activation(x_deep) 269 | if self.is_deep_dropout: 270 | x_deep = self.linear_1_dropout(x_deep) 271 | for i in range(1, len(self.deep_layers)): 272 | x_deep = getattr(self, 'linear_' + str(i + 1))(x_deep) 273 | if self.is_batch_norm: 274 | x_deep = getattr(self, 'batch_norm_' + str(i + 1))(x_deep) 275 | x_deep = activation(x_deep) 276 | if self.is_deep_dropout: 277 | x_deep = getattr(self, 'linear_' + str(i + 1) + '_dropout')(x_deep) 278 | """ 279 | sum 280 | """ 281 | if self.use_fm and self.use_deep: 282 | total_sum = torch.sum(fm_first_order, 1) + torch.sum(fm_second_order, 1) + torch.sum(x_deep, 1) + self.bias 283 | elif self.use_ffm and self.use_deep: 284 | total_sum = torch.sum(ffm_first_order, 1) + torch.sum(ffm_second_order, 1) + torch.sum(x_deep, 285 | 1) + self.bias 286 | elif self.use_fm: 287 | total_sum = torch.sum(fm_first_order, 1) + torch.sum(fm_second_order, 1) + self.bias 288 | elif self.use_ffm: 289 | total_sum = torch.sum(ffm_first_order, 1) + torch.sum(ffm_second_order, 1) + self.bias 290 | else: 291 | total_sum = torch.sum(x_deep, 1) 292 | return total_sum 293 | 294 | def fit(self, Xi_train, Xv_train, y_train, Xi_valid=None, Xv_valid=None, 295 | y_valid=None, ealry_stopping=False, refit=False, save_path=None): 296 | """ 297 | :param Xi_train: [[ind1_1, ind1_2, ...], [ind2_1, ind2_2, ...], ..., [indi_1, indi_2, ..., indi_j, ...], ...] 298 | indi_j is the feature index of feature field j of sample i in the training set 299 | :param Xv_train: [[val1_1, val1_2, ...], [val2_1, val2_2, ...], ..., [vali_1, vali_2, ..., vali_j, ...], ...] 
300 | vali_j is the feature value of feature field j of sample i in the training set 301 | vali_j can be either binary (1/0, for binary/categorical features) or float (e.g., 10.24, for numerical features) 302 | :param y_train: label of each sample in the training set 303 | :param Xi_valid: list of list of feature indices of each sample in the validation set 304 | :param Xv_valid: list of list of feature values of each sample in the validation set 305 | :param y_valid: label of each sample in the validation set 306 | :param ealry_stopping: perform early stopping or not 307 | :param refit: refit the model on the train+valid dataset or not 308 | :param save_path: the path to save the model 309 | :return: 310 | """ 311 | """ 312 | pre_process 313 | """ 314 | if save_path and not os.path.exists('/'.join(save_path.split('/')[0:-1])): 315 | print("Save path is not existed!") 316 | return 317 | 318 | if self.verbose: 319 | print("pre_process data ing...") 320 | is_valid = False 321 | Xi_train = np.array(Xi_train) 322 | Xi_train = Xi_train.reshape((-1, self.field_size, 1)) 323 | Xv_train = np.array(Xv_train) 324 | y_train = np.array(y_train) 325 | x_size = Xi_train.shape[0] 326 | if Xi_valid: 327 | Xi_valid = np.array(Xi_valid).reshape((-1, self.field_size, 1)) 328 | Xv_valid = np.array(Xv_valid) 329 | y_valid = np.array(y_valid) 330 | x_valid_size = Xi_valid.shape[0] 331 | is_valid = True 332 | if self.verbose: 333 | print("pre_process data finished") 334 | 335 | """ 336 | train model 337 | """ 338 | model = self.train() 339 | if torch.cuda.device_count() > 1 and self.use_cuda: 340 | print("Let's use", torch.cuda.device_count(), "GPUs!") 341 | model = torch.nn.DataParallel(model.cuda()) 342 | optimizer = torch.optim.SGD(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 343 | if self.optimizer_type == 'adam': 344 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 345 | elif self.optimizer_type == 'rmsp': 346 | optimizer = torch.optim.RMSprop(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 347 | elif self.optimizer_type == 'adag': 348 | optimizer = torch.optim.Adagrad(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 349 | 350 | criterion = F.binary_cross_entropy_with_logits 351 | 352 | train_result = [] 353 | valid_result = [] 354 | for epoch in range(self.n_epochs): 355 | total_loss = 0.0 356 | batch_iter = x_size // self.batch_size 357 | epoch_begin_time = time() 358 | batch_begin_time = time() 359 | self.shuffle_in_unison_scary(Xi_train, Xv_train, y_train) 360 | for i in range(batch_iter + 1): 361 | offset = i * self.batch_size 362 | end = min(x_size, offset + self.batch_size) 363 | if offset == end: 364 | break 365 | #if i == 10000: 366 | #break 367 | batch_xi = Variable(torch.LongTensor(Xi_train[offset:end])) 368 | batch_xv = Variable(torch.FloatTensor(Xv_train[offset:end])) 369 | batch_y = Variable(torch.FloatTensor(y_train[offset:end])) 370 | 371 | if self.use_cuda: 372 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 373 | optimizer.zero_grad() 374 | outputs = model(batch_xi, batch_xv) 375 | loss = criterion(outputs, batch_y) 376 | loss.backward() 377 | optimizer.step() 378 | 379 | total_loss += loss.item() 380 | if self.verbose: 381 | if i % 100 == 99: # print every 100 mini-batches 382 | pre=outputs.cpu().data.numpy() 383 | y=batch_y.cpu().data.numpy() 384 | roc=self.roc(y,pre) 385 | print('[epoch,batch]: [%d, %5d] train_loss: %.6f 
train_metric: %.6f time: %.1f s' % 386 | (epoch + 1, i + 1, total_loss / 100.0, roc, time() - batch_begin_time)) 387 | total_loss = 0.0 388 | batch_begin_time = time() 389 | else: 390 | #pre = outputs.cpu().data.numpy() 391 | #y = batch_y.cpu().data.numpy() 392 | #roc = self.roc(y, pre) 393 | roc=-1 394 | print('[epoch,batch]: [%d, %5d] train_loss: %.6f train_metric: %.6f time: %.1f s' % 395 | (epoch + 1, i + 1, loss.item(), roc, time() - batch_begin_time)) 396 | batch_begin_time = time() 397 | 398 | train_loss, train_eval = self.eval_by_batch(Xi_train, Xv_train, y_train, x_size) 399 | train_result.append(train_eval) 400 | print('*' * 50) 401 | print('epoch_result:[%d] train_loss: %.6f train_metric: %.6f time: %.1f s' % 402 | (epoch + 1, train_loss, train_eval, time() - epoch_begin_time)) 403 | print('*' * 50) 404 | 405 | if is_valid: 406 | valid_loss, valid_eval = self.eval_by_batch(Xi_valid, Xv_valid, y_valid, x_valid_size) 407 | valid_result.append(valid_eval) 408 | print('*' * 50) 409 | print('epoch_result:[%d] val_loss: %.6f val_metric: %.6f time: %.1f s' % 410 | (epoch + 1, valid_loss, valid_eval, time() - epoch_begin_time)) 411 | print('*' * 50) 412 | if save_path: 413 | torch.save(self.state_dict(), save_path) 414 | if is_valid and ealry_stopping and self.training_termination(valid_result): 415 | print("early stop at [%d] epoch!" % (epoch + 1)) 416 | break 417 | 418 | # fit a few more epoch on train+valid until result reaches the best_train_score 419 | if is_valid and refit: 420 | if self.verbose: 421 | print("refitting the model") 422 | if self.greater_is_better: 423 | best_epoch = np.argmax(valid_result) 424 | else: 425 | best_epoch = np.argmin(valid_result) 426 | best_train_score = train_result[best_epoch] 427 | Xi_train = np.concatenate((Xi_train, Xi_valid)) 428 | Xv_train = np.concatenate((Xv_train, Xv_valid)) 429 | y_train = np.concatenate((y_train, y_valid)) 430 | x_size = x_size + x_valid_size 431 | self.shuffle_in_unison_scary(Xi_train, Xv_train, y_train) 432 | for epoch in range(64): 433 | batch_iter = x_size // self.batch_size 434 | for i in range(batch_iter + 1): 435 | offset = i * self.batch_size 436 | end = min(x_size, offset + self.batch_size) 437 | if offset == end: 438 | break 439 | batch_xi = Variable(torch.LongTensor(Xi_train[offset:end])) 440 | batch_xv = Variable(torch.FloatTensor(Xv_train[offset:end])) 441 | batch_y = Variable(torch.FloatTensor(y_train[offset:end])) 442 | 443 | if self.use_cuda: 444 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 445 | optimizer.zero_grad() 446 | outputs = model(batch_xi, batch_xv) 447 | loss = criterion(outputs, batch_y) 448 | loss.backward() 449 | optimizer.step() 450 | train_loss, train_eval = self.eval_by_batch(Xi_train, Xv_train, y_train, x_size) 451 | if save_path: 452 | torch.save(self.state_dict(), save_path) 453 | if abs(best_train_score - train_eval) < 0.001 or \ 454 | (self.greater_is_better and train_eval > best_train_score) or \ 455 | ((not self.greater_is_better) and train_result < best_train_score): 456 | break 457 | if self.verbose: 458 | print("refit finished") 459 | 460 | def eval_by_batch(self, Xi, Xv, y, x_size): 461 | total_loss = 0.0 462 | y_pred = [] 463 | y1=[] 464 | if self.use_ffm: 465 | batch_size = self.batch_size 466 | else: 467 | batch_size = self.batch_size 468 | batch_iter = x_size // batch_size 469 | criterion = F.binary_cross_entropy_with_logits 470 | model = self.eval() 471 | 472 | for i in range(batch_iter + 1): 473 | offset = i * batch_size 474 | end = 
min(x_size, offset + batch_size) 475 | if offset == end: 476 | break 477 | if i==10: 478 | break 479 | batch_xi = Variable(torch.LongTensor(Xi[offset:end])) 480 | batch_xv = Variable(torch.FloatTensor(Xv[offset:end])) 481 | batch_y = Variable(torch.FloatTensor(y[offset:end])) 482 | 483 | if self.use_cuda: 484 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 485 | outputs = model(batch_xi, batch_xv) 486 | loss = criterion(outputs, batch_y) 487 | pred = F.sigmoid(outputs).cpu() 488 | y_pred.extend(pred.cpu().data.numpy()) 489 | y1.extend(batch_y.cpu().data.numpy()) 490 | total_loss += loss.item() 491 | roc=self.roc(y1,y_pred) 492 | return total_loss / 10, roc 493 | 494 | # shuffle three lists simutaneously 495 | def shuffle_in_unison_scary(self, a, b, c): 496 | rng_state = np.random.get_state() 497 | np.random.shuffle(a) 498 | np.random.set_state(rng_state) 499 | np.random.shuffle(b) 500 | np.random.set_state(rng_state) 501 | np.random.shuffle(c) 502 | 503 | def training_termination(self, valid_result): 504 | if len(valid_result) > 4: 505 | if self.greater_is_better: 506 | if valid_result[-1] < valid_result[-2] and \ 507 | valid_result[-2] < valid_result[-3] and \ 508 | valid_result[-3] < valid_result[-4]: 509 | return True 510 | else: 511 | if valid_result[-1] > valid_result[-2] and \ 512 | valid_result[-2] > valid_result[-3] and \ 513 | valid_result[-3] > valid_result[-4]: 514 | return True 515 | return False 516 | 517 | def predict_from_model_file(self, Xi, Xv,model,path): 518 | """ 519 | :param Xi: the same as fit function 520 | :param Xv: the same as fit function 521 | :return: output, ont-dim array 522 | """ 523 | state=torch.load(path) 524 | model.load_state_dict(state) 525 | Xi = np.array(Xi).reshape((-1, self.field_size, 1)) 526 | Xi = Variable(torch.LongTensor(Xi)) 527 | Xv = Variable(torch.FloatTensor(Xv)) 528 | if self.use_cuda and torch.cuda.is_available(): 529 | Xi, Xv = Xi.cuda(), Xv.cuda() 530 | 531 | pred = F.sigmoid(model(Xi, Xv)).cpu().data.numpy() 532 | return pred 533 | 534 | def predict(self, Xi, Xv): 535 | Xi = np.array(Xi).reshape((-1, self.field_size, 1)) 536 | Xi = Variable(torch.LongTensor(Xi)) 537 | Xv = Variable(torch.FloatTensor(Xv)) 538 | if self.use_cuda and torch.cuda.is_available(): 539 | Xi, Xv = Xi.cuda(), Xv.cuda() 540 | 541 | model = self.eval() 542 | pred = F.sigmoid(model(Xi, Xv)).cpu() 543 | return pred.data.numpy() 544 | 545 | def inner_predict(self, Xi, Xv): 546 | """ 547 | :param Xi: tensor of feature index 548 | :param Xv: tensor of feature value 549 | :return: output, numpy 550 | """ 551 | model = self.eval() 552 | pred = F.sigmoid(model(Xi, Xv)).cpu() 553 | return (pred.data.numpy() > 0.5) 554 | 555 | def inner_predict_proba(self, Xi, Xv): 556 | """ 557 | :param Xi: tensor of feature index 558 | :param Xv: tensor of feature value 559 | :return: output, numpy 560 | """ 561 | model = self.eval() 562 | pred = F.sigmoid(model(Xi, Xv)).cpu() 563 | return pred.data.numpy() 564 | 565 | def roc(self, y,pre): 566 | """ 567 | :param Xi: tensor of feature index 568 | :param Xv: tensor of feature value 569 | :param y: tensor of labels 570 | :return: metric of the evaluation 571 | """ 572 | total_metric=0 573 | if len(set(y)) == 2: 574 | total_metric = roc_auc_score(y, pre) 575 | else: 576 | total_metric = -1 577 | return total_metric 578 | -------------------------------------------------------------------------------- /model/FNN.py: -------------------------------------------------------------------------------- 1 | # -*- 
coding:utf-8 -*- 2 | 3 | """ 4 | Created on Dec 10, 2017 5 | @author: jachin,Nie 6 | 7 | A pytorch implementation of FNN 8 | 9 | Reference: 10 | [1] Deep Learning over Multi-field Categorical Data: A Case Study on User Response Prediction 11 | 12 | Weinan Zhang, Tianming Du, Jun Wang 13 | 14 | """ 15 | import os 16 | import numpy as np 17 | from sklearn.base import BaseEstimator, TransformerMixin 18 | from sklearn.metrics import roc_auc_score 19 | from time import time 20 | 21 | import torch 22 | import torch.autograd as autograd 23 | import torch.nn as nn 24 | import torch.nn.functional as F 25 | import torch.optim as optim 26 | from torch.autograd import Variable 27 | 28 | import torch.backends.cudnn 29 | 30 | """ 31 | 网络结构部分 32 | """ 33 | 34 | class FNN(torch.nn.Module): 35 | """ 36 | :parameter 37 | ------------- 38 | field_size: size of the feature fields 39 | feature_sizes: a field_size-dim array, sizes of the feature dictionary 40 | embedding_size: size of the feature embedding 41 | h_depth: deep network's hidden layers' depth 42 | deep_layers: a h_depth-dim array, each element is the size of corresponding hidden layers. example:[32,32] h_depth = 2 43 | is_deep_dropout: bool, deep part uses dropout or not? 44 | dropout_deep: an array of dropout factors,example:[0.5,0.5,0.5] h_depth=2 45 | deep_layers_activation: relu or sigmoid etc 46 | n_epochs: epochs 47 | batch_size: batch_size 48 | learning_rate: learning_rate 49 | optimizer_type: optimizer_type, 'adam', 'rmsp', 'sgd', 'adag' 50 | is_batch_norm:bool, use batch_norm or not ? 51 | verbose: verbose 52 | pre_weight_decay: pretrain 's weight decay (L2 penalty) 53 | weight_decay: weight decay (L2 penalty) 54 | random_seed: random_seed=950104 someone's birthday, my lukcy number 55 | use_fm: bool 56 | use_ffm: bool 57 | loss_type: "logloss", only 58 | eval_metric: roc_auc_score 59 | use_cuda: bool use gpu or cpu? 60 | n_class: number of classes. is bounded to 1 61 | greater_is_better: bool. Is the greater eval better? 
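Example (a minimal, illustrative sketch with made-up toy data; the two-stage call mirrors the test code at the bottom of this file and assumes the legacy PyTorch API this repo targets):

    Xi = [[0, 1], [1, 0]]           # per-sample feature indices, one per field
    Xv = [[1.0, 1.0], [1.0, 1.0]]   # per-sample feature values
    y  = [1, 0]
    fnn = FNN(field_size=2, feature_sizes=[2, 2], use_fm=True, use_ffm=False, use_cuda=False)
    fnn.fit(Xi, Xv, y, is_pretrain=True)    # stage 1: pre-train the FM embeddings
    fnn.fit(Xi, Xv, y, is_pretrain=False)   # stage 2: train the deep layers on top of them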
62 | 63 | 64 | Attention: only support logsitcs regression 65 | """ 66 | 67 | def __init__(self, field_size, feature_sizes, embedding_size=4, 68 | h_depth=2, deep_layers=[32, 32], is_deep_dropout=True, dropout_deep=[0.5, 0.5, 0.5], 69 | deep_layers_activation='tanh', n_epochs=64, batch_size=256, learning_rate=0.003, 70 | optimizer_type='adam', is_batch_norm=False, verbose=False, random_seed=950104, pre_weight_decay= 0.0,weight_decay=0.0, 71 | use_fm=True, use_ffm=False, loss_type='logloss', eval_metric=roc_auc_score, 72 | use_cuda=True, n_class=1, greater_is_better=True 73 | ): 74 | super(FNN, self).__init__() 75 | self.field_size = field_size 76 | self.feature_sizes = feature_sizes 77 | self.embedding_size = embedding_size 78 | self.h_depth = h_depth 79 | self.deep_layers = deep_layers 80 | self.is_deep_dropout = is_deep_dropout 81 | self.dropout_deep = dropout_deep 82 | self.deep_layers_activation = deep_layers_activation 83 | self.n_epochs = n_epochs 84 | self.batch_size = batch_size 85 | self.learning_rate = learning_rate 86 | self.optimizer_type = optimizer_type 87 | self.is_batch_norm = is_batch_norm 88 | self.verbose = verbose 89 | self.pre_weight_decay = pre_weight_decay 90 | self.weight_decay = weight_decay 91 | self.random_seed = random_seed 92 | self.use_fm = use_fm 93 | self.use_ffm = use_ffm 94 | self.loss_type = loss_type 95 | self.eval_metric = eval_metric 96 | self.use_cuda = use_cuda 97 | self.n_class = n_class 98 | self.greater_is_better = greater_is_better 99 | self.pretrain = False 100 | 101 | torch.manual_seed(self.random_seed) 102 | 103 | """ 104 | check cuda 105 | """ 106 | if self.use_cuda and not torch.cuda.is_available(): 107 | self.use_cuda = False 108 | print("Cuda is not available, automatically changed into cpu model") 109 | 110 | """ 111 | check use fm or ffm 112 | """ 113 | if self.use_fm and self.use_ffm: 114 | print("only support one type only, please make sure to choose only fm or ffm part") 115 | exit(1) 116 | elif self.use_fm: 117 | print("The model is FNN(fm+nn layers)") 118 | elif self.use_ffm: 119 | print("The model is FFNN(ffm+nn layers)") 120 | else: 121 | print("You have to choose more than one of (fm, ffm, deep) models to use") 122 | exit(1) 123 | 124 | """ 125 | fm part 126 | """ 127 | if self.use_fm: 128 | print("Init fm part") 129 | self.fm_bias = torch.nn.Parameter(torch.randn(1), requires_grad=True) #w0 130 | self.fm_first_order_embeddings = nn.ModuleList([nn.Embedding(feature_size,1) for feature_size in self.feature_sizes]) #wi 131 | self.fm_second_order_embeddings = nn.ModuleList([nn.Embedding(feature_size, self.embedding_size) for feature_size in self.feature_sizes]) #vi 132 | print("Init fm part succeed") 133 | 134 | """ 135 | ffm part 136 | """ 137 | if self.use_ffm: 138 | print("Init ffm part") 139 | self.ffm_bias = torch.nn.Parameter(torch.randn(1), requires_grad=True) 140 | self.ffm_first_order_embeddings = nn.ModuleList([nn.Embedding(feature_size,1) for feature_size in self.feature_sizes]) 141 | self.ffm_second_order_embeddings = nn.ModuleList([nn.ModuleList([nn.Embedding(feature_size, self.embedding_size) for i in range(self.field_size)]) for feature_size in self.feature_sizes]) 142 | print("Init ffm part succeed") 143 | 144 | print("Init nn part") 145 | if self.is_deep_dropout: 146 | self.linear_0_dropout = nn.Dropout(self.dropout_deep[0]) 147 | if not use_ffm: 148 | self.linear_1 = nn.Linear(1 + self.field_size + self.field_size * self.embedding_size, deep_layers[0]) 149 | else: 150 | self.linear_1 = nn.Linear(1 + self.field_size 
+ self.field_size * self.field_size * self.embedding_size, deep_layers[0]) 151 | 152 | if self.is_batch_norm: 153 | self.batch_norm_1 = nn.BatchNorm1d(deep_layers[0]) 154 | 155 | if self.is_deep_dropout: 156 | self.linear_1_dropout = nn.Dropout(self.dropout_deep[1]) 157 | for i, h in enumerate(self.deep_layers[1:], 1): 158 | setattr(self, 'linear_' + str(i + 1), nn.Linear(self.deep_layers[i - 1], self.deep_layers[i])) 159 | if self.is_batch_norm: 160 | setattr(self, 'batch_norm_' + str(i + 1), nn.BatchNorm1d(deep_layers[i])) 161 | if self.is_deep_dropout: 162 | setattr(self, 'linear_' + str(i + 1) + '_dropout', nn.Dropout(self.dropout_deep[i + 1])) 163 | self.deep_last_layer = nn.Linear(self.deep_layers[-1], self.n_class) 164 | print("Init nn part succeed") 165 | 166 | print("Init succeed") 167 | 168 | def forward(self, Xi, Xv): 169 | """ 170 | :param Xi: index input tensor, batch_size * k * 1 171 | :param Xv: value input tensor, batch_size * k * 1 172 | :param is_pretrain: taken from self.pretrain (set in fit); decides whether to run the fm/ffm pre-training pass or the full network 173 | :return: the last output 174 | """ 175 | if self.pretrain and self.use_fm: 176 | fm_first_order_emb_arr = [(torch.sum(emb(Xi[:,i,:]),1).t()*Xv[:,i]).t() for i, emb in enumerate(self.fm_first_order_embeddings)] 177 | fm_first_order_sum = torch.sum(sum(fm_first_order_emb_arr),1) 178 | fm_second_order_emb_arr = [(torch.sum(emb(Xi[:,i,:]),1).t()*Xv[:,i]).t() for i, emb in enumerate(self.fm_second_order_embeddings)] 179 | fm_sum_second_order_emb = sum(fm_second_order_emb_arr) 180 | fm_sum_second_order_emb_square = fm_sum_second_order_emb*fm_sum_second_order_emb # (x+y)^2 181 | fm_second_order_emb_square = [item*item for item in fm_second_order_emb_arr] 182 | fm_second_order_emb_square_sum = sum(fm_second_order_emb_square) #x^2+y^2 183 | fm_second_order = (fm_sum_second_order_emb_square - fm_second_order_emb_square_sum) * 0.5 184 | fm_second_order_sum = torch.sum(fm_second_order,1) 185 | return self.fm_bias+fm_first_order_sum+fm_second_order_sum 186 | elif self.pretrain and self.use_ffm: 187 | ffm_first_order_emb_arr = [(torch.sum(emb(Xi[:,i,:]),1).t()*Xv[:,i]).t() for i, emb in enumerate(self.ffm_first_order_embeddings)] 188 | sum_ = torch.sum(sum(ffm_first_order_emb_arr),1) 189 | ffm_second_order_emb_arr = [[(torch.sum(emb(Xi[:,i,:]), 1).t() * Xv[:,i]).t() for emb in f_embs] for i, f_embs in enumerate(self.ffm_second_order_embeddings)] 190 | for i in range(self.field_size): 191 | for j in range(i+1, self.field_size): 192 | sum_ += torch.sum((ffm_second_order_emb_arr[i][j]*ffm_second_order_emb_arr[j][i]),1) 193 | return self.ffm_bias + sum_ 194 | elif not self.pretrain and self.use_fm: 195 | fm_first_order_emb_arr = [(torch.sum(emb(Xi[:,i,:]),1).t()*Xv[:,i]).t() for i, emb in enumerate(self.fm_first_order_embeddings)] 196 | fm_second_order_emb_arr = [(torch.sum(emb(Xi[:, i, :]), 1).t() * Xv[:, i]).t() for i, emb in enumerate(self.fm_second_order_embeddings)] 197 | fm_first_order = torch.cat(fm_first_order_emb_arr,1) 198 | fm_second_order = torch.cat(fm_second_order_emb_arr,1) 199 | if self.use_cuda: 200 | fm_bias = self.fm_bias * Variable(torch.ones(Xi.data.shape[0],1)).cuda() 201 | else: 202 | fm_bias = self.fm_bias * Variable(torch.ones(Xi.data.shape[0], 1)) 203 | deep_emb = torch.cat([fm_bias,fm_first_order,fm_second_order],1) 204 | if self.deep_layers_activation == 'sigmoid': 205 | activation = F.sigmoid 206 | elif self.deep_layers_activation == 'tanh': 207 | activation = F.tanh 208 | else: 209 | activation = F.relu 210 | if self.is_deep_dropout: 211 | deep_emb =
self.linear_0_dropout(deep_emb) 212 | x_deep = self.linear_1(deep_emb) 213 | if self.is_batch_norm: 214 | x_deep = self.batch_norm_1(x_deep) 215 | x_deep = activation(x_deep) 216 | if self.is_deep_dropout: 217 | x_deep = self.linear_1_dropout(x_deep) 218 | for i in range(1, len(self.deep_layers)): 219 | x_deep = getattr(self, 'linear_' + str(i + 1))(x_deep) 220 | if self.is_batch_norm: 221 | x_deep = getattr(self, 'batch_norm_' + str(i + 1))(x_deep) 222 | x_deep = activation(x_deep) 223 | if self.is_deep_dropout: 224 | x_deep = getattr(self, 'linear_' + str(i + 1) + '_dropout')(x_deep) 225 | x_deep = self.deep_last_layer(x_deep) 226 | return torch.sum(x_deep,1) 227 | else: 228 | ffm_first_order_emb_arr = [(torch.sum(emb(Xi[:, i, :]), 1).t() * Xv[:, i]).t() for i, emb in enumerate(self.ffm_first_order_embeddings)] 229 | ffm_second_order_emb_arr = [torch.cat([(torch.sum(emb(Xi[:, i, :]), 1).t() * Xv[:, i]).t() for emb in f_embs],1) for 230 | i, f_embs in enumerate(self.ffm_second_order_embeddings)] 231 | ffm_first_order = torch.cat(ffm_first_order_emb_arr,1) 232 | ffm_second_order = torch.cat(ffm_second_order_emb_arr,1) 233 | if self.use_cuda: 234 | ffm_bias = self.ffm_bias * Variable(torch.ones(Xi.data.shape[0], 1)).cuda() 235 | else: 236 | ffm_bias = self.ffm_bias * Variable(torch.ones(Xi.data.shape[0], 1)) 237 | deep_emb = torch.cat([ffm_bias, ffm_first_order, ffm_second_order], 1) 238 | if self.deep_layers_activation == 'sigmoid': 239 | activation = F.sigmoid 240 | elif self.deep_layers_activation == 'tanh': 241 | activation = F.tanh 242 | else: 243 | activation = F.relu 244 | if self.is_deep_dropout: 245 | deep_emb = self.linear_0_dropout(deep_emb) 246 | x_deep = self.linear_1(deep_emb) 247 | if self.is_batch_norm: 248 | x_deep = self.batch_norm_1(x_deep) 249 | x_deep = activation(x_deep) 250 | if self.is_deep_dropout: 251 | x_deep = self.linear_1_dropout(x_deep) 252 | for i in range(1, len(self.deep_layers)): 253 | x_deep = getattr(self, 'linear_' + str(i + 1))(x_deep) 254 | if self.is_batch_norm: 255 | x_deep = getattr(self, 'batch_norm_' + str(i + 1))(x_deep) 256 | x_deep = activation(x_deep) 257 | if self.is_deep_dropout: 258 | x_deep = getattr(self, 'linear_' + str(i + 1) + '_dropout')(x_deep) 259 | x_deep = self.deep_last_layer(x_deep) 260 | return torch.sum(x_deep,1) 261 | 262 | def fit(self, Xi_train, Xv_train, y_train, Xi_valid=None, Xv_valid=None, 263 | y_valid = None, is_pretrain = False, ealry_stopping=False, refit=False, save_path = None): 264 | """ 265 | :param Xi_train: [[ind1_1, ind1_2, ...], [ind2_1, ind2_2, ...], ..., [indi_1, indi_2, ..., indi_j, ...], ...] 266 | indi_j is the feature index of feature field j of sample i in the training set 267 | :param Xv_train: [[val1_1, val1_2, ...], [val2_1, val2_2, ...], ..., [vali_1, vali_2, ..., vali_j, ...], ...] 268 | vali_j is the feature value of feature field j of sample i in the training set 269 | vali_j can be either binary (1/0, for binary/categorical features) or float (e.g., 10.24, for numerical features) 270 | :param y_train: label of each sample in the training set 271 | :param Xi_valid: list of list of feature indices of each sample in the validation set 272 | :param Xv_valid: list of list of feature values of each sample in the validation set 273 | :param y_valid: label of each sample in the validation set 274 | :param is_pretrain: pretrain or not ? 
275 | :param ealry_stopping: perform early stopping or not 276 | :param refit: refit the model on the train+valid dataset or not 277 | :param save_path: the path to save the model 278 | :return: 279 | """ 280 | """ 281 | pre_process 282 | """ 283 | if save_path and not os.path.exists('/'.join(save_path.split('/')[0:-1])): 284 | print("Save path is not existed!") 285 | return 286 | 287 | if is_pretrain: 288 | print("The model is pre_training now. You must change the mode in the next fitting") 289 | 290 | if self.verbose: 291 | print("pre_process data ing...") 292 | self.pretrain = is_pretrain 293 | is_valid = False 294 | Xi_train = np.array(Xi_train).reshape((-1,self.field_size,1)) 295 | Xv_train = np.array(Xv_train) 296 | y_train = np.array(y_train) 297 | x_size = Xi_train.shape[0] 298 | if Xi_valid: 299 | Xi_valid = np.array(Xi_valid).reshape((-1,self.field_size,1)) 300 | Xv_valid = np.array(Xv_valid) 301 | y_valid = np.array(y_valid) 302 | x_valid_size = Xi_valid.shape[0] 303 | is_valid = True 304 | if self.verbose: 305 | print("pre_process data finished") 306 | 307 | """ 308 | train model 309 | """ 310 | model = self.train() 311 | if self.pretrain: 312 | weight_decay = self.pre_weight_decay 313 | else: 314 | weight_decay = self.weight_decay 315 | optimizer = torch.optim.SGD(self.parameters(), lr=self.learning_rate, weight_decay=weight_decay) 316 | if self.optimizer_type == 'adam': 317 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate, weight_decay=weight_decay) 318 | elif self.optimizer_type == 'rmsp': 319 | optimizer = torch.optim.RMSprop(self.parameters(), lr=self.learning_rate, weight_decay=weight_decay) 320 | elif self.optimizer_type == 'adag': 321 | optimizer = torch.optim.Adagrad(self.parameters(), lr=self.learning_rate, weight_decay=weight_decay) 322 | 323 | criterion = F.binary_cross_entropy_with_logits 324 | 325 | train_result = [] 326 | valid_result = [] 327 | for epoch in range(self.n_epochs): 328 | total_loss = 0.0 329 | batch_iter = x_size // self.batch_size 330 | epoch_begin_time = time() 331 | batch_begin_time = time() 332 | for i in range(batch_iter+1): 333 | offset = i*self.batch_size 334 | end = min(x_size, offset+self.batch_size) 335 | if offset == end: 336 | break 337 | batch_xi = Variable(torch.LongTensor(Xi_train[offset:end])) 338 | batch_xv = Variable(torch.FloatTensor(Xv_train[offset:end])) 339 | batch_y = Variable(torch.FloatTensor(y_train[offset:end])) 340 | if self.use_cuda: 341 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 342 | optimizer.zero_grad() 343 | outputs = model(batch_xi, batch_xv) 344 | loss = criterion(outputs, batch_y) 345 | loss.backward() 346 | optimizer.step() 347 | 348 | total_loss += loss.data[0] 349 | if self.verbose: 350 | if i % 100 == 99: # print every 100 mini-batches 351 | eval = self.evaluate(batch_xi, batch_xv, batch_y) 352 | print('[%d, %5d] loss: %.6f metric: %.6f time: %.1f s' % 353 | (epoch + 1, i + 1, total_loss/100, eval, time()-batch_begin_time)) 354 | total_loss = 0.0 355 | batch_begin_time = time() 356 | 357 | train_loss, train_eval = self.eval_by_batch(Xi_train,Xv_train,y_train,x_size) 358 | train_result.append(train_eval) 359 | print('*'*50) 360 | print('[%d] loss: %.6f metric: %.6f time: %.1f s' % 361 | (epoch + 1, train_loss, train_eval, time()-epoch_begin_time)) 362 | print('*'*50) 363 | 364 | if is_valid: 365 | valid_loss, valid_eval = self.eval_by_batch(Xi_valid, Xv_valid, y_valid, x_valid_size) 366 | valid_result.append(valid_eval) 367 | print('*' * 50) 368 | 
print('[%d] loss: %.6f metric: %.6f time: %.1f s' % 369 | (epoch + 1, valid_loss, valid_eval,time()-epoch_begin_time)) 370 | print('*' * 50) 371 | if save_path: 372 | torch.save(self.state_dict(),save_path) 373 | if is_valid and ealry_stopping and self.training_termination(valid_result): 374 | print("early stop at [%d] epoch!" % (epoch+1)) 375 | break 376 | 377 | # fit a few more epoch on train+valid until result reaches the best_train_score 378 | if is_valid and refit: 379 | if self.verbose: 380 | print("refitting the model") 381 | if self.greater_is_better: 382 | best_epoch = np.argmax(valid_result) 383 | else: 384 | best_epoch = np.argmin(valid_result) 385 | best_train_score = train_result[best_epoch] 386 | Xi_train = np.concatenate((Xi_train,Xi_valid)) 387 | Xv_train = np.concatenate((Xv_train,Xv_valid)) 388 | y_train = np.concatenate((y_train,y_valid)) 389 | x_size = x_size + x_valid_size 390 | self.shuffle_in_unison_scary(Xi_train,Xv_train,y_train) 391 | for epoch in range(64): 392 | batch_iter = x_size // self.batch_size 393 | for i in range(batch_iter + 1): 394 | offset = i * self.batch_size 395 | end = min(x_size, offset + self.batch_size) 396 | if offset == end: 397 | break 398 | batch_xi = Variable(torch.LongTensor(Xi_train[offset:end])) 399 | batch_xv = Variable(torch.FloatTensor(Xv_train[offset:end])) 400 | batch_y = Variable(torch.FloatTensor(y_train[offset:end])) 401 | if self.use_cuda: 402 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 403 | optimizer.zero_grad() 404 | outputs = model(batch_xi, batch_xv) 405 | loss = criterion(outputs, batch_y) 406 | loss.backward() 407 | optimizer.step() 408 | train_loss, train_eval = self.eval_by_batch(Xi_train, Xv_train, y_train, x_size) 409 | if save_path: 410 | torch.save(self.state_dict(), save_path) 411 | if abs(best_train_score-train_eval) < 0.001 or \ 412 | (self.greater_is_better and train_eval > best_train_score) or \ 413 | ((not self.greater_is_better) and train_result < best_train_score): 414 | break 415 | if self.verbose: 416 | print("refit finished") 417 | 418 | def eval_by_batch(self,Xi, Xv, y, x_size): 419 | total_loss = 0.0 420 | y_pred = [] 421 | if self.use_ffm: 422 | batch_size = 16384*2 423 | else: 424 | batch_size = 16384 425 | batch_iter = x_size // batch_size 426 | criterion = F.binary_cross_entropy_with_logits 427 | model = self.eval() 428 | for i in range(batch_iter+1): 429 | offset = i * batch_size 430 | end = min(x_size, offset + batch_size) 431 | if offset == end: 432 | break 433 | batch_xi = Variable(torch.LongTensor(Xi[offset:end])) 434 | batch_xv = Variable(torch.FloatTensor(Xv[offset:end])) 435 | batch_y = Variable(torch.FloatTensor(y[offset:end])) 436 | if self.use_cuda: 437 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 438 | outputs = model(batch_xi, batch_xv) 439 | pred = F.sigmoid(outputs).cpu() 440 | y_pred.extend(pred.data.numpy()) 441 | loss = criterion(outputs, batch_y) 442 | total_loss += loss.data[0]*(end-offset) 443 | total_metric = self.eval_metric(y,y_pred) 444 | return total_loss/x_size, total_metric 445 | 446 | # shuffle three lists simutaneously 447 | def shuffle_in_unison_scary(self, a, b, c): 448 | rng_state = np.random.get_state() 449 | np.random.shuffle(a) 450 | np.random.set_state(rng_state) 451 | np.random.shuffle(b) 452 | np.random.set_state(rng_state) 453 | np.random.shuffle(c) 454 | 455 | def training_termination(self, valid_result): 456 | if len(valid_result) > 4: 457 | if self.greater_is_better: 458 | if 
valid_result[-1] < valid_result[-2] and \ 459 | valid_result[-2] < valid_result[-3] and \ 460 | valid_result[-3] < valid_result[-4]: 461 | return True 462 | else: 463 | if valid_result[-1] > valid_result[-2] and \ 464 | valid_result[-2] > valid_result[-3] and \ 465 | valid_result[-3] > valid_result[-4]: 466 | return True 467 | return False 468 | 469 | def predict(self, Xi, Xv): 470 | """ 471 | :param Xi: the same as fit function 472 | :param Xv: the same as fit function 473 | :return: output, ont-dim array 474 | """ 475 | Xi = np.array(Xi).reshape((-1,self.field_size,1)) 476 | Xi = Variable(torch.LongTensor(Xi)) 477 | Xv = Variable(torch.FloatTensor(Xv)) 478 | if self.use_cuda and torch.cuda.is_available(): 479 | Xi, Xv = Xi.cuda(), Xv.cuda() 480 | 481 | model = self.eval() 482 | pred = F.sigmoid(model(Xi, Xv)).cpu() 483 | return (pred.data.numpy() > 0.5) 484 | 485 | def predict_proba(self, Xi, Xv): 486 | Xi = np.array(Xi).reshape((-1, self.field_size, 1)) 487 | Xi = Variable(torch.LongTensor(Xi)) 488 | Xv = Variable(torch.FloatTensor(Xv)) 489 | if self.use_cuda and torch.cuda.is_available(): 490 | Xi, Xv = Xi.cuda(), Xv.cuda() 491 | 492 | model = self.eval() 493 | pred = F.sigmoid(model(Xi, Xv)).cpu() 494 | return pred.data.numpy() 495 | 496 | def inner_predict(self, Xi, Xv): 497 | """ 498 | :param Xi: tensor of feature index 499 | :param Xv: tensor of feature value 500 | :return: output, numpy 501 | """ 502 | model = self.eval() 503 | pred = F.sigmoid(model(Xi, Xv)).cpu() 504 | return (pred.data.numpy() > 0.5) 505 | 506 | def inner_predict_proba(self, Xi, Xv): 507 | """ 508 | :param Xi: tensor of feature index 509 | :param Xv: tensor of feature value 510 | :return: output, numpy 511 | """ 512 | model = self.eval() 513 | pred = F.sigmoid(model(Xi, Xv)).cpu() 514 | return pred.data.numpy() 515 | 516 | 517 | def evaluate(self, Xi, Xv, y): 518 | """ 519 | :param Xi: tensor of feature index 520 | :param Xv: tensor of feature value 521 | :param y: tensor of labels 522 | :return: metric of the evaluation 523 | """ 524 | y_pred = self.inner_predict_proba(Xi, Xv) 525 | return self.eval_metric(y.cpu().data.numpy(), y_pred) 526 | 527 | """ 528 | test part 529 | """ 530 | import sys 531 | sys.path.append('../') 532 | from utils import data_preprocess 533 | 534 | result_dict = data_preprocess.read_criteo_data('../data/train.csv', '../data/category_emb.csv') 535 | test_dict = data_preprocess.read_criteo_data('../data/test.csv', '../data/category_emb.csv') 536 | with torch.cuda.device(2): 537 | fnn = FNN(39, result_dict['feature_sizes'], batch_size=128 * 64, verbose=True, use_cuda=True, 538 | pre_weight_decay= 0.0001 ,weight_decay=0.00001, use_fm=False, use_ffm=True).cuda() 539 | fnn.load_state_dict(torch.load('../data/model/ffnn.pkl')) 540 | # fnn.fit(result_dict['index'], result_dict['value'], result_dict['label'], 541 | # test_dict['index'], test_dict['value'], test_dict['label'],ealry_stopping=True,refit=False,is_pretrain=True,save_path='../data/model/ffnn.pkl') 542 | fnn.fit(result_dict['index'], result_dict['value'], result_dict['label'], 543 | test_dict['index'], test_dict['value'], test_dict['label'],ealry_stopping=True,refit=False,is_pretrain=False,save_path='../data/model/ffnn.pkl') 544 | -------------------------------------------------------------------------------- /model/NFM.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | Created on Dec 10, 2017 5 | @author: jachin,Nie 6 | 7 | A pytorch implementation of NFM 8 
| 9 | Reference: 10 | [1] Neural Factorization Machines for Sparse Predictive Analytics 11 | Xiangnan He,School of Computing,National University of Singapore,Singapore 117417,dcshex@nus.edu.sg 12 | Tat-Seng Chua,School of Computing,National University of Singapore,Singapore 117417,dcscts@nus.edu.sg 13 | 14 | """ 15 | 16 | import os 17 | import numpy as np 18 | from sklearn.base import BaseEstimator, TransformerMixin 19 | from sklearn.metrics import roc_auc_score 20 | from time import time 21 | 22 | import torch 23 | import torch.autograd as autograd 24 | import torch.nn as nn 25 | import torch.nn.functional as F 26 | import torch.optim as optim 27 | from torch.autograd import Variable 28 | 29 | import torch.backends.cudnn 30 | 31 | 32 | """ 33 | 网络结构部分 34 | """ 35 | 36 | class NFM(torch.nn.Module): 37 | """ 38 | :parameter 39 | ------------- 40 | field_size: size of the feature fields 41 | feature_sizes: a field_size-dim array, sizes of the feature dictionary 42 | embedding_size: size of the feature embedding 43 | is_shallow_dropout: bool, shallow part(fm or ffm part) uses dropout or not? 44 | dropout_shallow: an array of the size of 1, example:[0.5], the element is for the-first order part 45 | h_depth: deep network's hidden layers' depth 46 | deep_layers: a h_depth-dim array, each element is the size of corresponding hidden layers. example:[32,32] h_depth = 2 47 | is_deep_dropout: bool, deep part uses dropout or not? 48 | dropout_deep: an array of dropout factors,example:[0.5,0.5,0.5] h_depth=2 49 | deep_layers_activation: relu or sigmoid etc 50 | n_epochs: epochs 51 | batch_size: batch_size 52 | learning_rate: learning_rate 53 | optimizer_type: optimizer_type, 'adam', 'rmsp', 'sgd', 'adag' 54 | is_batch_norm:bool, use batch_norm or not ? 55 | verbose: verbose 56 | weight_decay: weight decay (L2 penalty) 57 | random_seed: random_seed=950104 someone's birthday, my lukcy number 58 | use_fm: bool 59 | use_ffm: bool 60 | interation_type: bool, When it's true, the element-wise product of the fm or ffm embeddings will be added together, otherwise, the element-wise prodcut of embeddings will be concatenated. 61 | loss_type: "logloss", only 62 | eval_metric: roc_auc_score 63 | use_cuda: bool use gpu or cpu? 64 | n_class: number of classes. is bounded to 1 65 | greater_is_better: bool. Is the greater eval better? 
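Example (a minimal, illustrative sketch with made-up toy data, assuming NFM.fit takes the same Xi/Xv/y arguments as the other models in this repo; with interation_type=True the MLP receives the summed bi-interaction vector of size embedding_size, with interation_type=False it receives field_size*(field_size-1)//2 inputs, one per feature pair):

    Xi = [[0, 1, 0], [1, 0, 1]]
    Xv = [[1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]
    y  = [1, 0]
    nfm = NFM(field_size=3, feature_sizes=[2, 2, 2], interation_type=True, use_fm=True, use_cuda=False)
    nfm.fit(Xi, Xv, y)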
66 | 67 | 68 | Attention: only support logsitcs regression 69 | """ 70 | def __init__(self,field_size, feature_sizes, embedding_size = 4, is_shallow_dropout = True, dropout_shallow = [0.5], 71 | h_depth = 2, deep_layers = [32, 32], is_deep_dropout = True, dropout_deep=[0.0, 0.5, 0.5], 72 | deep_layers_activation = 'relu', n_epochs = 64, batch_size = 256, learning_rate = 0.003, 73 | optimizer_type = 'adam', is_batch_norm = False, verbose = False, random_seed = 950104, weight_decay = 0.0, 74 | use_fm = True, use_ffm = False, interation_type = True,loss_type = 'logloss', eval_metric = roc_auc_score, 75 | use_cuda = True, n_class = 1, greater_is_better = True 76 | ): 77 | super(NFM, self).__init__() 78 | self.field_size = field_size 79 | self.feature_sizes = feature_sizes 80 | self.embedding_size = embedding_size 81 | self.is_shallow_dropout = is_shallow_dropout 82 | self.dropout_shallow = dropout_shallow 83 | self.h_depth = h_depth 84 | self.deep_layers = deep_layers 85 | self.is_deep_dropout = is_deep_dropout 86 | self.dropout_deep = dropout_deep 87 | self.deep_layers_activation = deep_layers_activation 88 | self.n_epochs = n_epochs 89 | self.batch_size = batch_size 90 | self.learning_rate = learning_rate 91 | self.optimizer_type = optimizer_type 92 | self.is_batch_norm = is_batch_norm 93 | self.verbose = verbose 94 | self.weight_decay = weight_decay 95 | self.random_seed = random_seed 96 | self.use_fm = use_fm 97 | self.use_ffm = use_ffm 98 | self.interation_type = interation_type 99 | self.loss_type = loss_type 100 | self.eval_metric = eval_metric 101 | self.use_cuda = use_cuda 102 | self.n_class = n_class 103 | self.greater_is_better = greater_is_better 104 | 105 | torch.manual_seed(self.random_seed) 106 | 107 | """ 108 | check cuda 109 | """ 110 | if self.use_cuda and not torch.cuda.is_available(): 111 | self.use_cuda = False 112 | print("Cuda is not available, automatically changed into cpu model") 113 | 114 | """ 115 | check use fm or ffm 116 | """ 117 | if self.use_fm and self.use_ffm: 118 | print("only support one type only, please make sure to choose only fm or ffm part") 119 | exit(1) 120 | elif self.use_fm: 121 | print("The model is nfm(fm+nn layers)") 122 | elif self.use_ffm: 123 | print("The model is nffm(ffm+nn layers)") 124 | else: 125 | print("You have to choose more than one of (fm, ffm) models to use") 126 | exit(1) 127 | """ 128 | bias 129 | """ 130 | self.bias = torch.nn.Parameter(torch.randn(1)) 131 | 132 | """ 133 | fm part 134 | """ 135 | if self.use_fm: 136 | print("Init fm part") 137 | self.fm_first_order_embeddings = nn.ModuleList([nn.Embedding(feature_size,1) for feature_size in self.feature_sizes]) 138 | if self.dropout_shallow: 139 | self.fm_first_order_dropout = nn.Dropout(self.dropout_shallow[0]) 140 | self.fm_second_order_embeddings = nn.ModuleList([nn.Embedding(feature_size, self.embedding_size) for feature_size in self.feature_sizes]) 141 | print("Init fm part succeed") 142 | 143 | """ 144 | ffm part 145 | """ 146 | if self.use_ffm: 147 | print("Init ffm part") 148 | self.ffm_first_order_embeddings = nn.ModuleList([nn.Embedding(feature_size,1) for feature_size in self.feature_sizes]) 149 | if self.dropout_shallow: 150 | self.ffm_first_order_dropout = nn.Dropout(self.dropout_shallow[0]) 151 | self.ffm_second_order_embeddings = nn.ModuleList([nn.ModuleList([nn.Embedding(feature_size, self.embedding_size) for i in range(self.field_size)]) for feature_size in self.feature_sizes]) 152 | print("Init ffm part succeed") 153 | 154 | """ 155 | deep part 156 | """ 157 
| print("Init deep part") 158 | 159 | if self.is_deep_dropout: 160 | self.linear_0_dropout = nn.Dropout(self.dropout_deep[0]) 161 | if self.interation_type: 162 | self.linear_1 = nn.Linear(self.embedding_size, deep_layers[0]) 163 | else: 164 | self.linear_1 = nn.Linear(self.field_size*(self.field_size-1)/2, deep_layers[0]) 165 | if self.is_batch_norm: 166 | self.batch_norm_1 = nn.BatchNorm1d(deep_layers[0]) 167 | if self.is_deep_dropout: 168 | self.linear_1_dropout = nn.Dropout(self.dropout_deep[1]) 169 | for i, h in enumerate(self.deep_layers[1:], 1): 170 | setattr(self, 'linear_' + str(i + 1), nn.Linear(self.deep_layers[i - 1], self.deep_layers[i])) 171 | if self.is_batch_norm: 172 | setattr(self, 'batch_norm_' + str(i + 1), nn.BatchNorm1d(deep_layers[i])) 173 | if self.is_deep_dropout: 174 | setattr(self, 'linear_' + str(i + 1) + '_dropout', nn.Dropout(self.dropout_deep[i + 1])) 175 | 176 | print("Init deep part succeed") 177 | 178 | print "Init succeed" 179 | 180 | def forward(self, Xi, Xv): 181 | """ 182 | :param Xi_train: index input tensor, batch_size * k * 1 183 | :param Xv_train: value input tensor, batch_size * k * 1 184 | :return: the last output 185 | """ 186 | """ 187 | fm part 188 | """ 189 | if self.use_fm: 190 | fm_first_order_emb_arr = [(torch.sum(emb(Xi[:,i,:]),1).t()*Xv[:,i]).t() for i, emb in enumerate(self.fm_first_order_embeddings)] 191 | fm_first_order = torch.cat(fm_first_order_emb_arr,1) 192 | if self.is_shallow_dropout: 193 | fm_first_order = self.fm_first_order_dropout(fm_first_order) 194 | 195 | if self.interation_type: 196 | # use 2xy = (x+y)^2 - x^2 - y^2 reduce calculation 197 | fm_second_order_emb_arr = [(torch.sum(emb(Xi[:,i,:]),1).t()*Xv[:,i]).t() for i, emb in enumerate(self.fm_second_order_embeddings)] 198 | fm_sum_second_order_emb = sum(fm_second_order_emb_arr) 199 | fm_sum_second_order_emb_square = fm_sum_second_order_emb*fm_sum_second_order_emb # (x+y)^2 200 | fm_second_order_emb_square = [item*item for item in fm_second_order_emb_arr] 201 | fm_second_order_emb_square_sum = sum(fm_second_order_emb_square) #x^2+y^2 202 | fm_second_order = (fm_sum_second_order_emb_square - fm_second_order_emb_square_sum) * 0.5 203 | else: 204 | fm_second_order_emb_arr = [(torch.sum(emb(Xi[:, i, :]), 1).t() * Xv[:, i]).t() for i, emb in 205 | enumerate(self.fm_second_order_embeddings)] 206 | fm_wij_arr = [] 207 | for i in range(self.field_size): 208 | for j in range(i + 1, self.field_size): 209 | fm_wij_arr.append(fm_second_order_emb_arr[i] * fm_second_order_emb_arr[j]) 210 | 211 | 212 | """ 213 | ffm part 214 | """ 215 | if self.use_ffm: 216 | ffm_first_order_emb_arr = [(torch.sum(emb(Xi[:,i,:]),1).t()*Xv[:,i]).t() for i, emb in enumerate(self.ffm_first_order_embeddings)] 217 | ffm_first_order = torch.cat(ffm_first_order_emb_arr,1) 218 | if self.is_shallow_dropout: 219 | ffm_first_order = self.ffm_first_order_dropout(ffm_first_order) 220 | ffm_second_order_emb_arr = [[(torch.sum(emb(Xi[:,i,:]), 1).t() * Xv[:,i]).t() for emb in f_embs] for i, f_embs in enumerate(self.ffm_second_order_embeddings)] 221 | ffm_wij_arr = [] 222 | for i in range(self.field_size): 223 | for j in range(i+1, self.field_size): 224 | ffm_wij_arr.append(ffm_second_order_emb_arr[i][j]*ffm_second_order_emb_arr[j][i]) 225 | ffm_second_order = sum(ffm_wij_arr) 226 | 227 | """ 228 | deep part 229 | """ 230 | if self.use_fm and self.interation_type: 231 | deep_emb = fm_second_order 232 | elif self.use_ffm and self.interation_type: 233 | deep_emb = ffm_second_order 234 | elif self.use_fm: 235 | deep_emb = 
torch.cat([torch.sum(fm_wij,1).view([-1,1]) for fm_wij in fm_wij_arr], 1) 236 | else: 237 | deep_emb = torch.cat([torch.sum(ffm_wij,1).view([-1,1]) for ffm_wij in ffm_wij_arr],1) 238 | 239 | if self.deep_layers_activation == 'sigmoid': 240 | activation = F.sigmoid 241 | elif self.deep_layers_activation == 'tanh': 242 | activation = F.tanh 243 | else: 244 | activation = F.relu 245 | 246 | if self.is_deep_dropout: 247 | deep_emb = self.linear_0_dropout(deep_emb) 248 | x_deep = self.linear_1(deep_emb) 249 | if self.is_batch_norm: 250 | x_deep = self.batch_norm_1(x_deep) 251 | x_deep = activation(x_deep) 252 | if self.is_deep_dropout: 253 | x_deep = self.linear_1_dropout(x_deep) 254 | for i in range(1, len(self.deep_layers)): 255 | x_deep = getattr(self, 'linear_' + str(i + 1))(x_deep) 256 | if self.is_batch_norm: 257 | x_deep = getattr(self, 'batch_norm_' + str(i + 1))(x_deep) 258 | x_deep = activation(x_deep) 259 | if self.is_deep_dropout: 260 | x_deep = getattr(self, 'linear_' + str(i + 1) + '_dropout')(x_deep) 261 | 262 | """ 263 | sum 264 | """ 265 | if self.use_fm: 266 | total_sum = self.bias+ torch.sum(fm_first_order,1) + torch.sum(x_deep,1) 267 | elif self.use_ffm: 268 | total_sum = self.bias + torch.sum(ffm_first_order, 1) + torch.sum(x_deep, 1) 269 | return total_sum 270 | 271 | 272 | def fit(self, Xi_train, Xv_train, y_train, Xi_valid=None, Xv_valid=None, 273 | y_valid = None, ealry_stopping=False, refit=False, save_path = None): 274 | """ 275 | :param Xi_train: [[ind1_1, ind1_2, ...], [ind2_1, ind2_2, ...], ..., [indi_1, indi_2, ..., indi_j, ...], ...] 276 | indi_j is the feature index of feature field j of sample i in the training set 277 | :param Xv_train: [[val1_1, val1_2, ...], [val2_1, val2_2, ...], ..., [vali_1, vali_2, ..., vali_j, ...], ...] 
278 | vali_j is the feature value of feature field j of sample i in the training set 279 | vali_j can be either binary (1/0, for binary/categorical features) or float (e.g., 10.24, for numerical features) 280 | :param y_train: label of each sample in the training set 281 | :param Xi_valid: list of list of feature indices of each sample in the validation set 282 | :param Xv_valid: list of list of feature values of each sample in the validation set 283 | :param y_valid: label of each sample in the validation set 284 | :param ealry_stopping: perform early stopping or not 285 | :param refit: refit the model on the train+valid dataset or not 286 | :param save_path: the path to save the model 287 | :return: 288 | """ 289 | """ 290 | pre_process 291 | """ 292 | if save_path and not os.path.exists('/'.join(save_path.split('/')[0:-1])): 293 | print("Save path is not existed!") 294 | return 295 | 296 | if self.verbose: 297 | print("pre_process data ing...") 298 | is_valid = False 299 | Xi_train = np.array(Xi_train).reshape((-1,self.field_size,1)) 300 | Xv_train = np.array(Xv_train) 301 | y_train = np.array(y_train) 302 | x_size = Xi_train.shape[0] 303 | if Xi_valid: 304 | Xi_valid = np.array(Xi_valid).reshape((-1,self.field_size,1)) 305 | Xv_valid = np.array(Xv_valid) 306 | y_valid = np.array(y_valid) 307 | x_valid_size = Xi_valid.shape[0] 308 | is_valid = True 309 | if self.verbose: 310 | print("pre_process data finished") 311 | 312 | """ 313 | train model 314 | """ 315 | model = self.train() 316 | 317 | optimizer = torch.optim.SGD(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 318 | if self.optimizer_type == 'adam': 319 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 320 | elif self.optimizer_type == 'rmsp': 321 | optimizer = torch.optim.RMSprop(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 322 | elif self.optimizer_type == 'adag': 323 | optimizer = torch.optim.Adagrad(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 324 | 325 | criterion = F.binary_cross_entropy_with_logits 326 | 327 | train_result = [] 328 | valid_result = [] 329 | for epoch in range(self.n_epochs): 330 | total_loss = 0.0 331 | batch_iter = x_size // self.batch_size 332 | epoch_begin_time = time() 333 | batch_begin_time = time() 334 | for i in range(batch_iter+1): 335 | offset = i*self.batch_size 336 | end = min(x_size, offset+self.batch_size) 337 | if offset == end: 338 | break 339 | batch_xi = Variable(torch.LongTensor(Xi_train[offset:end])) 340 | batch_xv = Variable(torch.FloatTensor(Xv_train[offset:end])) 341 | batch_y = Variable(torch.FloatTensor(y_train[offset:end])) 342 | if self.use_cuda: 343 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 344 | optimizer.zero_grad() 345 | outputs = model(batch_xi, batch_xv) 346 | loss = criterion(outputs, batch_y) 347 | loss.backward() 348 | optimizer.step() 349 | 350 | total_loss += loss.data[0] 351 | if self.verbose: 352 | if i % 100 == 99: # print every 100 mini-batches 353 | eval = self.evaluate(batch_xi, batch_xv, batch_y) 354 | print('[%d, %5d] loss: %.6f metric: %.6f time: %.1f s' % 355 | (epoch + 1, i + 1, total_loss/100.0, eval, time()-batch_begin_time)) 356 | total_loss = 0.0 357 | batch_begin_time = time() 358 | 359 | train_loss, train_eval = self.eval_by_batch(Xi_train,Xv_train,y_train,x_size) 360 | train_result.append(train_eval) 361 | print('*'*50) 362 | print('[%d] loss: %.6f metric: %.6f time: %.1f s' % 
363 | (epoch + 1, train_loss, train_eval, time()-epoch_begin_time)) 364 | print('*'*50) 365 | 366 | if is_valid: 367 | valid_loss, valid_eval = self.eval_by_batch(Xi_valid, Xv_valid, y_valid, x_valid_size) 368 | valid_result.append(valid_eval) 369 | print('*' * 50) 370 | print('[%d] loss: %.6f metric: %.6f time: %.1f s' % 371 | (epoch + 1, valid_loss, valid_eval,time()-epoch_begin_time)) 372 | print('*' * 50) 373 | if save_path: 374 | torch.save(self.state_dict(),save_path) 375 | if is_valid and ealry_stopping and self.training_termination(valid_result): 376 | print("early stop at [%d] epoch!" % (epoch+1)) 377 | break 378 | 379 | # fit a few more epoch on train+valid until result reaches the best_train_score 380 | if is_valid and refit: 381 | if self.verbose: 382 | print("refitting the model") 383 | if self.greater_is_better: 384 | best_epoch = np.argmax(valid_result) 385 | else: 386 | best_epoch = np.argmin(valid_result) 387 | best_train_score = train_result[best_epoch] 388 | Xi_train = np.concatenate((Xi_train,Xi_valid)) 389 | Xv_train = np.concatenate((Xv_train,Xv_valid)) 390 | y_train = np.concatenate((y_train,y_valid)) 391 | x_size = x_size + x_valid_size 392 | self.shuffle_in_unison_scary(Xi_train,Xv_train,y_train) 393 | for epoch in range(64): 394 | batch_iter = x_size // self.batch_size 395 | for i in range(batch_iter + 1): 396 | offset = i * self.batch_size 397 | end = min(x_size, offset + self.batch_size) 398 | if offset == end: 399 | break 400 | batch_xi = Variable(torch.LongTensor(Xi_train[offset:end])) 401 | batch_xv = Variable(torch.FloatTensor(Xv_train[offset:end])) 402 | batch_y = Variable(torch.FloatTensor(y_train[offset:end])) 403 | if self.use_cuda: 404 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 405 | optimizer.zero_grad() 406 | outputs = model(batch_xi, batch_xv) 407 | loss = criterion(outputs, batch_y) 408 | loss.backward() 409 | optimizer.step() 410 | train_loss, train_eval = self.eval_by_batch(Xi_train, Xv_train, y_train, x_size) 411 | if save_path: 412 | torch.save(self.state_dict(), save_path) 413 | if abs(best_train_score-train_eval) < 0.001 or \ 414 | (self.greater_is_better and train_eval > best_train_score) or \ 415 | ((not self.greater_is_better) and train_result < best_train_score): 416 | break 417 | if self.verbose: 418 | print("refit finished") 419 | 420 | def eval_by_batch(self,Xi, Xv, y, x_size): 421 | total_loss = 0.0 422 | y_pred = [] 423 | if self.use_ffm: 424 | batch_size = 16384*2 425 | else: 426 | batch_size = 16384 427 | batch_iter = x_size // batch_size 428 | criterion = F.binary_cross_entropy_with_logits 429 | model = self.eval() 430 | for i in range(batch_iter+1): 431 | offset = i * batch_size 432 | end = min(x_size, offset + batch_size) 433 | if offset == end: 434 | break 435 | batch_xi = Variable(torch.LongTensor(Xi[offset:end])) 436 | batch_xv = Variable(torch.FloatTensor(Xv[offset:end])) 437 | batch_y = Variable(torch.FloatTensor(y[offset:end])) 438 | if self.use_cuda: 439 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 440 | 441 | # self.print_embedding_prod(batch_xi,batch_xv) 442 | 443 | outputs = model(batch_xi, batch_xv) 444 | pred = F.sigmoid(outputs).cpu() 445 | y_pred.extend(pred.data.numpy()) 446 | loss = criterion(outputs, batch_y) 447 | total_loss += loss.data[0]*(end-offset) 448 | total_metric = self.eval_metric(y,y_pred) 449 | return total_loss/x_size, total_metric 450 | 451 | # shuffle three lists simutaneously 452 | def shuffle_in_unison_scary(self, a, b, 
c): 453 | rng_state = np.random.get_state() 454 | np.random.shuffle(a) 455 | np.random.set_state(rng_state) 456 | np.random.shuffle(b) 457 | np.random.set_state(rng_state) 458 | np.random.shuffle(c) 459 | 460 | def training_termination(self, valid_result): 461 | if len(valid_result) > 4: 462 | if self.greater_is_better: 463 | if valid_result[-1] < valid_result[-2] and \ 464 | valid_result[-2] < valid_result[-3] and \ 465 | valid_result[-3] < valid_result[-4]: 466 | return True 467 | else: 468 | if valid_result[-1] > valid_result[-2] and \ 469 | valid_result[-2] > valid_result[-3] and \ 470 | valid_result[-3] > valid_result[-4]: 471 | return True 472 | return False 473 | 474 | def predict(self, Xi, Xv): 475 | """ 476 | :param Xi: the same as fit function 477 | :param Xv: the same as fit function 478 | :return: output, ont-dim array 479 | """ 480 | Xi = np.array(Xi).reshape((-1,self.field_size,1)) 481 | Xi = Variable(torch.LongTensor(Xi)) 482 | Xv = Variable(torch.FloatTensor(Xv)) 483 | if self.use_cuda and torch.cuda.is_available(): 484 | Xi, Xv = Xi.cuda(), Xv.cuda() 485 | 486 | model = self.eval() 487 | pred = F.sigmoid(model(Xi, Xv)).cpu() 488 | return (pred.data.numpy() > 0.5) 489 | 490 | def predict_proba(self, Xi, Xv): 491 | Xi = np.array(Xi).reshape((-1, self.field_size, 1)) 492 | Xi = Variable(torch.LongTensor(Xi)) 493 | Xv = Variable(torch.FloatTensor(Xv)) 494 | if self.use_cuda and torch.cuda.is_available(): 495 | Xi, Xv = Xi.cuda(), Xv.cuda() 496 | 497 | model = self.eval() 498 | pred = F.sigmoid(model(Xi, Xv)).cpu() 499 | return pred.data.numpy() 500 | 501 | def inner_predict(self, Xi, Xv): 502 | """ 503 | :param Xi: tensor of feature index 504 | :param Xv: tensor of feature value 505 | :return: output, numpy 506 | """ 507 | model = self.eval() 508 | pred = F.sigmoid(model(Xi, Xv)).cpu() 509 | return (pred.data.numpy() > 0.5) 510 | 511 | def inner_predict_proba(self, Xi, Xv): 512 | """ 513 | :param Xi: tensor of feature index 514 | :param Xv: tensor of feature value 515 | :return: output, numpy 516 | """ 517 | model = self.eval() 518 | pred = F.sigmoid(model(Xi, Xv)).cpu() 519 | return pred.data.numpy() 520 | 521 | 522 | def evaluate(self, Xi, Xv, y): 523 | """ 524 | :param Xi: tensor of feature index 525 | :param Xv: tensor of feature value 526 | :param y: tensor of labels 527 | :return: metric of the evaluation 528 | """ 529 | y_pred = self.inner_predict_proba(Xi, Xv) 530 | return self.eval_metric(y.cpu().data.numpy(), y_pred) 531 | 532 | def print_embedding_prod(self,Xi,Xv): 533 | if not self.use_fm: 534 | print "Error! Only print fm model!" 
535 | return 536 | fm_second_order_emb_arr = [(torch.sum(emb(Xi[:, i, :]), 1).t() * Xv[:, i]).t() for i, emb in 537 | enumerate(self.fm_second_order_embeddings)] 538 | total_prod = fm_second_order_emb_arr[0] + 1.0 539 | for emb in fm_second_order_emb_arr[1:]: 540 | total_prod = total_prod * (emb + 1.0) 541 | print "max:", torch.max(total_prod) 542 | print "min", torch.min(total_prod) 543 | 544 | """ 545 | test part 546 | """ 547 | import sys 548 | sys.path.append('../') 549 | from utils import data_preprocess 550 | 551 | result_dict = data_preprocess.read_criteo_data('../data/train.csv', '../data/category_emb.csv') 552 | test_dict = data_preprocess.read_criteo_data('../data/test.csv', '../data/category_emb.csv') 553 | with torch.cuda.device(1): 554 | nfm = NFM(39, result_dict['feature_sizes'], batch_size=128 * 64, is_shallow_dropout=False, verbose=True, use_cuda=True, 555 | weight_decay=0.00002, use_fm=True, use_ffm=False, interation_type=False).cuda() 556 | nfm.fit(result_dict['index'], result_dict['value'], result_dict['label'], 557 | test_dict['index'], test_dict['value'], test_dict['label'], ealry_stopping=True, refit=False, 558 | save_path='../data/model/nfm.pkl') 559 | -------------------------------------------------------------------------------- /model/PNN.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | Created on Dec 10, 2017 5 | @author: jachin,Nie 6 | 7 | A pytorch implementation of PNN 8 | 9 | Reference: 10 | [1] Product-based Neural Networks for User Response Prediction 11 | Yanru Qu, Han Cai, Kan Ren, Weinan Zhang, Yong Yu Shanghai Jiao Tong University 12 | {kevinqu, hcai, kren, wnzhang, yyu}@apex.sjtu.edu.cn Ying Wen, Jun Wang University College London {ying.wen, j.wang}@cs.ucl.ac.uk 13 | 14 | """ 15 | import os 16 | import numpy as np 17 | from sklearn.base import BaseEstimator, TransformerMixin 18 | from sklearn.metrics import roc_auc_score 19 | from time import time 20 | 21 | import torch 22 | import torch.autograd as autograd 23 | import torch.nn as nn 24 | import torch.nn.functional as F 25 | import torch.optim as optim 26 | from torch.autograd import Variable 27 | 28 | import torch.backends.cudnn 29 | 30 | """ 31 | 网络结构部分 32 | """ 33 | 34 | class PNN(torch.nn.Module): 35 | """ 36 | :parameter 37 | ------------- 38 | field_size: size of the feature fields 39 | feature_sizes: a field_size-dim array, sizes of the feature dictionary 40 | embedding_size: size of the feature embedding 41 | h_depth: deep network's hidden layers' depth 42 | deep_layers: a h_depth-dim array, each element is the size of corresponding hidden layers. example:[32,32] h_depth = 2 43 | is_deep_dropout: bool, deep part uses dropout or not? 44 | dropout_deep: an array of dropout factors,example:[0.5,0.5,0.5] h_depth=2 45 | use_inner_product: use inner product or not? 46 | use_outer_product: use outter product or not? 47 | deep_layers_activation: relu or sigmoid etc 48 | n_epochs: epochs 49 | batch_size: batch_size 50 | learning_rate: learning_rate 51 | optimizer_type: optimizer_type, 'adam', 'rmsp', 'sgd', 'adag' 52 | is_batch_norm:bool, use batch_norm or not ? 53 | verbose: verbose 54 | weight_decay: weight decay (L2 penalty) 55 | random_seed: random_seed=950104 someone's birthday, my lukcy number 56 | loss_type: "logloss", only 57 | eval_metric: roc_auc_score 58 | use_cuda: bool use gpu or cpu? 59 | n_class: number of classes. is bounded to 1 60 | greater_is_better: bool. Is the greater eval better? 
61 | 
62 | 
63 | Attention: only supports logistic regression
64 | """
65 | 
66 | def __init__(self, field_size, feature_sizes, embedding_size=4,
67 | h_depth = 3, deep_layers=[32, 32, 32], is_deep_dropout=True, dropout_deep=[0.5, 0.5, 0.5], use_inner_product = True, use_outer_product = False,
68 | deep_layers_activation='relu', n_epochs=64, batch_size=256, learning_rate=0.003,
69 | optimizer_type='adam', is_batch_norm=False, verbose=False, random_seed=950104,weight_decay=0.0, loss_type='logloss', eval_metric=roc_auc_score,
70 | use_cuda=True, n_class=1, greater_is_better=True
71 | ):
72 | super(PNN, self).__init__()
73 | self.field_size = field_size
74 | self.feature_sizes = feature_sizes
75 | self.embedding_size = embedding_size
76 | self.h_depth = h_depth
77 | self.deep_layers = deep_layers
78 | self.is_deep_dropout = is_deep_dropout
79 | self.dropout_deep = dropout_deep
80 | self.use_inner_product = use_inner_product
81 | self.use_outer_product = use_outer_product
82 | self.deep_layers_activation = deep_layers_activation
83 | self.n_epochs = n_epochs
84 | self.batch_size = batch_size
85 | self.learning_rate = learning_rate
86 | self.optimizer_type = optimizer_type
87 | self.is_batch_norm = is_batch_norm
88 | self.verbose = verbose
89 | self.weight_decay = weight_decay
90 | self.random_seed = random_seed
91 | self.loss_type = loss_type
92 | self.eval_metric = eval_metric
93 | self.use_cuda = use_cuda
94 | self.n_class = n_class
95 | self.greater_is_better = greater_is_better
96 | 
97 | torch.manual_seed(self.random_seed)
98 | 
99 | """
100 | check cuda
101 | """
102 | if self.use_cuda and not torch.cuda.is_available():
103 | self.use_cuda = False
104 | print("Cuda is not available, automatically changed into cpu model")
105 | 
106 | """
107 | check use inner_product or outer_product
108 | """
109 | if self.use_inner_product and self.use_outer_product:
110 | print("The model uses both inner product and outer product")
111 | elif self.use_inner_product:
112 | print("The model uses inner product (IPNN)")
113 | elif self.use_outer_product:
114 | print("The model uses outer product (OPNN)")
115 | else:
116 | print("The model is a simple deep model only!
Neither inner product or outer product is used") 117 | 118 | """ 119 | embbedding part 120 | """ 121 | print("Init embeddings") 122 | self.embeddings = nn.ModuleList([nn.Embedding(feature_size, self.embedding_size) for feature_size in self.feature_sizes]) 123 | print("Init embeddings finished") 124 | 125 | """ 126 | first order part (linear part) 127 | """ 128 | print("Init first order part") 129 | self.first_order_weight = nn.ModuleList([nn.ParameterList([torch.nn.Parameter(torch.randn(self.embedding_size), requires_grad=True) for j in range(self.field_size)]) for i in range(self.deep_layers[0])]) 130 | self.bias = torch.nn.Parameter(torch.randn(self.deep_layers[0]), requires_grad=True) 131 | print("Init first order part finished") 132 | 133 | """ 134 | second order part (quadratic part) 135 | """ 136 | print("Init second order part") 137 | if self.use_inner_product: 138 | self.inner_second_weight_emb = nn.ModuleList([nn.ParameterList([torch.nn.Parameter(torch.randn(self.embedding_size), requires_grad=True) for j in range(self.field_size)]) for i in range(self.deep_layers[0])]) 139 | 140 | if self.use_outer_product: 141 | arr = [] 142 | for i in range(self.deep_layers[0]): 143 | tmp = torch.randn(self.embedding_size,self.embedding_size) 144 | arr.append(torch.nn.Parameter(torch.mm(tmp,tmp.t()))) 145 | self.outer_second_weight_emb = nn.ParameterList(arr) 146 | print("Init second order part finished") 147 | 148 | 149 | print("Init nn part") 150 | 151 | for i, h in enumerate(self.deep_layers[1:], 1): 152 | setattr(self, 'linear_' + str(i), nn.Linear(self.deep_layers[i - 1], self.deep_layers[i])) 153 | if self.is_batch_norm: 154 | setattr(self, 'batch_norm_' + str(i), nn.BatchNorm1d(deep_layers[i])) 155 | if self.is_deep_dropout: 156 | setattr(self, 'linear_' + str(i) + '_dropout', nn.Dropout(self.dropout_deep[i])) 157 | self.deep_last_layer = nn.Linear(self.deep_layers[-1], self.n_class) 158 | print("Init nn part succeed") 159 | 160 | print "Init succeed" 161 | 162 | def forward(self, Xi, Xv): 163 | """ 164 | :param Xi: index input tensor, batch_size * k * 1 165 | :param Xv: value input tensor, batch_size * k * 1 166 | :param is_pretrain: the para to decide fm pretrain or not 167 | :return: the last output 168 | """ 169 | 170 | """ 171 | embedding 172 | """ 173 | emb_arr = [(torch.sum(emb(Xi[:,i,:]),1).t()*Xv[:,i]).t() for i, emb in enumerate(self.embeddings)] 174 | 175 | """ 176 | first order part (linear part) 177 | """ 178 | first_order_arr = [] 179 | for i, weight_arr in enumerate(self.first_order_weight): 180 | tmp_arr = [] 181 | for j, weight in enumerate(weight_arr): 182 | tmp_arr.append(torch.sum(emb_arr[j]*weight,1)) 183 | first_order_arr.append(sum(tmp_arr).view([-1,1])) 184 | first_order = torch.cat(first_order_arr,1) 185 | 186 | """ 187 | second order part (quadratic part) 188 | """ 189 | if self.use_inner_product: 190 | inner_product_arr = [] 191 | for i, weight_arr in enumerate(self.inner_second_weight_emb): 192 | tmp_arr = [] 193 | for j, weight in enumerate(weight_arr): 194 | tmp_arr.append(torch.sum(emb_arr[j] * weight, 1)) 195 | sum_ = sum(tmp_arr) 196 | inner_product_arr.append((sum_*sum_).view([-1,1])) 197 | inner_product = torch.cat(inner_product_arr,1) 198 | first_order = first_order + inner_product 199 | 200 | if self.use_outer_product: 201 | outer_product_arr = [] 202 | emb_arr_sum = sum(emb_arr) 203 | emb_matrix_arr = torch.bmm(emb_arr_sum.view([-1,self.embedding_size,1]),emb_arr_sum.view([-1,1,self.embedding_size])) 204 | for i, weight in 
enumerate(self.outer_second_weight_emb): 205 | outer_product_arr.append(torch.sum(torch.sum(emb_matrix_arr*weight,2),1).view([-1,1])) 206 | outer_product = torch.cat(outer_product_arr,1) 207 | first_order = first_order + outer_product 208 | 209 | """ 210 | nn part 211 | """ 212 | if self.deep_layers_activation == 'sigmoid': 213 | activation = F.sigmoid 214 | elif self.deep_layers_activation == 'tanh': 215 | activation = F.tanh 216 | else: 217 | activation = F.relu 218 | x_deep = first_order 219 | for i, h in enumerate(self.deep_layers[1:], 1): 220 | x_deep = getattr(self, 'linear_' + str(i))(x_deep) 221 | if self.is_batch_norm: 222 | x_deep = getattr(self, 'batch_norm_' + str(i))(x_deep) 223 | x_deep = activation(x_deep) 224 | if self.is_deep_dropout: 225 | x_deep = getattr(self, 'linear_' + str(i) + '_dropout')(x_deep) 226 | x_deep = self.deep_last_layer(x_deep) 227 | return torch.sum(x_deep, 1) 228 | 229 | 230 | 231 | 232 | 233 | 234 | def fit(self, Xi_train, Xv_train, y_train, Xi_valid=None, Xv_valid=None, 235 | y_valid = None, ealry_stopping=False, refit=False, save_path = None): 236 | """ 237 | :param Xi_train: [[ind1_1, ind1_2, ...], [ind2_1, ind2_2, ...], ..., [indi_1, indi_2, ..., indi_j, ...], ...] 238 | indi_j is the feature index of feature field j of sample i in the training set 239 | :param Xv_train: [[val1_1, val1_2, ...], [val2_1, val2_2, ...], ..., [vali_1, vali_2, ..., vali_j, ...], ...] 240 | vali_j is the feature value of feature field j of sample i in the training set 241 | vali_j can be either binary (1/0, for binary/categorical features) or float (e.g., 10.24, for numerical features) 242 | :param y_train: label of each sample in the training set 243 | :param Xi_valid: list of list of feature indices of each sample in the validation set 244 | :param Xv_valid: list of list of feature values of each sample in the validation set 245 | :param y_valid: label of each sample in the validation set 246 | :param ealry_stopping: perform early stopping or not 247 | :param refit: refit the model on the train+valid dataset or not 248 | :param save_path: the path to save the model 249 | :return: 250 | """ 251 | """ 252 | pre_process 253 | """ 254 | if save_path and not os.path.exists('/'.join(save_path.split('/')[0:-1])): 255 | print("Save path is not existed!") 256 | return 257 | 258 | if self.verbose: 259 | print("pre_process data ing...") 260 | is_valid = False 261 | Xi_train = np.array(Xi_train).reshape((-1,self.field_size,1)) 262 | Xv_train = np.array(Xv_train) 263 | y_train = np.array(y_train) 264 | x_size = Xi_train.shape[0] 265 | if Xi_valid: 266 | Xi_valid = np.array(Xi_valid).reshape((-1,self.field_size,1)) 267 | Xv_valid = np.array(Xv_valid) 268 | y_valid = np.array(y_valid) 269 | x_valid_size = Xi_valid.shape[0] 270 | is_valid = True 271 | if self.verbose: 272 | print("pre_process data finished") 273 | 274 | """ 275 | train model 276 | """ 277 | model = self.train() 278 | optimizer = torch.optim.SGD(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 279 | if self.optimizer_type == 'adam': 280 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 281 | elif self.optimizer_type == 'rmsp': 282 | optimizer = torch.optim.RMSprop(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 283 | elif self.optimizer_type == 'adag': 284 | optimizer = torch.optim.Adagrad(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 285 | 286 | criterion = 
F.binary_cross_entropy_with_logits 287 | 288 | train_result = [] 289 | valid_result = [] 290 | for epoch in range(self.n_epochs): 291 | total_loss = 0.0 292 | batch_iter = x_size // self.batch_size 293 | epoch_begin_time = time() 294 | batch_begin_time = time() 295 | for i in range(batch_iter+1): 296 | offset = i*self.batch_size 297 | end = min(x_size, offset+self.batch_size) 298 | if offset == end: 299 | break 300 | batch_xi = Variable(torch.LongTensor(Xi_train[offset:end])) 301 | batch_xv = Variable(torch.FloatTensor(Xv_train[offset:end])) 302 | batch_y = Variable(torch.FloatTensor(y_train[offset:end])) 303 | if self.use_cuda: 304 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 305 | optimizer.zero_grad() 306 | outputs = model(batch_xi, batch_xv) 307 | loss = criterion(outputs, batch_y) 308 | loss.backward() 309 | optimizer.step() 310 | 311 | total_loss += loss.data[0] 312 | if self.verbose: 313 | if i % 100 == 99: # print every 100 mini-batches 314 | eval = self.evaluate(batch_xi, batch_xv, batch_y) 315 | print('[%d, %5d] loss: %.6f metric: %.6f time: %.1f s' % 316 | (epoch + 1, i + 1, total_loss, eval, time()-batch_begin_time)) 317 | total_loss = 0.0 318 | batch_begin_time = time() 319 | 320 | train_loss, train_eval = self.eval_by_batch(Xi_train,Xv_train,y_train,x_size) 321 | train_result.append(train_eval) 322 | print('*'*50) 323 | print('[%d] loss: %.6f metric: %.6f time: %.1f s' % 324 | (epoch + 1, train_loss, train_eval, time()-epoch_begin_time)) 325 | print('*'*50) 326 | 327 | if is_valid: 328 | valid_loss, valid_eval = self.eval_by_batch(Xi_valid, Xv_valid, y_valid, x_valid_size) 329 | valid_result.append(valid_eval) 330 | print('*' * 50) 331 | print('[%d] loss: %.6f metric: %.6f time: %.1f s' % 332 | (epoch + 1, valid_loss, valid_eval,time()-epoch_begin_time)) 333 | print('*' * 50) 334 | if save_path: 335 | torch.save(self.state_dict(),save_path) 336 | if is_valid and ealry_stopping and self.training_termination(valid_result): 337 | print("early stop at [%d] epoch!" 
% (epoch+1)) 338 | break 339 | 340 | # fit a few more epoch on train+valid until result reaches the best_train_score 341 | if is_valid and refit: 342 | if self.verbose: 343 | print("refitting the model") 344 | if self.greater_is_better: 345 | best_epoch = np.argmax(valid_result) 346 | else: 347 | best_epoch = np.argmin(valid_result) 348 | best_train_score = train_result[best_epoch] 349 | Xi_train = np.concatenate((Xi_train,Xi_valid)) 350 | Xv_train = np.concatenate((Xv_train,Xv_valid)) 351 | y_train = np.concatenate((y_train,y_valid)) 352 | x_size = x_size + x_valid_size 353 | self.shuffle_in_unison_scary(Xi_train,Xv_train,y_train) 354 | for epoch in range(64): 355 | batch_iter = x_size // self.batch_size 356 | for i in range(batch_iter + 1): 357 | offset = i * self.batch_size 358 | end = min(x_size, offset + self.batch_size) 359 | if offset == end: 360 | break 361 | batch_xi = Variable(torch.LongTensor(Xi_train[offset:end])) 362 | batch_xv = Variable(torch.FloatTensor(Xv_train[offset:end])) 363 | batch_y = Variable(torch.FloatTensor(y_train[offset:end])) 364 | if self.use_cuda: 365 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 366 | optimizer.zero_grad() 367 | outputs = model(batch_xi, batch_xv) 368 | loss = criterion(outputs, batch_y) 369 | loss.backward() 370 | optimizer.step() 371 | train_loss, train_eval = self.eval_by_batch(Xi_train, Xv_train, y_train, x_size) 372 | if save_path: 373 | torch.save(self.state_dict(), save_path) 374 | if abs(best_train_score-train_eval) < 0.001 or \ 375 | (self.greater_is_better and train_eval > best_train_score) or \ 376 | ((not self.greater_is_better) and train_result < best_train_score): 377 | break 378 | if self.verbose: 379 | print("refit finished") 380 | 381 | def eval_by_batch(self,Xi, Xv, y, x_size): 382 | total_loss = 0.0 383 | y_pred = [] 384 | batch_size = 16384 385 | batch_iter = x_size // batch_size 386 | criterion = F.binary_cross_entropy_with_logits 387 | model = self.eval() 388 | for i in range(batch_iter+1): 389 | offset = i * batch_size 390 | end = min(x_size, offset + batch_size) 391 | if offset == end: 392 | break 393 | batch_xi = Variable(torch.LongTensor(Xi[offset:end])) 394 | batch_xv = Variable(torch.FloatTensor(Xv[offset:end])) 395 | batch_y = Variable(torch.FloatTensor(y[offset:end])) 396 | if self.use_cuda: 397 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 398 | outputs = model(batch_xi, batch_xv) 399 | pred = F.sigmoid(outputs).cpu() 400 | y_pred.extend(pred.data.numpy()) 401 | loss = criterion(outputs, batch_y) 402 | total_loss += loss.data[0]*(end-offset) 403 | total_metric = self.eval_metric(y,y_pred) 404 | return total_loss/x_size, total_metric 405 | 406 | # shuffle three lists simutaneously 407 | def shuffle_in_unison_scary(self, a, b, c): 408 | rng_state = np.random.get_state() 409 | np.random.shuffle(a) 410 | np.random.set_state(rng_state) 411 | np.random.shuffle(b) 412 | np.random.set_state(rng_state) 413 | np.random.shuffle(c) 414 | 415 | def training_termination(self, valid_result): 416 | if len(valid_result) > 4: 417 | if self.greater_is_better: 418 | if valid_result[-1] < valid_result[-2] and \ 419 | valid_result[-2] < valid_result[-3] and \ 420 | valid_result[-3] < valid_result[-4]: 421 | return True 422 | else: 423 | if valid_result[-1] > valid_result[-2] and \ 424 | valid_result[-2] > valid_result[-3] and \ 425 | valid_result[-3] > valid_result[-4]: 426 | return True 427 | return False 428 | 429 | def predict(self, Xi, Xv): 430 | """ 431 | 
:param Xi: the same as fit function 432 | :param Xv: the same as fit function 433 | :return: output, ont-dim array 434 | """ 435 | Xi = np.array(Xi).reshape((-1,self.field_size,1)) 436 | Xi = Variable(torch.LongTensor(Xi)) 437 | Xv = Variable(torch.FloatTensor(Xv)) 438 | if self.use_cuda and torch.cuda.is_available(): 439 | Xi, Xv = Xi.cuda(), Xv.cuda() 440 | 441 | model = self.eval() 442 | pred = F.sigmoid(model(Xi, Xv)).cpu() 443 | return (pred.data.numpy() > 0.5) 444 | 445 | def predict_proba(self, Xi, Xv): 446 | Xi = np.array(Xi).reshape((-1, self.field_size, 1)) 447 | Xi = Variable(torch.LongTensor(Xi)) 448 | Xv = Variable(torch.FloatTensor(Xv)) 449 | if self.use_cuda and torch.cuda.is_available(): 450 | Xi, Xv = Xi.cuda(), Xv.cuda() 451 | 452 | model = self.eval() 453 | pred = F.sigmoid(model(Xi, Xv)).cpu() 454 | return pred.data.numpy() 455 | 456 | def inner_predict(self, Xi, Xv): 457 | """ 458 | :param Xi: tensor of feature index 459 | :param Xv: tensor of feature value 460 | :return: output, numpy 461 | """ 462 | model = self.eval() 463 | pred = F.sigmoid(model(Xi, Xv)).cpu() 464 | return (pred.data.numpy() > 0.5) 465 | 466 | def inner_predict_proba(self, Xi, Xv): 467 | """ 468 | :param Xi: tensor of feature index 469 | :param Xv: tensor of feature value 470 | :return: output, numpy 471 | """ 472 | model = self.eval() 473 | pred = F.sigmoid(model(Xi, Xv)).cpu() 474 | return pred.data.numpy() 475 | 476 | 477 | def evaluate(self, Xi, Xv, y): 478 | """ 479 | :param Xi: tensor of feature index 480 | :param Xv: tensor of feature value 481 | :param y: tensor of labels 482 | :return: metric of the evaluation 483 | """ 484 | y_pred = self.inner_predict_proba(Xi, Xv) 485 | return self.eval_metric(y.cpu().data.numpy(), y_pred) 486 | 487 | """ 488 | test part 489 | """ 490 | import sys 491 | sys.path.append('../') 492 | from utils import data_preprocess 493 | 494 | result_dict = data_preprocess.read_criteo_data('../data/train.csv', '../data/category_emb.csv') 495 | test_dict = data_preprocess.read_criteo_data('../data/test.csv', '../data/category_emb.csv') 496 | with torch.cuda.device(2): 497 | pnn = PNN(39, result_dict['feature_sizes'], batch_size=128 * 64, verbose=True, use_cuda=True,weight_decay=0.00001, use_inner_product=True, use_outer_product=True).cuda() 498 | pnn.fit(result_dict['index'], result_dict['value'], result_dict['label'], 499 | test_dict['index'], test_dict['value'], test_dict['label'],ealry_stopping=True,refit=False,save_path='../data/model/pnn.pkl') 500 | -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/univeryinli/recommender-system-pytorch/44d7561d4778d2d62fa92855d2b0a2c43c6ca3e4/model/__init__.py -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/univeryinli/recommender-system-pytorch/44d7561d4778d2d62fa92855d2b0a2c43c6ca3e4/utils/__init__.py -------------------------------------------------------------------------------- /utils/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/univeryinli/recommender-system-pytorch/44d7561d4778d2d62fa92855d2b0a2c43c6ca3e4/utils/__pycache__/__init__.cpython-36.pyc 
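For reference, here is a minimal usage sketch of the estimator-style interface the models above share (shown for PNN, but NFM and the other models expose the same fit / predict / predict_proba methods). It is not part of the repository: the paths mirror the module-level test part of PNN.py, a GPU is assumed to be available as in that test part, and it further assumes the test part has been guarded with `if __name__ == '__main__':` (as written it runs whenever the module is imported).

    # Sketch only: reload the state_dict that fit() saved at save_path and score held-out data.
    import torch
    from model.PNN import PNN
    from utils import data_preprocess

    test_dict = data_preprocess.read_criteo_data('../data/test.csv', '../data/category_emb.csv')
    # The constructor flags must match the saved model so the state_dict keys line up.
    pnn = PNN(39, test_dict['feature_sizes'], use_inner_product=True, use_outer_product=True).cuda()
    pnn.load_state_dict(torch.load('../data/model/pnn.pkl'))   # fit() stores a plain state_dict
    probs = pnn.predict_proba(test_dict['index'], test_dict['value'])   # per-sample probabilities
    clicks = pnn.predict(test_dict['index'], test_dict['value'])        # boolean array, 0.5 threshold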
-------------------------------------------------------------------------------- /utils/__pycache__/data_preprocess.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/univeryinli/recommender-system-pytorch/44d7561d4778d2d62fa92855d2b0a2c43c6ca3e4/utils/__pycache__/data_preprocess.cpython-36.pyc -------------------------------------------------------------------------------- /utils/common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/univeryinli/recommender-system-pytorch/44d7561d4778d2d62fa92855d2b0a2c43c6ca3e4/utils/common.py -------------------------------------------------------------------------------- /utils/data_preprocess.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | Created on Dec 10, 2017 5 | @author: jachin,Nie 6 | 7 | This script is used to preprocess the raw data file 8 | 9 | """ 10 | 11 | import sys 12 | import math 13 | import argparse 14 | import hashlib, csv, math, os, pickle, subprocess 15 | import pandas as pd 16 | 17 | def gen_criteo_category_index(file_path): 18 | cate_dict = [] 19 | for i in range(26): 20 | cate_dict.append({}) 21 | for line in open(file_path, 'r'): 22 | datas = line.replace('\n','').split('\t') 23 | for i, item in enumerate(datas[14:]): 24 | if not cate_dict[i].has_key(item): 25 | cate_dict[i][item] = len(cate_dict[i]) 26 | return cate_dict 27 | 28 | def write_criteo_category_index(file_path, cate_dict_arr): 29 | f = open(file_path,'w') 30 | for i, cate_dict in enumerate(cate_dict_arr): 31 | for key in cate_dict: 32 | f.write(str(i)+','+key+','+str(cate_dict[key])+'\n') 33 | 34 | def load_criteo_category_index(file_path): 35 | f = open(file_path,'r') 36 | cate_dict = [] 37 | for i in range(39): 38 | cate_dict.append({}) 39 | for line in f: 40 | datas = line.strip().split(',') 41 | cate_dict[int(datas[0])][datas[1]] = int(datas[2]) 42 | return cate_dict 43 | 44 | def read_raw_criteo_data(file_path, embedding_path, type): 45 | """ 46 | :param file_path: string 47 | :param type: string (train or test) 48 | :return: result: dict 49 | result['continuous_feat']:two-dim array 50 | result['category_feat']:dict 51 | result['category_feat']['index']:two-dim array 52 | result['category_feat']['value']:two-dim array 53 | result['label']: one-dim array 54 | """ 55 | begin_index = 1 56 | if type != 'train' and type != 'test': 57 | print("type error") 58 | return {} 59 | elif type == 'test': 60 | begin_index = 0 61 | cate_embedding = load_criteo_category_index(embedding_path) 62 | result = {'continuous_feat':[], 'category_feat':{'index':[],'value':[]}, 'label':[], 'feature_sizes':[]} 63 | for i, item in enumerate(cate_embedding): 64 | result['feature_sizes'].append(len(item)) 65 | f = open(file_path) 66 | for line in f: 67 | datas = line.replace('\n', '').split('\t') 68 | 69 | indexs = [] 70 | values = [] 71 | flag = True 72 | for i, item in enumerate(datas[begin_index + 13:]): 73 | if not cate_embedding[i].has_key(item): 74 | flag = False 75 | break 76 | indexs.append(cate_embedding[i][item]) 77 | values.append(1) 78 | if not flag: 79 | continue 80 | result['category_feat']['index'].append(indexs) 81 | result['category_feat']['value'].append(values) 82 | 83 | if type == 'train': 84 | result['label'].append(int(datas[0])) 85 | else: 86 | result['label'].append(0) 87 | 88 | continuous_array = [] 89 | for item in 
datas[begin_index:begin_index+13]: 90 | if item == '': 91 | continuous_array.append(-10.0) 92 | elif float(item) < 2.0: 93 | continuous_array.append(float(item)) 94 | else: 95 | continuous_array.append(math.log(float(item))) 96 | result['continuous_feat'].append(continuous_array) 97 | 98 | return result 99 | 100 | def read_criteo_data(file_path,emb_file): 101 | result = {'lable':[], 'index':[],'value':[],'feature_sizes':[]} 102 | cate_dict = load_criteo_category_index(emb_file) 103 | for item in cate_dict: 104 | result['feature_sizes'].append(len(item)) 105 | f = open(file_path,'r') 106 | for line in f: 107 | datas = line.strip().split(',') 108 | result['lable'].append(int(datas[0])) 109 | indexs = [int(item) for item in datas[1:]] 110 | values = [1 for i in range(39)] 111 | result['index'].append(indexs) 112 | result['value'].append(values) 113 | return result 114 | 115 | def gen_criteo_category_emb_from_libffmfile(filepath, dir_path): 116 | fr = open(filepath) 117 | cate_emb_arr = [{} for i in range(39)] 118 | for line in fr: 119 | datas = line.strip().split(' ') 120 | for item in datas[1:]: 121 | [filed, index, value] = item.split(':') 122 | filed = int(filed) 123 | index = int(index) 124 | if not cate_emb_arr[filed].has_key(index): 125 | cate_emb_arr[filed][index] = len(cate_emb_arr[filed]) 126 | 127 | with open(dir_path, 'w') as f: 128 | for i,item in enumerate(cate_emb_arr): 129 | for key in item: 130 | f.write(str(i)+','+str(key)+','+str(item[key])+'\n') 131 | 132 | def gen_emb_input_file(filepath, emb_file, dir_path): 133 | cate_dict = load_criteo_category_index(emb_file) 134 | fr = open(filepath,'r') 135 | fw = open(dir_path,'w') 136 | for line in fr: 137 | row = [] 138 | datas = line.strip().split(' ') 139 | row.append(datas[0]) 140 | for item in datas[1:]: 141 | [filed, index, value] = item.split(':') 142 | filed = int(filed) 143 | row.append(str(cate_dict[filed][index])) 144 | fw.write(','.join(row)+'\n') 145 | 146 | 147 | def read_csv_dataset(train_csv,task='like'): 148 | train_dict={} 149 | test_dict={} 150 | train_csv=pd.read_csv(train_csv) 151 | if task=='like': 152 | lable=train_csv[task] 153 | elif task=='finish': 154 | lable=train_csv[task] 155 | train_dict['lable']=lable[0:int(len(lable)*0.8)].to_list() 156 | test_dict['lable']=lable[int(len(lable)*0.8)+1:-1].to_list() 157 | 158 | feild = ['uid','user_city','item_id','author_id','item_city','channel','music_id','video_duration'] 159 | value=[1]*len(feild) 160 | values=[value for i in range(len(lable))] 161 | train_dict['value']=values[0:int(len(lable)*0.8)] 162 | test_dict['value']=values[int(len(lable)*0.8)+1:-1] 163 | 164 | feature_sizes=[73974,397,4122689,850308,462,5,89779,641] 165 | train_dict['feature_sizes']=feature_sizes 166 | test_dict['feature_sizes']=feature_sizes 167 | ''' 168 | creat_time_segment=35898.1 169 | min_num=53015373867 170 | train_csv['creat_time']=train_csv['creat_time'].apply(lambda x:int((x-min_num)/creat_time_segment)) 171 | ''' 172 | 173 | temp=train_csv[feild].values 174 | train_dict['index']=temp[0:int(len(lable)*0.8)].tolist() 175 | test_dict['index']=temp[int(len(lable)*0.8)+1:-1].tolist() 176 | 177 | return train_dict,test_dict 178 | 179 | 180 | def read_csv_dataset_pred(pred_csv,task='like'): 181 | pred_dict={} 182 | train_csv=pd.read_csv(pred_csv) 183 | if task=='like': 184 | lable=train_csv[task] 185 | elif task=='finish': 186 | lable=train_csv[task] 187 | pred_dict['lable']=lable.to_list() 188 | 189 | feild = 
['uid','user_city','item_id','author_id','item_city','channel','music_id','creat_time','video_duration']
190 | value=[1]*len(feild)
191 | values=[value for i in range(len(lable))]
192 | pred_dict['value']=values
193 | 
194 | feature_sizes=[]
195 | for i in feild:
196 | feature_size=max(train_csv[i])+1
197 | if i=='creat_time':
198 | feature_size=2010
199 | feature_sizes.append(feature_size)
200 | pred_dict['feature_sizes']=feature_sizes
201 | 
202 | creat_time_segment=35898.1
203 | min_num=53015373867
204 | train_csv['creat_time']=train_csv['creat_time'].apply(lambda x:int((x-min_num)/creat_time_segment))
205 | 
206 | temp=train_csv[feild].values
207 | pred_dict['index']=temp.tolist()
208 | return pred_dict
209 | 
--------------------------------------------------------------------------------
/utils/sample.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | 
3 | """
4 | Created on Dec 10, 2017
5 | @author: jachin,Nie
6 | 
7 | This script is used to sample data from the raw dataset
8 | 
9 | python sample.py s_path t_path prob
10 | 
11 | """
12 | 
13 | import argparse
14 | import sys
15 | import random
16 | 
17 | s_path = sys.argv[1]
18 | t_path = sys.argv[2]
19 | prob = float(sys.argv[3])
20 | 
21 | with open(t_path,'wb') as f:
22 | for line in open(s_path,'rb'):
23 | if random.random() < prob:
24 | f.write(line)
25 | 
--------------------------------------------------------------------------------
/utils/split_train.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | 
3 | """
4 | Created on Dec 10, 2017
5 | @author: jachin,Nie
6 | 
7 | This script is used to split the raw dataset into a train file and a test file
8 | 
9 | python split_train.py s_path tr_path te_path prob
10 | 
11 | """
12 | 
13 | import argparse
14 | import sys
15 | import random
16 | 
17 | s_path = sys.argv[1]
18 | tr_path = sys.argv[2]
19 | te_path = sys.argv[3]
20 | prob = float(sys.argv[4])
21 | 
22 | with open(tr_path,'wb') as fr:
23 | with open(te_path,'wb') as fe:
24 | for line in open(s_path,'rb'):
25 | if random.random() < prob:
26 | fr.write(line)
27 | else:
28 | fe.write(line)
29 | 
--------------------------------------------------------------------------------
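As a closing note on the preprocessing contract: every model in this framework consumes the same parallel structures (index, value, label lists plus the per-field feature_sizes), and utils/data_preprocess.py is where they are produced. The sketch below is illustrative only; the paths are placeholders, and the key names, including the 'lable' spelling, are exactly those used by read_criteo_data above. sample.py and split_train.py sit upstream of this as thin command-line filters over the raw file.

    # Sketch of the dictionary layout returned by read_criteo_data (Criteo-style, pre-indexed data).
    from utils import data_preprocess

    train = data_preprocess.read_criteo_data('../data/train.csv', '../data/category_emb.csv')
    # train['index']          -> per-sample list of 39 category indices (ints)
    # train['value']          -> per-sample list of 39 ones (all fields are categorical here)
    # train['lable']          -> per-sample 0/1 label (key is spelled 'lable' throughout this codebase)
    # train['feature_sizes']  -> vocabulary size of each of the 39 fields
    assert len(train['index']) == len(train['value']) == len(train['lable'])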