├── README.md ├── main.py ├── model ├── AFM.py ├── DCN.py ├── DIN.py ├── DeepFM.py ├── FNN.py ├── NFM.py ├── PNN.py └── __init__.py └── utils ├── __init__.py ├── __pycache__ ├── __init__.cpython-36.pyc └── data_preprocess.cpython-36.pyc ├── common.py ├── data_preprocess.py ├── sample.py └── split_train.py /README.md: -------------------------------------------------------------------------------- 1 | # dnn_ctr 2 | The framework to deal with ctr problem 3 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from utils import data_preprocess 4 | from model import DeepFM 5 | import torch 6 | import pickle 7 | 8 | train_dict,test_dict = data_preprocess.read_csv_dataset('./data/final_track2_train_new.csv',task='finish') 9 | #pred_dict=data_preprocess.read_csv_dataset_pred('./data/small_test.csv',task='like') 10 | #train_dict = data_preprocess.read_criteo_data('./data/tiny_train_input.csv', './data/category_emb.csv') 11 | #test_dict = data_preprocess.read_criteo_data('./data/tiny_test_input.csv', './data/category_emb.csv') 12 | 13 | deepfm = DeepFM.DeepFM(8,train_dict['feature_sizes'],verbose=True,use_cuda=True, weight_decay=0.0001,use_fm=True,use_ffm=False,use_deep=False) 14 | #pred=deepfm.predict_from_model_file(pred_dict['index'], pred_dict['value'],deepfm,'./saved_model') 15 | #pred=deepfm.predict(pred_dict['index'], pred_dict['value']) 16 | #pickle.dump(pred,open('like_pre','wb')) 17 | deepfm.fit(train_dict['index'], train_dict['value'], train_dict['lable'],test_dict['index'], test_dict['value'], test_dict['lable'],ealry_stopping=True,refit=False,save_path='./saved_model') 18 | -------------------------------------------------------------------------------- /model/AFM.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | Created on Dec 10, 2017 5 | @author: jachin,Nie 6 | 7 | A pytorch implementation of AFM 8 | 9 | Reference: 10 | [1] Attentional Factorization Machines:Learning theWeight of Feature Interactions via Attention Networks 11 | 12 | """ 13 | 14 | import os 15 | import numpy as np 16 | from sklearn.base import BaseEstimator, TransformerMixin 17 | from sklearn.metrics import roc_auc_score 18 | from time import time 19 | 20 | import torch 21 | import torch.autograd as autograd 22 | import torch.nn as nn 23 | import torch.nn.functional as F 24 | import torch.optim as optim 25 | from torch.autograd import Variable 26 | 27 | import torch.backends.cudnn 28 | 29 | 30 | """ 31 | 网络结构部分 32 | """ 33 | 34 | class AFM(torch.nn.Module): 35 | """ 36 | :parameter 37 | ------------- 38 | field_size: size of the feature fields 39 | feature_sizes: a field_size-dim array, sizes of the feature dictionary 40 | embedding_size: size of the feature embedding 41 | attention_size: The attention netwotk's parameter 42 | is_shallow_dropout: bool, shallow part(fm or ffm part) uses dropout or not? 43 | dropout_shallow: an array of the size of 1, example:[0.5], the element is for the-first order part 44 | h_depth: deep network's hidden layers' depth 45 | deep_layers: a h_depth-dim array, each element is the size of corresponding hidden layers. example:[32,32] h_depth = 2 46 | is_deep_dropout: bool, deep part uses dropout or not? 
47 | dropout_deep: an array of dropout factors,example:[0.5,0.5,0.5] h_depth=2 48 | deep_layers_activation: relu or sigmoid etc 49 | n_epochs: epochs 50 | batch_size: batch_size 51 | learning_rate: learning_rate 52 | optimizer_type: optimizer_type, 'adam', 'rmsp', 'sgd', 'adag' 53 | is_batch_norm:bool, use batch_norm or not ? 54 | verbose: verbose 55 | weight_decay: weight decay (L2 penalty) 56 | random_seed: random_seed=950104 someone's birthday, my lukcy number 57 | use_fm: bool 58 | use_ffm: bool 59 | loss_type: "logloss", only 60 | eval_metric: roc_auc_score 61 | use_cuda: bool use gpu or cpu? 62 | n_class: number of classes. is bounded to 1 63 | greater_is_better: bool. Is the greater eval better? 64 | 65 | 66 | Attention: only support logsitcs regression 67 | """ 68 | def __init__(self,field_size, feature_sizes, embedding_size = 4, attention_size = 4,is_shallow_dropout = True, dropout_shallow = [0.5], 69 | is_attention_dropout = True, dropout_attention=[0.5], 70 | attention_layers_activation = 'relu', n_epochs = 64, batch_size = 256, learning_rate = 0.003, 71 | optimizer_type = 'adam', is_batch_norm = False, verbose = False, random_seed = 950104, weight_decay = 0.0, 72 | use_fm = True, use_ffm = False,loss_type = 'logloss', eval_metric = roc_auc_score, 73 | use_cuda = True, n_class = 1, greater_is_better = True 74 | ): 75 | super(AFM, self).__init__() 76 | self.field_size = field_size 77 | self.feature_sizes = feature_sizes 78 | self.embedding_size = embedding_size 79 | self.attention_size = attention_size 80 | self.is_shallow_dropout = is_shallow_dropout 81 | self.dropout_shallow = dropout_shallow 82 | self.is_attention_dropout = is_attention_dropout 83 | self.dropout_attention = dropout_attention 84 | self.attention_layers_activation = attention_layers_activation 85 | self.n_epochs = n_epochs 86 | self.batch_size = batch_size 87 | self.learning_rate = learning_rate 88 | self.optimizer_type = optimizer_type 89 | self.is_batch_norm = is_batch_norm 90 | self.verbose = verbose 91 | self.weight_decay = weight_decay 92 | self.random_seed = random_seed 93 | self.use_fm = use_fm 94 | self.use_ffm = use_ffm 95 | self.loss_type = loss_type 96 | self.eval_metric = eval_metric 97 | self.use_cuda = use_cuda 98 | self.n_class = n_class 99 | self.greater_is_better = greater_is_better 100 | 101 | torch.manual_seed(self.random_seed) 102 | 103 | """ 104 | check cuda 105 | """ 106 | if self.use_cuda and not torch.cuda.is_available(): 107 | self.use_cuda = False 108 | print("Cuda is not available, automatically changed into cpu model") 109 | 110 | """ 111 | check use fm or ffm 112 | """ 113 | if self.use_fm and self.use_ffm: 114 | print("only support one type only, please make sure to choose only fm or ffm part") 115 | exit(1) 116 | elif self.use_fm: 117 | print("The model is afm(fm+attention layers)") 118 | elif self.use_ffm: 119 | print("The model is affm(ffm+attention layers)") 120 | else: 121 | print("You have to choose more than one of (fm, ffm) models to use") 122 | exit(1) 123 | """ 124 | bias 125 | """ 126 | self.bias = torch.nn.Parameter(torch.randn(1)) 127 | 128 | """ 129 | fm part 130 | """ 131 | if self.use_fm: 132 | print("Init fm part") 133 | self.fm_first_order_embeddings = nn.ModuleList([nn.Embedding(feature_size,1) for feature_size in self.feature_sizes]) 134 | if self.dropout_shallow: 135 | self.fm_first_order_dropout = nn.Dropout(self.dropout_shallow[0]) 136 | self.fm_second_order_embeddings = nn.ModuleList([nn.Embedding(feature_size, self.embedding_size) for feature_size in 
self.feature_sizes]) 137 | print("Init fm part succeed") 138 | 139 | """ 140 | ffm part 141 | """ 142 | if self.use_ffm: 143 | print("Init ffm part") 144 | self.ffm_first_order_embeddings = nn.ModuleList([nn.Embedding(feature_size,1) for feature_size in self.feature_sizes]) 145 | if self.dropout_shallow: 146 | self.ffm_first_order_dropout = nn.Dropout(self.dropout_shallow[0]) 147 | self.ffm_second_order_embeddings = nn.ModuleList([nn.ModuleList([nn.Embedding(feature_size, self.embedding_size) for i in range(self.field_size)]) for feature_size in self.feature_sizes]) 148 | print("Init ffm part succeed") 149 | 150 | """ 151 | attention part 152 | """ 153 | print("Init attention part") 154 | 155 | if self.is_attention_dropout: 156 | self.attention_linear_0_dropout = nn.Dropout(self.dropout_attention[0]) 157 | self.attention_linear_1 = nn.Linear(self.embedding_size, self.attention_size) 158 | self.H = torch.nn.Parameter(torch.randn(self.attention_size)) 159 | self.P = torch.nn.Parameter(torch.randn(self.embedding_size)) 160 | print("Init attention part succeed") 161 | 162 | print "Init succeed" 163 | 164 | def forward(self, Xi, Xv): 165 | """ 166 | :param Xi_train: index input tensor, batch_size * k * 1 167 | :param Xv_train: value input tensor, batch_size * k * 1 168 | :return: the last output 169 | """ 170 | """ 171 | fm part 172 | """ 173 | if self.use_fm: 174 | fm_first_order_emb_arr = [(torch.sum(emb(Xi[:,i,:]),1).t()*Xv[:,i]).t() for i, emb in enumerate(self.fm_first_order_embeddings)] 175 | fm_first_order = torch.cat(fm_first_order_emb_arr,1) 176 | if self.is_shallow_dropout: 177 | fm_first_order = self.fm_first_order_dropout(fm_first_order) 178 | 179 | 180 | fm_second_order_emb_arr = [(torch.sum(emb(Xi[:, i, :]), 1).t() * Xv[:, i]).t() for i, emb in 181 | enumerate(self.fm_second_order_embeddings)] 182 | fm_wij_arr = [] 183 | for i in range(self.field_size): 184 | for j in range(i + 1, self.field_size): 185 | fm_wij_arr.append(fm_second_order_emb_arr[i] * fm_second_order_emb_arr[j]) 186 | 187 | 188 | """ 189 | ffm part 190 | """ 191 | if self.use_ffm: 192 | ffm_first_order_emb_arr = [(torch.sum(emb(Xi[:,i,:]),1).t()*Xv[:,i]).t() for i, emb in enumerate(self.ffm_first_order_embeddings)] 193 | ffm_first_order = torch.cat(ffm_first_order_emb_arr,1) 194 | if self.is_shallow_dropout: 195 | ffm_first_order = self.ffm_first_order_dropout(ffm_first_order) 196 | ffm_second_order_emb_arr = [[(torch.sum(emb(Xi[:,i,:]), 1).t() * Xv[:,i]).t() for emb in f_embs] for i, f_embs in enumerate(self.ffm_second_order_embeddings)] 197 | ffm_wij_arr = [] 198 | for i in range(self.field_size): 199 | for j in range(i+1, self.field_size): 200 | ffm_wij_arr.append(ffm_second_order_emb_arr[i][j]*ffm_second_order_emb_arr[j][i]) 201 | 202 | """ 203 | attention part 204 | """ 205 | if self.use_fm: 206 | interaction_layer = torch.cat(fm_wij_arr, 1) 207 | else: 208 | interaction_layer = torch.cat(ffm_wij_arr,1) 209 | 210 | if self.attention_layers_activation == 'sigmoid': 211 | activation = F.sigmoid 212 | elif self.attention_layers_activation == 'tanh': 213 | activation = F.tanh 214 | else: 215 | activation = F.relu 216 | 217 | if self.is_attention_dropout: 218 | interaction_layer = self.attention_linear_0_dropout(interaction_layer) 219 | attention_tmp = self.attention_linear_1(interaction_layer.view([-1,self.embedding_size])) 220 | attention_tmp = attention_tmp * self.H 221 | attention_tmp = torch.sum(attention_tmp,1).view([-1,self.field_size*(self.field_size-1)/2]) 222 | attention_weight = 
torch.nn.Softmax()(attention_tmp) 223 | attention_output = torch.sum(interaction_layer.view([-1,self.embedding_size])* self.P,1).view([-1,self.field_size*(self.field_size-1)/2]) 224 | attention_output = attention_output * attention_weight 225 | 226 | 227 | """ 228 | sum 229 | """ 230 | if self.use_fm: 231 | total_sum = self.bias+ torch.sum(fm_first_order,1) + torch.sum(attention_output,1) 232 | elif self.use_ffm: 233 | total_sum = self.bias + torch.sum(ffm_first_order, 1) + torch.sum(attention_output, 1) 234 | return total_sum 235 | 236 | 237 | def fit(self, Xi_train, Xv_train, y_train, Xi_valid=None, Xv_valid=None, 238 | y_valid = None, ealry_stopping=False, refit=False, save_path = None): 239 | """ 240 | :param Xi_train: [[ind1_1, ind1_2, ...], [ind2_1, ind2_2, ...], ..., [indi_1, indi_2, ..., indi_j, ...], ...] 241 | indi_j is the feature index of feature field j of sample i in the training set 242 | :param Xv_train: [[val1_1, val1_2, ...], [val2_1, val2_2, ...], ..., [vali_1, vali_2, ..., vali_j, ...], ...] 243 | vali_j is the feature value of feature field j of sample i in the training set 244 | vali_j can be either binary (1/0, for binary/categorical features) or float (e.g., 10.24, for numerical features) 245 | :param y_train: label of each sample in the training set 246 | :param Xi_valid: list of list of feature indices of each sample in the validation set 247 | :param Xv_valid: list of list of feature values of each sample in the validation set 248 | :param y_valid: label of each sample in the validation set 249 | :param ealry_stopping: perform early stopping or not 250 | :param refit: refit the model on the train+valid dataset or not 251 | :param save_path: the path to save the model 252 | :return: 253 | """ 254 | """ 255 | pre_process 256 | """ 257 | if save_path and not os.path.exists('/'.join(save_path.split('/')[0:-1])): 258 | print("Save path is not existed!") 259 | return 260 | 261 | if self.verbose: 262 | print("pre_process data ing...") 263 | is_valid = False 264 | Xi_train = np.array(Xi_train).reshape((-1,self.field_size,1)) 265 | Xv_train = np.array(Xv_train) 266 | y_train = np.array(y_train) 267 | x_size = Xi_train.shape[0] 268 | if Xi_valid: 269 | Xi_valid = np.array(Xi_valid).reshape((-1,self.field_size,1)) 270 | Xv_valid = np.array(Xv_valid) 271 | y_valid = np.array(y_valid) 272 | x_valid_size = Xi_valid.shape[0] 273 | is_valid = True 274 | if self.verbose: 275 | print("pre_process data finished") 276 | 277 | """ 278 | train model 279 | """ 280 | model = self.train() 281 | 282 | optimizer = torch.optim.SGD(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 283 | if self.optimizer_type == 'adam': 284 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 285 | elif self.optimizer_type == 'rmsp': 286 | optimizer = torch.optim.RMSprop(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 287 | elif self.optimizer_type == 'adag': 288 | optimizer = torch.optim.Adagrad(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 289 | 290 | criterion = F.binary_cross_entropy_with_logits 291 | 292 | train_result = [] 293 | valid_result = [] 294 | for epoch in range(self.n_epochs): 295 | total_loss = 0.0 296 | batch_iter = x_size // self.batch_size 297 | epoch_begin_time = time() 298 | batch_begin_time = time() 299 | for i in range(batch_iter+1): 300 | offset = i*self.batch_size 301 | end = min(x_size, offset+self.batch_size) 302 | if offset == end: 303 | break 304 | 
batch_xi = Variable(torch.LongTensor(Xi_train[offset:end])) 305 | batch_xv = Variable(torch.FloatTensor(Xv_train[offset:end])) 306 | batch_y = Variable(torch.FloatTensor(y_train[offset:end])) 307 | if self.use_cuda: 308 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 309 | optimizer.zero_grad() 310 | outputs = model(batch_xi, batch_xv) 311 | loss = criterion(outputs, batch_y) 312 | loss.backward() 313 | optimizer.step() 314 | 315 | total_loss += loss.data[0] 316 | if self.verbose: 317 | if i % 100 == 99: # print every 100 mini-batches 318 | eval = self.evaluate(batch_xi, batch_xv, batch_y) 319 | print('[%d, %5d] loss: %.6f metric: %.6f time: %.1f s' % 320 | (epoch + 1, i + 1, total_loss/100.0, eval, time()-batch_begin_time)) 321 | total_loss = 0.0 322 | batch_begin_time = time() 323 | 324 | train_loss, train_eval = self.eval_by_batch(Xi_train,Xv_train,y_train,x_size) 325 | train_result.append(train_eval) 326 | print('*'*50) 327 | print('[%d] loss: %.6f metric: %.6f time: %.1f s' % 328 | (epoch + 1, train_loss, train_eval, time()-epoch_begin_time)) 329 | print('*'*50) 330 | 331 | if is_valid: 332 | valid_loss, valid_eval = self.eval_by_batch(Xi_valid, Xv_valid, y_valid, x_valid_size) 333 | valid_result.append(valid_eval) 334 | print('*' * 50) 335 | print('[%d] loss: %.6f metric: %.6f time: %.1f s' % 336 | (epoch + 1, valid_loss, valid_eval,time()-epoch_begin_time)) 337 | print('*' * 50) 338 | if save_path: 339 | torch.save(self.state_dict(),save_path) 340 | if is_valid and ealry_stopping and self.training_termination(valid_result): 341 | print("early stop at [%d] epoch!" % (epoch+1)) 342 | break 343 | 344 | # fit a few more epoch on train+valid until result reaches the best_train_score 345 | if is_valid and refit: 346 | if self.verbose: 347 | print("refitting the model") 348 | if self.greater_is_better: 349 | best_epoch = np.argmax(valid_result) 350 | else: 351 | best_epoch = np.argmin(valid_result) 352 | best_train_score = train_result[best_epoch] 353 | Xi_train = np.concatenate((Xi_train,Xi_valid)) 354 | Xv_train = np.concatenate((Xv_train,Xv_valid)) 355 | y_train = np.concatenate((y_train,y_valid)) 356 | x_size = x_size + x_valid_size 357 | self.shuffle_in_unison_scary(Xi_train,Xv_train,y_train) 358 | for epoch in range(64): 359 | batch_iter = x_size // self.batch_size 360 | for i in range(batch_iter + 1): 361 | offset = i * self.batch_size 362 | end = min(x_size, offset + self.batch_size) 363 | if offset == end: 364 | break 365 | batch_xi = Variable(torch.LongTensor(Xi_train[offset:end])) 366 | batch_xv = Variable(torch.FloatTensor(Xv_train[offset:end])) 367 | batch_y = Variable(torch.FloatTensor(y_train[offset:end])) 368 | if self.use_cuda: 369 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 370 | optimizer.zero_grad() 371 | outputs = model(batch_xi, batch_xv) 372 | loss = criterion(outputs, batch_y) 373 | loss.backward() 374 | optimizer.step() 375 | train_loss, train_eval = self.eval_by_batch(Xi_train, Xv_train, y_train, x_size) 376 | if save_path: 377 | torch.save(self.state_dict(), save_path) 378 | if abs(best_train_score-train_eval) < 0.001 or \ 379 | (self.greater_is_better and train_eval > best_train_score) or \ 380 | ((not self.greater_is_better) and train_result < best_train_score): 381 | break 382 | if self.verbose: 383 | print("refit finished") 384 | 385 | def eval_by_batch(self,Xi, Xv, y, x_size): 386 | total_loss = 0.0 387 | y_pred = [] 388 | if self.use_ffm: 389 | batch_size = 16384*2 390 | else: 391 | 
batch_size = 16384 392 | batch_iter = x_size // batch_size 393 | criterion = F.binary_cross_entropy_with_logits 394 | model = self.eval() 395 | for i in range(batch_iter+1): 396 | offset = i * batch_size 397 | end = min(x_size, offset + batch_size) 398 | if offset == end: 399 | break 400 | batch_xi = Variable(torch.LongTensor(Xi[offset:end])) 401 | batch_xv = Variable(torch.FloatTensor(Xv[offset:end])) 402 | batch_y = Variable(torch.FloatTensor(y[offset:end])) 403 | if self.use_cuda: 404 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 405 | outputs = model(batch_xi, batch_xv) 406 | pred = F.sigmoid(outputs).cpu() 407 | y_pred.extend(pred.data.numpy()) 408 | loss = criterion(outputs, batch_y) 409 | total_loss += loss.data[0]*(end-offset) 410 | total_metric = self.eval_metric(y,y_pred) 411 | return total_loss/x_size, total_metric 412 | 413 | # shuffle three lists simutaneously 414 | def shuffle_in_unison_scary(self, a, b, c): 415 | rng_state = np.random.get_state() 416 | np.random.shuffle(a) 417 | np.random.set_state(rng_state) 418 | np.random.shuffle(b) 419 | np.random.set_state(rng_state) 420 | np.random.shuffle(c) 421 | 422 | def training_termination(self, valid_result): 423 | if len(valid_result) > 4: 424 | if self.greater_is_better: 425 | if valid_result[-1] < valid_result[-2] and \ 426 | valid_result[-2] < valid_result[-3] and \ 427 | valid_result[-3] < valid_result[-4]: 428 | return True 429 | else: 430 | if valid_result[-1] > valid_result[-2] and \ 431 | valid_result[-2] > valid_result[-3] and \ 432 | valid_result[-3] > valid_result[-4]: 433 | return True 434 | return False 435 | 436 | def predict(self, Xi, Xv): 437 | """ 438 | :param Xi: the same as fit function 439 | :param Xv: the same as fit function 440 | :return: output, ont-dim array 441 | """ 442 | Xi = np.array(Xi).reshape((-1,self.field_size,1)) 443 | Xi = Variable(torch.LongTensor(Xi)) 444 | Xv = Variable(torch.FloatTensor(Xv)) 445 | if self.use_cuda and torch.cuda.is_available(): 446 | Xi, Xv = Xi.cuda(), Xv.cuda() 447 | 448 | model = self.eval() 449 | pred = F.sigmoid(model(Xi, Xv)).cpu() 450 | return (pred.data.numpy() > 0.5) 451 | 452 | def predict_proba(self, Xi, Xv): 453 | Xi = np.array(Xi).reshape((-1, self.field_size, 1)) 454 | Xi = Variable(torch.LongTensor(Xi)) 455 | Xv = Variable(torch.FloatTensor(Xv)) 456 | if self.use_cuda and torch.cuda.is_available(): 457 | Xi, Xv = Xi.cuda(), Xv.cuda() 458 | 459 | model = self.eval() 460 | pred = F.sigmoid(model(Xi, Xv)).cpu() 461 | return pred.data.numpy() 462 | 463 | def inner_predict(self, Xi, Xv): 464 | """ 465 | :param Xi: tensor of feature index 466 | :param Xv: tensor of feature value 467 | :return: output, numpy 468 | """ 469 | model = self.eval() 470 | pred = F.sigmoid(model(Xi, Xv)).cpu() 471 | return (pred.data.numpy() > 0.5) 472 | 473 | def inner_predict_proba(self, Xi, Xv): 474 | """ 475 | :param Xi: tensor of feature index 476 | :param Xv: tensor of feature value 477 | :return: output, numpy 478 | """ 479 | model = self.eval() 480 | pred = F.sigmoid(model(Xi, Xv)).cpu() 481 | return pred.data.numpy() 482 | 483 | 484 | def evaluate(self, Xi, Xv, y): 485 | """ 486 | :param Xi: tensor of feature index 487 | :param Xv: tensor of feature value 488 | :param y: tensor of labels 489 | :return: metric of the evaluation 490 | """ 491 | y_pred = self.inner_predict_proba(Xi, Xv) 492 | return self.eval_metric(y.cpu().data.numpy(), y_pred) 493 | 494 | """ 495 | test part 496 | """ 497 | import sys 498 | sys.path.append('../') 499 | from 
utils import data_preprocess 500 | 501 | result_dict = data_preprocess.read_criteo_data('../data/train.csv', '../data/category_emb.csv') 502 | test_dict = data_preprocess.read_criteo_data('../data/test.csv', '../data/category_emb.csv') 503 | with torch.cuda.device(0): 504 | afm = AFM(39, result_dict['feature_sizes'], batch_size=128 * 64, is_shallow_dropout=False, verbose=True, use_cuda=True, 505 | weight_decay=0.00002, use_fm=True, use_ffm=False).cuda() 506 | afm.fit(result_dict['index'], result_dict['value'], result_dict['label'], 507 | test_dict['index'], test_dict['value'], test_dict['label'], ealry_stopping=True, refit=False, 508 | save_path='../data/model/afm.pkl') 509 | -------------------------------------------------------------------------------- /model/DCN.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | Created on Dec 10, 2017 5 | @author: jachin,Nie 6 | 7 | A pytorch implementation of NFM 8 | 9 | Reference: 10 | [1] Deep & Cross Network for Ad Click Predictions 11 | Ruoxi Wang,Stanford University,Stanford, CA,ruoxi@stanford.edu 12 | Bin Fu,Google Inc.,New York, NY,binfu@google.com 13 | Gang Fu,Google Inc.,New York, NY,thomasfu@google.com 14 | Mingliang Wang,Google Inc.,New York, NY,mlwang@google.com 15 | 16 | """ 17 | 18 | import os 19 | import numpy as np 20 | from sklearn.base import BaseEstimator, TransformerMixin 21 | from sklearn.metrics import roc_auc_score 22 | from time import time 23 | 24 | import torch 25 | import torch.autograd as autograd 26 | import torch.nn as nn 27 | import torch.nn.functional as F 28 | import torch.optim as optim 29 | from torch.autograd import Variable 30 | 31 | import torch.backends.cudnn 32 | 33 | 34 | """ 35 | 网络结构部分 36 | """ 37 | 38 | class DCN(torch.nn.Module): 39 | """ 40 | :parameter 41 | ------------- 42 | field_size: size of the feature fields 43 | feature_sizes: a field_size-dim array, sizes of the feature dictionary 44 | embedding_size: size of the feature embedding 45 | h_depth: deep network's hidden layers' depth 46 | deep_layers: a h_depth-dim array, each element is the size of corresponding hidden layers. example:[32,32] h_depth = 2 47 | is_deep_dropout: bool, deep part uses dropout or not? 48 | dropout_deep: an array of dropout factors,example:[0.5,0.5,0.5] h_depth=2 49 | deep_layers_activation: relu or sigmoid etc 50 | n_epochs: epochs 51 | batch_size: batch_size 52 | learning_rate: learning_rate 53 | optimizer_type: optimizer_type, 'adam', 'rmsp', 'sgd', 'adag' 54 | is_batch_norm:bool, use batch_norm or not ? 55 | verbose: verbose 56 | weight_decay: weight decay (L2 penalty) 57 | random_seed: random_seed=950104 someone's birthday, my lukcy number 58 | use_cross: bool 59 | use_inner_prodcut: bool 60 | use_depp:bool 61 | loss_type: "logloss", only 62 | eval_metric: roc_auc_score 63 | use_cuda: bool use gpu or cpu? 64 | n_class: number of classes. is bounded to 1 65 | greater_is_better: bool. Is the greater eval better? 
66 | 67 | 68 | Attention: only support logsitcs regression 69 | """ 70 | def __init__(self,field_size, feature_sizes, embedding_size = 4, 71 | h_depth = 2, deep_layers = [32, 32], is_deep_dropout = True, dropout_deep=[0.0, 0.5, 0.5], 72 | h_cross_depth = 3, 73 | h_inner_product_depth = 2, inner_product_layers = [32, 32], is_inner_product_dropout = True, dropout_inner_product_deep = [0.0, 0.5, 0.5], 74 | deep_layers_activation = 'relu', n_epochs = 64, batch_size = 256, learning_rate = 0.003, 75 | optimizer_type = 'adam', is_batch_norm = False, verbose = False, random_seed = 950104, 76 | use_cross = True, use_inner_product = False, use_deep = True,weight_decay = 0.0,loss_type = 'logloss', eval_metric = roc_auc_score, 77 | use_cuda = True, n_class = 1, greater_is_better = True 78 | ): 79 | super(DCN, self).__init__() 80 | self.field_size = field_size 81 | self.feature_sizes = feature_sizes 82 | self.embedding_size = embedding_size 83 | self.h_depth = h_depth 84 | self.deep_layers = deep_layers 85 | self.is_deep_dropout = is_deep_dropout 86 | self.dropout_deep = dropout_deep 87 | self.h_cross_depth = h_cross_depth 88 | self.h_inner_product_depth = h_inner_product_depth 89 | self.inner_product_layers = inner_product_layers 90 | self.is_inner_product_dropout = is_inner_product_dropout 91 | self.dropout_inner_product_deep = dropout_inner_product_deep 92 | self.deep_layers_activation = deep_layers_activation 93 | self.n_epochs = n_epochs 94 | self.batch_size = batch_size 95 | self.learning_rate = learning_rate 96 | self.optimizer_type = optimizer_type 97 | self.is_batch_norm = is_batch_norm 98 | self.verbose = verbose 99 | self.weight_decay = weight_decay 100 | self.random_seed = random_seed 101 | self.use_cross = use_cross 102 | self.use_inner_product = use_inner_product 103 | self.use_deep = use_deep 104 | self.loss_type = loss_type 105 | self.eval_metric = eval_metric 106 | self.use_cuda = use_cuda 107 | self.n_class = n_class 108 | self.greater_is_better = greater_is_better 109 | 110 | torch.manual_seed(self.random_seed) 111 | 112 | """ 113 | check cuda 114 | """ 115 | if self.use_cuda and not torch.cuda.is_available(): 116 | self.use_cuda = False 117 | print("Cuda is not available, automatically changed into cpu model") 118 | 119 | """ 120 | check model type 121 | """ 122 | if self.use_cross and self.use_deep and self.use_inner_product: 123 | print("The model is (cross network + deep network + inner_product network)") 124 | elif self.use_cross and self.use_deep: 125 | print("The model is (cross network + deep network)") 126 | elif self.use_cross and self.use_inner_product: 127 | print("The model is (cross network + inner product network)") 128 | elif self.use_inner_product and self.use_deep: 129 | print("The model is (inner product network + deep network)") 130 | elif self.use_cross: 131 | print("The model is a cross network only") 132 | elif self.use_deep: 133 | print("The model is a deep network only") 134 | elif self.use_inner_product: 135 | print("The model is an inner product network only") 136 | else: 137 | print("You have to choose more than one of (cross network, deep network, inner product network) models to use") 138 | exit(1) 139 | 140 | """ 141 | embeddings 142 | """ 143 | self.embeddings = nn.ModuleList([nn.Embedding(feature_size, self.embedding_size) for feature_size in self.feature_sizes]) 144 | 145 | cat_size = 0 146 | """ 147 | cross part 148 | """ 149 | if self.use_cross: 150 | print("Init cross network") 151 | for i in range(self.h_cross_depth): 152 | setattr(self, 
'cross_weight_' + str(i+1), 153 | torch.nn.Parameter(torch.randn(self.field_size*self.embedding_size))) 154 | setattr(self, 'cross_bias_' + str(i + 1), 155 | torch.nn.Parameter(torch.randn(self.field_size * self.embedding_size))) 156 | print("Cross network finished") 157 | cat_size += self.field_size * self.embedding_size 158 | 159 | """ 160 | inner prodcut part 161 | """ 162 | if self.use_inner_product: 163 | print("Init inner product network") 164 | if self.is_inner_product_dropout: 165 | self.inner_product_0_dropout = nn.Dropout(self.dropout_inner_product_deep[0]) 166 | self.inner_product_linear_1 = nn.Linear(self.field_size*(self.field_size-1)/2, self.inner_product_layers[0]) 167 | if self.is_inner_product_dropout: 168 | self.inner_product_1_dropout = nn.Dropout(self.dropout_inner_product_deep[1]) 169 | if self.is_batch_norm: 170 | self.inner_product_batch_norm_1 = nn.BatchNorm1d(self.inner_product_layers[0]) 171 | 172 | for i, h in enumerate(self.inner_product_layers[1:], 1): 173 | setattr(self, 'inner_product_linear_' + str(i + 1), nn.Linear(self.inner_product_layers[i - 1], self.inner_product_layers[i])) 174 | if self.is_batch_norm: 175 | setattr(self, 'inner_product_batch_norm_' + str(i + 1), nn.BatchNorm1d(self.inner_product_layers[i])) 176 | if self.is_deep_dropout: 177 | setattr(self, 'inner_product_' + str(i + 1) + '_dropout', nn.Dropout(self.dropout_inner_product_deep[i + 1])) 178 | cat_size += inner_product_layers[-1] 179 | print("Inner product network finished") 180 | 181 | """ 182 | deep part 183 | """ 184 | if self.use_deep: 185 | print("Init deep part") 186 | 187 | if self.is_deep_dropout: 188 | self.linear_0_dropout = nn.Dropout(self.dropout_deep[0]) 189 | self.linear_1 = nn.Linear(self.embedding_size*self.field_size, deep_layers[0]) 190 | if self.is_batch_norm: 191 | self.batch_norm_1 = nn.BatchNorm1d(deep_layers[0]) 192 | if self.is_deep_dropout: 193 | self.linear_1_dropout = nn.Dropout(self.dropout_deep[1]) 194 | for i, h in enumerate(self.deep_layers[1:], 1): 195 | setattr(self, 'linear_' + str(i + 1), nn.Linear(self.deep_layers[i - 1], self.deep_layers[i])) 196 | if self.is_batch_norm: 197 | setattr(self, 'batch_norm_' + str(i + 1), nn.BatchNorm1d(deep_layers[i])) 198 | if self.is_deep_dropout: 199 | setattr(self, 'linear_' + str(i + 1) + '_dropout', nn.Dropout(self.dropout_deep[i + 1])) 200 | cat_size += deep_layers[-1] 201 | print("Init deep part succeed") 202 | 203 | self.last_layer = nn.Linear(cat_size,1) 204 | print "Init succeed" 205 | 206 | def forward(self, Xi, Xv): 207 | """ 208 | :param Xi_train: index input tensor, batch_size * k * 1 209 | :param Xv_train: value input tensor, batch_size * k * 1 210 | :return: the last output 211 | """ 212 | 213 | if self.deep_layers_activation == 'sigmoid': 214 | activation = F.sigmoid 215 | elif self.deep_layers_activation == 'tanh': 216 | activation = F.tanh 217 | else: 218 | activation = F.relu 219 | 220 | """ 221 | embeddings 222 | """ 223 | emb_arr = [(torch.sum(emb(Xi[:,i,:]),1).t()*Xv[:,i]).t() for i, emb in enumerate(self.embeddings)] 224 | outputs = [] 225 | """ 226 | cross part 227 | """ 228 | if self.use_cross: 229 | x_0 = torch.cat(emb_arr,1) 230 | x_l = x_0 231 | for i in range(self.h_cross_depth): 232 | x_l = torch.sum(x_0 * x_l, 1).view([-1,1]) * getattr(self,'cross_weight_'+str(i+1)).view([1,-1]) + getattr(self,'cross_bias_'+str(i+1)) + x_l 233 | outputs.append(x_l) 234 | 235 | """ 236 | inner product part 237 | """ 238 | if self.use_inner_product: 239 | fm_wij_arr = [] 240 | for i in 
range(self.field_size): 241 | for j in range(i + 1, self.field_size): 242 | fm_wij_arr.append(torch.sum(emb_arr[i] * emb_arr[j],1).view([-1,1])) 243 | inner_output = torch.cat(fm_wij_arr,1) 244 | 245 | if self.is_inner_product_dropout: 246 | deep_emb = self.inner_product_0_dropout(inner_output) 247 | x_deep = self.inner_product_linear_1(deep_emb) 248 | if self.is_batch_norm: 249 | x_deep = self.inner_product_batch_norm_1(x_deep) 250 | x_deep = activation(x_deep) 251 | if self.is_inner_product_dropout: 252 | x_deep = self.inner_product_1_dropout(x_deep) 253 | for i in range(1, len(self.deep_layers)): 254 | x_deep = getattr(self, 'inner_product_linear_' + str(i + 1))(x_deep) 255 | if self.is_batch_norm: 256 | x_deep = getattr(self, 'inner_product_batch_norm_' + str(i + 1))(x_deep) 257 | x_deep = activation(x_deep) 258 | if self.is_deep_dropout: 259 | x_deep = getattr(self, 'inner_product_' + str(i + 1) + '_dropout')(x_deep) 260 | outputs.append(x_deep) 261 | 262 | """ 263 | deep part 264 | """ 265 | if self.use_deep: 266 | deep_emb = torch.cat(emb_arr,1) 267 | 268 | if self.is_deep_dropout: 269 | deep_emb = self.linear_0_dropout(deep_emb) 270 | x_deep = self.linear_1(deep_emb) 271 | if self.is_batch_norm: 272 | x_deep = self.batch_norm_1(x_deep) 273 | x_deep = activation(x_deep) 274 | if self.is_deep_dropout: 275 | x_deep = self.linear_1_dropout(x_deep) 276 | for i in range(1, len(self.deep_layers)): 277 | x_deep = getattr(self, 'linear_' + str(i + 1))(x_deep) 278 | if self.is_batch_norm: 279 | x_deep = getattr(self, 'batch_norm_' + str(i + 1))(x_deep) 280 | x_deep = activation(x_deep) 281 | if self.is_deep_dropout: 282 | x_deep = getattr(self, 'linear_' + str(i + 1) + '_dropout')(x_deep) 283 | outputs.append(x_deep) 284 | 285 | """ 286 | total 287 | """ 288 | output = self.last_layer(torch.cat(outputs,1)) 289 | return torch.sum(output,1) 290 | 291 | 292 | def fit(self, Xi_train, Xv_train, y_train, Xi_valid=None, Xv_valid=None, 293 | y_valid = None, ealry_stopping=False, refit=False, save_path = None): 294 | """ 295 | :param Xi_train: [[ind1_1, ind1_2, ...], [ind2_1, ind2_2, ...], ..., [indi_1, indi_2, ..., indi_j, ...], ...] 296 | indi_j is the feature index of feature field j of sample i in the training set 297 | :param Xv_train: [[val1_1, val1_2, ...], [val2_1, val2_2, ...], ..., [vali_1, vali_2, ..., vali_j, ...], ...] 
298 | vali_j is the feature value of feature field j of sample i in the training set 299 | vali_j can be either binary (1/0, for binary/categorical features) or float (e.g., 10.24, for numerical features) 300 | :param y_train: label of each sample in the training set 301 | :param Xi_valid: list of list of feature indices of each sample in the validation set 302 | :param Xv_valid: list of list of feature values of each sample in the validation set 303 | :param y_valid: label of each sample in the validation set 304 | :param ealry_stopping: perform early stopping or not 305 | :param refit: refit the model on the train+valid dataset or not 306 | :param save_path: the path to save the model 307 | :return: 308 | """ 309 | """ 310 | pre_process 311 | """ 312 | if save_path and not os.path.exists('/'.join(save_path.split('/')[0:-1])): 313 | print("Save path is not existed!") 314 | return 315 | 316 | if self.verbose: 317 | print("pre_process data ing...") 318 | is_valid = False 319 | Xi_train = np.array(Xi_train).reshape((-1,self.field_size,1)) 320 | Xv_train = np.array(Xv_train) 321 | y_train = np.array(y_train) 322 | x_size = Xi_train.shape[0] 323 | if Xi_valid: 324 | Xi_valid = np.array(Xi_valid).reshape((-1,self.field_size,1)) 325 | Xv_valid = np.array(Xv_valid) 326 | y_valid = np.array(y_valid) 327 | x_valid_size = Xi_valid.shape[0] 328 | is_valid = True 329 | if self.verbose: 330 | print("pre_process data finished") 331 | 332 | """ 333 | train model 334 | """ 335 | model = self.train() 336 | 337 | optimizer = torch.optim.SGD(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 338 | if self.optimizer_type == 'adam': 339 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 340 | elif self.optimizer_type == 'rmsp': 341 | optimizer = torch.optim.RMSprop(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 342 | elif self.optimizer_type == 'adag': 343 | optimizer = torch.optim.Adagrad(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 344 | 345 | criterion = F.binary_cross_entropy_with_logits 346 | 347 | train_result = [] 348 | valid_result = [] 349 | for epoch in range(self.n_epochs): 350 | total_loss = 0.0 351 | batch_iter = x_size // self.batch_size 352 | epoch_begin_time = time() 353 | batch_begin_time = time() 354 | for i in range(batch_iter+1): 355 | offset = i*self.batch_size 356 | end = min(x_size, offset+self.batch_size) 357 | if offset == end: 358 | break 359 | batch_xi = Variable(torch.LongTensor(Xi_train[offset:end])) 360 | batch_xv = Variable(torch.FloatTensor(Xv_train[offset:end])) 361 | batch_y = Variable(torch.FloatTensor(y_train[offset:end])) 362 | if self.use_cuda: 363 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 364 | optimizer.zero_grad() 365 | outputs = model(batch_xi, batch_xv) 366 | loss = criterion(outputs, batch_y) 367 | loss.backward() 368 | optimizer.step() 369 | 370 | total_loss += loss.data[0] 371 | if self.verbose: 372 | if i % 100 == 99: # print every 100 mini-batches 373 | eval = self.evaluate(batch_xi, batch_xv, batch_y) 374 | print('[%d, %5d] loss: %.6f metric: %.6f time: %.1f s' % 375 | (epoch + 1, i + 1, total_loss/100.0, eval, time()-batch_begin_time)) 376 | total_loss = 0.0 377 | batch_begin_time = time() 378 | 379 | train_loss, train_eval = self.eval_by_batch(Xi_train,Xv_train,y_train,x_size) 380 | train_result.append(train_eval) 381 | print('*'*50) 382 | print('[%d] loss: %.6f metric: %.6f time: %.1f s' % 
383 | (epoch + 1, train_loss, train_eval, time()-epoch_begin_time)) 384 | print('*'*50) 385 | 386 | if is_valid: 387 | valid_loss, valid_eval = self.eval_by_batch(Xi_valid, Xv_valid, y_valid, x_valid_size) 388 | valid_result.append(valid_eval) 389 | print('*' * 50) 390 | print('[%d] loss: %.6f metric: %.6f time: %.1f s' % 391 | (epoch + 1, valid_loss, valid_eval,time()-epoch_begin_time)) 392 | print('*' * 50) 393 | if save_path: 394 | torch.save(self.state_dict(),save_path) 395 | if is_valid and ealry_stopping and self.training_termination(valid_result): 396 | print("early stop at [%d] epoch!" % (epoch+1)) 397 | break 398 | 399 | # fit a few more epoch on train+valid until result reaches the best_train_score 400 | if is_valid and refit: 401 | if self.verbose: 402 | print("refitting the model") 403 | if self.greater_is_better: 404 | best_epoch = np.argmax(valid_result) 405 | else: 406 | best_epoch = np.argmin(valid_result) 407 | best_train_score = train_result[best_epoch] 408 | Xi_train = np.concatenate((Xi_train,Xi_valid)) 409 | Xv_train = np.concatenate((Xv_train,Xv_valid)) 410 | y_train = np.concatenate((y_train,y_valid)) 411 | x_size = x_size + x_valid_size 412 | self.shuffle_in_unison_scary(Xi_train,Xv_train,y_train) 413 | for epoch in range(64): 414 | batch_iter = x_size // self.batch_size 415 | for i in range(batch_iter + 1): 416 | offset = i * self.batch_size 417 | end = min(x_size, offset + self.batch_size) 418 | if offset == end: 419 | break 420 | batch_xi = Variable(torch.LongTensor(Xi_train[offset:end])) 421 | batch_xv = Variable(torch.FloatTensor(Xv_train[offset:end])) 422 | batch_y = Variable(torch.FloatTensor(y_train[offset:end])) 423 | if self.use_cuda: 424 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 425 | optimizer.zero_grad() 426 | outputs = model(batch_xi, batch_xv) 427 | loss = criterion(outputs, batch_y) 428 | loss.backward() 429 | optimizer.step() 430 | train_loss, train_eval = self.eval_by_batch(Xi_train, Xv_train, y_train, x_size) 431 | if save_path: 432 | torch.save(self.state_dict(), save_path) 433 | if abs(best_train_score-train_eval) < 0.001 or \ 434 | (self.greater_is_better and train_eval > best_train_score) or \ 435 | ((not self.greater_is_better) and train_result < best_train_score): 436 | break 437 | if self.verbose: 438 | print("refit finished") 439 | 440 | def eval_by_batch(self,Xi, Xv, y, x_size): 441 | total_loss = 0.0 442 | y_pred = [] 443 | batch_size = 16384 444 | batch_iter = x_size // batch_size 445 | criterion = F.binary_cross_entropy_with_logits 446 | model = self.eval() 447 | for i in range(batch_iter+1): 448 | offset = i * batch_size 449 | end = min(x_size, offset + batch_size) 450 | if offset == end: 451 | break 452 | batch_xi = Variable(torch.LongTensor(Xi[offset:end])) 453 | batch_xv = Variable(torch.FloatTensor(Xv[offset:end])) 454 | batch_y = Variable(torch.FloatTensor(y[offset:end])) 455 | if self.use_cuda: 456 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 457 | outputs = model(batch_xi, batch_xv) 458 | pred = F.sigmoid(outputs).cpu() 459 | y_pred.extend(pred.data.numpy()) 460 | loss = criterion(outputs, batch_y) 461 | total_loss += loss.data[0]*(end-offset) 462 | total_metric = self.eval_metric(y,y_pred) 463 | return total_loss/x_size, total_metric 464 | 465 | # shuffle three lists simutaneously 466 | def shuffle_in_unison_scary(self, a, b, c): 467 | rng_state = np.random.get_state() 468 | np.random.shuffle(a) 469 | np.random.set_state(rng_state) 470 | 
np.random.shuffle(b) 471 | np.random.set_state(rng_state) 472 | np.random.shuffle(c) 473 | 474 | def training_termination(self, valid_result): 475 | if len(valid_result) > 4: 476 | if self.greater_is_better: 477 | if valid_result[-1] < valid_result[-2] and \ 478 | valid_result[-2] < valid_result[-3] and \ 479 | valid_result[-3] < valid_result[-4]: 480 | return True 481 | else: 482 | if valid_result[-1] > valid_result[-2] and \ 483 | valid_result[-2] > valid_result[-3] and \ 484 | valid_result[-3] > valid_result[-4]: 485 | return True 486 | return False 487 | 488 | def predict(self, Xi, Xv): 489 | """ 490 | :param Xi: the same as fit function 491 | :param Xv: the same as fit function 492 | :return: output, ont-dim array 493 | """ 494 | Xi = np.array(Xi).reshape((-1,self.field_size,1)) 495 | Xi = Variable(torch.LongTensor(Xi)) 496 | Xv = Variable(torch.FloatTensor(Xv)) 497 | if self.use_cuda and torch.cuda.is_available(): 498 | Xi, Xv = Xi.cuda(), Xv.cuda() 499 | 500 | model = self.eval() 501 | pred = F.sigmoid(model(Xi, Xv)).cpu() 502 | return (pred.data.numpy() > 0.5) 503 | 504 | def predict_proba(self, Xi, Xv): 505 | Xi = np.array(Xi).reshape((-1, self.field_size, 1)) 506 | Xi = Variable(torch.LongTensor(Xi)) 507 | Xv = Variable(torch.FloatTensor(Xv)) 508 | if self.use_cuda and torch.cuda.is_available(): 509 | Xi, Xv = Xi.cuda(), Xv.cuda() 510 | 511 | model = self.eval() 512 | pred = F.sigmoid(model(Xi, Xv)).cpu() 513 | return pred.data.numpy() 514 | 515 | def inner_predict(self, Xi, Xv): 516 | """ 517 | :param Xi: tensor of feature index 518 | :param Xv: tensor of feature value 519 | :return: output, numpy 520 | """ 521 | model = self.eval() 522 | pred = F.sigmoid(model(Xi, Xv)).cpu() 523 | return (pred.data.numpy() > 0.5) 524 | 525 | def inner_predict_proba(self, Xi, Xv): 526 | """ 527 | :param Xi: tensor of feature index 528 | :param Xv: tensor of feature value 529 | :return: output, numpy 530 | """ 531 | model = self.eval() 532 | pred = F.sigmoid(model(Xi, Xv)).cpu() 533 | return pred.data.numpy() 534 | 535 | 536 | def evaluate(self, Xi, Xv, y): 537 | """ 538 | :param Xi: tensor of feature index 539 | :param Xv: tensor of feature value 540 | :param y: tensor of labels 541 | :return: metric of the evaluation 542 | """ 543 | y_pred = self.inner_predict_proba(Xi, Xv) 544 | return self.eval_metric(y.cpu().data.numpy(), y_pred) 545 | 546 | """ 547 | test part 548 | """ 549 | import sys 550 | sys.path.append('../') 551 | from utils import data_preprocess 552 | 553 | result_dict = data_preprocess.read_criteo_data('../data/train.csv', '../data/category_emb.csv') 554 | test_dict = data_preprocess.read_criteo_data('../data/test.csv', '../data/category_emb.csv') 555 | with torch.cuda.device(0): 556 | dcn = DCN(39, result_dict['feature_sizes'], batch_size=128 * 32, verbose=True, use_cuda=True, 557 | weight_decay=0.00002, use_inner_product=True).cuda() 558 | dcn.fit(result_dict['index'], result_dict['value'], result_dict['label'], 559 | test_dict['index'], test_dict['value'], test_dict['label'], ealry_stopping=True, refit=False, 560 | save_path='../data/model/dcn.pkl') 561 | -------------------------------------------------------------------------------- /model/DIN.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | Created on Dec 10, 2017 5 | @author: jachin,Nie 6 | 7 | A pytorch implementation of NFM 8 | 9 | Reference: 10 | [1] Neural Factorization Machines for Sparse Predictive Analytics 11 | Xiangnan He,School of 
Computing,National University of Singapore,Singapore 117417,dcshex@nus.edu.sg 12 | Tat-Seng Chua,School of Computing,National University of Singapore,Singapore 117417,dcscts@nus.edu.sg 13 | 14 | """ 15 | 16 | import os 17 | import numpy as np 18 | from sklearn.base import BaseEstimator, TransformerMixin 19 | from sklearn.metrics import roc_auc_score 20 | from time import time 21 | 22 | import torch 23 | import torch.autograd as autograd 24 | import torch.nn as nn 25 | import torch.nn.functional as F 26 | import torch.optim as optim 27 | from torch.autograd import Variable 28 | 29 | import torch.backends.cudnn 30 | 31 | 32 | """ 33 | 网络结构部分 34 | """ 35 | 36 | class DIN(torch.nn.Module): 37 | """ 38 | :parameter 39 | ------------- 40 | field_size: size of the feature fields 41 | feature_sizes: a field_size-dim array, sizes of the feature dictionary 42 | embedding_size: size of the feature embedding 43 | is_shallow_dropout: bool, shallow part(fm or ffm part) uses dropout or not? 44 | dropout_shallow: an array of the size of 1, example:[0.5], the element is for the-first order part 45 | h_depth: deep network's hidden layers' depth 46 | deep_layers: a h_depth-dim array, each element is the size of corresponding hidden layers. example:[32,32] h_depth = 2 47 | is_deep_dropout: bool, deep part uses dropout or not? 48 | dropout_deep: an array of dropout factors,example:[0.5,0.5,0.5] h_depth=2 49 | deep_layers_activation: relu or sigmoid etc 50 | n_epochs: epochs 51 | batch_size: batch_size 52 | learning_rate: learning_rate 53 | optimizer_type: optimizer_type, 'adam', 'rmsp', 'sgd', 'adag' 54 | is_batch_norm:bool, use batch_norm or not ? 55 | verbose: verbose 56 | weight_decay: weight decay (L2 penalty) 57 | random_seed: random_seed=950104 someone's birthday, my lukcy number 58 | use_fm: bool 59 | use_ffm: bool 60 | interation_type: bool, When it's true, the element-wise product of the fm or ffm embeddings will be added together, otherwise, the element-wise prodcut of embeddings will be concatenated. 61 | loss_type: "logloss", only 62 | eval_metric: roc_auc_score 63 | use_cuda: bool use gpu or cpu? 64 | n_class: number of classes. is bounded to 1 65 | greater_is_better: bool. Is the greater eval better? 
66 | 67 | 68 | Attention: only support logsitcs regression 69 | """ 70 | def __init__(self,field_size, feature_sizes, embedding_size = 4, is_shallow_dropout = True, dropout_shallow = [0.5], 71 | h_depth = 2, deep_layers = [32, 32], is_deep_dropout = True, dropout_deep=[0.0, 0.5, 0.5], 72 | deep_layers_activation = 'relu', n_epochs = 64, batch_size = 256, learning_rate = 0.003, 73 | optimizer_type = 'adam', is_batch_norm = False, verbose = False, random_seed = 950104, weight_decay = 0.0, 74 | use_fm = True, use_ffm = False, use_high_interaction = True,interation_type = True,loss_type = 'logloss', eval_metric = roc_auc_score, 75 | use_cuda = True, n_class = 1, greater_is_better = True 76 | ): 77 | super(DIN, self).__init__() 78 | self.field_size = field_size 79 | self.feature_sizes = feature_sizes 80 | self.embedding_size = embedding_size 81 | self.is_shallow_dropout = is_shallow_dropout 82 | self.dropout_shallow = dropout_shallow 83 | self.h_depth = h_depth 84 | self.deep_layers = deep_layers 85 | self.is_deep_dropout = is_deep_dropout 86 | self.dropout_deep = dropout_deep 87 | self.deep_layers_activation = deep_layers_activation 88 | self.n_epochs = n_epochs 89 | self.batch_size = batch_size 90 | self.learning_rate = learning_rate 91 | self.optimizer_type = optimizer_type 92 | self.is_batch_norm = is_batch_norm 93 | self.verbose = verbose 94 | self.weight_decay = weight_decay 95 | self.random_seed = random_seed 96 | self.use_fm = use_fm 97 | self.use_ffm = use_ffm 98 | self.use_high_interaction = use_high_interaction 99 | self.interation_type = interation_type 100 | self.loss_type = loss_type 101 | self.eval_metric = eval_metric 102 | self.use_cuda = use_cuda 103 | self.n_class = n_class 104 | self.greater_is_better = greater_is_better 105 | self.pre_train = False 106 | 107 | torch.manual_seed(self.random_seed) 108 | 109 | """ 110 | check cuda 111 | """ 112 | if self.use_cuda and not torch.cuda.is_available(): 113 | self.use_cuda = False 114 | print("Cuda is not available, automatically changed into cpu model") 115 | 116 | """ 117 | check use fm or ffm 118 | """ 119 | if self.use_fm and self.use_ffm: 120 | print("only support one type only, please make sure to choose only fm or ffm part") 121 | exit(1) 122 | elif self.use_fm: 123 | print("The model is nfm(fm+nn layers)") 124 | elif self.use_ffm: 125 | print("The model is nffm(ffm+nn layers)") 126 | else: 127 | print("You have to choose more than one of (fm, ffm) models to use") 128 | exit(1) 129 | """ 130 | bias 131 | """ 132 | self.bias = torch.nn.Parameter(torch.randn(1)) 133 | 134 | """ 135 | fm part 136 | """ 137 | if self.use_fm: 138 | print("Init fm part") 139 | self.fm_first_order_embeddings = nn.ModuleList([nn.Embedding(feature_size,1) for feature_size in self.feature_sizes]) 140 | if self.dropout_shallow: 141 | self.fm_first_order_dropout = nn.Dropout(self.dropout_shallow[0]) 142 | self.fm_second_order_embeddings = nn.ModuleList([nn.Embedding(feature_size, self.embedding_size) for feature_size in self.feature_sizes]) 143 | print("Init fm part succeed") 144 | 145 | """ 146 | ffm part 147 | """ 148 | if self.use_ffm: 149 | print("Init ffm part") 150 | self.ffm_first_order_embeddings = nn.ModuleList([nn.Embedding(feature_size,1) for feature_size in self.feature_sizes]) 151 | if self.dropout_shallow: 152 | self.ffm_first_order_dropout = nn.Dropout(self.dropout_shallow[0]) 153 | self.ffm_second_order_embeddings = nn.ModuleList([nn.ModuleList([nn.Embedding(feature_size, self.embedding_size) for i in range(self.field_size)]) for 
feature_size in self.feature_sizes]) 154 | print("Init ffm part succeed") 155 | 156 | """ 157 | high interaction part 158 | """ 159 | if self.use_high_interaction and self.use_fm: 160 | self.h_weights = nn.ParameterList([torch.nn.Parameter(torch.ones(self.embedding_size)) for i in range(self.field_size)]) 161 | self.h_bias = nn.ParameterList([torch.nn.Parameter(torch.ones(1)) for i in range(self.field_size)]) 162 | self.h_batch_norm = nn.BatchNorm1d(self.field_size) 163 | 164 | """ 165 | deep part 166 | """ 167 | print("Init deep part") 168 | 169 | if self.is_deep_dropout: 170 | self.linear_0_dropout = nn.Dropout(self.dropout_deep[0]) 171 | if self.interation_type: 172 | self.linear_1 = nn.Linear(self.embedding_size, deep_layers[0]) 173 | else: 174 | self.linear_1 = nn.Linear(self.field_size*(self.field_size-1)//2, deep_layers[0]) 175 | if self.is_batch_norm: 176 | self.batch_norm_1 = nn.BatchNorm1d(deep_layers[0]) 177 | if self.is_deep_dropout: 178 | self.linear_1_dropout = nn.Dropout(self.dropout_deep[1]) 179 | for i, h in enumerate(self.deep_layers[1:], 1): 180 | setattr(self, 'linear_' + str(i + 1), nn.Linear(self.deep_layers[i - 1], self.deep_layers[i])) 181 | if self.is_batch_norm: 182 | setattr(self, 'batch_norm_' + str(i + 1), nn.BatchNorm1d(deep_layers[i])) 183 | if self.is_deep_dropout: 184 | setattr(self, 'linear_' + str(i + 1) + '_dropout', nn.Dropout(self.dropout_deep[i + 1])) 185 | 186 | print("Init deep part succeed") 187 | 188 | print("Init succeed") 189 | 190 | def forward(self, Xi, Xv): 191 | """ 192 | :param Xi_train: index input tensor, batch_size * k * 1 193 | :param Xv_train: value input tensor, batch_size * k * 1 194 | :return: the last output 195 | """ 196 | """ 197 | fm part 198 | """ 199 | if self.use_fm: 200 | fm_first_order_emb_arr = [(torch.sum(emb(Xi[:,i,:]),1).t()*Xv[:,i]).t() for i, emb in enumerate(self.fm_first_order_embeddings)] 201 | fm_first_order = torch.cat(fm_first_order_emb_arr,1) 202 | if self.is_shallow_dropout: 203 | fm_first_order = self.fm_first_order_dropout(fm_first_order) 204 | 205 | if self.interation_type: 206 | # use 2xy = (x+y)^2 - x^2 - y^2 to reduce calculation 207 | fm_second_order_emb_arr = [(torch.sum(emb(Xi[:,i,:]),1).t()*Xv[:,i]).t() for i, emb in enumerate(self.fm_second_order_embeddings)] 208 | fm_sum_second_order_emb = sum(fm_second_order_emb_arr) 209 | fm_sum_second_order_emb_square = fm_sum_second_order_emb*fm_sum_second_order_emb # (x+y)^2 210 | fm_second_order_emb_square = [item*item for item in fm_second_order_emb_arr] 211 | fm_second_order_emb_square_sum = sum(fm_second_order_emb_square) #x^2+y^2 212 | fm_second_order = (fm_sum_second_order_emb_square - fm_second_order_emb_square_sum) * 0.5 213 | else: 214 | fm_second_order_emb_arr = [(torch.sum(emb(Xi[:, i, :]), 1).t() * Xv[:, i]).t() for i, emb in 215 | enumerate(self.fm_second_order_embeddings)] 216 | fm_wij_arr = [] 217 | for i in range(self.field_size): 218 | for j in range(i + 1, self.field_size): 219 | fm_wij_arr.append(fm_second_order_emb_arr[i] * fm_second_order_emb_arr[j]) 220 | 221 | 222 | """ 223 | ffm part 224 | """ 225 | if self.use_ffm: 226 | ffm_first_order_emb_arr = [(torch.sum(emb(Xi[:,i,:]),1).t()*Xv[:,i]).t() for i, emb in enumerate(self.ffm_first_order_embeddings)] 227 | ffm_first_order = torch.cat(ffm_first_order_emb_arr,1) 228 | if self.is_shallow_dropout: 229 | ffm_first_order = self.ffm_first_order_dropout(ffm_first_order) 230 | ffm_second_order_emb_arr = [[(torch.sum(emb(Xi[:,i,:]), 1).t() * Xv[:,i]).t() for emb in f_embs] for i, f_embs in 
enumerate(self.ffm_second_order_embeddings)] 231 | ffm_wij_arr = [] 232 | for i in range(self.field_size): 233 | for j in range(i+1, self.field_size): 234 | ffm_wij_arr.append(ffm_second_order_emb_arr[i][j]*ffm_second_order_emb_arr[j][i]) 235 | ffm_second_order = sum(ffm_wij_arr) 236 | 237 | """ 238 | high interaction part 239 | """ 240 | if self.use_high_interaction and self.use_fm: 241 | total_prod = 1.0 242 | for i, h_weight in enumerate(self.h_weights): 243 | total_prod = total_prod * (fm_second_order_emb_arr[i]*h_weight+self.h_bias[i]) 244 | high_output = total_prod 245 | 246 | 247 | """ 248 | deep part 249 | """ 250 | if self.use_fm and self.interation_type: 251 | deep_emb = fm_second_order 252 | elif self.use_ffm and self.interation_type: 253 | deep_emb = ffm_second_order 254 | elif self.use_fm: 255 | deep_emb = torch.cat([torch.sum(fm_wij,1).view([-1,1]) for fm_wij in fm_wij_arr], 1) 256 | else: 257 | deep_emb = torch.cat([torch.sum(ffm_wij,1).view([-1,1]) for ffm_wij in ffm_wij_arr],1) 258 | 259 | if self.deep_layers_activation == 'sigmoid': 260 | activation = F.sigmoid 261 | elif self.deep_layers_activation == 'tanh': 262 | activation = F.tanh 263 | else: 264 | activation = F.relu 265 | 266 | if self.is_deep_dropout: 267 | deep_emb = self.linear_0_dropout(deep_emb) 268 | x_deep = self.linear_1(deep_emb) 269 | if self.is_batch_norm: 270 | x_deep = self.batch_norm_1(x_deep) 271 | x_deep = activation(x_deep) 272 | if self.is_deep_dropout: 273 | x_deep = self.linear_1_dropout(x_deep) 274 | for i in range(1, len(self.deep_layers)): 275 | x_deep = getattr(self, 'linear_' + str(i + 1))(x_deep) 276 | if self.is_batch_norm: 277 | x_deep = getattr(self, 'batch_norm_' + str(i + 1))(x_deep) 278 | x_deep = activation(x_deep) 279 | if self.is_deep_dropout: 280 | x_deep = getattr(self, 'linear_' + str(i + 1) + '_dropout')(x_deep) 281 | 282 | """ 283 | sum 284 | """ 285 | if self.use_fm: 286 | if self.use_high_interaction and not self.pre_train: 287 | total_sum = self.bias+ torch.sum(fm_first_order,1) + torch.sum(x_deep, 1) + torch.sum(high_output,1) 288 | else: 289 | total_sum = self.bias + torch.sum(fm_first_order, 1) + torch.sum(x_deep, 1) 290 | elif self.use_ffm: 291 | total_sum = self.bias + torch.sum(ffm_first_order, 1) + torch.sum(x_deep, 1) 292 | return total_sum 293 | 294 | 295 | def fit(self, Xi_train, Xv_train, y_train, Xi_valid=None, Xv_valid=None, 296 | y_valid = None, ealry_stopping=False, pre_train = False, n_epochs = 64,refit=False, save_path = None): 297 | """ 298 | :param Xi_train: [[ind1_1, ind1_2, ...], [ind2_1, ind2_2, ...], ..., [indi_1, indi_2, ..., indi_j, ...], ...] 299 | indi_j is the feature index of feature field j of sample i in the training set 300 | :param Xv_train: [[val1_1, val1_2, ...], [val2_1, val2_2, ...], ..., [vali_1, vali_2, ..., vali_j, ...], ...] 
301 | vali_j is the feature value of feature field j of sample i in the training set 302 | vali_j can be either binary (1/0, for binary/categorical features) or float (e.g., 10.24, for numerical features) 303 | :param y_train: label of each sample in the training set 304 | :param Xi_valid: list of list of feature indices of each sample in the validation set 305 | :param Xv_valid: list of list of feature values of each sample in the validation set 306 | :param y_valid: label of each sample in the validation set 307 | :param ealry_stopping: perform early stopping or not 308 | :param pre_train: if True, train without adding the high interaction part to the output 309 | :param n_epochs: number of epochs 310 | :param refit: refit the model on the train+valid dataset or not 311 | :param save_path: the path to save the model 312 | :return: 313 | """ 314 | """ 315 | pre_process 316 | """ 317 | if save_path and not os.path.exists('/'.join(save_path.split('/')[0:-1])): 318 | print("Save path does not exist!") 319 | return 320 | 321 | if self.verbose: 322 | print("pre-processing data...") 323 | 324 | self.pre_train = pre_train 325 | self.n_epochs = n_epochs 326 | is_valid = False 327 | Xi_train = np.array(Xi_train).reshape((-1,self.field_size,1)) 328 | Xv_train = np.array(Xv_train) 329 | y_train = np.array(y_train) 330 | x_size = Xi_train.shape[0] 331 | if Xi_valid: 332 | Xi_valid = np.array(Xi_valid).reshape((-1,self.field_size,1)) 333 | Xv_valid = np.array(Xv_valid) 334 | y_valid = np.array(y_valid) 335 | x_valid_size = Xi_valid.shape[0] 336 | is_valid = True 337 | if self.verbose: 338 | print("pre-processing data finished") 339 | 340 | """ 341 | train model 342 | """ 343 | model = self.train() 344 | 345 | optimizer = torch.optim.SGD(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 346 | if self.optimizer_type == 'adam': 347 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 348 | elif self.optimizer_type == 'rmsp': 349 | optimizer = torch.optim.RMSprop(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 350 | elif self.optimizer_type == 'adag': 351 | optimizer = torch.optim.Adagrad(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 352 | 353 | criterion = F.binary_cross_entropy_with_logits 354 | 355 | train_result = [] 356 | valid_result = [] 357 | for epoch in range(self.n_epochs): 358 | total_loss = 0.0 359 | batch_iter = x_size // self.batch_size 360 | epoch_begin_time = time() 361 | batch_begin_time = time() 362 | for i in range(batch_iter+1): 363 | offset = i*self.batch_size 364 | end = min(x_size, offset+self.batch_size) 365 | if offset == end: 366 | break 367 | batch_xi = Variable(torch.LongTensor(Xi_train[offset:end])) 368 | batch_xv = Variable(torch.FloatTensor(Xv_train[offset:end])) 369 | batch_y = Variable(torch.FloatTensor(y_train[offset:end])) 370 | if self.use_cuda: 371 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 372 | optimizer.zero_grad() 373 | outputs = model(batch_xi, batch_xv) 374 | loss = criterion(outputs, batch_y) 375 | try: 376 | loss.backward() 377 | except: 378 | print(batch_xi.is_cuda, batch_xv.is_cuda, batch_y.is_cuda) 379 | print(batch_xi) 380 | print(batch_xv) 381 | print(batch_y) 382 | optimizer.step() 383 | 384 | total_loss += loss.data[0] 385 | if self.verbose: 386 | if i % 100 == 99: # print every 100 mini-batches 387 | eval = self.evaluate(batch_xi, batch_xv, batch_y) 388 | print('[%d, %5d] loss: %.6f metric: %.6f time: %.1f s' % 389 | (epoch + 1, i + 1, 
total_loss/100.0, eval, time()-batch_begin_time)) 390 | total_loss = 0.0 391 | batch_begin_time = time() 392 | 393 | train_loss, train_eval = self.eval_by_batch(Xi_train,Xv_train,y_train,x_size) 394 | train_result.append(train_eval) 395 | print('*'*50) 396 | print('[%d] loss: %.6f metric: %.6f time: %.1f s' % 397 | (epoch + 1, train_loss, train_eval, time()-epoch_begin_time)) 398 | print('*'*50) 399 | 400 | if is_valid: 401 | valid_loss, valid_eval = self.eval_by_batch(Xi_valid, Xv_valid, y_valid, x_valid_size) 402 | valid_result.append(valid_eval) 403 | print('*' * 50) 404 | print('[%d] loss: %.6f metric: %.6f time: %.1f s' % 405 | (epoch + 1, valid_loss, valid_eval,time()-epoch_begin_time)) 406 | print('*' * 50) 407 | if save_path: 408 | torch.save(self.state_dict(),save_path) 409 | if is_valid and ealry_stopping and self.training_termination(valid_result): 410 | print("early stop at [%d] epoch!" % (epoch+1)) 411 | break 412 | 413 | # fit a few more epoch on train+valid until result reaches the best_train_score 414 | if is_valid and refit: 415 | if self.verbose: 416 | print("refitting the model") 417 | if self.greater_is_better: 418 | best_epoch = np.argmax(valid_result) 419 | else: 420 | best_epoch = np.argmin(valid_result) 421 | best_train_score = train_result[best_epoch] 422 | Xi_train = np.concatenate((Xi_train,Xi_valid)) 423 | Xv_train = np.concatenate((Xv_train,Xv_valid)) 424 | y_train = np.concatenate((y_train,y_valid)) 425 | x_size = x_size + x_valid_size 426 | self.shuffle_in_unison_scary(Xi_train,Xv_train,y_train) 427 | for epoch in range(64): 428 | batch_iter = x_size // self.batch_size 429 | for i in range(batch_iter + 1): 430 | offset = i * self.batch_size 431 | end = min(x_size, offset + self.batch_size) 432 | if offset == end: 433 | break 434 | batch_xi = Variable(torch.LongTensor(Xi_train[offset:end])) 435 | batch_xv = Variable(torch.FloatTensor(Xv_train[offset:end])) 436 | batch_y = Variable(torch.FloatTensor(y_train[offset:end])) 437 | if self.use_cuda: 438 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 439 | optimizer.zero_grad() 440 | outputs = model(batch_xi, batch_xv) 441 | loss = criterion(outputs, batch_y) 442 | loss.backward() 443 | optimizer.step() 444 | train_loss, train_eval = self.eval_by_batch(Xi_train, Xv_train, y_train, x_size) 445 | if save_path: 446 | torch.save(self.state_dict(), save_path) 447 | if abs(best_train_score-train_eval) < 0.001 or \ 448 | (self.greater_is_better and train_eval > best_train_score) or \ 449 | ((not self.greater_is_better) and train_result < best_train_score): 450 | break 451 | if self.verbose: 452 | print("refit finished") 453 | 454 | def eval_by_batch(self,Xi, Xv, y, x_size): 455 | total_loss = 0.0 456 | y_pred = [] 457 | if self.use_ffm: 458 | batch_size = 16384*2 459 | else: 460 | batch_size = 16384 461 | batch_iter = x_size // batch_size 462 | criterion = F.binary_cross_entropy_with_logits 463 | model = self.eval() 464 | for i in range(batch_iter+1): 465 | offset = i * batch_size 466 | end = min(x_size, offset + batch_size) 467 | if offset == end: 468 | break 469 | batch_xi = Variable(torch.LongTensor(Xi[offset:end])) 470 | batch_xv = Variable(torch.FloatTensor(Xv[offset:end])) 471 | batch_y = Variable(torch.FloatTensor(y[offset:end])) 472 | if self.use_cuda: 473 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 474 | outputs = model(batch_xi, batch_xv) 475 | pred = F.sigmoid(outputs).cpu() 476 | y_pred.extend(pred.data.numpy()) 477 | loss = 
criterion(outputs, batch_y) 478 | total_loss += loss.data[0]*(end-offset) 479 | total_metric = self.eval_metric(y,y_pred) 480 | return total_loss/x_size, total_metric 481 | 482 | # shuffle three lists simutaneously 483 | def shuffle_in_unison_scary(self, a, b, c): 484 | rng_state = np.random.get_state() 485 | np.random.shuffle(a) 486 | np.random.set_state(rng_state) 487 | np.random.shuffle(b) 488 | np.random.set_state(rng_state) 489 | np.random.shuffle(c) 490 | 491 | def training_termination(self, valid_result): 492 | if len(valid_result) > 4: 493 | if self.greater_is_better: 494 | if valid_result[-1] < valid_result[-2] and \ 495 | valid_result[-2] < valid_result[-3] and \ 496 | valid_result[-3] < valid_result[-4]: 497 | return True 498 | else: 499 | if valid_result[-1] > valid_result[-2] and \ 500 | valid_result[-2] > valid_result[-3] and \ 501 | valid_result[-3] > valid_result[-4]: 502 | return True 503 | return False 504 | 505 | def predict(self, Xi, Xv): 506 | """ 507 | :param Xi: the same as fit function 508 | :param Xv: the same as fit function 509 | :return: output, ont-dim array 510 | """ 511 | Xi = np.array(Xi).reshape((-1,self.field_size,1)) 512 | Xi = Variable(torch.LongTensor(Xi)) 513 | Xv = Variable(torch.FloatTensor(Xv)) 514 | if self.use_cuda and torch.cuda.is_available(): 515 | Xi, Xv = Xi.cuda(), Xv.cuda() 516 | 517 | model = self.eval() 518 | pred = F.sigmoid(model(Xi, Xv)).cpu() 519 | return (pred.data.numpy() > 0.5) 520 | 521 | def predict_proba(self, Xi, Xv): 522 | Xi = np.array(Xi).reshape((-1, self.field_size, 1)) 523 | Xi = Variable(torch.LongTensor(Xi)) 524 | Xv = Variable(torch.FloatTensor(Xv)) 525 | if self.use_cuda and torch.cuda.is_available(): 526 | Xi, Xv = Xi.cuda(), Xv.cuda() 527 | 528 | model = self.eval() 529 | pred = F.sigmoid(model(Xi, Xv)).cpu() 530 | return pred.data.numpy() 531 | 532 | def inner_predict(self, Xi, Xv): 533 | """ 534 | :param Xi: tensor of feature index 535 | :param Xv: tensor of feature value 536 | :return: output, numpy 537 | """ 538 | model = self.eval() 539 | pred = F.sigmoid(model(Xi, Xv)).cpu() 540 | return (pred.data.numpy() > 0.5) 541 | 542 | def inner_predict_proba(self, Xi, Xv): 543 | """ 544 | :param Xi: tensor of feature index 545 | :param Xv: tensor of feature value 546 | :return: output, numpy 547 | """ 548 | model = self.eval() 549 | pred = F.sigmoid(model(Xi, Xv)).cpu() 550 | return pred.data.numpy() 551 | 552 | 553 | def evaluate(self, Xi, Xv, y): 554 | """ 555 | :param Xi: tensor of feature index 556 | :param Xv: tensor of feature value 557 | :param y: tensor of labels 558 | :return: metric of the evaluation 559 | """ 560 | y_pred = self.inner_predict_proba(Xi, Xv) 561 | return self.eval_metric(y.cpu().data.numpy(), y_pred) 562 | 563 | """ 564 | test part 565 | """ 566 | import sys 567 | sys.path.append('../') 568 | from utils import data_preprocess 569 | 570 | result_dict = data_preprocess.read_criteo_data('../data/train.csv', '../data/category_emb.csv') 571 | test_dict = data_preprocess.read_criteo_data('../data/test.csv', '../data/category_emb.csv') 572 | with torch.cuda.device(0): 573 | din = DIN(39, result_dict['feature_sizes'], batch_size=128 * 64, is_shallow_dropout=False, verbose=True, use_cuda=True, 574 | weight_decay=0.0000002, use_fm=True, use_ffm=False, use_high_interaction=True,interation_type=False).cuda() 575 | # din.fit(result_dict['index'], result_dict['value'], result_dict['label'], 576 | # test_dict['index'], test_dict['value'], test_dict['label'], ealry_stopping=True, pre_train=True, 577 | # 
n_epochs=32,refit=False, 578 | # save_path='../data/model/din.pkl') 579 | din.load_state_dict(torch.load('../data/model/din.pkl')) 580 | din.fit(result_dict['index'], result_dict['value'], result_dict['label'], 581 | test_dict['index'], test_dict['value'], test_dict['label'], ealry_stopping=True, pre_train=False, 582 | n_epochs=64, refit=False, 583 | save_path='../data/model/din.pkl') 584 | -------------------------------------------------------------------------------- /model/DeepFM.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | Created on Dec 10, 2017 5 | @author: jachin,Nie 6 | 7 | A pytorch implementation of deepfm 8 | 9 | Reference: 10 | [1] DeepFM: A Factorization-Machine based Neural Network for CTR Prediction, 11 | Huifeng Guo, Ruiming Tang, Yunming Yey, Zhenguo Li, Xiuqiang He. 12 | 13 | """ 14 | 15 | import os 16 | import numpy as np 17 | from sklearn.base import BaseEstimator, TransformerMixin 18 | from sklearn.metrics import roc_auc_score 19 | from time import time 20 | 21 | import torch 22 | import torch.autograd as autograd 23 | import torch.nn as nn 24 | import torch.nn.functional as F 25 | import torch.optim as optim 26 | from torch.autograd import Variable 27 | 28 | import torch.backends.cudnn 29 | 30 | """ 31 | 缃戠粶缁撴瀯閮ㄥ垎 32 | """ 33 | 34 | 35 | class DeepFM(torch.nn.Module): 36 | """ 37 | :parameter 38 | ------------- 39 | field_size: size of the feature fields 40 | feature_sizes: a field_size-dim array, sizes of the feature dictionary 41 | embedding_size: size of the feature embedding 42 | is_shallow_dropout: bool, shallow part(fm or ffm part) uses dropout or not? 43 | dropout_shallow: an array of the size of 2, example:[0.5,0.5], the first element is for the-first order part and the second element is for the second-order part 44 | h_depth: deep network's hidden layers' depth 45 | deep_layers: a h_depth-dim array, each element is the size of corresponding hidden layers. example:[32,32] h_depth = 2 46 | is_deep_dropout: bool, deep part uses dropout or not? 47 | dropout_deep: an array of dropout factors,example:[0.5,0.5,0.5] h_depth=2 48 | deep_layers_activation: relu or sigmoid etc 49 | n_epochs: epochs 50 | batch_size: batch_size 51 | learning_rate: learning_rate 52 | optimizer_type: optimizer_type, 'adam', 'rmsp', 'sgd', 'adag' 53 | is_batch_norm锛歜ool, use batch_norm or not ? 54 | verbose: verbose 55 | weight_decay: weight decay (L2 penalty) 56 | random_seed: random_seed=950104 someone's birthday, my lukcy number 57 | use_fm: bool 58 | use_ffm: bool 59 | use_deep: bool 60 | loss_type: "logloss", only 61 | eval_metric: roc_auc_score 62 | use_cuda: bool use gpu or cpu? 63 | n_class: number of classes. is bounded to 1 64 | greater_is_better: bool. Is the greater eval better? 
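Example (a minimal, illustrative sketch, not taken from this repo: the toy indices/values below follow the Xi/Xv/y formats documented under fit(), and the feature_sizes values are made up; note that the early-stopping flag is spelled 'ealry_stopping' throughout this code base):

    Xi = [[0, 1, 2], [1, 0, 2]]               # per-sample feature indices, one per field
    Xv = [[1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]   # per-sample feature values (1.0 for categorical fields)
    y  = [1, 0]                               # binary labels
    model = DeepFM(field_size=3, feature_sizes=[2, 2, 3], use_fm=True, use_deep=True, use_cuda=False, verbose=True)
    model.fit(Xi, Xv, y, ealry_stopping=False, refit=False)
    probs = model.predict(Xi, Xv)             # sigmoid scores in [0, 1]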
65 | 66 | 67 | Attention: only support logsitcs regression 68 | """ 69 | 70 | def __init__(self, field_size, feature_sizes, embedding_size=4, is_shallow_dropout=True, dropout_shallow=[0.5, 0.5], 71 | h_depth=2, deep_layers=[64, 64], is_deep_dropout=True, dropout_deep=[0.5, 0.5, 0.5], 72 | deep_layers_activation='relu', n_epochs=24, batch_size=256, learning_rate=0.003, 73 | optimizer_type='adam', is_batch_norm=False, verbose=False, random_seed=950104, weight_decay=0.0, 74 | use_fm=True, use_ffm=False, use_deep=True, loss_type='logloss', eval_metric=roc_auc_score, 75 | use_cuda=True, n_class=1, greater_is_better=True 76 | ): 77 | super(DeepFM, self).__init__() 78 | self.field_size = field_size 79 | self.feature_sizes = feature_sizes 80 | self.embedding_size = embedding_size 81 | self.is_shallow_dropout = is_shallow_dropout 82 | self.dropout_shallow = dropout_shallow 83 | self.h_depth = h_depth 84 | self.deep_layers = deep_layers 85 | self.is_deep_dropout = is_deep_dropout 86 | self.dropout_deep = dropout_deep 87 | self.deep_layers_activation = deep_layers_activation 88 | self.n_epochs = n_epochs 89 | self.batch_size = batch_size 90 | self.learning_rate = learning_rate 91 | self.optimizer_type = optimizer_type 92 | self.is_batch_norm = is_batch_norm 93 | self.verbose = verbose 94 | self.weight_decay = weight_decay 95 | self.random_seed = random_seed 96 | self.use_fm = use_fm 97 | self.use_ffm = use_ffm 98 | self.use_deep = use_deep 99 | self.loss_type = loss_type 100 | self.eval_metric = eval_metric 101 | self.use_cuda = use_cuda 102 | self.n_class = n_class 103 | self.greater_is_better = greater_is_better 104 | 105 | torch.manual_seed(self.random_seed) 106 | 107 | """ 108 | check cuda 109 | """ 110 | if self.use_cuda and not torch.cuda.is_available(): 111 | self.use_cuda = False 112 | print("Cuda is not available, automatically changed into cpu model") 113 | 114 | """ 115 | check use fm or ffm 116 | """ 117 | if self.use_fm and self.use_ffm: 118 | print("only support one type only, please make sure to choose only fm or ffm part") 119 | exit(1) 120 | elif self.use_fm and self.use_deep: 121 | print("The model is deepfm(fm+deep layers)") 122 | elif self.use_ffm and self.use_deep: 123 | print("The model is deepffm(ffm+deep layers)") 124 | elif self.use_fm: 125 | print("The model is fm only") 126 | elif self.use_ffm: 127 | print("The model is ffm only") 128 | elif self.use_deep: 129 | print("The model is deep layers only") 130 | else: 131 | print("You have to choose more than one of (fm, ffm, deep) models to use") 132 | exit(1) 133 | 134 | """ 135 | bias 136 | """ 137 | if self.use_fm or self.use_ffm: 138 | self.bias = torch.nn.Parameter(torch.randn(1)) 139 | """ 140 | fm part 141 | """ 142 | if self.use_fm: 143 | print("Init fm part") 144 | self.fm_first_order_embeddings = nn.ModuleList( 145 | [nn.Embedding(feature_size, 1) for feature_size in self.feature_sizes]) 146 | if self.dropout_shallow: 147 | self.fm_first_order_dropout = nn.Dropout(self.dropout_shallow[0]) 148 | self.fm_second_order_embeddings = nn.ModuleList( 149 | [nn.Embedding(feature_size, self.embedding_size) for feature_size in self.feature_sizes]) 150 | if self.dropout_shallow: 151 | self.fm_second_order_dropout = nn.Dropout(self.dropout_shallow[1]) 152 | print("Init fm part succeed") 153 | 154 | """ 155 | ffm part 156 | """ 157 | if self.use_ffm: 158 | print("Init ffm part") 159 | self.ffm_first_order_embeddings = nn.ModuleList( 160 | [nn.Embedding(feature_size, 1) for feature_size in self.feature_sizes]) 161 | if 
self.dropout_shallow: 162 | self.ffm_first_order_dropout = nn.Dropout(self.dropout_shallow[0]) 163 | self.ffm_second_order_embeddings = nn.ModuleList( 164 | [nn.ModuleList([nn.Embedding(feature_size, self.embedding_size) for i in range(self.field_size)]) for 165 | feature_size in self.feature_sizes]) 166 | if self.dropout_shallow: 167 | self.ffm_second_order_dropout = nn.Dropout(self.dropout_shallow[1]) 168 | print("Init ffm part succeed") 169 | 170 | """ 171 | deep part 172 | """ 173 | if self.use_deep: 174 | print("Init deep part") 175 | if not self.use_fm and not self.use_ffm: 176 | self.fm_second_order_embeddings = nn.ModuleList( 177 | [nn.Embedding(feature_size, self.embedding_size) for feature_size in self.feature_sizes]) 178 | 179 | if self.is_deep_dropout: 180 | self.linear_0_dropout = nn.Dropout(self.dropout_deep[0]) 181 | 182 | self.linear_1 = nn.Linear(self.field_size * self.embedding_size, deep_layers[0]) 183 | if self.is_batch_norm: 184 | self.batch_norm_1 = nn.BatchNorm1d(deep_layers[0]) 185 | if self.is_deep_dropout: 186 | self.linear_1_dropout = nn.Dropout(self.dropout_deep[1]) 187 | for i, h in enumerate(self.deep_layers[1:], 1): 188 | setattr(self, 'linear_' + str(i + 1), nn.Linear(self.deep_layers[i - 1], self.deep_layers[i])) 189 | if self.is_batch_norm: 190 | setattr(self, 'batch_norm_' + str(i + 1), nn.BatchNorm1d(deep_layers[i])) 191 | if self.is_deep_dropout: 192 | setattr(self, 'linear_' + str(i + 1) + '_dropout', nn.Dropout(self.dropout_deep[i + 1])) 193 | 194 | print("Init deep part succeed") 195 | 196 | print("Init succeed") 197 | 198 | def forward(self, Xi, Xv): 199 | """ 200 | :param Xi_train: index input tensor, batch_size * k * 1 201 | :param Xv_train: value input tensor, batch_size * k * 1 202 | :return: the last output 203 | """ 204 | """ 205 | fm part 206 | """ 207 | if self.use_fm: 208 | fm_first_order_emb_arr = [(torch.sum(emb(Xi[:, i, :]), 1).t() * Xv[:, i]).t() for i, emb in 209 | enumerate(self.fm_first_order_embeddings)] 210 | fm_first_order = torch.cat(fm_first_order_emb_arr, 1) 211 | if self.is_shallow_dropout: 212 | fm_first_order = self.fm_first_order_dropout(fm_first_order) 213 | 214 | # use 2xy = (x+y)^2 - x^2 - y^2 reduce calculation 215 | fm_second_order_emb_arr = [(torch.sum(emb(Xi[:, i, :]), 1).t() * Xv[:, i]).t() for i, emb in 216 | enumerate(self.fm_second_order_embeddings)] 217 | fm_sum_second_order_emb = sum(fm_second_order_emb_arr) 218 | fm_sum_second_order_emb_square = fm_sum_second_order_emb * fm_sum_second_order_emb # (x+y)^2 219 | fm_second_order_emb_square = [item * item for item in fm_second_order_emb_arr] 220 | fm_second_order_emb_square_sum = sum(fm_second_order_emb_square) # x^2+y^2 221 | fm_second_order = (fm_sum_second_order_emb_square - fm_second_order_emb_square_sum) * 0.5 222 | if self.is_shallow_dropout: 223 | fm_second_order = self.fm_second_order_dropout(fm_second_order) 224 | 225 | """ 226 | ffm part 227 | """ 228 | if self.use_ffm: 229 | ffm_first_order_emb_arr = [(torch.sum(emb(Xi[:, i, :]), 1).t() * Xv[:, i]).t() for i, emb in 230 | enumerate(self.ffm_first_order_embeddings)] 231 | ffm_first_order = torch.cat(ffm_first_order_emb_arr, 1) 232 | if self.is_shallow_dropout: 233 | ffm_first_order = self.ffm_first_order_dropout(ffm_first_order) 234 | ffm_second_order_emb_arr = [[(torch.sum(emb(Xi[:, i, :]), 1).t() * Xv[:, i]).t() for emb in f_embs] for 235 | i, f_embs in enumerate(self.ffm_second_order_embeddings)] 236 | ffm_wij_arr = [] 237 | for i in range(self.field_size): 238 | for j in range(i + 1, 
self.field_size): 239 | ffm_wij_arr.append(ffm_second_order_emb_arr[i][j] * ffm_second_order_emb_arr[j][i]) 240 | ffm_second_order = sum(ffm_wij_arr) 241 | if self.is_shallow_dropout: 242 | ffm_second_order = self.ffm_second_order_dropout(ffm_second_order) 243 | 244 | """ 245 | deep part 246 | """ 247 | if self.use_deep: 248 | if self.use_fm: 249 | deep_emb = torch.cat(fm_second_order_emb_arr, 1) 250 | elif self.use_ffm: 251 | deep_emb = torch.cat([sum(ffm_second_order_embs) for ffm_second_order_embs in ffm_second_order_emb_arr], 252 | 1) 253 | else: 254 | deep_emb = torch.cat([(torch.sum(emb(Xi[:, i, :]), 1).t() * Xv[:, i]).t() for i, emb in 255 | enumerate(self.fm_second_order_embeddings)], 1) 256 | 257 | if self.deep_layers_activation == 'sigmoid': 258 | activation = F.sigmoid 259 | elif self.deep_layers_activation == 'tanh': 260 | activation = F.tanh 261 | else: 262 | activation = F.relu 263 | if self.is_deep_dropout: 264 | deep_emb = self.linear_0_dropout(deep_emb) 265 | x_deep = self.linear_1(deep_emb) 266 | if self.is_batch_norm: 267 | x_deep = self.batch_norm_1(x_deep) 268 | x_deep = activation(x_deep) 269 | if self.is_deep_dropout: 270 | x_deep = self.linear_1_dropout(x_deep) 271 | for i in range(1, len(self.deep_layers)): 272 | x_deep = getattr(self, 'linear_' + str(i + 1))(x_deep) 273 | if self.is_batch_norm: 274 | x_deep = getattr(self, 'batch_norm_' + str(i + 1))(x_deep) 275 | x_deep = activation(x_deep) 276 | if self.is_deep_dropout: 277 | x_deep = getattr(self, 'linear_' + str(i + 1) + '_dropout')(x_deep) 278 | """ 279 | sum 280 | """ 281 | if self.use_fm and self.use_deep: 282 | total_sum = torch.sum(fm_first_order, 1) + torch.sum(fm_second_order, 1) + torch.sum(x_deep, 1) + self.bias 283 | elif self.use_ffm and self.use_deep: 284 | total_sum = torch.sum(ffm_first_order, 1) + torch.sum(ffm_second_order, 1) + torch.sum(x_deep, 285 | 1) + self.bias 286 | elif self.use_fm: 287 | total_sum = torch.sum(fm_first_order, 1) + torch.sum(fm_second_order, 1) + self.bias 288 | elif self.use_ffm: 289 | total_sum = torch.sum(ffm_first_order, 1) + torch.sum(ffm_second_order, 1) + self.bias 290 | else: 291 | total_sum = torch.sum(x_deep, 1) 292 | return total_sum 293 | 294 | def fit(self, Xi_train, Xv_train, y_train, Xi_valid=None, Xv_valid=None, 295 | y_valid=None, ealry_stopping=False, refit=False, save_path=None): 296 | """ 297 | :param Xi_train: [[ind1_1, ind1_2, ...], [ind2_1, ind2_2, ...], ..., [indi_1, indi_2, ..., indi_j, ...], ...] 298 | indi_j is the feature index of feature field j of sample i in the training set 299 | :param Xv_train: [[val1_1, val1_2, ...], [val2_1, val2_2, ...], ..., [vali_1, vali_2, ..., vali_j, ...], ...] 
300 | vali_j is the feature value of feature field j of sample i in the training set 301 | vali_j can be either binary (1/0, for binary/categorical features) or float (e.g., 10.24, for numerical features) 302 | :param y_train: label of each sample in the training set 303 | :param Xi_valid: list of list of feature indices of each sample in the validation set 304 | :param Xv_valid: list of list of feature values of each sample in the validation set 305 | :param y_valid: label of each sample in the validation set 306 | :param ealry_stopping: perform early stopping or not 307 | :param refit: refit the model on the train+valid dataset or not 308 | :param save_path: the path to save the model 309 | :return: 310 | """ 311 | """ 312 | pre_process 313 | """ 314 | if save_path and not os.path.exists('/'.join(save_path.split('/')[0:-1])): 315 | print("Save path is not existed!") 316 | return 317 | 318 | if self.verbose: 319 | print("pre_process data ing...") 320 | is_valid = False 321 | Xi_train = np.array(Xi_train) 322 | Xi_train = Xi_train.reshape((-1, self.field_size, 1)) 323 | Xv_train = np.array(Xv_train) 324 | y_train = np.array(y_train) 325 | x_size = Xi_train.shape[0] 326 | if Xi_valid: 327 | Xi_valid = np.array(Xi_valid).reshape((-1, self.field_size, 1)) 328 | Xv_valid = np.array(Xv_valid) 329 | y_valid = np.array(y_valid) 330 | x_valid_size = Xi_valid.shape[0] 331 | is_valid = True 332 | if self.verbose: 333 | print("pre_process data finished") 334 | 335 | """ 336 | train model 337 | """ 338 | model = self.train() 339 | if torch.cuda.device_count() > 1 and self.use_cuda: 340 | print("Let's use", torch.cuda.device_count(), "GPUs!") 341 | model = torch.nn.DataParallel(model.cuda()) 342 | optimizer = torch.optim.SGD(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 343 | if self.optimizer_type == 'adam': 344 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 345 | elif self.optimizer_type == 'rmsp': 346 | optimizer = torch.optim.RMSprop(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 347 | elif self.optimizer_type == 'adag': 348 | optimizer = torch.optim.Adagrad(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 349 | 350 | criterion = F.binary_cross_entropy_with_logits 351 | 352 | train_result = [] 353 | valid_result = [] 354 | for epoch in range(self.n_epochs): 355 | total_loss = 0.0 356 | batch_iter = x_size // self.batch_size 357 | epoch_begin_time = time() 358 | batch_begin_time = time() 359 | self.shuffle_in_unison_scary(Xi_train, Xv_train, y_train) 360 | for i in range(batch_iter + 1): 361 | offset = i * self.batch_size 362 | end = min(x_size, offset + self.batch_size) 363 | if offset == end: 364 | break 365 | #if i == 10000: 366 | #break 367 | batch_xi = Variable(torch.LongTensor(Xi_train[offset:end])) 368 | batch_xv = Variable(torch.FloatTensor(Xv_train[offset:end])) 369 | batch_y = Variable(torch.FloatTensor(y_train[offset:end])) 370 | 371 | if self.use_cuda: 372 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 373 | optimizer.zero_grad() 374 | outputs = model(batch_xi, batch_xv) 375 | loss = criterion(outputs, batch_y) 376 | loss.backward() 377 | optimizer.step() 378 | 379 | total_loss += loss.item() 380 | if self.verbose: 381 | if i % 100 == 99: # print every 100 mini-batches 382 | pre=outputs.cpu().data.numpy() 383 | y=batch_y.cpu().data.numpy() 384 | roc=self.roc(y,pre) 385 | print('[epoch,batch]: [%d, %5d] train_loss: %.6f 
train_metric: %.6f time: %.1f s' % 386 | (epoch + 1, i + 1, total_loss / 100.0, roc, time() - batch_begin_time)) 387 | total_loss = 0.0 388 | batch_begin_time = time() 389 | else: 390 | #pre = outputs.cpu().data.numpy() 391 | #y = batch_y.cpu().data.numpy() 392 | #roc = self.roc(y, pre) 393 | roc=-1 394 | print('[epoch,batch]: [%d, %5d] train_loss: %.6f train_metric: %.6f time: %.1f s' % 395 | (epoch + 1, i + 1, loss.item(), roc, time() - batch_begin_time)) 396 | batch_begin_time = time() 397 | 398 | train_loss, train_eval = self.eval_by_batch(Xi_train, Xv_train, y_train, x_size) 399 | train_result.append(train_eval) 400 | print('*' * 50) 401 | print('epoch_result:[%d] train_loss: %.6f train_metric: %.6f time: %.1f s' % 402 | (epoch + 1, train_loss, train_eval, time() - epoch_begin_time)) 403 | print('*' * 50) 404 | 405 | if is_valid: 406 | valid_loss, valid_eval = self.eval_by_batch(Xi_valid, Xv_valid, y_valid, x_valid_size) 407 | valid_result.append(valid_eval) 408 | print('*' * 50) 409 | print('epoch_result:[%d] val_loss: %.6f val_metric: %.6f time: %.1f s' % 410 | (epoch + 1, valid_loss, valid_eval, time() - epoch_begin_time)) 411 | print('*' * 50) 412 | if save_path: 413 | torch.save(self.state_dict(), save_path) 414 | if is_valid and ealry_stopping and self.training_termination(valid_result): 415 | print("early stop at [%d] epoch!" % (epoch + 1)) 416 | break 417 | 418 | # fit a few more epoch on train+valid until result reaches the best_train_score 419 | if is_valid and refit: 420 | if self.verbose: 421 | print("refitting the model") 422 | if self.greater_is_better: 423 | best_epoch = np.argmax(valid_result) 424 | else: 425 | best_epoch = np.argmin(valid_result) 426 | best_train_score = train_result[best_epoch] 427 | Xi_train = np.concatenate((Xi_train, Xi_valid)) 428 | Xv_train = np.concatenate((Xv_train, Xv_valid)) 429 | y_train = np.concatenate((y_train, y_valid)) 430 | x_size = x_size + x_valid_size 431 | self.shuffle_in_unison_scary(Xi_train, Xv_train, y_train) 432 | for epoch in range(64): 433 | batch_iter = x_size // self.batch_size 434 | for i in range(batch_iter + 1): 435 | offset = i * self.batch_size 436 | end = min(x_size, offset + self.batch_size) 437 | if offset == end: 438 | break 439 | batch_xi = Variable(torch.LongTensor(Xi_train[offset:end])) 440 | batch_xv = Variable(torch.FloatTensor(Xv_train[offset:end])) 441 | batch_y = Variable(torch.FloatTensor(y_train[offset:end])) 442 | 443 | if self.use_cuda: 444 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 445 | optimizer.zero_grad() 446 | outputs = model(batch_xi, batch_xv) 447 | loss = criterion(outputs, batch_y) 448 | loss.backward() 449 | optimizer.step() 450 | train_loss, train_eval = self.eval_by_batch(Xi_train, Xv_train, y_train, x_size) 451 | if save_path: 452 | torch.save(self.state_dict(), save_path) 453 | if abs(best_train_score - train_eval) < 0.001 or \ 454 | (self.greater_is_better and train_eval > best_train_score) or \ 455 | ((not self.greater_is_better) and train_result < best_train_score): 456 | break 457 | if self.verbose: 458 | print("refit finished") 459 | 460 | def eval_by_batch(self, Xi, Xv, y, x_size): 461 | total_loss = 0.0 462 | y_pred = [] 463 | y1=[] 464 | if self.use_ffm: 465 | batch_size = self.batch_size 466 | else: 467 | batch_size = self.batch_size 468 | batch_iter = x_size // batch_size 469 | criterion = F.binary_cross_entropy_with_logits 470 | model = self.eval() 471 | 472 | for i in range(batch_iter + 1): 473 | offset = i * batch_size 474 | end = 
min(x_size, offset + batch_size) 475 | if offset == end: 476 | break 477 | if i==10: 478 | break 479 | batch_xi = Variable(torch.LongTensor(Xi[offset:end])) 480 | batch_xv = Variable(torch.FloatTensor(Xv[offset:end])) 481 | batch_y = Variable(torch.FloatTensor(y[offset:end])) 482 | 483 | if self.use_cuda: 484 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 485 | outputs = model(batch_xi, batch_xv) 486 | loss = criterion(outputs, batch_y) 487 | pred = F.sigmoid(outputs).cpu() 488 | y_pred.extend(pred.cpu().data.numpy()) 489 | y1.extend(batch_y.cpu().data.numpy()) 490 | total_loss += loss.item() 491 | roc=self.roc(y1,y_pred) 492 | return total_loss / 10, roc 493 | 494 | # shuffle three lists simutaneously 495 | def shuffle_in_unison_scary(self, a, b, c): 496 | rng_state = np.random.get_state() 497 | np.random.shuffle(a) 498 | np.random.set_state(rng_state) 499 | np.random.shuffle(b) 500 | np.random.set_state(rng_state) 501 | np.random.shuffle(c) 502 | 503 | def training_termination(self, valid_result): 504 | if len(valid_result) > 4: 505 | if self.greater_is_better: 506 | if valid_result[-1] < valid_result[-2] and \ 507 | valid_result[-2] < valid_result[-3] and \ 508 | valid_result[-3] < valid_result[-4]: 509 | return True 510 | else: 511 | if valid_result[-1] > valid_result[-2] and \ 512 | valid_result[-2] > valid_result[-3] and \ 513 | valid_result[-3] > valid_result[-4]: 514 | return True 515 | return False 516 | 517 | def predict_from_model_file(self, Xi, Xv,model,path): 518 | """ 519 | :param Xi: the same as fit function 520 | :param Xv: the same as fit function 521 | :return: output, ont-dim array 522 | """ 523 | state=torch.load(path) 524 | model.load_state_dict(state) 525 | Xi = np.array(Xi).reshape((-1, self.field_size, 1)) 526 | Xi = Variable(torch.LongTensor(Xi)) 527 | Xv = Variable(torch.FloatTensor(Xv)) 528 | if self.use_cuda and torch.cuda.is_available(): 529 | Xi, Xv = Xi.cuda(), Xv.cuda() 530 | 531 | pred = F.sigmoid(model(Xi, Xv)).cpu().data.numpy() 532 | return pred 533 | 534 | def predict(self, Xi, Xv): 535 | Xi = np.array(Xi).reshape((-1, self.field_size, 1)) 536 | Xi = Variable(torch.LongTensor(Xi)) 537 | Xv = Variable(torch.FloatTensor(Xv)) 538 | if self.use_cuda and torch.cuda.is_available(): 539 | Xi, Xv = Xi.cuda(), Xv.cuda() 540 | 541 | model = self.eval() 542 | pred = F.sigmoid(model(Xi, Xv)).cpu() 543 | return pred.data.numpy() 544 | 545 | def inner_predict(self, Xi, Xv): 546 | """ 547 | :param Xi: tensor of feature index 548 | :param Xv: tensor of feature value 549 | :return: output, numpy 550 | """ 551 | model = self.eval() 552 | pred = F.sigmoid(model(Xi, Xv)).cpu() 553 | return (pred.data.numpy() > 0.5) 554 | 555 | def inner_predict_proba(self, Xi, Xv): 556 | """ 557 | :param Xi: tensor of feature index 558 | :param Xv: tensor of feature value 559 | :return: output, numpy 560 | """ 561 | model = self.eval() 562 | pred = F.sigmoid(model(Xi, Xv)).cpu() 563 | return pred.data.numpy() 564 | 565 | def roc(self, y,pre): 566 | """ 567 | :param Xi: tensor of feature index 568 | :param Xv: tensor of feature value 569 | :param y: tensor of labels 570 | :return: metric of the evaluation 571 | """ 572 | total_metric=0 573 | if len(set(y)) == 2: 574 | total_metric = roc_auc_score(y, pre) 575 | else: 576 | total_metric = -1 577 | return total_metric 578 | -------------------------------------------------------------------------------- /model/FNN.py: -------------------------------------------------------------------------------- 1 | # -*- 
coding:utf-8 -*- 2 | 3 | """ 4 | Created on Dec 10, 2017 5 | @author: jachin,Nie 6 | 7 | A pytorch implementation of FNN 8 | 9 | Reference: 10 | [1] Deep Learning over Multi-field Categorical Data: A Case Study on User Response Prediction 11 | 12 | Weinan Zhang, Tianming Du, Jun Wang 13 | 14 | """ 15 | import os 16 | import numpy as np 17 | from sklearn.base import BaseEstimator, TransformerMixin 18 | from sklearn.metrics import roc_auc_score 19 | from time import time 20 | 21 | import torch 22 | import torch.autograd as autograd 23 | import torch.nn as nn 24 | import torch.nn.functional as F 25 | import torch.optim as optim 26 | from torch.autograd import Variable 27 | 28 | import torch.backends.cudnn 29 | 30 | """ 31 | 网络结构部分 32 | """ 33 | 34 | class FNN(torch.nn.Module): 35 | """ 36 | :parameter 37 | ------------- 38 | field_size: size of the feature fields 39 | feature_sizes: a field_size-dim array, sizes of the feature dictionary 40 | embedding_size: size of the feature embedding 41 | h_depth: deep network's hidden layers' depth 42 | deep_layers: a h_depth-dim array, each element is the size of corresponding hidden layers. example:[32,32] h_depth = 2 43 | is_deep_dropout: bool, deep part uses dropout or not? 44 | dropout_deep: an array of dropout factors,example:[0.5,0.5,0.5] h_depth=2 45 | deep_layers_activation: relu or sigmoid etc 46 | n_epochs: epochs 47 | batch_size: batch_size 48 | learning_rate: learning_rate 49 | optimizer_type: optimizer_type, 'adam', 'rmsp', 'sgd', 'adag' 50 | is_batch_norm:bool, use batch_norm or not ? 51 | verbose: verbose 52 | pre_weight_decay: pretrain 's weight decay (L2 penalty) 53 | weight_decay: weight decay (L2 penalty) 54 | random_seed: random_seed=950104 someone's birthday, my lukcy number 55 | use_fm: bool 56 | use_ffm: bool 57 | loss_type: "logloss", only 58 | eval_metric: roc_auc_score 59 | use_cuda: bool use gpu or cpu? 60 | n_class: number of classes. is bounded to 1 61 | greater_is_better: bool. Is the greater eval better? 
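Example (a minimal, illustrative sketch with made-up toy data; the two-stage call mirrors the test code at the bottom of this file and assumes the legacy PyTorch API this repo targets):

    Xi = [[0, 1], [1, 0]]           # per-sample feature indices, one per field
    Xv = [[1.0, 1.0], [1.0, 1.0]]   # per-sample feature values
    y  = [1, 0]
    fnn = FNN(field_size=2, feature_sizes=[2, 2], use_fm=True, use_ffm=False, use_cuda=False)
    fnn.fit(Xi, Xv, y, is_pretrain=True)    # stage 1: pre-train the FM embeddings
    fnn.fit(Xi, Xv, y, is_pretrain=False)   # stage 2: train the deep layers on top of them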
62 | 63 | 64 | Attention: only support logsitcs regression 65 | """ 66 | 67 | def __init__(self, field_size, feature_sizes, embedding_size=4, 68 | h_depth=2, deep_layers=[32, 32], is_deep_dropout=True, dropout_deep=[0.5, 0.5, 0.5], 69 | deep_layers_activation='tanh', n_epochs=64, batch_size=256, learning_rate=0.003, 70 | optimizer_type='adam', is_batch_norm=False, verbose=False, random_seed=950104, pre_weight_decay= 0.0,weight_decay=0.0, 71 | use_fm=True, use_ffm=False, loss_type='logloss', eval_metric=roc_auc_score, 72 | use_cuda=True, n_class=1, greater_is_better=True 73 | ): 74 | super(FNN, self).__init__() 75 | self.field_size = field_size 76 | self.feature_sizes = feature_sizes 77 | self.embedding_size = embedding_size 78 | self.h_depth = h_depth 79 | self.deep_layers = deep_layers 80 | self.is_deep_dropout = is_deep_dropout 81 | self.dropout_deep = dropout_deep 82 | self.deep_layers_activation = deep_layers_activation 83 | self.n_epochs = n_epochs 84 | self.batch_size = batch_size 85 | self.learning_rate = learning_rate 86 | self.optimizer_type = optimizer_type 87 | self.is_batch_norm = is_batch_norm 88 | self.verbose = verbose 89 | self.pre_weight_decay = pre_weight_decay 90 | self.weight_decay = weight_decay 91 | self.random_seed = random_seed 92 | self.use_fm = use_fm 93 | self.use_ffm = use_ffm 94 | self.loss_type = loss_type 95 | self.eval_metric = eval_metric 96 | self.use_cuda = use_cuda 97 | self.n_class = n_class 98 | self.greater_is_better = greater_is_better 99 | self.pretrain = False 100 | 101 | torch.manual_seed(self.random_seed) 102 | 103 | """ 104 | check cuda 105 | """ 106 | if self.use_cuda and not torch.cuda.is_available(): 107 | self.use_cuda = False 108 | print("Cuda is not available, automatically changed into cpu model") 109 | 110 | """ 111 | check use fm or ffm 112 | """ 113 | if self.use_fm and self.use_ffm: 114 | print("only support one type only, please make sure to choose only fm or ffm part") 115 | exit(1) 116 | elif self.use_fm: 117 | print("The model is FNN(fm+nn layers)") 118 | elif self.use_ffm: 119 | print("The model is FFNN(ffm+nn layers)") 120 | else: 121 | print("You have to choose more than one of (fm, ffm, deep) models to use") 122 | exit(1) 123 | 124 | """ 125 | fm part 126 | """ 127 | if self.use_fm: 128 | print("Init fm part") 129 | self.fm_bias = torch.nn.Parameter(torch.randn(1), requires_grad=True) #w0 130 | self.fm_first_order_embeddings = nn.ModuleList([nn.Embedding(feature_size,1) for feature_size in self.feature_sizes]) #wi 131 | self.fm_second_order_embeddings = nn.ModuleList([nn.Embedding(feature_size, self.embedding_size) for feature_size in self.feature_sizes]) #vi 132 | print("Init fm part succeed") 133 | 134 | """ 135 | ffm part 136 | """ 137 | if self.use_ffm: 138 | print("Init ffm part") 139 | self.ffm_bias = torch.nn.Parameter(torch.randn(1), requires_grad=True) 140 | self.ffm_first_order_embeddings = nn.ModuleList([nn.Embedding(feature_size,1) for feature_size in self.feature_sizes]) 141 | self.ffm_second_order_embeddings = nn.ModuleList([nn.ModuleList([nn.Embedding(feature_size, self.embedding_size) for i in range(self.field_size)]) for feature_size in self.feature_sizes]) 142 | print("Init ffm part succeed") 143 | 144 | print("Init nn part") 145 | if self.is_deep_dropout: 146 | self.linear_0_dropout = nn.Dropout(self.dropout_deep[0]) 147 | if not use_ffm: 148 | self.linear_1 = nn.Linear(1 + self.field_size + self.field_size * self.embedding_size, deep_layers[0]) 149 | else: 150 | self.linear_1 = nn.Linear(1 + self.field_size 
+ self.field_size * self.field_size * self.embedding_size, deep_layers[0]) 151 | 152 | if self.is_batch_norm: 153 | self.batch_norm_1 = nn.BatchNorm1d(deep_layers[0]) 154 | 155 | if self.is_deep_dropout: 156 | self.linear_1_dropout = nn.Dropout(self.dropout_deep[1]) 157 | for i, h in enumerate(self.deep_layers[1:], 1): 158 | setattr(self, 'linear_' + str(i + 1), nn.Linear(self.deep_layers[i - 1], self.deep_layers[i])) 159 | if self.is_batch_norm: 160 | setattr(self, 'batch_norm_' + str(i + 1), nn.BatchNorm1d(deep_layers[i])) 161 | if self.is_deep_dropout: 162 | setattr(self, 'linear_' + str(i + 1) + '_dropout', nn.Dropout(self.dropout_deep[i + 1])) 163 | self.deep_last_layer = nn.Linear(self.deep_layers[-1], self.n_class) 164 | print("Init nn part succeed") 165 | 166 | print("Init succeed") 167 | 168 | def forward(self, Xi, Xv): 169 | """ 170 | :param Xi: index input tensor, batch_size * k * 1 171 | :param Xv: value input tensor, batch_size * k * 1 172 | :param is_pretrain: taken from self.pretrain (set in fit); decides whether to run the fm/ffm pre-training pass or the full network 173 | :return: the last output 174 | """ 175 | if self.pretrain and self.use_fm: 176 | fm_first_order_emb_arr = [(torch.sum(emb(Xi[:,i,:]),1).t()*Xv[:,i]).t() for i, emb in enumerate(self.fm_first_order_embeddings)] 177 | fm_first_order_sum = torch.sum(sum(fm_first_order_emb_arr),1) 178 | fm_second_order_emb_arr = [(torch.sum(emb(Xi[:,i,:]),1).t()*Xv[:,i]).t() for i, emb in enumerate(self.fm_second_order_embeddings)] 179 | fm_sum_second_order_emb = sum(fm_second_order_emb_arr) 180 | fm_sum_second_order_emb_square = fm_sum_second_order_emb*fm_sum_second_order_emb # (x+y)^2 181 | fm_second_order_emb_square = [item*item for item in fm_second_order_emb_arr] 182 | fm_second_order_emb_square_sum = sum(fm_second_order_emb_square) #x^2+y^2 183 | fm_second_order = (fm_sum_second_order_emb_square - fm_second_order_emb_square_sum) * 0.5 184 | fm_second_order_sum = torch.sum(fm_second_order,1) 185 | return self.fm_bias+fm_first_order_sum+fm_second_order_sum 186 | elif self.pretrain and self.use_ffm: 187 | ffm_first_order_emb_arr = [(torch.sum(emb(Xi[:,i,:]),1).t()*Xv[:,i]).t() for i, emb in enumerate(self.ffm_first_order_embeddings)] 188 | sum_ = torch.sum(sum(ffm_first_order_emb_arr),1) 189 | ffm_second_order_emb_arr = [[(torch.sum(emb(Xi[:,i,:]), 1).t() * Xv[:,i]).t() for emb in f_embs] for i, f_embs in enumerate(self.ffm_second_order_embeddings)] 190 | for i in range(self.field_size): 191 | for j in range(i+1, self.field_size): 192 | sum_ += torch.sum((ffm_second_order_emb_arr[i][j]*ffm_second_order_emb_arr[j][i]),1) 193 | return self.ffm_bias + sum_ 194 | elif not self.pretrain and self.use_fm: 195 | fm_first_order_emb_arr = [(torch.sum(emb(Xi[:,i,:]),1).t()*Xv[:,i]).t() for i, emb in enumerate(self.fm_first_order_embeddings)] 196 | fm_second_order_emb_arr = [(torch.sum(emb(Xi[:, i, :]), 1).t() * Xv[:, i]).t() for i, emb in enumerate(self.fm_second_order_embeddings)] 197 | fm_first_order = torch.cat(fm_first_order_emb_arr,1) 198 | fm_second_order = torch.cat(fm_second_order_emb_arr,1) 199 | if self.use_cuda: 200 | fm_bias = self.fm_bias * Variable(torch.ones(Xi.data.shape[0],1)).cuda() 201 | else: 202 | fm_bias = self.fm_bias * Variable(torch.ones(Xi.data.shape[0], 1)) 203 | deep_emb = torch.cat([fm_bias,fm_first_order,fm_second_order],1) 204 | if self.deep_layers_activation == 'sigmoid': 205 | activation = F.sigmoid 206 | elif self.deep_layers_activation == 'tanh': 207 | activation = F.tanh 208 | else: 209 | activation = F.relu 210 | if self.is_deep_dropout: 211 | deep_emb =
self.linear_0_dropout(deep_emb) 212 | x_deep = self.linear_1(deep_emb) 213 | if self.is_batch_norm: 214 | x_deep = self.batch_norm_1(x_deep) 215 | x_deep = activation(x_deep) 216 | if self.is_deep_dropout: 217 | x_deep = self.linear_1_dropout(x_deep) 218 | for i in range(1, len(self.deep_layers)): 219 | x_deep = getattr(self, 'linear_' + str(i + 1))(x_deep) 220 | if self.is_batch_norm: 221 | x_deep = getattr(self, 'batch_norm_' + str(i + 1))(x_deep) 222 | x_deep = activation(x_deep) 223 | if self.is_deep_dropout: 224 | x_deep = getattr(self, 'linear_' + str(i + 1) + '_dropout')(x_deep) 225 | x_deep = self.deep_last_layer(x_deep) 226 | return torch.sum(x_deep,1) 227 | else: 228 | ffm_first_order_emb_arr = [(torch.sum(emb(Xi[:, i, :]), 1).t() * Xv[:, i]).t() for i, emb in enumerate(self.ffm_first_order_embeddings)] 229 | ffm_second_order_emb_arr = [torch.cat([(torch.sum(emb(Xi[:, i, :]), 1).t() * Xv[:, i]).t() for emb in f_embs],1) for 230 | i, f_embs in enumerate(self.ffm_second_order_embeddings)] 231 | ffm_first_order = torch.cat(ffm_first_order_emb_arr,1) 232 | ffm_second_order = torch.cat(ffm_second_order_emb_arr,1) 233 | if self.use_cuda: 234 | ffm_bias = self.ffm_bias * Variable(torch.ones(Xi.data.shape[0], 1)).cuda() 235 | else: 236 | ffm_bias = self.ffm_bias * Variable(torch.ones(Xi.data.shape[0], 1)) 237 | deep_emb = torch.cat([ffm_bias, ffm_first_order, ffm_second_order], 1) 238 | if self.deep_layers_activation == 'sigmoid': 239 | activation = F.sigmoid 240 | elif self.deep_layers_activation == 'tanh': 241 | activation = F.tanh 242 | else: 243 | activation = F.relu 244 | if self.is_deep_dropout: 245 | deep_emb = self.linear_0_dropout(deep_emb) 246 | x_deep = self.linear_1(deep_emb) 247 | if self.is_batch_norm: 248 | x_deep = self.batch_norm_1(x_deep) 249 | x_deep = activation(x_deep) 250 | if self.is_deep_dropout: 251 | x_deep = self.linear_1_dropout(x_deep) 252 | for i in range(1, len(self.deep_layers)): 253 | x_deep = getattr(self, 'linear_' + str(i + 1))(x_deep) 254 | if self.is_batch_norm: 255 | x_deep = getattr(self, 'batch_norm_' + str(i + 1))(x_deep) 256 | x_deep = activation(x_deep) 257 | if self.is_deep_dropout: 258 | x_deep = getattr(self, 'linear_' + str(i + 1) + '_dropout')(x_deep) 259 | x_deep = self.deep_last_layer(x_deep) 260 | return torch.sum(x_deep,1) 261 | 262 | def fit(self, Xi_train, Xv_train, y_train, Xi_valid=None, Xv_valid=None, 263 | y_valid = None, is_pretrain = False, ealry_stopping=False, refit=False, save_path = None): 264 | """ 265 | :param Xi_train: [[ind1_1, ind1_2, ...], [ind2_1, ind2_2, ...], ..., [indi_1, indi_2, ..., indi_j, ...], ...] 266 | indi_j is the feature index of feature field j of sample i in the training set 267 | :param Xv_train: [[val1_1, val1_2, ...], [val2_1, val2_2, ...], ..., [vali_1, vali_2, ..., vali_j, ...], ...] 268 | vali_j is the feature value of feature field j of sample i in the training set 269 | vali_j can be either binary (1/0, for binary/categorical features) or float (e.g., 10.24, for numerical features) 270 | :param y_train: label of each sample in the training set 271 | :param Xi_valid: list of list of feature indices of each sample in the validation set 272 | :param Xv_valid: list of list of feature values of each sample in the validation set 273 | :param y_valid: label of each sample in the validation set 274 | :param is_pretrain: pretrain or not ? 
275 | :param ealry_stopping: perform early stopping or not 276 | :param refit: refit the model on the train+valid dataset or not 277 | :param save_path: the path to save the model 278 | :return: 279 | """ 280 | """ 281 | pre_process 282 | """ 283 | if save_path and not os.path.exists('/'.join(save_path.split('/')[0:-1])): 284 | print("Save path is not existed!") 285 | return 286 | 287 | if is_pretrain: 288 | print("The model is pre_training now. You must change the mode in the next fitting") 289 | 290 | if self.verbose: 291 | print("pre_process data ing...") 292 | self.pretrain = is_pretrain 293 | is_valid = False 294 | Xi_train = np.array(Xi_train).reshape((-1,self.field_size,1)) 295 | Xv_train = np.array(Xv_train) 296 | y_train = np.array(y_train) 297 | x_size = Xi_train.shape[0] 298 | if Xi_valid: 299 | Xi_valid = np.array(Xi_valid).reshape((-1,self.field_size,1)) 300 | Xv_valid = np.array(Xv_valid) 301 | y_valid = np.array(y_valid) 302 | x_valid_size = Xi_valid.shape[0] 303 | is_valid = True 304 | if self.verbose: 305 | print("pre_process data finished") 306 | 307 | """ 308 | train model 309 | """ 310 | model = self.train() 311 | if self.pretrain: 312 | weight_decay = self.pre_weight_decay 313 | else: 314 | weight_decay = self.weight_decay 315 | optimizer = torch.optim.SGD(self.parameters(), lr=self.learning_rate, weight_decay=weight_decay) 316 | if self.optimizer_type == 'adam': 317 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate, weight_decay=weight_decay) 318 | elif self.optimizer_type == 'rmsp': 319 | optimizer = torch.optim.RMSprop(self.parameters(), lr=self.learning_rate, weight_decay=weight_decay) 320 | elif self.optimizer_type == 'adag': 321 | optimizer = torch.optim.Adagrad(self.parameters(), lr=self.learning_rate, weight_decay=weight_decay) 322 | 323 | criterion = F.binary_cross_entropy_with_logits 324 | 325 | train_result = [] 326 | valid_result = [] 327 | for epoch in range(self.n_epochs): 328 | total_loss = 0.0 329 | batch_iter = x_size // self.batch_size 330 | epoch_begin_time = time() 331 | batch_begin_time = time() 332 | for i in range(batch_iter+1): 333 | offset = i*self.batch_size 334 | end = min(x_size, offset+self.batch_size) 335 | if offset == end: 336 | break 337 | batch_xi = Variable(torch.LongTensor(Xi_train[offset:end])) 338 | batch_xv = Variable(torch.FloatTensor(Xv_train[offset:end])) 339 | batch_y = Variable(torch.FloatTensor(y_train[offset:end])) 340 | if self.use_cuda: 341 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 342 | optimizer.zero_grad() 343 | outputs = model(batch_xi, batch_xv) 344 | loss = criterion(outputs, batch_y) 345 | loss.backward() 346 | optimizer.step() 347 | 348 | total_loss += loss.data[0] 349 | if self.verbose: 350 | if i % 100 == 99: # print every 100 mini-batches 351 | eval = self.evaluate(batch_xi, batch_xv, batch_y) 352 | print('[%d, %5d] loss: %.6f metric: %.6f time: %.1f s' % 353 | (epoch + 1, i + 1, total_loss/100, eval, time()-batch_begin_time)) 354 | total_loss = 0.0 355 | batch_begin_time = time() 356 | 357 | train_loss, train_eval = self.eval_by_batch(Xi_train,Xv_train,y_train,x_size) 358 | train_result.append(train_eval) 359 | print('*'*50) 360 | print('[%d] loss: %.6f metric: %.6f time: %.1f s' % 361 | (epoch + 1, train_loss, train_eval, time()-epoch_begin_time)) 362 | print('*'*50) 363 | 364 | if is_valid: 365 | valid_loss, valid_eval = self.eval_by_batch(Xi_valid, Xv_valid, y_valid, x_valid_size) 366 | valid_result.append(valid_eval) 367 | print('*' * 50) 368 | 
print('[%d] loss: %.6f metric: %.6f time: %.1f s' % 369 | (epoch + 1, valid_loss, valid_eval,time()-epoch_begin_time)) 370 | print('*' * 50) 371 | if save_path: 372 | torch.save(self.state_dict(),save_path) 373 | if is_valid and ealry_stopping and self.training_termination(valid_result): 374 | print("early stop at [%d] epoch!" % (epoch+1)) 375 | break 376 | 377 | # fit a few more epoch on train+valid until result reaches the best_train_score 378 | if is_valid and refit: 379 | if self.verbose: 380 | print("refitting the model") 381 | if self.greater_is_better: 382 | best_epoch = np.argmax(valid_result) 383 | else: 384 | best_epoch = np.argmin(valid_result) 385 | best_train_score = train_result[best_epoch] 386 | Xi_train = np.concatenate((Xi_train,Xi_valid)) 387 | Xv_train = np.concatenate((Xv_train,Xv_valid)) 388 | y_train = np.concatenate((y_train,y_valid)) 389 | x_size = x_size + x_valid_size 390 | self.shuffle_in_unison_scary(Xi_train,Xv_train,y_train) 391 | for epoch in range(64): 392 | batch_iter = x_size // self.batch_size 393 | for i in range(batch_iter + 1): 394 | offset = i * self.batch_size 395 | end = min(x_size, offset + self.batch_size) 396 | if offset == end: 397 | break 398 | batch_xi = Variable(torch.LongTensor(Xi_train[offset:end])) 399 | batch_xv = Variable(torch.FloatTensor(Xv_train[offset:end])) 400 | batch_y = Variable(torch.FloatTensor(y_train[offset:end])) 401 | if self.use_cuda: 402 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 403 | optimizer.zero_grad() 404 | outputs = model(batch_xi, batch_xv) 405 | loss = criterion(outputs, batch_y) 406 | loss.backward() 407 | optimizer.step() 408 | train_loss, train_eval = self.eval_by_batch(Xi_train, Xv_train, y_train, x_size) 409 | if save_path: 410 | torch.save(self.state_dict(), save_path) 411 | if abs(best_train_score-train_eval) < 0.001 or \ 412 | (self.greater_is_better and train_eval > best_train_score) or \ 413 | ((not self.greater_is_better) and train_result < best_train_score): 414 | break 415 | if self.verbose: 416 | print("refit finished") 417 | 418 | def eval_by_batch(self,Xi, Xv, y, x_size): 419 | total_loss = 0.0 420 | y_pred = [] 421 | if self.use_ffm: 422 | batch_size = 16384*2 423 | else: 424 | batch_size = 16384 425 | batch_iter = x_size // batch_size 426 | criterion = F.binary_cross_entropy_with_logits 427 | model = self.eval() 428 | for i in range(batch_iter+1): 429 | offset = i * batch_size 430 | end = min(x_size, offset + batch_size) 431 | if offset == end: 432 | break 433 | batch_xi = Variable(torch.LongTensor(Xi[offset:end])) 434 | batch_xv = Variable(torch.FloatTensor(Xv[offset:end])) 435 | batch_y = Variable(torch.FloatTensor(y[offset:end])) 436 | if self.use_cuda: 437 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 438 | outputs = model(batch_xi, batch_xv) 439 | pred = F.sigmoid(outputs).cpu() 440 | y_pred.extend(pred.data.numpy()) 441 | loss = criterion(outputs, batch_y) 442 | total_loss += loss.data[0]*(end-offset) 443 | total_metric = self.eval_metric(y,y_pred) 444 | return total_loss/x_size, total_metric 445 | 446 | # shuffle three lists simutaneously 447 | def shuffle_in_unison_scary(self, a, b, c): 448 | rng_state = np.random.get_state() 449 | np.random.shuffle(a) 450 | np.random.set_state(rng_state) 451 | np.random.shuffle(b) 452 | np.random.set_state(rng_state) 453 | np.random.shuffle(c) 454 | 455 | def training_termination(self, valid_result): 456 | if len(valid_result) > 4: 457 | if self.greater_is_better: 458 | if 
valid_result[-1] < valid_result[-2] and \ 459 | valid_result[-2] < valid_result[-3] and \ 460 | valid_result[-3] < valid_result[-4]: 461 | return True 462 | else: 463 | if valid_result[-1] > valid_result[-2] and \ 464 | valid_result[-2] > valid_result[-3] and \ 465 | valid_result[-3] > valid_result[-4]: 466 | return True 467 | return False 468 | 469 | def predict(self, Xi, Xv): 470 | """ 471 | :param Xi: the same as fit function 472 | :param Xv: the same as fit function 473 | :return: output, ont-dim array 474 | """ 475 | Xi = np.array(Xi).reshape((-1,self.field_size,1)) 476 | Xi = Variable(torch.LongTensor(Xi)) 477 | Xv = Variable(torch.FloatTensor(Xv)) 478 | if self.use_cuda and torch.cuda.is_available(): 479 | Xi, Xv = Xi.cuda(), Xv.cuda() 480 | 481 | model = self.eval() 482 | pred = F.sigmoid(model(Xi, Xv)).cpu() 483 | return (pred.data.numpy() > 0.5) 484 | 485 | def predict_proba(self, Xi, Xv): 486 | Xi = np.array(Xi).reshape((-1, self.field_size, 1)) 487 | Xi = Variable(torch.LongTensor(Xi)) 488 | Xv = Variable(torch.FloatTensor(Xv)) 489 | if self.use_cuda and torch.cuda.is_available(): 490 | Xi, Xv = Xi.cuda(), Xv.cuda() 491 | 492 | model = self.eval() 493 | pred = F.sigmoid(model(Xi, Xv)).cpu() 494 | return pred.data.numpy() 495 | 496 | def inner_predict(self, Xi, Xv): 497 | """ 498 | :param Xi: tensor of feature index 499 | :param Xv: tensor of feature value 500 | :return: output, numpy 501 | """ 502 | model = self.eval() 503 | pred = F.sigmoid(model(Xi, Xv)).cpu() 504 | return (pred.data.numpy() > 0.5) 505 | 506 | def inner_predict_proba(self, Xi, Xv): 507 | """ 508 | :param Xi: tensor of feature index 509 | :param Xv: tensor of feature value 510 | :return: output, numpy 511 | """ 512 | model = self.eval() 513 | pred = F.sigmoid(model(Xi, Xv)).cpu() 514 | return pred.data.numpy() 515 | 516 | 517 | def evaluate(self, Xi, Xv, y): 518 | """ 519 | :param Xi: tensor of feature index 520 | :param Xv: tensor of feature value 521 | :param y: tensor of labels 522 | :return: metric of the evaluation 523 | """ 524 | y_pred = self.inner_predict_proba(Xi, Xv) 525 | return self.eval_metric(y.cpu().data.numpy(), y_pred) 526 | 527 | """ 528 | test part 529 | """ 530 | import sys 531 | sys.path.append('../') 532 | from utils import data_preprocess 533 | 534 | result_dict = data_preprocess.read_criteo_data('../data/train.csv', '../data/category_emb.csv') 535 | test_dict = data_preprocess.read_criteo_data('../data/test.csv', '../data/category_emb.csv') 536 | with torch.cuda.device(2): 537 | fnn = FNN(39, result_dict['feature_sizes'], batch_size=128 * 64, verbose=True, use_cuda=True, 538 | pre_weight_decay= 0.0001 ,weight_decay=0.00001, use_fm=False, use_ffm=True).cuda() 539 | fnn.load_state_dict(torch.load('../data/model/ffnn.pkl')) 540 | # fnn.fit(result_dict['index'], result_dict['value'], result_dict['label'], 541 | # test_dict['index'], test_dict['value'], test_dict['label'],ealry_stopping=True,refit=False,is_pretrain=True,save_path='../data/model/ffnn.pkl') 542 | fnn.fit(result_dict['index'], result_dict['value'], result_dict['label'], 543 | test_dict['index'], test_dict['value'], test_dict['label'],ealry_stopping=True,refit=False,is_pretrain=False,save_path='../data/model/ffnn.pkl') 544 | -------------------------------------------------------------------------------- /model/NFM.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | Created on Dec 10, 2017 5 | @author: jachin,Nie 6 | 7 | A pytorch implementation of NFM 8 
| 9 | Reference: 10 | [1] Neural Factorization Machines for Sparse Predictive Analytics 11 | Xiangnan He,School of Computing,National University of Singapore,Singapore 117417,dcshex@nus.edu.sg 12 | Tat-Seng Chua,School of Computing,National University of Singapore,Singapore 117417,dcscts@nus.edu.sg 13 | 14 | """ 15 | 16 | import os 17 | import numpy as np 18 | from sklearn.base import BaseEstimator, TransformerMixin 19 | from sklearn.metrics import roc_auc_score 20 | from time import time 21 | 22 | import torch 23 | import torch.autograd as autograd 24 | import torch.nn as nn 25 | import torch.nn.functional as F 26 | import torch.optim as optim 27 | from torch.autograd import Variable 28 | 29 | import torch.backends.cudnn 30 | 31 | 32 | """ 33 | 网络结构部分 34 | """ 35 | 36 | class NFM(torch.nn.Module): 37 | """ 38 | :parameter 39 | ------------- 40 | field_size: size of the feature fields 41 | feature_sizes: a field_size-dim array, sizes of the feature dictionary 42 | embedding_size: size of the feature embedding 43 | is_shallow_dropout: bool, shallow part(fm or ffm part) uses dropout or not? 44 | dropout_shallow: an array of the size of 1, example:[0.5], the element is for the-first order part 45 | h_depth: deep network's hidden layers' depth 46 | deep_layers: a h_depth-dim array, each element is the size of corresponding hidden layers. example:[32,32] h_depth = 2 47 | is_deep_dropout: bool, deep part uses dropout or not? 48 | dropout_deep: an array of dropout factors,example:[0.5,0.5,0.5] h_depth=2 49 | deep_layers_activation: relu or sigmoid etc 50 | n_epochs: epochs 51 | batch_size: batch_size 52 | learning_rate: learning_rate 53 | optimizer_type: optimizer_type, 'adam', 'rmsp', 'sgd', 'adag' 54 | is_batch_norm:bool, use batch_norm or not ? 55 | verbose: verbose 56 | weight_decay: weight decay (L2 penalty) 57 | random_seed: random_seed=950104 someone's birthday, my lukcy number 58 | use_fm: bool 59 | use_ffm: bool 60 | interation_type: bool, When it's true, the element-wise product of the fm or ffm embeddings will be added together, otherwise, the element-wise prodcut of embeddings will be concatenated. 61 | loss_type: "logloss", only 62 | eval_metric: roc_auc_score 63 | use_cuda: bool use gpu or cpu? 64 | n_class: number of classes. is bounded to 1 65 | greater_is_better: bool. Is the greater eval better? 
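Example (a minimal, illustrative sketch with made-up toy data, assuming NFM.fit takes the same Xi/Xv/y arguments as the other models in this repo; with interation_type=True the MLP receives the summed bi-interaction vector of size embedding_size, with interation_type=False it receives field_size*(field_size-1)//2 inputs, one per feature pair):

    Xi = [[0, 1, 0], [1, 0, 1]]
    Xv = [[1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]
    y  = [1, 0]
    nfm = NFM(field_size=3, feature_sizes=[2, 2, 2], interation_type=True, use_fm=True, use_cuda=False)
    nfm.fit(Xi, Xv, y)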
66 | 67 | 68 | Attention: only support logsitcs regression 69 | """ 70 | def __init__(self,field_size, feature_sizes, embedding_size = 4, is_shallow_dropout = True, dropout_shallow = [0.5], 71 | h_depth = 2, deep_layers = [32, 32], is_deep_dropout = True, dropout_deep=[0.0, 0.5, 0.5], 72 | deep_layers_activation = 'relu', n_epochs = 64, batch_size = 256, learning_rate = 0.003, 73 | optimizer_type = 'adam', is_batch_norm = False, verbose = False, random_seed = 950104, weight_decay = 0.0, 74 | use_fm = True, use_ffm = False, interation_type = True,loss_type = 'logloss', eval_metric = roc_auc_score, 75 | use_cuda = True, n_class = 1, greater_is_better = True 76 | ): 77 | super(NFM, self).__init__() 78 | self.field_size = field_size 79 | self.feature_sizes = feature_sizes 80 | self.embedding_size = embedding_size 81 | self.is_shallow_dropout = is_shallow_dropout 82 | self.dropout_shallow = dropout_shallow 83 | self.h_depth = h_depth 84 | self.deep_layers = deep_layers 85 | self.is_deep_dropout = is_deep_dropout 86 | self.dropout_deep = dropout_deep 87 | self.deep_layers_activation = deep_layers_activation 88 | self.n_epochs = n_epochs 89 | self.batch_size = batch_size 90 | self.learning_rate = learning_rate 91 | self.optimizer_type = optimizer_type 92 | self.is_batch_norm = is_batch_norm 93 | self.verbose = verbose 94 | self.weight_decay = weight_decay 95 | self.random_seed = random_seed 96 | self.use_fm = use_fm 97 | self.use_ffm = use_ffm 98 | self.interation_type = interation_type 99 | self.loss_type = loss_type 100 | self.eval_metric = eval_metric 101 | self.use_cuda = use_cuda 102 | self.n_class = n_class 103 | self.greater_is_better = greater_is_better 104 | 105 | torch.manual_seed(self.random_seed) 106 | 107 | """ 108 | check cuda 109 | """ 110 | if self.use_cuda and not torch.cuda.is_available(): 111 | self.use_cuda = False 112 | print("Cuda is not available, automatically changed into cpu model") 113 | 114 | """ 115 | check use fm or ffm 116 | """ 117 | if self.use_fm and self.use_ffm: 118 | print("only support one type only, please make sure to choose only fm or ffm part") 119 | exit(1) 120 | elif self.use_fm: 121 | print("The model is nfm(fm+nn layers)") 122 | elif self.use_ffm: 123 | print("The model is nffm(ffm+nn layers)") 124 | else: 125 | print("You have to choose more than one of (fm, ffm) models to use") 126 | exit(1) 127 | """ 128 | bias 129 | """ 130 | self.bias = torch.nn.Parameter(torch.randn(1)) 131 | 132 | """ 133 | fm part 134 | """ 135 | if self.use_fm: 136 | print("Init fm part") 137 | self.fm_first_order_embeddings = nn.ModuleList([nn.Embedding(feature_size,1) for feature_size in self.feature_sizes]) 138 | if self.dropout_shallow: 139 | self.fm_first_order_dropout = nn.Dropout(self.dropout_shallow[0]) 140 | self.fm_second_order_embeddings = nn.ModuleList([nn.Embedding(feature_size, self.embedding_size) for feature_size in self.feature_sizes]) 141 | print("Init fm part succeed") 142 | 143 | """ 144 | ffm part 145 | """ 146 | if self.use_ffm: 147 | print("Init ffm part") 148 | self.ffm_first_order_embeddings = nn.ModuleList([nn.Embedding(feature_size,1) for feature_size in self.feature_sizes]) 149 | if self.dropout_shallow: 150 | self.ffm_first_order_dropout = nn.Dropout(self.dropout_shallow[0]) 151 | self.ffm_second_order_embeddings = nn.ModuleList([nn.ModuleList([nn.Embedding(feature_size, self.embedding_size) for i in range(self.field_size)]) for feature_size in self.feature_sizes]) 152 | print("Init ffm part succeed") 153 | 154 | """ 155 | deep part 156 | """ 157 
| print("Init deep part") 158 | 159 | if self.is_deep_dropout: 160 | self.linear_0_dropout = nn.Dropout(self.dropout_deep[0]) 161 | if self.interation_type: 162 | self.linear_1 = nn.Linear(self.embedding_size, deep_layers[0]) 163 | else: 164 | self.linear_1 = nn.Linear(self.field_size*(self.field_size-1)/2, deep_layers[0]) 165 | if self.is_batch_norm: 166 | self.batch_norm_1 = nn.BatchNorm1d(deep_layers[0]) 167 | if self.is_deep_dropout: 168 | self.linear_1_dropout = nn.Dropout(self.dropout_deep[1]) 169 | for i, h in enumerate(self.deep_layers[1:], 1): 170 | setattr(self, 'linear_' + str(i + 1), nn.Linear(self.deep_layers[i - 1], self.deep_layers[i])) 171 | if self.is_batch_norm: 172 | setattr(self, 'batch_norm_' + str(i + 1), nn.BatchNorm1d(deep_layers[i])) 173 | if self.is_deep_dropout: 174 | setattr(self, 'linear_' + str(i + 1) + '_dropout', nn.Dropout(self.dropout_deep[i + 1])) 175 | 176 | print("Init deep part succeed") 177 | 178 | print "Init succeed" 179 | 180 | def forward(self, Xi, Xv): 181 | """ 182 | :param Xi_train: index input tensor, batch_size * k * 1 183 | :param Xv_train: value input tensor, batch_size * k * 1 184 | :return: the last output 185 | """ 186 | """ 187 | fm part 188 | """ 189 | if self.use_fm: 190 | fm_first_order_emb_arr = [(torch.sum(emb(Xi[:,i,:]),1).t()*Xv[:,i]).t() for i, emb in enumerate(self.fm_first_order_embeddings)] 191 | fm_first_order = torch.cat(fm_first_order_emb_arr,1) 192 | if self.is_shallow_dropout: 193 | fm_first_order = self.fm_first_order_dropout(fm_first_order) 194 | 195 | if self.interation_type: 196 | # use 2xy = (x+y)^2 - x^2 - y^2 reduce calculation 197 | fm_second_order_emb_arr = [(torch.sum(emb(Xi[:,i,:]),1).t()*Xv[:,i]).t() for i, emb in enumerate(self.fm_second_order_embeddings)] 198 | fm_sum_second_order_emb = sum(fm_second_order_emb_arr) 199 | fm_sum_second_order_emb_square = fm_sum_second_order_emb*fm_sum_second_order_emb # (x+y)^2 200 | fm_second_order_emb_square = [item*item for item in fm_second_order_emb_arr] 201 | fm_second_order_emb_square_sum = sum(fm_second_order_emb_square) #x^2+y^2 202 | fm_second_order = (fm_sum_second_order_emb_square - fm_second_order_emb_square_sum) * 0.5 203 | else: 204 | fm_second_order_emb_arr = [(torch.sum(emb(Xi[:, i, :]), 1).t() * Xv[:, i]).t() for i, emb in 205 | enumerate(self.fm_second_order_embeddings)] 206 | fm_wij_arr = [] 207 | for i in range(self.field_size): 208 | for j in range(i + 1, self.field_size): 209 | fm_wij_arr.append(fm_second_order_emb_arr[i] * fm_second_order_emb_arr[j]) 210 | 211 | 212 | """ 213 | ffm part 214 | """ 215 | if self.use_ffm: 216 | ffm_first_order_emb_arr = [(torch.sum(emb(Xi[:,i,:]),1).t()*Xv[:,i]).t() for i, emb in enumerate(self.ffm_first_order_embeddings)] 217 | ffm_first_order = torch.cat(ffm_first_order_emb_arr,1) 218 | if self.is_shallow_dropout: 219 | ffm_first_order = self.ffm_first_order_dropout(ffm_first_order) 220 | ffm_second_order_emb_arr = [[(torch.sum(emb(Xi[:,i,:]), 1).t() * Xv[:,i]).t() for emb in f_embs] for i, f_embs in enumerate(self.ffm_second_order_embeddings)] 221 | ffm_wij_arr = [] 222 | for i in range(self.field_size): 223 | for j in range(i+1, self.field_size): 224 | ffm_wij_arr.append(ffm_second_order_emb_arr[i][j]*ffm_second_order_emb_arr[j][i]) 225 | ffm_second_order = sum(ffm_wij_arr) 226 | 227 | """ 228 | deep part 229 | """ 230 | if self.use_fm and self.interation_type: 231 | deep_emb = fm_second_order 232 | elif self.use_ffm and self.interation_type: 233 | deep_emb = ffm_second_order 234 | elif self.use_fm: 235 | deep_emb = 
torch.cat([torch.sum(fm_wij,1).view([-1,1]) for fm_wij in fm_wij_arr], 1) 236 | else: 237 | deep_emb = torch.cat([torch.sum(ffm_wij,1).view([-1,1]) for ffm_wij in ffm_wij_arr],1) 238 | 239 | if self.deep_layers_activation == 'sigmoid': 240 | activation = F.sigmoid 241 | elif self.deep_layers_activation == 'tanh': 242 | activation = F.tanh 243 | else: 244 | activation = F.relu 245 | 246 | if self.is_deep_dropout: 247 | deep_emb = self.linear_0_dropout(deep_emb) 248 | x_deep = self.linear_1(deep_emb) 249 | if self.is_batch_norm: 250 | x_deep = self.batch_norm_1(x_deep) 251 | x_deep = activation(x_deep) 252 | if self.is_deep_dropout: 253 | x_deep = self.linear_1_dropout(x_deep) 254 | for i in range(1, len(self.deep_layers)): 255 | x_deep = getattr(self, 'linear_' + str(i + 1))(x_deep) 256 | if self.is_batch_norm: 257 | x_deep = getattr(self, 'batch_norm_' + str(i + 1))(x_deep) 258 | x_deep = activation(x_deep) 259 | if self.is_deep_dropout: 260 | x_deep = getattr(self, 'linear_' + str(i + 1) + '_dropout')(x_deep) 261 | 262 | """ 263 | sum 264 | """ 265 | if self.use_fm: 266 | total_sum = self.bias+ torch.sum(fm_first_order,1) + torch.sum(x_deep,1) 267 | elif self.use_ffm: 268 | total_sum = self.bias + torch.sum(ffm_first_order, 1) + torch.sum(x_deep, 1) 269 | return total_sum 270 | 271 | 272 | def fit(self, Xi_train, Xv_train, y_train, Xi_valid=None, Xv_valid=None, 273 | y_valid = None, ealry_stopping=False, refit=False, save_path = None): 274 | """ 275 | :param Xi_train: [[ind1_1, ind1_2, ...], [ind2_1, ind2_2, ...], ..., [indi_1, indi_2, ..., indi_j, ...], ...] 276 | indi_j is the feature index of feature field j of sample i in the training set 277 | :param Xv_train: [[val1_1, val1_2, ...], [val2_1, val2_2, ...], ..., [vali_1, vali_2, ..., vali_j, ...], ...] 
278 | vali_j is the feature value of feature field j of sample i in the training set 279 | vali_j can be either binary (1/0, for binary/categorical features) or float (e.g., 10.24, for numerical features) 280 | :param y_train: label of each sample in the training set 281 | :param Xi_valid: list of list of feature indices of each sample in the validation set 282 | :param Xv_valid: list of list of feature values of each sample in the validation set 283 | :param y_valid: label of each sample in the validation set 284 | :param ealry_stopping: perform early stopping or not 285 | :param refit: refit the model on the train+valid dataset or not 286 | :param save_path: the path to save the model 287 | :return: 288 | """ 289 | """ 290 | pre_process 291 | """ 292 | if save_path and not os.path.exists('/'.join(save_path.split('/')[0:-1])): 293 | print("Save path is not existed!") 294 | return 295 | 296 | if self.verbose: 297 | print("pre_process data ing...") 298 | is_valid = False 299 | Xi_train = np.array(Xi_train).reshape((-1,self.field_size,1)) 300 | Xv_train = np.array(Xv_train) 301 | y_train = np.array(y_train) 302 | x_size = Xi_train.shape[0] 303 | if Xi_valid: 304 | Xi_valid = np.array(Xi_valid).reshape((-1,self.field_size,1)) 305 | Xv_valid = np.array(Xv_valid) 306 | y_valid = np.array(y_valid) 307 | x_valid_size = Xi_valid.shape[0] 308 | is_valid = True 309 | if self.verbose: 310 | print("pre_process data finished") 311 | 312 | """ 313 | train model 314 | """ 315 | model = self.train() 316 | 317 | optimizer = torch.optim.SGD(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 318 | if self.optimizer_type == 'adam': 319 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 320 | elif self.optimizer_type == 'rmsp': 321 | optimizer = torch.optim.RMSprop(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 322 | elif self.optimizer_type == 'adag': 323 | optimizer = torch.optim.Adagrad(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 324 | 325 | criterion = F.binary_cross_entropy_with_logits 326 | 327 | train_result = [] 328 | valid_result = [] 329 | for epoch in range(self.n_epochs): 330 | total_loss = 0.0 331 | batch_iter = x_size // self.batch_size 332 | epoch_begin_time = time() 333 | batch_begin_time = time() 334 | for i in range(batch_iter+1): 335 | offset = i*self.batch_size 336 | end = min(x_size, offset+self.batch_size) 337 | if offset == end: 338 | break 339 | batch_xi = Variable(torch.LongTensor(Xi_train[offset:end])) 340 | batch_xv = Variable(torch.FloatTensor(Xv_train[offset:end])) 341 | batch_y = Variable(torch.FloatTensor(y_train[offset:end])) 342 | if self.use_cuda: 343 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 344 | optimizer.zero_grad() 345 | outputs = model(batch_xi, batch_xv) 346 | loss = criterion(outputs, batch_y) 347 | loss.backward() 348 | optimizer.step() 349 | 350 | total_loss += loss.data[0] 351 | if self.verbose: 352 | if i % 100 == 99: # print every 100 mini-batches 353 | eval = self.evaluate(batch_xi, batch_xv, batch_y) 354 | print('[%d, %5d] loss: %.6f metric: %.6f time: %.1f s' % 355 | (epoch + 1, i + 1, total_loss/100.0, eval, time()-batch_begin_time)) 356 | total_loss = 0.0 357 | batch_begin_time = time() 358 | 359 | train_loss, train_eval = self.eval_by_batch(Xi_train,Xv_train,y_train,x_size) 360 | train_result.append(train_eval) 361 | print('*'*50) 362 | print('[%d] loss: %.6f metric: %.6f time: %.1f s' % 
363 | (epoch + 1, train_loss, train_eval, time()-epoch_begin_time)) 364 | print('*'*50) 365 | 366 | if is_valid: 367 | valid_loss, valid_eval = self.eval_by_batch(Xi_valid, Xv_valid, y_valid, x_valid_size) 368 | valid_result.append(valid_eval) 369 | print('*' * 50) 370 | print('[%d] loss: %.6f metric: %.6f time: %.1f s' % 371 | (epoch + 1, valid_loss, valid_eval,time()-epoch_begin_time)) 372 | print('*' * 50) 373 | if save_path: 374 | torch.save(self.state_dict(),save_path) 375 | if is_valid and ealry_stopping and self.training_termination(valid_result): 376 | print("early stop at [%d] epoch!" % (epoch+1)) 377 | break 378 | 379 | # fit a few more epoch on train+valid until result reaches the best_train_score 380 | if is_valid and refit: 381 | if self.verbose: 382 | print("refitting the model") 383 | if self.greater_is_better: 384 | best_epoch = np.argmax(valid_result) 385 | else: 386 | best_epoch = np.argmin(valid_result) 387 | best_train_score = train_result[best_epoch] 388 | Xi_train = np.concatenate((Xi_train,Xi_valid)) 389 | Xv_train = np.concatenate((Xv_train,Xv_valid)) 390 | y_train = np.concatenate((y_train,y_valid)) 391 | x_size = x_size + x_valid_size 392 | self.shuffle_in_unison_scary(Xi_train,Xv_train,y_train) 393 | for epoch in range(64): 394 | batch_iter = x_size // self.batch_size 395 | for i in range(batch_iter + 1): 396 | offset = i * self.batch_size 397 | end = min(x_size, offset + self.batch_size) 398 | if offset == end: 399 | break 400 | batch_xi = Variable(torch.LongTensor(Xi_train[offset:end])) 401 | batch_xv = Variable(torch.FloatTensor(Xv_train[offset:end])) 402 | batch_y = Variable(torch.FloatTensor(y_train[offset:end])) 403 | if self.use_cuda: 404 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 405 | optimizer.zero_grad() 406 | outputs = model(batch_xi, batch_xv) 407 | loss = criterion(outputs, batch_y) 408 | loss.backward() 409 | optimizer.step() 410 | train_loss, train_eval = self.eval_by_batch(Xi_train, Xv_train, y_train, x_size) 411 | if save_path: 412 | torch.save(self.state_dict(), save_path) 413 | if abs(best_train_score-train_eval) < 0.001 or \ 414 | (self.greater_is_better and train_eval > best_train_score) or \ 415 | ((not self.greater_is_better) and train_result < best_train_score): 416 | break 417 | if self.verbose: 418 | print("refit finished") 419 | 420 | def eval_by_batch(self,Xi, Xv, y, x_size): 421 | total_loss = 0.0 422 | y_pred = [] 423 | if self.use_ffm: 424 | batch_size = 16384*2 425 | else: 426 | batch_size = 16384 427 | batch_iter = x_size // batch_size 428 | criterion = F.binary_cross_entropy_with_logits 429 | model = self.eval() 430 | for i in range(batch_iter+1): 431 | offset = i * batch_size 432 | end = min(x_size, offset + batch_size) 433 | if offset == end: 434 | break 435 | batch_xi = Variable(torch.LongTensor(Xi[offset:end])) 436 | batch_xv = Variable(torch.FloatTensor(Xv[offset:end])) 437 | batch_y = Variable(torch.FloatTensor(y[offset:end])) 438 | if self.use_cuda: 439 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 440 | 441 | # self.print_embedding_prod(batch_xi,batch_xv) 442 | 443 | outputs = model(batch_xi, batch_xv) 444 | pred = F.sigmoid(outputs).cpu() 445 | y_pred.extend(pred.data.numpy()) 446 | loss = criterion(outputs, batch_y) 447 | total_loss += loss.data[0]*(end-offset) 448 | total_metric = self.eval_metric(y,y_pred) 449 | return total_loss/x_size, total_metric 450 | 451 | # shuffle three lists simutaneously 452 | def shuffle_in_unison_scary(self, a, b, 
c): 453 | rng_state = np.random.get_state() 454 | np.random.shuffle(a) 455 | np.random.set_state(rng_state) 456 | np.random.shuffle(b) 457 | np.random.set_state(rng_state) 458 | np.random.shuffle(c) 459 | 460 | def training_termination(self, valid_result): 461 | if len(valid_result) > 4: 462 | if self.greater_is_better: 463 | if valid_result[-1] < valid_result[-2] and \ 464 | valid_result[-2] < valid_result[-3] and \ 465 | valid_result[-3] < valid_result[-4]: 466 | return True 467 | else: 468 | if valid_result[-1] > valid_result[-2] and \ 469 | valid_result[-2] > valid_result[-3] and \ 470 | valid_result[-3] > valid_result[-4]: 471 | return True 472 | return False 473 | 474 | def predict(self, Xi, Xv): 475 | """ 476 | :param Xi: the same as fit function 477 | :param Xv: the same as fit function 478 | :return: output, ont-dim array 479 | """ 480 | Xi = np.array(Xi).reshape((-1,self.field_size,1)) 481 | Xi = Variable(torch.LongTensor(Xi)) 482 | Xv = Variable(torch.FloatTensor(Xv)) 483 | if self.use_cuda and torch.cuda.is_available(): 484 | Xi, Xv = Xi.cuda(), Xv.cuda() 485 | 486 | model = self.eval() 487 | pred = F.sigmoid(model(Xi, Xv)).cpu() 488 | return (pred.data.numpy() > 0.5) 489 | 490 | def predict_proba(self, Xi, Xv): 491 | Xi = np.array(Xi).reshape((-1, self.field_size, 1)) 492 | Xi = Variable(torch.LongTensor(Xi)) 493 | Xv = Variable(torch.FloatTensor(Xv)) 494 | if self.use_cuda and torch.cuda.is_available(): 495 | Xi, Xv = Xi.cuda(), Xv.cuda() 496 | 497 | model = self.eval() 498 | pred = F.sigmoid(model(Xi, Xv)).cpu() 499 | return pred.data.numpy() 500 | 501 | def inner_predict(self, Xi, Xv): 502 | """ 503 | :param Xi: tensor of feature index 504 | :param Xv: tensor of feature value 505 | :return: output, numpy 506 | """ 507 | model = self.eval() 508 | pred = F.sigmoid(model(Xi, Xv)).cpu() 509 | return (pred.data.numpy() > 0.5) 510 | 511 | def inner_predict_proba(self, Xi, Xv): 512 | """ 513 | :param Xi: tensor of feature index 514 | :param Xv: tensor of feature value 515 | :return: output, numpy 516 | """ 517 | model = self.eval() 518 | pred = F.sigmoid(model(Xi, Xv)).cpu() 519 | return pred.data.numpy() 520 | 521 | 522 | def evaluate(self, Xi, Xv, y): 523 | """ 524 | :param Xi: tensor of feature index 525 | :param Xv: tensor of feature value 526 | :param y: tensor of labels 527 | :return: metric of the evaluation 528 | """ 529 | y_pred = self.inner_predict_proba(Xi, Xv) 530 | return self.eval_metric(y.cpu().data.numpy(), y_pred) 531 | 532 | def print_embedding_prod(self,Xi,Xv): 533 | if not self.use_fm: 534 | print "Error! Only print fm model!" 
535 | return 536 | fm_second_order_emb_arr = [(torch.sum(emb(Xi[:, i, :]), 1).t() * Xv[:, i]).t() for i, emb in 537 | enumerate(self.fm_second_order_embeddings)] 538 | total_prod = fm_second_order_emb_arr[0] + 1.0 539 | for emb in fm_second_order_emb_arr[1:]: 540 | total_prod = total_prod * (emb + 1.0) 541 | print "max:", torch.max(total_prod) 542 | print "min", torch.min(total_prod) 543 | 544 | """ 545 | test part 546 | """ 547 | import sys 548 | sys.path.append('../') 549 | from utils import data_preprocess 550 | 551 | result_dict = data_preprocess.read_criteo_data('../data/train.csv', '../data/category_emb.csv') 552 | test_dict = data_preprocess.read_criteo_data('../data/test.csv', '../data/category_emb.csv') 553 | with torch.cuda.device(1): 554 | nfm = NFM(39, result_dict['feature_sizes'], batch_size=128 * 64, is_shallow_dropout=False, verbose=True, use_cuda=True, 555 | weight_decay=0.00002, use_fm=True, use_ffm=False, interation_type=False).cuda() 556 | nfm.fit(result_dict['index'], result_dict['value'], result_dict['label'], 557 | test_dict['index'], test_dict['value'], test_dict['label'], ealry_stopping=True, refit=False, 558 | save_path='../data/model/nfm.pkl') 559 | -------------------------------------------------------------------------------- /model/PNN.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | Created on Dec 10, 2017 5 | @author: jachin,Nie 6 | 7 | A pytorch implementation of PNN 8 | 9 | Reference: 10 | [1] Product-based Neural Networks for User Response Prediction 11 | Yanru Qu, Han Cai, Kan Ren, Weinan Zhang, Yong Yu Shanghai Jiao Tong University 12 | {kevinqu, hcai, kren, wnzhang, yyu}@apex.sjtu.edu.cn Ying Wen, Jun Wang University College London {ying.wen, j.wang}@cs.ucl.ac.uk 13 | 14 | """ 15 | import os 16 | import numpy as np 17 | from sklearn.base import BaseEstimator, TransformerMixin 18 | from sklearn.metrics import roc_auc_score 19 | from time import time 20 | 21 | import torch 22 | import torch.autograd as autograd 23 | import torch.nn as nn 24 | import torch.nn.functional as F 25 | import torch.optim as optim 26 | from torch.autograd import Variable 27 | 28 | import torch.backends.cudnn 29 | 30 | """ 31 | 网络结构部分 32 | """ 33 | 34 | class PNN(torch.nn.Module): 35 | """ 36 | :parameter 37 | ------------- 38 | field_size: size of the feature fields 39 | feature_sizes: a field_size-dim array, sizes of the feature dictionary 40 | embedding_size: size of the feature embedding 41 | h_depth: deep network's hidden layers' depth 42 | deep_layers: a h_depth-dim array, each element is the size of corresponding hidden layers. example:[32,32] h_depth = 2 43 | is_deep_dropout: bool, deep part uses dropout or not? 44 | dropout_deep: an array of dropout factors,example:[0.5,0.5,0.5] h_depth=2 45 | use_inner_product: use inner product or not? 46 | use_outer_product: use outter product or not? 47 | deep_layers_activation: relu or sigmoid etc 48 | n_epochs: epochs 49 | batch_size: batch_size 50 | learning_rate: learning_rate 51 | optimizer_type: optimizer_type, 'adam', 'rmsp', 'sgd', 'adag' 52 | is_batch_norm:bool, use batch_norm or not ? 53 | verbose: verbose 54 | weight_decay: weight decay (L2 penalty) 55 | random_seed: random_seed=950104 someone's birthday, my lukcy number 56 | loss_type: "logloss", only 57 | eval_metric: roc_auc_score 58 | use_cuda: bool use gpu or cpu? 59 | n_class: number of classes. is bounded to 1 60 | greater_is_better: bool. Is the greater eval better? 
61 | 
62 | 
63 | Attention: only supports logistic regression
64 | """
65 | 
66 | def __init__(self, field_size, feature_sizes, embedding_size=4,
67 | h_depth = 3, deep_layers=[32, 32, 32], is_deep_dropout=True, dropout_deep=[0.5, 0.5, 0.5], use_inner_product = True, use_outer_product = False,
68 | deep_layers_activation='relu', n_epochs=64, batch_size=256, learning_rate=0.003,
69 | optimizer_type='adam', is_batch_norm=False, verbose=False, random_seed=950104,weight_decay=0.0, loss_type='logloss', eval_metric=roc_auc_score,
70 | use_cuda=True, n_class=1, greater_is_better=True
71 | ):
72 | super(PNN, self).__init__()
73 | self.field_size = field_size
74 | self.feature_sizes = feature_sizes
75 | self.embedding_size = embedding_size
76 | self.h_depth = h_depth
77 | self.deep_layers = deep_layers
78 | self.is_deep_dropout = is_deep_dropout
79 | self.dropout_deep = dropout_deep
80 | self.use_inner_product = use_inner_product
81 | self.use_outer_product = use_outer_product
82 | self.deep_layers_activation = deep_layers_activation
83 | self.n_epochs = n_epochs
84 | self.batch_size = batch_size
85 | self.learning_rate = learning_rate
86 | self.optimizer_type = optimizer_type
87 | self.is_batch_norm = is_batch_norm
88 | self.verbose = verbose
89 | self.weight_decay = weight_decay
90 | self.random_seed = random_seed
91 | self.loss_type = loss_type
92 | self.eval_metric = eval_metric
93 | self.use_cuda = use_cuda
94 | self.n_class = n_class
95 | self.greater_is_better = greater_is_better
96 | 
97 | torch.manual_seed(self.random_seed)
98 | 
99 | """
100 | check cuda
101 | """
102 | if self.use_cuda and not torch.cuda.is_available():
103 | self.use_cuda = False
104 | print("Cuda is not available, automatically changed into cpu model")
105 | 
106 | """
107 | check use inner_product or outer_product
108 | """
109 | if self.use_inner_product and self.use_outer_product:
110 | print("The model uses both inner product and outer product")
111 | elif self.use_inner_product:
112 | print("The model uses inner product (IPNN)")
113 | elif self.use_outer_product:
114 | print("The model uses outer product (OPNN)")
115 | else:
116 | print("The model is a simple deep model only!
Neither inner product or outer product is used") 117 | 118 | """ 119 | embbedding part 120 | """ 121 | print("Init embeddings") 122 | self.embeddings = nn.ModuleList([nn.Embedding(feature_size, self.embedding_size) for feature_size in self.feature_sizes]) 123 | print("Init embeddings finished") 124 | 125 | """ 126 | first order part (linear part) 127 | """ 128 | print("Init first order part") 129 | self.first_order_weight = nn.ModuleList([nn.ParameterList([torch.nn.Parameter(torch.randn(self.embedding_size), requires_grad=True) for j in range(self.field_size)]) for i in range(self.deep_layers[0])]) 130 | self.bias = torch.nn.Parameter(torch.randn(self.deep_layers[0]), requires_grad=True) 131 | print("Init first order part finished") 132 | 133 | """ 134 | second order part (quadratic part) 135 | """ 136 | print("Init second order part") 137 | if self.use_inner_product: 138 | self.inner_second_weight_emb = nn.ModuleList([nn.ParameterList([torch.nn.Parameter(torch.randn(self.embedding_size), requires_grad=True) for j in range(self.field_size)]) for i in range(self.deep_layers[0])]) 139 | 140 | if self.use_outer_product: 141 | arr = [] 142 | for i in range(self.deep_layers[0]): 143 | tmp = torch.randn(self.embedding_size,self.embedding_size) 144 | arr.append(torch.nn.Parameter(torch.mm(tmp,tmp.t()))) 145 | self.outer_second_weight_emb = nn.ParameterList(arr) 146 | print("Init second order part finished") 147 | 148 | 149 | print("Init nn part") 150 | 151 | for i, h in enumerate(self.deep_layers[1:], 1): 152 | setattr(self, 'linear_' + str(i), nn.Linear(self.deep_layers[i - 1], self.deep_layers[i])) 153 | if self.is_batch_norm: 154 | setattr(self, 'batch_norm_' + str(i), nn.BatchNorm1d(deep_layers[i])) 155 | if self.is_deep_dropout: 156 | setattr(self, 'linear_' + str(i) + '_dropout', nn.Dropout(self.dropout_deep[i])) 157 | self.deep_last_layer = nn.Linear(self.deep_layers[-1], self.n_class) 158 | print("Init nn part succeed") 159 | 160 | print "Init succeed" 161 | 162 | def forward(self, Xi, Xv): 163 | """ 164 | :param Xi: index input tensor, batch_size * k * 1 165 | :param Xv: value input tensor, batch_size * k * 1 166 | :param is_pretrain: the para to decide fm pretrain or not 167 | :return: the last output 168 | """ 169 | 170 | """ 171 | embedding 172 | """ 173 | emb_arr = [(torch.sum(emb(Xi[:,i,:]),1).t()*Xv[:,i]).t() for i, emb in enumerate(self.embeddings)] 174 | 175 | """ 176 | first order part (linear part) 177 | """ 178 | first_order_arr = [] 179 | for i, weight_arr in enumerate(self.first_order_weight): 180 | tmp_arr = [] 181 | for j, weight in enumerate(weight_arr): 182 | tmp_arr.append(torch.sum(emb_arr[j]*weight,1)) 183 | first_order_arr.append(sum(tmp_arr).view([-1,1])) 184 | first_order = torch.cat(first_order_arr,1) 185 | 186 | """ 187 | second order part (quadratic part) 188 | """ 189 | if self.use_inner_product: 190 | inner_product_arr = [] 191 | for i, weight_arr in enumerate(self.inner_second_weight_emb): 192 | tmp_arr = [] 193 | for j, weight in enumerate(weight_arr): 194 | tmp_arr.append(torch.sum(emb_arr[j] * weight, 1)) 195 | sum_ = sum(tmp_arr) 196 | inner_product_arr.append((sum_*sum_).view([-1,1])) 197 | inner_product = torch.cat(inner_product_arr,1) 198 | first_order = first_order + inner_product 199 | 200 | if self.use_outer_product: 201 | outer_product_arr = [] 202 | emb_arr_sum = sum(emb_arr) 203 | emb_matrix_arr = torch.bmm(emb_arr_sum.view([-1,self.embedding_size,1]),emb_arr_sum.view([-1,1,self.embedding_size])) 204 | for i, weight in 
enumerate(self.outer_second_weight_emb): 205 | outer_product_arr.append(torch.sum(torch.sum(emb_matrix_arr*weight,2),1).view([-1,1])) 206 | outer_product = torch.cat(outer_product_arr,1) 207 | first_order = first_order + outer_product 208 | 209 | """ 210 | nn part 211 | """ 212 | if self.deep_layers_activation == 'sigmoid': 213 | activation = F.sigmoid 214 | elif self.deep_layers_activation == 'tanh': 215 | activation = F.tanh 216 | else: 217 | activation = F.relu 218 | x_deep = first_order 219 | for i, h in enumerate(self.deep_layers[1:], 1): 220 | x_deep = getattr(self, 'linear_' + str(i))(x_deep) 221 | if self.is_batch_norm: 222 | x_deep = getattr(self, 'batch_norm_' + str(i))(x_deep) 223 | x_deep = activation(x_deep) 224 | if self.is_deep_dropout: 225 | x_deep = getattr(self, 'linear_' + str(i) + '_dropout')(x_deep) 226 | x_deep = self.deep_last_layer(x_deep) 227 | return torch.sum(x_deep, 1) 228 | 229 | 230 | 231 | 232 | 233 | 234 | def fit(self, Xi_train, Xv_train, y_train, Xi_valid=None, Xv_valid=None, 235 | y_valid = None, ealry_stopping=False, refit=False, save_path = None): 236 | """ 237 | :param Xi_train: [[ind1_1, ind1_2, ...], [ind2_1, ind2_2, ...], ..., [indi_1, indi_2, ..., indi_j, ...], ...] 238 | indi_j is the feature index of feature field j of sample i in the training set 239 | :param Xv_train: [[val1_1, val1_2, ...], [val2_1, val2_2, ...], ..., [vali_1, vali_2, ..., vali_j, ...], ...] 240 | vali_j is the feature value of feature field j of sample i in the training set 241 | vali_j can be either binary (1/0, for binary/categorical features) or float (e.g., 10.24, for numerical features) 242 | :param y_train: label of each sample in the training set 243 | :param Xi_valid: list of list of feature indices of each sample in the validation set 244 | :param Xv_valid: list of list of feature values of each sample in the validation set 245 | :param y_valid: label of each sample in the validation set 246 | :param ealry_stopping: perform early stopping or not 247 | :param refit: refit the model on the train+valid dataset or not 248 | :param save_path: the path to save the model 249 | :return: 250 | """ 251 | """ 252 | pre_process 253 | """ 254 | if save_path and not os.path.exists('/'.join(save_path.split('/')[0:-1])): 255 | print("Save path is not existed!") 256 | return 257 | 258 | if self.verbose: 259 | print("pre_process data ing...") 260 | is_valid = False 261 | Xi_train = np.array(Xi_train).reshape((-1,self.field_size,1)) 262 | Xv_train = np.array(Xv_train) 263 | y_train = np.array(y_train) 264 | x_size = Xi_train.shape[0] 265 | if Xi_valid: 266 | Xi_valid = np.array(Xi_valid).reshape((-1,self.field_size,1)) 267 | Xv_valid = np.array(Xv_valid) 268 | y_valid = np.array(y_valid) 269 | x_valid_size = Xi_valid.shape[0] 270 | is_valid = True 271 | if self.verbose: 272 | print("pre_process data finished") 273 | 274 | """ 275 | train model 276 | """ 277 | model = self.train() 278 | optimizer = torch.optim.SGD(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 279 | if self.optimizer_type == 'adam': 280 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 281 | elif self.optimizer_type == 'rmsp': 282 | optimizer = torch.optim.RMSprop(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 283 | elif self.optimizer_type == 'adag': 284 | optimizer = torch.optim.Adagrad(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) 285 | 286 | criterion = 
F.binary_cross_entropy_with_logits 287 | 288 | train_result = [] 289 | valid_result = [] 290 | for epoch in range(self.n_epochs): 291 | total_loss = 0.0 292 | batch_iter = x_size // self.batch_size 293 | epoch_begin_time = time() 294 | batch_begin_time = time() 295 | for i in range(batch_iter+1): 296 | offset = i*self.batch_size 297 | end = min(x_size, offset+self.batch_size) 298 | if offset == end: 299 | break 300 | batch_xi = Variable(torch.LongTensor(Xi_train[offset:end])) 301 | batch_xv = Variable(torch.FloatTensor(Xv_train[offset:end])) 302 | batch_y = Variable(torch.FloatTensor(y_train[offset:end])) 303 | if self.use_cuda: 304 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 305 | optimizer.zero_grad() 306 | outputs = model(batch_xi, batch_xv) 307 | loss = criterion(outputs, batch_y) 308 | loss.backward() 309 | optimizer.step() 310 | 311 | total_loss += loss.data[0] 312 | if self.verbose: 313 | if i % 100 == 99: # print every 100 mini-batches 314 | eval = self.evaluate(batch_xi, batch_xv, batch_y) 315 | print('[%d, %5d] loss: %.6f metric: %.6f time: %.1f s' % 316 | (epoch + 1, i + 1, total_loss, eval, time()-batch_begin_time)) 317 | total_loss = 0.0 318 | batch_begin_time = time() 319 | 320 | train_loss, train_eval = self.eval_by_batch(Xi_train,Xv_train,y_train,x_size) 321 | train_result.append(train_eval) 322 | print('*'*50) 323 | print('[%d] loss: %.6f metric: %.6f time: %.1f s' % 324 | (epoch + 1, train_loss, train_eval, time()-epoch_begin_time)) 325 | print('*'*50) 326 | 327 | if is_valid: 328 | valid_loss, valid_eval = self.eval_by_batch(Xi_valid, Xv_valid, y_valid, x_valid_size) 329 | valid_result.append(valid_eval) 330 | print('*' * 50) 331 | print('[%d] loss: %.6f metric: %.6f time: %.1f s' % 332 | (epoch + 1, valid_loss, valid_eval,time()-epoch_begin_time)) 333 | print('*' * 50) 334 | if save_path: 335 | torch.save(self.state_dict(),save_path) 336 | if is_valid and ealry_stopping and self.training_termination(valid_result): 337 | print("early stop at [%d] epoch!" 
% (epoch+1)) 338 | break 339 | 340 | # fit a few more epoch on train+valid until result reaches the best_train_score 341 | if is_valid and refit: 342 | if self.verbose: 343 | print("refitting the model") 344 | if self.greater_is_better: 345 | best_epoch = np.argmax(valid_result) 346 | else: 347 | best_epoch = np.argmin(valid_result) 348 | best_train_score = train_result[best_epoch] 349 | Xi_train = np.concatenate((Xi_train,Xi_valid)) 350 | Xv_train = np.concatenate((Xv_train,Xv_valid)) 351 | y_train = np.concatenate((y_train,y_valid)) 352 | x_size = x_size + x_valid_size 353 | self.shuffle_in_unison_scary(Xi_train,Xv_train,y_train) 354 | for epoch in range(64): 355 | batch_iter = x_size // self.batch_size 356 | for i in range(batch_iter + 1): 357 | offset = i * self.batch_size 358 | end = min(x_size, offset + self.batch_size) 359 | if offset == end: 360 | break 361 | batch_xi = Variable(torch.LongTensor(Xi_train[offset:end])) 362 | batch_xv = Variable(torch.FloatTensor(Xv_train[offset:end])) 363 | batch_y = Variable(torch.FloatTensor(y_train[offset:end])) 364 | if self.use_cuda: 365 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 366 | optimizer.zero_grad() 367 | outputs = model(batch_xi, batch_xv) 368 | loss = criterion(outputs, batch_y) 369 | loss.backward() 370 | optimizer.step() 371 | train_loss, train_eval = self.eval_by_batch(Xi_train, Xv_train, y_train, x_size) 372 | if save_path: 373 | torch.save(self.state_dict(), save_path) 374 | if abs(best_train_score-train_eval) < 0.001 or \ 375 | (self.greater_is_better and train_eval > best_train_score) or \ 376 | ((not self.greater_is_better) and train_result < best_train_score): 377 | break 378 | if self.verbose: 379 | print("refit finished") 380 | 381 | def eval_by_batch(self,Xi, Xv, y, x_size): 382 | total_loss = 0.0 383 | y_pred = [] 384 | batch_size = 16384 385 | batch_iter = x_size // batch_size 386 | criterion = F.binary_cross_entropy_with_logits 387 | model = self.eval() 388 | for i in range(batch_iter+1): 389 | offset = i * batch_size 390 | end = min(x_size, offset + batch_size) 391 | if offset == end: 392 | break 393 | batch_xi = Variable(torch.LongTensor(Xi[offset:end])) 394 | batch_xv = Variable(torch.FloatTensor(Xv[offset:end])) 395 | batch_y = Variable(torch.FloatTensor(y[offset:end])) 396 | if self.use_cuda: 397 | batch_xi, batch_xv, batch_y = batch_xi.cuda(), batch_xv.cuda(), batch_y.cuda() 398 | outputs = model(batch_xi, batch_xv) 399 | pred = F.sigmoid(outputs).cpu() 400 | y_pred.extend(pred.data.numpy()) 401 | loss = criterion(outputs, batch_y) 402 | total_loss += loss.data[0]*(end-offset) 403 | total_metric = self.eval_metric(y,y_pred) 404 | return total_loss/x_size, total_metric 405 | 406 | # shuffle three lists simutaneously 407 | def shuffle_in_unison_scary(self, a, b, c): 408 | rng_state = np.random.get_state() 409 | np.random.shuffle(a) 410 | np.random.set_state(rng_state) 411 | np.random.shuffle(b) 412 | np.random.set_state(rng_state) 413 | np.random.shuffle(c) 414 | 415 | def training_termination(self, valid_result): 416 | if len(valid_result) > 4: 417 | if self.greater_is_better: 418 | if valid_result[-1] < valid_result[-2] and \ 419 | valid_result[-2] < valid_result[-3] and \ 420 | valid_result[-3] < valid_result[-4]: 421 | return True 422 | else: 423 | if valid_result[-1] > valid_result[-2] and \ 424 | valid_result[-2] > valid_result[-3] and \ 425 | valid_result[-3] > valid_result[-4]: 426 | return True 427 | return False 428 | 429 | def predict(self, Xi, Xv): 430 | """ 431 | 
:param Xi: the same as fit function 432 | :param Xv: the same as fit function 433 | :return: output, ont-dim array 434 | """ 435 | Xi = np.array(Xi).reshape((-1,self.field_size,1)) 436 | Xi = Variable(torch.LongTensor(Xi)) 437 | Xv = Variable(torch.FloatTensor(Xv)) 438 | if self.use_cuda and torch.cuda.is_available(): 439 | Xi, Xv = Xi.cuda(), Xv.cuda() 440 | 441 | model = self.eval() 442 | pred = F.sigmoid(model(Xi, Xv)).cpu() 443 | return (pred.data.numpy() > 0.5) 444 | 445 | def predict_proba(self, Xi, Xv): 446 | Xi = np.array(Xi).reshape((-1, self.field_size, 1)) 447 | Xi = Variable(torch.LongTensor(Xi)) 448 | Xv = Variable(torch.FloatTensor(Xv)) 449 | if self.use_cuda and torch.cuda.is_available(): 450 | Xi, Xv = Xi.cuda(), Xv.cuda() 451 | 452 | model = self.eval() 453 | pred = F.sigmoid(model(Xi, Xv)).cpu() 454 | return pred.data.numpy() 455 | 456 | def inner_predict(self, Xi, Xv): 457 | """ 458 | :param Xi: tensor of feature index 459 | :param Xv: tensor of feature value 460 | :return: output, numpy 461 | """ 462 | model = self.eval() 463 | pred = F.sigmoid(model(Xi, Xv)).cpu() 464 | return (pred.data.numpy() > 0.5) 465 | 466 | def inner_predict_proba(self, Xi, Xv): 467 | """ 468 | :param Xi: tensor of feature index 469 | :param Xv: tensor of feature value 470 | :return: output, numpy 471 | """ 472 | model = self.eval() 473 | pred = F.sigmoid(model(Xi, Xv)).cpu() 474 | return pred.data.numpy() 475 | 476 | 477 | def evaluate(self, Xi, Xv, y): 478 | """ 479 | :param Xi: tensor of feature index 480 | :param Xv: tensor of feature value 481 | :param y: tensor of labels 482 | :return: metric of the evaluation 483 | """ 484 | y_pred = self.inner_predict_proba(Xi, Xv) 485 | return self.eval_metric(y.cpu().data.numpy(), y_pred) 486 | 487 | """ 488 | test part 489 | """ 490 | import sys 491 | sys.path.append('../') 492 | from utils import data_preprocess 493 | 494 | result_dict = data_preprocess.read_criteo_data('../data/train.csv', '../data/category_emb.csv') 495 | test_dict = data_preprocess.read_criteo_data('../data/test.csv', '../data/category_emb.csv') 496 | with torch.cuda.device(2): 497 | pnn = PNN(39, result_dict['feature_sizes'], batch_size=128 * 64, verbose=True, use_cuda=True,weight_decay=0.00001, use_inner_product=True, use_outer_product=True).cuda() 498 | pnn.fit(result_dict['index'], result_dict['value'], result_dict['label'], 499 | test_dict['index'], test_dict['value'], test_dict['label'],ealry_stopping=True,refit=False,save_path='../data/model/pnn.pkl') 500 | -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/univeryinli/recommender-system-pytorch/44d7561d4778d2d62fa92855d2b0a2c43c6ca3e4/model/__init__.py -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/univeryinli/recommender-system-pytorch/44d7561d4778d2d62fa92855d2b0a2c43c6ca3e4/utils/__init__.py -------------------------------------------------------------------------------- /utils/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/univeryinli/recommender-system-pytorch/44d7561d4778d2d62fa92855d2b0a2c43c6ca3e4/utils/__pycache__/__init__.cpython-36.pyc 
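For reference, here is a minimal usage sketch of the estimator-style interface the models above share (shown for PNN, but NFM and the other models expose the same fit / predict / predict_proba methods). It is not part of the repository: the paths mirror the module-level test part of PNN.py, a GPU is assumed to be available as in that test part, and it further assumes the test part has been guarded with `if __name__ == '__main__':` (as written it runs whenever the module is imported).

    # Sketch only: reload the state_dict that fit() saved at save_path and score held-out data.
    import torch
    from model.PNN import PNN
    from utils import data_preprocess

    test_dict = data_preprocess.read_criteo_data('../data/test.csv', '../data/category_emb.csv')
    # The constructor flags must match the saved model so the state_dict keys line up.
    pnn = PNN(39, test_dict['feature_sizes'], use_inner_product=True, use_outer_product=True).cuda()
    pnn.load_state_dict(torch.load('../data/model/pnn.pkl'))   # fit() stores a plain state_dict
    probs = pnn.predict_proba(test_dict['index'], test_dict['value'])   # per-sample probabilities
    clicks = pnn.predict(test_dict['index'], test_dict['value'])        # boolean array, 0.5 threshold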
-------------------------------------------------------------------------------- /utils/__pycache__/data_preprocess.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/univeryinli/recommender-system-pytorch/44d7561d4778d2d62fa92855d2b0a2c43c6ca3e4/utils/__pycache__/data_preprocess.cpython-36.pyc -------------------------------------------------------------------------------- /utils/common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/univeryinli/recommender-system-pytorch/44d7561d4778d2d62fa92855d2b0a2c43c6ca3e4/utils/common.py -------------------------------------------------------------------------------- /utils/data_preprocess.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | Created on Dec 10, 2017 5 | @author: jachin,Nie 6 | 7 | This script is used to preprocess the raw data file 8 | 9 | """ 10 | 11 | import sys 12 | import math 13 | import argparse 14 | import hashlib, csv, math, os, pickle, subprocess 15 | import pandas as pd 16 | 17 | def gen_criteo_category_index(file_path): 18 | cate_dict = [] 19 | for i in range(26): 20 | cate_dict.append({}) 21 | for line in open(file_path, 'r'): 22 | datas = line.replace('\n','').split('\t') 23 | for i, item in enumerate(datas[14:]): 24 | if not cate_dict[i].has_key(item): 25 | cate_dict[i][item] = len(cate_dict[i]) 26 | return cate_dict 27 | 28 | def write_criteo_category_index(file_path, cate_dict_arr): 29 | f = open(file_path,'w') 30 | for i, cate_dict in enumerate(cate_dict_arr): 31 | for key in cate_dict: 32 | f.write(str(i)+','+key+','+str(cate_dict[key])+'\n') 33 | 34 | def load_criteo_category_index(file_path): 35 | f = open(file_path,'r') 36 | cate_dict = [] 37 | for i in range(39): 38 | cate_dict.append({}) 39 | for line in f: 40 | datas = line.strip().split(',') 41 | cate_dict[int(datas[0])][datas[1]] = int(datas[2]) 42 | return cate_dict 43 | 44 | def read_raw_criteo_data(file_path, embedding_path, type): 45 | """ 46 | :param file_path: string 47 | :param type: string (train or test) 48 | :return: result: dict 49 | result['continuous_feat']:two-dim array 50 | result['category_feat']:dict 51 | result['category_feat']['index']:two-dim array 52 | result['category_feat']['value']:two-dim array 53 | result['label']: one-dim array 54 | """ 55 | begin_index = 1 56 | if type != 'train' and type != 'test': 57 | print("type error") 58 | return {} 59 | elif type == 'test': 60 | begin_index = 0 61 | cate_embedding = load_criteo_category_index(embedding_path) 62 | result = {'continuous_feat':[], 'category_feat':{'index':[],'value':[]}, 'label':[], 'feature_sizes':[]} 63 | for i, item in enumerate(cate_embedding): 64 | result['feature_sizes'].append(len(item)) 65 | f = open(file_path) 66 | for line in f: 67 | datas = line.replace('\n', '').split('\t') 68 | 69 | indexs = [] 70 | values = [] 71 | flag = True 72 | for i, item in enumerate(datas[begin_index + 13:]): 73 | if not cate_embedding[i].has_key(item): 74 | flag = False 75 | break 76 | indexs.append(cate_embedding[i][item]) 77 | values.append(1) 78 | if not flag: 79 | continue 80 | result['category_feat']['index'].append(indexs) 81 | result['category_feat']['value'].append(values) 82 | 83 | if type == 'train': 84 | result['label'].append(int(datas[0])) 85 | else: 86 | result['label'].append(0) 87 | 88 | continuous_array = [] 89 | for item in 
datas[begin_index:begin_index+13]: 90 | if item == '': 91 | continuous_array.append(-10.0) 92 | elif float(item) < 2.0: 93 | continuous_array.append(float(item)) 94 | else: 95 | continuous_array.append(math.log(float(item))) 96 | result['continuous_feat'].append(continuous_array) 97 | 98 | return result 99 | 100 | def read_criteo_data(file_path,emb_file): 101 | result = {'lable':[], 'index':[],'value':[],'feature_sizes':[]} 102 | cate_dict = load_criteo_category_index(emb_file) 103 | for item in cate_dict: 104 | result['feature_sizes'].append(len(item)) 105 | f = open(file_path,'r') 106 | for line in f: 107 | datas = line.strip().split(',') 108 | result['lable'].append(int(datas[0])) 109 | indexs = [int(item) for item in datas[1:]] 110 | values = [1 for i in range(39)] 111 | result['index'].append(indexs) 112 | result['value'].append(values) 113 | return result 114 | 115 | def gen_criteo_category_emb_from_libffmfile(filepath, dir_path): 116 | fr = open(filepath) 117 | cate_emb_arr = [{} for i in range(39)] 118 | for line in fr: 119 | datas = line.strip().split(' ') 120 | for item in datas[1:]: 121 | [filed, index, value] = item.split(':') 122 | filed = int(filed) 123 | index = int(index) 124 | if not cate_emb_arr[filed].has_key(index): 125 | cate_emb_arr[filed][index] = len(cate_emb_arr[filed]) 126 | 127 | with open(dir_path, 'w') as f: 128 | for i,item in enumerate(cate_emb_arr): 129 | for key in item: 130 | f.write(str(i)+','+str(key)+','+str(item[key])+'\n') 131 | 132 | def gen_emb_input_file(filepath, emb_file, dir_path): 133 | cate_dict = load_criteo_category_index(emb_file) 134 | fr = open(filepath,'r') 135 | fw = open(dir_path,'w') 136 | for line in fr: 137 | row = [] 138 | datas = line.strip().split(' ') 139 | row.append(datas[0]) 140 | for item in datas[1:]: 141 | [filed, index, value] = item.split(':') 142 | filed = int(filed) 143 | row.append(str(cate_dict[filed][index])) 144 | fw.write(','.join(row)+'\n') 145 | 146 | 147 | def read_csv_dataset(train_csv,task='like'): 148 | train_dict={} 149 | test_dict={} 150 | train_csv=pd.read_csv(train_csv) 151 | if task=='like': 152 | lable=train_csv[task] 153 | elif task=='finish': 154 | lable=train_csv[task] 155 | train_dict['lable']=lable[0:int(len(lable)*0.8)].to_list() 156 | test_dict['lable']=lable[int(len(lable)*0.8)+1:-1].to_list() 157 | 158 | feild = ['uid','user_city','item_id','author_id','item_city','channel','music_id','video_duration'] 159 | value=[1]*len(feild) 160 | values=[value for i in range(len(lable))] 161 | train_dict['value']=values[0:int(len(lable)*0.8)] 162 | test_dict['value']=values[int(len(lable)*0.8)+1:-1] 163 | 164 | feature_sizes=[73974,397,4122689,850308,462,5,89779,641] 165 | train_dict['feature_sizes']=feature_sizes 166 | test_dict['feature_sizes']=feature_sizes 167 | ''' 168 | creat_time_segment=35898.1 169 | min_num=53015373867 170 | train_csv['creat_time']=train_csv['creat_time'].apply(lambda x:int((x-min_num)/creat_time_segment)) 171 | ''' 172 | 173 | temp=train_csv[feild].values 174 | train_dict['index']=temp[0:int(len(lable)*0.8)].tolist() 175 | test_dict['index']=temp[int(len(lable)*0.8)+1:-1].tolist() 176 | 177 | return train_dict,test_dict 178 | 179 | 180 | def read_csv_dataset_pred(pred_csv,task='like'): 181 | pred_dict={} 182 | train_csv=pd.read_csv(pred_csv) 183 | if task=='like': 184 | lable=train_csv[task] 185 | elif task=='finish': 186 | lable=train_csv[task] 187 | pred_dict['lable']=lable.to_list() 188 | 189 | feild = 
['uid','user_city','item_id','author_id','item_city','channel','music_id','creat_time','video_duration']
190 | value=[1]*len(feild)
191 | values=[value for i in range(len(lable))]
192 | pred_dict['value']=values
193 | 
194 | feature_sizes=[]
195 | for i in feild:
196 | feature_size=max(train_csv[i])+1
197 | if i=='creat_time':
198 | feature_size=2010
199 | feature_sizes.append(feature_size)
200 | pred_dict['feature_sizes']=feature_sizes
201 | 
202 | creat_time_segment=35898.1
203 | min_num=53015373867
204 | train_csv['creat_time']=train_csv['creat_time'].apply(lambda x:int((x-min_num)/creat_time_segment))
205 | 
206 | temp=train_csv[feild].values
207 | pred_dict['index']=temp.tolist()
208 | return pred_dict
209 | 
--------------------------------------------------------------------------------
/utils/sample.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | 
3 | """
4 | Created on Dec 10, 2017
5 | @author: jachin,Nie
6 | 
7 | This script is used to sample data from the raw dataset
8 | 
9 | python sample.py s_path t_path prob
10 | 
11 | """
12 | 
13 | import argparse
14 | import sys
15 | import random
16 | 
17 | s_path = sys.argv[1]
18 | t_path = sys.argv[2]
19 | prob = float(sys.argv[3])
20 | 
21 | with open(t_path,'wb') as f:
22 | for line in open(s_path,'rb'):
23 | if random.random() < prob:
24 | f.write(line)
25 | 
--------------------------------------------------------------------------------
/utils/split_train.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | 
3 | """
4 | Created on Dec 10, 2017
5 | @author: jachin,Nie
6 | 
7 | This script is used to split the raw dataset into a train file and a test file
8 | 
9 | python split_train.py s_path tr_path te_path prob
10 | 
11 | """
12 | 
13 | import argparse
14 | import sys
15 | import random
16 | 
17 | s_path = sys.argv[1]
18 | tr_path = sys.argv[2]
19 | te_path = sys.argv[3]
20 | prob = float(sys.argv[4])
21 | 
22 | with open(tr_path,'wb') as fr:
23 | with open(te_path,'wb') as fe:
24 | for line in open(s_path,'rb'):
25 | if random.random() < prob:
26 | fr.write(line)
27 | else:
28 | fe.write(line)
29 | 
--------------------------------------------------------------------------------
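As a closing note on the preprocessing contract: every model in this framework consumes the same parallel structures (index, value, label lists plus the per-field feature_sizes), and utils/data_preprocess.py is where they are produced. The sketch below is illustrative only; the paths are placeholders, and the key names, including the 'lable' spelling, are exactly those used by read_criteo_data above. sample.py and split_train.py sit upstream of this as thin command-line filters over the raw file.

    # Sketch of the dictionary layout returned by read_criteo_data (Criteo-style, pre-indexed data).
    from utils import data_preprocess

    train = data_preprocess.read_criteo_data('../data/train.csv', '../data/category_emb.csv')
    # train['index']          -> per-sample list of 39 category indices (ints)
    # train['value']          -> per-sample list of 39 ones (all fields are categorical here)
    # train['lable']          -> per-sample 0/1 label (key is spelled 'lable' throughout this codebase)
    # train['feature_sizes']  -> vocabulary size of each of the 39 fields
    assert len(train['index']) == len(train['value']) == len(train['lable'])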