├── GBFT+LR_simple.py
└── README.md

/GBFT+LR_simple.py:
--------------------------------------------------------------------------------
# coding: utf-8
# pylint: disable = invalid-name, C0111
from __future__ import division
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

# load or create your dataset
print('Load data...')
df_train = pd.read_csv('../train.txt', header=None, sep=' ')
df_test = pd.read_csv('../test.txt', header=None, sep=' ')

y_train = df_train[0]               # training labels
y_test = df_test[0]                 # testing labels
X_train = df_train.drop(0, axis=1)  # training features
X_test = df_test.drop(0, axis=1)    # testing features

# create datasets for lightgbm
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# specify your configurations as a dict
# (the number of trees is passed as num_boost_round below)
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 63,
    'learning_rate': 0.01,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

# number of leaves per tree; used in the feature transformation below
num_leaf = 63

print('Start training...')
# train, validating on the held-out set
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100,
                valid_sets=lgb_eval)

print('Save model...')
# save model to file
gbm.save_model('model.txt')

print('Start predicting...')
# predict the index of the leaf each training sample falls into, per tree
y_pred = gbm.predict(X_train, pred_leaf=True)

# feature transformation: one-hot encode the leaf indices
# (pred_leaf=True returns 0-based leaf indices, so no -1 offset is needed)
print('Writing transformed training data')
transformed_training_matrix = np.zeros([len(y_pred), len(y_pred[0]) * num_leaf], dtype=np.int64)
for i in range(len(y_pred)):
    temp = np.arange(len(y_pred[0])) * num_leaf + np.array(y_pred[i])
    transformed_training_matrix[i][temp] = 1

# equivalent element-wise version:
# for i in range(len(y_pred)):
#     for j in range(len(y_pred[i])):
#         transformed_training_matrix[i][j * num_leaf + y_pred[i][j]] = 1
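# An equivalent, vectorized alternative (a sketch, not part of the original
# script): scikit-learn's OneHotEncoder builds the same indicator matrix as
# a sparse matrix, which scales better for many trees/leaves.
# from sklearn.preprocessing import OneHotEncoder
# enc = OneHotEncoder(categories=[np.arange(num_leaf)] * len(y_pred[0]))
# transformed_training_sparse = enc.fit_transform(y_pred)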
# predict the index of the leaf each testing sample falls into, per tree
y_pred = gbm.predict(X_test, pred_leaf=True)

# feature transformation: one-hot encode the leaf indices
print('Writing transformed testing data')
transformed_testing_matrix = np.zeros([len(y_pred), len(y_pred[0]) * num_leaf], dtype=np.int64)
for i in range(len(y_pred)):
    temp = np.arange(len(y_pred[0])) * num_leaf + np.array(y_pred[i])
    transformed_testing_matrix[i][temp] = 1

print('Calculate feature importances...')
# feature importances: split counts, then total gain
print('Feature importances:', list(gbm.feature_importance()))
print('Feature importances:', list(gbm.feature_importance('gain')))


# Logistic Regression
print('Logistic Regression Start')

# sweep over the inverse regularization strength C
c = np.array([1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001])
for t in range(len(c)):
    lm = LogisticRegression(penalty='l2', C=c[t])  # logistic model construction
    lm.fit(transformed_training_matrix, y_train)   # fit on the transformed training data

    # y_pred_label = lm.predict(transformed_training_matrix)      # hard labels, training data
    # y_pred_label = lm.predict(transformed_testing_matrix)       # hard labels, testing data
    # y_pred_est = lm.predict_proba(transformed_training_matrix)  # class probabilities, training data
    y_pred_est = lm.predict_proba(transformed_testing_matrix)     # class probabilities, testing data

    # calculate prediction accuracy
    # num = 0
    # for i in range(len(y_pred_label)):
    #     if y_test[i] == y_pred_label[i]:
    #         num += 1
    # print('penalty parameter is ' + str(c[t]))
    # print('prediction accuracy is ' + str(num / len(y_pred_label)))

    # calculate the cross-entropy on the testing data (labels assumed in {-1, +1});
    # the NE metric of the Facebook paper additionally divides by the entropy
    # of the average empirical CTR
    NE = (-1) / len(y_pred_est) * sum(((1 + y_test) / 2 * np.log(y_pred_est[:, 1]) +
                                       (1 - y_test) / 2 * np.log(1 - y_pred_est[:, 1])))
    print('Normalized Cross Entropy ' + str(NE))
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# LightGBM-GBDT-LR

A simple Python implementation of GBDT + LR for CTR prediction.

The GBDT part is handled by LightGBM, a gradient boosting framework from Microsoft; please install it first:

https://github.com/Microsoft/LightGBM

The Logistic Regression part is handled by scikit-learn.

The main idea comes from the 2014 Facebook paper on combining GBDT and LR for CTR prediction:

http://quinonero.net/Publications/predicting-clicks-facebook.pdf

GBDT is used for feature transformation, while the LR uses the transformed data for prediction; a minimal sketch of the pipeline follows.
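A minimal, self-contained sketch of the idea on synthetic data (the parameter values and variable names here are illustrative, not those of the script above):

```python
import numpy as np
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

rng = np.random.RandomState(0)
X, y = rng.randn(1000, 10), rng.randint(0, 2, 1000)  # synthetic binary task

num_leaf = 63
gbm = lgb.train({'objective': 'binary', 'num_leaves': num_leaf, 'verbose': -1},
                lgb.Dataset(X, y), num_boost_round=100)

# each sample is represented by the index of the leaf it lands in, per tree
leaves = gbm.predict(X, pred_leaf=True)  # shape: (n_samples, n_trees)
enc = OneHotEncoder(categories=[np.arange(num_leaf)] * leaves.shape[1])
X_trans = enc.fit_transform(leaves)      # sparse one-hot leaf features

# LR learns one weight per (tree, leaf) pair
lr = LogisticRegression(penalty='l2', C=0.1).fit(X_trans, y)
ctr = lr.predict_proba(X_trans)[:, 1]    # predicted click probabilities
```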