├── README.md └── xgboost_gbdt_lr_utils.py /README.md: -------------------------------------------------------------------------------- 1 | # gbdt_lr_in_recsys 2 | * First, using gbdt+lr in recommend system and comparing the auc of lr, gbdt, gbdt+lr. 3 | * Second, using hyperopt-sklearn to automatically tune the hyperparameters of gbdt. 4 | * Data is private and not available, but just the commonly used in recsys(the commonly used libsvm format data file). 5 | -------------------------------------------------------------------------------- /xgboost_gbdt_lr_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin python 2 | #-*- coding:utf-8 -*- 3 | ''' 4 | author:zhiqiangxu 5 | date:2016/8/7 6 | ''' 7 | import xgboost as xgb 8 | from sklearn.preprocessing import OneHotEncoder 9 | from sklearn.cross_validation import train_test_split 10 | from sklearn.datasets import load_svmlight_file 11 | from sklearn.linear_model import LogisticRegression 12 | from sklearn.metrics import roc_curve, auc, roc_auc_score 13 | from sklearn.externals import joblib 14 | from sklearn.ensemble import GradientBoostingClassifier 15 | from sklearn.grid_search import GridSearchCV 16 | from sklearn.externals import joblib 17 | import matplotlib.pyplot as plt 18 | import numpy as np 19 | import pandas as pd 20 | from scipy.sparse import csr_matrix 21 | from scipy.sparse import hstack 22 | import time, os, random, sys 23 | import math 24 | import hyperopt.tpe 25 | import hpsklearn.components 26 | import hpsklearn.demo_support 27 | random.seed(1) 28 | 29 | #choosing some samples and random split into train set and test set 30 | def datasetSplit(libSvmFile, trainFileName, testFileName, testSetRatio, lines): 31 | dataFile = open(libSvmFile, 'r') 32 | dataList = dataFile.readlines() 33 | totalLines = len(dataList) 34 | testFileLength = int(testSetRatio*lines) 35 | trainFileLength = lines - testFileLength 36 | List = range(totalLines) 37 | random.shuffle(List) 38 | trainFile = open(trainFileName, 'w') 39 | testFile = open(testFileName, 'w') 40 | posSampleCnt = 0 41 | for i in range(lines): 42 | if float(dataList[List[i]].split(' ')[0]) > 0.0: 43 | posSampleCnt = posSampleCnt + 1 44 | if i < trainFileLength: 45 | trainFile.write(dataList[List[i]]) 46 | else: 47 | testFile.write(dataList[List[i]]) 48 | dataFile.close() 49 | trainFile.close() 50 | testFile.close() 51 | print('Positive Sample Count: %d' % posSampleCnt) 52 | return posSampleCnt 53 | 54 | #calculate the positive and negative samples counts 55 | def calcPosNegCnt(libSvmFile): 56 | dataFile = open(libSvmFile, 'r') 57 | dataList = dataFile.readlines() 58 | posSampleCnt = 0 59 | negSampleCnt = 0 60 | for i in range(len(dataList)): 61 | if float(dataList[i].split(' ')[0]) > 0.0: 62 | posSampleCnt = posSampleCnt + 1 63 | else: 64 | negSampleCnt = negSampleCnt + 1 65 | print 'Positive Sample: %d' % posSampleCnt 66 | print 'Negative Sample: %d' % negSampleCnt 67 | 68 | #training xgboost and using xgboost to encode test set features 69 | def xgboost_lr_train_test(libsvmFileNameInitial): 70 | posSampleCnt = datasetSplit(libsvmFileNameInitial, 'data_train_th100', 'data_test_th100', 0.2, 1100000) 71 | X_train, y_train = load_svmlight_file('data_train_th100') 72 | print(X_train.shape) 73 | X_test, y_test = load_svmlight_file('data_test_th100') 74 | #training xgboost 75 | negPosRatio = (1100000-posSampleCnt)/posSampleCnt 76 | xgbclf = xgb.XGBClassifier(nthread=4, scale_pos_weight=negPosRatio, learning_rate=0.08, 77 | n_estimators=120, max_depth=5, gamma=0, subsample=0.9, colsample_bytree=0.5) 78 | xgbclf.fit(X_train, y_train) 79 | y_pred_train = xgbclf.predict_proba(X_train)[:, 1] 80 | xgb_train_auc = roc_auc_score(y_train, y_pred_train) 81 | print('xgboost train auc: %.5f' % xgb_train_auc) 82 | y_pred_test = xgbclf.predict_proba(X_test)[:, 1] 83 | xgb_test_auc = roc_auc_score(y_test, y_pred_test) 84 | print('xgboost test auc: %.5f' % xgb_test_auc) 85 | #using xgboost to encode train set and test set features 86 | X_train_leaves = xgbclf.apply(X_train) 87 | train_rows = X_train_leaves.shape[0] 88 | X_test_leaves = xgbclf.apply(X_test) 89 | X_leaves = np.concatenate((X_train_leaves, X_test_leaves), axis=0) 90 | X_leaves = X_leaves.astype(np.int32) 91 | (rows, cols) = X_leaves.shape 92 | cum_count = np.zeros((1, cols), dtype=np.int32) 93 | for j in range(cols): 94 | if j == 0: 95 | cum_count[0][j] = len(np.unique(X_leaves[:, j])) 96 | else: 97 | cum_count[0][j] = len(np.unique(X_leaves[:, j])) + cum_count[0][j-1] 98 | print('Transform features genenrated by xgboost...') 99 | for j in range(cols): 100 | keyMapDict = {} 101 | if j == 0: 102 | initial_index = 1 103 | else: 104 | initial_index = cum_count[0][j-1]+1 105 | for i in range(rows): 106 | if keyMapDict.has_key(X_leaves[i, j]) == False: 107 | keyMapDict[X_leaves[i, j]] = initial_index 108 | X_leaves[i, j] = initial_index 109 | initial_index = initial_index + 1 110 | else: 111 | X_leaves[i, j] = keyMapDict[X_leaves[i, j]] 112 | #writing encoded features into file 113 | print('Write xgboost learned features to file ...') 114 | xgbFeatureLibsvm = open('xgb_feature_libsvm', 'w') 115 | for i in range(rows): 116 | if i < train_rows: 117 | xgbFeatureLibsvm.write(str(y_train[i])) 118 | else: 119 | xgbFeatureLibsvm.write(str(y_test[i-train_rows])) 120 | for j in range(cols): 121 | xgbFeatureLibsvm.write(' '+str(X_leaves[i, j])+':1.0') 122 | xgbFeatureLibsvm.write('\n') 123 | xgbFeatureLibsvm.close() 124 | 125 | #using xgboost encoded feature in lr to calculate auc 126 | def xgb_feature_lr_train_test(xgbfeaturefile, origin_libsvm_file): 127 | datasetSplit(origin_libsvm_file, 'data_train_th100', 'data_test_th100', 0.2, 1100000) 128 | datasetSplit(xgbfeaturefile, 'xgb_feature_train_libsvm','xgb_feature_test_libsvm', 0.2, 1100000) 129 | X_train_origin, y_train_origin = load_svmlight_file('data_train_th100') 130 | X_test_origin, y_test_origin = load_svmlight_file('data_test_th100') 131 | X_train, y_train = load_svmlight_file('xgb_feature_train_libsvm') 132 | print(X_train.shape) 133 | X_test, y_test = load_svmlight_file('xgb_feature_test_libsvm') 134 | print(X_test.shape) 135 | 136 | #fittting lr using just xgboost encoded feature 137 | lr = LogisticRegression(n_jobs=-1, C=0.1, penalty='l1') 138 | lr.fit(X_train, y_train) 139 | joblib.dump(lr, 'lr.m') 140 | y_pred_train = lr.predict_proba(X_train)[:, 1] 141 | lr_train_auc = roc_auc_score(y_train, y_pred_train) 142 | print('LR Train AUC: %.5f' % lr_train_auc) 143 | y_pred_test = lr.predict_proba(X_test)[:, 1] 144 | lr_test_auc = roc_auc_score(y_test, y_pred_test) 145 | print('LR Test AUC: %.5f' % lr_test_auc) 146 | 147 | # fitting lr using xgboost encoded feature and original feature 148 | X_train_ext = hstack([X_train_origin, X_train]) 149 | print(X_train_ext.shape) 150 | del(X_train) 151 | del(X_train_origin) 152 | X_test_ext = hstack([X_test_origin, X_test]) 153 | print(X_test_ext.shape) 154 | del(X_test) 155 | del(X_test_origin) 156 | lr = LogisticRegression(n_jobs=-1, C=0.1, penalty='l1') 157 | lr.fit(X_train_ext, y_train) 158 | joblib.dump(lr, 'lr_ext.m') 159 | y_pred_train = lr.predict_proba(X_train_ext)[:, 1] 160 | lr_train_auc = roc_auc_score(y_train, y_pred_train) 161 | print('LR Ext Train AUC: %.5f' % lr_train_auc) 162 | y_pred_test = lr.predict_proba(X_test_ext)[:, 1] 163 | lr_test_auc = roc_auc_score(y_test, y_pred_test) 164 | print('LR Ext Test AUC: %.5f' % lr_test_auc) 165 | 166 | #using gbdt, gbdt+lr to calculate auc 167 | def gbdt_lr_train_test(libsvmFileName): 168 | datasetSplit(libsvmFileName, 0.2, 'label_feature_data_train', 'label_feature_data_test', 600000) 169 | X_train, y_train = load_svmlight_file('label_feature_data_train') 170 | X_test, y_test = load_svmlight_file('label_feature_data_test') 171 | gbclf = GradientBoostingClassifier(n_estimators=30, max_depth=4, verbose=0) 172 | tuned_parameter = [{'n_estimators':[30, 40, 50, 60], 'max_depth':[3, 4, 5, 6], 'max_features':[0.5,0.7,0.9]}] 173 | gs_clf = GridSearchCV(gbclf, tuned_parameter, cv=5, scoring='roc_auc') 174 | gs_clf.fit(X_train.toarray(), y_train) 175 | print('best parameters set found: ') 176 | print(gs_clf.best_params_) 177 | y_pred_gbdt = gs_clf.predict_proba(X_test.toarray())[:, 1] 178 | gbdt_auc = roc_auc_score(y_test, y_pred_gbdt) 179 | print('gbdt auc: %.5f' % gbdt_auc) 180 | X_train_leaves = gbclf.apply(X_train)[:,:,0] 181 | (train_rows, cols) = X_train_leaves.shape 182 | X_test_leaves = gbclf.apply(X_test)[:,:,0] 183 | gbdtenc = OneHotEncoder() 184 | X_trans = gbdtenc.fit_transform(np.concatenate((X_train_leaves, X_test_leaves), axis=0)) 185 | lr = LogisticRegression() 186 | lr.fit(X_trans[:train_rows, :], y_train) 187 | y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1] 188 | gbdtlr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1) 189 | print('gbdt+lr auc 1: %.5f' % gbdtlr_auc1) 190 | lr = LogisticRegression(n_jobs=-1) 191 | X_train_ext = hstack([X_trans[:train_rows, :], X_train]) 192 | lr.fit(X_train_ext, y_train) 193 | X_test_ext = hstack([X_trans[train_rows:, :], X_test]) 194 | y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1] 195 | gbdtlr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2) 196 | print('gbdt+lr auc 2: %.5f' % gbdtlr_auc2) 197 | 198 | #using lr to calculate auc on original data and cross featured data 199 | def lr_train_test(libsvmFileInitial, libsvmFileCross): 200 | datasetSplit(libsvmFileInitial, 'data_train_th500', 'data_test_th500', 0.2, 1100000) 201 | datasetSplit(libsvmFileCross, 'data_cross_train_th500', 'data_cross_test_th500', 0.2, 1100000) 202 | X_train_origin, y_train_origin = load_svmlight_file('data_train_th500') 203 | print(X_train_origin.shape) 204 | X_test_origin, y_test_origin = load_svmlight_file('data_test_th500') 205 | print(X_test_origin.shape) 206 | lr = LogisticRegression(C=0.1, penalty='l2') 207 | lr.fit(X_train_origin, y_train_origin) 208 | y_pred_train = lr.predict_proba(X_train_origin)[:, 1] 209 | lr_train_auc = roc_auc_score(y_train_origin, y_pred_train) 210 | print('lr train auc origin: %.5f' % lr_train_auc) 211 | y_pred_test = lr.predict_proba(X_test_origin)[:, 1] 212 | lr_test_auc = roc_auc_score(y_test_origin, y_pred_test) 213 | print('lr test auc origin: %.5f' % lr_test_auc) 214 | X_train_cross, y_train_cross = load_svmlight_file('data_cross_train_th500') 215 | print(X_train_cross.shape) 216 | X_test_cross, y_test_cross = load_svmlight_file('data_cross_test_th500') 217 | print(X_test_cross.shape) 218 | lr = LogisticRegression(C=0.1, penalty='l2') 219 | lr.fit(X_train_cross, y_train_cross) 220 | y_pred_train = lr.predict_proba(X_train_cross)[:, 1] 221 | lr_train_auc = roc_auc_score(y_train_cross, y_pred_train) 222 | print('lr train auc cross: %.5f' % lr_train_auc) 223 | y_pred_test = lr.predict_proba(X_test_cross)[:, 1] 224 | lr_test_auc = roc_auc_score(y_test_cross, y_pred_test) 225 | print('lr test auc cross: %.5f' % lr_test_auc) 226 | 227 | #using hyperopt-sklearn to automatically tune the parameters of gbdt 228 | def hyper_opt(libsvmFile): 229 | datasetSplit(libsvmFile, 'data_train_th100', 'data_test_th100', 0.2, 100000) 230 | X_train, y_train = load_svmlight_file('data_train_th100') 231 | X_train = X_train.toarray() 232 | estimator = hpsklearn.HyperoptEstimator(None, 233 | classifier=hpsklearn.components.any_classifier('clf'), 234 | algo=hyperopt.tpe.suggest, 235 | trial_timeout=10.0, 236 | max_evals=10) 237 | fit_iterator = estimator.fit_iter(X_train, y_train) 238 | fit_iterator.next() 239 | plot_helper = hpsklearn.demo_support.PlotHelper(estimator, mintodate_ylim=(0.0,0.1)) 240 | while len(estimator.trials.trials) < estimator.max_evals: 241 | fit_iterator.send(1) 242 | plot_helper.post_iter() 243 | plot_helper.post_loop() 244 | estimator.retrain_best_model_on_full_data(X_train, y_train) 245 | print 'Best classifier: \n', estimator.best_model() 246 | 247 | if __name__ == '__main__': 248 | calcPosNegCnt('label_feature_data_libsvm') 249 | datasetSplit('50018_20160625_cross_sample', 0.2, 'lr_data_train', 'lr_data_test', 600000) 250 | xgboost_lr_train_test('data_libsvm_th100') 251 | lr_train_test('data_libsvm_th500', 'data_cross_libsvm_th500') 252 | xgb_feature_lr_train_test('xgb_feature_libsvm', 'data_cross_libsvm_th100') 253 | hyper_opt('data_libsvm_th100') 254 | --------------------------------------------------------------------------------