├── .gitattributes ├── README.md ├── lgb.py └── xgb.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 直接跑xgb的代码就能出csv提交。 2 | 3 | 比赛网址:https://www.datafountain.cn/competitions/311/details 4 | 5 | 比赛数据:https://www.datafountain.cn/competitions/311/details/data-evaluation 6 | 7 | 比赛类型:多分类问题 8 | 9 | A榜排名 10 | 11 | 方案1:LightGBM model 0.81245 (2018-09-13 23:13) 12 | 13 | 方案2:XGBoost model 0.8254 排名 30th/1153 (2018-09-14 14:23:04) 14 | 15 | XGBoost的关键参数 16 | 17 | max_depth=12, learning_rate=0.05, 18 | n_estimators=752, silent=True, 19 | objective="multi:softmax", 20 | nthread=4, gamma=0, 21 | max_delta_step=0, subsample=1, colsample_bytree=0.9, colsample_bylevel=0.9, 22 | reg_alpha=1, reg_lambda=1, scale_pos_weight=1, 23 | base_score=0.5, seed=2018, missing=None,num_class=15 24 | 25 | -------------------------------------------------------------------------------- /lgb.py: -------------------------------------------------------------------------------- 1 | import lightgbm as lgb 2 | import pandas as pd 3 | import numpy as np 4 | from sklearn.preprocessing import LabelEncoder 5 | from sklearn.model_selection import train_test_split 6 | from sklearn.metrics import f1_score 7 | train = pd.read_csv('./train.csv') 8 | test = pd.read_csv('./test.csv') 9 | 10 | 11 | #data pre-processing 12 | 13 | train = train[train.gender != '\\N'] 14 | # test = test[test.gender != '\\N'] 15 | train['gender'] = train['gender'].apply(lambda x : int(x)) 16 | test['gender'] = test['gender'].apply(lambda x : int(x)) 17 | 18 | train = train[train.age != '\\N'] 19 | # test = test[test.age != '\\N'] 20 | train['age'] = train['age'].apply(lambda x : int(x)) 21 | test['age'] = test['age'].apply(lambda x 
: int(x)) 22 | 23 | train = train[train['2_total_fee'] != '\\N'] 24 | # test = test[test['2_total_fee'] != '\\N'] 25 | test.loc[test['2_total_fee'] == '\\N','2_total_fee'] = 0.0 26 | train['2_total_fee'] = train['2_total_fee'].apply(lambda x : float(x)) 27 | test['2_total_fee'] = test['2_total_fee'].apply(lambda x : float(x)) 28 | 29 | train = train[train['3_total_fee'] != '\\N'] 30 | # test = test[test['3_total_fee'] != '\\N'] 31 | test.loc[test['3_total_fee'] == '\\N','3_total_fee'] = 0.0 32 | train['3_total_fee'] = train['3_total_fee'].apply(lambda x : float(x)) 33 | test['3_total_fee'] = test['3_total_fee'].apply(lambda x : float(x)) 34 | 35 | 36 | label = train.pop('current_service') 37 | le = LabelEncoder() 38 | label = le.fit_transform(label) 39 | 40 | 41 | feature = [value for value in train.columns.values if 42 | value not in ['user_id']] 43 | 44 | 45 | 46 | 47 | #lgb model 48 | def LGB(): 49 | clf = lgb.LGBMClassifier( 50 | bjective='multiclass', 51 | boosting_type='gbdt', 52 | num_leaves=35, 53 | max_depth=8, 54 | learning_rate=0.05, 55 | seed=2018, 56 | colsample_bytree=0.8, 57 | subsample=0.9, 58 | n_estimators=2000) 59 | return clf 60 | 61 | online = False 62 | online = True # please '# online = False'if you would like to submit 63 | if online: 64 | print ('online') 65 | 66 | model = LGB() 67 | model.fit(train[feature], label, eval_set=[(train[feature], label)], verbose=1) 68 | pred = model.predict(test[feature]) 69 | pred = le.inverse_transform(pred) 70 | test['predict'] = pred 71 | 72 | test[['user_id', 'predict']].to_csv('./sub.csv', index=False) 73 | else: 74 | print ('offline') 75 | train_x,test_x,train_y,test_y = train_test_split(train[feature],label,test_size=0.1,shuffle=True,random_state=2018) 76 | model = LGB() 77 | model.fit(train_x[feature], train_y, eval_set=[(test_x[feature], test_y)], verbose=1,early_stopping_rounds=100) 78 | pred = model.predict(test_x) 79 | print(f1_score(test_y,pred,average='weighted')) 80 | 81 | 
# --------------------------------------------------------------------------------
# /xgb.py:
# --------------------------------------------------------------------------------
import lightgbm as lgb  # not referenced in this script; import kept as in the original
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import xgboost as xgb

train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')
print(train.shape)

# Columns that are read as strings because the literal '\N' marks a missing
# value. Each entry is (column, cast applied to every value, replacement for
# a missing value in the test set or None when no replacement is made).
# NOTE(review): gender/age get no test-side replacement, so a '\N' in those
# test columns would make int() raise -- confirm the test file has none.
_COLUMN_RULES = (
    ('gender', int, None),
    ('age', int, None),
    ('2_total_fee', float, 0.0),
    ('3_total_fee', float, 0.0),
)

for col, cast, test_fill in _COLUMN_RULES:
    # Drop training rows with a missing value. .copy() detaches the filtered
    # frame so the assignment below does not raise pandas' chained-assignment
    # (SettingWithCopy) warning.
    train = train[train[col] != '\\N'].copy()
    # Test rows are never dropped; where a replacement is defined, fill it in.
    if test_fill is not None:
        test.loc[test[col] == '\\N', col] = test_fill
    train[col] = train[col].apply(cast)
    test[col] = test[col].apply(cast)

# Separate the target column and encode its values as consecutive integers;
# `le` is kept so predictions can be mapped back to the original labels.
label = train.pop('current_service')
le = LabelEncoder()
label = le.fit_transform(label)

# Every remaining column except the row identifier is used as a feature.
feature = [col for col in train.columns if col != 'user_id']


# XGBoost model
def XGB():
    """Return an unfitted XGBoost classifier with this script's fixed settings."""
    clf = xgb.XGBClassifier(max_depth=12, learning_rate=0.05,
                            n_estimators=752, silent=True,
                            objective="multi:softmax",
                            nthread=4, gamma=0,
                            max_delta_step=0, subsample=1,
                            colsample_bytree=0.9, colsample_bylevel=0.9,
                            reg_alpha=1, reg_lambda=1, scale_pos_weight=1,
                            base_score=0.5, seed=2018, missing=None,
                            num_class=15)
    return clf


# True  -> fit on all training rows and write ./sub.csv for submission.
# False -> hold out 10% of the training rows and print the weighted F1 score.
# NOTE(review): the repository README says running this script directly
# produces the submission CSV; that only happens with online = True, while the
# checked-in default below is the offline evaluation.
online = False
if online:
    print('online')

    model = XGB()
    # The training set doubles as the eval set, so the logged metric is the
    # training metric only.
    model.fit(train[feature], label, eval_set=[(train[feature], label)], verbose=1)
    # Map the encoded class predictions back to the original service labels.
    test['predict'] = le.inverse_transform(model.predict(test[feature]))

    test[['user_id', 'predict']].to_csv('./sub.csv', index=False)
else:
    print('offline')
    train_x, test_x, train_y, test_y = train_test_split(
        train[feature], label, test_size=0.1, shuffle=True, random_state=2018)
    model = XGB()
    # Stop adding trees once the hold-out metric has not improved for 100 rounds.
    model.fit(train_x, train_y, eval_set=[(test_x, test_y)],
              verbose=1, early_stopping_rounds=100)
    pred = model.predict(test_x)
    print(f1_score(test_y, pred, average='weighted'))

# Optional snippets left disabled by the author: export per-feature importance
# scores and persist the fitted model.
# feature_list = model.feature_importances_
# pd.DataFrame(
#     {
#         'feature':feature,
#         'score':feature_list,
#     }
# ).to_csv('./feature_importance.csv',index=False)

# from sklearn.externals import joblib
# joblib.dump(model, 'gbm.pkl')
# --------------------------------------------------------------------------------