├── .gitattributes
├── README.md
├── lgb.py
└── xgb.py


/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 将 xgb.py 中的 online 设为 True 后直接运行，即可生成提交用的 csv 文件（./sub.csv）；默认的 online = False 只做线下验证，不会输出文件。
 2 | 
 3 | 比赛网址：https://www.datafountain.cn/competitions/311/details
 4 | 
 5 | 比赛数据：https://www.datafountain.cn/competitions/311/details/data-evaluation
 6 | 
 7 | 比赛类型：多分类问题
 8 | 
 9 | A榜排名
10 | 
11 | 方案1：LightGBM model 0.81245 (2018-09-13 23:13)
12 | 
13 | 方案2：XGBoost model 0.8254 排名 30th/1153 (2018-09-14 14:23:04)
14 | 
15 | XGBoost的关键参数
16 | 
17 | max_depth=12, learning_rate=0.05,
18 | n_estimators=752, silent=True,
19 | objective="multi:softmax",
20 | nthread=4, gamma=0,
21 | max_delta_step=0, subsample=1, colsample_bytree=0.9, colsample_bylevel=0.9,
22 | reg_alpha=1, reg_lambda=1, scale_pos_weight=1,
23 | base_score=0.5, seed=2018, missing=None,num_class=15
24 | 
25 | 


--------------------------------------------------------------------------------
/lgb.py:
--------------------------------------------------------------------------------
 1 | import lightgbm as lgb
 2 | import pandas as pd
 3 | import numpy as np
 4 | from sklearn.preprocessing import LabelEncoder
 5 | from sklearn.model_selection import train_test_split
 6 | from sklearn.metrics import f1_score
 7 | train = pd.read_csv('./train.csv')
 8 | test = pd.read_csv('./test.csv')
 9 | 
10 | # Data pre-processing: some cells hold the literal string '\N' instead of a value
11 | # (presumably the export's NULL marker — confirm against the data description).
12 | # Training rows containing it are dropped; test rows are never dropped.
13 | train = train[train.gender != '\\N']
14 | # NOTE(review): int(x) below raises ValueError if test.gender still contains '\N'.
15 | train['gender'] = train['gender'].apply(lambda x : int(x))
16 | test['gender'] = test['gender'].apply(lambda x : int(x))
17 | 
18 | train = train[train.age != '\\N']
19 | # NOTE(review): same caveat as gender — test.age is converted without any filtering.
20 | train['age'] = train['age'].apply(lambda x : int(x))
21 | test['age'] = test['age'].apply(lambda x : int(x))
22 | 
23 | train = train[train['2_total_fee'] != '\\N']
24 | # For the fee columns, '\N' in test is replaced by 0.0 instead of dropping the row.
25 | test.loc[test['2_total_fee'] == '\\N','2_total_fee'] = 0.0
26 | train['2_total_fee'] = train['2_total_fee'].apply(lambda x : float(x))
27 | test['2_total_fee'] = test['2_total_fee'].apply(lambda x : float(x))
28 | 
29 | train = train[train['3_total_fee'] != '\\N']
30 | # Same treatment for '3_total_fee': '\N' in test becomes 0.0.
31 | test.loc[test['3_total_fee'] == '\\N','3_total_fee'] = 0.0
32 | train['3_total_fee'] = train['3_total_fee'].apply(lambda x : float(x))
33 | test['3_total_fee'] = test['3_total_fee'].apply(lambda x : float(x))
34 | 
35 | # Remove the target column from train and encode its classes as integer ids.
36 | label = train.pop('current_service')
37 | le = LabelEncoder()
38 | label = le.fit_transform(label)
39 | 
40 | # Model inputs: every remaining training column except 'user_id'.
41 | feature = [value for value in train.columns.values if
42 |                    value not in ['user_id']]
44 | 
45 | 
46 | 
47 | #lgb model
48 | def LGB():
49 |         clf = lgb.LGBMClassifier(
50 |                 bjective='multiclass',
51 |                 boosting_type='gbdt',
52 |                 num_leaves=35,
53 |                 max_depth=8,
54 |                 learning_rate=0.05,
55 |                 seed=2018,
56 |                 colsample_bytree=0.8,
57 |                 subsample=0.9,
58 |                 n_estimators=2000)
59 |         return clf
60 | 
61 | online = False
62 | online = True # please '# online = False'if you would like to submit
63 | if online:
64 |         print ('online')
65 | 
66 |         model = LGB()
67 |         model.fit(train[feature], label, eval_set=[(train[feature], label)], verbose=1)
68 |         pred = model.predict(test[feature])
69 |         pred = le.inverse_transform(pred)
70 |         test['predict'] = pred
71 | 
72 |         test[['user_id', 'predict']].to_csv('./sub.csv', index=False)
73 | else:
74 |         print ('offline')
75 |         train_x,test_x,train_y,test_y = train_test_split(train[feature],label,test_size=0.1,shuffle=True,random_state=2018)
76 |         model = LGB()
77 |         model.fit(train_x[feature], train_y, eval_set=[(test_x[feature], test_y)], verbose=1,early_stopping_rounds=100)
78 |         pred = model.predict(test_x)
79 |         print(f1_score(test_y,pred,average='weighted'))
80 | 
81 | 


--------------------------------------------------------------------------------
/xgb.py:
--------------------------------------------------------------------------------
 1 | import lightgbm as lgb
 2 | import pandas as pd
 3 | import numpy as np
 4 | from sklearn.preprocessing import LabelEncoder
 5 | from sklearn.model_selection import train_test_split
 6 | from sklearn.metrics import f1_score
 7 | import xgboost as xgb
 8 | train = pd.read_csv('./train.csv')
 9 | test = pd.read_csv('./test.csv')
10 | print (train.shape)
11 | 
12 | # Data pre-processing: some cells hold the literal string '\N' instead of a value
13 | # (presumably the export's NULL marker — confirm). Such training rows are dropped.
14 | train = train[train.gender != '\\N']
15 | # NOTE(review): int(x) below raises ValueError if test.gender still contains '\N'.
16 | train['gender'] = train['gender'].apply(lambda x : int(x))
17 | test['gender'] = test['gender'].apply(lambda x : int(x))
18 | 
19 | train = train[train.age != '\\N']
20 | # NOTE(review): same caveat as gender — test.age is converted without any filtering.
21 | train['age'] = train['age'].apply(lambda x : int(x))
22 | test['age'] = test['age'].apply(lambda x : int(x))
23 | 
24 | train = train[train['2_total_fee'] != '\\N']
25 | # For the fee columns, '\N' in test is replaced by 0.0 instead of dropping the row.
26 | test.loc[test['2_total_fee'] == '\\N','2_total_fee'] = 0.0
27 | train['2_total_fee'] = train['2_total_fee'].apply(lambda x : float(x))
28 | test['2_total_fee'] = test['2_total_fee'].apply(lambda x : float(x))
29 | 
30 | train = train[train['3_total_fee'] != '\\N']
31 | # Same treatment for '3_total_fee': '\N' in test becomes 0.0.
32 | test.loc[test['3_total_fee'] == '\\N','3_total_fee'] = 0.0
33 | train['3_total_fee'] = train['3_total_fee'].apply(lambda x : float(x))
34 | test['3_total_fee'] = test['3_total_fee'].apply(lambda x : float(x))
35 | 
36 | # Remove the target column from train and encode its classes as integer ids.
37 | label = train.pop('current_service')
38 | le = LabelEncoder()
39 | label = le.fit_transform(label)
40 | 
41 | # Model inputs: every remaining training column except 'user_id'.
42 | feature = [value for value in train.columns.values if
43 |                    value not in ['user_id']]
44 | 
45 | 
46 | 
47 | #xgb模型
48 | def XGB():
49 |     clf = xgb.XGBClassifier(max_depth=12, learning_rate=0.05,
50 |                             n_estimators=752, silent=True,
51 |                             objective="multi:softmax",
52 |                             nthread=4, gamma=0,
53 |                             max_delta_step=0, subsample=1, colsample_bytree=0.9, colsample_bylevel=0.9,
54 |                             reg_alpha=1, reg_lambda=1, scale_pos_weight=1,
55 |                             base_score=0.5, seed=2018, missing=None,num_class=15)
56 |     return clf
57 | 
58 | 
59 | 
60 | 
61 | online = False
62 | # online = True
63 | if online:
64 |         print ('online')
65 | 
66 |         model = XGB()
67 |         model.fit(train[feature], label, eval_set=[(train[feature], label)], verbose=1,)
68 |         pred = model.predict(test[feature])
69 |         pred = le.inverse_transform(pred)
70 |         test['predict'] = pred
71 | 
72 |         test[['user_id', 'predict']].to_csv('./sub.csv', index=False)
73 | else:
74 |         print ('offline')
75 |         train_x,test_x,train_y,test_y = train_test_split(train[feature],label,test_size=0.1,shuffle=True,random_state=2018)
76 |         model = XGB()
77 |         model.fit(train_x[feature], train_y, eval_set=[(test_x[feature], test_y)], verbose=1,early_stopping_rounds=100)
78 |         pred = model.predict(test_x)
79 |         print(f1_score(test_y,pred,average='weighted'))
80 | 
81 |         # feature_list = model.feature_importances_
82 |         # pd.DataFrame(
83 |         #         {
84 |         #                 'feature':feature,
85 |         #                 'score':feature_list,
86 |         #         }
87 |         # ).to_csv('./feature_importance.csv',index=False)
88 | 
89 |         # from sklearn.externals import joblib
90 |         # joblib.dump(model, 'gbm.pkl')
91 | 


--------------------------------------------------------------------------------