├── README.md └── src ├── main.py ├── dealdata.py ├── model.py └── feature.py /README.md: -------------------------------------------------------------------------------- 1 | # DCIC-Group-Image-of-Consumers-----Intelligent-Scoring-of-Credits 2 | 比赛:消费者人群画像—信用智能评分 3 | 主办方:福建省数字福建建设领导小组办公室 & 福建省工业和信息化厅 & 福州市人民政府 & 中国电子信息产业发展研究院 & 数字中国研究院 & 中国互联网投资基金 4 | 5 | 比赛链接:https://www.datafountain.cn/competitions/337/details 6 | 7 | 参赛人:Chain、我爱写代码、我真不会造轮子、憨子哥、Iron_man 8 | 9 | 这次比赛我们队取得了线上A榜和B榜的第三名,非常感谢我队友的辛勤付出。在答辩环节由于缺乏经验和准备,很遗憾我们未能进入最终的总决赛,但我们对能取得这样的成绩已经很满意了(我们五个做比赛都不多),希望能吸取这次的经验教训,继续学习,在以后的比赛中能有更好的表现。 10 | 11 | 在这里也要特别感谢那些大佬们的开源,正是他们的开源让我们学习到了很多,并且慢慢进步。在我们的代码中就有一些特征用到了那些大佬开源使用的方法,真的非常感谢。 12 | 13 | 比如郭大、林有夕、小兔子乖乖他们开源的这个:https://github.com/PandasCute/2018-CCF-BDCI-China-Unicom-Research-Institute-top2 14 | 15 | 以及鱼佬在知乎上写的文章都让我们受益匪浅。https://www.zhihu.com/people/wang-he-13-93/posts 16 | 17 | 18 | 运行代码步骤及说明: 19 | 20 | 1.先创建一个result的文件夹,来存放训练结果。 21 | 22 | 2.src文件夹中的main.py运行模型,然后运行dealdata.py来进行模型融合。 23 | 24 | 注:把文件夹名src改为src2(忘改回来了) 25 | 26 | 27 | 28 | feature.py是我们构造的六套特征 29 | 30 | model.py是我们的六个模型 31 | 32 | 我们这个虽然构造了六套特征,但其实主要就那几个特征,构造六套特征的目的是使特征之间有差异性,使融合的效果更好。 33 | -------------------------------------------------------------------------------- /src/main.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from src2.feature import * 3 | from src2.model import * 4 | 5 | y_train = target 6 | def get_data(data,feature,shape): 7 | datacopy = data.copy() 8 | fea_data = feature(datacopy) 9 | cate_columns = [i for i in fea_data.columns] 10 | cate_columns.remove('用户编码') 11 | cate_columns.remove('信用分') 12 | train_data = fea_data[:shape] 13 | test_data = fea_data[shape:] 14 | X_train = train_data[cate_columns].values 15 | X_test = test_data[cate_columns].values 16 | return X_train,X_test 17 | 18 | 19 | x_train_1,x_test_1 = get_data(data,feature1,50000) 20 | x_train_2,x_test_2 = get_data(data,feature2,50000) 21 | x_train_3,x_test_3 = get_data(data,feature3,50000) 22 | x_train_4,x_test_4 = get_data(data,feature4,50000) 23 | x_train_5,x_test_5 = get_data(data,feature5,50000) 24 | x_train_6,x_test_6 = get_data(data,feature6,50000) 25 | 26 | 27 | lgb1_model(1,x_train_1,y_train,x_test_1,'1') 28 | lgb2_model(1,x_train_2,y_train,x_test_2,'2') 29 | xgb_model(1,x_train_3,y_train,x_test_3,'3') 30 | cat_model(1,x_train_4,y_train,x_test_4,'4') 31 | lgb3_model(1,x_train_5,y_train,x_test_5,'5') 32 | lgb4_model(1,x_train_6,y_train,x_test_6,'6') 33 | 34 | f.close() 35 | 36 | -------------------------------------------------------------------------------- /src/dealdata.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.metrics import mean_absolute_error 3 | import numpy as np 4 | 5 | 6 | 7 | train = pd.read_csv('../data/train_dataset.csv') 8 | test = pd.read_csv('../data/test_dataset.csv') 9 | target = train['信用分'] 10 | 11 | data1 = pd.read_csv('../result/lgb1_model_1.csv') 12 | data2 = pd.read_csv('../result/lgb2_model_2.csv') 13 | data3 = pd.read_csv('../result/xgb_model_3.csv') 14 | data4 = pd.read_csv('../result/cat_model_4.csv') 15 | data5 = pd.read_csv('../result/lgb3_model_5.csv') 16 | data6 = pd.read_csv('../result/lgb4_model_6.csv') 17 | 18 | 19 | predictions_blends2 = 0.17 * (data1['score2']) + 0.17 * data4['score2'] + 0.17 * data3['score2'] \ 20 | + 0.17 * (data6['score2']+1) + 0.15 * (data5['score2']+1) + 0.17 * (data2['score2']+1) 21 | predictions_blends2 = predictions_blends2.apply(lambda x:int(x + 0.5)) 22 | 23 | 24 | MAE = mean_absolute_error(train['信用分'],predictions_blends2) 25 | score = 1/(1+MAE) 26 | print('线下测评分数: best score is %8.8f' % score) 27 | 28 | 29 | predictions_blends = 0.17 * (data1['score']) + 0.17 * data4['score'] + 0.17 * data3['score'] \ 30 | + 0.17 * (data6['score']+1) + 0.15 * (data5['score']+1) + 0.17 * (data2['score']+1) 31 | 32 | test_data_sub = test[['用户编码']] 33 | test_data_sub['score'] = predictions_blends 34 | test_data_sub['score'] = test_data_sub['score'].apply(lambda x:int(x+0.5)) 35 | test_data_sub.columns = ['id','score'] 36 | test_data_sub.to_csv('ronghe.csv', index=False) -------------------------------------------------------------------------------- /src/model.py: -------------------------------------------------------------------------------- 1 | import lightgbm as lgb 2 | from sklearn.model_selection import KFold, RepeatedKFold 3 | from sklearn.metrics import mean_absolute_error 4 | import numpy as np 5 | import pandas as pd 6 | import xgboost as xgb 7 | import catboost as cat 8 | f = open('../result/result.txt','w') 9 | 10 | 11 | 12 | 13 | def lgb1_model(num_model_seed,x_train,y_train,x_test,name): 14 | predictions_lgb = np.zeros(len(x_test)) 15 | oof = np.zeros(len(x_train)) 16 | seeds = [2019, 2019 * 2 + 1024, 4096, 2048, 1024] 17 | for model_seed in range(num_model_seed): 18 | param = {'num_leaves': 31, 19 | 'min_data_in_leaf': 20, 20 | 'objective': 'regression_l1', 21 | 'max_depth': 5, 22 | 'learning_rate': 0.0081, 23 | "min_child_samples": 30, 24 | "boosting": "gbdt", 25 | "feature_fraction": 0.7, 26 | "bagging_freq": 1, 27 | "bagging_fraction": 0.8, 28 | "bagging_seed": 11, 29 | "metric": 'mae', 30 | "lambda_l1": 0.60, 31 | "verbosity": -1} 32 | folds = KFold(n_splits=6, shuffle=True, random_state=seeds[0]) 33 | oof_lgb = np.zeros(len(x_train)) 34 | 35 | for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train, y_train)): 36 | print(len(trn_idx)) 37 | print("fold n°{}".format(fold_ + 1)) 38 | trn_data = lgb.Dataset(x_train[trn_idx], y_train[trn_idx]) 39 | val_data = lgb.Dataset(x_train[val_idx], y_train[val_idx]) 40 | num_round = 10000 41 | clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data], verbose_eval=1000, 42 | early_stopping_rounds=500) 43 | oof_lgb[val_idx] = clf.predict(x_train[val_idx], num_iteration=clf.best_iteration) 44 | predictions_lgb += clf.predict(x_test, num_iteration=clf.best_iteration) / folds.n_splits / num_model_seed 45 | oof += oof_lgb / num_model_seed 46 | MAE = mean_absolute_error(oof_lgb, y_train) 47 | score = 1 / (1 + MAE) 48 | print("CV score: {:<8.8f}".format(MAE)) 49 | print("score: {:<8.8f}".format(score)) 50 | MAE = mean_absolute_error(oof, y_train) 51 | score = 1 / (1 + MAE) 52 | print("CV score: {:<8.8f}".format(MAE)) 53 | print("score: {:<8.8f}".format(score)) 54 | sub_df = pd.read_csv('../data/submit_example.csv') 55 | sub_df['score'] = predictions_lgb 56 | sub_df['score2'] = oof_lgb 57 | sub_df.to_csv('../result/lgb1_model_{}.csv'.format(name), index=0, header=1, sep=',') 58 | f.write("lgb1_model_{}---score: {:<8.8f}".format(name,score)) 59 | f.write('\n') 60 | 61 | def lgb2_model(num_model_seed,x_train,y_train,x_test,name): 62 | predictions_lgb = np.zeros(len(x_test)) 63 | oof = np.zeros(len(x_train)) 64 | seeds = [2019, 2019 * 2 + 1024, 4096, 2048, 1024] 65 | for model_seed in range(num_model_seed): 66 | param = {'num_leaves': 31, 67 | 'min_data_in_leaf': 20, 68 | 'objective': 'regression_l2', 69 | 'max_depth': 5, 70 | 'learning_rate': 0.0081, 71 | "min_child_samples": 30, 72 | "boosting": "gbdt", 73 | "feature_fraction": 0.7, 74 | "bagging_freq": 1, 75 | "bagging_fraction": 0.8, 76 | "bagging_seed": 11, 77 | "metric": 'mae', 78 | "lambda_l1": 0.60, 79 | "verbosity": -1} 80 | folds = KFold(n_splits=6, shuffle=True, random_state=seeds[0]) # 81 | oof_lgb = np.zeros(len(x_train)) 82 | for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train, y_train)): 83 | print(len(trn_idx)) 84 | print("fold n°{}".format(fold_ + 1)) 85 | trn_data = lgb.Dataset(x_train[trn_idx], y_train[trn_idx]) 86 | val_data = lgb.Dataset(x_train[val_idx], y_train[val_idx]) 87 | num_round = 10000 88 | clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data], verbose_eval=1000, 89 | early_stopping_rounds=500) 90 | oof_lgb[val_idx] = clf.predict(x_train[val_idx], num_iteration=clf.best_iteration) 91 | predictions_lgb += clf.predict(x_test, num_iteration=clf.best_iteration) / folds.n_splits / num_model_seed 92 | oof += oof_lgb / num_model_seed 93 | MAE = mean_absolute_error(oof_lgb, y_train) 94 | score = 1 / (1 + MAE) 95 | print("CV score: {:<8.8f}".format(MAE)) 96 | print("score: {:<8.8f}".format(score)) 97 | MAE = mean_absolute_error(oof, y_train) 98 | score = 1 / (1 + MAE) 99 | print("CV score: {:<8.8f}".format(MAE)) 100 | print("score: {:<8.8f}".format(score)) 101 | 102 | sub_df = pd.read_csv('../data/submit_example.csv') 103 | sub_df['score'] = predictions_lgb 104 | sub_df['score2'] = oof_lgb 105 | sub_df.to_csv('../result/lgb2_model_{}.csv'.format(name), index=0, header=1, sep=',') 106 | f.write("lgb2_model_{}---score: {:<8.8f}".format(name, score)) 107 | f.write('\n') 108 | 109 | def xgb_model(num_model_seed,x_train,y_train,x_test,name): 110 | predictions_xgb = np.zeros(len(x_test)) 111 | oof_xgb = np.zeros(len(x_train)) 112 | seeds = [2019, 4096, 2019 * 2 + 1024, 2048, 1024] 113 | for seed in range(num_model_seed): 114 | xgb_params = {'eta': 0.004, 'max_depth': 6, 'subsample': 0.5, 'colsample_bytree': 0.5, 'alpha': 0.2, 115 | 'objective': 'reg:gamma', 'eval_metric': 'mae', 'silent': True, 'nthread': -1 116 | } 117 | folds = KFold(n_splits=5, shuffle=True, random_state=seeds[seed]) 118 | oof = np.zeros(len(x_train)) 119 | for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train, y_train)): 120 | print("fold n°{}".format(fold_ + 1)) 121 | trn_data = xgb.DMatrix(x_train[trn_idx], y_train[trn_idx]) 122 | val_data = xgb.DMatrix(x_train[val_idx], y_train[val_idx]) 123 | 124 | watchlist = [(trn_data, 'train'), (val_data, 'valid_data')] 125 | clf = xgb.train(dtrain=trn_data, num_boost_round=10000, evals=watchlist, early_stopping_rounds=200, 126 | verbose_eval=1000, params=xgb_params) 127 | oof[val_idx] = clf.predict(xgb.DMatrix(x_train[val_idx]), ntree_limit=clf.best_ntree_limit) 128 | predictions_xgb += clf.predict(xgb.DMatrix(x_test), 129 | ntree_limit=clf.best_ntree_limit) / folds.n_splits / num_model_seed 130 | oof_xgb += oof / num_model_seed 131 | MAE = mean_absolute_error(y_train, oof) 132 | score = 1 / (1 + MAE) 133 | print("CV score: {:<8.8f}".format(MAE)) 134 | print("score: {:<8.8f}".format(score)) 135 | 136 | MAE = mean_absolute_error(oof_xgb, y_train) 137 | score = 1 / (1 + MAE) 138 | print("CV score: {:<8.8f}".format(MAE)) 139 | print("score: {:<8.8f}".format(score)) 140 | 141 | sub_df = pd.read_csv('../data/submit_example.csv') 142 | sub_df['score'] = predictions_xgb 143 | sub_df['score2'] = oof_xgb 144 | sub_df.to_csv('../result/xgb_model_{}.csv'.format(name), index=0, header=1, sep=',') 145 | f.write("xgb_model_{}---score: {:<8.8f}".format(name, score)) 146 | f.write('\n') 147 | 148 | def cat_model(num_model_seed,x_train,y_train,x_test,name): 149 | cat_params = {'depth': 7, 'learning_rate': 0.8, 'l2_leaf_reg': 2, 'num_boost_round': 10000, 'random_seed': 94, 150 | 'loss_function': 'MAE'} 151 | 152 | seeds = [2019, 2019 * 2 + 1024, 4096, 2048, 1024] 153 | oof_cat = np.zeros(len(x_train)) 154 | predictions_cat = np.zeros(len(x_test)) 155 | for seed in range(num_model_seed): 156 | folds = KFold(n_splits=5, shuffle=True, random_state=seeds[seed]) 157 | oof = np.zeros(len(x_train)) 158 | for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train, y_train)): 159 | print("fold n°{}".format(fold_ + 1)) 160 | clf = cat.CatBoostRegressor(**cat_params) 161 | clf.fit(x_train[trn_idx], y_train[trn_idx], early_stopping_rounds=200, verbose_eval=3000, 162 | use_best_model=True, 163 | eval_set=(x_train[val_idx], y_train[val_idx])) 164 | oof[val_idx] = clf.predict(x_train[val_idx]) 165 | predictions_cat += clf.predict(x_test) / folds.n_splits / num_model_seed 166 | oof_cat += oof / num_model_seed 167 | MAE = mean_absolute_error(oof, y_train) 168 | score = 1 / (1 + MAE) 169 | print("CV score: {:<8.8f}".format(MAE)) 170 | print("score: {:<8.8f}".format(score)) 171 | 172 | MAE = mean_absolute_error(oof_cat, y_train) 173 | score = 1 / (1 + MAE) 174 | print("CV score: {:<8.8f}".format(MAE)) 175 | print("score: {:<8.8f}".format(score)) 176 | 177 | sub_df = pd.read_csv('../data/submit_example.csv') 178 | sub_df['score'] = predictions_cat 179 | sub_df['score2'] = oof_cat 180 | sub_df.to_csv('../result/cat_model_{}.csv'.format(name), index=0, header=1, sep=',') 181 | f.write("cat_model_{}---score: {:<8.8f}".format(name, score)) 182 | f.write('\n') 183 | 184 | 185 | def lgb3_model(num_model_seed,x_train,y_train,x_test,name): 186 | predictions_lgb = np.zeros(len(x_test)) 187 | oof = np.zeros(len(x_train)) 188 | seeds = [2019, 2019 * 2 + 1024, 4096, 2048, 1024] 189 | for model_seed in range(num_model_seed): 190 | param = {'num_leaves': 31, 191 | 'min_data_in_leaf': 20, 192 | 'objective': 'regression_l1', 193 | 'max_depth': 5, 194 | 'learning_rate': 0.01, 195 | "min_child_samples": 30, 196 | "boosting": "gbdt", 197 | "feature_fraction": 0.45, 198 | "bagging_freq": 1, 199 | "bagging_fraction": 0.8, 200 | "bagging_seed": 11, 201 | "metric": 'mae', 202 | "lambda_l1": 0.60, 203 | "verbosity": -1} 204 | folds = KFold(n_splits=6, shuffle=True, random_state=seeds[0]) # 205 | oof_lgb = np.zeros(len(x_train)) 206 | 207 | for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train, y_train)): 208 | print(len(trn_idx)) 209 | print("fold n°{}".format(fold_ + 1)) 210 | trn_data = lgb.Dataset(x_train[trn_idx], y_train[trn_idx]) 211 | val_data = lgb.Dataset(x_train[val_idx], y_train[val_idx]) 212 | num_round = 10000 213 | clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data], verbose_eval=1000, 214 | early_stopping_rounds=500) 215 | oof_lgb[val_idx] = clf.predict(x_train[val_idx], num_iteration=clf.best_iteration) 216 | predictions_lgb += clf.predict(x_test, num_iteration=clf.best_iteration) / folds.n_splits / num_model_seed 217 | 218 | oof += oof_lgb / num_model_seed 219 | MAE = mean_absolute_error(oof_lgb, y_train) 220 | score = 1 / (1 + MAE) 221 | print("CV score: {:<8.8f}".format(MAE)) 222 | print("score: {:<8.8f}".format(score)) 223 | MAE = mean_absolute_error(oof, y_train) 224 | score = 1 / (1 + MAE) 225 | print("CV score: {:<8.8f}".format(MAE)) 226 | print("score: {:<8.8f}".format(score)) 227 | 228 | sub_df = pd.read_csv('../data/submit_example.csv') 229 | sub_df['score'] = predictions_lgb 230 | sub_df['score2'] = oof_lgb 231 | sub_df.to_csv('../result/lgb3_model_{}.csv'.format(name), index=False) 232 | f.write("lgb3_model_{}---score: {:<8.8f}".format(name, score)) 233 | f.write('\n') 234 | 235 | def lgb4_model(num_model_seed,x_train,y_train,x_test,name): 236 | predictions_lgb = np.zeros(len(x_test)) 237 | oof = np.zeros(len(x_train)) 238 | seeds = [2018, 2019 * 2 + 1024, 4096, 2048, 1024] 239 | for model_seed in range(num_model_seed): 240 | param = {'num_leaves': 31, 241 | 'min_data_in_leaf': 20, 242 | 'objective': 'regression_l2', 243 | 'max_depth': 5, 244 | 'learning_rate': 0.01, 245 | "min_child_samples": 30, 246 | "boosting": "gbdt", 247 | "feature_fraction": 0.45, 248 | "bagging_freq": 1, 249 | "bagging_fraction": 0.8, 250 | "bagging_seed": 11, 251 | "metric": 'mae', 252 | "lambda_l1": 0.60, 253 | "verbosity": -1} 254 | folds = KFold(n_splits=6, shuffle=True, random_state=seeds[0]) 255 | oof_lgb = np.zeros(len(x_train)) 256 | for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train, y_train)): 257 | print(len(trn_idx)) 258 | print("fold n°{}".format(fold_ + 1)) 259 | trn_data = lgb.Dataset(x_train[trn_idx], y_train[trn_idx]) 260 | val_data = lgb.Dataset(x_train[val_idx], y_train[val_idx]) 261 | num_round = 10000 262 | clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data], verbose_eval=1000, 263 | early_stopping_rounds=500) 264 | oof_lgb[val_idx] = clf.predict(x_train[val_idx], num_iteration=clf.best_iteration) 265 | predictions_lgb += clf.predict(x_test, num_iteration=clf.best_iteration) / folds.n_splits / num_model_seed 266 | 267 | oof += oof_lgb / num_model_seed 268 | MAE = mean_absolute_error(oof_lgb, y_train) 269 | score = 1 / (1 + MAE) 270 | print("CV score: {:<8.8f}".format(MAE)) 271 | print("score: {:<8.8f}".format(score)) 272 | MAE = mean_absolute_error(oof, y_train) 273 | score = 1 / (1 + MAE) 274 | print("CV score: {:<8.8f}".format(MAE)) 275 | print("score: {:<8.8f}".format(score)) 276 | 277 | sub_df = pd.read_csv('../data/submit_example.csv') 278 | sub_df['score'] = predictions_lgb 279 | sub_df['score2'] = oof_lgb 280 | sub_df.to_csv('../result/lgb4_model_{}.csv'.format(name), index=False) 281 | f.write("lgb4_model_{}---score: {:<8.8f}".format(name, score)) 282 | f.write('\n') -------------------------------------------------------------------------------- /src/feature.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import math 4 | from src2.model import * 5 | 6 | 7 | train = pd.read_csv('../data/train_dataset.csv') 8 | test = pd.read_csv('../data/test_dataset.csv') 9 | target = train['信用分'] 10 | data = pd.concat([train,test],axis=0,ignore_index=True) 11 | data = data.fillna(0) 12 | 13 | ####处理数据 14 | data.loc[data['用户年龄']==0, '用户年龄'] = None 15 | data.loc[data['用户话费敏感度']==0,'用户话费敏感度'] = data['用户话费敏感度'].median() 16 | 17 | 18 | def feature_count(data, features=[]): 19 | if len(set(features)) != len(features): 20 | print('equal feature !!!!') 21 | return data 22 | new_feature = 'count' 23 | for i in features: 24 | new_feature += '_' + i.replace('add_', '') 25 | temp = data.groupby(features).size().reset_index().rename(columns={0: new_feature}) 26 | data = data.merge(temp, 'left', on=features) 27 | return data 28 | 29 | def endwords(x): 30 | va = str(x).split(".") 31 | x = int(va[-1]) 32 | return x 33 | 34 | def trans_set(x, countdict): 35 | if x == 0: 36 | return 0 37 | elif countdict[x] > 1000: 38 | return 1 39 | else: 40 | return 2 41 | 42 | def feature1(data): 43 | data['charge_type'] = 0 44 | data['charge_type'][(data['缴费用户最近一次缴费金额(元)'] % 10 == 0) & \ 45 | data['缴费用户最近一次缴费金额(元)'] != 0] = 1 46 | data['buy_rate'] = data['当月物流快递类应用使用次数'] / (data['当月网购类应用使用次数'] + 1) 47 | data['sixfee_nowfee'] = data['用户近6个月平均消费值(元)'] - data['用户账单当月总费用(元)'] 48 | data['fivefee_nowfee'] = data['用户近6个月平均消费值(元)'] * 6 - data['用户账单当月总费用(元)'] 49 | data['month'] = data['用户网龄(月)'].apply(lambda x: x % 12) 50 | data['year'] = data['用户网龄(月)'].apply(lambda x: x / 12) 51 | data['word1'] = data['用户账单当月总费用(元)'].apply(lambda x: endwords(x)) 52 | 53 | features = ['缴费用户最近一次缴费金额(元)', '用户近6个月平均消费值(元)', '用户账单当月总费用(元)'] 54 | for feature in features: 55 | data = feature_count(data, [feature]) 56 | data['current_fee_stability'] = \ 57 | data['用户账单当月总费用(元)'] / (data['用户近6个月平均消费值(元)'] + 5) 58 | 59 | data['use_left_rate'] = data['用户账单当月总费用(元)'] / (data['用户当月账户余额(元)'] + 5) 60 | 61 | data['payment_rate'] = data['用户账单当月总费用(元)'] / (data['缴费用户最近一次缴费金额(元)'] + 5) 62 | 63 | data['balance_6month_rate'] = data['用户当月账户余额(元)'] / (data['用户近6个月平均消费值(元)'] + 5) 64 | 65 | data['payment_6month_rate'] = data['缴费用户最近一次缴费金额(元)'] / (data['用户近6个月平均消费值(元)'] + 5) 66 | 67 | data['balance_payment_rate'] = data['用户当月账户余额(元)'] / (data['缴费用户最近一次缴费金额(元)'] + 5) 68 | 69 | data['cosume_fee'] = data['用户账单当月总费用(元)'] - data['用户近6个月平均消费值(元)'] 70 | 71 | data['当月金融理财类应用使用总次数/all'] = \ 72 | data['当月金融理财类应用使用总次数'] / (data['当月网购类应用使用次数'] + data['当月物流快递类应用使用次数'] + \ 73 | data['当月旅游资讯类应用使用次数'] + data['当月飞机类应用使用次数'] + \ 74 | data['近三个月月均商场出现次数'] / 3 + data['当月火车类应用使用次数'] + \ 75 | data['当月视频播放类应用使用次数']) # 还ok点 76 | 77 | data['当月视频播放类应用使用次数/all'] = \ 78 | data['当月视频播放类应用使用次数'] / (data['当月网购类应用使用次数'] + data['当月物流快递类应用使用次数'] + \ 79 | data['当月旅游资讯类应用使用次数'] + data['当月飞机类应用使用次数'] + \ 80 | data['近三个月月均商场出现次数'] / 3 + data['当月火车类应用使用次数'] + \ 81 | data['当月金融理财类应用使用总次数']) 82 | 83 | data['当月网购类应用使用次数/all'] = \ 84 | data['当月网购类应用使用次数'] / (data['当月视频播放类应用使用次数'] + data['当月物流快递类应用使用次数'] + \ 85 | data['当月旅游资讯类应用使用次数'] + data['当月飞机类应用使用次数'] + \ 86 | data['近三个月月均商场出现次数'] / 3 + data['当月火车类应用使用次数'] + \ 87 | data['当月金融理财类应用使用总次数']) 88 | 89 | data['当月飞机类应用使用次数'] = \ 90 | data['当月飞机类应用使用次数'] / (data['当月视频播放类应用使用次数'] + data['当月物流快递类应用使用次数'] + \ 91 | data['当月网购类应用使用次数'] + data['当月旅游资讯类应用使用次数'] + \ 92 | data['近三个月月均商场出现次数'] / 3 + data['当月火车类应用使用次数'] + \ 93 | data['当月金融理财类应用使用总次数']) 94 | 95 | data['total-six'] = data['用户账单当月总费用(元)'] / (data['用户近6个月平均消费值(元)']) 96 | data['total-curr'] = (data['用户账单当月总费用(元)'] - data['用户当月账户余额(元)']) 97 | data['total-curr'] = data.apply(lambda x: x['total-curr'] * x['用户话费敏感度'] if x['total-curr'] > 0 else 98 | x['total-curr'] * (6 - x['用户话费敏感度']), axis=1) 99 | data['cz_times'] = data['用户账单当月总费用(元)'] / (data['缴费用户最近一次缴费金额(元)'] + 1) 100 | data['last_five_fee'] = 6 * data['用户近6个月平均消费值(元)'] - data['用户账单当月总费用(元)'] 101 | data['now-five'] = data['用户账单当月总费用(元)'] - data['last_five_fee'] / 5 102 | data.drop(columns=['是否大学生客户','是否黑名单客户','当月是否到过福州山姆会员店','当月是否逛过福州仓山万达','是否经常逛商场的人'],inplace=True) 103 | return data 104 | 105 | def feature2(data): 106 | data['charge_type'] = 0 107 | data['charge_type'][(data['缴费用户最近一次缴费金额(元)'] % 10 == 0) & \ 108 | data['缴费用户最近一次缴费金额(元)'] != 0] = 1 109 | data['buy_rate'] = data['当月物流快递类应用使用次数'] / (data['当月网购类应用使用次数'] + 1) 110 | data['sixfee_nowfee'] = data['用户近6个月平均消费值(元)'] - data['用户账单当月总费用(元)'] 111 | data['fivefee_nowfee'] = data['用户近6个月平均消费值(元)'] * 6 - data['用户账单当月总费用(元)'] 112 | data['month'] = data['用户网龄(月)'].apply(lambda x: x % 12) 113 | data['year'] = data['用户网龄(月)'].apply(lambda x: x / 12) 114 | data['rate'] = data['缴费用户最近一次缴费金额(元)'].apply( 115 | lambda x: float('%.3f' % (x / math.ceil(x))) if int(x) != 0 else 0) 116 | features = ['缴费用户最近一次缴费金额(元)', '用户近6个月平均消费值(元)', '用户账单当月总费用(元)'] 117 | for feature in features: 118 | data = feature_count(data, [feature]) 119 | data['current_fee_stability'] = \ 120 | data['用户账单当月总费用(元)'] / (data['用户近6个月平均消费值(元)'] + 5) 121 | 122 | data['use_left_rate'] = data['用户账单当月总费用(元)'] / (data['用户当月账户余额(元)'] + 5) 123 | 124 | data['payment_rate'] = data['用户账单当月总费用(元)'] / (data['缴费用户最近一次缴费金额(元)'] + 5) 125 | 126 | data['balance_6month_rate'] = data['用户当月账户余额(元)'] / (data['用户近6个月平均消费值(元)'] + 5) 127 | 128 | data['payment_6month_rate'] = data['缴费用户最近一次缴费金额(元)'] / (data['用户近6个月平均消费值(元)'] + 5) 129 | 130 | data['balance_payment_rate'] = data['用户当月账户余额(元)'] / (data['缴费用户最近一次缴费金额(元)'] + 5) 131 | 132 | data['cosume_fee'] = data['用户账单当月总费用(元)'] - data['用户近6个月平均消费值(元)'] 133 | 134 | data['当月金融理财类应用使用总次数/all'] = \ 135 | data['当月金融理财类应用使用总次数'] / (data['当月网购类应用使用次数'] + data['当月物流快递类应用使用次数'] + \ 136 | data['当月旅游资讯类应用使用次数'] + data['当月飞机类应用使用次数'] + \ 137 | data['近三个月月均商场出现次数'] / 3 + data['当月火车类应用使用次数'] + \ 138 | data['当月视频播放类应用使用次数']) # 还ok点 139 | 140 | data['当月视频播放类应用使用次数/all'] = \ 141 | data['当月视频播放类应用使用次数'] / (data['当月网购类应用使用次数'] + data['当月物流快递类应用使用次数'] + \ 142 | data['当月旅游资讯类应用使用次数'] + data['当月飞机类应用使用次数'] + \ 143 | data['近三个月月均商场出现次数'] / 3 + data['当月火车类应用使用次数'] + \ 144 | data['当月金融理财类应用使用总次数']) 145 | 146 | data['当月网购类应用使用次数/all'] = \ 147 | data['当月网购类应用使用次数'] / (data['当月视频播放类应用使用次数'] + data['当月物流快递类应用使用次数'] + \ 148 | data['当月旅游资讯类应用使用次数'] + data['当月飞机类应用使用次数'] + \ 149 | data['近三个月月均商场出现次数'] / 3 + data['当月火车类应用使用次数'] + \ 150 | data['当月金融理财类应用使用总次数']) 151 | 152 | data['当月飞机类应用使用次数'] = \ 153 | data['当月飞机类应用使用次数'] / (data['当月视频播放类应用使用次数'] + data['当月物流快递类应用使用次数'] + \ 154 | data['当月网购类应用使用次数'] + data['当月旅游资讯类应用使用次数'] + \ 155 | data['近三个月月均商场出现次数'] / 3 + data['当月火车类应用使用次数'] + \ 156 | data['当月金融理财类应用使用总次数']) 157 | 158 | data['total-six'] = data['用户账单当月总费用(元)'] / (data['用户近6个月平均消费值(元)']) 159 | data['total-curr'] = (data['用户账单当月总费用(元)'] - data['用户当月账户余额(元)']) 160 | data['total-curr'] = data.apply(lambda x: x['total-curr'] * x['用户话费敏感度'] if x['total-curr'] > 0 else 161 | x['total-curr'] * (6 - x['用户话费敏感度']), axis=1) 162 | data['cz_times'] = data['用户账单当月总费用(元)'] / (data['缴费用户最近一次缴费金额(元)'] + 1) 163 | data['last_five_fee'] = 6 * data['用户近6个月平均消费值(元)'] - data['用户账单当月总费用(元)'] 164 | data['now-five'] = data['用户账单当月总费用(元)'] - data['last_five_fee'] / 5 165 | data.drop(columns=['是否大学生客户', '是否黑名单客户', '当月是否到过福州山姆会员店', '当月是否逛过福州仓山万达', '是否经常逛商场的人'], inplace=True) 166 | return data 167 | 168 | def feature3(data): 169 | data['充值途径'] = 0 170 | data['充值途径'][(data['缴费用户最近一次缴费金额(元)'] % 10 == 0) & \ 171 | data['缴费用户最近一次缴费金额(元)'] != 0] = 1 172 | data['buy_rate'] = data['当月物流快递类应用使用次数'] / (data['当月网购类应用使用次数'] + 1) 173 | data['sixfee_nowfee'] = data['用户近6个月平均消费值(元)'] - data['用户账单当月总费用(元)'] 174 | data['fivefee_nowfee'] = data['用户近6个月平均消费值(元)'] * 6 - data['用户账单当月总费用(元)'] 175 | data['month'] = data['用户网龄(月)'].apply(lambda x: x % 12) 176 | data['year'] = data['用户网龄(月)'].apply(lambda x: x / 12) 177 | data = feature_count(data, ['缴费用户最近一次缴费金额(元)']) 178 | data = feature_count(data, ['用户近6个月平均消费值(元)']) 179 | data = feature_count(data, ['用户账单当月总费用(元)']) 180 | data.drop(columns=['是否大学生客户', '是否黑名单客户', '是否经常逛商场的人'], inplace=True) 181 | return data 182 | 183 | def feature4(data): 184 | data['充值途径'] = 0 185 | data['充值途径'][(data['缴费用户最近一次缴费金额(元)'] % 10 == 0) & \ 186 | data['缴费用户最近一次缴费金额(元)'] != 0] = 1 187 | data['buy_rate'] = data['当月物流快递类应用使用次数'] / (data['当月网购类应用使用次数'] + 1) 188 | data['sixfee_nowfee'] = data['用户近6个月平均消费值(元)'] - data['用户账单当月总费用(元)'] 189 | data['fivefee_nowfee'] = data['用户近6个月平均消费值(元)'] * 6 - data['用户账单当月总费用(元)'] 190 | data['month'] = data['用户网龄(月)'].apply(lambda x: x % 12) 191 | data['year'] = data['用户网龄(月)'].apply(lambda x: x / 12) 192 | data['word1'] = data['用户账单当月总费用(元)'].apply(lambda x: endwords(x)) 193 | features = ['缴费用户最近一次缴费金额(元)', '用户近6个月平均消费值(元)', '用户账单当月总费用(元)'] 194 | for feature in features: 195 | data = feature_count(data, [feature]) 196 | 197 | data['current_fee_stability'] = \ 198 | data['用户账单当月总费用(元)'] / (data['用户近6个月平均消费值(元)'] + 5) 199 | 200 | data['use_left_rate'] = data['用户账单当月总费用(元)'] / (data['用户当月账户余额(元)'] + 5) 201 | 202 | data['payment_rate'] = data['用户账单当月总费用(元)'] / (data['缴费用户最近一次缴费金额(元)'] + 5) 203 | 204 | data['balance_6month_rate'] = data['用户当月账户余额(元)'] / (data['用户近6个月平均消费值(元)'] + 5) 205 | 206 | data['payment_6month_rate'] = data['缴费用户最近一次缴费金额(元)'] / (data['用户近6个月平均消费值(元)'] + 5) 207 | 208 | data['balance_payment_rate'] = data['用户当月账户余额(元)'] / (data['缴费用户最近一次缴费金额(元)'] + 5) 209 | 210 | data['当月金融理财类应用使用总次数/all'] = \ 211 | data['当月金融理财类应用使用总次数'] / (data['当月网购类应用使用次数'] + data['当月物流快递类应用使用次数'] + \ 212 | data['当月旅游资讯类应用使用次数'] + data['当月飞机类应用使用次数'] + \ 213 | data['近三个月月均商场出现次数'] / 3 + data['当月火车类应用使用次数'] + \ 214 | data['当月视频播放类应用使用次数']) # 还ok点 215 | 216 | data['当月视频播放类应用使用次数/all'] = \ 217 | data['当月视频播放类应用使用次数'] / (data['当月网购类应用使用次数'] + data['当月物流快递类应用使用次数'] + \ 218 | data['当月旅游资讯类应用使用次数'] + data['当月飞机类应用使用次数'] + \ 219 | data['近三个月月均商场出现次数'] / 3 + data['当月火车类应用使用次数'] + \ 220 | data['当月金融理财类应用使用总次数']) 221 | 222 | data['当月网购类应用使用次数/all'] = \ 223 | data['当月网购类应用使用次数'] / (data['当月视频播放类应用使用次数'] + data['当月物流快递类应用使用次数'] + \ 224 | data['当月旅游资讯类应用使用次数'] + data['当月飞机类应用使用次数'] + \ 225 | data['近三个月月均商场出现次数'] / 3 + data['当月火车类应用使用次数'] + \ 226 | data['当月金融理财类应用使用总次数']) 227 | 228 | data['当月飞机类应用使用次数'] = \ 229 | data['当月飞机类应用使用次数'] / (data['当月视频播放类应用使用次数'] + data['当月物流快递类应用使用次数'] + \ 230 | data['当月网购类应用使用次数'] + data['当月旅游资讯类应用使用次数'] + \ 231 | data['近三个月月均商场出现次数'] / 3 + data['当月火车类应用使用次数'] + \ 232 | data['当月金融理财类应用使用总次数']) 233 | data.drop(columns=['是否大学生客户', '是否黑名单客户', '当月是否到过福州山姆会员店', '当月是否逛过福州仓山万达', '是否经常逛商场的人'], inplace=True) 234 | return data 235 | 236 | def feature5(data): 237 | data['充值途径'] = 0 238 | data['充值途径'][(data['缴费用户最近一次缴费金额(元)'] % 10 == 0) & \ 239 | data['缴费用户最近一次缴费金额(元)'] != 0] = 1 240 | data['buy_rate'] = data['当月物流快递类应用使用次数'] / (data['当月网购类应用使用次数'] + 1) 241 | data['sixfee_nowfee'] = data.apply(lambda x: x['用户近6个月平均消费值(元)'] - x['用户账单当月总费用(元)'], axis=1) 242 | data['cz_times'] = (data['用户账单当月总费用(元)']) / (data['缴费用户最近一次缴费金额(元)'] + 1) 243 | data['fivefee_nowfee'] = data['用户近6个月平均消费值(元)'] * 6 - data['用户账单当月总费用(元)'] 244 | data['month'] = data['用户网龄(月)'].apply(lambda x: x % 12) 245 | data['year'] = data['用户网龄(月)'].apply(lambda x: x / 12) 246 | 247 | data['word1'] = data['用户账单当月总费用(元)'].apply(lambda x: endwords(x)) 248 | data['word2'] = data['缴费用户最近一次缴费金额(元)'].apply(lambda x: endwords(x)) 249 | data['word3'] = data['用户近6个月平均消费值(元)'].apply(lambda x: endwords(x)) 250 | features = ['缴费用户最近一次缴费金额(元)', '用户近6个月平均消费值(元)', '用户账单当月总费用(元)'] 251 | data = feature_count(data, features) 252 | countdict = dict(data['用户账单当月总费用(元)'].value_counts()) 253 | data['set_fee'] = data['用户账单当月总费用(元)'].apply(lambda x: trans_set(x, countdict)) 254 | data['shopping_level'] = data['当月是否逛过福州仓山万达'] + data['当月是否到过福州山姆会员店'] + data['当月是否看电影'] + data['当月是否景点游览'] + data[ 255 | '当月是否体育场馆消费'] 256 | data['当月网购类应用使用次数/all'] = \ 257 | data['当月网购类应用使用次数'] / (data['当月视频播放类应用使用次数'] + data['当月物流快递类应用使用次数'] + \ 258 | data['当月旅游资讯类应用使用次数'] + data['当月飞机类应用使用次数'] + data['当月火车类应用使用次数'] + \ 259 | data['当月金融理财类应用使用总次数']) 260 | 261 | data['当月交通类应用使用次数/all'] = \ 262 | (data['当月飞机类应用使用次数'] + data['当月火车类应用使用次数']) / (data['当月视频播放类应用使用次数'] + data['当月物流快递类应用使用次数'] + \ 263 | data['当月网购类应用使用次数'] + data['当月旅游资讯类应用使用次数'] + 264 | data['当月金融理财类应用使用总次数']) 265 | data.drop(columns=['是否大学生客户', '是否黑名单客户', '当月是否到过福州山姆会员店', '当月是否逛过福州仓山万达', '是否经常逛商场的人'], inplace=True) 266 | return data 267 | 268 | def feature6(data): 269 | data['充值途径'] = 0 270 | data['充值途径'][(data['缴费用户最近一次缴费金额(元)'] % 10 == 0) & \ 271 | data['缴费用户最近一次缴费金额(元)'] != 0] = 1 272 | data['buy_rate'] = data['当月物流快递类应用使用次数'] / (data['当月网购类应用使用次数'] + 1) 273 | data['sixfee_nowfee'] = data.apply(lambda x: x['用户近6个月平均消费值(元)'] - x['用户账单当月总费用(元)'], axis=1) 274 | data['cz_times'] = (data['用户账单当月总费用(元)']) / (data['缴费用户最近一次缴费金额(元)'] + 1) 275 | data['fivefee_nowfee'] = data['用户近6个月平均消费值(元)'] * 6 - data['用户账单当月总费用(元)'] 276 | data['month'] = data['用户网龄(月)'].apply(lambda x: x % 12) 277 | data['year'] = data['用户网龄(月)'].apply(lambda x: x / 12) 278 | data['word1'] = data['用户账单当月总费用(元)'].apply(lambda x: endwords(x)) 279 | data['word2'] = data['缴费用户最近一次缴费金额(元)'].apply(lambda x: endwords(x)) 280 | data['word3'] = data['用户近6个月平均消费值(元)'].apply(lambda x: endwords(x)) 281 | features = ['缴费用户最近一次缴费金额(元)', '用户近6个月平均消费值(元)', '用户账单当月总费用(元)'] 282 | data = feature_count(data, features) 283 | countdict = dict(data['用户账单当月总费用(元)'].value_counts()) 284 | data['set_fee'] = data['用户账单当月总费用(元)'].apply(lambda x: trans_set(x, countdict)) 285 | data['shopping_level'] = data['当月是否逛过福州仓山万达'] + data['当月是否到过福州山姆会员店'] + data['当月是否看电影'] + data['当月是否景点游览'] + data[ 286 | '当月是否体育场馆消费'] 287 | data['当月网购类应用使用次数/all'] = \ 288 | data['当月网购类应用使用次数'] / (data['当月视频播放类应用使用次数'] + data['当月物流快递类应用使用次数'] + \ 289 | data['当月旅游资讯类应用使用次数'] + data['当月飞机类应用使用次数'] + data['当月火车类应用使用次数'] + \ 290 | data['当月金融理财类应用使用总次数']) 291 | data['当月交通类应用使用次数/all'] = \ 292 | (data['当月飞机类应用使用次数'] + data['当月火车类应用使用次数']) / (data['当月视频播放类应用使用次数'] + data['当月物流快递类应用使用次数'] + \ 293 | data['当月网购类应用使用次数'] + data['当月旅游资讯类应用使用次数'] + 294 | data['当月金融理财类应用使用总次数']) 295 | data.drop(columns=['是否大学生客户', '是否黑名单客户', '当月是否到过福州山姆会员店', '当月是否逛过福州仓山万达', '是否经常逛商场的人'], inplace=True) 296 | return data 297 | --------------------------------------------------------------------------------