├── README.md
├── run_B.py
└── run_A.py

/README.md:
--------------------------------------------------------------------------------
# China Merchants Bank 2022 FinTech Elite Training Camp

* Leaderboard A: 9th place; Leaderboard B: 26th place

## Environment

numpy==1.19.5
pandas==1.1.5
scipy==1.5.4
scikit-learn==0.24.2
lightgbm==3.3.2
tqdm==4.62.3


## Leaderboard A approach

* Simple preprocessing of the raw features: subtract 2 across the board (on the hypothesis that adding 2 was the transform the organizers applied for anonymization)
* Feature crosses: +, -, ×, ÷, groupby combinations, and numeric-categorical interactions
* Feature selection by sweeping importance-ranked feature subsets; the models (each built on a different subset) whose offline score clears a threshold are blended
* Online and offline scores tracked each other closely on Leaderboard A, so the gains were stable
* The full feature set needs roughly 50 GB of RAM (run with caution on a laptop)


## Leaderboard B approach

* After studying the data and the scores other contestants shared, one idea dominated everything else: *find the toxic features*!
* Features whose distributions differ between the train and test sets get deleted
* First, adversarial validation surfaced the features whose AUC sits far above 0.5 (a large share of their names contain CUR; our guess is that the test data were drawn from a different year than the training data, hence the pronounced drift), so every feature containing CUR was dropped (a hedged sketch of this check appears inside run_B.py below)
* Then came superstition-driven feature deletion (the main pattern we observed was that the lower the offline score, the higher the online score; a strange phenomenon)
* That step relied on feature importance, iteratively deleting the most important features (the reverse of the Leaderboard A procedure, aiming for a lower offline and a higher online score)
* The remaining submissions went to trial-and-error probing (attempts were limited, and we did this part poorly)

--------------------------------------------------------------------------------
/run_B.py:
--------------------------------------------------------------------------------
import os
import gc
import warnings

import lightgbm as lgb
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder


warnings.filterwarnings('ignore')

##############
# Load the data
# Concatenate train and test
##############

if os.path.exists('data.pkl'):
    data = pd.read_pickle('data.pkl')
else:
    train = pd.read_excel('train.xlsx')
    test = pd.read_excel('test_B榜.xlsx')

    data = pd.concat([train, test]).reset_index(drop=True)
    data.to_pickle('data.pkl')  # cache so the next run takes the fast path above


##############
# Preprocessing
# Encoding, missing values, etc.
# Subtract 2 from numeric features to undo the anonymization offset
##############

ff = [i for i in data.columns if i not in ['LABEL', 'CUST_UID']]
cat_f = ['MON_12_CUST_CNT_PTY_ID', 'WTHR_OPN_ONL_ICO', 'LGP_HLD_CARD_LVL', 'NB_CTC_HLD_IDV_AIO_CARD_SITU']
num_f = []
for f in tqdm(ff):
    data[f] = data[f].fillna(-2)
    data[f] = data[f].astype('str')
    data[f] = data[f].apply(lambda x: x.replace('?', '-1'))  # '?' marks missing values in the raw files

    if f not in cat_f:
        data[f] = data[f].astype('float')
        data[f] = data[f].replace(-1, np.nan)
    else:
        if f == 'MON_12_CUST_CNT_PTY_ID':
            lb = LabelEncoder()
            data[f] = lb.fit_transform(data[f])
        else:
            grade_dict = {'A': 7, 'B': 6, 'C': 5, 'D': 4, 'E': 3, 'F': 2, 'G': 1}
            data[f] = data[f].map(grade_dict)
            data[f] = data[f].replace(-1, np.nan)

    data[f] -= 2
    if data[f].max() > 1000000:
        num_f.append(f)  # large-magnitude numeric features (not used further in this script)


##############
# Likely toxic features:
# everything containing CUR, plus time-related and a few hand-picked columns
##############

cur_f_list = [i for i in data.columns if 'CUR' in i] + ['OPN_TM', 'REG_DT', 'REG_CPT',
                                                        'COR_KEY_PROD_HLD_NBR',
                                                        'WTHR_OPN_ONL_ICO',
                                                        'NB_RCT_3_MON_LGN_TMS_AGV',
                                                        'AGN_AGR_LATEST_AGN_AMT',
                                                        ]
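##############
# Adversarial validation sketch.
# Not part of the original run: a minimal, hedged illustration of how the
# CUR features above can be flagged, per the README. Label train rows 0 and
# test rows 1, fit a tiny single-feature model, and read off the
# cross-validated AUC; anything far above 0.5 separates train from test and
# is a drift suspect. `adversarial_feature_auc` is an illustrative helper
# name, not from the original code.
##############

from sklearn.model_selection import cross_val_score

def adversarial_feature_auc(df, feature_cols):
    is_test = df['LABEL'].isna().astype(int)  # 1 = test row, 0 = train row
    aucs = {}
    for f in feature_cols:
        clf = lgb.LGBMClassifier(n_estimators=50, learning_rate=0.1)
        # one model per single feature; LightGBM handles the NaNs natively
        scores = cross_val_score(clf, df[[f]], is_test, cv=3, scoring='roc_auc')
        aucs[f] = scores.mean()
    return pd.Series(aucs).sort_values(ascending=False)

# Slow on the full frame, so left commented out:
# print(adversarial_feature_auc(data, ff).head(20))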
##############
# Split back into train and test
##############

train = data[~data['LABEL'].isna()].reset_index(drop=True)
test = data[data['LABEL'].isna()].reset_index(drop=True)

features = [i for i in train.columns if i not in ['LABEL', 'CUST_UID'] + cur_f_list]
y = train['LABEL']
print("Train rows:", len(train), "| Test rows:", len(test), "| Feature count:", len(features))

del data
gc.collect()


def train_model(X_train, X_test, features, y, seed=2021, save_model=False):
    """
    Train a 5-fold LightGBM model (random-forest boosting mode); return the
    feature importances, out-of-fold predictions, and mean test predictions.
    """
    feat_imp_df = pd.DataFrame({'feat': features, 'imp': 0})
    KF = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)
    score_list = []
    params = {
        'objective': 'binary',
        'boosting_type': 'rf',
        'metric': 'auc',
        'n_jobs': -1,
        'learning_rate': 0.05,
        'num_leaves': 2 ** 6,
        'max_depth': 8,
        'tree_learner': 'serial',
        'colsample_bytree': 0.8,
        'subsample_freq': 1,
        'subsample': 0.8,
        'num_boost_round': 5000,
        'max_bin': 50,
        'verbose': -1,
        'seed': seed,
        'bagging_seed': seed,
        'feature_fraction_seed': seed,
        'early_stopping_rounds': 100,
    }
    oof_lgb = np.zeros(len(X_train))
    predictions_lgb = np.zeros(len(X_test))

    for fold_, (trn_idx, val_idx) in enumerate(KF.split(X_train.values, y.values)):
        trn_data = lgb.Dataset(X_train.iloc[trn_idx][features], label=y.iloc[trn_idx])
        val_data = lgb.Dataset(X_train.iloc[val_idx][features], label=y.iloc[val_idx])
        # num_boost_round and early_stopping_rounds are taken from params;
        # the original also passed conflicting duplicates at the call site
        clf = lgb.train(
            params,
            trn_data,
            valid_sets=[trn_data, val_data],
            verbose_eval=100,
        )

        oof_lgb[val_idx] = clf.predict(X_train.iloc[val_idx][features], num_iteration=clf.best_iteration)
        predictions_lgb += clf.predict(X_test[features], num_iteration=clf.best_iteration) / 5
        feat_imp_df['imp'] += clf.feature_importance() / 5
        score_list.append(roc_auc_score(y.iloc[val_idx], oof_lgb[val_idx]))
        if save_model:
            clf.save_model(f'model_{fold_}.txt')

    print("AUC score: {}".format(roc_auc_score(y, oof_lgb)))
    print("F1 score: {}".format(f1_score(y, [1 if i >= 0.5 else 0 for i in oof_lgb])))
    print("Precision score: {}".format(precision_score(y, [1 if i >= 0.5 else 0 for i in oof_lgb])))
    print("Recall score: {}".format(recall_score(y, [1 if i >= 0.5 else 0 for i in oof_lgb])))
    print("AUC mean: {}".format(np.mean(score_list)))
    return feat_imp_df, oof_lgb, predictions_lgb


##############
# Train with multiple seeds
##############

seeds = [666, 888, 999]

pred = []
oof = []
for seed in seeds:
    feat_imp_df, oof_lgb, predictions_lgb = train_model(train, test, features, y, seed)
    pred.append(predictions_lgb)
    oof.append(oof_lgb)


##############
# Blend by mean
# Write the submission file
##############

test['label'] = np.mean(pred, axis=0)
test[['CUST_UID', 'label']].to_csv('b_sub_8524.txt', index=False, header=None, sep=' ')
print(test[['CUST_UID', 'label']].head())
print("Submission rows:", len(test))
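##############
# Optional diagnostic, not part of the original pipeline: the metric prints
# inside train_model use a fixed 0.5 cutoff. The submission above is raw
# scores, so nothing is thresholded there, but if a hard-label F1 ever
# mattered, the cutoff could be tuned on the out-of-fold predictions.
# `best_f1_threshold` is an illustrative helper name.
##############

def best_f1_threshold(y_true, oof_scores):
    # scan candidate cutoffs and keep the one that maximizes OOF F1
    thresholds = np.arange(0.05, 0.95, 0.01)
    f1s = [f1_score(y_true, oof_scores >= t) for t in thresholds]
    best = int(np.argmax(f1s))
    return thresholds[best], f1s[best]

# t, best = best_f1_threshold(y, np.mean(oof, axis=0))
# print(f'best OOF cutoff: {t:.2f}, F1: {best:.4f}')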
--------------------------------------------------------------------------------
/run_A.py:
--------------------------------------------------------------------------------
import os
import warnings

import lightgbm as lgb
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder


warnings.filterwarnings('ignore')

##############
# Load the data
# Concatenate train and test
##############

if os.path.exists('data.pkl'):
    data = pd.read_pickle('data.pkl')
else:
    train = pd.read_excel('train.xlsx')
    test = pd.read_excel('test_A榜.xlsx')

    data = pd.concat([train, test]).reset_index(drop=True)
    data.to_pickle('data.pkl')  # cache so the next run takes the fast path above


##############
# Preprocessing
# Encoding, missing values, etc.
# Subtract 2 from numeric features to undo the anonymization offset
##############

ff = [i for i in data.columns if i not in ['LABEL', 'CUST_UID']]
cat_f = ['MON_12_CUST_CNT_PTY_ID', 'WTHR_OPN_ONL_ICO', 'LGP_HLD_CARD_LVL', 'NB_CTC_HLD_IDV_AIO_CARD_SITU']
num_f = []
for f in tqdm(ff):
    data[f] = data[f].fillna(-2)
    data[f] = data[f].astype('str')
    data[f] = data[f].apply(lambda x: x.replace('?', '-1'))  # '?' marks missing values in the raw files

    if f not in cat_f:
        data[f] = data[f].astype('float')
        data[f] = data[f].replace(-1, np.nan)
    else:
        if f == 'MON_12_CUST_CNT_PTY_ID':
            lb = LabelEncoder()
            data[f] = lb.fit_transform(data[f])
        else:
            grade_dict = {'A': 7, 'B': 6, 'C': 5, 'D': 4, 'E': 3, 'F': 2, 'G': 1}
            data[f] = data[f].map(grade_dict)
            data[f] = data[f].replace(-1, np.nan)

    data[f] -= 2
    if data[f].max() > 1000000:
        num_f.append(f)  # large-magnitude numeric features, used for the log crosses below

##############
# Feature engineering
# 1. Deviation-from-group features
# 2. Numeric x categorical crosses
# 3. Pairwise +, -, *, / crosses
##############

for group in tqdm(cat_f):
    for feature in ff:
        if feature not in cat_f:
            # broadcast the group statistics back onto every row
            tmp = data.groupby(group)[feature].agg([sum, min, max, np.mean]).reset_index()
            tmp = pd.merge(data, tmp, on=group, how='left')
            data['{}-mean_gb_{}'.format(feature, group)] = data[feature] - tmp['mean']
            data['{}-min_gb_{}'.format(feature, group)] = data[feature] - tmp['min']
            data['{}-max_gb_{}'.format(feature, group)] = data[feature] - tmp['max']
            data['{}/sum_gb_{}'.format(feature, group)] = data[feature] / tmp['sum']

for i in tqdm(range(len(num_f))):
    for j in range(i + 1, len(num_f)):
        for cat in cat_f[1:]:
            # the original indexed ff here; the range(len(num_f)) bounds
            # indicate num_f was intended
            f1 = num_f[i]
            f2 = num_f[j]
            data[f'{f1}_{f2}_log_{cat}'] = (np.log1p(data[f1]) - np.log1p(data[f2])) * data[cat]
            data[f'{f1}+{f2}_log_{cat}'] = (np.log1p(data[f1]) + np.log1p(data[f2])) * data[cat]
            data[f'{f1}*{f2}_log_{cat}'] = (np.log1p(data[f1]) * np.log1p(data[f2])) * data[cat]
            data[f'{f1}/{f2}_log_{cat}'] = (np.log1p(data[f1]) / np.log1p(data[f2])) * data[cat]
            data[f'{f2}/{f1}_log_{cat}'] = (np.log1p(data[f2]) / np.log1p(data[f1])) * data[cat]

            data[f'{f1}_{f2}_log_{cat}_'] = (np.log1p(data[f1]) - np.log1p(data[f2])) / data[cat]
            data[f'{f1}+{f2}_log_{cat}_'] = (np.log1p(data[f1]) + np.log1p(data[f2])) / data[cat]
            data[f'{f1}*{f2}_log_{cat}_'] = (np.log1p(data[f1]) * np.log1p(data[f2])) / data[cat]
            data[f'{f1}/{f2}_log_{cat}_'] = (np.log1p(data[f1]) / np.log1p(data[f2])) / data[cat]
            data[f'{f2}/{f1}_log_{cat}_'] = (np.log1p(data[f2]) / np.log1p(data[f1])) / data[cat]


for i in tqdm(range(len(ff))):
    for j in range(i + 1, len(ff)):
        f1 = ff[i]
        f2 = ff[j]
        data[f'{f1}_{f2}'] = data[f1] - data[f2]
        data[f'{f1}+{f2}'] = data[f1] + data[f2]
        data[f'{f1}*{f2}'] = data[f1] * data[f2]
        data[f'{f1}/{f2}'] = data[f1] / data[f2]
        data[f'{f2}/{f1}'] = data[f2] / data[f1]
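##############
# Optional guard, not part of the original pipeline: the ratio crosses above
# produce +/-inf wherever a denominator is zero. LightGBM treats NaN as
# missing, so a common cleanup is to map infinities to NaN before training.
# `replace_inf_with_nan` is an illustrative helper name.
##############

def replace_inf_with_nan(df):
    # map +/-inf from the division crosses to NaN (treated as missing)
    return df.replace([np.inf, -np.inf], np.nan)

# data = replace_inf_with_nan(data)  # optional; left out to keep the original behavior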
##############
# Reduce memory usage by downcasting dtypes
##############

def reduce_mem_usage(df):

    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # the original looped over `features`, which is only defined further
    # down; looping over the frame's own columns is the working equivalent
    for col in tqdm(df.columns):
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)  # lossy, but accepted here for memory
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

data = reduce_mem_usage(data)


##############
# Split back into train and test
##############

train = data[~data['LABEL'].isna()].reset_index(drop=True)
test = data[data['LABEL'].isna()].reset_index(drop=True)

features = [i for i in train.columns if i not in ['LABEL', 'CUST_UID']]
y = train['LABEL']
print("Train rows:", len(train), "| Test rows:", len(test), "| Feature count:", len(features))
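##############
# Optional pre-filter, not part of the original pipeline: the blanket
# crosses above create many degenerate columns, and the README puts the full
# feature set at roughly 50 GB of RAM. Dropping single-valued columns before
# training is a cheap way to trim that. `drop_constant_features` is an
# illustrative helper name.
##############

def drop_constant_features(frame, cols):
    # keep only columns with more than one distinct value (NaN counts as one)
    return [c for c in cols if frame[c].nunique(dropna=False) > 1]

# features = drop_constant_features(train, features)  # optional; the original keeps everything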
def train_model(X_train, X_test, features, y, seed=2021, save_model=False):
    """
    Train a 5-fold LightGBM model; return the feature importances,
    out-of-fold predictions, and mean test predictions.
    """
    feat_imp_df = pd.DataFrame({'feat': features, 'imp': 0})
    KF = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)
    score_list = []
    params = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'metric': 'auc',
        'n_jobs': -1,
        'learning_rate': 0.05,
        'num_leaves': 2 ** 6,
        'max_depth': 8,
        'tree_learner': 'serial',
        'colsample_bytree': 0.8,
        'subsample_freq': 1,
        'subsample': 0.8,
        'num_boost_round': 5000,
        'max_bin': 255,
        'verbose': -1,
        'seed': seed,
        'bagging_seed': seed,
        'feature_fraction_seed': seed,
        'early_stopping_rounds': 100,
    }
    oof_lgb = np.zeros(len(X_train))
    predictions_lgb = np.zeros(len(X_test))

    for fold_, (trn_idx, val_idx) in enumerate(KF.split(X_train.values, y.values)):
        trn_data = lgb.Dataset(X_train.iloc[trn_idx][features], label=y.iloc[trn_idx])
        val_data = lgb.Dataset(X_train.iloc[val_idx][features], label=y.iloc[val_idx])
        # num_boost_round and early_stopping_rounds are taken from params;
        # the original also passed conflicting duplicates at the call site
        clf = lgb.train(
            params,
            trn_data,
            valid_sets=[trn_data, val_data],
            verbose_eval=100,
        )

        oof_lgb[val_idx] = clf.predict(X_train.iloc[val_idx][features], num_iteration=clf.best_iteration)
        predictions_lgb += clf.predict(X_test[features], num_iteration=clf.best_iteration) / 5
        feat_imp_df['imp'] += clf.feature_importance() / 5
        score_list.append(roc_auc_score(y.iloc[val_idx], oof_lgb[val_idx]))
        if save_model:
            clf.save_model(f'model_{fold_}.txt')

    print("AUC score: {}".format(roc_auc_score(y, oof_lgb)))
    print("F1 score: {}".format(f1_score(y, [1 if i >= 0.5 else 0 for i in oof_lgb])))
    print("Precision score: {}".format(precision_score(y, [1 if i >= 0.5 else 0 for i in oof_lgb])))
    print("Recall score: {}".format(recall_score(y, [1 if i >= 0.5 else 0 for i in oof_lgb])))
    print("AUC mean: {}".format(np.mean(score_list)))
    return feat_imp_df, oof_lgb, predictions_lgb



##############
# Train once on the full feature set
# to obtain the feature importance ranking
##############
seeds = [2021]

pred = []
oof = []
for seed in seeds:
    feat_imp_df, oof_lgb, predictions_lgb = train_model(train, test, features, y, seed)
    pred.append(predictions_lgb)
    oof.append(oof_lgb)

feat_imp_df.to_pickle('feature_imp_a.pkl')


##############
# Sweep importance-ranked feature subsets,
# recording subset sizes with a strong validation score
##############

from collections import defaultdict

off_score_dict = defaultdict(int)

for i in range(201, 501):

    features2 = feat_imp_df.sort_values(['imp'])[-i:]['feat'].to_list()
    seeds = [2021]
    pred = []
    oof = []
    for seed in seeds:
        _, oof_lgb, predictions_lgb = train_model(train, test, features2, y, seed)
        pred.append(predictions_lgb)
        oof.append(oof_lgb)

    score_ = roc_auc_score(y, np.mean(oof, axis=0))
    if score_ > 0.953:
        off_score_dict[i] = score_

# persist the sweep results for the blending pass below (the original saved
# this dict before the sweep, where it is not yet defined)
np.save('score_dict_a.npy', off_score_dict)


##############
# Blend by mean over the strongest subsets
# Write the submission file
##############

score_dict = np.load('score_dict_a.npy', allow_pickle=True).item()
feat_imp_df = pd.read_pickle('feature_imp_a.pkl')

pred = []
oof = []
for k, v in tqdm(score_dict.items()):
    if v > 0.9532:
        features2 = feat_imp_df.sort_values(['imp'])[-k:]['feat'].to_list()
        _, oof_lgb, predictions_lgb = train_model(train, test, features2, y, seed=2021)
        pred.append(predictions_lgb)
        oof.append(oof_lgb)

test['label'] = np.mean(pred, axis=0)
test[['CUST_UID', 'label']].to_csv('sub.txt', index=False, header=None, sep=' ')
print(test[['CUST_UID', 'label']].head())
print("Submission rows:", len(test))
--------------------------------------------------------------------------------