├── README.md
├── run_B.py
└── run_A.py

/README.md:
--------------------------------------------------------------------------------
# China Merchants Bank 2022 FinTech Elite Training Camp

* Leaderboard A: 9th place; Leaderboard B: 26th place

## Environment

numpy==1.19.5
pandas==1.1.5
scipy==1.5.4
scikit-learn==0.24.2
lightgbm==3.3.2
tqdm==4.62.3


## Leaderboard A approach

* Simple preprocessing of the raw features: subtract 2 across the board (on the hypothesis that adding 2 was the transform the organizers applied for anonymization)
* Feature crosses: +, -, ×, ÷, groupby combinations, and numeric-categorical interactions
* Feature selection by sweeping importance-ranked feature subsets; the models (each built on a different subset) whose offline score clears a threshold are blended
* Online and offline scores tracked each other closely on Leaderboard A, so the gains were stable
* The full feature set needs roughly 50 GB of RAM (run with caution on a laptop)


## Leaderboard B approach

* After studying the data and the scores other contestants shared, one idea dominated everything else: *find the toxic features*!
* Features whose distributions differ between the train and test sets get deleted
* First, adversarial validation surfaced the features whose AUC sits far above 0.5 (a large share of their names contain CUR; our guess is that the test data were drawn from a different year than the training data, hence the pronounced drift), so every feature containing CUR was dropped (a hedged sketch of this check appears inside run_B.py below)
* Then came superstition-driven feature deletion (the main pattern we observed was that the lower the offline score, the higher the online score; a strange phenomenon)
* That step relied on feature importance, iteratively deleting the most important features (the reverse of the Leaderboard A procedure, aiming for a lower offline and a higher online score)
* The remaining submissions went to trial-and-error probing (attempts were limited, and we did this part poorly)

--------------------------------------------------------------------------------
/run_B.py:
--------------------------------------------------------------------------------
import os
import gc
import warnings

import lightgbm as lgb
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder


warnings.filterwarnings('ignore')

##############
# Load the data
# Concatenate train and test
##############

if os.path.exists('data.pkl'):
    data = pd.read_pickle('data.pkl')
else:
    train = pd.read_excel('train.xlsx')
    test = pd.read_excel('test_B榜.xlsx')

    data = pd.concat([train, test]).reset_index(drop=True)
    data.to_pickle('data.pkl')  # cache so the next run takes the fast path above


##############
# Preprocessing
# Encoding, missing values, etc.
# Subtract 2 from numeric features to undo the anonymization offset
##############

ff = [i for i in data.columns if i not in ['LABEL', 'CUST_UID']]
cat_f = ['MON_12_CUST_CNT_PTY_ID', 'WTHR_OPN_ONL_ICO', 'LGP_HLD_CARD_LVL', 'NB_CTC_HLD_IDV_AIO_CARD_SITU']
num_f = []
for f in tqdm(ff):
    data[f] = data[f].fillna(-2)
    data[f] = data[f].astype('str')
    data[f] = data[f].apply(lambda x: x.replace('?', '-1'))  # '?' marks missing values in the raw files

    if f not in cat_f:
        data[f] = data[f].astype('float')
        data[f] = data[f].replace(-1, np.nan)
    else:
        if f == 'MON_12_CUST_CNT_PTY_ID':
            lb = LabelEncoder()
            data[f] = lb.fit_transform(data[f])
        else:
            grade_dict = {'A': 7, 'B': 6, 'C': 5, 'D': 4, 'E': 3, 'F': 2, 'G': 1}
            data[f] = data[f].map(grade_dict)
            data[f] = data[f].replace(-1, np.nan)

    data[f] -= 2
    if data[f].max() > 1000000:
        num_f.append(f)  # large-magnitude numeric features (not used further in this script)


##############
# Likely toxic features:
# everything containing CUR, plus time-related and a few hand-picked columns
##############

cur_f_list = [i for i in data.columns if 'CUR' in i] + ['OPN_TM', 'REG_DT', 'REG_CPT',
                                                        'COR_KEY_PROD_HLD_NBR',
                                                        'WTHR_OPN_ONL_ICO',
                                                        'NB_RCT_3_MON_LGN_TMS_AGV',
                                                        'AGN_AGR_LATEST_AGN_AMT',
                                                        ]
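##############
# Adversarial validation sketch.
# Not part of the original run: a minimal, hedged illustration of how the
# CUR features above can be flagged, per the README. Label train rows 0 and
# test rows 1, fit a tiny single-feature model, and read off the
# cross-validated AUC; anything far above 0.5 separates train from test and
# is a drift suspect. `adversarial_feature_auc` is an illustrative helper
# name, not from the original code.
##############

from sklearn.model_selection import cross_val_score

def adversarial_feature_auc(df, feature_cols):
    is_test = df['LABEL'].isna().astype(int)  # 1 = test row, 0 = train row
    aucs = {}
    for f in feature_cols:
        clf = lgb.LGBMClassifier(n_estimators=50, learning_rate=0.1)
        # one model per single feature; LightGBM handles the NaNs natively
        scores = cross_val_score(clf, df[[f]], is_test, cv=3, scoring='roc_auc')
        aucs[f] = scores.mean()
    return pd.Series(aucs).sort_values(ascending=False)

# Slow on the full frame, so left commented out:
# print(adversarial_feature_auc(data, ff).head(20))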
##############
# Split back into train and test
##############

train = data[~data['LABEL'].isna()].reset_index(drop=True)
test = data[data['LABEL'].isna()].reset_index(drop=True)

features = [i for i in train.columns if i not in ['LABEL', 'CUST_UID'] + cur_f_list]
y = train['LABEL']
print("Train rows:", len(train), "| Test rows:", len(test), "| Feature count:", len(features))

del data
gc.collect()


def train_model(X_train, X_test, features, y, seed=2021, save_model=False):
    """
    Train a 5-fold LightGBM model (random-forest boosting mode); return the
    feature importances, out-of-fold predictions, and mean test predictions.
    """
    feat_imp_df = pd.DataFrame({'feat': features, 'imp': 0})
    KF = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)
    score_list = []
    params = {
        'objective': 'binary',
        'boosting_type': 'rf',
        'metric': 'auc',
        'n_jobs': -1,
        'learning_rate': 0.05,
        'num_leaves': 2 ** 6,
        'max_depth': 8,
        'tree_learner': 'serial',
        'colsample_bytree': 0.8,
        'subsample_freq': 1,
        'subsample': 0.8,
        'num_boost_round': 5000,
        'max_bin': 50,
        'verbose': -1,
        'seed': seed,
        'bagging_seed': seed,
        'feature_fraction_seed': seed,
        'early_stopping_rounds': 100,
    }
    oof_lgb = np.zeros(len(X_train))
    predictions_lgb = np.zeros(len(X_test))

    for fold_, (trn_idx, val_idx) in enumerate(KF.split(X_train.values, y.values)):
        trn_data = lgb.Dataset(X_train.iloc[trn_idx][features], label=y.iloc[trn_idx])
        val_data = lgb.Dataset(X_train.iloc[val_idx][features], label=y.iloc[val_idx])
        # num_boost_round and early_stopping_rounds are taken from params;
        # the original also passed conflicting duplicates at the call site
        clf = lgb.train(
            params,
            trn_data,
            valid_sets=[trn_data, val_data],
            verbose_eval=100,
        )

        oof_lgb[val_idx] = clf.predict(X_train.iloc[val_idx][features], num_iteration=clf.best_iteration)
        predictions_lgb += clf.predict(X_test[features], num_iteration=clf.best_iteration) / 5
        feat_imp_df['imp'] += clf.feature_importance() / 5
        score_list.append(roc_auc_score(y.iloc[val_idx], oof_lgb[val_idx]))
        if save_model:
            clf.save_model(f'model_{fold_}.txt')

    print("AUC score: {}".format(roc_auc_score(y, oof_lgb)))
    print("F1 score: {}".format(f1_score(y, [1 if i >= 0.5 else 0 for i in oof_lgb])))
    print("Precision score: {}".format(precision_score(y, [1 if i >= 0.5 else 0 for i in oof_lgb])))
    print("Recall score: {}".format(recall_score(y, [1 if i >= 0.5 else 0 for i in oof_lgb])))
    print("AUC mean: {}".format(np.mean(score_list)))
    return feat_imp_df, oof_lgb, predictions_lgb


##############
# Train with multiple seeds
##############

seeds = [666, 888, 999]

pred = []
oof = []
for seed in seeds:
    feat_imp_df, oof_lgb, predictions_lgb = train_model(train, test, features, y, seed)
    pred.append(predictions_lgb)
    oof.append(oof_lgb)


##############
# Blend by mean
# Write the submission file
##############

test['label'] = np.mean(pred, axis=0)
test[['CUST_UID', 'label']].to_csv('b_sub_8524.txt', index=False, header=None, sep=' ')
print(test[['CUST_UID', 'label']].head())
print("Submission rows:", len(test))
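##############
# Optional diagnostic, not part of the original pipeline: the metric prints
# inside train_model use a fixed 0.5 cutoff. The submission above is raw
# scores, so nothing is thresholded there, but if a hard-label F1 ever
# mattered, the cutoff could be tuned on the out-of-fold predictions.
# `best_f1_threshold` is an illustrative helper name.
##############

def best_f1_threshold(y_true, oof_scores):
    # scan candidate cutoffs and keep the one that maximizes OOF F1
    thresholds = np.arange(0.05, 0.95, 0.01)
    f1s = [f1_score(y_true, oof_scores >= t) for t in thresholds]
    best = int(np.argmax(f1s))
    return thresholds[best], f1s[best]

# t, best = best_f1_threshold(y, np.mean(oof, axis=0))
# print(f'best OOF cutoff: {t:.2f}, F1: {best:.4f}')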
--------------------------------------------------------------------------------
/run_A.py:
--------------------------------------------------------------------------------
import os
import warnings

import lightgbm as lgb
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder


warnings.filterwarnings('ignore')

##############
# Load the data
# Concatenate train and test
##############

if os.path.exists('data.pkl'):
    data = pd.read_pickle('data.pkl')
else:
    train = pd.read_excel('train.xlsx')
    test = pd.read_excel('test_A榜.xlsx')

    data = pd.concat([train, test]).reset_index(drop=True)
    data.to_pickle('data.pkl')  # cache so the next run takes the fast path above


##############
# Preprocessing
# Encoding, missing values, etc.
# Subtract 2 from numeric features to undo the anonymization offset
##############

ff = [i for i in data.columns if i not in ['LABEL', 'CUST_UID']]
cat_f = ['MON_12_CUST_CNT_PTY_ID', 'WTHR_OPN_ONL_ICO', 'LGP_HLD_CARD_LVL', 'NB_CTC_HLD_IDV_AIO_CARD_SITU']
num_f = []
for f in tqdm(ff):
    data[f] = data[f].fillna(-2)
    data[f] = data[f].astype('str')
    data[f] = data[f].apply(lambda x: x.replace('?', '-1'))  # '?' marks missing values in the raw files

    if f not in cat_f:
        data[f] = data[f].astype('float')
        data[f] = data[f].replace(-1, np.nan)
    else:
        if f == 'MON_12_CUST_CNT_PTY_ID':
            lb = LabelEncoder()
            data[f] = lb.fit_transform(data[f])
        else:
            grade_dict = {'A': 7, 'B': 6, 'C': 5, 'D': 4, 'E': 3, 'F': 2, 'G': 1}
            data[f] = data[f].map(grade_dict)
            data[f] = data[f].replace(-1, np.nan)

    data[f] -= 2
    if data[f].max() > 1000000:
        num_f.append(f)  # large-magnitude numeric features, used for the log crosses below

##############
# Feature engineering
# 1. Deviation-from-group features
# 2. Numeric x categorical crosses
# 3. Pairwise +, -, *, / crosses
##############

for group in tqdm(cat_f):
    for feature in ff:
        if feature not in cat_f:
            # broadcast the group statistics back onto every row
            tmp = data.groupby(group)[feature].agg([sum, min, max, np.mean]).reset_index()
            tmp = pd.merge(data, tmp, on=group, how='left')
            data['{}-mean_gb_{}'.format(feature, group)] = data[feature] - tmp['mean']
            data['{}-min_gb_{}'.format(feature, group)] = data[feature] - tmp['min']
            data['{}-max_gb_{}'.format(feature, group)] = data[feature] - tmp['max']
            data['{}/sum_gb_{}'.format(feature, group)] = data[feature] / tmp['sum']

for i in tqdm(range(len(num_f))):
    for j in range(i + 1, len(num_f)):
        for cat in cat_f[1:]:
            # the original indexed ff here; the range(len(num_f)) bounds
            # indicate num_f was intended
            f1 = num_f[i]
            f2 = num_f[j]
            data[f'{f1}_{f2}_log_{cat}'] = (np.log1p(data[f1]) - np.log1p(data[f2])) * data[cat]
            data[f'{f1}+{f2}_log_{cat}'] = (np.log1p(data[f1]) + np.log1p(data[f2])) * data[cat]
            data[f'{f1}*{f2}_log_{cat}'] = (np.log1p(data[f1]) * np.log1p(data[f2])) * data[cat]
            data[f'{f1}/{f2}_log_{cat}'] = (np.log1p(data[f1]) / np.log1p(data[f2])) * data[cat]
            data[f'{f2}/{f1}_log_{cat}'] = (np.log1p(data[f2]) / np.log1p(data[f1])) * data[cat]

            data[f'{f1}_{f2}_log_{cat}_'] = (np.log1p(data[f1]) - np.log1p(data[f2])) / data[cat]
            data[f'{f1}+{f2}_log_{cat}_'] = (np.log1p(data[f1]) + np.log1p(data[f2])) / data[cat]
            data[f'{f1}*{f2}_log_{cat}_'] = (np.log1p(data[f1]) * np.log1p(data[f2])) / data[cat]
            data[f'{f1}/{f2}_log_{cat}_'] = (np.log1p(data[f1]) / np.log1p(data[f2])) / data[cat]
            data[f'{f2}/{f1}_log_{cat}_'] = (np.log1p(data[f2]) / np.log1p(data[f1])) / data[cat]


for i in tqdm(range(len(ff))):
    for j in range(i + 1, len(ff)):
        f1 = ff[i]
        f2 = ff[j]
        data[f'{f1}_{f2}'] = data[f1] - data[f2]
        data[f'{f1}+{f2}'] = data[f1] + data[f2]
        data[f'{f1}*{f2}'] = data[f1] * data[f2]
        data[f'{f1}/{f2}'] = data[f1] / data[f2]
        data[f'{f2}/{f1}'] = data[f2] / data[f1]
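##############
# Optional guard, not part of the original pipeline: the ratio crosses above
# produce +/-inf wherever a denominator is zero. LightGBM treats NaN as
# missing, so a common cleanup is to map infinities to NaN before training.
# `replace_inf_with_nan` is an illustrative helper name.
##############

def replace_inf_with_nan(df):
    # map +/-inf from the division crosses to NaN (treated as missing)
    return df.replace([np.inf, -np.inf], np.nan)

# data = replace_inf_with_nan(data)  # optional; left out to keep the original behavior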
##############
# Reduce memory usage by downcasting dtypes
##############

def reduce_mem_usage(df):

    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # the original looped over `features`, which is only defined further
    # down; looping over the frame's own columns is the working equivalent
    for col in tqdm(df.columns):
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)  # lossy, but accepted here for memory
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

data = reduce_mem_usage(data)


##############
# Split back into train and test
##############

train = data[~data['LABEL'].isna()].reset_index(drop=True)
test = data[data['LABEL'].isna()].reset_index(drop=True)

features = [i for i in train.columns if i not in ['LABEL', 'CUST_UID']]
y = train['LABEL']
print("Train rows:", len(train), "| Test rows:", len(test), "| Feature count:", len(features))
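##############
# Optional pre-filter, not part of the original pipeline: the blanket
# crosses above create many degenerate columns, and the README puts the full
# feature set at roughly 50 GB of RAM. Dropping single-valued columns before
# training is a cheap way to trim that. `drop_constant_features` is an
# illustrative helper name.
##############

def drop_constant_features(frame, cols):
    # keep only columns with more than one distinct value (NaN counts as one)
    return [c for c in cols if frame[c].nunique(dropna=False) > 1]

# features = drop_constant_features(train, features)  # optional; the original keeps everything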
def train_model(X_train, X_test, features, y, seed=2021, save_model=False):
    """
    Train a 5-fold LightGBM model; return the feature importances,
    out-of-fold predictions, and mean test predictions.
    """
    feat_imp_df = pd.DataFrame({'feat': features, 'imp': 0})
    KF = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)
    score_list = []
    params = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'metric': 'auc',
        'n_jobs': -1,
        'learning_rate': 0.05,
        'num_leaves': 2 ** 6,
        'max_depth': 8,
        'tree_learner': 'serial',
        'colsample_bytree': 0.8,
        'subsample_freq': 1,
        'subsample': 0.8,
        'num_boost_round': 5000,
        'max_bin': 255,
        'verbose': -1,
        'seed': seed,
        'bagging_seed': seed,
        'feature_fraction_seed': seed,
        'early_stopping_rounds': 100,
    }
    oof_lgb = np.zeros(len(X_train))
    predictions_lgb = np.zeros(len(X_test))

    for fold_, (trn_idx, val_idx) in enumerate(KF.split(X_train.values, y.values)):
        trn_data = lgb.Dataset(X_train.iloc[trn_idx][features], label=y.iloc[trn_idx])
        val_data = lgb.Dataset(X_train.iloc[val_idx][features], label=y.iloc[val_idx])
        # num_boost_round and early_stopping_rounds are taken from params;
        # the original also passed conflicting duplicates at the call site
        clf = lgb.train(
            params,
            trn_data,
            valid_sets=[trn_data, val_data],
            verbose_eval=100,
        )

        oof_lgb[val_idx] = clf.predict(X_train.iloc[val_idx][features], num_iteration=clf.best_iteration)
        predictions_lgb += clf.predict(X_test[features], num_iteration=clf.best_iteration) / 5
        feat_imp_df['imp'] += clf.feature_importance() / 5
        score_list.append(roc_auc_score(y.iloc[val_idx], oof_lgb[val_idx]))
        if save_model:
            clf.save_model(f'model_{fold_}.txt')

    print("AUC score: {}".format(roc_auc_score(y, oof_lgb)))
    print("F1 score: {}".format(f1_score(y, [1 if i >= 0.5 else 0 for i in oof_lgb])))
    print("Precision score: {}".format(precision_score(y, [1 if i >= 0.5 else 0 for i in oof_lgb])))
    print("Recall score: {}".format(recall_score(y, [1 if i >= 0.5 else 0 for i in oof_lgb])))
    print("AUC mean: {}".format(np.mean(score_list)))
    return feat_imp_df, oof_lgb, predictions_lgb



##############
# Train once on the full feature set
# to obtain the feature importance ranking
##############
seeds = [2021]

pred = []
oof = []
for seed in seeds:
    feat_imp_df, oof_lgb, predictions_lgb = train_model(train, test, features, y, seed)
    pred.append(predictions_lgb)
    oof.append(oof_lgb)

feat_imp_df.to_pickle('feature_imp_a.pkl')


##############
# Sweep importance-ranked feature subsets,
# recording subset sizes with a strong validation score
##############

from collections import defaultdict

off_score_dict = defaultdict(int)

for i in range(201, 501):

    features2 = feat_imp_df.sort_values(['imp'])[-i:]['feat'].to_list()
    seeds = [2021]
    pred = []
    oof = []
    for seed in seeds:
        _, oof_lgb, predictions_lgb = train_model(train, test, features2, y, seed)
        pred.append(predictions_lgb)
        oof.append(oof_lgb)

    score_ = roc_auc_score(y, np.mean(oof, axis=0))
    if score_ > 0.953:
        off_score_dict[i] = score_

# persist the sweep results for the blending pass below (the original saved
# this dict before the sweep, where it is not yet defined)
np.save('score_dict_a.npy', off_score_dict)


##############
# Blend by mean over the strongest subsets
# Write the submission file
##############

score_dict = np.load('score_dict_a.npy', allow_pickle=True).item()
feat_imp_df = pd.read_pickle('feature_imp_a.pkl')

pred = []
oof = []
for k, v in tqdm(score_dict.items()):
    if v > 0.9532:
        features2 = feat_imp_df.sort_values(['imp'])[-k:]['feat'].to_list()
        _, oof_lgb, predictions_lgb = train_model(train, test, features2, y, seed=2021)
        pred.append(predictions_lgb)
        oof.append(oof_lgb)

test['label'] = np.mean(pred, axis=0)
test[['CUST_UID', 'label']].to_csv('sub.txt', index=False, header=None, sep=' ')
print(test[['CUST_UID', 'label']].head())
print("Submission rows:", len(test))
--------------------------------------------------------------------------------