├── README.md ├── baseline_v11.py ├── clean_a.py ├── clean_b.py ├── data_pred_a+2b.py ├── data_pred_a+b.py ├── fast_baseline_v11.py ├── feat_a.py ├── feature ├── get_most.py └── white.py ├── hebing_pred.py ├── images └── q.jpg ├── input └── .gitkeep ├── model1.py ├── model2.py ├── model3_1.py ├── model3_4.py ├── model_pred_fee.py ├── piupiu_white.py ├── requirements.txt ├── run_perfect.sh ├── run_top1.sh ├── tool.py └── utils.py /README.md: -------------------------------------------------------------------------------- 1 | # CCF BDCI 2018: 联通研究院--个性化套餐匹配 2 | 3 | [![license](https://img.shields.io/github/license/mashape/apistatus.svg?maxAge=2592000)](https://github.com/minerva-ml/open-solution-home-credit/blob/master/LICENSE) 4 | 5 | 这是CCF BDCI 2018的面向电信领域的个性化套餐匹配第一名解决方案,赛题链接 6 | [面向电信行业存量用户的智能套餐个性化匹配模型 7 | ](https://www.datafountain.cn/competitions/311/details) :hibiscus:. 8 | 9 | 10 | 11 | ## Highlight 12 | 13 | - 嫁接单模型可达第一名 14 | 15 | ## Contributor 16 | - [piupiu](https://www.kaggle.com/pureheart) 17 | - [30CrMnSiA](https://www.kaggle.com/h4211819) 18 | - [spongebob](https://www.kaggle.com/baomengjiao) 19 | - [feiyang](https://www.kaggle.com/panfeiyang) 20 | 21 | ## Requirements 22 | - python3 23 | - lightgbm 2.1.1 24 | - numpy 1.14.3 25 | - pandas 0.23.1 26 | - scikit_learn 0.19.1 27 | - xgboost 0.72.1 28 | 29 | 30 | ## 运行方法 31 | - trian_old.csv(初赛训练集) | train.csv | test.csv -> input folder 32 | - pip3 install -r requirements.txt 33 | - chmod +x run_top1.sh 34 | - chmod +x run_perfect.sh 35 | - ./run_top1.sh (复现leaderboard第一) 36 | - ./run_perfect.sh (完美复现) 37 | 38 | 39 | 40 | ## 参与进来 41 | 欢迎整理贡献更多竞赛信息: 42 | 43 | - 代码分享 :dog: [皮皮虾我们走](https://github.com/PPshrimpGo) 发现更多开源方案~ 44 | 45 | - 信息分享 :sparkles: [DataSciComp](https://github.com/iphysresearch/DataSciComp) 46 | 47 | - 方案及套路分享 :fire: [砍手豪的知乎专栏](https://zhuanlan.zhihu.com/c_32887913) [数据科学之旅](https://zhuanlan.zhihu.com/c_173835749) 48 | 49 | ![image](http://github.com/PPshrimpGo/BDCI2018-ChinauUicom-1st-solution/raw/master/images/q.jpg) 50 | -------------------------------------------------------------------------------- /baseline_v11.py: -------------------------------------------------------------------------------- 1 | #import dask.dataframe as dd 2 | #from dask.multiprocessing import get 3 | import itertools 4 | import numpy as np 5 | import pandas as pd 6 | import gc 7 | import time 8 | from contextlib import contextmanager 9 | import lightgbm as lgb 10 | from sklearn.metrics import roc_auc_score, roc_curve, mean_squared_error,mean_absolute_error, f1_score 11 | from sklearn.model_selection import KFold, StratifiedKFold,GroupKFold 12 | #import matplotlib.pyplot as plt 13 | #import seaborn as sns 14 | import warnings 15 | from sklearn.preprocessing import LabelEncoder 16 | from utils import * 17 | #from utils2 import * 18 | #from utils3 import * 19 | from datetime import datetime 20 | from datetime import timedelta 21 | #from tqdm import tqdm 22 | #test 23 | 24 | warnings.simplefilter(action='ignore', category=FutureWarning) 25 | 26 | 27 | 28 | USE_KFOLD = True 29 | 30 | data_path = './input/' 31 | 32 | ####################################读入文件#################################################### 33 | 34 | 35 | def astype(x,t): 36 | try: 37 | return t(x) 38 | except: 39 | return np.nan 40 | 41 | def have_0(x): 42 | try: 43 | r = x.split('.')[1][-1] 44 | return 0 if r=='0' else 1 45 | except: 46 | return 1 47 | 48 | str_dict = {'1_total_fee': 'str', 49 | '2_total_fee': 'str', 50 | '3_total_fee': 'str', 51 | '4_total_fee': 'str', 52 | 'pay_num': 'str', 53 | } 54 | 55 | 56 | have_0_c = ['1_total_fee', 57 | '2_total_fee', 58 | '3_total_fee', 59 | '4_total_fee', 60 | 'pay_num'] 61 | 62 | def deal(data): 63 | for c in have_0_c: 64 | data['have_0_{}'.format(c)] = data[c].apply(have_0) 65 | try: 66 | data[c] = data[c].astype(float) 67 | except: 68 | pass 69 | data['2_total_fee'] = data['2_total_fee'].apply(lambda x: astype(x,float)) 70 | data['3_total_fee'] = data['3_total_fee'].apply(lambda x: astype(x,float)) 71 | data['age'] = data['age'].apply(lambda x: astype(x,int)) 72 | data['gender'] = data['gender'].apply(lambda x: astype(x,int)) 73 | data.loc[data['age']==0,'age'] = np.nan 74 | data.loc[data['1_total_fee'] < 0, '1_total_fee'] = np.nan 75 | data.loc[data['2_total_fee'] < 0, '2_total_fee'] = np.nan 76 | data.loc[data['3_total_fee'] < 0, '3_total_fee'] = np.nan 77 | data.loc[data['4_total_fee'] < 0, '4_total_fee'] = np.nan 78 | for c in [ 79 | '1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee', 80 | 'month_traffic', 'last_month_traffic', 'local_trafffic_month', 81 | 'local_caller_time', 'service1_caller_time', 'service2_caller_time', 82 | 'many_over_bill', 'contract_type', 'contract_time', 'pay_num', ]: 83 | data[c] = data[c].round(4) 84 | data['is_duplicated'] =data.duplicated(subset=['1_total_fee','2_total_fee','3_total_fee', 85 | 'month_traffic','pay_times','last_month_traffic','service2_caller_time','age'],keep=False) 86 | return data 87 | 88 | train = pd.read_csv(data_path + 'train.csv',dtype=str_dict) 89 | train = deal(train) 90 | train.drop_duplicates(subset = ['1_total_fee','2_total_fee','3_total_fee', 91 | 'month_traffic','pay_times','last_month_traffic','service2_caller_time','age'],inplace=True) 92 | train = train[train['current_service'] != 999999] 93 | test = pd.read_csv(data_path + 'test.csv',dtype=str_dict) 94 | test = deal(test) 95 | 96 | ####################################读入stacking文件#################################################### 97 | 98 | train_p = pd.read_csv('./cv/val_prob_model_1.csv') 99 | train = train.merge(train_p,on='user_id',how='left') 100 | test_p = pd.read_csv('./cv/sub_prob_model_1.csv') 101 | test = test.merge(test_p,on='user_id',how='left') 102 | 103 | 104 | train_p = pd.read_csv('val_prob_hebing2.csv') 105 | train = train.merge(train_p,on='user_id',how='left') 106 | test_p = pd.read_csv('sub_prob_hebing2.csv') 107 | test = test.merge(test_p,on='user_id',how='left') 108 | 109 | train_p = pd.read_csv('./cv/val_prob_model_2.csv') 110 | train = train.merge(train_p,on='user_id',how='left') 111 | test_p = pd.read_csv('./cv/sub_prob_model_2.csv') 112 | test = test.merge(test_p,on='user_id',how='left') 113 | 114 | test_total_fee = pd.read_csv('./sub/sub_fee_pred.csv') 115 | train_total_fee = pd.read_csv('./sub/val_fee_pred.csv') 116 | train_total_fee.columns = ['user_id', 'fee_pred'] 117 | test_total_fee.columns = ['user_id', 'fee_pred'] 118 | train = train.merge(train_total_fee,on='user_id',how='left') 119 | test = test.merge(test_total_fee,on='user_id',how='left') 120 | 121 | label2current_service =dict(zip(range(0,len(set(train['current_service']))),sorted(list(set(train['current_service']))))) 122 | current_service2label =dict(zip(sorted(list(set(train['current_service']))),range(0,len(set(train['current_service']))))) 123 | print(len(label2current_service)) 124 | train = train.append(test).reset_index(drop = True) 125 | 126 | 127 | piupiu_p = pd.read_csv('data_preds_xgb1_20181030_050913.csv') 128 | train = train.merge(piupiu_p,on='user_id',how='left') 129 | piupiu_p2 = pd.read_csv('data_preds_xgb1_20181030_230142.csv') 130 | train = train.merge(piupiu_p2,on='user_id',how='left') 131 | 132 | get_most = pd.read_csv('Magic_Feature_Exclude_Old.csv') #不包括初赛的 133 | get_most2 = pd.read_csv('Magic_Feature_Include_Old.csv') #包括初赛的 134 | 135 | 136 | ####################################特征工程################################################### 137 | 138 | call_time = ['local_caller_time', 'service1_caller_time', 'service2_caller_time'] 139 | traffic = ['month_traffic','last_month_traffic','local_trafffic_month'] 140 | cat_cols = ['service_type','contract_type', 'net_service', 'gender', 'complaint_level', 141 | #3 #9,8 #4 #3 #4 142 | 'is_mix_service', 'many_over_bill', 'is_promise_low_consume', #2 143 | ] 144 | continus_col = [ 145 | '1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee', 'pay_num','former_complaint_fee', 146 | 147 | 'month_traffic', 'last_month_traffic', 'local_trafffic_month', 148 | 149 | 'local_caller_time', 'service1_caller_time', 'service2_caller_time', 150 | 151 | 'online_time','contract_time', 152 | 153 | 'pay_times', 'former_complaint_num' 154 | ] 155 | def one_hot_encoder(train,column,n=100,nan_as_category=False): 156 | tmp = train[column].value_counts().to_frame() 157 | values = list(tmp[tmp[column]>n].index) 158 | train.loc[train[column].isin(values),column+'N'] = train.loc[train[column].isin(values),column] 159 | train = pd.get_dummies(train, columns=[column+'N'], dummy_na=False) 160 | return train 161 | # 162 | 163 | train['fea-min'] = train[[str(1+i) +'_total_fee' for i in range(4)]].min(axis = 1) 164 | for column in ['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee', 'fea-min']: 165 | get_most.columns = [column,column+'_most'] 166 | train = train.merge(get_most,on=column,how='left') 167 | 168 | 169 | for column in ['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee', 'pay_num','fea-min']: 170 | train[column+'_int'] = train[column].fillna(-1).astype('int') 171 | train[column+'_int_last'] = train[column+'_int']%10 #last int 172 | train[column+'_decimal'] = round(((train[column]-train[column+'_int'])*100).fillna(-1)).astype('int') #decimal 173 | train[column+'_decimal_is_0'] = (train[column+'_decimal']==0).astype('int') 174 | train[column+'_decimal_is_5'] = (train[column+'_decimal']%5==0).astype('int') 175 | train[column+'_decimal_last'] = train[column+'_decimal']%10 176 | train[column+'_decimal_last2'] = train[column+'_decimal']//5 177 | train[column+'_extra_fee'] = ((train[column]*100)-600)%1000 178 | train[column+'_27perMB'] = ((train[column+'_extra_fee']%27 == 0)&(train[column+'_extra_fee'] != 0)).astype('int') 179 | train[column+'_15perMB'] = ((train[column+'_extra_fee']%15 == 0)&(train[column+'_extra_fee'] != 0)).astype('int') 180 | train = one_hot_encoder(train,column,n=1000,nan_as_category=True) 181 | train['pay_num_last2'] = train['pay_num_int']%100 182 | train['former_complaint_fee_last2'] = round(train['former_complaint_fee'])%100 183 | 184 | 185 | train['4-fea-dealta'] = round((train['4_total_fee'] - train['3_total_fee'])*100).fillna(999999.9).astype('int') 186 | train['3-fea-dealta'] = round((train['3_total_fee'] - train['2_total_fee'])*100).fillna(999999.9).astype('int') 187 | train['2-fea-dealta'] = round((train['2_total_fee'] - train['1_total_fee'])*100).fillna(999999.9).astype('int') 188 | train['1-fea-dealta'] = round((train['4_total_fee'] - train['1_total_fee'])*100).fillna(999999.9).astype('int') 189 | train['1-3-fea-dealta'] = round((train['3_total_fee'] - train['1_total_fee'])*100).fillna(999999.9).astype('int') 190 | train['1-min-fea-dealta'] = round((train['1_total_fee'] - train['fea-min'])*100).fillna(999999.9).astype('int') 191 | for column in ['4-fea-dealta', '3-fea-dealta', '2-fea-dealta', '1-fea-dealta','1-3-fea-dealta','1-min-fea-dealta']: 192 | train[column+'_is_0'] = (train[column]==0).astype('int') 193 | train[column+'_is_6000'] = ((train[column]%6000 == 0)&(train[column] != 0)).astype('int') 194 | train[column+'_is_5'] = ((train[column]%5 == 0)&(train[column] != 0)).astype('int') 195 | train[column+'_is_10'] = ((train[column]%10 == 0)&(train[column] != 0)).astype('int') 196 | train[column+'_is_15'] = ((train[column]%15 == 0)&(train[column] != 0)).astype('int') 197 | train[column+'_is_27'] = ((train[column]%27 == 0)&(train[column] != 0)).astype('int') 198 | train[column+'_is_30'] = ((train[column]%30 == 0)&(train[column] != 0)).astype('int') 199 | train[column+'_is_50'] = ((train[column]%50 == 0)&(train[column] != 0)).astype('int') 200 | train[column+'_is_100'] = ((train[column]%100 == 0)&(train[column] != 0)).astype('int') 201 | train[column+'_is_500'] = ((train[column]%500 == 0)&(train[column] != 0)).astype('int') 202 | 203 | for column in ['month_traffic', 'last_month_traffic', 'local_trafffic_month']: 204 | train[column+'_is_int'] = ((train[column]%1 == 0)&(train[column] != 0)).astype('int') 205 | train[column+'_is_512'] = ((train[column]%512 == 0)&(train[column] != 0)).astype('int') 206 | train[column+'_is_50'] = ((train[column]%50 == 0)&(train[column] != 0)).astype('int') 207 | train[column+'_is_double'] = ((train[column]%512%50 == 0)&(train[column] != 0)&(train[column+'_is_512'] == 0)&(train[column+'_is_50'] == 0)).astype('int') 208 | train = one_hot_encoder(train,column,n=2000,nan_as_category=True) 209 | 210 | train['service12'] = train['service2_caller_time']+train['service1_caller_time'] 211 | for column in ['local_caller_time', 'service1_caller_time', 'service2_caller_time','service12']: 212 | train[column+'_decimal'] = round(((round(train[column])- train[column])*60)).astype('int') 213 | train[column+'_decimal_is_int'] = ((train[column+'_decimal']==0)&(train[column] != 0)).astype('int') 214 | 215 | train = one_hot_encoder(train,'online_time',n=2000,nan_as_category=True) 216 | train = one_hot_encoder(train,'contract_time',n=1000,nan_as_category=True) 217 | 218 | print(train.shape) 219 | train = one_hot_encoder(train,'contract_type',n=1,nan_as_category=True) 220 | 221 | 222 | 223 | train['current_service'] = train['current_service'].map(current_service2label) 224 | 225 | 226 | train['age'] = train['age'].fillna(-20) 227 | train['cut_age'] = train['age'].apply(lambda x: int(x/10)) 228 | train['cut_online'] = (train['online_time'] / 12).astype(int) 229 | 230 | 231 | 232 | 233 | 234 | train['4-fea-dealta_'] = train['4_total_fee'] / (train['3_total_fee']+0.00001) 235 | train['3-fea-dealta_'] = train['3_total_fee'] / (train['2_total_fee']+0.00001) 236 | train['2-fea-dealta_'] = train['2_total_fee'] / (train['1_total_fee']+0.00001) 237 | train['1-fea-dealta_'] = train['4_total_fee'] / (train['1_total_fee']+0.00001) 238 | train['pay_num-dealta_'] = train['pay_num'] / (train['1_total_fee']+0.00001) 239 | 240 | train['fea-dealta_p'] = train['1_total_fee'] - train['fee_pred'] 241 | 242 | train['month_traffic_delata'] = train['month_traffic'] - train['last_month_traffic'] 243 | train['month_traffic_delata_'] = train['month_traffic'] / (train['last_month_traffic']+0.00001) 244 | train['2month_traffic_sum'] = train['month_traffic'] + train['last_month_traffic'] 245 | train['add_month_traffic'] = train['month_traffic'] - train['local_trafffic_month'] 246 | train['add_month_traffic_'] = train['month_traffic'] / (train['local_trafffic_month']+0.00001) 247 | 248 | 249 | train['service1_caller_time_delata'] = train['service1_caller_time'] / (train['service2_caller_time']+0.00001) 250 | train['service1_caller_time_delata2'] = train['service1_caller_time'] / (train['local_caller_time']+0.00001) 251 | train['service2_caller_time_delata_'] = train['service2_caller_time'] / (train['local_caller_time']+0.00001) 252 | train['local_caller_time_reatio'] = train['local_caller_time']/(train['service1_caller_time']+train['service2_caller_time']+0.00001) 253 | 254 | 255 | train['div_online_time_contract'] = train['contract_time'] / (train['online_time']+0.00001) 256 | train['div_online_time_contract'] = train['contract_time'] - train['online_time'] 257 | 258 | 259 | train['div_former_complaint_num'] = train['former_complaint_num'] / (train['pay_times']+0.00001) 260 | train['div_former_complaint_num'] = train['former_complaint_num'] - train['pay_times'] 261 | 262 | 263 | train['fea-sum'] = train[[str(1+i) +'_total_fee' for i in range(4)]].sum(axis = 1) 264 | train['fea-var'] = train[[str(1+i) +'_total_fee' for i in range(4)]].var(axis = 1) 265 | train['fea-max'] = train[[str(1+i) +'_total_fee' for i in range(4)]].max(axis = 1) 266 | train['fea-min'] = train[[str(1+i) +'_total_fee' for i in range(4)]].min(axis = 1) 267 | train['fea-mean4'] = train[[str(1+i) +'_total_fee' for i in range(4)]].sum(axis = 1) 268 | train['fea-mean3'] = train[[str(1+i) +'_total_fee' for i in range(3)]].sum(axis = 1) 269 | train['fea-mean2'] = train[[str(1+i) +'_total_fee' for i in range(2)]].sum(axis = 1) 270 | train['fea-extra'] = train['fea-sum']-4*train['fea-min'] 271 | train['1_total_fee_extra_for_min'] = train['1_total_fee']-train['fea-min'] 272 | train['fea_unum'] = train[['1_total_fee','2_total_fee','3_total_fee', '4_total_fee']].nunique(axis=1) 273 | 274 | train['call_time_sum'] = train[call_time].sum(axis = 1) 275 | train['call_time_var'] = train[call_time].var(axis = 1) 276 | train['call_time_min'] = train[call_time].min(axis = 1) 277 | train['call_time_max'] = train[call_time].max(axis = 1) 278 | 279 | train['traffic_sum'] = train[traffic].sum(axis = 1) 280 | train['traffic_var'] = train[traffic].var(axis = 1) 281 | train['traffic_min'] = train[traffic].min(axis = 1) 282 | train['traffic_max'] = train[traffic].max(axis = 1) 283 | 284 | 285 | train['average_pay'] = train['pay_num'] / train['pay_times'] 286 | 287 | 288 | 289 | train['div_traffic_price_2'] = train['last_month_traffic']/ 1000 / train['2_total_fee'] 290 | train['div_traffic_price_3'] = train['local_trafffic_month']/ 1000 / train['1_total_fee'] 291 | train['div_add_month_traffic_price'] = train['add_month_traffic']/ 1000 / train['1_total_fee'] 292 | train['div_local_caller_time_price'] = train['local_trafffic_month'] / 1000/ train['1_total_fee'] 293 | 294 | 295 | train['1-min-fea-dealta_div'] = train['1-min-fea-dealta']/(train['service1_caller_time']+0.0001) 296 | train['div_service1_caller_time_price'] = train['service1_caller_time'] / train['1_total_fee'] 297 | train['div_local_caller_time'] = train['local_caller_time'] / train['1_total_fee'] 298 | train['div_call_time_sum_price'] = train['call_time_sum'] / train['1_total_fee'] 299 | train['1_total_fee_maybe_real_calller'] = train['1_total_fee']- train['service1_caller_time']*0.15 300 | train['1_total_fee_maybe_real_calller2'] = train['1_total_fee']- train['service1_caller_time']*0.1 301 | train['1_total_fee_extra_for_min_caller_time'] = train['1_total_fee_extra_for_min']/(train['service1_caller_time']+0.001) 302 | 303 | train['div_service1_caller_time'] = train['service1_caller_time']/train['last_month_traffic'] 304 | train['div_local_caller_time'] = train['local_caller_time']/train['last_month_traffic'] 305 | train['div_local_caller_time2'] = train['local_caller_time']/train['month_traffic'] 306 | 307 | 308 | train['avg_complain_fee'] = train['former_complaint_fee'] / (train['former_complaint_num'] + 0.000000001) 309 | 310 | result = [] 311 | 312 | result.append(get_feat_ngroup(train,['cut_age','gender'])) 313 | for size_feat in ['1_total_fee','2_total_fee','3_total_fee', '4_total_fee','pay_num', 314 | 'last_month_traffic','month_traffic','local_trafffic_month', 315 | 'local_caller_time','service1_caller_time','service2_caller_time']: 316 | result.append(get_feat_size(train,[size_feat])) 317 | 318 | 319 | result.append(get_feat_stat_feat(train, ['contract_type'], ['1_total_fee'], ['max'])) 320 | result.append(get_feat_stat_feat(train, ['contract_type'], ['2_total_fee'], ['mean'])) 321 | result.append(get_feat_stat_feat(train, ['contract_type'], ['last_month_traffic'], ['var','mean'])) 322 | result.append(get_feat_stat_feat(train, ['contract_type'], ['call_time_sum'], ['mean'])) 323 | 324 | for base_feat in [['contract_type']]: 325 | for other_feat in ['1_total_fee', 'pay_num', 326 | 'month_traffic', 'last_month_traffic', 'local_trafffic_month', 327 | 'local_caller_time', 'service1_caller_time', 'service2_caller_time', 328 | ]: 329 | stat_list = ['mean'] 330 | tmp = get_feat_stat_feat(train,base_feat,[other_feat],stat_list=stat_list) 331 | name = tmp.columns[0] 332 | train[name] = tmp 333 | train[name+'_comp'] = train[other_feat].values-train[name].values 334 | 335 | 336 | train['1_total_fee_ratio'] = train['1_total_fee']/(train['fea-sum']+0.000001) 337 | train['3_total_fee_ratio'] = train['3_total_fee']/(train['fea-sum']+0.000001) 338 | train['call_time_sum_ratio'] = train['call_time_sum']/(train['traffic_sum']+0.000001) 339 | train['call_time_sum_ratio2'] = train['call_time_sum']/(train['fea-sum']+0.000001) 340 | train['traffic_sum_ratio1'] = train['traffic_sum']/(train['fea-sum']+0.000001) 341 | 342 | 343 | ####################################lgb和metric函数################################################### 344 | 345 | 346 | def f1_score_vali(preds, data_vali): 347 | labels = data_vali.get_label() 348 | preds = np.argmax(preds.reshape(11, -1),axis=0) 349 | score_vali = f1_score(y_true=labels,y_pred=preds,average='macro') 350 | return 'macro_f1_score', score_vali, True 351 | 352 | def evaluate_macroF1_lgb(data_vali, preds): 353 | labels = data_vali.astype(int) 354 | preds = np.array(preds) 355 | preds = np.argmax(preds,axis=1) 356 | score_vali = f1_score(y_true=labels,y_pred=preds,average='macro') 357 | return score_vali 358 | 359 | def kfold_lightgbm(params,df, predictors,target,num_folds, stratified = True, 360 | objective='', metrics='',debug= False, 361 | feval = f1_score_vali, early_stopping_rounds=100, num_boost_round=100, verbose_eval=50, categorical_features=None,sklearn_mertric = evaluate_macroF1_lgb ): 362 | 363 | lgb_params = params 364 | 365 | train_df = df[df[target].notnull()] 366 | test_df = df[df[target].isnull()] 367 | 368 | # Divide in training/validation and test data 369 | print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df[predictors].shape, test_df[predictors].shape)) 370 | del df 371 | gc.collect() 372 | # Cross validation model 373 | if stratified: 374 | folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=1234) 375 | else: 376 | folds = KFold(n_splits= num_folds, shuffle=True, random_state=1234) 377 | 378 | oof_preds = np.zeros((train_df.shape[0],11)) 379 | sub_preds = np.zeros((test_df.shape[0],11)) 380 | feature_importance_df = pd.DataFrame() 381 | feats = predictors 382 | cv_resul = [] 383 | 384 | for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df[target])): 385 | if (USE_KFOLD == False) and (n_fold == 1): 386 | break 387 | train_x, train_y = train_df[feats].iloc[train_idx], train_df[target].iloc[train_idx] 388 | valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df[target].iloc[valid_idx] 389 | #weight_num = train_df['weight'].iloc[train_idx] 390 | 391 | train_y_t = train_y.values 392 | valid_y_t = valid_y.values 393 | print(train_y_t) 394 | xgtrain = lgb.Dataset(train_x.values, label = train_y_t, 395 | feature_name=predictors, 396 | categorical_feature=categorical_features 397 | ) 398 | # xgtrain.set_weight(np.array(weight_num)) 399 | xgvalid = lgb.Dataset(valid_x.values, label = valid_y_t, 400 | feature_name=predictors, 401 | categorical_feature=categorical_features 402 | ) 403 | 404 | clf = lgb.train(lgb_params, 405 | xgtrain, 406 | valid_sets=[xgvalid],#, xgtrain], 407 | valid_names=['valid'],#,'train'], 408 | num_boost_round=num_boost_round, 409 | early_stopping_rounds=early_stopping_rounds, 410 | verbose_eval=verbose_eval, 411 | # feval=feval 412 | ) 413 | 414 | 415 | 416 | oof_preds[valid_idx] = clf.predict(valid_x, num_iteration=clf.best_iteration) 417 | sub_preds += clf.predict(test_df[feats], num_iteration=clf.best_iteration)/ folds.n_splits 418 | 419 | 420 | gain = clf.feature_importance('gain') 421 | fold_importance_df = pd.DataFrame({'feature':clf.feature_name(), 422 | 'split':clf.feature_importance('split'), 423 | 'gain':100*gain/gain.sum(), 424 | 'fold':n_fold, 425 | }).sort_values('gain',ascending=False) 426 | feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0) 427 | 428 | result = sklearn_mertric(valid_y, oof_preds[valid_idx]) 429 | # result = clf.best_score['valid']['macro_f1_score'] 430 | print('Fold %2d macro-f1 : %.6f' % (n_fold + 1, result)) 431 | cv_resul.append(round(result,5)) 432 | gc.collect() 433 | 434 | #score = np.array(cv_resul).mean() 435 | score = "final" 436 | if USE_KFOLD: 437 | # print('Full f1 score %.6f' % score) 438 | for i in range(11): 439 | train_df["class_" + str(i)] = oof_preds[:,i] 440 | test_df["class_" + str(i)] = sub_preds[:,i] 441 | train_df[['user_id'] + ["class_" + str(i) for i in range(11)]].to_csv('./cv/val_prob_{}.csv'.format(score), index= False, float_format = '%.4f') 442 | test_df[['user_id'] + ["class_" + str(i) for i in range(11)]].to_csv('./cv/sub_prob_{}.csv'.format(score), index= False, float_format = '%.4f') 443 | oof_preds = [np.argmax(x)for x in oof_preds] 444 | sub_preds = [np.argmax(x)for x in sub_preds] 445 | train_df[target] = oof_preds 446 | test_df[target] = sub_preds 447 | print(test_df[target].mean()) 448 | train_df[target] = oof_preds 449 | train_df[target] = train_df[target].map(label2current_service) 450 | test_df[target] = sub_preds 451 | test_df[target] = test_df[target].map(label2current_service) 452 | print('all_cv', cv_resul) 453 | train_df[['user_id', target]].to_csv('./sub/val_{}.csv'.format(score), index= False) 454 | test_df[['user_id', target]].to_csv('./sub/sub_{}.csv'.format(score), index= False) 455 | print("test_df mean:") 456 | 457 | display_importances(feature_importance_df,score) 458 | 459 | 460 | 461 | def display_importances(feature_importance_df_,score): 462 | ft = feature_importance_df_[["feature", "split","gain"]].groupby("feature").mean().sort_values(by="gain", ascending=False) 463 | print(ft.head(60)) 464 | ft.to_csv('importance_lightgbm_{}.csv'.format(score),index=True) 465 | cols = ft[:40].index 466 | best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)] 467 | 468 | 469 | ####################################计算################################################################# 470 | 471 | 472 | params = { 473 | 'metric': 'multi_logloss', 474 | 'num_class':11, 475 | 'boosting_type': 'gbdt', 476 | 'objective': 'multiclass', 477 | 'feature_fraction': 0.7, 478 | 'learning_rate': 0.01, 479 | 'bagging_fraction': 0.7, 480 | #'bagging_freq': 2, 481 | 'num_leaves': 64, 482 | 'max_depth': -1, 483 | 'num_threads': 16, 484 | 'seed': 2018, 485 | 'verbose': -1, 486 | #'is_unbalance':True, 487 | } 488 | 489 | 490 | categorical_columns = [ 491 | 'contract_type', 492 | # 'is_mix_service', 493 | # 'is_promise_low_consume', 494 | 'net_service', 495 | 'gender'] 496 | for feature in categorical_columns: 497 | print(f'Transforming {feature}...') 498 | encoder = LabelEncoder() 499 | train[feature] = encoder.fit_transform(train[feature].astype(str)) 500 | 501 | 502 | x = [] 503 | no_use = ['current_service', 'user_id','group','weight' 504 | 505 | ] + x 506 | 507 | 508 | categorical_columns = [] 509 | all_data_frame = [] 510 | all_data_frame.append(train) 511 | 512 | for aresult in result: 513 | all_data_frame.append(aresult) 514 | 515 | train = concat(all_data_frame) 516 | feats = [f for f in train.columns if f not in no_use] 517 | categorical_columns = [f for f in categorical_columns if f not in no_use] 518 | clf = kfold_lightgbm(params,train,feats,'current_service' ,5 , num_boost_round=4000, categorical_features=categorical_columns) 519 | -------------------------------------------------------------------------------- /clean_a.py: -------------------------------------------------------------------------------- 1 | from tool import * 2 | 3 | data_path = './input/' 4 | d = {89950166: 1, 89950167: 2, 89950168: 5, 90063345: 0, 90109916: 4, 5 | 90155946: 8, 99999825: 10, 99999826: 7, 99999827: 6, 99999828: 3, 99999830: 9} 6 | rd = {0: 90063345, 1: 89950166, 2: 89950167, 3: 99999828, 4: 90109916, 7 | 5: 89950168, 6: 99999827, 7: 99999826, 8: 90155946, 9: 99999830, 10: 99999825} 8 | 9 | 10 | def astype(x,t): 11 | try: 12 | return t(x) 13 | except: 14 | return np.nan 15 | 16 | def have_0(x): 17 | try: 18 | r = x.split('.')[1][-1] 19 | return 0 if r=='0' else 1 20 | except: 21 | return 1 22 | 23 | str_dict = {'1_total_fee': 'str', 24 | '2_total_fee': 'str', 25 | '3_total_fee': 'str', 26 | '4_total_fee': 'str', 27 | 'last_month_traffic': 'str', 28 | 'local_caller_time': 'str', 29 | 'local_trafffic_month': 'str', 30 | 'month_traffic': 'str', 31 | 'pay_num': 'str', 32 | 'service1_caller_time': 'str', 33 | 'service2_caller_time': 'str'} 34 | train = pd.read_csv(data_path + 'train_old.csv',dtype=str_dict) 35 | #test = pd.read_csv(data_path + 'republish_test.csv',dtype=str_dict) 36 | train['label'] = train['current_service'].map(d) 37 | 38 | have_0_c = ['1_total_fee', 39 | '2_total_fee', 40 | '3_total_fee', 41 | '4_total_fee', 42 | 'month_traffic', 43 | 'last_month_traffic', 44 | 'local_trafffic_month', 45 | 'local_caller_time', 46 | 'service1_caller_time', 47 | 'service2_caller_time', 48 | 'pay_num'] 49 | def deal(data): 50 | for c in have_0_c: 51 | data['have_0_{}'.format(c)] = data[c].apply(have_0) 52 | try: 53 | data[c] = data[c].astype(float) 54 | except: 55 | pass 56 | for c in ['1_total_fee','2_total_fee', '3_total_fee', '4_total_fee','pay_num' ]: 57 | data['{}_len'.format(c)] = data[c].astype(str).apply(lambda x: 0 if '.' not in x else len(x.split('.')[1])) 58 | data['2_total_fee'] = data['2_total_fee'].apply(lambda x: astype(x,float)) 59 | data['3_total_fee'] = data['3_total_fee'].apply(lambda x: astype(x,float)) 60 | data['age'] = data['age'].apply(lambda x: astype(x,int)) 61 | data['gender'] = data['gender'].apply(lambda x: astype(x,int)) 62 | data.loc[data['age']==0,'age'] = np.nan 63 | data.loc[data['1_total_fee'] < 0, '1_total_fee'] = np.nan 64 | data.loc[data['2_total_fee'] < 0, '2_total_fee'] = np.nan 65 | data.loc[data['3_total_fee'] < 0, '3_total_fee'] = np.nan 66 | data.loc[data['4_total_fee'] < 0, '4_total_fee'] = np.nan 67 | for c in [ 68 | '1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee', 69 | 'month_traffic', 'last_month_traffic', 'local_trafffic_month', 70 | 'local_caller_time', 'service1_caller_time', 'service2_caller_time', 71 | 'many_over_bill', 'contract_type', 'contract_time', 'pay_num', ]: 72 | data[c] = data[c].round(4) 73 | return data 74 | 75 | train = deal(train) 76 | #test = deal(test) 77 | 78 | 79 | data_path='./data/a/' 80 | train.to_csv(data_path + 'train.csv',index=False) 81 | #test.to_csv(data_path + 'test.csv',index=False) 82 | #print('预处理完成') 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | -------------------------------------------------------------------------------- /clean_b.py: -------------------------------------------------------------------------------- 1 | from tool import * 2 | data_path = './input/' 3 | d = {89950166: 1, 89950167: 2, 89950168: 5, 90063345: 0, 90109916: 4, 4 | 90155946: 8, 99999825: 10, 99999826: 7, 99999827: 6, 99999828: 3, 99999830: 9} 5 | rd = {0: 90063345, 1: 89950166, 2: 89950167, 3: 99999828, 4: 90109916, 6 | 5: 89950168, 6: 99999827, 7: 99999826, 8: 90155946, 9: 99999830, 10: 99999825} 7 | 8 | 9 | def astype(x,t): 10 | try: 11 | return t(x) 12 | except: 13 | return np.nan 14 | 15 | def have_0(x): 16 | try: 17 | r = x.split('.')[1][-1] 18 | return 0 if r=='0' else 1 19 | except: 20 | return 1 21 | 22 | str_dict = {'1_total_fee': 'str', 23 | '2_total_fee': 'str', 24 | '3_total_fee': 'str', 25 | '4_total_fee': 'str', 26 | 'last_month_traffic': 'str', 27 | 'local_caller_time': 'str', 28 | 'local_trafffic_month': 'str', 29 | 'month_traffic': 'str', 30 | 'pay_num': 'str', 31 | 'service1_caller_time': 'str', 32 | 'service2_caller_time': 'str'} 33 | 34 | train = pd.read_csv(data_path + 'train.csv',dtype=str_dict) 35 | test = pd.read_csv(data_path + 'test.csv',dtype=str_dict) 36 | train['label'] = train['current_service'].map(d) 37 | 38 | have_0_c = ['1_total_fee', 39 | '2_total_fee', 40 | '3_total_fee', 41 | '4_total_fee', 42 | 'month_traffic', 43 | 'last_month_traffic', 44 | 'local_trafffic_month', 45 | 'local_caller_time', 46 | 'service1_caller_time', 47 | 'service2_caller_time', 48 | 'pay_num'] 49 | 50 | def deal(data): 51 | for c in have_0_c: 52 | data['have_0_{}'.format(c)] = data[c].apply(have_0) 53 | try: 54 | data[c] = data[c].astype(float) 55 | except: 56 | pass 57 | data['2_total_fee'] = data['2_total_fee'].apply(lambda x: astype(x,float)) 58 | data['3_total_fee'] = data['3_total_fee'].apply(lambda x: astype(x,float)) 59 | data['age'] = data['age'].apply(lambda x: astype(x,int)) 60 | data['gender'] = data['gender'].apply(lambda x: astype(x,int)) 61 | data.loc[data['age']==0,'age'] = np.nan 62 | data.loc[data['1_total_fee'] < 0, '1_total_fee'] = np.nan 63 | data.loc[data['2_total_fee'] < 0, '2_total_fee'] = np.nan 64 | data.loc[data['3_total_fee'] < 0, '3_total_fee'] = np.nan 65 | data.loc[data['4_total_fee'] < 0, '4_total_fee'] = np.nan 66 | for c in [ 67 | '1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee', 68 | 'month_traffic', 'last_month_traffic', 'local_trafffic_month', 69 | 'local_caller_time', 'service1_caller_time', 'service2_caller_time', 70 | 'many_over_bill', 'contract_type', 'contract_time', 'pay_num', ]: 71 | data[c] = data[c].round(4) 72 | return data 73 | 74 | train = deal(train) 75 | train = train[train['current_service'] != 999999] 76 | test = deal(test) 77 | 78 | 79 | 80 | data_path = 'data/b/' 81 | train.to_csv(data_path + 'train_new.csv',index=False) 82 | test.to_csv(data_path + 'test_new.csv',index=False) 83 | #print('预处理完成') 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /data_pred_a+2b.py: -------------------------------------------------------------------------------- 1 | from tool import * 2 | from feat_a import * 3 | 4 | data_path = 'data/b/' 5 | d = {89950166: 1, 89950167: 2, 89950168: 5, 90063345: 0, 90109916: 4, 6 | 90155946: 8, 99999825: 10, 99999826: 7, 99999827: 6, 99999828: 3, 99999830: 9} 7 | rd = {0: 90063345, 1: 89950166, 2: 89950167, 3: 99999828, 4: 90109916, 8 | 5: 89950168, 6: 99999827, 7: 99999826, 8: 90155946, 9: 99999830, 10: 99999825} 9 | d1 = {0: 0, 4: 1, 8: 2} 10 | rd1 = {0: 0, 1: 4, 2: 8} 11 | d4 = {1: 0, 2: 1, 3: 2, 5: 3, 6: 4, 7: 5, 9: 6, 10: 7} 12 | rd4 = {0: 1, 1: 2, 2: 3, 3: 5, 4: 6, 5: 7, 6: 9, 7: 10} 13 | 14 | cc = ['service_type', 'is_mix_service', 'is_promise_low_consume', 15 | 'net_service', 'gender', 'age', 'online_time', 'contract_type', 16 | '1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee', 17 | 'month_traffic', 'last_month_traffic', 'local_trafffic_month', 18 | 'local_caller_time', 'service1_caller_time', 'service2_caller_time', 19 | 'many_over_bill', 'contract_time', 'pay_times', 'pay_num'] 20 | #print('读取train数据...') 21 | train_old = pd.read_csv('data/a/' + 'train.csv') 22 | train = pd.read_csv(data_path + 'train_new.csv') 23 | test = pd.read_csv(data_path + 'test_new.csv') 24 | data = train_old.append(train).drop_duplicates(cc).append(test) 25 | 26 | #print('构造特征...') 27 | data_feat = make_feat(data, 'online') 28 | test_feat = data_feat[data_feat['user_id'].isin(test['user_id'])].copy() 29 | train_feat = data_feat[data_feat['user_id'].isin(train['user_id'])].copy() 30 | 31 | train_feat1 = train_feat[train_feat['service_type'] == 1].copy() 32 | test_feat1 = test_feat[test_feat['service_type'] == 1].copy() 33 | train_feat1['label'] = train_feat1['label'].map(d1) 34 | predictors1 = [c for c in train_feat.columns if (c not in ['user_id', 'current_service', 'label']) and 35 | ('contract_type' not in c) and ('service_type' not in c)] 36 | params = {'objective': 'multi:softprob', 37 | 'eta': 0.02, 38 | 'max_depth': 6, 39 | 'silent': 1, 40 | 'num_class': 3, 41 | 'eval_metric': "mlogloss", 42 | 'min_child_weight': 3, 43 | 'subsample': 0.7, 44 | 'colsample_bytree': 0.7, 45 | 'seed': 66 46 | } 47 | 48 | train = pd.read_csv(data_path + 'train_new.csv') 49 | def xgb_cv(params, train_feat, test_feat, predictors, label='label', cv=5, stratified=True): 50 | #print('开始CV 5折训练...') 51 | t0 = time.time() 52 | train_preds = np.zeros((len(train_feat), train_feat[label].nunique())) 53 | test_preds = np.zeros((len(test_feat), train_feat[label].nunique())) 54 | xgb_test = xgb.DMatrix(test_feat[predictors]) 55 | models = [] 56 | if stratified: 57 | folds = StratifiedKFold(n_splits=cv, shuffle=True, random_state=66) 58 | else: 59 | folds = KFold(n_splits=cv, shuffle=True, random_state=66) 60 | for i, (train_index, test_index) in enumerate(folds.split(train_feat, train_feat[label])): 61 | temp = train_feat.iloc[train_index] 62 | temp = temp.append(temp[temp['user_id'].isin(train['user_id'].values)]) 63 | xgb_train = xgb.DMatrix(temp[predictors], temp['label']) 64 | temp = train_feat.iloc[test_index] 65 | temp = temp[temp['user_id'].isin(train['user_id'].values)] 66 | xgb_eval_temp = xgb.DMatrix(temp[predictors], temp['label']) 67 | xgb_eval = xgb.DMatrix(train_feat[predictors].iloc[test_index], train_feat['label'].iloc[test_index]) 68 | 69 | #print('开始第{}轮训练...'.format(i)) 70 | params = {'objective': 'multi:softprob', 71 | 'eta': 0.02, 72 | 'max_depth': 6, 73 | 'silent': 1, 74 | 'num_class': 11, 75 | 'eval_metric': "mlogloss", 76 | 'min_child_weight': 3, 77 | 'subsample': 0.7, 78 | 'colsample_bytree': 0.7, 79 | 'seed': 66 80 | } if params is None else params 81 | watchlist = [(xgb_train, 'train'), (xgb_eval_temp, 'val')] 82 | 83 | clf = xgb.train(params, 84 | xgb_train, 85 | num_boost_round=3000, 86 | evals=watchlist, 87 | verbose_eval=50, 88 | early_stopping_rounds=50) 89 | 90 | train_preds[test_index] += clf.predict(xgb_eval) 91 | test_preds += clf.predict(xgb_test) 92 | models.append(clf) 93 | #print('用时{}秒'.format(time.time() - t0)) 94 | return train_preds, test_preds / 5 95 | 96 | 97 | train_preds1, test_preds1 = xgb_cv(params, train_feat1, test_feat1, predictors1) 98 | int_train_preds1 = train_preds1.argmax(axis=1) 99 | int_test_preds1 = test_preds1.argmax(axis=1) 100 | #print('线下第一类的得分为: {}'.format(multi_f1(train_feat1['label'], int_train_preds1) ** 2)) 101 | train_preds1 = pd.DataFrame(train_preds1) 102 | train_preds1['user_id'] = train_feat1['user_id'].values 103 | test_preds1 = pd.DataFrame(test_preds1) 104 | test_preds1['user_id'] = test_feat1['user_id'].values 105 | data_pred1 = train_preds1.append(test_preds1) 106 | data_pred1.columns = [rd1[i] if i in rd1 else i for i in data_pred1.columns] 107 | 108 | train_feat4 = train_feat[train_feat['service_type'] != 1].copy() 109 | test_feat4 = test_feat[test_feat['service_type'] != 1].copy() 110 | train_feat4['label'] = train_feat4['label'].map(d4) 111 | predictors4 = [c for c in train_feat.columns if (c not in ['user_id', 'current_service', 'label'])] 112 | params = {'objective': 'multi:softprob', 113 | 'eta': 0.02, 114 | 'max_depth': 6, 115 | 'silent': 1, 116 | 'num_class': 8, 117 | 'eval_metric': "mlogloss", 118 | 'min_child_weight': 3, 119 | 'subsample': 0.7, 120 | 'colsample_bytree': 0.7, 121 | 'seed': 66 122 | } 123 | train_preds4, test_preds4 = xgb_cv(params, train_feat4, test_feat4, predictors4) 124 | int_train_preds4 = train_preds4.argmax(axis=1) 125 | int_test_preds4 = test_preds4.argmax(axis=1) 126 | #print('线下第四类的得分为: {}'.format(multi_f1(train_feat4['label'], int_train_preds4) ** 2)) 127 | train_preds4 = pd.DataFrame(train_preds4) 128 | train_preds4['user_id'] = train_feat4['user_id'].values 129 | test_preds4 = pd.DataFrame(test_preds4) 130 | test_preds4['user_id'] = test_feat4['user_id'].values 131 | data_pred4 = train_preds4.append(test_preds4) 132 | data_pred4.columns = [rd4[i] if i in rd4 else i for i in data_pred4.columns] 133 | 134 | # 输出预测概率,做stacking使用 135 | data_pred = data_pred1.append(data_pred4).fillna(0) 136 | data_pred.to_csv('data_preds_xgb1_20181030_230142.csv', index=False) 137 | 138 | -------------------------------------------------------------------------------- /data_pred_a+b.py: -------------------------------------------------------------------------------- 1 | from tool import * 2 | from feat_a import * 3 | 4 | data_path = 'data/b/' 5 | d = {89950166: 1, 89950167: 2, 89950168: 5, 90063345: 0, 90109916: 4, 6 | 90155946: 8, 99999825: 10, 99999826: 7, 99999827: 6, 99999828: 3, 99999830: 9} 7 | rd = {0: 90063345, 1: 89950166, 2: 89950167, 3: 99999828, 4: 90109916, 8 | 5: 89950168, 6: 99999827, 7: 99999826, 8: 90155946, 9: 99999830, 10: 99999825} 9 | d1 = {0: 0, 4: 1, 8: 2} 10 | rd1 = {0: 0, 1: 4, 2: 8} 11 | d4 = {1: 0, 2: 1, 3: 2, 5: 3, 6: 4, 7: 5, 9: 6, 10: 7} 12 | rd4 = {0: 1, 1: 2, 2: 3, 3: 5, 4: 6, 5: 7, 6: 9, 7: 10} 13 | 14 | cc = ['service_type', 'is_mix_service', 'is_promise_low_consume', 15 | 'net_service', 'gender', 'age', 'online_time', 'contract_type', 16 | '1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee', 17 | 'month_traffic', 'last_month_traffic', 'local_trafffic_month', 18 | 'local_caller_time', 'service1_caller_time', 'service2_caller_time', 19 | 'many_over_bill', 'contract_time', 'pay_times', 'pay_num'] 20 | #print('读取train数据...') 21 | train_old = pd.read_csv('data/a/' + 'train.csv') 22 | train = pd.read_csv(data_path + 'train_new.csv') 23 | test = pd.read_csv(data_path + 'test_new.csv') 24 | data = train_old.append(train).drop_duplicates(cc).append(test) 25 | 26 | #print('构造特征...') 27 | data_feat = make_feat(data, 'online') 28 | test_feat = data_feat[data_feat['user_id'].isin(test['user_id'])].copy() 29 | train_feat = data_feat[data_feat['user_id'].isin(train['user_id'])].copy() 30 | 31 | train_feat1 = train_feat[train_feat['service_type'] == 1].copy() 32 | test_feat1 = test_feat[test_feat['service_type'] == 1].copy() 33 | train_feat1['label'] = train_feat1['label'].map(d1) 34 | predictors1 = [c for c in train_feat.columns if (c not in ['user_id', 'current_service', 'label']) and 35 | ('contract_type' not in c) and ('service_type' not in c)] 36 | params = {'objective': 'multi:softprob', 37 | 'eta': 0.02, 38 | 'max_depth': 6, 39 | 'silent': 1, 40 | 'num_class': 3, 41 | 'eval_metric': "mlogloss", 42 | 'min_child_weight': 3, 43 | 'subsample': 0.7, 44 | 'colsample_bytree': 0.7, 45 | 'seed': 66 46 | } 47 | 48 | train = pd.read_csv(data_path + 'train_new.csv') 49 | def xgb_cv(params, train_feat, test_feat, predictors, label='label', cv=5, stratified=True): 50 | #print('开始CV 5折训练...') 51 | t0 = time.time() 52 | train_preds = np.zeros((len(train_feat), train_feat[label].nunique())) 53 | test_preds = np.zeros((len(test_feat), train_feat[label].nunique())) 54 | xgb_test = xgb.DMatrix(test_feat[predictors]) 55 | models = [] 56 | if stratified: 57 | folds = StratifiedKFold(n_splits=cv, shuffle=True, random_state=66) 58 | else: 59 | folds = KFold(n_splits=cv, shuffle=True, random_state=66) 60 | for i, (train_index, test_index) in enumerate(folds.split(train_feat, train_feat[label])): 61 | temp = train_feat.iloc[train_index] 62 | # temp = temp.append(temp[temp['user_id'].isin(train['user_id'].values)]) 63 | xgb_train = xgb.DMatrix(temp[predictors], temp['label']) 64 | temp = train_feat.iloc[test_index] 65 | temp = temp[temp['user_id'].isin(train['user_id'].values)] 66 | xgb_eval_temp = xgb.DMatrix(temp[predictors], temp['label']) 67 | xgb_eval = xgb.DMatrix(train_feat[predictors].iloc[test_index], train_feat['label'].iloc[test_index]) 68 | 69 | #print('开始第{}轮训练...'.format(i)) 70 | params = {'objective': 'multi:softprob', 71 | 'eta': 0.02, 72 | 'max_depth': 6, 73 | 'silent': 1, 74 | 'num_class': 11, 75 | 'eval_metric': "mlogloss", 76 | 'min_child_weight': 3, 77 | 'subsample': 0.7, 78 | 'colsample_bytree': 0.7, 79 | 'seed': 66 80 | } if params is None else params 81 | watchlist = [(xgb_train, 'train'), (xgb_eval_temp, 'val')] 82 | 83 | clf = xgb.train(params, 84 | xgb_train, 85 | num_boost_round=3000, 86 | evals=watchlist, 87 | verbose_eval=50, 88 | early_stopping_rounds=50) 89 | 90 | train_preds[test_index] += clf.predict(xgb_eval) 91 | test_preds += clf.predict(xgb_test) 92 | models.append(clf) 93 | #print('用时{}秒'.format(time.time() - t0)) 94 | return train_preds, test_preds / 5 95 | 96 | 97 | train_preds1, test_preds1 = xgb_cv(params, train_feat1, test_feat1, predictors1) 98 | int_train_preds1 = train_preds1.argmax(axis=1) 99 | int_test_preds1 = test_preds1.argmax(axis=1) 100 | #print('线下第一类的得分为: {}'.format(multi_f1(train_feat1['label'], int_train_preds1) ** 2)) 101 | train_preds1 = pd.DataFrame(train_preds1) 102 | train_preds1['user_id'] = train_feat1['user_id'].values 103 | test_preds1 = pd.DataFrame(test_preds1) 104 | test_preds1['user_id'] = test_feat1['user_id'].values 105 | data_pred1 = train_preds1.append(test_preds1) 106 | data_pred1.columns = [rd1[i] if i in rd1 else i for i in data_pred1.columns] 107 | 108 | train_feat4 = train_feat[train_feat['service_type'] != 1].copy() 109 | test_feat4 = test_feat[test_feat['service_type'] != 1].copy() 110 | train_feat4['label'] = train_feat4['label'].map(d4) 111 | predictors4 = [c for c in train_feat.columns if (c not in ['user_id', 'current_service', 'label'])] 112 | params = {'objective': 'multi:softprob', 113 | 'eta': 0.02, 114 | 'max_depth': 6, 115 | 'silent': 1, 116 | 'num_class': 8, 117 | 'eval_metric': "mlogloss", 118 | 'min_child_weight': 3, 119 | 'subsample': 0.7, 120 | 'colsample_bytree': 0.7, 121 | 'seed': 66 122 | } 123 | train_preds4, test_preds4 = xgb_cv(params, train_feat4, test_feat4, predictors4) 124 | int_train_preds4 = train_preds4.argmax(axis=1) 125 | int_test_preds4 = test_preds4.argmax(axis=1) 126 | #print('线下第四类的得分为: {}'.format(multi_f1(train_feat4['label'], int_train_preds4) ** 2)) 127 | train_preds4 = pd.DataFrame(train_preds4) 128 | train_preds4['user_id'] = train_feat4['user_id'].values 129 | test_preds4 = pd.DataFrame(test_preds4) 130 | test_preds4['user_id'] = test_feat4['user_id'].values 131 | data_pred4 = train_preds4.append(test_preds4) 132 | data_pred4.columns = [rd4[i] if i in rd4 else i for i in data_pred4.columns] 133 | 134 | # 输出预测概率,做stacking使用 135 | data_pred = data_pred1.append(data_pred4).fillna(0) 136 | data_pred.to_csv('data_preds_xgb1_20181030_050913.csv', index=False) 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | -------------------------------------------------------------------------------- /fast_baseline_v11.py: -------------------------------------------------------------------------------- 1 | #import dask.dataframe as dd 2 | #from dask.multiprocessing import get 3 | import itertools 4 | import numpy as np 5 | import pandas as pd 6 | import gc 7 | import time 8 | from contextlib import contextmanager 9 | import lightgbm as lgb 10 | from sklearn.metrics import roc_auc_score, roc_curve, mean_squared_error,mean_absolute_error, f1_score 11 | from sklearn.model_selection import KFold, StratifiedKFold,GroupKFold 12 | #import matplotlib.pyplot as plt 13 | #import seaborn as sns 14 | import warnings 15 | from sklearn.preprocessing import LabelEncoder 16 | from utils import * 17 | #from utils2 import * 18 | #from utils3 import * 19 | from datetime import datetime 20 | from datetime import timedelta 21 | #from tqdm import tqdm 22 | #test 23 | 24 | warnings.simplefilter(action='ignore', category=FutureWarning) 25 | 26 | 27 | 28 | USE_KFOLD = True 29 | 30 | data_path = './input/' 31 | 32 | ####################################读入文件#################################################### 33 | 34 | 35 | def astype(x,t): 36 | try: 37 | return t(x) 38 | except: 39 | return np.nan 40 | 41 | def have_0(x): 42 | try: 43 | r = x.split('.')[1][-1] 44 | return 0 if r=='0' else 1 45 | except: 46 | return 1 47 | 48 | str_dict = {'1_total_fee': 'str', 49 | '2_total_fee': 'str', 50 | '3_total_fee': 'str', 51 | '4_total_fee': 'str', 52 | 'pay_num': 'str', 53 | } 54 | 55 | 56 | have_0_c = ['1_total_fee', 57 | '2_total_fee', 58 | '3_total_fee', 59 | '4_total_fee', 60 | 'pay_num'] 61 | 62 | def deal(data): 63 | for c in have_0_c: 64 | data['have_0_{}'.format(c)] = data[c].apply(have_0) 65 | try: 66 | data[c] = data[c].astype(float) 67 | except: 68 | pass 69 | data['2_total_fee'] = data['2_total_fee'].apply(lambda x: astype(x,float)) 70 | data['3_total_fee'] = data['3_total_fee'].apply(lambda x: astype(x,float)) 71 | data['age'] = data['age'].apply(lambda x: astype(x,int)) 72 | data['gender'] = data['gender'].apply(lambda x: astype(x,int)) 73 | data.loc[data['age']==0,'age'] = np.nan 74 | data.loc[data['1_total_fee'] < 0, '1_total_fee'] = np.nan 75 | data.loc[data['2_total_fee'] < 0, '2_total_fee'] = np.nan 76 | data.loc[data['3_total_fee'] < 0, '3_total_fee'] = np.nan 77 | data.loc[data['4_total_fee'] < 0, '4_total_fee'] = np.nan 78 | for c in [ 79 | '1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee', 80 | 'month_traffic', 'last_month_traffic', 'local_trafffic_month', 81 | 'local_caller_time', 'service1_caller_time', 'service2_caller_time', 82 | 'many_over_bill', 'contract_type', 'contract_time', 'pay_num', ]: 83 | data[c] = data[c].round(4) 84 | data['is_duplicated'] =data.duplicated(subset=['1_total_fee','2_total_fee','3_total_fee', 85 | 'month_traffic','pay_times','last_month_traffic','service2_caller_time','age'],keep=False) 86 | return data 87 | 88 | train = pd.read_csv(data_path + 'train.csv',dtype=str_dict) 89 | train = deal(train) 90 | train.drop_duplicates(subset = ['1_total_fee','2_total_fee','3_total_fee', 91 | 'month_traffic','pay_times','last_month_traffic','service2_caller_time','age'],inplace=True) 92 | train = train[train['current_service'] != 999999] 93 | test = pd.read_csv(data_path + 'test.csv',dtype=str_dict) 94 | test = deal(test) 95 | 96 | ####################################读入stacking文件#################################################### 97 | 98 | #train_p = pd.read_csv('./cv/val_prob_model_1.csv') 99 | #train = train.merge(train_p,on='user_id',how='left') 100 | #test_p = pd.read_csv('./cv/sub_prob_model_1.csv') 101 | #test = test.merge(test_p,on='user_id',how='left') 102 | 103 | 104 | #train_p = pd.read_csv('val_prob_hebing2.csv') 105 | #train = train.merge(train_p,on='user_id',how='left') 106 | #test_p = pd.read_csv('sub_prob_hebing2.csv') 107 | #test = test.merge(test_p,on='user_id',how='left') 108 | 109 | train_p = pd.read_csv('./cv/val_prob_model_1.csv') 110 | train = train.merge(train_p,on='user_id',how='left') 111 | test_p = pd.read_csv('./cv/sub_prob_model_1.csv') 112 | test = test.merge(test_p,on='user_id',how='left') 113 | 114 | #test_total_fee = pd.read_csv('./sub/sub_fee_pred.csv') 115 | #train_total_fee = pd.read_csv('./sub/val_fee_pred.csv') 116 | #train_total_fee.columns = ['user_id', 'fee_pred'] 117 | #test_total_fee.columns = ['user_id', 'fee_pred'] 118 | #train = train.merge(train_total_fee,on='user_id',how='left') 119 | #test = test.merge(test_total_fee,on='user_id',how='left') 120 | 121 | label2current_service =dict(zip(range(0,len(set(train['current_service']))),sorted(list(set(train['current_service']))))) 122 | current_service2label =dict(zip(sorted(list(set(train['current_service']))),range(0,len(set(train['current_service']))))) 123 | print(len(label2current_service)) 124 | train = train.append(test).reset_index(drop = True) 125 | 126 | 127 | #piupiu_p = pd.read_csv('data_preds_xgb1_20181030_050913.csv') 128 | #train = train.merge(piupiu_p,on='user_id',how='left') 129 | #piupiu_p2 = pd.read_csv('data_preds_xgb1_20181030_230142.csv') 130 | #train = train.merge(piupiu_p2,on='user_id',how='left') 131 | 132 | get_most = pd.read_csv('Magic_Feature_Exclude_Old.csv') #不包括初赛的 133 | get_most2 = pd.read_csv('Magic_Feature_Include_Old.csv') #包括初赛的 134 | 135 | 136 | ####################################特征工程################################################### 137 | 138 | call_time = ['local_caller_time', 'service1_caller_time', 'service2_caller_time'] 139 | traffic = ['month_traffic','last_month_traffic','local_trafffic_month'] 140 | cat_cols = ['service_type','contract_type', 'net_service', 'gender', 'complaint_level', 141 | #3 #9,8 #4 #3 #4 142 | 'is_mix_service', 'many_over_bill', 'is_promise_low_consume', #2 143 | ] 144 | continus_col = [ 145 | '1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee', 'pay_num','former_complaint_fee', 146 | 147 | 'month_traffic', 'last_month_traffic', 'local_trafffic_month', 148 | 149 | 'local_caller_time', 'service1_caller_time', 'service2_caller_time', 150 | 151 | 'online_time','contract_time', 152 | 153 | 'pay_times', 'former_complaint_num' 154 | ] 155 | def one_hot_encoder(train,column,n=100,nan_as_category=False): 156 | tmp = train[column].value_counts().to_frame() 157 | values = list(tmp[tmp[column]>n].index) 158 | train.loc[train[column].isin(values),column+'N'] = train.loc[train[column].isin(values),column] 159 | train = pd.get_dummies(train, columns=[column+'N'], dummy_na=False) 160 | return train 161 | # 162 | 163 | train['fea-min'] = train[[str(1+i) +'_total_fee' for i in range(4)]].min(axis = 1) 164 | for column in ['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee', 'fea-min']: 165 | get_most.columns = [column,column+'_most'] 166 | train = train.merge(get_most,on=column,how='left') 167 | 168 | 169 | for column in ['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee', 'pay_num','fea-min']: 170 | train[column+'_int'] = train[column].fillna(-1).astype('int') 171 | train[column+'_int_last'] = train[column+'_int']%10 #last int 172 | train[column+'_decimal'] = round(((train[column]-train[column+'_int'])*100).fillna(-1)).astype('int') #decimal 173 | train[column+'_decimal_is_0'] = (train[column+'_decimal']==0).astype('int') 174 | train[column+'_decimal_is_5'] = (train[column+'_decimal']%5==0).astype('int') 175 | train[column+'_decimal_last'] = train[column+'_decimal']%10 176 | train[column+'_decimal_last2'] = train[column+'_decimal']//5 177 | train[column+'_extra_fee'] = ((train[column]*100)-600)%1000 178 | train[column+'_27perMB'] = ((train[column+'_extra_fee']%27 == 0)&(train[column+'_extra_fee'] != 0)).astype('int') 179 | train[column+'_15perMB'] = ((train[column+'_extra_fee']%15 == 0)&(train[column+'_extra_fee'] != 0)).astype('int') 180 | train = one_hot_encoder(train,column,n=1000,nan_as_category=True) 181 | train['pay_num_last2'] = train['pay_num_int']%100 182 | train['former_complaint_fee_last2'] = round(train['former_complaint_fee'])%100 183 | 184 | 185 | train['4-fea-dealta'] = round((train['4_total_fee'] - train['3_total_fee'])*100).fillna(999999.9).astype('int') 186 | train['3-fea-dealta'] = round((train['3_total_fee'] - train['2_total_fee'])*100).fillna(999999.9).astype('int') 187 | train['2-fea-dealta'] = round((train['2_total_fee'] - train['1_total_fee'])*100).fillna(999999.9).astype('int') 188 | train['1-fea-dealta'] = round((train['4_total_fee'] - train['1_total_fee'])*100).fillna(999999.9).astype('int') 189 | train['1-3-fea-dealta'] = round((train['3_total_fee'] - train['1_total_fee'])*100).fillna(999999.9).astype('int') 190 | train['1-min-fea-dealta'] = round((train['1_total_fee'] - train['fea-min'])*100).fillna(999999.9).astype('int') 191 | for column in ['4-fea-dealta', '3-fea-dealta', '2-fea-dealta', '1-fea-dealta','1-3-fea-dealta','1-min-fea-dealta']: 192 | train[column+'_is_0'] = (train[column]==0).astype('int') 193 | train[column+'_is_6000'] = ((train[column]%6000 == 0)&(train[column] != 0)).astype('int') 194 | train[column+'_is_5'] = ((train[column]%5 == 0)&(train[column] != 0)).astype('int') 195 | train[column+'_is_10'] = ((train[column]%10 == 0)&(train[column] != 0)).astype('int') 196 | train[column+'_is_15'] = ((train[column]%15 == 0)&(train[column] != 0)).astype('int') 197 | train[column+'_is_27'] = ((train[column]%27 == 0)&(train[column] != 0)).astype('int') 198 | train[column+'_is_30'] = ((train[column]%30 == 0)&(train[column] != 0)).astype('int') 199 | train[column+'_is_50'] = ((train[column]%50 == 0)&(train[column] != 0)).astype('int') 200 | train[column+'_is_100'] = ((train[column]%100 == 0)&(train[column] != 0)).astype('int') 201 | train[column+'_is_500'] = ((train[column]%500 == 0)&(train[column] != 0)).astype('int') 202 | 203 | for column in ['month_traffic', 'last_month_traffic', 'local_trafffic_month']: 204 | train[column+'_is_int'] = ((train[column]%1 == 0)&(train[column] != 0)).astype('int') 205 | train[column+'_is_512'] = ((train[column]%512 == 0)&(train[column] != 0)).astype('int') 206 | train[column+'_is_50'] = ((train[column]%50 == 0)&(train[column] != 0)).astype('int') 207 | train[column+'_is_double'] = ((train[column]%512%50 == 0)&(train[column] != 0)&(train[column+'_is_512'] == 0)&(train[column+'_is_50'] == 0)).astype('int') 208 | train = one_hot_encoder(train,column,n=2000,nan_as_category=True) 209 | 210 | train['service12'] = train['service2_caller_time']+train['service1_caller_time'] 211 | for column in ['local_caller_time', 'service1_caller_time', 'service2_caller_time','service12']: 212 | train[column+'_decimal'] = round(((round(train[column])- train[column])*60)).astype('int') 213 | train[column+'_decimal_is_int'] = ((train[column+'_decimal']==0)&(train[column] != 0)).astype('int') 214 | 215 | train = one_hot_encoder(train,'online_time',n=2000,nan_as_category=True) 216 | train = one_hot_encoder(train,'contract_time',n=1000,nan_as_category=True) 217 | 218 | print(train.shape) 219 | train = one_hot_encoder(train,'contract_type',n=1,nan_as_category=True) 220 | 221 | 222 | 223 | train['current_service'] = train['current_service'].map(current_service2label) 224 | 225 | 226 | train['age'] = train['age'].fillna(-20) 227 | train['cut_age'] = train['age'].apply(lambda x: int(x/10)) 228 | train['cut_online'] = (train['online_time'] / 12).astype(int) 229 | 230 | 231 | 232 | 233 | 234 | train['4-fea-dealta_'] = train['4_total_fee'] / (train['3_total_fee']+0.00001) 235 | train['3-fea-dealta_'] = train['3_total_fee'] / (train['2_total_fee']+0.00001) 236 | train['2-fea-dealta_'] = train['2_total_fee'] / (train['1_total_fee']+0.00001) 237 | train['1-fea-dealta_'] = train['4_total_fee'] / (train['1_total_fee']+0.00001) 238 | train['pay_num-dealta_'] = train['pay_num'] / (train['1_total_fee']+0.00001) 239 | 240 | #train['fea-dealta_p'] = train['1_total_fee'] - train['fee_pred'] 241 | 242 | train['month_traffic_delata'] = train['month_traffic'] - train['last_month_traffic'] 243 | train['month_traffic_delata_'] = train['month_traffic'] / (train['last_month_traffic']+0.00001) 244 | train['2month_traffic_sum'] = train['month_traffic'] + train['last_month_traffic'] 245 | train['add_month_traffic'] = train['month_traffic'] - train['local_trafffic_month'] 246 | train['add_month_traffic_'] = train['month_traffic'] / (train['local_trafffic_month']+0.00001) 247 | 248 | 249 | train['service1_caller_time_delata'] = train['service1_caller_time'] / (train['service2_caller_time']+0.00001) 250 | train['service1_caller_time_delata2'] = train['service1_caller_time'] / (train['local_caller_time']+0.00001) 251 | train['service2_caller_time_delata_'] = train['service2_caller_time'] / (train['local_caller_time']+0.00001) 252 | train['local_caller_time_reatio'] = train['local_caller_time']/(train['service1_caller_time']+train['service2_caller_time']+0.00001) 253 | 254 | 255 | train['div_online_time_contract'] = train['contract_time'] / (train['online_time']+0.00001) 256 | train['div_online_time_contract'] = train['contract_time'] - train['online_time'] 257 | 258 | 259 | train['div_former_complaint_num'] = train['former_complaint_num'] / (train['pay_times']+0.00001) 260 | train['div_former_complaint_num'] = train['former_complaint_num'] - train['pay_times'] 261 | 262 | 263 | train['fea-sum'] = train[[str(1+i) +'_total_fee' for i in range(4)]].sum(axis = 1) 264 | train['fea-var'] = train[[str(1+i) +'_total_fee' for i in range(4)]].var(axis = 1) 265 | train['fea-max'] = train[[str(1+i) +'_total_fee' for i in range(4)]].max(axis = 1) 266 | train['fea-min'] = train[[str(1+i) +'_total_fee' for i in range(4)]].min(axis = 1) 267 | train['fea-mean4'] = train[[str(1+i) +'_total_fee' for i in range(4)]].sum(axis = 1) 268 | train['fea-mean3'] = train[[str(1+i) +'_total_fee' for i in range(3)]].sum(axis = 1) 269 | train['fea-mean2'] = train[[str(1+i) +'_total_fee' for i in range(2)]].sum(axis = 1) 270 | train['fea-extra'] = train['fea-sum']-4*train['fea-min'] 271 | train['1_total_fee_extra_for_min'] = train['1_total_fee']-train['fea-min'] 272 | train['fea_unum'] = train[['1_total_fee','2_total_fee','3_total_fee', '4_total_fee']].nunique(axis=1) 273 | 274 | train['call_time_sum'] = train[call_time].sum(axis = 1) 275 | train['call_time_var'] = train[call_time].var(axis = 1) 276 | train['call_time_min'] = train[call_time].min(axis = 1) 277 | train['call_time_max'] = train[call_time].max(axis = 1) 278 | 279 | train['traffic_sum'] = train[traffic].sum(axis = 1) 280 | train['traffic_var'] = train[traffic].var(axis = 1) 281 | train['traffic_min'] = train[traffic].min(axis = 1) 282 | train['traffic_max'] = train[traffic].max(axis = 1) 283 | 284 | 285 | train['average_pay'] = train['pay_num'] / train['pay_times'] 286 | 287 | 288 | 289 | train['div_traffic_price_2'] = train['last_month_traffic']/ 1000 / train['2_total_fee'] 290 | train['div_traffic_price_3'] = train['local_trafffic_month']/ 1000 / train['1_total_fee'] 291 | train['div_add_month_traffic_price'] = train['add_month_traffic']/ 1000 / train['1_total_fee'] 292 | train['div_local_caller_time_price'] = train['local_trafffic_month'] / 1000/ train['1_total_fee'] 293 | 294 | 295 | train['1-min-fea-dealta_div'] = train['1-min-fea-dealta']/(train['service1_caller_time']+0.0001) 296 | train['div_service1_caller_time_price'] = train['service1_caller_time'] / train['1_total_fee'] 297 | train['div_local_caller_time'] = train['local_caller_time'] / train['1_total_fee'] 298 | train['div_call_time_sum_price'] = train['call_time_sum'] / train['1_total_fee'] 299 | train['1_total_fee_maybe_real_calller'] = train['1_total_fee']- train['service1_caller_time']*0.15 300 | train['1_total_fee_maybe_real_calller2'] = train['1_total_fee']- train['service1_caller_time']*0.1 301 | train['1_total_fee_extra_for_min_caller_time'] = train['1_total_fee_extra_for_min']/(train['service1_caller_time']+0.001) 302 | 303 | train['div_service1_caller_time'] = train['service1_caller_time']/train['last_month_traffic'] 304 | train['div_local_caller_time'] = train['local_caller_time']/train['last_month_traffic'] 305 | train['div_local_caller_time2'] = train['local_caller_time']/train['month_traffic'] 306 | 307 | 308 | train['avg_complain_fee'] = train['former_complaint_fee'] / (train['former_complaint_num'] + 0.000000001) 309 | 310 | result = [] 311 | 312 | result.append(get_feat_ngroup(train,['cut_age','gender'])) 313 | for size_feat in ['1_total_fee','2_total_fee','3_total_fee', '4_total_fee','pay_num', 314 | 'last_month_traffic','month_traffic','local_trafffic_month', 315 | 'local_caller_time','service1_caller_time','service2_caller_time']: 316 | result.append(get_feat_size(train,[size_feat])) 317 | 318 | 319 | result.append(get_feat_stat_feat(train, ['contract_type'], ['1_total_fee'], ['max'])) 320 | result.append(get_feat_stat_feat(train, ['contract_type'], ['2_total_fee'], ['mean'])) 321 | result.append(get_feat_stat_feat(train, ['contract_type'], ['last_month_traffic'], ['var','mean'])) 322 | result.append(get_feat_stat_feat(train, ['contract_type'], ['call_time_sum'], ['mean'])) 323 | 324 | for base_feat in [['contract_type']]: 325 | for other_feat in ['1_total_fee', 'pay_num', 326 | 'month_traffic', 'last_month_traffic', 'local_trafffic_month', 327 | 'local_caller_time', 'service1_caller_time', 'service2_caller_time', 328 | ]: 329 | stat_list = ['mean'] 330 | tmp = get_feat_stat_feat(train,base_feat,[other_feat],stat_list=stat_list) 331 | name = tmp.columns[0] 332 | train[name] = tmp 333 | train[name+'_comp'] = train[other_feat].values-train[name].values 334 | 335 | 336 | train['1_total_fee_ratio'] = train['1_total_fee']/(train['fea-sum']+0.000001) 337 | train['3_total_fee_ratio'] = train['3_total_fee']/(train['fea-sum']+0.000001) 338 | train['call_time_sum_ratio'] = train['call_time_sum']/(train['traffic_sum']+0.000001) 339 | train['call_time_sum_ratio2'] = train['call_time_sum']/(train['fea-sum']+0.000001) 340 | train['traffic_sum_ratio1'] = train['traffic_sum']/(train['fea-sum']+0.000001) 341 | 342 | 343 | ####################################lgb和metric函数################################################### 344 | 345 | 346 | def f1_score_vali(preds, data_vali): 347 | labels = data_vali.get_label() 348 | preds = np.argmax(preds.reshape(11, -1),axis=0) 349 | score_vali = f1_score(y_true=labels,y_pred=preds,average='macro') 350 | return 'macro_f1_score', score_vali, True 351 | 352 | def evaluate_macroF1_lgb(data_vali, preds): 353 | labels = data_vali.astype(int) 354 | preds = np.array(preds) 355 | preds = np.argmax(preds,axis=1) 356 | score_vali = f1_score(y_true=labels,y_pred=preds,average='macro') 357 | return score_vali 358 | 359 | def kfold_lightgbm(params,df, predictors,target,num_folds, stratified = True, 360 | objective='', metrics='',debug= False, 361 | feval = f1_score_vali, early_stopping_rounds=100, num_boost_round=100, verbose_eval=50, categorical_features=None,sklearn_mertric = evaluate_macroF1_lgb ): 362 | 363 | lgb_params = params 364 | 365 | train_df = df[df[target].notnull()] 366 | test_df = df[df[target].isnull()] 367 | 368 | # Divide in training/validation and test data 369 | print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df[predictors].shape, test_df[predictors].shape)) 370 | del df 371 | gc.collect() 372 | # Cross validation model 373 | if stratified: 374 | folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=1234) 375 | else: 376 | folds = KFold(n_splits= num_folds, shuffle=True, random_state=1234) 377 | 378 | oof_preds = np.zeros((train_df.shape[0],11)) 379 | sub_preds = np.zeros((test_df.shape[0],11)) 380 | feature_importance_df = pd.DataFrame() 381 | feats = predictors 382 | cv_resul = [] 383 | 384 | for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df[target])): 385 | if (USE_KFOLD == False) and (n_fold == 1): 386 | break 387 | train_x, train_y = train_df[feats].iloc[train_idx], train_df[target].iloc[train_idx] 388 | valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df[target].iloc[valid_idx] 389 | #weight_num = train_df['weight'].iloc[train_idx] 390 | 391 | train_y_t = train_y.values 392 | valid_y_t = valid_y.values 393 | print(train_y_t) 394 | xgtrain = lgb.Dataset(train_x.values, label = train_y_t, 395 | feature_name=predictors, 396 | categorical_feature=categorical_features 397 | ) 398 | # xgtrain.set_weight(np.array(weight_num)) 399 | xgvalid = lgb.Dataset(valid_x.values, label = valid_y_t, 400 | feature_name=predictors, 401 | categorical_feature=categorical_features 402 | ) 403 | 404 | clf = lgb.train(lgb_params, 405 | xgtrain, 406 | valid_sets=[xgvalid],#, xgtrain], 407 | valid_names=['valid'],#,'train'], 408 | num_boost_round=num_boost_round, 409 | early_stopping_rounds=early_stopping_rounds, 410 | verbose_eval=verbose_eval, 411 | # feval=feval 412 | ) 413 | 414 | 415 | 416 | oof_preds[valid_idx] = clf.predict(valid_x, num_iteration=clf.best_iteration) 417 | sub_preds += clf.predict(test_df[feats], num_iteration=clf.best_iteration)/ folds.n_splits 418 | 419 | 420 | gain = clf.feature_importance('gain') 421 | fold_importance_df = pd.DataFrame({'feature':clf.feature_name(), 422 | 'split':clf.feature_importance('split'), 423 | 'gain':100*gain/gain.sum(), 424 | 'fold':n_fold, 425 | }).sort_values('gain',ascending=False) 426 | feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0) 427 | 428 | result = sklearn_mertric(valid_y, oof_preds[valid_idx]) 429 | # result = clf.best_score['valid']['macro_f1_score'] 430 | print('Fold %2d macro-f1 : %.6f' % (n_fold + 1, result)) 431 | cv_resul.append(round(result,5)) 432 | gc.collect() 433 | 434 | #score = np.array(cv_resul).mean() 435 | score = "final" 436 | if USE_KFOLD: 437 | # print('Full f1 score %.6f' % score) 438 | for i in range(11): 439 | train_df["class_" + str(i)] = oof_preds[:,i] 440 | test_df["class_" + str(i)] = sub_preds[:,i] 441 | train_df[['user_id'] + ["class_" + str(i) for i in range(11)]].to_csv('./cv/val_prob_{}.csv'.format(score), index= False, float_format = '%.4f') 442 | test_df[['user_id'] + ["class_" + str(i) for i in range(11)]].to_csv('./cv/sub_prob_{}.csv'.format(score), index= False, float_format = '%.4f') 443 | oof_preds = [np.argmax(x)for x in oof_preds] 444 | sub_preds = [np.argmax(x)for x in sub_preds] 445 | train_df[target] = oof_preds 446 | test_df[target] = sub_preds 447 | print(test_df[target].mean()) 448 | train_df[target] = oof_preds 449 | train_df[target] = train_df[target].map(label2current_service) 450 | test_df[target] = sub_preds 451 | test_df[target] = test_df[target].map(label2current_service) 452 | print('all_cv', cv_resul) 453 | train_df[['user_id', target]].to_csv('./sub/val_{}.csv'.format(score), index= False) 454 | test_df[['user_id', target]].to_csv('./sub/sub_{}.csv'.format(score), index= False) 455 | print("test_df mean:") 456 | 457 | display_importances(feature_importance_df,score) 458 | 459 | 460 | 461 | def display_importances(feature_importance_df_,score): 462 | ft = feature_importance_df_[["feature", "split","gain"]].groupby("feature").mean().sort_values(by="gain", ascending=False) 463 | print(ft.head(60)) 464 | ft.to_csv('importance_lightgbm_{}.csv'.format(score),index=True) 465 | cols = ft[:40].index 466 | best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)] 467 | 468 | 469 | ####################################计算################################################################# 470 | 471 | 472 | params = { 473 | 'metric': 'multi_logloss', 474 | 'num_class':11, 475 | 'boosting_type': 'gbdt', 476 | 'objective': 'multiclass', 477 | 'feature_fraction': 0.7, 478 | 'learning_rate': 0.01, 479 | 'bagging_fraction': 0.7, 480 | #'bagging_freq': 2, 481 | 'num_leaves': 64, 482 | 'max_depth': -1, 483 | 'num_threads': 16, 484 | 'seed': 2018, 485 | 'verbose': -1, 486 | #'is_unbalance':True, 487 | } 488 | 489 | 490 | categorical_columns = [ 491 | 'contract_type', 492 | # 'is_mix_service', 493 | # 'is_promise_low_consume', 494 | 'net_service', 495 | 'gender'] 496 | for feature in categorical_columns: 497 | print(f'Transforming {feature}...') 498 | encoder = LabelEncoder() 499 | train[feature] = encoder.fit_transform(train[feature].astype(str)) 500 | 501 | 502 | x = [] 503 | no_use = ['current_service', 'user_id','group','weight' 504 | 505 | ] + x 506 | 507 | 508 | categorical_columns = [] 509 | all_data_frame = [] 510 | all_data_frame.append(train) 511 | 512 | for aresult in result: 513 | all_data_frame.append(aresult) 514 | 515 | train = concat(all_data_frame) 516 | feats = [f for f in train.columns if f not in no_use] 517 | categorical_columns = [f for f in categorical_columns if f not in no_use] 518 | clf = kfold_lightgbm(params,train,feats,'current_service' ,5 , num_boost_round=4000, categorical_features=categorical_columns) 519 | -------------------------------------------------------------------------------- /feat_a.py: -------------------------------------------------------------------------------- 1 | from tool import * 2 | 3 | cache_path = '/' 4 | inplace = False 5 | 6 | ############################### 工具函数 ########################### 7 | # 合并节约内存 8 | def concat(L): 9 | result = None 10 | for l in L: 11 | if result is None: 12 | result = l 13 | else: 14 | result[l.columns.tolist()] = l 15 | return result 16 | 17 | # 统计转化率 18 | def bys_rate(data,cate,cate2,label): 19 | temp = data.groupby(cate2,as_index=False)[label].agg({'count':'count','sum':'sum'}).rename(columns={'2_total_fee':'1_total_fee'}) 20 | temp['rate'] = temp['sum']/temp['count'] 21 | data_temp = data[[cate]].copy() 22 | data_temp = data_temp.merge(temp[[cate,'rate']],on=cate,how='left') 23 | return data_temp['rate'] 24 | 25 | # 统计转化率 26 | def mul_rate(data,cate,label): 27 | temp1 = data.groupby([cate,label],as_index=False).size().unstack().fillna(0) 28 | temp2 = data.groupby([cate], as_index=False).size() 29 | temp2.loc[temp2 < 20] = np.nan 30 | temp3 = (temp1.T/temp2).T 31 | temp3.columns = [cate+'_'+str(c)+'_conversion' for c in temp3.columns] 32 | temp3 = temp3.reset_index() 33 | data = data.merge(temp3,on=cate,how='left') 34 | return data 35 | 36 | # 相同的个数 37 | def get_same_count(li): 38 | return pd.Series(li).value_counts().values[0] 39 | 40 | # 相同的个数 41 | def get_second_min(li): 42 | return sorted(li)[1] 43 | 44 | # One-hot encoding for categorical columns with get_dummies 45 | def one_hot_encoder(df, nan_as_category=True, min_count=100,inplace=True): 46 | original_columns = list(df.columns) 47 | categorical_columns = [col for col in df.columns if df[col].dtype == 'object'] 48 | result = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category) 49 | new_columns = [c for c in result.columns if c not in original_columns] 50 | cat_columns = [c for c in original_columns if c not in result.columns] 51 | if not inplace: 52 | for c in cat_columns: 53 | result[c] = df[c] 54 | for c in new_columns: 55 | if (result[c].sum()<100) or ((result.shape[0]-result[c].sum())<100): 56 | del result[c] 57 | new_columns.remove(c) 58 | return result, new_columns 59 | 60 | # 连续特征离散化 61 | def one_hot_encoder_continus(df, col, n_scatter=10,nan_as_category=True): 62 | df[col+'_scatter'] = pd.qcut(df[col],n_scatter) 63 | result = pd.get_dummies(df, columns=[col+'_scatter'], dummy_na=nan_as_category) 64 | return result 65 | 66 | ############################### 预处理函数 ########################### 67 | def pre_treatment(data,data_key): 68 | result_path = cache_path + 'data_{}.feature'.format(data_key) 69 | if os.path.exists(result_path) & 0: 70 | data = pd.read_feature(result_path) 71 | else: 72 | month_fee = ['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee'] 73 | data['total_fee_mean4'] = data[month_fee[:4]].mean(axis=1) 74 | data['total_fee_mean3'] = data[month_fee[:3]].mean(axis=1) 75 | data['total_fee_mean2'] = data[month_fee[:2]].mean(axis=1) 76 | data['total_fee_std4'] = data[month_fee[:4]].std(axis=1) 77 | # data['total_fee_mode4'] = data[month_fee[:4]].apply(mode,axis=1) 78 | data['total_fee_Standardization'] = data['total_fee_std4'] / (data['total_fee_mean4'] + 0.1) 79 | data['1_total_fee_rate12'] = data['1_total_fee'] / (data['2_total_fee'] + 0.1) 80 | data['1_total_fee_rate23'] = data['2_total_fee'] / (data['3_total_fee'] + 0.1) 81 | data['1_total_fee_rate34'] = data['3_total_fee'] / (data['4_total_fee'] + 0.1) 82 | data['1_total_fee_rate24'] = data['total_fee_mean2'] / (data['total_fee_mean4'] + 0.1) 83 | data['total_fee_max4'] = data[month_fee[:4]].max(axis=1) 84 | data['total_fee_min4'] = data[month_fee[:4]].min(axis=1) 85 | data['total_fee_second_min4'] = data[month_fee[:4]].apply(get_second_min,axis=1) 86 | data['service_caller_time_diff'] = data['service2_caller_time'] - data['service1_caller_time'] 87 | data['service_caller_time_sum'] = data['service2_caller_time'] + data['service1_caller_time'] 88 | data['service_caller_time_min'] = data[['service1_caller_time','service2_caller_time']].min(axis=1) 89 | data['service_caller_time_max'] = data[['service1_caller_time', 'service2_caller_time']].max(axis=1) 90 | 91 | data['1_total_fee_last0_number'] = count_encoding(data['1_total_fee'].fillna(-1).apply(lambda x: ('%.2f' % x)[-1]).astype(int)) 92 | data['1_total_fee_last1_number'] = count_encoding(data['1_total_fee'].fillna(-1).apply(lambda x: ('%.2f' % x)[-2]).astype(int)) 93 | data['1_total_fee_last2_number'] = count_encoding(data['1_total_fee'].fillna(-1).apply(lambda x: ('%.2f' % x)[-4]).astype(int)) 94 | data['1_total_fee_last3_number'] = count_encoding(data['1_total_fee'].fillna(-1)//10) 95 | data['2_total_fee_last0_number'] = count_encoding(data['2_total_fee'].fillna(-1).apply(lambda x: ('%.2f' % x)[-1]).astype(int)) 96 | data['2_total_fee_last1_number'] = count_encoding(data['2_total_fee'].fillna(-1).apply(lambda x: ('%.2f' % x)[-2]).astype(int)) 97 | data['2_total_fee_last2_number'] = count_encoding(data['2_total_fee'].fillna(-1).apply(lambda x: ('%.2f' % x)[-4]).astype(int)) 98 | data['2_total_fee_last3_number'] = count_encoding(data['2_total_fee'].fillna(-1) // 10) 99 | data['3_total_fee_last0_number'] = count_encoding(data['3_total_fee'].fillna(-1).apply(lambda x: ('%.2f' % x)[-1]).astype(int)) 100 | data['3_total_fee_last1_number'] = count_encoding(data['3_total_fee'].fillna(-1).apply( lambda x:('%.2f' % x)[-2]).astype(int)) 101 | data['3_total_fee_last2_number'] = count_encoding(data['3_total_fee'].fillna(-1).apply(lambda x: ('%.2f' % x)[-4]).astype(int)) 102 | data['3_total_fee_last3_number'] = count_encoding(data['3_total_fee'].fillna(-1) // 10) 103 | data['4_total_fee_last0_number'] = count_encoding(data['4_total_fee'].fillna(-1).apply(lambda x: ('%.2f' % x)[-1]).astype(int)) 104 | data['4_total_fee_last1_number'] = count_encoding(data['4_total_fee'].fillna(-1).apply(lambda x: ('%.2f' % x)[-2]).astype(int)) 105 | data['4_total_fee_last2_number'] = count_encoding( data['4_total_fee'].fillna(-1).apply(lambda x: ('%.2f' % x)[-4]).astype(int)) 106 | data['4_total_fee_last3_number'] = count_encoding(data['4_total_fee'].fillna(-1) // 10) 107 | # data['total_fee_sample_count'] = data[month_fee].apply(get_same_count,axis=1) 108 | 109 | for fee in ['1_total_fee','2_total_fee', '3_total_fee', '4_total_fee']: 110 | data['{}_1'.format(fee)] = ((data[fee] % 1==0) & (data[fee] !=0)) 111 | data['{}_01'.format(fee)] = ((data[fee] % 0.1==0) & (data[fee] !=0)) 112 | 113 | data['pay_number_last_2'] = data['pay_num']*100%100 114 | # data = one_hot_encoder_continus(data,'1_total_fee',20) 115 | # data = one_hot_encoder_continus(data, '2_total_fee', 20) 116 | # data = one_hot_encoder_continus(data, '3_total_fee', 20) 117 | # data = one_hot_encoder_continus(data, '4_total_fee', 20) 118 | # data = one_hot_encoder_continus(data, 'age', 10) 119 | # data = one_hot_encoder_continus(data, 'online_time', 10) 120 | 121 | data['1_total_fee_log'] = np.log(data['1_total_fee']+2) 122 | data['2_total_fee_log'] = np.log(data['2_total_fee'] + 2) 123 | data['3_total_fee_log'] = np.log(data['3_total_fee'] + 2) 124 | data['4_total_fee_log'] = np.log(data['4_total_fee'] + 2) 125 | data = grp_standard(data, 'contract_type', ['1_total_fee_log'], drop=False) 126 | data = grp_standard(data, 'contract_type', ['service_caller_time_min'], drop=False) 127 | data = grp_standard(data, 'contract_type', ['service_caller_time_max'], drop=False) 128 | data = grp_standard(data, 'contract_type', ['online_time'], drop=False) 129 | data = grp_standard(data, 'contract_type', ['age'], drop=False) 130 | data = grp_standard(data, 'net_service', ['1_total_fee_log'], drop=False) 131 | data = grp_standard(data, 'net_service', ['service_caller_time_min'], drop=False) 132 | data = grp_standard(data, 'net_service', ['service_caller_time_max'], drop=False) 133 | data = grp_standard(data, 'net_service', ['online_time'], drop=False) 134 | data = grp_standard(data, 'net_service', ['age'], drop=False) 135 | data['age_scatter'] = pd.qcut(data['age'], 5) 136 | data = grp_standard(data, 'age_scatter', ['1_total_fee_log'], drop=False) 137 | data = grp_standard(data, 'age_scatter', ['service_caller_time_min'], drop=False) 138 | data = grp_standard(data, 'age_scatter', ['service_caller_time_max'], drop=False) 139 | data = grp_standard(data, 'age_scatter', ['online_time'], drop=False) 140 | data = grp_standard(data, 'age_scatter', ['age'], drop=False) 141 | data['online_time_scatter'] = pd.qcut(data['online_time'], 5) 142 | data = grp_standard(data, 'online_time_scatter', ['1_total_fee_log'], drop=False) 143 | data = grp_standard(data, 'online_time_scatter', ['service_caller_time_min'], drop=False) 144 | data = grp_standard(data, 'online_time_scatter', ['service_caller_time_max'], drop=False) 145 | data = grp_standard(data, 'online_time_scatter', ['online_time'], drop=False) 146 | data = grp_standard(data, 'online_time_scatter', ['age'], drop=False) 147 | data = grp_standard(data, 'service_type', ['1_total_fee_log'], drop=False) 148 | data = grp_standard(data, 'service_type', ['service_caller_time_min'], drop=False) 149 | data = grp_standard(data, 'service_type', ['service_caller_time_max'], drop=False) 150 | data = grp_standard(data, 'service_type', ['online_time'], drop=False) 151 | data = grp_standard(data, 'service_type', ['age'], drop=False) 152 | 153 | del data['1_total_fee_log'],data['2_total_fee_log'],data['3_total_fee_log'],data['4_total_fee_log'], \ 154 | data['age_scatter'],data['online_time_scatter'] 155 | 156 | # data['online_time_count'] = count_encoding(data['online_time']//3) 157 | data['month_traffic_last_month_traffic_sum'] = data['month_traffic'] + data['last_month_traffic'] 158 | data['month_traffic_last_month_traffic_diff'] = data['month_traffic'] - data['last_month_traffic'] 159 | data['month_traffic_last_month_traffic_rate'] = data['month_traffic'] / (data['last_month_traffic']+0.01) 160 | data['outer_trafffic_month'] = data['month_traffic'] - data['local_trafffic_month'] 161 | data['local_trafffic_month_month_traffic_rate'] = data['local_trafffic_month'] / (data['month_traffic'] + 0.01) 162 | 163 | data['month_traffic_last_month_traffic_sum_1_total_fee_rate'] = data['month_traffic_last_month_traffic_sum'] / (data['1_total_fee'] + 0.01) 164 | data['month_traffic_local_caller_time'] = data['month_traffic'] / (data['local_caller_time'] + 0.01) 165 | data['pay_num_per'] = data['pay_num'] / (data['pay_times']+0.01) 166 | data['total_fee_mean4_pay_num_rate'] = data['pay_num'] / (data['total_fee_mean4'] + 0.01) 167 | data['local_trafffic_month_spend'] = data['local_trafffic_month'] - data['last_month_traffic'] 168 | data['month_traffic_1_total_fee_rate'] = data['month_traffic'] / (data['1_total_fee'] + 0.01) 169 | 170 | for traffic in ['month_traffic','last_month_traffic', 'local_trafffic_month']: 171 | data['{}_1'.format(traffic)] = ((data[traffic] % 1==0) & (data[traffic] !=0)) 172 | data['{}_50'.format(traffic)] = ((data[traffic] % 50==0) & (data[traffic] !=0)) 173 | data['{}_1024'.format(traffic)] = ((data[traffic] % 1024==0) & (data[traffic] !=0)) 174 | data['{}_1024_50'.format(traffic)] = ((data[traffic] % 1024 % 50 == 0) & (data[traffic] != 0)) 175 | 176 | data['service_caller_time'] = data['service1_caller_time'] + data['service2_caller_time'] 177 | data['outer_caller_time'] = data['service_caller_time'] - data['local_caller_time'] 178 | data['local_caller_time_rate'] = data['local_caller_time'] / (data['service_caller_time']+0.01) 179 | data['service1_caller_time_rate'] = data['service1_caller_time'] / (data['service_caller_time'] + 0.01) 180 | data['local_caller_time_service2_caller_time_rate'] = data['local_caller_time'] / (data['service2_caller_time'] + 0.01) 181 | data['service1_caller_time_1_total_fee_rate'] = data['service_caller_time'] / (data['1_total_fee'] + 0.01) 182 | 183 | # data['online_fee'] = groupby(data,data,'online_time','total_fee_mean4','median') 184 | # data['1_total_fee_10'] = data['1_total_fee']//10 185 | # data['1_total_fee_10_online_time'] = groupby(data, data, '1_total_fee_10', 'online_time', 'median') 186 | # del data['1_total_fee_10'] 187 | # data['per_month_fee'] = data['pay_num'] / (data['online_time']+0.01) 188 | # data['per_month_times'] = data['pay_times'] / (data['online_time'] + 0.01) 189 | # data 190 | data['contract_time_count'] = count_encoding(data['contract_time']) 191 | data['pay_num_count'] = count_encoding(data['pay_num']) 192 | data['pay_num_last0_number'] = count_encoding(data['pay_num'].apply(lambda x: ('%.2f' % x)[-1]).astype(int)) 193 | data['pay_num_last1_number'] = count_encoding(data['pay_num'].apply(lambda x: ('%.2f' % x)[-2]).astype(int)) 194 | data['pay_num_last2_number'] = count_encoding(data['pay_num'].apply(lambda x: ('%.2f' % x)[-4]).astype(int)) 195 | data['pay_num_count'] = count_encoding(data['pay_num'] // 10) 196 | data['age_count3'] = count_encoding(data['age'] // 3) 197 | data['age_count6'] = count_encoding(data['age'] // 6) 198 | data['age_count10'] = count_encoding(data['age'] // 10) 199 | # data['contract_time_count'] = count_encoding(data['contract_time']) 200 | # for i in range(11): 201 | # data['temp'] = (data['label']==i).astype(int) 202 | # data['1_total_fee_rate_cate{}'.format(i)] = cv_convert(data['1_total_fee'],data['temp']) 203 | # del data['temp'] 204 | 205 | # data['1_total_fee_zheng'] = round(data['1_total_fee']) 206 | # data = one_hot_encoder(data, '1_total_fee_zheng', n=4000, nan_as_category=True) 207 | 208 | # 转化率 209 | data = mul_rate(data, 'pay_num', 'current_service') 210 | 211 | data = pd.get_dummies(data, columns=['contract_type'], dummy_na=-1) 212 | data = pd.get_dummies(data, columns=['net_service'], dummy_na=-1) 213 | data = pd.get_dummies(data, columns=['complaint_level'], dummy_na=-1) 214 | data.reset_index(drop=True,inplace=True) 215 | # data.to_feather(result_path) 216 | return data 217 | 218 | 219 | ############################### 特征函数 ########################### 220 | # 特征 221 | def get__feat(data,data_key): 222 | result_path = cache_path + '_feat_{}.feature'.format(data_key) 223 | if os.path.exists(result_path) & (not inplace): 224 | feat = pd.read_feature(result_path) 225 | else: 226 | data_temp = data.copy() 227 | 228 | feat.to_feather(result_path) 229 | return feat 230 | 231 | 232 | 233 | # 二次处理特征 234 | def second_feat(result): 235 | return result 236 | 237 | def make_feat(data,data_key): 238 | t0 = time.time() 239 | # data_key = hashlib.md5(data.to_string().encode()).hexdigest() 240 | # #print('数据key为:{}'.format(data_key)) 241 | result_path = cache_path + 'feat_set_{}.feature'.format(data_key) 242 | if os.path.exists(result_path) & 0: 243 | result = pd.read_feature(result_path, 'w') 244 | else: 245 | data = pre_treatment(data,'data_key') 246 | 247 | result = [data] 248 | # #print('开始构造特征...') 249 | # result.append(get_context_feat()) # context特征 250 | # result.append(get_user_feat()) # 用户特征 251 | # result.append(get_item_feat()) # 商品特征 252 | # result.append(get_shop_feat()) # 商店特征 253 | 254 | #print('开始合并特征...') 255 | result = concat(result) 256 | 257 | result = second_feat(result) 258 | 259 | #print('特征矩阵大小:{}'.format(result.shape)) 260 | #print('生成特征一共用时{}秒'.format(time.time() - t0)) 261 | return result 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | -------------------------------------------------------------------------------- /feature/get_most.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | data_path = '../input/' 5 | output_path = '../' 6 | 7 | def astype(x,t): 8 | try: 9 | return t(x) 10 | except: 11 | return np.nan 12 | 13 | def have_0(x): 14 | try: 15 | r = x.split('.')[1][-1] 16 | return 0 if r=='0' else 1 17 | except: 18 | return 1 19 | 20 | str_dict = {'1_total_fee': 'str', 21 | '2_total_fee': 'str', 22 | '3_total_fee': 'str', 23 | '4_total_fee': 'str', 24 | 'pay_num': 'str', 25 | } 26 | 27 | 28 | have_0_c = ['1_total_fee', 29 | '2_total_fee', 30 | '3_total_fee', 31 | '4_total_fee', 32 | 'pay_num'] 33 | 34 | def deal(data): 35 | for c in have_0_c: 36 | data['have_0_{}'.format(c)] = data[c].apply(have_0) 37 | try: 38 | data[c] = data[c].astype(float) 39 | except: 40 | pass 41 | data['2_total_fee'] = data['2_total_fee'].apply(lambda x: astype(x,float)) 42 | data['3_total_fee'] = data['3_total_fee'].apply(lambda x: astype(x,float)) 43 | data['age'] = data['age'].apply(lambda x: astype(x,int)) 44 | data['gender'] = data['gender'].apply(lambda x: astype(x,int)) 45 | data.loc[data['age']==0,'age'] = np.nan 46 | data.loc[data['1_total_fee'] < 0, '1_total_fee'] = np.nan 47 | data.loc[data['2_total_fee'] < 0, '2_total_fee'] = np.nan 48 | data.loc[data['3_total_fee'] < 0, '3_total_fee'] = np.nan 49 | data.loc[data['4_total_fee'] < 0, '4_total_fee'] = np.nan 50 | for c in [ 51 | '1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee', 52 | 'month_traffic', 'last_month_traffic', 'local_trafffic_month', 53 | 'local_caller_time', 'service1_caller_time', 'service2_caller_time', 54 | 'many_over_bill', 'contract_type', 'contract_time', 'pay_num', 55 | ]: 56 | data[c] = data[c].round(6) 57 | return data 58 | 59 | train = pd.read_csv(data_path + 'train.csv',dtype=str_dict) 60 | train = deal(train) 61 | train = train[train['current_service'] != 999999] 62 | test = pd.read_csv(data_path + 'test.csv',dtype=str_dict) 63 | test = deal(test) 64 | train_old = pd.read_csv(data_path +'train_old.csv',dtype=str_dict)[:] 65 | train_old = deal(train_old) 66 | 67 | def get_magic_feature(df, outname): 68 | """ 69 | It is the magic beer and niaobu, try it and enjoy! 70 | """ 71 | df['fea_unum'] = df[['1_total_fee','2_total_fee','3_total_fee', '4_total_fee']].nunique(axis=1) 72 | df.drop_duplicates(subset =['1_total_fee','2_total_fee','3_total_fee', '4_total_fee'],inplace=True) 73 | df = df[df.fea_unum>2] 74 | for month1_month2 in [ 75 | [1,2], 76 | [1,3], 77 | [1,4], 78 | [2,1], 79 | [2,3], 80 | [2,4], 81 | [3,1], 82 | [3,2], 83 | [3,4], 84 | [4,1], 85 | [4,2], 86 | [4,3], 87 | ]: 88 | month1, month2 = str(month1_month2[0]), str(month1_month2[1]) 89 | mstr = '_total_fee' 90 | tmp = df.groupby([month1 + mstr, month2 + mstr]).size().reset_index() 91 | tmp.columns =['first','second','{}_total_fee_{}_total_fee'.format(month1,month2)] 92 | if month1_month2 == [1,2]: 93 | result_df = tmp 94 | else: 95 | result_df = result_df.merge(tmp, on = ['first','second'], how = 'outer') 96 | 97 | tmpall = result_df 98 | tmpall = tmpall[tmpall.second!=0] 99 | tmpall['count'] = tmpall.iloc[:,2:].sum(axis=1) 100 | tmpall = tmpall.merge(tmpall.groupby('second',as_index=False)['count'].agg({'sum':'sum'}),on='second',how='left') 101 | tmpall['rate'] = tmpall['count'] / tmpall['sum'] 102 | tmpall = tmpall.sort_values(['first','rate'],ascending=False) 103 | tmpall = tmpall [tmpall['count']>10] 104 | tmpall = tmpall.sort_values(['first','count'],ascending=False) 105 | tmp_res = tmpall.drop_duplicates('first',keep='first') 106 | tmp_res[tmp_res['count']>10].to_csv(output_path + outname, columns = ['first','second'],index = False) 107 | 108 | # Magic_Feature_Exclude_Old 109 | train = train.append(test).reset_index(drop = True) 110 | train.drop_duplicates(subset = ['1_total_fee','2_total_fee','3_total_fee', 111 | 'month_traffic','pay_times','last_month_traffic','service2_caller_time','age'],inplace=True) 112 | get_magic_feature(train, 'Magic_Feature_Exclude_Old.csv') 113 | 114 | # Magic_Feature_Include_Old 115 | train = train.append(train_old).reset_index(drop = True) 116 | train.drop_duplicates(subset = ['1_total_fee','2_total_fee','3_total_fee', 117 | 'month_traffic','pay_times','last_month_traffic','service2_caller_time','age'],inplace=True) 118 | get_magic_feature(train, 'Magic_Feature_Include_Old.csv') 119 | -------------------------------------------------------------------------------- /feature/white.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | pd.set_option('max_columns',1000) 4 | pd.set_option('max_row',300) 5 | 6 | input_path = '../input/' 7 | output_path = '../' 8 | train = pd.read_csv(input_path + 'train.csv') 9 | test = pd.read_csv(input_path +'test.csv') 10 | 11 | def astype(x,t): 12 | try: 13 | return t(x) 14 | except: 15 | return np.nan 16 | 17 | def deal(data): 18 | 19 | data['2_total_fee'] = data['2_total_fee'].apply(lambda x: astype(x,float)) 20 | data['3_total_fee'] = data['3_total_fee'].apply(lambda x: astype(x,float)) 21 | data['age'] = data['age'].apply(lambda x: astype(x,int)) 22 | data['gender'] = data['gender'].apply(lambda x: astype(x,int)) 23 | for c in [ 24 | '1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee', 25 | 'month_traffic', 'last_month_traffic', 'local_trafffic_month', 26 | 'local_caller_time', 'service1_caller_time', 'service2_caller_time', 27 | 'many_over_bill', 'contract_type', 'contract_time', 'pay_num', 28 | ]: 29 | data[c] = data[c].round(4) 30 | return data 31 | 32 | train = deal(train) 33 | train = train[train['current_service'] != 999999] 34 | test = deal(test) 35 | 36 | cc = [ 37 | '1_total_fee', 38 | '2_total_fee', 39 | '3_total_fee', 40 | 'month_traffic', 41 | 'pay_times', 42 | 'last_month_traffic', 43 | 'service2_caller_time', 44 | 'age' 45 | ] 46 | 47 | train = train.drop_duplicates(cc) 48 | white = test.merge(train,on=cc,how='left') 49 | white = white[~white['current_service'].isnull()][['user_id_x','user_id_y','current_service']].copy() 50 | white['current_service'] = white['current_service'].astype('int') 51 | white.drop(['user_id_y'],inplace=True,axis=1) 52 | white.columns=['user_id','current_service'] 53 | white.to_csv(output_path + 'white.csv', index = False) 54 | #white = pd.read_csv('white.csv') 55 | -------------------------------------------------------------------------------- /hebing_pred.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Thu Nov 8 15:49:27 2018 5 | 6 | @author: hzs 7 | """ 8 | import pandas as pd 9 | 10 | train_p4 = pd.read_csv('./cv/val_prob_model_3_1.csv') 11 | test_p4 = pd.read_csv('./cv/sub_prob_model_3_1.csv') 12 | 13 | train_p1 = pd.read_csv('./cv/val_prob_model_3_4.csv') 14 | test_p1 = pd.read_csv('./cv/sub_prob_model_3_4.csv') 15 | 16 | train_p = pd.concat([train_p4,train_p1]) 17 | test_p = pd.concat([test_p4,test_p1]) 18 | 19 | train_p = train_p.to_csv('val_prob_hebing2.csv',index=None) 20 | test_p = test_p.to_csv('sub_prob_hebing2.csv',index=None) -------------------------------------------------------------------------------- /images/q.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPshrimpGo/BDCI2018-ChinauUicom-1st-solution/21093c2162e10416e304e30896f5bf0c1bdbcbd4/images/q.jpg -------------------------------------------------------------------------------- /input/.gitkeep: -------------------------------------------------------------------------------- 1 | * 2 | -------------------------------------------------------------------------------- /model1.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import numpy as np 3 | import pandas as pd 4 | import gc 5 | import time 6 | from contextlib import contextmanager 7 | import lightgbm as lgb 8 | from sklearn.metrics import roc_auc_score, roc_curve, mean_squared_error,mean_absolute_error, f1_score 9 | from sklearn.model_selection import KFold, StratifiedKFold,GroupKFold 10 | import warnings 11 | from sklearn.preprocessing import LabelEncoder 12 | from utils import * 13 | from datetime import datetime 14 | from datetime import timedelta 15 | #test 16 | 17 | warnings.simplefilter(action='ignore', category=FutureWarning) 18 | 19 | 20 | USE_KFOLD = True 21 | 22 | data_path = './input/' 23 | 24 | ####################################读入文件#################################################### 25 | #要准备hzs的两个get most文件 26 | def astype(x,t): 27 | try: 28 | return t(x) 29 | except: 30 | return np.nan 31 | 32 | def have_0(x): 33 | try: 34 | r = x.split('.')[1][-1] 35 | return 0 if r=='0' else 1 36 | except: 37 | return 1 38 | 39 | str_dict = {'1_total_fee': 'str', 40 | '2_total_fee': 'str', 41 | '3_total_fee': 'str', 42 | '4_total_fee': 'str', 43 | 'pay_num': 'str', 44 | } 45 | 46 | 47 | have_0_c = ['1_total_fee', 48 | '2_total_fee', 49 | '3_total_fee', 50 | '4_total_fee', 51 | 'pay_num'] 52 | 53 | def deal(data): 54 | for c in have_0_c: 55 | data['have_0_{}'.format(c)] = data[c].apply(have_0) 56 | try: 57 | data[c] = data[c].astype(float) 58 | except: 59 | pass 60 | data['2_total_fee'] = data['2_total_fee'].apply(lambda x: astype(x,float)) 61 | data['3_total_fee'] = data['3_total_fee'].apply(lambda x: astype(x,float)) 62 | data['age'] = data['age'].apply(lambda x: astype(x,int)) 63 | data['gender'] = data['gender'].apply(lambda x: astype(x,int)) 64 | data.loc[data['age']==0,'age'] = np.nan 65 | data.loc[data['1_total_fee'] < 0, '1_total_fee'] = np.nan 66 | data.loc[data['2_total_fee'] < 0, '2_total_fee'] = np.nan 67 | data.loc[data['3_total_fee'] < 0, '3_total_fee'] = np.nan 68 | data.loc[data['4_total_fee'] < 0, '4_total_fee'] = np.nan 69 | for c in [ 70 | '1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee', 71 | 'month_traffic', 'last_month_traffic', 'local_trafffic_month', 72 | 'local_caller_time', 'service1_caller_time', 'service2_caller_time', 73 | 'many_over_bill', 'contract_type', 'contract_time', 'pay_num', ]: 74 | data[c] = data[c].round(4) 75 | return data 76 | 77 | train = pd.read_csv(data_path + 'train.csv',dtype=str_dict) 78 | train = deal(train) 79 | train.drop_duplicates(subset = ['1_total_fee','2_total_fee','3_total_fee', 80 | 'month_traffic','pay_times','last_month_traffic','service2_caller_time','age'],inplace=True) 81 | train = train[train['current_service'] != 999999] 82 | test = pd.read_csv(data_path + 'test.csv',dtype=str_dict) 83 | test = deal(test) 84 | 85 | train_old = pd.read_csv('./input/train_old.csv',dtype=str_dict)[:] 86 | train_old = deal(train_old) 87 | train_old.drop_duplicates(subset = ['1_total_fee','2_total_fee','3_total_fee', 88 | 'month_traffic','pay_times','last_month_traffic','service2_caller_time','age'],inplace=True) 89 | 90 | 91 | 92 | print(len(train)) 93 | 94 | 95 | label2current_service =dict(zip(range(0,len(set(train['current_service']))),sorted(list(set(train['current_service']))))) 96 | current_service2label =dict(zip(sorted(list(set(train['current_service']))),range(0,len(set(train['current_service']))))) 97 | print(len(label2current_service)) 98 | train = train.append(test).reset_index(drop = True) 99 | print(len(train)) 100 | shape1 = len(train) 101 | #train['is_b'] = 1 102 | #train_old['is_b'] = 0 103 | train = train.append(train_old).reset_index(drop = True) 104 | print(len(train)) 105 | shape2 = len(train) 106 | 107 | get_most = pd.read_csv('Magic_Feature_Exclude_Old.csv') 108 | get_most2 = pd.read_csv('Magic_Feature_Include_Old.csv') 109 | 110 | ####################################特征工程################################################### 111 | 112 | call_time = ['local_caller_time', 'service1_caller_time', 'service2_caller_time'] 113 | traffic = ['month_traffic','last_month_traffic','local_trafffic_month'] 114 | cat_cols = ['service_type','contract_type', 'net_service', 'gender', 'complaint_level', 115 | #3 #9,8 #4 #3 #4 116 | 'is_mix_service', 'many_over_bill', 'is_promise_low_consume', #2 117 | ] 118 | continus_col = [ 119 | '1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee', 'pay_num','former_complaint_fee', 120 | 121 | 'month_traffic', 'last_month_traffic', 'local_trafffic_month', 122 | 123 | 'local_caller_time', 'service1_caller_time', 'service2_caller_time', 124 | 125 | 'online_time','contract_time', 126 | 127 | 'pay_times', 'former_complaint_num' 128 | ] 129 | def one_hot_encoder(train,column,n=100,nan_as_category=False): 130 | tmp = train[column].value_counts().to_frame() 131 | values = list(tmp[tmp[column]>n].index) 132 | train.loc[train[column].isin(values),column+'N'] = train.loc[train[column].isin(values),column] 133 | train = pd.get_dummies(train, columns=[column+'N'], dummy_na=False) 134 | return train 135 | # 136 | 137 | 138 | train['fea-min'] = train[[str(1+i) +'_total_fee' for i in range(4)]].min(axis = 1) 139 | 140 | for column in ['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee', 'fea-min']: 141 | get_most.columns = [column,column+'_most'] 142 | train = train.merge(get_most,on=column,how='left') 143 | 144 | for column in ['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee', 'fea-min']: 145 | get_most2.columns = [column,column+'_most2'] 146 | train = train.merge(get_most2,on=column,how='left') 147 | 148 | for column in ['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee', 'pay_num','fea-min']: 149 | train[column+'_int'] = train[column].fillna(-1).astype('int') 150 | train[column+'_int_last'] = train[column+'_int']%10 #last int 151 | train[column+'_decimal'] = round(((train[column]-train[column+'_int'])*100).fillna(-1)).astype('int') #decimal 152 | train[column+'_decimal_is_0'] = (train[column+'_decimal']==0).astype('int') 153 | train[column+'_decimal_is_5'] = (train[column+'_decimal']%5==0).astype('int') 154 | train[column+'_decimal_last'] = train[column+'_decimal']%10 155 | train[column+'_decimal_last2'] = train[column+'_decimal']//5 156 | train[column+'_extra_fee'] = ((train[column]*100)-600)%1000 157 | train[column+'_27perMB'] = ((train[column+'_extra_fee']%27 == 0)&(train[column+'_extra_fee'] != 0)).astype('int') 158 | train[column+'_15perMB'] = ((train[column+'_extra_fee']%15 == 0)&(train[column+'_extra_fee'] != 0)).astype('int') 159 | train = one_hot_encoder(train,column,n=2000,nan_as_category=True) 160 | 161 | train['pay_num_last2'] = train['pay_num_int']%100 162 | train['former_complaint_fee_last2'] = round(train['former_complaint_fee'])%100 163 | 164 | 165 | train['4-fea-dealta'] = round((train['4_total_fee'] - train['3_total_fee'])*100).fillna(999999.9).astype('int') 166 | train['3-fea-dealta'] = round((train['3_total_fee'] - train['2_total_fee'])*100).fillna(999999.9).astype('int') 167 | train['2-fea-dealta'] = round((train['2_total_fee'] - train['1_total_fee'])*100).fillna(999999.9).astype('int') 168 | train['1-fea-dealta'] = round((train['4_total_fee'] - train['1_total_fee'])*100).fillna(999999.9).astype('int') 169 | train['1-3-fea-dealta'] = round((train['3_total_fee'] - train['1_total_fee'])*100).fillna(999999.9).astype('int') 170 | train['1-min-fea-dealta'] = round((train['1_total_fee'] - train['fea-min'])*100).fillna(999999.9).astype('int') 171 | 172 | for column in ['4-fea-dealta', '3-fea-dealta', '2-fea-dealta', '1-fea-dealta','1-3-fea-dealta','1-min-fea-dealta']: 173 | train[column+'_is_0'] = (train[column]==0).astype('int') 174 | train[column+'_is_6000'] = ((train[column]%6000 == 0)&(train[column] != 0)).astype('int') 175 | train[column+'_is_5'] = ((train[column]%5 == 0)&(train[column] != 0)).astype('int') 176 | train[column+'_is_10'] = ((train[column]%10 == 0)&(train[column] != 0)).astype('int') 177 | train[column+'_is_15'] = ((train[column]%15 == 0)&(train[column] != 0)).astype('int') 178 | train[column+'_is_27'] = ((train[column]%27 == 0)&(train[column] != 0)).astype('int') 179 | train[column+'_is_30'] = ((train[column]%30 == 0)&(train[column] != 0)).astype('int') 180 | train[column+'_is_50'] = ((train[column]%50 == 0)&(train[column] != 0)).astype('int') 181 | train[column+'_is_100'] = ((train[column]%100 == 0)&(train[column] != 0)).astype('int') 182 | train[column+'_is_500'] = ((train[column]%500 == 0)&(train[column] != 0)).astype('int') 183 | 184 | for column in ['month_traffic', 'last_month_traffic', 'local_trafffic_month']: 185 | train[column+'_is_int'] = ((train[column]%1 == 0)&(train[column] != 0)).astype('int') 186 | train[column+'_is_512'] = ((train[column]%512 == 0)&(train[column] != 0)).astype('int') 187 | train[column+'_is_50'] = ((train[column]%50 == 0)&(train[column] != 0)).astype('int') 188 | train[column+'_is_double'] = ((train[column]%512%50 == 0)&(train[column] != 0)&(train[column+'_is_512'] == 0)&(train[column+'_is_50'] == 0)).astype('int') 189 | train = one_hot_encoder(train,column,n=2000,nan_as_category=True) 190 | 191 | train['service12'] = train['service2_caller_time']+train['service1_caller_time'] 192 | for column in ['local_caller_time', 'service1_caller_time', 'service2_caller_time','service12']: 193 | train[column+'_decimal'] = round(((round(train[column])- train[column])*60)).astype('int') 194 | train[column+'_decimal_is_int'] = ((train[column+'_decimal']==0)&(train[column] != 0)).astype('int') 195 | 196 | train = one_hot_encoder(train,'online_time',n=5000,nan_as_category=True) 197 | train = one_hot_encoder(train,'contract_time',n=5000,nan_as_category=True) 198 | 199 | print(train.shape) 200 | train = one_hot_encoder(train,'contract_type',n=1,nan_as_category=True) 201 | 202 | 203 | 204 | #lable 映射 205 | train['current_service'] = train['current_service'].map(current_service2label) 206 | 207 | 208 | train['age'] = train['age'].fillna(-20) 209 | train['cut_age'] = train['age'].apply(lambda x: int(x/10)) 210 | train['cut_online'] = (train['online_time'] / 12).astype(int) 211 | 212 | 213 | 214 | train['4-fea-dealta'] = train['4_total_fee'] - train['3_total_fee'] 215 | train['3-fea-dealta'] = train['3_total_fee'] - train['2_total_fee'] 216 | train['2-fea-dealta'] = train['2_total_fee'] - train['1_total_fee'] 217 | train['1-fea-dealta'] = train['4_total_fee'] - train['1_total_fee'] 218 | 219 | train['4-fea-dealta_'] = train['4_total_fee'] / (train['3_total_fee']+0.00001) 220 | train['3-fea-dealta_'] = train['3_total_fee'] / (train['2_total_fee']+0.00001) 221 | train['2-fea-dealta_'] = train['2_total_fee'] / (train['1_total_fee']+0.00001) 222 | train['1-fea-dealta_'] = train['4_total_fee'] / (train['1_total_fee']+0.00001) 223 | train['pay_num-dealta_'] = train['pay_num'] / (train['1_total_fee']+0.00001) 224 | 225 | 226 | 227 | train['month_traffic_delata'] = train['month_traffic'] - train['last_month_traffic'] 228 | train['month_traffic_delata_'] = train['month_traffic'] / (train['last_month_traffic']+0.00001) 229 | train['2month_traffic_sum'] = train['month_traffic'] + train['last_month_traffic'] 230 | train['add_month_traffic'] = train['month_traffic'] - train['local_trafffic_month'] 231 | train['add_month_traffic_'] = train['month_traffic'] / (train['local_trafffic_month']+0.00001) 232 | 233 | train['service1_caller_time_delata'] = train['service1_caller_time'] / (train['service2_caller_time']+0.00001) 234 | train['service1_caller_time_delata2'] = train['service1_caller_time'] / (train['local_caller_time']+0.00001) 235 | train['service2_caller_time_delata_'] = train['service2_caller_time'] / (train['local_caller_time']+0.00001) 236 | train['local_caller_time_reatio'] = train['local_caller_time']/(train['service1_caller_time']+train['service2_caller_time']+0.00001) 237 | 238 | train['div_online_time_contract'] = train['contract_time'] / (train['online_time']+0.00001) 239 | train['div_online_time_contract'] = train['contract_time'] - train['online_time'] 240 | 241 | 242 | train['div_former_complaint_num'] = train['former_complaint_num'] / (train['pay_times']+0.00001) 243 | train['div_former_complaint_num'] = train['former_complaint_num'] - train['pay_times'] 244 | 245 | 246 | train['fea-sum'] = train[[str(1+i) +'_total_fee' for i in range(4)]].sum(axis = 1) 247 | train['fea-var'] = train[[str(1+i) +'_total_fee' for i in range(4)]].var(axis = 1) 248 | train['fea-max'] = train[[str(1+i) +'_total_fee' for i in range(4)]].max(axis = 1) 249 | train['fea-min'] = train[[str(1+i) +'_total_fee' for i in range(4)]].min(axis = 1) 250 | train['fea-mean4'] = train[[str(1+i) +'_total_fee' for i in range(4)]].sum(axis = 1) 251 | train['fea-mean3'] = train[[str(1+i) +'_total_fee' for i in range(3)]].sum(axis = 1) 252 | train['fea-mean2'] = train[[str(1+i) +'_total_fee' for i in range(2)]].sum(axis = 1) 253 | train['fea-extra'] = train['fea-sum']-4*train['fea-min'] 254 | train['1_total_fee_extra_for_min'] = train['1_total_fee']-train['fea-min'] 255 | train['fea_unum'] = train[['1_total_fee','2_total_fee','3_total_fee', '4_total_fee']].nunique(axis=1) 256 | 257 | train['call_time_sum'] = train[call_time].sum(axis = 1) 258 | train['call_time_var'] = train[call_time].var(axis = 1) 259 | train['call_time_min'] = train[call_time].min(axis = 1) 260 | train['call_time_max'] = train[call_time].max(axis = 1) 261 | 262 | train['traffic_sum'] = train[traffic].sum(axis = 1) 263 | train['traffic_var'] = train[traffic].var(axis = 1) 264 | train['traffic_min'] = train[traffic].min(axis = 1) 265 | train['traffic_max'] = train[traffic].max(axis = 1) 266 | 267 | 268 | train['average_pay'] = train['pay_num'] / train['pay_times'] 269 | 270 | 271 | train['div_traffic_price_2'] = train['last_month_traffic']/ 1000 / train['2_total_fee'] 272 | train['div_traffic_price_3'] = train['local_trafffic_month']/ 1000 / train['1_total_fee'] 273 | train['div_add_month_traffic_price'] = train['add_month_traffic']/ 1000 / train['1_total_fee'] 274 | train['div_local_caller_time_price'] = train['local_trafffic_month'] / 1000/ train['1_total_fee'] 275 | 276 | 277 | train['1-min-fea-dealta_div'] = train['1-min-fea-dealta']/(train['service1_caller_time']+0.0001) 278 | train['div_service1_caller_time_price'] = train['service1_caller_time'] / train['1_total_fee'] 279 | train['div_local_caller_time'] = train['local_caller_time'] / train['1_total_fee'] 280 | train['div_call_time_sum_price'] = train['call_time_sum'] / train['1_total_fee'] 281 | train['1_total_fee_maybe_real_calller'] = train['1_total_fee']- train['service1_caller_time']*0.15 282 | train['1_total_fee_maybe_real_calller2'] = train['1_total_fee']- train['service1_caller_time']*0.1 283 | train['1_total_fee_extra_for_min_caller_time'] = train['1_total_fee_extra_for_min']/(train['service1_caller_time']+0.001) 284 | 285 | train['div_service1_caller_time'] = train['service1_caller_time']/train['last_month_traffic'] 286 | train['div_local_caller_time'] = train['local_caller_time']/train['last_month_traffic'] 287 | train['div_local_caller_time2'] = train['local_caller_time']/train['month_traffic'] 288 | 289 | 290 | train['avg_complain_fee'] = train['former_complaint_fee'] / (train['former_complaint_num'] + 0.000000001) 291 | 292 | 293 | result = [] 294 | 295 | result.append(get_feat_ngroup(train,['cut_age','gender'])) 296 | for size_feat in ['1_total_fee','2_total_fee','3_total_fee', '4_total_fee','pay_num', 297 | 'last_month_traffic','month_traffic','local_trafffic_month', 298 | 'local_caller_time','service1_caller_time','service2_caller_time']: 299 | result.append(get_feat_size(train,[size_feat])) 300 | 301 | 302 | result.append(get_feat_stat_feat(train, ['contract_type'], ['1_total_fee'], ['max'])) 303 | result.append(get_feat_stat_feat(train, ['contract_type'], ['2_total_fee'], ['mean'])) 304 | result.append(get_feat_stat_feat(train, ['contract_type'], ['last_month_traffic'], ['var','mean'])) 305 | result.append(get_feat_stat_feat(train, ['contract_type'], ['call_time_sum'], ['mean'])) 306 | 307 | for base_feat in [['contract_type']]: 308 | for other_feat in ['1_total_fee', 'pay_num', 309 | 'month_traffic', 'last_month_traffic', 'local_trafffic_month', 310 | 'local_caller_time', 'service1_caller_time', 'service2_caller_time', 311 | ]: 312 | stat_list = ['mean'] 313 | tmp = get_feat_stat_feat(train,base_feat,[other_feat],stat_list=stat_list) 314 | name = tmp.columns[0] 315 | train[name] = tmp 316 | train[name+'_comp'] = train[other_feat].values-train[name].values 317 | 318 | 319 | train['1_total_fee_ratio'] = train['1_total_fee']/(train['fea-sum']+0.000001) 320 | train['3_total_fee_ratio'] = train['3_total_fee']/(train['fea-sum']+0.000001) 321 | train['call_time_sum_ratio'] = train['call_time_sum']/(train['traffic_sum']+0.000001) 322 | train['call_time_sum_ratio2'] = train['call_time_sum']/(train['fea-sum']+0.000001) 323 | train['traffic_sum_ratio1'] = train['traffic_sum']/(train['fea-sum']+0.000001) 324 | 325 | ####################################lgb和metric函数################################################### 326 | 327 | def f1_score_vali(preds, data_vali): 328 | labels = data_vali.get_label() 329 | preds = np.argmax(preds.reshape(11, -1),axis=0) 330 | score_vali = f1_score(y_true=labels,y_pred=preds,average='macro') 331 | return 'macro_f1_score', score_vali, True 332 | 333 | def evaluate_macroF1_lgb(data_vali, preds): 334 | labels = data_vali.astype(int) 335 | preds = np.array(preds) 336 | preds = np.argmax(preds,axis=1) 337 | score_vali = f1_score(y_true=labels,y_pred=preds,average='macro') 338 | return score_vali 339 | 340 | def kfold_lightgbm(params,df, predictors,target,num_folds, stratified = True, 341 | objective='', metrics='',debug= False, 342 | feval = f1_score_vali, early_stopping_rounds=100, num_boost_round=100, verbose_eval=50, categorical_features=None,sklearn_mertric = evaluate_macroF1_lgb ): 343 | 344 | lgb_params = params 345 | 346 | train_df = df[df[target].notnull()] 347 | test_df = df[df[target].isnull()] 348 | 349 | # Divide in training/validation and test data 350 | print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df[predictors].shape, test_df[predictors].shape)) 351 | del df 352 | gc.collect() 353 | # Cross validation model 354 | if stratified: 355 | folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=1234) 356 | else: 357 | folds = KFold(n_splits= num_folds, shuffle=True, random_state=1234) 358 | # folds = GroupKFold(n_splits=5) 359 | # Create arrays and dataframes to store results 360 | oof_preds = np.zeros((train_df.shape[0],11)) 361 | sub_preds = np.zeros((test_df.shape[0],11)) 362 | feature_importance_df = pd.DataFrame() 363 | feats = predictors 364 | cv_resul = [] 365 | ''' 366 | perm = [i for i in range(len(train_df))] 367 | perm = pd.DataFrame(perm) 368 | perm.columns = ['index_'] 369 | 370 | for n_fold in range(5): 371 | train_idx = np.array(perm[train_df['cv'] != n_fold]['index_']) 372 | valid_idx = np.array(perm[train_df['cv'] == n_fold]['index_']) 373 | ''' 374 | for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df[target])): 375 | if (USE_KFOLD == False) and (n_fold == 1): 376 | break 377 | train_x, train_y = train_df[feats].iloc[train_idx], train_df[target].iloc[train_idx] 378 | valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df[target].iloc[valid_idx] 379 | 380 | train_x = pd.concat([train_x,train_old[feats]]) 381 | train_y = pd.concat([train_y,train_old[target]]) 382 | 383 | train_y_t = train_y.values 384 | valid_y_t = valid_y.values 385 | print(train_y_t) 386 | xgtrain = lgb.Dataset(train_x.values, label = train_y_t, 387 | feature_name=predictors, 388 | categorical_feature=categorical_features 389 | ) 390 | xgvalid = lgb.Dataset(valid_x.values, label = valid_y_t, 391 | feature_name=predictors, 392 | categorical_feature=categorical_features 393 | ) 394 | 395 | clf = lgb.train(lgb_params, 396 | xgtrain, 397 | valid_sets=[xgvalid],#, xgtrain], 398 | valid_names=['valid'],#,'train'], 399 | num_boost_round=num_boost_round, 400 | early_stopping_rounds=early_stopping_rounds, 401 | verbose_eval=verbose_eval, 402 | # feval=feval 403 | ) 404 | 405 | 406 | 407 | oof_preds[valid_idx] = clf.predict(valid_x, num_iteration=clf.best_iteration) 408 | sub_preds += clf.predict(test_df[feats], num_iteration=clf.best_iteration)/ folds.n_splits 409 | 410 | 411 | gain = clf.feature_importance('gain') 412 | fold_importance_df = pd.DataFrame({'feature':clf.feature_name(), 413 | 'split':clf.feature_importance('split'), 414 | 'gain':100*gain/gain.sum(), 415 | 'fold':n_fold, 416 | }).sort_values('gain',ascending=False) 417 | feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0) 418 | 419 | result = evaluate_macroF1_lgb(valid_y, oof_preds[valid_idx]) 420 | # result = clf.best_score['valid']['macro_f1_score'] 421 | print('Fold %2d macro-f1 : %.6f' % (n_fold + 1, result)) 422 | cv_resul.append(round(result,5)) 423 | gc.collect() 424 | 425 | score = 'model_1' 426 | #score = np.array(cv_resul).mean() 427 | if USE_KFOLD: 428 | #print('Full f1 score %.6f' % score) 429 | for i in range(11): 430 | train_df["class_" + str(i)] = oof_preds[:,i] 431 | test_df["class_" + str(i)] = sub_preds[:,i] 432 | train_df[['user_id'] + ["class_" + str(i) for i in range(11)]].to_csv('./cv/val_prob_{}.csv'.format(score), index= False, float_format = '%.4f') 433 | test_df[['user_id'] + ["class_" + str(i) for i in range(11)]].to_csv('./cv/sub_prob_{}.csv'.format(score), index= False, float_format = '%.4f') 434 | oof_preds = [np.argmax(x)for x in oof_preds] 435 | sub_preds = [np.argmax(x)for x in sub_preds] 436 | train_df[target] = oof_preds 437 | test_df[target] = sub_preds 438 | print(test_df[target].mean()) 439 | train_df[target] = oof_preds 440 | train_df[target] = train_df[target].map(label2current_service) 441 | test_df[target] = sub_preds 442 | test_df[target] = test_df[target].map(label2current_service) 443 | print('all_cv', cv_resul) 444 | 445 | train_df[['user_id', target]].to_csv('./sub/val_{}.csv'.format(score), index= False) 446 | test_df[['user_id', target]].to_csv('./sub/sub_{}.csv'.format(score), index= False) 447 | print("test_df mean:") 448 | 449 | display_importances(feature_importance_df,score) 450 | 451 | 452 | 453 | def display_importances(feature_importance_df_,score): 454 | ft = feature_importance_df_[["feature", "split","gain"]].groupby("feature").mean().sort_values(by="gain", ascending=False) 455 | print(ft.head(60)) 456 | ft.to_csv('importance_lightgbm_{}.csv'.format(score),index=True) 457 | cols = ft[:40].index 458 | best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)] 459 | 460 | 461 | ####################################计算################################################################# 462 | 463 | 464 | params = { 465 | 'metric': 'multi_logloss', 466 | 'num_class':11, 467 | 'boosting_type': 'gbdt', 468 | 'objective': 'multiclass', 469 | 'feature_fraction': 0.7, 470 | 'learning_rate': 0.02, 471 | 'bagging_fraction': 0.7, 472 | #'bagging_freq': 2, 473 | 'num_leaves': 64, 474 | 'max_depth': -1, 475 | 'num_threads': 16, 476 | 'seed': 2018, 477 | 'verbose': -1, 478 | #'is_unbalance':True, 479 | } 480 | 481 | 482 | categorical_columns = [ 483 | 'contract_type', 484 | 'net_service', 485 | 'gender'] 486 | for feature in categorical_columns: 487 | print(f'Transforming {feature}...') 488 | encoder = LabelEncoder() 489 | train[feature] = encoder.fit_transform(train[feature].astype(str)) 490 | 491 | 492 | x = [] 493 | no_use = ['current_service', 'user_id','group', 494 | 495 | ] + x 496 | 497 | 498 | 499 | 500 | categorical_columns = [] 501 | all_data_frame = [] 502 | all_data_frame.append(train) 503 | 504 | for aresult in result: 505 | all_data_frame.append(aresult) 506 | 507 | train = concat(all_data_frame) 508 | feats = [f for f in train.columns if f not in no_use] 509 | categorical_columns = [f for f in categorical_columns if f not in no_use] 510 | 511 | train_old = train.iloc[shape1:shape2] 512 | train = train.iloc[:shape1] 513 | #train = train[train.service_type!=1] 514 | #train_old = train_old[train_old.service_type!=1] 515 | clf = kfold_lightgbm(params,train,feats,'current_service' ,5 , num_boost_round=4000, categorical_features=categorical_columns) 516 | -------------------------------------------------------------------------------- /model3_1.py: -------------------------------------------------------------------------------- 1 | #import dask.dataframe as dd 2 | #from dask.multiprocessing import get 3 | import itertools 4 | import numpy as np 5 | import pandas as pd 6 | import gc 7 | import time 8 | from contextlib import contextmanager 9 | import lightgbm as lgb 10 | from sklearn.metrics import roc_auc_score, roc_curve, mean_squared_error,mean_absolute_error, f1_score 11 | from sklearn.model_selection import KFold, StratifiedKFold,GroupKFold 12 | #import matplotlib.pyplot as plt 13 | #import seaborn as sns 14 | import warnings 15 | from sklearn.preprocessing import LabelEncoder 16 | from utils import * 17 | #from utils2 import * 18 | #from utils3 import * 19 | from datetime import datetime 20 | from datetime import timedelta 21 | #from tqdm import tqdm 22 | #test 23 | 24 | warnings.simplefilter(action='ignore', category=FutureWarning) 25 | 26 | 27 | USE_KFOLD = True 28 | 29 | data_path = './input/' 30 | 31 | ####################################读入文件#################################################### 32 | #要准备hzs的两个get most文件 33 | def astype(x,t): 34 | try: 35 | return t(x) 36 | except: 37 | return np.nan 38 | 39 | def have_0(x): 40 | try: 41 | r = x.split('.')[1][-1] 42 | return 0 if r=='0' else 1 43 | except: 44 | return 1 45 | 46 | str_dict = {'1_total_fee': 'str', 47 | '2_total_fee': 'str', 48 | '3_total_fee': 'str', 49 | '4_total_fee': 'str', 50 | 'pay_num': 'str', 51 | } 52 | 53 | 54 | have_0_c = ['1_total_fee', 55 | '2_total_fee', 56 | '3_total_fee', 57 | '4_total_fee', 58 | 'pay_num'] 59 | 60 | def deal(data): 61 | for c in have_0_c: 62 | data['have_0_{}'.format(c)] = data[c].apply(have_0) 63 | try: 64 | data[c] = data[c].astype(float) 65 | except: 66 | pass 67 | data['2_total_fee'] = data['2_total_fee'].apply(lambda x: astype(x,float)) 68 | data['3_total_fee'] = data['3_total_fee'].apply(lambda x: astype(x,float)) 69 | data['age'] = data['age'].apply(lambda x: astype(x,int)) 70 | data['gender'] = data['gender'].apply(lambda x: astype(x,int)) 71 | data.loc[data['age']==0,'age'] = np.nan 72 | data.loc[data['1_total_fee'] < 0, '1_total_fee'] = np.nan 73 | data.loc[data['2_total_fee'] < 0, '2_total_fee'] = np.nan 74 | data.loc[data['3_total_fee'] < 0, '3_total_fee'] = np.nan 75 | data.loc[data['4_total_fee'] < 0, '4_total_fee'] = np.nan 76 | for c in [ 77 | '1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee', 78 | 'month_traffic', 'last_month_traffic', 'local_trafffic_month', 79 | 'local_caller_time', 'service1_caller_time', 'service2_caller_time', 80 | 'many_over_bill', 'contract_type', 'contract_time', 'pay_num', ]: 81 | data[c] = data[c].round(4) 82 | return data 83 | 84 | train = pd.read_csv(data_path + 'train.csv',dtype=str_dict) 85 | train = deal(train) 86 | train.drop_duplicates(subset = ['1_total_fee','2_total_fee','3_total_fee', 87 | 'month_traffic','pay_times','last_month_traffic','service2_caller_time','age'],inplace=True) 88 | train = train[train['current_service'] != 999999] 89 | test = pd.read_csv(data_path + 'test.csv',dtype=str_dict) 90 | test = deal(test) 91 | 92 | train_old = pd.read_csv('./input/train_old.csv',dtype=str_dict)[:] 93 | train_old = deal(train_old) 94 | train_old.drop_duplicates(subset = ['1_total_fee','2_total_fee','3_total_fee', 95 | 'month_traffic','pay_times','last_month_traffic','service2_caller_time','age'],inplace=True) 96 | 97 | 98 | 99 | print(len(train)) 100 | 101 | 102 | label2current_service =dict(zip(range(0,len(set(train['current_service']))),sorted(list(set(train['current_service']))))) 103 | current_service2label =dict(zip(sorted(list(set(train['current_service']))),range(0,len(set(train['current_service']))))) 104 | print(len(label2current_service)) 105 | train = train.append(test).reset_index(drop = True) 106 | print(len(train)) 107 | shape1 = len(train) 108 | #train['is_b'] = 1 109 | #train_old['is_b'] = 0 110 | train = train.append(train_old).reset_index(drop = True) 111 | print(len(train)) 112 | shape2 = len(train) 113 | 114 | get_most = pd.read_csv('Magic_Feature_Exclude_Old.csv') 115 | get_most2 = pd.read_csv('Magic_Feature_Include_Old.csv') 116 | 117 | ####################################特征工程################################################### 118 | 119 | call_time = ['local_caller_time', 'service1_caller_time', 'service2_caller_time'] 120 | traffic = ['month_traffic','last_month_traffic','local_trafffic_month'] 121 | cat_cols = ['service_type','contract_type', 'net_service', 'gender', 'complaint_level', 122 | #3 #9,8 #4 #3 #4 123 | 'is_mix_service', 'many_over_bill', 'is_promise_low_consume', #2 124 | ] 125 | continus_col = [ 126 | '1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee', 'pay_num','former_complaint_fee', 127 | 128 | 'month_traffic', 'last_month_traffic', 'local_trafffic_month', 129 | 130 | 'local_caller_time', 'service1_caller_time', 'service2_caller_time', 131 | 132 | 'online_time','contract_time', 133 | 134 | 'pay_times', 'former_complaint_num' 135 | ] 136 | def one_hot_encoder(train,column,n=100,nan_as_category=False): 137 | tmp = train[column].value_counts().to_frame() 138 | values = list(tmp[tmp[column]>n].index) 139 | train.loc[train[column].isin(values),column+'N'] = train.loc[train[column].isin(values),column] 140 | train = pd.get_dummies(train, columns=[column+'N'], dummy_na=False) 141 | return train 142 | # 143 | 144 | 145 | train['fea-min'] = train[[str(1+i) +'_total_fee' for i in range(4)]].min(axis = 1) 146 | 147 | for column in ['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee', 'fea-min']: 148 | get_most.columns = [column,column+'_most'] 149 | train = train.merge(get_most,on=column,how='left') 150 | 151 | for column in ['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee', 'fea-min']: 152 | get_most2.columns = [column,column+'_most2'] 153 | train = train.merge(get_most2,on=column,how='left') 154 | 155 | for column in ['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee', 'pay_num','fea-min']: 156 | train[column+'_int'] = train[column].fillna(-1).astype('int') 157 | train[column+'_int_last'] = train[column+'_int']%10 #last int 158 | train[column+'_decimal'] = round(((train[column]-train[column+'_int'])*100).fillna(-1)).astype('int') #decimal 159 | train[column+'_decimal_is_0'] = (train[column+'_decimal']==0).astype('int') 160 | train[column+'_decimal_is_5'] = (train[column+'_decimal']%5==0).astype('int') 161 | train[column+'_decimal_last'] = train[column+'_decimal']%10 162 | train[column+'_decimal_last2'] = train[column+'_decimal']//5 163 | train[column+'_extra_fee'] = ((train[column]*100)-600)%1000 164 | train[column+'_27perMB'] = ((train[column+'_extra_fee']%27 == 0)&(train[column+'_extra_fee'] != 0)).astype('int') 165 | train[column+'_15perMB'] = ((train[column+'_extra_fee']%15 == 0)&(train[column+'_extra_fee'] != 0)).astype('int') 166 | train = one_hot_encoder(train,column,n=2000,nan_as_category=True) 167 | train['pay_num_last2'] = train['pay_num_int']%100 168 | train['former_complaint_fee_last2'] = round(train['former_complaint_fee'])%100 169 | 170 | 171 | train['4-fea-dealta'] = round((train['4_total_fee'] - train['3_total_fee'])*100).fillna(999999.9).astype('int') 172 | train['3-fea-dealta'] = round((train['3_total_fee'] - train['2_total_fee'])*100).fillna(999999.9).astype('int') 173 | train['2-fea-dealta'] = round((train['2_total_fee'] - train['1_total_fee'])*100).fillna(999999.9).astype('int') 174 | train['1-fea-dealta'] = round((train['4_total_fee'] - train['1_total_fee'])*100).fillna(999999.9).astype('int') 175 | train['1-3-fea-dealta'] = round((train['3_total_fee'] - train['1_total_fee'])*100).fillna(999999.9).astype('int') 176 | train['1-min-fea-dealta'] = round((train['1_total_fee'] - train['fea-min'])*100).fillna(999999.9).astype('int') 177 | for column in ['4-fea-dealta', '3-fea-dealta', '2-fea-dealta', '1-fea-dealta','1-3-fea-dealta','1-min-fea-dealta']: 178 | train[column+'_is_0'] = (train[column]==0).astype('int') 179 | train[column+'_is_6000'] = ((train[column]%6000 == 0)&(train[column] != 0)).astype('int') 180 | train[column+'_is_5'] = ((train[column]%5 == 0)&(train[column] != 0)).astype('int') 181 | train[column+'_is_10'] = ((train[column]%10 == 0)&(train[column] != 0)).astype('int') 182 | train[column+'_is_15'] = ((train[column]%15 == 0)&(train[column] != 0)).astype('int') 183 | train[column+'_is_27'] = ((train[column]%27 == 0)&(train[column] != 0)).astype('int') 184 | train[column+'_is_30'] = ((train[column]%30 == 0)&(train[column] != 0)).astype('int') 185 | train[column+'_is_50'] = ((train[column]%50 == 0)&(train[column] != 0)).astype('int') 186 | train[column+'_is_100'] = ((train[column]%100 == 0)&(train[column] != 0)).astype('int') 187 | train[column+'_is_500'] = ((train[column]%500 == 0)&(train[column] != 0)).astype('int') 188 | 189 | for column in ['month_traffic', 'last_month_traffic', 'local_trafffic_month']: 190 | train[column+'_is_int'] = ((train[column]%1 == 0)&(train[column] != 0)).astype('int') 191 | train[column+'_is_512'] = ((train[column]%512 == 0)&(train[column] != 0)).astype('int') 192 | train[column+'_is_50'] = ((train[column]%50 == 0)&(train[column] != 0)).astype('int') 193 | train[column+'_is_double'] = ((train[column]%512%50 == 0)&(train[column] != 0)&(train[column+'_is_512'] == 0)&(train[column+'_is_50'] == 0)).astype('int') 194 | train = one_hot_encoder(train,column,n=2000,nan_as_category=True) 195 | 196 | train['service12'] = train['service2_caller_time']+train['service1_caller_time'] 197 | for column in ['local_caller_time', 'service1_caller_time', 'service2_caller_time','service12']: 198 | train[column+'_decimal'] = round(((round(train[column])- train[column])*60)).astype('int') 199 | train[column+'_decimal_is_int'] = ((train[column+'_decimal']==0)&(train[column] != 0)).astype('int') 200 | 201 | train = one_hot_encoder(train,'online_time',n=5000,nan_as_category=True) 202 | train = one_hot_encoder(train,'contract_time',n=5000,nan_as_category=True) 203 | 204 | print(train.shape) 205 | train = one_hot_encoder(train,'contract_type',n=1,nan_as_category=True) 206 | 207 | 208 | 209 | #lable 映射 210 | train['current_service'] = train['current_service'].map(current_service2label) 211 | 212 | 213 | train['age'] = train['age'].fillna(-20) 214 | train['cut_age'] = train['age'].apply(lambda x: int(x/10)) 215 | train['cut_online'] = (train['online_time'] / 12).astype(int) 216 | 217 | 218 | 219 | train['4-fea-dealta'] = train['4_total_fee'] - train['3_total_fee'] 220 | train['3-fea-dealta'] = train['3_total_fee'] - train['2_total_fee'] 221 | train['2-fea-dealta'] = train['2_total_fee'] - train['1_total_fee'] 222 | train['1-fea-dealta'] = train['4_total_fee'] - train['1_total_fee'] 223 | 224 | train['4-fea-dealta_'] = train['4_total_fee'] / (train['3_total_fee']+0.00001) 225 | train['3-fea-dealta_'] = train['3_total_fee'] / (train['2_total_fee']+0.00001) 226 | train['2-fea-dealta_'] = train['2_total_fee'] / (train['1_total_fee']+0.00001) 227 | train['1-fea-dealta_'] = train['4_total_fee'] / (train['1_total_fee']+0.00001) 228 | train['pay_num-dealta_'] = train['pay_num'] / (train['1_total_fee']+0.00001) 229 | 230 | 231 | 232 | train['month_traffic_delata'] = train['month_traffic'] - train['last_month_traffic'] 233 | train['month_traffic_delata_'] = train['month_traffic'] / (train['last_month_traffic']+0.00001) 234 | train['2month_traffic_sum'] = train['month_traffic'] + train['last_month_traffic'] 235 | train['add_month_traffic'] = train['month_traffic'] - train['local_trafffic_month'] 236 | train['add_month_traffic_'] = train['month_traffic'] / (train['local_trafffic_month']+0.00001) 237 | 238 | train['service1_caller_time_delata'] = train['service1_caller_time'] / (train['service2_caller_time']+0.00001) 239 | train['service1_caller_time_delata2'] = train['service1_caller_time'] / (train['local_caller_time']+0.00001) 240 | train['service2_caller_time_delata_'] = train['service2_caller_time'] / (train['local_caller_time']+0.00001) 241 | train['local_caller_time_reatio'] = train['local_caller_time']/(train['service1_caller_time']+train['service2_caller_time']+0.00001) 242 | 243 | train['div_online_time_contract'] = train['contract_time'] / (train['online_time']+0.00001) 244 | train['div_online_time_contract'] = train['contract_time'] - train['online_time'] 245 | 246 | 247 | train['div_former_complaint_num'] = train['former_complaint_num'] / (train['pay_times']+0.00001) 248 | train['div_former_complaint_num'] = train['former_complaint_num'] - train['pay_times'] 249 | 250 | 251 | train['fea-sum'] = train[[str(1+i) +'_total_fee' for i in range(4)]].sum(axis = 1) 252 | train['fea-var'] = train[[str(1+i) +'_total_fee' for i in range(4)]].var(axis = 1) 253 | train['fea-max'] = train[[str(1+i) +'_total_fee' for i in range(4)]].max(axis = 1) 254 | train['fea-min'] = train[[str(1+i) +'_total_fee' for i in range(4)]].min(axis = 1) 255 | train['fea-mean4'] = train[[str(1+i) +'_total_fee' for i in range(4)]].sum(axis = 1) 256 | train['fea-mean3'] = train[[str(1+i) +'_total_fee' for i in range(3)]].sum(axis = 1) 257 | train['fea-mean2'] = train[[str(1+i) +'_total_fee' for i in range(2)]].sum(axis = 1) 258 | train['fea-extra'] = train['fea-sum']-4*train['fea-min'] 259 | train['1_total_fee_extra_for_min'] = train['1_total_fee']-train['fea-min'] 260 | train['fea_unum'] = train[['1_total_fee','2_total_fee','3_total_fee', '4_total_fee']].nunique(axis=1) 261 | 262 | train['call_time_sum'] = train[call_time].sum(axis = 1) 263 | train['call_time_var'] = train[call_time].var(axis = 1) 264 | train['call_time_min'] = train[call_time].min(axis = 1) 265 | train['call_time_max'] = train[call_time].max(axis = 1) 266 | 267 | train['traffic_sum'] = train[traffic].sum(axis = 1) 268 | train['traffic_var'] = train[traffic].var(axis = 1) 269 | train['traffic_min'] = train[traffic].min(axis = 1) 270 | train['traffic_max'] = train[traffic].max(axis = 1) 271 | 272 | 273 | train['average_pay'] = train['pay_num'] / train['pay_times'] 274 | 275 | 276 | train['div_traffic_price_2'] = train['last_month_traffic']/ 1000 / train['2_total_fee'] 277 | train['div_traffic_price_3'] = train['local_trafffic_month']/ 1000 / train['1_total_fee'] 278 | train['div_add_month_traffic_price'] = train['add_month_traffic']/ 1000 / train['1_total_fee'] 279 | train['div_local_caller_time_price'] = train['local_trafffic_month'] / 1000/ train['1_total_fee'] 280 | 281 | 282 | train['1-min-fea-dealta_div'] = train['1-min-fea-dealta']/(train['service1_caller_time']+0.0001) 283 | train['div_service1_caller_time_price'] = train['service1_caller_time'] / train['1_total_fee'] 284 | train['div_local_caller_time'] = train['local_caller_time'] / train['1_total_fee'] 285 | train['div_call_time_sum_price'] = train['call_time_sum'] / train['1_total_fee'] 286 | train['1_total_fee_maybe_real_calller'] = train['1_total_fee']- train['service1_caller_time']*0.15 287 | train['1_total_fee_maybe_real_calller2'] = train['1_total_fee']- train['service1_caller_time']*0.1 288 | train['1_total_fee_extra_for_min_caller_time'] = train['1_total_fee_extra_for_min']/(train['service1_caller_time']+0.001) 289 | 290 | train['div_service1_caller_time'] = train['service1_caller_time']/train['last_month_traffic'] 291 | train['div_local_caller_time'] = train['local_caller_time']/train['last_month_traffic'] 292 | train['div_local_caller_time2'] = train['local_caller_time']/train['month_traffic'] 293 | 294 | 295 | train['avg_complain_fee'] = train['former_complaint_fee'] / (train['former_complaint_num'] + 0.000000001) 296 | 297 | 298 | result = [] 299 | 300 | result.append(get_feat_ngroup(train,['cut_age','gender'])) 301 | for size_feat in ['1_total_fee','2_total_fee','3_total_fee', '4_total_fee','pay_num', 302 | 'last_month_traffic','month_traffic','local_trafffic_month', 303 | 'local_caller_time','service1_caller_time','service2_caller_time']: 304 | result.append(get_feat_size(train,[size_feat])) 305 | 306 | 307 | result.append(get_feat_stat_feat(train, ['contract_type'], ['1_total_fee'], ['max'])) 308 | result.append(get_feat_stat_feat(train, ['contract_type'], ['2_total_fee'], ['mean'])) 309 | result.append(get_feat_stat_feat(train, ['contract_type'], ['last_month_traffic'], ['var','mean'])) 310 | result.append(get_feat_stat_feat(train, ['contract_type'], ['call_time_sum'], ['mean'])) 311 | 312 | for base_feat in [['contract_type']]: 313 | for other_feat in ['1_total_fee', 'pay_num', 314 | 'month_traffic', 'last_month_traffic', 'local_trafffic_month', 315 | 'local_caller_time', 'service1_caller_time', 'service2_caller_time', 316 | ]: 317 | stat_list = ['mean'] 318 | tmp = get_feat_stat_feat(train,base_feat,[other_feat],stat_list=stat_list) 319 | name = tmp.columns[0] 320 | train[name] = tmp 321 | train[name+'_comp'] = train[other_feat].values-train[name].values 322 | 323 | 324 | train['1_total_fee_ratio'] = train['1_total_fee']/(train['fea-sum']+0.000001) 325 | train['3_total_fee_ratio'] = train['3_total_fee']/(train['fea-sum']+0.000001) 326 | train['call_time_sum_ratio'] = train['call_time_sum']/(train['traffic_sum']+0.000001) 327 | train['call_time_sum_ratio2'] = train['call_time_sum']/(train['fea-sum']+0.000001) 328 | train['traffic_sum_ratio1'] = train['traffic_sum']/(train['fea-sum']+0.000001) 329 | 330 | ####################################lgb和metric函数################################################### 331 | 332 | def f1_score_vali(preds, data_vali): 333 | labels = data_vali.get_label() 334 | preds = np.argmax(preds.reshape(11, -1),axis=0) 335 | score_vali = f1_score(y_true=labels,y_pred=preds,average='macro') 336 | return 'macro_f1_score', score_vali, True 337 | 338 | def evaluate_macroF1_lgb(data_vali, preds): 339 | labels = data_vali.astype(int) 340 | preds = np.array(preds) 341 | preds = np.argmax(preds,axis=1) 342 | score_vali = f1_score(y_true=labels,y_pred=preds,average='macro') 343 | return score_vali 344 | 345 | def kfold_lightgbm(params,df, predictors,target,num_folds, stratified = True, 346 | objective='', metrics='',debug= False, 347 | feval = f1_score_vali, early_stopping_rounds=100, num_boost_round=100, verbose_eval=50, categorical_features=None,sklearn_mertric = evaluate_macroF1_lgb ): 348 | 349 | lgb_params = params 350 | 351 | train_df = df[df[target].notnull()] 352 | test_df = df[df[target].isnull()] 353 | 354 | # Divide in training/validation and test data 355 | print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df[predictors].shape, test_df[predictors].shape)) 356 | del df 357 | gc.collect() 358 | # Cross validation model 359 | if stratified: 360 | folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=1234) 361 | else: 362 | folds = KFold(n_splits= num_folds, shuffle=True, random_state=1234) 363 | # folds = GroupKFold(n_splits=5) 364 | # Create arrays and dataframes to store results 365 | oof_preds = np.zeros((train_df.shape[0],11)) 366 | sub_preds = np.zeros((test_df.shape[0],11)) 367 | feature_importance_df = pd.DataFrame() 368 | feats = predictors 369 | cv_resul = [] 370 | ''' 371 | perm = [i for i in range(len(train_df))] 372 | perm = pd.DataFrame(perm) 373 | perm.columns = ['index_'] 374 | 375 | for n_fold in range(5): 376 | train_idx = np.array(perm[train_df['cv'] != n_fold]['index_']) 377 | valid_idx = np.array(perm[train_df['cv'] == n_fold]['index_']) 378 | ''' 379 | for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df[target])): 380 | if (USE_KFOLD == False) and (n_fold == 1): 381 | break 382 | train_x, train_y = train_df[feats].iloc[train_idx], train_df[target].iloc[train_idx] 383 | valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df[target].iloc[valid_idx] 384 | 385 | train_x = pd.concat([train_x,train_old[feats]]) 386 | train_y = pd.concat([train_y,train_old[target]]) 387 | 388 | train_y_t = train_y.values 389 | valid_y_t = valid_y.values 390 | print(train_y_t) 391 | xgtrain = lgb.Dataset(train_x.values, label = train_y_t, 392 | feature_name=predictors, 393 | categorical_feature=categorical_features 394 | ) 395 | xgvalid = lgb.Dataset(valid_x.values, label = valid_y_t, 396 | feature_name=predictors, 397 | categorical_feature=categorical_features 398 | ) 399 | 400 | clf = lgb.train(lgb_params, 401 | xgtrain, 402 | valid_sets=[xgvalid],#, xgtrain], 403 | valid_names=['valid'],#,'train'], 404 | num_boost_round=num_boost_round, 405 | early_stopping_rounds=early_stopping_rounds, 406 | verbose_eval=verbose_eval, 407 | # feval=feval 408 | ) 409 | 410 | 411 | 412 | oof_preds[valid_idx] = clf.predict(valid_x, num_iteration=clf.best_iteration) 413 | sub_preds += clf.predict(test_df[feats], num_iteration=clf.best_iteration)/ folds.n_splits 414 | 415 | 416 | gain = clf.feature_importance('gain') 417 | fold_importance_df = pd.DataFrame({'feature':clf.feature_name(), 418 | 'split':clf.feature_importance('split'), 419 | 'gain':100*gain/gain.sum(), 420 | 'fold':n_fold, 421 | }).sort_values('gain',ascending=False) 422 | feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0) 423 | 424 | result = evaluate_macroF1_lgb(valid_y, oof_preds[valid_idx]) 425 | # result = clf.best_score['valid']['macro_f1_score'] 426 | print('Fold %2d macro-f1 : %.6f' % (n_fold + 1, result)) 427 | cv_resul.append(round(result,5)) 428 | gc.collect() 429 | 430 | #score = np.array(cv_resul).mean()\ 431 | score = 'model_3_1' 432 | if USE_KFOLD: 433 | #print('Full f1 score %.6f' % score) 434 | for i in range(11): 435 | train_df["class_" + str(i)] = oof_preds[:,i] 436 | test_df["class_" + str(i)] = sub_preds[:,i] 437 | train_df[['user_id'] + ["class_" + str(i) for i in range(11)]].to_csv('./cv/val_prob_{}.csv'.format(score), index= False, float_format = '%.4f') 438 | test_df[['user_id'] + ["class_" + str(i) for i in range(11)]].to_csv('./cv/sub_prob_{}.csv'.format(score), index= False, float_format = '%.4f') 439 | oof_preds = [np.argmax(x)for x in oof_preds] 440 | sub_preds = [np.argmax(x)for x in sub_preds] 441 | train_df[target] = oof_preds 442 | test_df[target] = sub_preds 443 | print(test_df[target].mean()) 444 | train_df[target] = oof_preds 445 | train_df[target] = train_df[target].map(label2current_service) 446 | test_df[target] = sub_preds 447 | test_df[target] = test_df[target].map(label2current_service) 448 | print('all_cv', cv_resul) 449 | train_df[['user_id', target]].to_csv('./sub/val_{}.csv'.format(score), index= False) 450 | test_df[['user_id', target]].to_csv('./sub/sub_{}.csv'.format(score), index= False) 451 | print("test_df mean:") 452 | 453 | display_importances(feature_importance_df,score) 454 | 455 | 456 | 457 | def display_importances(feature_importance_df_,score): 458 | ft = feature_importance_df_[["feature", "split","gain"]].groupby("feature").mean().sort_values(by="gain", ascending=False) 459 | print(ft.head(60)) 460 | ft.to_csv('importance_lightgbm_{}.csv'.format(score),index=True) 461 | cols = ft[:40].index 462 | best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)] 463 | 464 | 465 | ####################################计算################################################################# 466 | 467 | 468 | params = { 469 | 'metric': 'multi_logloss', 470 | 'num_class':11, 471 | 'boosting_type': 'gbdt', 472 | 'objective': 'multiclass', 473 | 'feature_fraction': 0.7, 474 | 'learning_rate': 0.02, 475 | 'bagging_fraction': 0.7, 476 | #'bagging_freq': 2, 477 | 'num_leaves': 64, 478 | 'max_depth': -1, 479 | 'num_threads': 16, 480 | 'seed': 2018, 481 | 'verbose': -1, 482 | #'is_unbalance':True, 483 | } 484 | 485 | 486 | categorical_columns = [ 487 | 'contract_type', 488 | 'net_service', 489 | 'gender'] 490 | for feature in categorical_columns: 491 | print(f'Transforming {feature}...') 492 | encoder = LabelEncoder() 493 | train[feature] = encoder.fit_transform(train[feature].astype(str)) 494 | 495 | 496 | x = [] 497 | no_use = ['current_service', 'user_id','group', 498 | 499 | ] + x 500 | 501 | 502 | 503 | 504 | categorical_columns = [] 505 | all_data_frame = [] 506 | all_data_frame.append(train) 507 | 508 | for aresult in result: 509 | all_data_frame.append(aresult) 510 | 511 | train = concat(all_data_frame) 512 | feats = [f for f in train.columns if f not in no_use] 513 | categorical_columns = [f for f in categorical_columns if f not in no_use] 514 | 515 | train_old = train.iloc[shape1:shape2] 516 | train = train.iloc[:shape1] 517 | train = train[train.service_type==1] 518 | train_old = train_old[train_old.service_type==1] 519 | clf = kfold_lightgbm(params,train,feats,'current_service' ,5 , num_boost_round=4000, categorical_features=categorical_columns) 520 | -------------------------------------------------------------------------------- /model3_4.py: -------------------------------------------------------------------------------- 1 | #import dask.dataframe as dd 2 | #from dask.multiprocessing import get 3 | import itertools 4 | import numpy as np 5 | import pandas as pd 6 | import gc 7 | import time 8 | from contextlib import contextmanager 9 | import lightgbm as lgb 10 | from sklearn.metrics import roc_auc_score, roc_curve, mean_squared_error,mean_absolute_error, f1_score 11 | from sklearn.model_selection import KFold, StratifiedKFold,GroupKFold 12 | #import matplotlib.pyplot as plt 13 | #import seaborn as sns 14 | import warnings 15 | from sklearn.preprocessing import LabelEncoder 16 | from utils import * 17 | #from utils2 import * 18 | #from utils3 import * 19 | from datetime import datetime 20 | from datetime import timedelta 21 | #from tqdm import tqdm 22 | #test 23 | 24 | warnings.simplefilter(action='ignore', category=FutureWarning) 25 | 26 | 27 | USE_KFOLD = True 28 | 29 | data_path = './input/' 30 | 31 | ####################################读入文件#################################################### 32 | #要准备hzs的两个get most文件 33 | def astype(x,t): 34 | try: 35 | return t(x) 36 | except: 37 | return np.nan 38 | 39 | def have_0(x): 40 | try: 41 | r = x.split('.')[1][-1] 42 | return 0 if r=='0' else 1 43 | except: 44 | return 1 45 | 46 | str_dict = {'1_total_fee': 'str', 47 | '2_total_fee': 'str', 48 | '3_total_fee': 'str', 49 | '4_total_fee': 'str', 50 | 'pay_num': 'str', 51 | } 52 | 53 | 54 | have_0_c = ['1_total_fee', 55 | '2_total_fee', 56 | '3_total_fee', 57 | '4_total_fee', 58 | 'pay_num'] 59 | 60 | def deal(data): 61 | for c in have_0_c: 62 | data['have_0_{}'.format(c)] = data[c].apply(have_0) 63 | try: 64 | data[c] = data[c].astype(float) 65 | except: 66 | pass 67 | data['2_total_fee'] = data['2_total_fee'].apply(lambda x: astype(x,float)) 68 | data['3_total_fee'] = data['3_total_fee'].apply(lambda x: astype(x,float)) 69 | data['age'] = data['age'].apply(lambda x: astype(x,int)) 70 | data['gender'] = data['gender'].apply(lambda x: astype(x,int)) 71 | data.loc[data['age']==0,'age'] = np.nan 72 | data.loc[data['1_total_fee'] < 0, '1_total_fee'] = np.nan 73 | data.loc[data['2_total_fee'] < 0, '2_total_fee'] = np.nan 74 | data.loc[data['3_total_fee'] < 0, '3_total_fee'] = np.nan 75 | data.loc[data['4_total_fee'] < 0, '4_total_fee'] = np.nan 76 | for c in [ 77 | '1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee', 78 | 'month_traffic', 'last_month_traffic', 'local_trafffic_month', 79 | 'local_caller_time', 'service1_caller_time', 'service2_caller_time', 80 | 'many_over_bill', 'contract_type', 'contract_time', 'pay_num', ]: 81 | data[c] = data[c].round(4) 82 | return data 83 | 84 | train = pd.read_csv(data_path + 'train.csv',dtype=str_dict) 85 | train = deal(train) 86 | train.drop_duplicates(subset = ['1_total_fee','2_total_fee','3_total_fee', 87 | 'month_traffic','pay_times','last_month_traffic','service2_caller_time','age'],inplace=True) 88 | train = train[train['current_service'] != 999999] 89 | test = pd.read_csv(data_path + 'test.csv',dtype=str_dict) 90 | test = deal(test) 91 | 92 | train_old = pd.read_csv('./input/train_old.csv',dtype=str_dict)[:] 93 | train_old = deal(train_old) 94 | train_old.drop_duplicates(subset = ['1_total_fee','2_total_fee','3_total_fee', 95 | 'month_traffic','pay_times','last_month_traffic','service2_caller_time','age'],inplace=True) 96 | 97 | 98 | 99 | print(len(train)) 100 | 101 | 102 | label2current_service =dict(zip(range(0,len(set(train['current_service']))),sorted(list(set(train['current_service']))))) 103 | current_service2label =dict(zip(sorted(list(set(train['current_service']))),range(0,len(set(train['current_service']))))) 104 | print(len(label2current_service)) 105 | train = train.append(test).reset_index(drop = True) 106 | print(len(train)) 107 | shape1 = len(train) 108 | #train['is_b'] = 1 109 | #train_old['is_b'] = 0 110 | train = train.append(train_old).reset_index(drop = True) 111 | print(len(train)) 112 | shape2 = len(train) 113 | 114 | get_most = pd.read_csv('Magic_Feature_Exclude_Old.csv') 115 | get_most2 = pd.read_csv('Magic_Feature_Include_Old.csv') 116 | 117 | ####################################特征工程################################################### 118 | 119 | call_time = ['local_caller_time', 'service1_caller_time', 'service2_caller_time'] 120 | traffic = ['month_traffic','last_month_traffic','local_trafffic_month'] 121 | cat_cols = ['service_type','contract_type', 'net_service', 'gender', 'complaint_level', 122 | #3 #9,8 #4 #3 #4 123 | 'is_mix_service', 'many_over_bill', 'is_promise_low_consume', #2 124 | ] 125 | continus_col = [ 126 | '1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee', 'pay_num','former_complaint_fee', 127 | 128 | 'month_traffic', 'last_month_traffic', 'local_trafffic_month', 129 | 130 | 'local_caller_time', 'service1_caller_time', 'service2_caller_time', 131 | 132 | 'online_time','contract_time', 133 | 134 | 'pay_times', 'former_complaint_num' 135 | ] 136 | def one_hot_encoder(train,column,n=100,nan_as_category=False): 137 | tmp = train[column].value_counts().to_frame() 138 | values = list(tmp[tmp[column]>n].index) 139 | train.loc[train[column].isin(values),column+'N'] = train.loc[train[column].isin(values),column] 140 | train = pd.get_dummies(train, columns=[column+'N'], dummy_na=False) 141 | return train 142 | # 143 | 144 | 145 | train['fea-min'] = train[[str(1+i) +'_total_fee' for i in range(4)]].min(axis = 1) 146 | 147 | for column in ['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee', 'fea-min']: 148 | get_most.columns = [column,column+'_most'] 149 | train = train.merge(get_most,on=column,how='left') 150 | 151 | for column in ['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee', 'fea-min']: 152 | get_most2.columns = [column,column+'_most2'] 153 | train = train.merge(get_most2,on=column,how='left') 154 | 155 | for column in ['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee', 'pay_num','fea-min']: 156 | train[column+'_int'] = train[column].fillna(-1).astype('int') 157 | train[column+'_int_last'] = train[column+'_int']%10 #last int 158 | train[column+'_decimal'] = round(((train[column]-train[column+'_int'])*100).fillna(-1)).astype('int') #decimal 159 | train[column+'_decimal_is_0'] = (train[column+'_decimal']==0).astype('int') 160 | train[column+'_decimal_is_5'] = (train[column+'_decimal']%5==0).astype('int') 161 | train[column+'_decimal_last'] = train[column+'_decimal']%10 162 | train[column+'_decimal_last2'] = train[column+'_decimal']//5 163 | train[column+'_extra_fee'] = ((train[column]*100)-600)%1000 164 | train[column+'_27perMB'] = ((train[column+'_extra_fee']%27 == 0)&(train[column+'_extra_fee'] != 0)).astype('int') 165 | train[column+'_15perMB'] = ((train[column+'_extra_fee']%15 == 0)&(train[column+'_extra_fee'] != 0)).astype('int') 166 | train = one_hot_encoder(train,column,n=2000,nan_as_category=True) 167 | train['pay_num_last2'] = train['pay_num_int']%100 168 | train['former_complaint_fee_last2'] = round(train['former_complaint_fee'])%100 169 | 170 | 171 | train['4-fea-dealta'] = round((train['4_total_fee'] - train['3_total_fee'])*100).fillna(999999.9).astype('int') 172 | train['3-fea-dealta'] = round((train['3_total_fee'] - train['2_total_fee'])*100).fillna(999999.9).astype('int') 173 | train['2-fea-dealta'] = round((train['2_total_fee'] - train['1_total_fee'])*100).fillna(999999.9).astype('int') 174 | train['1-fea-dealta'] = round((train['4_total_fee'] - train['1_total_fee'])*100).fillna(999999.9).astype('int') 175 | train['1-3-fea-dealta'] = round((train['3_total_fee'] - train['1_total_fee'])*100).fillna(999999.9).astype('int') 176 | train['1-min-fea-dealta'] = round((train['1_total_fee'] - train['fea-min'])*100).fillna(999999.9).astype('int') 177 | for column in ['4-fea-dealta', '3-fea-dealta', '2-fea-dealta', '1-fea-dealta','1-3-fea-dealta','1-min-fea-dealta']: 178 | train[column+'_is_0'] = (train[column]==0).astype('int') 179 | train[column+'_is_6000'] = ((train[column]%6000 == 0)&(train[column] != 0)).astype('int') 180 | train[column+'_is_5'] = ((train[column]%5 == 0)&(train[column] != 0)).astype('int') 181 | train[column+'_is_10'] = ((train[column]%10 == 0)&(train[column] != 0)).astype('int') 182 | train[column+'_is_15'] = ((train[column]%15 == 0)&(train[column] != 0)).astype('int') 183 | train[column+'_is_27'] = ((train[column]%27 == 0)&(train[column] != 0)).astype('int') 184 | train[column+'_is_30'] = ((train[column]%30 == 0)&(train[column] != 0)).astype('int') 185 | train[column+'_is_50'] = ((train[column]%50 == 0)&(train[column] != 0)).astype('int') 186 | train[column+'_is_100'] = ((train[column]%100 == 0)&(train[column] != 0)).astype('int') 187 | train[column+'_is_500'] = ((train[column]%500 == 0)&(train[column] != 0)).astype('int') 188 | 189 | for column in ['month_traffic', 'last_month_traffic', 'local_trafffic_month']: 190 | train[column+'_is_int'] = ((train[column]%1 == 0)&(train[column] != 0)).astype('int') 191 | train[column+'_is_512'] = ((train[column]%512 == 0)&(train[column] != 0)).astype('int') 192 | train[column+'_is_50'] = ((train[column]%50 == 0)&(train[column] != 0)).astype('int') 193 | train[column+'_is_double'] = ((train[column]%512%50 == 0)&(train[column] != 0)&(train[column+'_is_512'] == 0)&(train[column+'_is_50'] == 0)).astype('int') 194 | train = one_hot_encoder(train,column,n=2000,nan_as_category=True) 195 | 196 | train['service12'] = train['service2_caller_time']+train['service1_caller_time'] 197 | for column in ['local_caller_time', 'service1_caller_time', 'service2_caller_time','service12']: 198 | train[column+'_decimal'] = round(((round(train[column])- train[column])*60)).astype('int') 199 | train[column+'_decimal_is_int'] = ((train[column+'_decimal']==0)&(train[column] != 0)).astype('int') 200 | 201 | train = one_hot_encoder(train,'online_time',n=5000,nan_as_category=True) 202 | train = one_hot_encoder(train,'contract_time',n=5000,nan_as_category=True) 203 | 204 | print(train.shape) 205 | train = one_hot_encoder(train,'contract_type',n=1,nan_as_category=True) 206 | 207 | 208 | 209 | #lable 映射 210 | train['current_service'] = train['current_service'].map(current_service2label) 211 | 212 | 213 | train['age'] = train['age'].fillna(-20) 214 | train['cut_age'] = train['age'].apply(lambda x: int(x/10)) 215 | train['cut_online'] = (train['online_time'] / 12).astype(int) 216 | 217 | 218 | 219 | train['4-fea-dealta'] = train['4_total_fee'] - train['3_total_fee'] 220 | train['3-fea-dealta'] = train['3_total_fee'] - train['2_total_fee'] 221 | train['2-fea-dealta'] = train['2_total_fee'] - train['1_total_fee'] 222 | train['1-fea-dealta'] = train['4_total_fee'] - train['1_total_fee'] 223 | 224 | train['4-fea-dealta_'] = train['4_total_fee'] / (train['3_total_fee']+0.00001) 225 | train['3-fea-dealta_'] = train['3_total_fee'] / (train['2_total_fee']+0.00001) 226 | train['2-fea-dealta_'] = train['2_total_fee'] / (train['1_total_fee']+0.00001) 227 | train['1-fea-dealta_'] = train['4_total_fee'] / (train['1_total_fee']+0.00001) 228 | train['pay_num-dealta_'] = train['pay_num'] / (train['1_total_fee']+0.00001) 229 | 230 | 231 | 232 | train['month_traffic_delata'] = train['month_traffic'] - train['last_month_traffic'] 233 | train['month_traffic_delata_'] = train['month_traffic'] / (train['last_month_traffic']+0.00001) 234 | train['2month_traffic_sum'] = train['month_traffic'] + train['last_month_traffic'] 235 | train['add_month_traffic'] = train['month_traffic'] - train['local_trafffic_month'] 236 | train['add_month_traffic_'] = train['month_traffic'] / (train['local_trafffic_month']+0.00001) 237 | 238 | train['service1_caller_time_delata'] = train['service1_caller_time'] / (train['service2_caller_time']+0.00001) 239 | train['service1_caller_time_delata2'] = train['service1_caller_time'] / (train['local_caller_time']+0.00001) 240 | train['service2_caller_time_delata_'] = train['service2_caller_time'] / (train['local_caller_time']+0.00001) 241 | train['local_caller_time_reatio'] = train['local_caller_time']/(train['service1_caller_time']+train['service2_caller_time']+0.00001) 242 | 243 | train['div_online_time_contract'] = train['contract_time'] / (train['online_time']+0.00001) 244 | train['div_online_time_contract'] = train['contract_time'] - train['online_time'] 245 | 246 | 247 | train['div_former_complaint_num'] = train['former_complaint_num'] / (train['pay_times']+0.00001) 248 | train['div_former_complaint_num'] = train['former_complaint_num'] - train['pay_times'] 249 | 250 | 251 | train['fea-sum'] = train[[str(1+i) +'_total_fee' for i in range(4)]].sum(axis = 1) 252 | train['fea-var'] = train[[str(1+i) +'_total_fee' for i in range(4)]].var(axis = 1) 253 | train['fea-max'] = train[[str(1+i) +'_total_fee' for i in range(4)]].max(axis = 1) 254 | train['fea-min'] = train[[str(1+i) +'_total_fee' for i in range(4)]].min(axis = 1) 255 | train['fea-mean4'] = train[[str(1+i) +'_total_fee' for i in range(4)]].sum(axis = 1) 256 | train['fea-mean3'] = train[[str(1+i) +'_total_fee' for i in range(3)]].sum(axis = 1) 257 | train['fea-mean2'] = train[[str(1+i) +'_total_fee' for i in range(2)]].sum(axis = 1) 258 | train['fea-extra'] = train['fea-sum']-4*train['fea-min'] 259 | train['1_total_fee_extra_for_min'] = train['1_total_fee']-train['fea-min'] 260 | train['fea_unum'] = train[['1_total_fee','2_total_fee','3_total_fee', '4_total_fee']].nunique(axis=1) 261 | 262 | train['call_time_sum'] = train[call_time].sum(axis = 1) 263 | train['call_time_var'] = train[call_time].var(axis = 1) 264 | train['call_time_min'] = train[call_time].min(axis = 1) 265 | train['call_time_max'] = train[call_time].max(axis = 1) 266 | 267 | train['traffic_sum'] = train[traffic].sum(axis = 1) 268 | train['traffic_var'] = train[traffic].var(axis = 1) 269 | train['traffic_min'] = train[traffic].min(axis = 1) 270 | train['traffic_max'] = train[traffic].max(axis = 1) 271 | 272 | 273 | train['average_pay'] = train['pay_num'] / train['pay_times'] 274 | 275 | 276 | train['div_traffic_price_2'] = train['last_month_traffic']/ 1000 / train['2_total_fee'] 277 | train['div_traffic_price_3'] = train['local_trafffic_month']/ 1000 / train['1_total_fee'] 278 | train['div_add_month_traffic_price'] = train['add_month_traffic']/ 1000 / train['1_total_fee'] 279 | train['div_local_caller_time_price'] = train['local_trafffic_month'] / 1000/ train['1_total_fee'] 280 | 281 | 282 | train['1-min-fea-dealta_div'] = train['1-min-fea-dealta']/(train['service1_caller_time']+0.0001) 283 | train['div_service1_caller_time_price'] = train['service1_caller_time'] / train['1_total_fee'] 284 | train['div_local_caller_time'] = train['local_caller_time'] / train['1_total_fee'] 285 | train['div_call_time_sum_price'] = train['call_time_sum'] / train['1_total_fee'] 286 | train['1_total_fee_maybe_real_calller'] = train['1_total_fee']- train['service1_caller_time']*0.15 287 | train['1_total_fee_maybe_real_calller2'] = train['1_total_fee']- train['service1_caller_time']*0.1 288 | train['1_total_fee_extra_for_min_caller_time'] = train['1_total_fee_extra_for_min']/(train['service1_caller_time']+0.001) 289 | 290 | train['div_service1_caller_time'] = train['service1_caller_time']/train['last_month_traffic'] 291 | train['div_local_caller_time'] = train['local_caller_time']/train['last_month_traffic'] 292 | train['div_local_caller_time2'] = train['local_caller_time']/train['month_traffic'] 293 | 294 | 295 | train['avg_complain_fee'] = train['former_complaint_fee'] / (train['former_complaint_num'] + 0.000000001) 296 | 297 | 298 | result = [] 299 | 300 | result.append(get_feat_ngroup(train,['cut_age','gender'])) 301 | for size_feat in ['1_total_fee','2_total_fee','3_total_fee', '4_total_fee','pay_num', 302 | 'last_month_traffic','month_traffic','local_trafffic_month', 303 | 'local_caller_time','service1_caller_time','service2_caller_time']: 304 | result.append(get_feat_size(train,[size_feat])) 305 | 306 | 307 | result.append(get_feat_stat_feat(train, ['contract_type'], ['1_total_fee'], ['max'])) 308 | result.append(get_feat_stat_feat(train, ['contract_type'], ['2_total_fee'], ['mean'])) 309 | result.append(get_feat_stat_feat(train, ['contract_type'], ['last_month_traffic'], ['var','mean'])) 310 | result.append(get_feat_stat_feat(train, ['contract_type'], ['call_time_sum'], ['mean'])) 311 | 312 | for base_feat in [['contract_type']]: 313 | for other_feat in ['1_total_fee', 'pay_num', 314 | 'month_traffic', 'last_month_traffic', 'local_trafffic_month', 315 | 'local_caller_time', 'service1_caller_time', 'service2_caller_time', 316 | ]: 317 | stat_list = ['mean'] 318 | tmp = get_feat_stat_feat(train,base_feat,[other_feat],stat_list=stat_list) 319 | name = tmp.columns[0] 320 | train[name] = tmp 321 | train[name+'_comp'] = train[other_feat].values-train[name].values 322 | 323 | 324 | train['1_total_fee_ratio'] = train['1_total_fee']/(train['fea-sum']+0.000001) 325 | train['3_total_fee_ratio'] = train['3_total_fee']/(train['fea-sum']+0.000001) 326 | train['call_time_sum_ratio'] = train['call_time_sum']/(train['traffic_sum']+0.000001) 327 | train['call_time_sum_ratio2'] = train['call_time_sum']/(train['fea-sum']+0.000001) 328 | train['traffic_sum_ratio1'] = train['traffic_sum']/(train['fea-sum']+0.000001) 329 | 330 | ####################################lgb和metric函数################################################### 331 | 332 | def f1_score_vali(preds, data_vali): 333 | labels = data_vali.get_label() 334 | preds = np.argmax(preds.reshape(11, -1),axis=0) 335 | score_vali = f1_score(y_true=labels,y_pred=preds,average='macro') 336 | return 'macro_f1_score', score_vali, True 337 | 338 | def evaluate_macroF1_lgb(data_vali, preds): 339 | labels = data_vali.astype(int) 340 | preds = np.array(preds) 341 | preds = np.argmax(preds,axis=1) 342 | score_vali = f1_score(y_true=labels,y_pred=preds,average='macro') 343 | return score_vali 344 | 345 | def kfold_lightgbm(params,df, predictors,target,num_folds, stratified = True, 346 | objective='', metrics='',debug= False, 347 | feval = f1_score_vali, early_stopping_rounds=100, num_boost_round=100, verbose_eval=50, categorical_features=None,sklearn_mertric = evaluate_macroF1_lgb ): 348 | 349 | lgb_params = params 350 | 351 | train_df = df[df[target].notnull()] 352 | test_df = df[df[target].isnull()] 353 | 354 | # Divide in training/validation and test data 355 | print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df[predictors].shape, test_df[predictors].shape)) 356 | del df 357 | gc.collect() 358 | # Cross validation model 359 | if stratified: 360 | folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=1234) 361 | else: 362 | folds = KFold(n_splits= num_folds, shuffle=True, random_state=1234) 363 | # folds = GroupKFold(n_splits=5) 364 | # Create arrays and dataframes to store results 365 | oof_preds = np.zeros((train_df.shape[0],11)) 366 | sub_preds = np.zeros((test_df.shape[0],11)) 367 | feature_importance_df = pd.DataFrame() 368 | feats = predictors 369 | cv_resul = [] 370 | ''' 371 | perm = [i for i in range(len(train_df))] 372 | perm = pd.DataFrame(perm) 373 | perm.columns = ['index_'] 374 | 375 | for n_fold in range(5): 376 | train_idx = np.array(perm[train_df['cv'] != n_fold]['index_']) 377 | valid_idx = np.array(perm[train_df['cv'] == n_fold]['index_']) 378 | ''' 379 | for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df[target])): 380 | if (USE_KFOLD == False) and (n_fold == 1): 381 | break 382 | train_x, train_y = train_df[feats].iloc[train_idx], train_df[target].iloc[train_idx] 383 | valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df[target].iloc[valid_idx] 384 | 385 | train_x = pd.concat([train_x,train_old[feats]]) 386 | train_y = pd.concat([train_y,train_old[target]]) 387 | 388 | train_y_t = train_y.values 389 | valid_y_t = valid_y.values 390 | print(train_y_t) 391 | xgtrain = lgb.Dataset(train_x.values, label = train_y_t, 392 | feature_name=predictors, 393 | categorical_feature=categorical_features 394 | ) 395 | xgvalid = lgb.Dataset(valid_x.values, label = valid_y_t, 396 | feature_name=predictors, 397 | categorical_feature=categorical_features 398 | ) 399 | 400 | clf = lgb.train(lgb_params, 401 | xgtrain, 402 | valid_sets=[xgvalid],#, xgtrain], 403 | valid_names=['valid'],#,'train'], 404 | num_boost_round=num_boost_round, 405 | early_stopping_rounds=early_stopping_rounds, 406 | verbose_eval=verbose_eval, 407 | # feval=feval 408 | ) 409 | 410 | 411 | 412 | oof_preds[valid_idx] = clf.predict(valid_x, num_iteration=clf.best_iteration) 413 | sub_preds += clf.predict(test_df[feats], num_iteration=clf.best_iteration)/ folds.n_splits 414 | 415 | 416 | gain = clf.feature_importance('gain') 417 | fold_importance_df = pd.DataFrame({'feature':clf.feature_name(), 418 | 'split':clf.feature_importance('split'), 419 | 'gain':100*gain/gain.sum(), 420 | 'fold':n_fold, 421 | }).sort_values('gain',ascending=False) 422 | feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0) 423 | 424 | result = evaluate_macroF1_lgb(valid_y, oof_preds[valid_idx]) 425 | # result = clf.best_score['valid']['macro_f1_score'] 426 | print('Fold %2d macro-f1 : %.6f' % (n_fold + 1, result)) 427 | cv_resul.append(round(result,5)) 428 | gc.collect() 429 | 430 | #score = np.array(cv_resul).mean() 431 | score = 'model_3_4' 432 | if USE_KFOLD: 433 | #print('Full f1 score %.6f' % score) 434 | for i in range(11): 435 | train_df["class_" + str(i)] = oof_preds[:,i] 436 | test_df["class_" + str(i)] = sub_preds[:,i] 437 | train_df[['user_id'] + ["class_" + str(i) for i in range(11)]].to_csv('./cv/val_prob_{}.csv'.format(score), index= False, float_format = '%.4f') 438 | test_df[['user_id'] + ["class_" + str(i) for i in range(11)]].to_csv('./cv/sub_prob_{}.csv'.format(score), index= False, float_format = '%.4f') 439 | oof_preds = [np.argmax(x)for x in oof_preds] 440 | sub_preds = [np.argmax(x)for x in sub_preds] 441 | train_df[target] = oof_preds 442 | test_df[target] = sub_preds 443 | print(test_df[target].mean()) 444 | train_df[target] = oof_preds 445 | train_df[target] = train_df[target].map(label2current_service) 446 | test_df[target] = sub_preds 447 | test_df[target] = test_df[target].map(label2current_service) 448 | print('all_cv', cv_resul) 449 | train_df[['user_id', target]].to_csv('./sub/val_{}.csv'.format(score), index= False) 450 | test_df[['user_id', target]].to_csv('./sub/sub_{}.csv'.format(score), index= False) 451 | print("test_df mean:") 452 | 453 | display_importances(feature_importance_df,score) 454 | 455 | 456 | 457 | def display_importances(feature_importance_df_,score): 458 | ft = feature_importance_df_[["feature", "split","gain"]].groupby("feature").mean().sort_values(by="gain", ascending=False) 459 | print(ft.head(60)) 460 | ft.to_csv('importance_lightgbm_{}.csv'.format(score),index=True) 461 | cols = ft[:40].index 462 | best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)] 463 | 464 | 465 | ####################################计算################################################################# 466 | 467 | 468 | params = { 469 | 'metric': 'multi_logloss', 470 | 'num_class':11, 471 | 'boosting_type': 'gbdt', 472 | 'objective': 'multiclass', 473 | 'feature_fraction': 0.7, 474 | 'learning_rate': 0.02, 475 | 'bagging_fraction': 0.7, 476 | #'bagging_freq': 2, 477 | 'num_leaves': 64, 478 | 'max_depth': -1, 479 | 'num_threads': 16, 480 | 'seed': 2018, 481 | 'verbose': -1, 482 | #'is_unbalance':True, 483 | } 484 | 485 | 486 | categorical_columns = [ 487 | 'contract_type', 488 | 'net_service', 489 | 'gender'] 490 | for feature in categorical_columns: 491 | print(f'Transforming {feature}...') 492 | encoder = LabelEncoder() 493 | train[feature] = encoder.fit_transform(train[feature].astype(str)) 494 | 495 | 496 | x = [] 497 | no_use = ['current_service', 'user_id','group', 498 | 499 | ] + x 500 | 501 | 502 | 503 | 504 | categorical_columns = [] 505 | all_data_frame = [] 506 | all_data_frame.append(train) 507 | 508 | for aresult in result: 509 | all_data_frame.append(aresult) 510 | 511 | train = concat(all_data_frame) 512 | feats = [f for f in train.columns if f not in no_use] 513 | categorical_columns = [f for f in categorical_columns if f not in no_use] 514 | 515 | train_old = train.iloc[shape1:shape2] 516 | train = train.iloc[:shape1] 517 | train = train[train.service_type!=1] 518 | train_old = train_old[train_old.service_type!=1] 519 | clf = kfold_lightgbm(params,train,feats,'current_service' ,5 , num_boost_round=4000, categorical_features=categorical_columns) 520 | -------------------------------------------------------------------------------- /model_pred_fee.py: -------------------------------------------------------------------------------- 1 | #import dask.dataframe as dd 2 | #from dask.multiprocessing import get 3 | import itertools 4 | import numpy as np 5 | import pandas as pd 6 | import gc 7 | import time 8 | from contextlib import contextmanager 9 | import lightgbm as lgb 10 | from sklearn.metrics import roc_auc_score, roc_curve, mean_squared_error,mean_absolute_error, f1_score 11 | from sklearn.model_selection import KFold, StratifiedKFold,GroupKFold 12 | #import matplotlib.pyplot as plt 13 | #import seaborn as sns 14 | import warnings 15 | from sklearn.preprocessing import LabelEncoder 16 | from utils import * 17 | ##from utils3 import * 18 | from datetime import datetime 19 | from datetime import timedelta 20 | #from tqdm import tqdm 21 | #test 22 | 23 | warnings.simplefilter(action='ignore', category=FutureWarning) 24 | 25 | @contextmanager 26 | def timer(title): 27 | t0 = time.time() 28 | yield 29 | print("{} - done in {:.0f}s".format(title, time.time() - t0)) 30 | 31 | USE_KFOLD = True 32 | 33 | def process_data(data,is_train=False): 34 | 35 | def parse_genser(x): 36 | if x == '01': 37 | return '0' 38 | elif x == '02': 39 | return '1' 40 | 41 | elif x == '00': 42 | return np.nan 43 | else: 44 | return x 45 | 46 | print(data.gender.value_counts()) 47 | data['gender'] = data['gender'].apply(lambda x: parse_genser(x)) 48 | print(data.gender.value_counts()) 49 | 50 | for col in ['gender', 'age']: 51 | data[col] = data[col].replace("\\N",np.nan) 52 | data[col] = data[col].astype('float') 53 | 54 | 55 | if is_train: 56 | pass 57 | 58 | return data 59 | 60 | 61 | train = pd.read_csv('./input/train.csv',dtype = {'gender':str})[:] 62 | train = train[train.current_service!=999999] 63 | #train = train.iloc[:520870] 64 | 65 | train = process_data(train,is_train=True) 66 | print(len(train)) 67 | 68 | test = pd.read_csv('./input/test.csv',dtype = { 69 | 'gender':str, 70 | '1_total_fee':str, 71 | '2_total_fee':str, 72 | '3_total_fee':str, 73 | '4_total_fee':str, 74 | 'month_traffic':str, 75 | 'last_month_traffic':str, 76 | 'local_trafffic_month':str, 77 | 'local_caller_time':str, 78 | 'service1_caller_time':str, 79 | 'service2_caller_time':str, 80 | 'pay_num':str,} 81 | )[:] 82 | test = process_data(test,is_train=False) 83 | test.drop(['1_total_fee'] ,axis = 1, inplace = True) 84 | 85 | train_old = pd.read_csv('./input/train_old.csv',dtype = { 86 | 'gender':str, 87 | '1_total_fee':str, 88 | '2_total_fee':str, 89 | '3_total_fee':str, 90 | '4_total_fee':str, 91 | 'month_traffic':str, 92 | 'last_month_traffic':str, 93 | 'local_trafffic_month':str, 94 | 'local_caller_time':str, 95 | 'service1_caller_time':str, 96 | 'service2_caller_time':str, 97 | 'pay_num':str,} 98 | )[:] 99 | train_old = process_data(train_old,is_train=False) 100 | train = train.append(train_old).reset_index(drop = True) 101 | print(len(train)) 102 | shape1 = len(train) 103 | train = train.append(test).reset_index(drop = True) 104 | print(len(train)) 105 | shape2 = len(train) 106 | 107 | 108 | #train = train[train['service_type']!=1] 109 | 110 | import_continue = [ 111 | '1_total_fee', 112 | '3_total_fee', 113 | 'last_month_traffic', 114 | 'service2_caller_time', 115 | 'fea-sum', 116 | '2month_traffic_sum', 117 | 'contract_time', 118 | 'fea-min' 119 | ] 120 | 121 | 122 | call_time = ['local_caller_time', 'service1_caller_time', 'service2_caller_time'] 123 | traffic = ['month_traffic','last_month_traffic','local_trafffic_month'] 124 | cat_cols = [ 125 | 'service_type', #2,3 126 | 'is_mix_service', #2 127 | 'many_over_bill', #2 128 | 'contract_type', #9,8 129 | 'is_promise_low_consume',#2 130 | 'net_service', #4 131 | 'gender', #3 132 | 'complaint_level', #4 133 | 134 | 'cut_online', 135 | 'cut_age', 136 | 137 | ] 138 | continus_col = [ 139 | '1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee', 'pay_num','former_complaint_fee', 140 | 141 | 'month_traffic', 'last_month_traffic', 'local_trafffic_month', 142 | 143 | 'local_caller_time', 'service1_caller_time', 'service2_caller_time', 144 | 145 | 'online_time','contract_time', 146 | 147 | 'pay_times', 'former_complaint_num' 148 | ] 149 | def one_hot_encoder(train,column,n=100,nan_as_category=False): 150 | tmp = train[column].value_counts().to_frame() 151 | values = list(tmp[tmp[column]>n].index) 152 | train.loc[train[column].isin(values),column+'N'] = train.loc[train[column].isin(values),column] 153 | train = pd.get_dummies(train, columns=[column+'N'], dummy_na=False) 154 | return train 155 | 156 | def get_len_after_decimal(x): 157 | try: 158 | r = len(x.split('.')[1]) 159 | except: 160 | r = np.nan 161 | return r 162 | 163 | def get_len_before_decimal(x): 164 | try: 165 | r = len(x.split('.')[0]) 166 | except: 167 | r = np.nan 168 | return r 169 | 170 | def get_len_00(x): 171 | try: 172 | r = 1 if '.00' == x[-3:] else 0 173 | except: 174 | r = np.nan 175 | return r 176 | 177 | def get_len_0(x): 178 | try: 179 | r = 1 if '.0' == x[-3:] else 0 180 | except: 181 | r = np.nan 182 | return r 183 | 184 | def pasre_fee(x): 185 | taocan = [16,26,36,46,56,66,76,106,136,166,196,296,396,596] 186 | for i in range(len(taocan)): 187 | if x - taocan[i] >= 15: 188 | return i 189 | return np.nan 190 | 191 | def pasre_fee_min(x): 192 | taocan = [16,26,36,46,56,66,76,106,136,166,196,296,396,596] 193 | for i in range(len(taocan)): 194 | if abs(x - taocan[i]) < 0.01: 195 | return i 196 | return np.nan 197 | 198 | result = [] 199 | 200 | 201 | for column in ['2_total_fee', '3_total_fee', '4_total_fee']: 202 | print(column, train[column].dtypes) 203 | #train[column+'_len_after_dot'] = train[column].apply(lambda x: get_len_after_decimal(x)) 204 | train[column+'_len_before_dot'] = train[column].apply(lambda x: get_len_before_decimal(x)) 205 | train[column+'_00'] = train[column].apply(lambda x: get_len_00(x)) 206 | train[column+'_0'] = train[column].apply(lambda x: get_len_0(x)) 207 | train[column] = train[column].replace("\\N",np.nan) 208 | train[column] = train[column].astype('float') 209 | 210 | 211 | # 212 | #for column in continus_col: 213 | # train = one_hot_encoder(train,column,n=1000,nan_as_category=True) 214 | train['fea-min'] = train[[str(1+i) +'_total_fee' for i in range(2,4)]].min(axis = 1) 215 | 216 | for column in ['2_total_fee', '3_total_fee', '4_total_fee', 'pay_num','fea-min']: 217 | train[column] = train[column].astype('float') 218 | train[column+'_parse_min'] = train[column].apply(lambda x: pasre_fee_min(x)) 219 | train[column+'_parse'] = train[column].apply(lambda x: pasre_fee(x)) 220 | train[column+'_int'] = round(train[column].fillna(0)).astype('int') 221 | train[column+'_shifenwei'] = train[column+'_int'] // 10 222 | train[column+'_int_last'] = train[column+'_int']%10 #last int 223 | train[column+'_decimal'] = ((train[column+'_int'] - train[column])*100).fillna(0).astype('int') #decimal 224 | train[column+'_decimal_is_0'] = (train[column+'_decimal']==0).astype('int') 225 | train[column+'_decimal_is_5'] = (train[column+'_decimal']%5==0).astype('int') 226 | train[column+'_decimal_last'] = train[column+'_decimal']%10 227 | ### train = one_hot_encoder(train,column,n=2000,nan_as_category=True) 228 | train['pay_num_last2'] = train['pay_num_int']%100 229 | train['former_complaint_fee_last2'] = round(train['former_complaint_fee'])%100 230 | 231 | 232 | 233 | 234 | 235 | 236 | for column in ['month_traffic', 'last_month_traffic', 'local_trafffic_month']: 237 | #train[column+'_len_after_dot'] = train[column].apply(lambda x: get_len_after_decimal(x)) 238 | train[column+'_len_before_dot'] = train[column].apply(lambda x: get_len_before_decimal(x)) 239 | train[column] = round(train[column].astype('float'),6) 240 | train[column+'_is_int'] = ((train[column]%1 == 0)&(train[column] != 0)).astype('int') 241 | train[column+'_is_512'] = ((train[column]%512 == 0)&(train[column] != 0)).astype('int') 242 | train[column+'_is_50'] = ((train[column]%50 == 0)&(train[column] != 0)).astype('int') 243 | train[column+'_is_double'] = ((train[column]%512%50 == 0)&(train[column] != 0)).astype('int') 244 | 245 | for column in ['local_caller_time', 'service1_caller_time', 'service2_caller_time','service12']: 246 | if column == 'service12': 247 | train['service12'] = train['service2_caller_time']+train['service1_caller_time'] 248 | else: 249 | #train[column+'_len_after_dot'] = train[column].apply(lambda x: get_len_after_decimal(x)) 250 | train[column+'_len_before_dot'] = train[column].apply(lambda x: get_len_before_decimal(x)) 251 | train[column] = round(train[column].astype('float'),6) 252 | train[column+'_decimal'] = round(((round(train[column])- train[column])*60)).astype('int') 253 | train[column+'_decimal_is_int'] = ((train[column+'_decimal']==0)&(train[column] != 0)).astype('int') 254 | 255 | 256 | print(train.shape) 257 | 258 | train['is_duplicated'] = train.duplicated(subset=['service_type', 'is_mix_service', 'online_time', '1_total_fee', 259 | '2_total_fee', '3_total_fee', '4_total_fee','many_over_bill', 'contract_type', 'contract_time', 260 | 'is_promise_low_consume', 'net_service', 'pay_times', 'pay_num','local_caller_time', 261 | 'service1_caller_time', 'service2_caller_time', 'gender', 'age'],keep=False) 262 | 263 | #单特征处理 264 | #年龄 265 | train['age'] = train['age'].fillna(-20) 266 | train['cut_age'] = train['age'].apply(lambda x: int(x/10)) 267 | train['cut_online'] = (train['online_time'] / 12).astype(int) 268 | 269 | 270 | #同类特征加减乘除 271 | #钱 272 | train['4-fea-dealta'] = train['4_total_fee'] - train['3_total_fee'] 273 | train['3-fea-dealta'] = train['3_total_fee'] - train['2_total_fee'] 274 | 275 | train['4-fea-dealta_'] = train['4_total_fee'] / (train['3_total_fee']+0.00001) 276 | train['3-fea-dealta_'] = train['3_total_fee'] / (train['2_total_fee']+0.00001) 277 | 278 | 279 | #流量 280 | train['month_traffic_delata'] = train['month_traffic'] - train['last_month_traffic'] 281 | train['month_traffic_delata_'] = train['month_traffic'] / (train['last_month_traffic']+0.00001) 282 | train['2month_traffic_sum'] = train['month_traffic'] + train['last_month_traffic'] 283 | train['add_month_traffic'] = train['month_traffic'] - train['local_trafffic_month'] 284 | train['add_month_traffic'] = train['month_traffic'] / (train['local_trafffic_month']+0.00001) 285 | 286 | #通话时间 287 | train['service1_caller_time_delata'] = train['service1_caller_time'] / (train['service2_caller_time']+0.00001) 288 | train['service1_caller_time_delata2'] = train['service1_caller_time'] / (train['local_caller_time']+0.00001) 289 | train['service2_caller_time_delata_'] = train['service2_caller_time'] / (train['local_caller_time']+0.00001) 290 | 291 | #合约时间 292 | train['div_online_time_contract'] = train['contract_time'] / (train['online_time']+0.00001) 293 | train['div_online_time_contract'] = train['contract_time'] - train['online_time'] 294 | 295 | #次数 296 | train['div_former_complaint_num'] = train['former_complaint_num'] / (train['pay_times']+0.00001) 297 | train['div_former_complaint_num'] = train['former_complaint_num'] - train['pay_times'] 298 | 299 | #同类特征统计 300 | #4个月的费用和 301 | train['fea-sum'] = train[[str(1+i) +'_total_fee' for i in range(2,4)]].sum(axis = 1) 302 | train['fea-var'] = train[[str(1+i) +'_total_fee' for i in range(2,4)]].var(axis = 1) 303 | train['fea-max'] = train[[str(1+i) +'_total_fee' for i in range(2,4)]].max(axis = 1) 304 | train['fea-min'] = train[[str(1+i) +'_total_fee' for i in range(2,4)]].min(axis = 1) 305 | 306 | train['call_time_sum'] = train[call_time].sum(axis = 1) 307 | train['call_time_var'] = train[call_time].var(axis = 1) 308 | train['call_time_min'] = train[call_time].min(axis = 1) 309 | train['call_time_max'] = train[call_time].max(axis = 1) 310 | 311 | train['traffic_sum'] = train[traffic].sum(axis = 1) 312 | train['traffic_var'] = train[traffic].var(axis = 1) 313 | train['traffic_min'] = train[traffic].min(axis = 1) 314 | train['traffic_max'] = train[traffic].max(axis = 1) 315 | 316 | #不同类特征加减乘除 317 | #钱_次数 318 | train['average_pay'] = train['pay_num'] / train['pay_times'] 319 | 320 | #钱_流量 321 | train['div_traffic_price_2'] = train['last_month_traffic']/ 1000 / train['2_total_fee'] 322 | 323 | #钱_通话时间 324 | 325 | 326 | #流量_通话时间 327 | train['div_service1_caller_time'] = train['service1_caller_time']/train['last_month_traffic'] 328 | train['div_local_caller_time'] = train['local_caller_time']/train['last_month_traffic'] 329 | train['div_local_caller_time2'] = train['local_caller_time']/train['month_traffic'] 330 | 331 | #费用_次数 332 | train['avg_complain_fee'] = train['former_complaint_fee'] / (train['former_complaint_num'] + 0.000000001) 333 | 334 | # cat*num 335 | 336 | result.append(get_feat_ngroup(train,['cut_age','gender'])) 337 | #result.append(get_feat_stat_feat(train, ['contract_type'], ['1_total_fee'], ['max'])) 338 | result.append(get_feat_stat_feat(train, ['contract_type'], ['2_total_fee'], ['mean'])) 339 | result.append(get_feat_stat_feat(train, ['contract_type'], ['last_month_traffic'], ['var','mean'])) 340 | result.append(get_feat_stat_feat(train, ['contract_type'], ['call_time_sum'], ['mean'])) 341 | 342 | for base_feat in [['contract_type']]: 343 | for other_feat in ['pay_num', 344 | 'month_traffic', 'last_month_traffic', 'local_trafffic_month', 345 | 'local_caller_time', 'service1_caller_time', 'service2_caller_time', 346 | ]: 347 | stat_list = ['mean'] 348 | tmp = get_feat_stat_feat(train,base_feat,[other_feat],stat_list=stat_list) 349 | name = tmp.columns[0] 350 | train[name] = tmp 351 | train[name+'_comp'] = train[other_feat].values-train[name].values 352 | 353 | #比例性特征 354 | #train['1_total_fee_ratio'] = train['1_total_fee']/(train['fea-sum']+0.000001) 355 | train['3_total_fee_ratio'] = train['3_total_fee']/(train['fea-sum']+0.000001) 356 | train['call_time_sum_ratio'] = train['call_time_sum']/(train['traffic_sum']+0.000001) 357 | train['call_time_sum_ratio2'] = train['call_time_sum']/(train['fea-sum']+0.000001) 358 | train['traffic_sum_ratio1'] = train['traffic_sum']/(train['fea-sum']+0.000001) 359 | 360 | 361 | 362 | def f1_score_vali(preds, data_vali): 363 | labels = data_vali.get_label() 364 | preds = np.argmax(preds.reshape(11, -1),axis=0) 365 | score_vali = f1_score(y_true=labels,y_pred=preds,average='macro') 366 | return 'macro_f1_score', score_vali, True 367 | 368 | def evaluate_macroF1_lgb(data_vali, preds): 369 | labels = data_vali.astype(int) 370 | preds = np.array(preds) 371 | preds = np.argmax(preds.reshape(11, -1),axis=0) 372 | score_vali = f1_score(y_true=labels,y_pred=preds,average='macro') 373 | return score_vali 374 | 375 | def kfold_lightgbm(params,df, predictors,target,num_folds, stratified = False, 376 | objective='', metrics='',debug= False, 377 | feval = None, early_stopping_rounds=120, num_boost_round=100, verbose_eval=50, categorical_features=None,sklearn_mertric = evaluate_macroF1_lgb ): 378 | 379 | lgb_params = params 380 | 381 | train_df = df[df[target].notnull()] 382 | test_df = df[df[target].isnull()] 383 | 384 | # Divide in training/validation and test data 385 | print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df[predictors].shape, test_df[predictors].shape)) 386 | del df 387 | gc.collect() 388 | # Cross validation model 389 | if stratified: 390 | folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=1234) 391 | else: 392 | folds = KFold(n_splits= num_folds, shuffle=True, random_state=1234) 393 | # folds = GroupKFold(n_splits=5) 394 | # Create arrays and dataframes to store results 395 | oof_preds = np.zeros(train_df.shape[0]) 396 | sub_preds = np.zeros(test_df.shape[0]) 397 | feature_importance_df = pd.DataFrame() 398 | feats = predictors 399 | cv_resul = [] 400 | ''' 401 | perm = [i for i in range(len(train_df))] 402 | perm = pd.DataFrame(perm) 403 | perm.columns = ['index_'] 404 | 405 | for n_fold in range(5): 406 | train_idx = np.array(perm[train_df['cv'] != n_fold]['index_']) 407 | valid_idx = np.array(perm[train_df['cv'] == n_fold]['index_']) 408 | ''' 409 | for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df[target])): 410 | if (USE_KFOLD == False) and (n_fold == 1): 411 | break 412 | train_x, train_y = train_df[feats].iloc[train_idx], train_df[target].iloc[train_idx] 413 | valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df[target].iloc[valid_idx] 414 | 415 | #train_x = pd.concat([train_x,train_old[feats]]) 416 | #train_y = pd.concat([train_y,train_old[target]]) 417 | 418 | train_y_t = train_y.values 419 | valid_y_t = valid_y.values 420 | print(train_y_t) 421 | xgtrain = lgb.Dataset(train_x.values, label = train_y_t, 422 | feature_name=predictors, 423 | categorical_feature=categorical_features 424 | ) 425 | xgvalid = lgb.Dataset(valid_x.values, label = valid_y_t, 426 | feature_name=predictors, 427 | categorical_feature=categorical_features 428 | ) 429 | 430 | clf = lgb.train(lgb_params, 431 | xgtrain, 432 | valid_sets=[xgvalid],#, xgtrain], 433 | valid_names=['valid'],#,'train'], 434 | num_boost_round=num_boost_round, 435 | early_stopping_rounds=early_stopping_rounds, 436 | verbose_eval=verbose_eval, 437 | feval=feval) 438 | 439 | 440 | 441 | oof_preds[valid_idx] = clf.predict(valid_x, num_iteration=clf.best_iteration) 442 | sub_preds += clf.predict(test_df[feats], num_iteration=clf.best_iteration) / folds.n_splits 443 | 444 | 445 | gain = clf.feature_importance('gain') 446 | fold_importance_df = pd.DataFrame({'feature':clf.feature_name(), 447 | 'split':clf.feature_importance('split'), 448 | 'gain':100*gain/gain.sum(), 449 | 'fold':n_fold, 450 | }).sort_values('gain',ascending=False) 451 | feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0) 452 | 453 | #result = sklearn_mertric(valid_y, oof_preds[valid_idx]) 454 | result = clf.best_score['valid']['l1'] 455 | print('Fold %2d macro-f1 : %.6f' % (n_fold + 1, result)) 456 | cv_resul.append(round(result,5)) 457 | gc.collect() 458 | 459 | #score = np.array(cv_resul).mean() 460 | score = 'fee_pred' 461 | if USE_KFOLD: 462 | train_df[target] = oof_preds 463 | test_df[target] = sub_preds 464 | print(test_df[target].mean()) 465 | train_df[target] = oof_preds 466 | #train_df[target] = train_df[target].map(label2current_service) 467 | test_df[target] = sub_preds 468 | #test_df[target] = test_df[target].map(label2current_service) 469 | print('all_cv', cv_resul) 470 | train_df[['user_id', target]].to_csv('./sub/val_{}.csv'.format(score), index= False) 471 | test_df[['user_id', target]].to_csv('./sub/sub_{}.csv'.format(score), index= False) 472 | print("test_df mean:") 473 | 474 | display_importances(feature_importance_df,score) 475 | 476 | 477 | 478 | def display_importances(feature_importance_df_,score): 479 | ft = feature_importance_df_[["feature", "split","gain"]].groupby("feature").mean().sort_values(by="gain", ascending=False) 480 | print(ft.head(60)) 481 | ft.to_csv('importance_lightgbm_{}.csv'.format(score),index=True) 482 | cols = ft[:40].index 483 | best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)] 484 | 485 | 486 | ####################################计算################################################################# 487 | 488 | 489 | params = { 490 | 'metric': 'mae', 491 | 'boosting_type': 'gbdt', 492 | 'objective': 'regression', 493 | 'feature_fraction': 0.65, 494 | 'learning_rate': 0.1, 495 | 'bagging_fraction': 0.65, 496 | #'bagging_freq': 2, 497 | 'num_leaves': 64, 498 | 'max_depth': -1, 499 | 'num_threads': 32, 500 | 'seed': 2018, 501 | 'verbose': -1, 502 | #'is_unbalance':True, 503 | } 504 | 505 | 506 | categorical_columns = [ 507 | 'contract_type', 508 | # 'is_mix_service', 509 | # 'is_promise_low_consume', 510 | 'net_service', 511 | 'gender'] 512 | for feature in categorical_columns: 513 | print(f'Transforming {feature}...') 514 | encoder = LabelEncoder() 515 | train[feature] = encoder.fit_transform(train[feature].astype(str)) 516 | 517 | #no_use_from_pandas = pd.read_csv('importance_lightgbm_0.903734.csv') 518 | #x = list(no_use_from_pandas[no_use_from_pandas.gain==0.0]['feature']) 519 | x = [] 520 | no_use = ['current_service', 'user_id','group','1_total_fee', 521 | 522 | ] + x 523 | 524 | 525 | # for i in categorical_columns: 526 | # result.append(pd.get_dummies(train[[i]], columns= [i], dummy_na= False)) 527 | 528 | categorical_columns = [] 529 | all_data_frame = [] 530 | all_data_frame.append(train) 531 | 532 | for aresult in result: 533 | all_data_frame.append(aresult) 534 | 535 | train = concat(all_data_frame) 536 | 537 | feats = [f for f in train.columns if f not in no_use] 538 | categorical_columns = [f for f in categorical_columns if f not in no_use] 539 | 540 | train['1_total_fee'] = train['1_total_fee'].astype(float) 541 | #train_old = train.iloc[shape1:shape2] 542 | #train = train.iloc[:shape1] 543 | #4000`: 544 | clf = kfold_lightgbm(params,train,feats,'1_total_fee' ,5 , num_boost_round=4000, categorical_features=categorical_columns)#2000 545 | -------------------------------------------------------------------------------- /piupiu_white.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Oct 28 11:01:29 2018 5 | 6 | @author: hzs 7 | """ 8 | 9 | import pandas as pd 10 | import numpy as np 11 | 12 | sub = pd.read_csv('./sub/sub_final.csv') 13 | 14 | 15 | def replace_white_user_id(submission): 16 | white_user_id = pd.read_csv('white.csv') 17 | white_dict = dict(zip(white_user_id['user_id'],white_user_id['current_service'].astype(int))) 18 | submission['current_service'] = list(map(lambda x,y: y if x not in white_dict else white_dict[x],submission['user_id'],submission['current_service'])) 19 | return submission 20 | 21 | sub = replace_white_user_id(sub) 22 | sub.to_csv('./sub_final_white.csv',index=None) 23 | 24 | 25 | def f(x): 26 | try: 27 | r = x.split('.')[1][-1] 28 | return 0 if r=='0' else 1 29 | except: 30 | return 1 31 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | lightgbm==2.1.1 2 | numpy==1.14.3 3 | pandas==0.23.1 4 | scikit_learn==0.19.1 5 | xgboost==0.72.1 6 | -------------------------------------------------------------------------------- /run_perfect.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | stage=0 4 | 5 | echo '**' `date +%H:%M:%S` 'start with stage=' $stage 6 | echo 'Any question contact QQ:674785731' 7 | 8 | mkdir -p cv 9 | mkdir -p sub 10 | mkdir -p cache 11 | mkdir -p data/a 12 | mkdir -p data/b 13 | cp input/train_old.csv data/a/train.csv 14 | cp input/train.csv data/b/train_new.csv 15 | cp input/test.csv data/b/test_new.csv 16 | 17 | 18 | # gen magic feature 19 | if [ $stage -le 0 ]; then 20 | cd feature && python3 get_most.py && cd .. 21 | echo '**' `date +%H:%M:%S` 'finished get most' 22 | fi 23 | 24 | # white 25 | if [ $stage -le 1 ]; then 26 | cd feature && python3 white.py && cd .. 27 | echo '**' `date +%H:%M:%S` 'finished white' 28 | fi 29 | #model 1 30 | if [ $stage -le 2 ]; then 31 | python3 model1.py 32 | echo '**' `date +%H:%M:%S` 'finished model1' 33 | fi 34 | 35 | #model 2 36 | if [ $stage -le 3 ]; then 37 | python3 model2.py 38 | echo '**' `date +%H:%M:%S` 'finished model2' 39 | fi 40 | 41 | #model 3 42 | if [ $stage -le 4 ]; then 43 | python3 model3_1.py 44 | python3 model3_4.py 45 | echo '**' `date +%H:%M:%S` 'finished model3' 46 | fi 47 | 48 | #model pred fee 49 | if [ $stage -le 5 ]; then 50 | python3 model_pred_fee.py 51 | echo '**' `date +%H:%M:%S` 'finished model fee pred' 52 | fi 53 | 54 | # combibe 1 and 4 55 | if [ $stage -le 6 ]; then 56 | python3 hebing_pred.py 57 | echo '**' `date +%H:%M:%S` 'finished combine' 58 | fi 59 | 60 | # piupiu 61 | if [ $stage -le 7 ]; then 62 | python3 clean_a.py 63 | python3 clean_b.py 64 | python3 data_pred_a+2b.py 65 | python3 data_pred_a+b.py 66 | echo '**' `date +%H:%M:%S` 'finished clean' 67 | fi 68 | 69 | # stacking 70 | if [ $stage -le 8 ]; then 71 | python3 baseline_v11.py 72 | echo '**' `date +%H:%M:%S` 'finished fianle model' 73 | fi 74 | 75 | # whilte 76 | if [ $stage -le 9 ]; then 77 | python3 piupiu_white.py 78 | echo '**' `date +%H:%M:%S` 'finished white' 79 | fi 80 | echo 'all done! submit file is sub_final_white' 81 | -------------------------------------------------------------------------------- /run_top1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | stage=0 4 | 5 | echo '**' `date +%H:%M:%S` 'start with stage=' $stage 6 | echo 'Any question contact QQ:674785731' 7 | 8 | mkdir -p cv 9 | mkdir -p sub 10 | 11 | # gen magic feature 12 | if [ $stage -le 0 ]; then 13 | cd feature && python3 get_most.py && cd .. 14 | echo '**' `date +%H:%M:%S` 'finished get most' 15 | fi 16 | 17 | # white 18 | if [ $stage -le 1 ]; then 19 | cd feature && python3 white.py && cd .. 20 | echo '**' `date +%H:%M:%S` 'finished white' 21 | fi 22 | #model 2 23 | if [ $stage -le 2 ]; then 24 | python3 model1.py 25 | echo '**' `date +%H:%M:%S` 'finished jiajie model1' 26 | fi 27 | 28 | if [ $stage -le 3 ]; then 29 | python3 fast_baseline_v11.py 30 | echo '**' `date +%H:%M:%S` 'finished final' 31 | fi 32 | 33 | if [ $stage -le 4 ]; then 34 | python3 piupiu_white.py 35 | echo '**' `date +%H:%M:%S` 'finished piupiuwhite,sub_final_while' 36 | fi 37 | 38 | -------------------------------------------------------------------------------- /tool.py: -------------------------------------------------------------------------------- 1 | import gc 2 | import os 3 | import sys 4 | import time 5 | import pickle 6 | import datetime 7 | import numpy as np 8 | import pandas as pd 9 | import xgboost as xgb 10 | from tqdm import tqdm 11 | import multiprocessing 12 | import lightgbm as lgb 13 | from scipy import stats 14 | from functools import partial 15 | from dateutil.parser import parse 16 | from lightgbm import LGBMClassifier 17 | from collections import defaultdict 18 | from sklearn.metrics import f1_score 19 | from datetime import date, timedelta 20 | from contextlib import contextmanager 21 | from sklearn.metrics import recall_score 22 | from sklearn.cross_validation import KFold 23 | from sklearn.metrics import precision_score 24 | from joblib import dump, load, Parallel, delayed 25 | from sklearn.linear_model import LinearRegression 26 | from sklearn.model_selection import StratifiedKFold, GroupKFold 27 | from sklearn.decomposition import NMF, PCA, TruncatedSVD 28 | from sklearn.feature_extraction.text import CountVectorizer 29 | from sklearn.decomposition import LatentDirichletAllocation as LDA 30 | 31 | 32 | 33 | # 求解rmse的均值和标准差 34 | def get_ave_std(c1,c2,f1,f2): 35 | ''' 36 | :param c1: 提交的常数1 37 | :param c2: 提交的常数2 38 | :param f1: 得分1 39 | :param f2: 得分2 40 | :return: 均值和标准差 41 | ''' 42 | f1 = f1**2; f2 = f2**2; 43 | a = 2; b = 2*(c1+c2); c = c1**2+c2**2-(f1-f2); 44 | ave = (f1 - f2 + c2 ** 2 - c1 ** 2) / 2 / (c2 - c1) 45 | std = (f1 - (c1 - ave) ** 2) ** 0.5 46 | return ave,std 47 | 48 | # 求解rmse的均值 49 | def get_sub_ave_std(c1,c2,f1,f2,n1,n2): 50 | ''' 51 | :param c1: 提交1的常数 52 | :param c2: 提交2有差异的部分的常数 53 | :param f1: 提交1的分数 54 | :param f2: 提交2的分数 55 | :param n1: 提交总个数 56 | :param n2: 提交2有差异部分的个数 57 | :return: 提交2有差异部分的均值 58 | ''' 59 | result = ((c1+c2)-((f1**2-f2**2)*n1/n2/(c1-c2)))/2 60 | return result 61 | 62 | 63 | # 抽样函数 64 | def make_sample(n,n_sub=2,seed=None): 65 | import random 66 | if seed is not None: 67 | random.seed(seed) 68 | if type(n) is int: 69 | l = list(range(n)) 70 | s = int(n / n_sub) 71 | else: 72 | l = list(n) 73 | s = int(len(n) / n_sub) 74 | random.shuffle(l) 75 | result = [] 76 | for i in range(n_sub): 77 | if i == n_sub: 78 | result.append(l[i*s:]) 79 | else: 80 | result.append(l[i*s: (i+1)*s]) 81 | return result 82 | 83 | # 统计list的value_counts 84 | def value_counts(l): 85 | s = set(l) 86 | d = dict([(x,0) for x in s]) 87 | for i in l: 88 | d[i] += 1 89 | result = pd.Series(d) 90 | result.sort_values(ascending=False,inplace=True) 91 | return result 92 | 93 | # 分类特征转化率 94 | def analyse(data,name,label='label'): 95 | result = data.groupby(name)[label].agg({'count':'count', 96 | 'sum':'sum'}) 97 | result['rate'] = result['sum']/result['count'] 98 | return result 99 | 100 | # 连续特征转化率,等距分隔 101 | def analyse2(data,name='id',label='label', factor=10): 102 | grouping = pd.cut(data[name],factor) 103 | rate = data.groupby(grouping)[label].agg({'sum':'sum', 104 | 'count':'count'}) 105 | rate['rate'] = rate['sum']/rate['count'] 106 | return rate 107 | 108 | # 连续特征转化率,等数分隔 109 | def analyse3(data,name='id',label='label', factor=10): 110 | grouping = pd.qcut(data[name],factor) 111 | rate = data.groupby(grouping)[label].agg({'sum':'sum', 112 | 'count':'count'}) 113 | rate['rate'] = rate['sum']/rate['count'] 114 | return rate 115 | 116 | # 分组标准化 117 | def grp_standard(data,key,names,drop=False): 118 | for name in names: 119 | new_name = name if drop else name + '_' + key + '_' + 'standardize' 120 | mean_std = data.groupby(key, as_index=False)[name].agg({'mean': 'mean', 121 | 'std': 'std'}) 122 | data = data.merge(mean_std, on=key, how='left') 123 | data[new_name] = ((data[name]-data['mean'])/data['std']).fillna(0).astype(np.float32) 124 | data[new_name] = data[new_name].replace(-np.inf, 0).fillna(0) 125 | data.drop(['mean','std'],axis=1,inplace=True) 126 | return data 127 | 128 | # 分组归一化 129 | def grp_normalize(data,key,names,start=0,drop=False): 130 | for name in names: 131 | new_name = name if drop else name + '_' + key + '_' + 'normalize' 132 | max_min = data.groupby(key,as_index=False)[name].agg({'max':'max', 133 | 'min':'min'}) 134 | data = data.merge(max_min, on=key, how='left') 135 | data[new_name] = (data[name]-data['min'])/(data['max']-data['min']) 136 | data[new_name] = data[new_name].replace(-np.inf, start).fillna(start).astype(np.float32) 137 | data.drop(['max','min'],axis=1,inplace=True) 138 | return data 139 | 140 | # 分组排序 141 | def grp_rank(data,key,names,ascending=True): 142 | for name in names: 143 | data.sort_values([key, name], inplace=True, ascending=ascending) 144 | data['rank'] = range(data.shape[0]) 145 | min_rank = data.groupby(key, as_index=False)['rank'].agg({'min_rank': 'min'}) 146 | data = pd.merge(data, min_rank, on=key, how='left') 147 | data['rank'] = data['rank'] - data['min_rank'] 148 | data[names] = data['rank'] 149 | data.drop(['rank'],axis=1,inplace=True) 150 | return data 151 | 152 | # 合并节约内存 153 | def concat(L): 154 | result = None 155 | for l in L: 156 | if result is None: 157 | result = l 158 | else: 159 | result[l.columns.tolist()] = l 160 | return result 161 | 162 | # 分组排序函数 163 | def group_rank(data, key, values, ascending=True): 164 | if type(key)==list: 165 | data_temp = data[key + [values]].copy() 166 | data_temp.sort_values(key + [values], inplace=True, ascending=ascending) 167 | data_temp['rank'] = range(data_temp.shape[0]) 168 | min_rank = data_temp.groupby(key,as_index=False)['rank'].agg({'min_rank':'min'}) 169 | index = data_temp.index 170 | data_temp = data_temp.merge(min_rank,on=key,how='left') 171 | data_temp.index = index 172 | else: 173 | data_temp = data[[key,values]].copy() 174 | data_temp.sort_values(key + [values], inplace=True, ascending=ascending) 175 | data_temp['rank'] = range(data_temp.shape[0]) 176 | data_temp['min_rank'] = data_temp[key].map(data_temp.groupby(key)['rank'].min()) 177 | data_temp['rank'] = data_temp['rank'] - data_temp['min_rank'] 178 | return data_temp['rank'] 179 | 180 | def nunique(x): 181 | return len(set(x)) 182 | 183 | 184 | # 前后时间差的函数: 185 | def group_diff_time(data,key,value,n): 186 | data_temp = data[key+[value]].copy() 187 | shift_value = data_temp.groupby(key)[value].shift(n) 188 | data_temp['shift_value'] = data_temp[value] - shift_value 189 | return data_temp['shift_value'] 190 | 191 | 192 | 193 | # smape 194 | def smape(y_true,y_pred): 195 | y_true = np.array(y_true) 196 | y_pred = np.array(y_pred) 197 | y_diff = np.abs(y_true-y_pred) 198 | y_sum = y_true+y_pred 199 | return np.mean(y_diff/y_sum)*2 200 | 201 | 202 | # groupby 直接拼接 203 | def groupby(data,stat,key,value,func): 204 | key = key if type(key)==list else [key] 205 | data_temp = data[key].copy() 206 | feat = stat.groupby(key,as_index=False)[value].agg({'feat':func}) 207 | data_temp = data_temp.merge(feat,on=key,how='left') 208 | return data_temp['feat'] 209 | 210 | 211 | 212 | # 计算关系紧密程度指数 213 | def tfidf2(df,key1,key2): 214 | key = key1 + key2 215 | tfidf2 = '_'.join(key) + '_tfidf2' 216 | df1 = df.groupby(key,as_index=False)[key[0]].agg({'key_count': 'size'}) 217 | df2 = df1.groupby(key1,as_index=False)['count'].agg({'key1_count': 'sum'}) 218 | df3 = df1.groupby(key2, as_index=False)['count'].agg({'key2_count': 'sum'}) 219 | df1 = df1.merge(df2,on=key1,how='left').merge(df3,on=key2,how='left') 220 | df1[tfidf2] = df1['key_count'] / df['key2_count'] / df['key1_count'] 221 | 222 | 223 | # 相差的日期数 224 | def diff_of_days(day1, day2): 225 | days = (parse(day1[:10]) - parse(day2[:10])).days 226 | return days 227 | 228 | # 相差的分钟数 229 | def diff_of_minutes(time1,time2): 230 | minutes = (parse(time1) - parse(time2)).total_seconds()//60 231 | return abs(minutes) 232 | 233 | # 相差的小时数 234 | def diff_of_hours(time1,time2): 235 | hours = (parse(time1) - parse(time2)).total_seconds()//3600 236 | return abs(hours) 237 | 238 | # 日期的加减 239 | def date_add_days(start_date, days): 240 | end_date = parse(start_date[:10]) + timedelta(days=days) 241 | end_date = end_date.strftime('%Y-%m-%d') 242 | return end_date 243 | 244 | # 日期的加减 245 | def date_add_hours(start_date, hours): 246 | end_date = parse(start_date) + timedelta(hours=hours) 247 | end_date = end_date.strftime('%Y-%m-%d %H:%M:%S') 248 | return end_date 249 | 250 | # 获取某个类型里面第n次的值 251 | def get_last_values(data, stat, key, sort_value, value, shift, sort=None): 252 | key = key if type(key)==list else [key] 253 | if sort == 'ascending': 254 | stat_temp = stat.sort_values(sort_value, ascending=True) 255 | elif sort == 'descending': 256 | stat_temp = stat.sort_values(sort_value, ascending=False) 257 | else: 258 | stat_temp = stat.copy() 259 | stat_temp['value'] = stat_temp.groupby(key)[value].shift(shift) 260 | stat_temp.drop_duplicates(key,keep='last',inplace=True) 261 | data_temp = data[key].copy() 262 | data_temp = data_temp.merge(stat_temp,on=key,how='left') 263 | return data_temp['value'] 264 | 265 | # 获取某个类型里面第n次的值 266 | def get_first_values(data, stat, key, sort_value, value, shift, sort=None): 267 | key = key if type(key)==list else [key] 268 | if sort == 'ascending': 269 | stat_temp = stat.sort_values(sort_value, ascending=True) 270 | elif sort == 'descending': 271 | stat_temp = stat.sort_values(sort_value, ascending=False) 272 | else: 273 | stat_temp = stat.copy() 274 | stat_temp['value'] = stat_temp.groupby(key)[value].shift(-shift) 275 | stat_temp.drop_duplicates(key,keep='first',inplace=True) 276 | data_temp = data[key].copy() 277 | data_temp = data_temp.merge(stat_temp,on=key,how='left') 278 | return data_temp['value'] 279 | 280 | 281 | 282 | # 压缩数据 283 | def compress(data): 284 | size = sys.getsizeof(data)/2**20 285 | def intcp(series): 286 | ma = max(series) 287 | mi = min(series) 288 | if (ma<128) & (mi>=-128): 289 | return 'int8' 290 | elif (ma<32768) & (mi>=-32768): 291 | return 'int16' 292 | elif (ma<2147483648) & (mi>=-2147483648): 293 | return 'int32' 294 | else: 295 | return None 296 | def floatcp(series): 297 | ma = max(series) 298 | mi = min(series) 299 | if (ma<32770) & (mi>-32770): 300 | return 'float16' 301 | elif (ma<2147483600) & (mi>-2147483600): 302 | return 'float32' 303 | else: 304 | return None 305 | 306 | for c in data.columns: 307 | ctype = None 308 | dtypes = data[c].dtypes 309 | if dtypes == np.int64: 310 | ctype = intcp(data[c]) 311 | if dtypes == np.int32: 312 | ctype = intcp(data[c]) 313 | if dtypes == np.int16: 314 | ctype = intcp(data[c]) 315 | if dtypes == np.float64: 316 | ctype = floatcp(data[c]) 317 | if dtypes == np.float32: 318 | ctype = floatcp(data[c]) 319 | if ctype is None: 320 | continue 321 | try: 322 | 323 | data[c] = data[c].astype(ctype) 324 | #print('{} convet to {}, done! {}'.format(dtypes,ctype,c)) 325 | except: 326 | print('error') 327 | #print('特征{}的类型为:{},转化出线问题!!!'.format(c,dtypes)) 328 | #print('原始数据大小为: {}M'.format(round(size, 2))) 329 | #print('新数据大小为: {}M'.format(round(sys.getsizeof(data) / 2 ** 20,2))) 330 | return data 331 | 332 | 333 | 334 | 335 | def trend(y): 336 | try: 337 | x = np.arange(0, len(y)).reshape(-1, 1) 338 | lr = LinearRegression() 339 | lr.fit(x, y) 340 | trend = lr.coef_[0] 341 | except: 342 | trend = np.nan 343 | return trend 344 | 345 | 346 | @contextmanager 347 | def timer(title): 348 | t0 = time.time() 349 | yield 350 | #print("{} - done in {:.0f}s".format(title, time.time() - t0)) 351 | 352 | 353 | def jiangwei(stat,data, id, feature): 354 | #print('lda ...') 355 | mapping = {} 356 | for sample in stat[[id, feature]].values: 357 | mapping.setdefault(sample[0], []).append(str(sample[1])) 358 | ids = list(mapping.keys()) 359 | sentences = [' '.join(mapping[cate_]) for cate_ in ids] 360 | stat_sentences_matrix = CountVectorizer(token_pattern='(?u)\\b\\w+\\b', min_df=2).fit_transform(sentences) 361 | mapping = {} 362 | for sample in data[[id, feature]].values: 363 | mapping.setdefault(sample[0], []).append(str(sample[1])) 364 | ids = list(mapping.keys()) 365 | sentences = [' '.join(mapping[cate_]) for cate_ in ids] 366 | data_sentences_matrix = CountVectorizer(token_pattern='(?u)\\b\\w+\\b', min_df=2).fit_transform(sentences) 367 | 368 | lda = LDA(n_components=5, 369 | learning_method='online', 370 | batch_size=1000, 371 | n_jobs=40, 372 | random_state=520) 373 | lda.fit(stat_sentences_matrix) 374 | lda_matrix = lda.transform(data_sentences_matrix) 375 | lda_matrix = pd.DataFrame(lda_matrix,columns=['lda_{}_{}'.format(feature, i) for i in range(5)]).astype('float16') 376 | 377 | nmf = NMF(n_components=5, 378 | random_state=520, 379 | beta_loss='kullback-leibler', 380 | solver='mu', 381 | max_iter=1000, 382 | alpha=.1, 383 | l1_ratio=.5) 384 | nmf.fit(stat_sentences_matrix) 385 | nmf_matrix = nmf.transform(stat_sentences_matrix) 386 | nmf_matrix = pd.DataFrame(nmf_matrix,columns=['nmf_{}_{}'.format(feature, i) for i in range(5)]).astype('float16') 387 | 388 | pca = TruncatedSVD(5) 389 | pca.fit(stat_sentences_matrix) 390 | pca_matrix = pca.transform(stat_sentences_matrix) 391 | pca_matrix = pd.DataFrame(pca_matrix, 392 | columns=["%s_%s_svd_action" % ('user_sku', i) for i in range(5)]).astype('float32') 393 | 394 | matrix = concat([lda_matrix,nmf_matrix,pca_matrix]) 395 | matrix[id] = ids 396 | return matrix 397 | 398 | # 获取阈值 399 | def get_threshold(preds,silent=False): 400 | preds_temp = sorted(preds,reverse=True) 401 | n = sum(preds) # 实际正例个数 402 | m = 0 # 提交的正例个数 403 | e = 0 # 正确个数的期望值 404 | f1 = 0 # f1的期望得分 405 | for threshold in preds_temp: 406 | e += threshold 407 | m += 1 408 | f1_temp = e/(m+n) 409 | if f1>f1_temp: 410 | break 411 | else: 412 | f1 = f1_temp 413 | if not silent: 414 | print('error') 415 | #print('阈值为:{}'.format(threshold)) 416 | #print('提交正例个数为:{}'.format(m-1)) 417 | #print('期望得分为:{}'.format(f1*2)) 418 | return [(1 if (pred>threshold) else 0) for pred in preds] 419 | 420 | # 多分类F1值 421 | def multi_f1(true,pred,silent=False): 422 | true_dummy = pd.get_dummies(pd.Series(true)) 423 | pred_dummy = pd.get_dummies(pd.Series(pred)) 424 | scores = [] 425 | for c in true_dummy.columns: 426 | score = f1_score(true_dummy[c],pred_dummy[c]) 427 | if not silent: 428 | pass 429 | #print('{} : {}'.format(c,score)) 430 | scores.append(score) 431 | return np.mean(scores) 432 | 433 | 434 | # 多分类f1期望得分 435 | def exp_multi_f1(pred,int_preds,silent=False): 436 | int_preds_dummy = pd.get_dummies(pd.Series(int_preds)) 437 | pred = pd.DataFrame(pred,columns=int_preds_dummy.columns) 438 | scores = [] 439 | for c in pred.columns: 440 | n = pred[c].sum() 441 | m = int_preds_dummy[c].sum() 442 | r = pred[int_preds_dummy[c]==1][c].sum() 443 | f1 = 2*r / (m+n) 444 | if not silent: 445 | pass 446 | #print('{} : {}'.format(c, f1)) 447 | scores.append(f1) 448 | return np.mean(scores) 449 | 450 | # 多分类f1最佳阈值 451 | def get_multi_f1_threshold(preds): 452 | def derivative(arg, p): 453 | m, n, r = arg 454 | s = m + n 455 | return 2 * (p * s - r) / (s + 1) / s 456 | 457 | def get_multi_f1_threshold_di(preds, int_preds, preds_flag): 458 | int_preds_matrix = pd.get_dummies(int_preds).values 459 | para_dict = {} 460 | for i in range(preds.shape[1]): 461 | m = preds[:, i].sum() 462 | n = int_preds_matrix[:, i].sum() 463 | r = preds[int_preds_matrix[:, i] == 1, i].sum() 464 | para_dict[i] = (m, n, r) 465 | for i in range(preds.shape[0]): 466 | if preds_flag[i]: 467 | continue 468 | else: 469 | int_preds[i] = np.argmax([derivative(para_dict[j],preds[i,j]) for j in range(preds.shape[1])]) 470 | return int_preds 471 | 472 | int_preds = pd.Series(preds.argmax(axis=1)) 473 | preds_flag = list(preds.max(axis=1)>0.5) 474 | for i in range(3): 475 | int_preds = get_multi_f1_threshold_di(preds,int_preds,preds_flag) 476 | #print('期望的分: {}'.format(exp_multi_f1(preds,int_preds))) 477 | 478 | # 贝叶斯平滑 479 | def bayes_encode(C, I): 480 | def compute_moment(tries, success): 481 | ctr_list = [] 482 | var = 0.0 483 | for i in range(len(tries)): 484 | ctr_list.append(float(success[i]) / tries[i]) 485 | mean = sum(ctr_list) / len(ctr_list) 486 | for ctr in ctr_list: 487 | var += pow(ctr - mean, 2) 488 | return mean, var / (len(ctr_list) - 1) 489 | 490 | def update_from_data_by_moment(tries, success): 491 | mean, var = compute_moment(tries, success) 492 | alpha = (mean + 0.000001) * ((mean + 0.000001) * (1.000001 - mean) / (var + 0.000001) - 1) 493 | beta = (1.000001 - mean) * ((mean + 0.000001) * (1.000001 - mean) / (var + 0.000001) - 1) 494 | return alpha, beta 495 | 496 | I_temp = list(I) 497 | C_temp = list(C) 498 | alpha, beta = update_from_data_by_moment(I_temp, C_temp) 499 | rate = ((alpha + C) / (alpha + beta + I)).astype('float32') 500 | return rate 501 | 502 | # 交叉验证统计转化率 503 | def mul_cv_convert(data,category,label,cv=5): 504 | rate = np.zeros((len(data),data[label].nunique())) 505 | kf = KFold(len(data), n_folds=cv, shuffle=True, random_state=66) 506 | for i, (train_index, test_index) in enumerate(kf): 507 | stat1 = data.iloc[train_index] 508 | stat2 = data.iloc[test_index] 509 | temp1 = stat1.groupby([category, label], as_index=False).size().unstack().fillna(0) 510 | temp2 = stat1[~stat1[label].isnull()].groupby([category], as_index=False).size() 511 | temp3 = (temp1.T / temp2).T 512 | columns = [category + '_' + str(c) + '_conversion' for c in temp3.columns] 513 | temp3 = temp3.reset_index() 514 | temp4 = stat2[[category]].merge(temp3, on=category, how='left') 515 | rate[test_index,:] = temp4.drop(category,axis=1).values 516 | rate = pd.DataFrame(rate,columns=columns) 517 | data = concat([data,rate],axis=1) 518 | return data 519 | 520 | # count encoding 521 | def count_encoding(li): 522 | temp = pd.Series(li) 523 | result = temp.map(temp.value_counts()) 524 | return result 525 | 526 | # 众位数 527 | def mode(li): 528 | if stats.mode(li)[1][0]==1: 529 | return np.nan 530 | return stats.mode(li)[0][0] 531 | 532 | 533 | 534 | 535 | 536 | 537 | 538 | def xgb_cv(params, train_feat, test_feat, predictors, label='label',groups=None,cv=5,stratified=True): 539 | #print('开始CV 5折训练...') 540 | t0 = time.time() 541 | train_preds = np.zeros((len(train_feat), train_feat[label].nunique())) 542 | test_preds = np.zeros((len(test_feat), train_feat[label].nunique())) 543 | xgb_test = xgb.DMatrix(test_feat[predictors]) 544 | models = [] 545 | group_kfold = GroupKFold(n_splits=cv).get_n_splits(train_preds, train_preds[label], groups, random_state=66) 546 | for i, train_index, test_index in group_kfold.split(train_preds, train_preds[label], groups): 547 | xgb_train = xgb.DMatrix(train_feat[predictors].iloc[train_index], train_feat[label].iloc[train_index]) 548 | xgb_eval = xgb.DMatrix(train_feat[predictors].iloc[test_index], train_feat[label].iloc[test_index]) 549 | 550 | #print('开始第{}轮训练...'.format(i)) 551 | params = {'objective': 'multi:softprob', 552 | 'eta': 0.1, 553 | 'max_depth': 6, 554 | 'silent': 1, 555 | 'num_class': 11, 556 | 'eval_metric': "mlogloss", 557 | 'min_child_weight': 3, 558 | 'subsample': 0.7, 559 | 'colsample_bytree': 0.7, 560 | 'seed': 66 561 | } if params is None else params 562 | watchlist = [(xgb_train, 'train'), (xgb_eval, 'val')] 563 | 564 | clf = xgb.train(params, 565 | xgb_train, 566 | num_boost_round=3000, 567 | evals=watchlist, 568 | verbose_eval=50, 569 | early_stopping_rounds=50) 570 | 571 | train_preds[test_index] += clf.predict(xgb_eval) 572 | test_preds += clf.predict(xgb_test) 573 | models.append(clf) 574 | pickle.dump(models,open('xgb_{}.model'.format(datetime.datetime.now().strftime('%Y%m%d_%H%M%S')),'+wb')) 575 | #print('用时{}秒'.format(time.time()-t0)) 576 | return train_preds,test_preds/5 577 | 578 | def lgb_cv(params, train_feat, test_feat, predictors, label='label', cv=5,stratified=True): 579 | #print('开始CV 5折训练...') 580 | t0 = time.time() 581 | train_preds = np.zeros((len(train_feat), train_feat[label].nunique())) 582 | test_preds = np.zeros((len(test_feat), train_feat[label].nunique())) 583 | models = [] 584 | group_kfold = GroupKFold(n_splits=cv).get_n_splits(train_preds, train_preds[label], groups, random_state=66) 585 | for i, train_index, test_index in group_kfold.split(train_preds, train_preds[label], groups): 586 | lgb_train = lgb.Dataset(train_feat[predictors].iloc[train_index], train_feat['label'].iloc[train_index]) 587 | lgb_eval = lgb.Dataset(train_feat[predictors].iloc[test_index], train_feat['label'].iloc[test_index]) 588 | 589 | #print('开始第{}轮训练...'.format(i)) 590 | params = { 591 | 'task': 'train', 592 | 'boosting_type': 'gbdt', 593 | 'objective': 'multiclass', 594 | 'num_class':11, 595 | 'max_depth': 8, 596 | 'num_leaves': 150, 597 | 'learning_rate': 0.05, 598 | 'subsample': 0.7, 599 | 'colsample_bytree': 0.7, 600 | 'feature_fraction': 0.9, 601 | 'bagging_fraction': 0.95, 602 | 'bagging_freq': 5, 603 | 'verbose': 0, 604 | 'seed': 66, 605 | } if params is None else params 606 | 607 | clf = lgb.train(params, 608 | lgb_train, 609 | num_boost_round=10000, 610 | valid_sets=lgb_eval, 611 | verbose_eval=50, 612 | early_stopping_rounds=100) 613 | 614 | train_preds[test_index] += clf.predict(train_feat[predictors].iloc[test_index]) 615 | test_preds += clf.predict(test_feat[predictors]) 616 | models.append(clf) 617 | pickle.dump(models, open('xgb_{}.model'.format(datetime.datetime.now().strftime('%Y%m%d_%H%M%S')), '+wb')) 618 | #print('用时{}秒'.format(time.time() - t0)) 619 | return train_preds, test_preds / 5 620 | 621 | 622 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Jul 7 23:08:26 2018 5 | 6 | @author: bmj 7 | """ 8 | 9 | import gc 10 | import time 11 | from time import strftime,gmtime 12 | import numpy as np 13 | import pandas as pd 14 | import os 15 | load = False 16 | cache_path = './cache3/' 17 | 18 | from time import strftime,gmtime 19 | 20 | 21 | def concat(L): 22 | result = None 23 | for l in L: 24 | if result is None: 25 | result = l 26 | else: 27 | try: 28 | result[l.columns.tolist()] = l 29 | except: 30 | print(l.head()) 31 | return result 32 | 33 | 34 | def left_merge(data1,data2,on): 35 | if type(on) != list: 36 | on = [on] 37 | if (set(on) & set(data2.columns)) != set(on): 38 | data2_temp = data2.reset_index() 39 | else: 40 | data2_temp = data2.copy() 41 | columns = [f for f in data2.columns if f not in on] 42 | result = data1.merge(data2_temp,on=on,how='left') 43 | result = result[columns] 44 | return result 45 | 46 | 47 | def get_feat_size(train,size_feat): 48 | """计算A组的数量大小(忽略NaN等价于count)""" 49 | result_path = cache_path + ('_').join(size_feat)+'_feat_count'+'.hdf' 50 | if os.path.exists(result_path) & load: 51 | result = pd.read_hdf(result_path) 52 | else: 53 | result = train[size_feat].groupby(by=size_feat).size().reset_index().rename(columns={0: ('_').join(size_feat)+'_count'}) 54 | result = left_merge(train,result,on=size_feat) 55 | return result 56 | 57 | 58 | def get_feat_size_feat(train,base_feat,other_feat): 59 | """计算唯一计数(等价于unique count)""" 60 | result_path = cache_path + ('_').join(base_feat)+'_count_'+('_').join(other_feat)+'.hdf' 61 | if os.path.exists(result_path) & load: 62 | result = pd.read_hdf(result_path) 63 | else: 64 | result = train[base_feat].groupby(base_feat).size().reset_index()\ 65 | .groupby(other_feat).size().reset_index().rename(columns={0: ('_').join(base_feat)+'_count_'+('_').join(other_feat)}) 66 | result = left_merge(train,result,on=other_feat) 67 | return result 68 | 69 | 70 | def get_feat_stat_feat(train,base_feat,other_feat,stat_list=['min','max','var','size','mean','skew']): 71 | name = ('_').join(base_feat) + '_' + ('_').join(other_feat) + '_' + ('_').join(stat_list) 72 | result_path = cache_path + name +'.hdf' 73 | if os.path.exists(result_path) & load: 74 | result = pd.read_hdf(result_path) 75 | else: 76 | agg_dict = {} 77 | for stat in stat_list: 78 | agg_dict[name+stat] = stat 79 | result = train[base_feat + other_feat].groupby(base_feat)[",".join(other_feat)]\ 80 | .agg(agg_dict) 81 | result = left_merge(train,result,on=base_feat) 82 | return result 83 | 84 | def get_feat_ngroup(train,base_feat): 85 | name = ('_').join(base_feat)+'_ngroup' 86 | result_path = cache_path + ('_').join(base_feat)+'_ngroup'+'.hdf' 87 | if os.path.exists(result_path) & load: 88 | result = pd.read_hdf(result_path, 'w') 89 | else: 90 | train[name] = train.groupby(base_feat).ngroup() 91 | result = train[[name]] 92 | train.drop([name],axis=1,inplace=True) 93 | return result 94 | --------------------------------------------------------------------------------