"""Feature engineering and LightGBM training script (base0326SUB.py).

Done so far:

1. Time handling: convert the raw timestamp to wall-clock time and map the
   hour of day into coarse segments.
2. Combined features, e.g. item sales level combined with item price level.
3. Statistics over combined features, e.g. how many items, item brands, ...
   a user has browsed.
4. Discretise the continuous shop features and combine them.

To do:

1. Refine the combined and global statistical features according to the
   business logic.
2. Add time features based on the hour map.
3. Categorical features.
4. And so on.
"""

import time
import warnings

import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn import preprocessing

warnings.filterwarnings("ignore")

# Columns that are never fed to the model (label, raw list strings, ids that
# only identify the row, and raw/derived timestamps).
_NON_FEATURE_COLUMNS = (
    'is_trade', 'item_category_list', 'item_property_list',
    'predict_category_property', 'instance_id', 'context_id', 'realtime',
    'context_timestamp',
)

# Item attributes that are crossed with user / shop attributes.
_ITEM_ATTRIBUTES = [
    'item_id',
    'item_brand_id', 'item_city_id', 'item_price_level',
    'item_sales_level', 'item_collected_level', 'item_pv_level',
]

# Shop attributes that are crossed with user attributes.
_SHOP_ATTRIBUTES = ['shop_id', 'shop_review_num_level', 'shop_star_level']


def timestamp_datetime(value):
    """Format a Unix timestamp as 'YYYY-MM-DD HH:MM:SS'.

    The conversion uses ``time.localtime``, so the result (and every
    day/hour feature derived from it) depends on the time zone of the
    machine running the script.
    """
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(value))


def _split_tokens(series):
    """Return a Series holding the ';'-separated tokens of each value."""
    return series.map(lambda value: str(value).split(';'))


def _encode_positions(data, column, tokens, positions, encoder):
    """Label-encode the token at each position into ``column + str(pos)``.

    Rows that have no token at a position are encoded as the empty string.
    """
    for pos in positions:
        picked = tokens.map(
            lambda parts, pos=pos: parts[pos] if len(parts) > pos else '')
        data[column + str(pos)] = encoder.fit_transform(picked)


def _flag(series, values):
    """Return 1 where ``series`` is one of ``values`` and 2 elsewhere."""
    return np.where(series.isin(values), 1, 2)


def base_process(data):
    """Add the basic item / user / context / shop features to ``data``.

    The frame is modified in place and also returned.  Id columns are
    replaced by their label-encoded values.
    """
    encoder = preprocessing.LabelEncoder()
    print(
        '--------------------------------------------------------------item--------------------------------------------------------------')
    category_tokens = _split_tokens(data['item_category_list'])
    property_tokens = _split_tokens(data['item_property_list'])
    data['len_item_category'] = category_tokens.map(len)
    data['len_item_property'] = property_tokens.map(len)
    # Position 0 of item_category_list is identical for every row, so it is
    # skipped.
    _encode_positions(data, 'item_category_list', category_tokens,
                      range(1, 3), encoder)
    _encode_positions(data, 'item_property_list', property_tokens,
                      range(10), encoder)
    for col in ['item_id', 'item_brand_id', 'item_city_id']:
        data[col] = encoder.fit_transform(data[col])

    print(
        '--------------------------------------------------------------user--------------------------------------------------------------')
    for col in ['user_id']:
        data[col] = encoder.fit_transform(data[col])
    print('user 0,1 feature')
    # NOTE: the previous version wrote these tests as
    # ``x == 1004 | x == 1005 | ...``.  ``|`` binds tighter than ``==``, so
    # that was a chained comparison against bitwise-ORed values: age0 only
    # flagged 1007 and the occupation/star/page flags never fired.
    data['gender0'] = _flag(data['user_gender_id'], [-1])
    data['age0'] = _flag(data['user_age_level'], [1004, 1005, 1006, 1007])
    data['occupation0'] = _flag(data['user_occupation_id'], [-1, 2003])
    data['star0'] = _flag(data['user_star_level'], [-1, 3000, 3001])

    print(
        '--------------------------------------------------------------context--------------------------------------------------------------')
    data['realtime'] = data['context_timestamp'].apply(timestamp_datetime)
    data['realtime'] = pd.to_datetime(data['realtime'])
    data['day'] = data['realtime'].dt.day
    data['hour'] = data['realtime'].dt.hour
    predict_tokens = _split_tokens(data['predict_category_property'])
    data['len_predict_category_property'] = predict_tokens.map(len)
    _encode_positions(data, 'predict_category_property', predict_tokens,
                      range(5), encoder)
    print('context 0,1 feature')
    data['context_page0'] = _flag(data['context_page_id'],
                                  [4001, 4002, 4003, 4004, 4007])

    print(
        '--------------------------------------------------------------shop--------------------------------------------------------------')
    for col in ['shop_id']:
        data[col] = encoder.fit_transform(data[col])
    # 0 when the delivery score lies in [0.96, 0.98], 1 otherwise.
    data['shop_score_delivery0'] = np.where(
        data['shop_score_delivery'].between(0.96, 0.98), 0, 1)
    return data


def map_hour(x):
    """Map an hour to a segment: 1 for 7-12, 2 for 13-20, 3 otherwise."""
    if 7 <= x <= 12:
        return 1
    if 13 <= x <= 20:
        return 2
    return 3


def _score_bucket(x, start, step, stop, missing):
    """Return the 1-based bucket of ``x`` on a fixed-width grid.

    Bucket ``i + 1`` covers ``[start + step * (i - 1), start + step * i]``
    for ``i`` in ``range(1, stop)``; the first matching bucket wins.  A value
    equal to ``missing`` maps to bucket 1.  Anything else yields ``None``.
    """
    for i in range(1, stop):
        if start + step * (i - 1) <= x <= start + step * i:
            return i + 1
    if x == missing:
        return 1
    return None


def _three_way(x, low, high):
    """Return 1 if ``x`` is within ``low``, 2 if within ``high``, else 3.

    ``low`` and ``high`` are inclusive ``(min, max)`` pairs.  Values that
    compare false everywhere (e.g. NaN) fall through to 3.
    """
    if low[0] <= x <= low[1]:
        return 1
    if high[0] <= x <= high[1]:
        return 2
    return 3


def deliver(x):
    """Bucket a delivery score (already scaled by 5) in steps of 0.1 from 4.1.

    The sentinel -5 (a scaled -1) maps to bucket 1.
    """
    return _score_bucket(x, 4.1, 0.1, 20, -5)


def deliver1(x):
    """Collapse a delivery bucket: 2-4 -> 1, 5-7 -> 2, anything else -> 3."""
    return _three_way(x, (2, 4), (5, 7))


def review(x):
    """Bucket a positive-review rate in steps of 0.02 from 0.714.

    The sentinel -1 maps to bucket 1.
    """
    return _score_bucket(x, 0.714, 0.02, 30, -1)


def review1(x):
    """Collapse a review bucket: 2-12 -> 1, 13-15 -> 2, anything else -> 3."""
    return _three_way(x, (2, 12), (13, 15))


def service(x):
    """Bucket a service score in steps of 0.1 from 3.93.

    The sentinel is compared literally against -1.
    """
    return _score_bucket(x, 3.93, 0.1, 20, -1)


def service1(x):
    """Collapse a service bucket: 2-7 -> 1, 8-9 -> 2, anything else -> 3."""
    return _three_way(x, (2, 7), (8, 9))


def describe(x):
    """Bucket a description score in steps of 0.1 from 3.93.

    The sentinel is compared literally against -1.
    """
    return _score_bucket(x, 3.93, 0.1, 30, -1)


def describe1(x):
    """Collapse a description bucket: 2-8 -> 1, 9-10 -> 2, else -> 3."""
    return _three_way(x, (2, 8), (9, 10))


def shijian(data):
    """Add ``hour_map``, the coarse segment of the ``hour`` column."""
    data['hour_map'] = data['hour'].apply(map_hour)
    return data


def shop_fenduan(data):
    """Discretise the shop scores and add the ``normal_shop`` flag.

    The three shop score columns are multiplied by 5.  Rows whose score is
    the missing sentinel (-5 after scaling, or -1 for the positive review
    rate) are DROPPED from the returned frame.  ``normal_shop`` is 1 when all
    four three-way grades equal 3 and 0 otherwise; the intermediate grade
    columns are removed again.

    The input frame is left untouched; a new frame is returned.
    """
    scaled_scores = (
        ('shop_score_delivery', 'deliver_map', deliver, deliver1),
        ('shop_score_service', 'service_map', service, service1),
        ('shop_score_description', 'de_map', describe, describe1),
    )
    for score_col, grade_col, to_bucket, to_grade in scaled_scores:
        keep = data[score_col] * 5 != -5
        # Explicit copy: the filtered frame gets new columns assigned below.
        data = data[keep].copy()
        data[score_col] = data[score_col] * 5
        # Two separate apply passes on purpose: an unmatched score becomes a
        # missing value after the first pass and grade 3 after the second.
        data[grade_col] = data[score_col].apply(to_bucket)
        data[grade_col] = data[grade_col].apply(to_grade)
        print(data[grade_col].value_counts())

    data = data[data['shop_review_positive_rate'] != -1].copy()
    data['review_map'] = data['shop_review_positive_rate'].apply(review)
    data['review_map'] = data['review_map'].apply(review1)
    print(data.review_map.value_counts())

    grade_cols = ['deliver_map', 'service_map', 'de_map', 'review_map']
    all_grade_three = (data[grade_cols] == 3).all(axis=1)
    data['normal_shop'] = all_grade_three.astype(int)
    for col in ['de_map', 'service_map', 'deliver_map', 'review_map']:
        del data[col]
    return data


def _history_counts(data, days, suffix, cumulative):
    """Attach per-user/item/shop row counts taken from earlier days.

    For every day ``d`` in ``days`` the rows of day ``d`` receive
    ``user_cnt<suffix>``, ``item_cnt<suffix>`` and ``shop_cnt<suffix>``: the
    number of rows with the same id on day ``d - 1`` (``cumulative=False``)
    or on any day before ``d`` (``cumulative=True``).  Ids unseen in the
    history get 0; rows whose day is not in ``days`` get NaN from the left
    merge on ``instance_id``.
    """
    pieces = []
    for d in days:
        if cumulative:
            history = data[data['day'] < d]
        else:
            history = data[data['day'] == d - 1]
        current = data[data['day'] == d]
        piece = pd.DataFrame(index=current.index)
        for entity in ('user', 'item', 'shop'):
            key = entity + '_id'
            seen = history.groupby(by=key).count()['instance_id'].to_dict()
            piece[entity + '_cnt' + suffix] = current[key].apply(
                lambda x, seen=seen: seen.get(x, 0))
        piece['instance_id'] = current['instance_id']
        pieces.append(piece)
    return pd.merge(data, pd.concat(pieces), on=['instance_id'], how='left')


def slide_cnt(data):
    """Add sliding-window count features for days 19 to 25.

    Adds ``*_cnt1`` (counts on the previous day) and ``*_cntx`` (counts on
    all earlier days) for user, item and shop ids.  Expects ``instance_id``
    to be unique, because the counts are merged back on it.
    """
    print('当前日期前一天的cnt')
    data = _history_counts(data, range(19, 26), '1', cumulative=False)
    print('当前日期之前的cnt')
    # Days 19 to 25; day 25 is the test day.
    data = _history_counts(data, range(19, 26), 'x', cumulative=True)

    print("前一个小时的统计量")

    return data


def zuhe(data):
    """Add pairwise combinations of item, user and shop level columns.

    Levels are concatenated as decimal strings and converted back to int.
    NOTE(review): concatenation is not injective when the operands have
    different digit counts (e.g. '1' + '12' equals '11' + '2') — confirm this
    is acceptable for the level ranges in the data.

    The frame is modified in place and also returned.
    """
    user_cols = ['user_gender_id', 'user_age_level', 'user_occupation_id',
                 'user_star_level']
    item_cols = ['item_sales_level', 'item_price_level',
                 'item_collected_level']
    shop_cols = ['shop_review_num_level', 'shop_star_level']

    # The missing sentinel -1 would break the string concatenation below.
    for col in user_cols:
        data[col] = data[col].where(data[col] != -1, 0)

    for col in item_cols + user_cols + shop_cols:
        data[col] = data[col].astype(str)

    print('item两两组合')
    data['sale_price'] = data['item_sales_level'] + data['item_price_level']
    data['sale_collect'] = data['item_sales_level'] + data['item_collected_level']
    data['price_collect'] = data['item_price_level'] + data['item_collected_level']

    print('user两两组合')
    data['gender_age'] = data['user_gender_id'] + data['user_age_level']
    data['gender_occ'] = data['user_gender_id'] + data['user_occupation_id']
    data['gender_star'] = data['user_gender_id'] + data['user_star_level']

    print('shop两两组合')
    data['review_star'] = data['shop_review_num_level'] + data['shop_star_level']

    combined = ['sale_price', 'sale_collect', 'price_collect',
                'gender_age', 'gender_occ', 'gender_star', 'review_star']
    for col in item_cols + user_cols + shop_cols + combined:
        data[col] = data[col].astype(int)

    # review_star is built and then removed again, as in the original script.
    del data['review_star']

    return data


def _add_group_ratios(data, parent, children, total_col, tag,
                      add_total=True, drop_total=True):
    """Add co-occurrence counts and ratios of ``children`` within ``parent``.

    For every ``col`` in ``children`` two columns are added:

    * ``<col>_<tag>_cnt``  - rows sharing both ``col`` and ``parent`` values;
    * ``<col>_<tag>_prob`` - that count divided by ``total_col``.

    ``total_col`` (rows per ``parent`` value) is computed first when
    ``add_total`` is true, otherwise it must already exist in ``data``.  It
    is removed at the end when ``drop_total`` is true.

    ``groupby(...).transform('count')`` replaces the former
    ``agg({name: 'count'})`` + merge: dict renaming on a SeriesGroupBy was
    removed in pandas 1.0, and ``transform`` yields the same per-row counts
    without a join.  ``data`` is modified in place and returned.
    """
    if add_total:
        data[total_col] = data.groupby(parent)['instance_id'].transform('count')
    for col in children:
        cnt_col = str(col) + '_' + tag + '_cnt'
        data[cnt_col] = data.groupby([col, parent])['instance_id'].transform('count')
        data[str(col) + '_' + tag + '_prob'] = data[cnt_col] / data[total_col]
    if drop_total:
        del data[total_col]
    return data


def item(data):
    """Add count/ratio features between item attributes.

    Returns a new frame; the input is not modified.
    """
    data = data.copy()
    # (message, parent column, child columns, total column, name tag)
    plan = [
        ('一个item有多少brand,price salse collected level……',
         'item_id',
         ['item_brand_id', 'item_city_id', 'item_price_level',
          'item_sales_level', 'item_collected_level', 'item_pv_level'],
         'item_cnt', 'item'),
        ('一个brand有多少price salse collected level……',
         'item_brand_id',
         ['item_city_id', 'item_price_level', 'item_sales_level',
          'item_collected_level', 'item_pv_level'],
         'item_brand_cnt', 'brand'),
        ('一个city有多少item_price_level,item_sales_level,item_collected_level,item_pv_level',
         'item_city_id',
         ['item_price_level', 'item_sales_level', 'item_collected_level',
          'item_pv_level'],
         'item_city_cnt', 'city'),
        # Fixed: this section used to group on item_city_id (copy-paste
        # slip), so the *_price_* features were not per price level at all.
        ('一个price有多少item_sales_level,item_collected_level,item_pv_level',
         'item_price_level',
         ['item_sales_level', 'item_collected_level', 'item_pv_level'],
         'item_price_cnt', 'price'),
        ('一个item_sales_level有多少item_collected_level,item_pv_level',
         'item_sales_level',
         ['item_collected_level', 'item_pv_level'],
         'item_salse_cnt', 'salse'),
        ('一个item_collected_level有多少item_pv_level',
         'item_collected_level',
         ['item_pv_level'],
         'item_coll_cnt', 'coll'),
    ]
    for message, parent, children, total_col, tag in plan:
        print(message)
        data = _add_group_ratios(data, parent, children, total_col, tag)
    return data


def user(data):
    """Add count/ratio features between user attributes.

    Returns a new frame; the input is not modified.
    """
    data = data.copy()
    # (message, parent column, child columns, total column, name tag)
    plan = [
        ('用户有多少性别',
         'user_id',
         ['user_gender_id', 'user_age_level', 'user_occupation_id',
          'user_star_level'],
         'user_cnt', 'user'),
        ('性别的年龄段,职业有多少',
         'user_gender_id',
         ['user_age_level', 'user_occupation_id', 'user_star_level'],
         'user_gender_cnt', 'user_gender'),
        ('user_age_level对应的user_occupation_id,user_star_level',
         'user_age_level',
         ['user_occupation_id', 'user_star_level'],
         'user_age_cnt', 'user_age'),
        ('user_occupation_id对应的user_star_level',
         'user_occupation_id',
         ['user_star_level'],
         'user_occ_cnt', 'user_occ'),
    ]
    for message, parent, children, total_col, tag in plan:
        print(message)
        data = _add_group_ratios(data, parent, children, total_col, tag)
    return data


# (parent column, total column, name tag, label used in progress messages)
_USER_PARENTS = [
    ('user_id', 'user_cnt', 'user', 'user'),
    ('user_gender_id', 'user_gender_cnt', 'user_gender', 'user_gender'),
    ('user_age_level', 'user_age_cnt', 'user_age', 'user_age_level'),
    ('user_occupation_id', 'user_occ_cnt', 'user_occ', 'user_occupation_id'),
]


def user_item(data):
    """Add count/ratio features of item attributes per user attribute.

    The per-parent totals (``user_cnt``, ``user_gender_cnt``,
    ``user_age_cnt``, ``user_occ_cnt``) are deliberately LEFT in the result:
    ``user_shop`` reuses them and removes them afterwards.

    Returns a new frame; the input is not modified.
    """
    data = data.copy()
    for parent, total_col, tag, label in _USER_PARENTS:
        # The occupation message historically ends with a single ellipsis.
        tail = '…' if parent == 'user_occupation_id' else '……'
        print('一个' + label + '有多少item_id,item_brand_id' + tail)
        data = _add_group_ratios(data, parent, _ITEM_ATTRIBUTES, total_col,
                                 tag, drop_total=False)
    return data


def user_shop(data):
    """Add count/ratio features of shop attributes per user attribute.

    Must run after ``user_item``: it divides by the ``user_cnt``,
    ``user_gender_cnt``, ``user_age_cnt`` and ``user_occ_cnt`` columns that
    ``user_item`` leaves behind (a ``KeyError`` is raised otherwise) and
    deletes them once used.

    Returns a new frame; the input is not modified.
    """
    data = data.copy()
    for parent, total_col, tag, label in _USER_PARENTS:
        print('一个' + label + '有多少shop_id,shop_review_num_level……')
        data = _add_group_ratios(data, parent, _SHOP_ATTRIBUTES, total_col,
                                 tag, add_total=False)
    return data


def shop_item(data):
    """Add count/ratio features of item attributes per shop attribute.

    Only ``shop_id`` and ``shop_review_num_level`` are used as parents; the
    analogous ``shop_star_level`` features were disabled in the original
    script and stay disabled.

    Returns a new frame; the input is not modified.
    """
    data = data.copy()
    # (parent column, total column, name tag, label used in the message)
    plan = [
        ('shop_id', 'shop_cnt', 'shop', 'shop'),
        ('shop_review_num_level', 'shop_rev_cnt', 'shop_rev',
         'shop_review_num_level'),
    ]
    for parent, total_col, tag, label in plan:
        print('一个' + label
              + '有多少item_id,item_brand_id,item_city_id,item_price_level……')
        data = _add_group_ratios(data, parent, _ITEM_ATTRIBUTES, total_col,
                                 tag)
    return data


def _feature_columns(frame):
    """Return the columns of ``frame`` that are used as model features."""
    return [c for c in frame if c not in _NON_FEATURE_COLUMNS]


def _build_classifier(n_estimators):
    """Create the LightGBM classifier shared by validation and submission."""
    return lgb.LGBMClassifier(
        objective='binary',
        num_leaves=35,
        # Was ``depth=8``, which LightGBM does not recognise and ignores;
        # ``max_depth`` is the actual parameter name.
        max_depth=8,
        learning_rate=0.05,
        seed=2018,
        colsample_bytree=0.8,
        subsample=0.9,
        n_estimators=n_estimators)


def _print_feature_importance(model, columns):
    """Print the model's feature importances, most important first."""
    # ``feature_importance()`` is a Booster method; the scikit-learn wrapper
    # exposes the values through the ``feature_importances_`` attribute.
    ranking = pd.Series(model.feature_importances_, list(columns))
    ranking = ranking.sort_values(ascending=False)
    print(ranking)
    print(ranking.shape)


def lgbCV(train, test):
    """Fit on ``train``, early-stop on ``test`` and return the best iteration.

    Both frames need the ``is_trade`` label.  Prints the feature importances
    and the log loss on ``test``.  Neither frame is modified.
    """
    col = _feature_columns(train)
    X = train[col]
    y = train['is_trade'].values
    X_tes = test[col]
    y_tes = test['is_trade'].values
    print('Training LGBM model...')
    # Early stopping is passed as a callback: the ``early_stopping_rounds``
    # argument of ``fit`` was removed in LightGBM 4.
    lgb_model = _build_classifier(20000).fit(
        X, y, eval_set=[(X_tes, y_tes)],
        callbacks=[lgb.early_stopping(stopping_rounds=200)])
    best_iter = lgb_model.best_iteration_
    _print_feature_importance(lgb_model, X.columns)
    pred = lgb_model.predict_proba(X_tes)[:, 1]
    print('误差 ', log_loss(y_tes, pred))
    return best_iter


def sub(train, test, best_iter):
    """Train on all labelled rows and write the submission file.

    Predictions are merged onto the full list of test instances re-read from
    ``input/test.txt``; instances that were dropped during feature
    engineering get a score of 0.  The result is written to
    ``result/result0326.txt`` (space separated).
    """
    col = _feature_columns(train)
    X = train[col]
    y = train['is_trade'].values
    print('Training LGBM model...')
    lgb_model = _build_classifier(best_iter).fit(X, y)
    _print_feature_importance(lgb_model, X.columns)

    scored = test[['instance_id']].copy()
    scored['predicted_score'] = lgb_model.predict_proba(test[col])[:, 1]

    submission = pd.read_csv("input/test.txt", sep=r"\s+")
    submission = pd.merge(submission, scored, on=['instance_id'], how='left')
    submission = submission.fillna(0)
    submission[['instance_id', 'predicted_score']].to_csv(
        'result/result0326.txt', sep=" ", index=False)


def main():
    """Run feature engineering, offline validation and the final submission."""
    train = pd.read_csv("input/train.txt", sep=r"\s+")
    test = pd.read_csv("input/test.txt", sep=r"\s+")
    data = pd.concat([train, test])
    # De-duplicate on instance_id; later merges rely on it being unique.
    data = data.drop_duplicates(subset='instance_id')
    print('make feature')
    data = base_process(data)
    data = shijian(data)
    data = shop_fenduan(data)
    data = slide_cnt(data)
    data = zuhe(data)
    print('----------------------------全局统计特征---------------------------------------------------')
    data = item(data)
    data = user(data)
    data = user_item(data)
    data = user_shop(data)
    data = shop_item(data)

    # Offline: train on days 18-23, validate on day 24.
    train = data[(data['day'] >= 18) & (data['day'] <= 23)]
    test = data[(data['day'] == 24)]
    best_iter = lgbCV(train, test)

    # Online: train on every labelled row, predict the unlabelled ones.
    train = data[data.is_trade.notnull()]
    test = data[data.is_trade.isnull()]
    sub(train, test, best_iter)


if __name__ == "__main__":
    main()