├── README.md ├── com_uitl.py ├── ctrip_test_all_data_v29_lgb_feature.py ├── ctrip_test_all_data_v31_lgb_feature.py ├── ctrip_test_all_data_v42.py ├── ctrip_test_all_data_v43.py ├── data └── README.md ├── stage_1_数据预处理.py ├── stage_2_特征构造及预测.bat ├── stage_3_融合提交.py ├── try ├── README.md └── leak.zip ├── 代码说明.txt └── 最高得分结果文件.csv /README.md: -------------------------------------------------------------------------------- 1 | 比赛地址和数据:https://www.kesci.com/apps/home/#!/competition/58dba69775722d38fa2dfcf6/content/0 2 | 3 | 通过分析,我们发现数据集是按照orderdate_lastord字段排序过的,所以为了保证抽取的样本和原样本相似,我们将训练样本分成:02468 和13579 4 | 5 | 主要从uid,basicroomid,roomid三个主体进行特征构造。 6 | 7 | 1.uid:用户上次订单的相关信息反映了用户的偏好,通过对比这次订单和上次订单的差异来构造特征。 8 | 9 | 2.basicroomid:通过构造basicroom的特征,模型才可以通过比较不同basicroom的相似性和差异性从训练中学习到用户为什么选择该basicroom。 10 | 11 | 3.roomid:通过构造room的特征,原因同上。 12 | 13 | 通过分析,我们还大胆的猜测了roomtag_1其实是携程的推荐,标签应该是猜你喜欢。我们的模型其实已经相当于stacking的第一层。 14 | 所以我们也根据这种思路进行融合。 15 | 我们在02468训练,然后预测13579以及test上的所有数据集。得到一个新特征:prob 16 | 17 | 因为题目要求预测7天的数据,所以前6天的数据在这些特征上是穿越的。 18 | 所以我们将预测分成了两个模型来分开预测:前6天&第7天 19 | 20 | 在以上特征的基础上使用lgb进行训练,并未继续做任何融合。 21 | -------------------------------------------------------------------------------- /com_uitl.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | def merge_count(df,columns,value,cname): 3 | add = pd.DataFrame(df.groupby(columns)[value].count()).reset_index() 4 | add.columns=columns+[cname] 5 | df=df.merge(add,on=columns,how="left") 6 | return df 7 | 8 | def merge_nunique(df,columns,value,cname): 9 | add = pd.DataFrame(df.groupby(columns)[value].nunique()).reset_index() 10 | add.columns=columns+[cname] 11 | df=df.merge(add,on=columns,how="left") 12 | return df 13 | 14 | def merge_median(df,columns,value,cname): 15 | add = pd.DataFrame(df.groupby(columns)[value].median()).reset_index() 16 | add.columns=columns+[cname] 17 | df=df.merge(add,on=columns,how="left") 18 | return df 19 | 20 | def merge_mean(df,columns,value,cname): 21 | add = pd.DataFrame(df.groupby(columns)[value].mean()).reset_index() 22 | add.columns=columns+[cname] 23 | df=df.merge(add,on=columns,how="left") 24 | return df 25 | 26 | def merge_sum(df,columns,value,cname): 27 | add = pd.DataFrame(df.groupby(columns)[value].sum()).reset_index() 28 | add.columns=columns+[cname] 29 | df=df.merge(add,on=columns,how="left") 30 | return df 31 | 32 | def merge_max(df,columns,value,cname): 33 | add = pd.DataFrame(df.groupby(columns)[value].max()).reset_index() 34 | add.columns=columns+[cname] 35 | df=df.merge(add,on=columns,how="left") 36 | return df 37 | 38 | def merge_min(df,columns,value,cname): 39 | add = pd.DataFrame(df.groupby(columns)[value].min()).reset_index() 40 | add.columns=columns+[cname] 41 | df=df.merge(add,on=columns,how="left") 42 | return df 43 | 44 | def merge_std(df,columns,value,cname): 45 | add = pd.DataFrame(df.groupby(columns)[value].std()).reset_index() 46 | add.columns=columns+[cname] 47 | df=df.merge(add,on=columns,how="left") 48 | return df 49 | 50 | -------------------------------------------------------------------------------- /ctrip_test_all_data_v29_lgb_feature.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | import pandas as pd 3 | import lightgbm as lgb 4 | from com_uitl import * 5 | 6 | 7 | # 每个basicid价格的中位数 8 | def df_median(df): 9 | add = pd.DataFrame(df.groupby(["orderid", "basicroomid"]).price_deduct.median()).reset_index() 10 | add.columns = ["orderid", "basicroomid", "basicroomid_price_deduct_median"] 11 | df = df.merge(add, on=["orderid", "basicroomid"], how="left") 12 | return df 13 | 14 | # 每个basicid价格的最小值 15 | def df_min(df): 16 | add = pd.DataFrame(df.groupby(["orderid", "basicroomid"]).price_deduct.min()).reset_index() 17 | add.columns = ["orderid", "basicroomid", "basicroomid_price_deduct_min"] 18 | df = df.merge(add, on=["orderid", "basicroomid"], how="left") 19 | return df 20 | 21 | # 每个orderid价格的最小值 22 | def df_min_orderid(df): 23 | add = pd.DataFrame(df.groupby(["orderid"]).price_deduct.min()).reset_index() 24 | add.columns = ["orderid", "orderid_price_deduct_min"] 25 | df = df.merge(add, on=["orderid"], how="left") 26 | return df 27 | 28 | #排序特征 29 | def df_rank_mean(df): 30 | add = pd.DataFrame(df.groupby(["basicroomid"]).orderid_price_deduct_min_rank.mean()).reset_index() 31 | add.columns = ["basicroomid","orderid_price_deduct_min_rank_mean"] 32 | df = df.merge(add, on=["basicroomid"], how="left") 33 | return df 34 | 35 | def df_roomrank_mean(df): 36 | add = pd.DataFrame(df.groupby(["roomid"]).basicroomid_price_rank.mean()).reset_index() 37 | add.columns = ["roomid","basicroomid_price_rank_mean"] 38 | df = df.merge(add, on=["roomid"], how="left") 39 | return df 40 | 41 | 42 | #添加转化率特征 43 | leak=pd.read_csv("try/leak.csv") 44 | #提取basicroomid的转化率 45 | feature_df=leak[["orderid","basicroomid","orderlabel"]].copy() 46 | feature_df.sort_values("orderlabel") 47 | feature_df=feature_df.drop_duplicates(["orderid","basicroomid"],keep="last") 48 | basicroom_mean=pd.DataFrame(feature_df.groupby("basicroomid").orderlabel.mean()).reset_index() 49 | basicroom_mean.columns=["basicroomid","basicroomid_mean"] 50 | 51 | basicroom_sum=pd.DataFrame(feature_df.groupby("basicroomid").orderlabel.sum()).reset_index() 52 | basicroom_sum.columns=["basicroomid","basicroomid_sum"] 53 | 54 | del leak 55 | 56 | #basic的trick 57 | trick_1=pd.read_csv("feature_3days_train.csv").append(pd.read_csv("feature_3days_test.csv")) 58 | trick_2=trick_1.copy() 59 | trick_3=trick_1.copy() 60 | trick_1.columns=["hotelid","basicroomid","orderdate","trick_1"] 61 | trick_2.columns=["hotelid","basicroomid","orderdate","trick_2"] 62 | trick_3.columns=["hotelid","basicroomid","orderdate","trick_3"] 63 | trick_1["orderdate"]=trick_1["orderdate"]-1 64 | trick_2["orderdate"]=trick_2["orderdate"]-2 65 | trick_3["orderdate"]=trick_3["orderdate"]-3 66 | #room的trick 67 | trick_room=pd.read_csv("feature_room_30days_ordnumratio_train.csv").append(pd.read_csv("feature_room_30days_ordnumratio_test.csv")) 68 | trick_room.columns=["hotelid","roomid","orderdate","trick_room"] 69 | trick_room["orderdate"]=trick_room["orderdate"]-1 70 | #basic七天的的trick 71 | trick_basic_7d=pd.read_csv("feature_basic_7days_train.csv").append(pd.read_csv("feature_basic_7days_test.csv")) 72 | trick_basic_7d.columns=["hotelid","basicroomid","orderdate","trick_basic_7d"] 73 | trick_basic_7d["orderdate"]=trick_basic_7d["orderdate"]-1 74 | #basic30天的trick 75 | trick_basic_30d=pd.read_csv("feature_basic_30days_train.csv").append(pd.read_csv("feature_basic_30days_test.csv")) 76 | trick_basic_30d_fill=trick_basic_30d.copy() 77 | trick_basic_30d.columns=["hotelid","basicroomid","orderdate","trick_basic_30d"] 78 | trick_basic_30d["orderdate"]=trick_basic_30d["orderdate"]-1 79 | #trick_basic_realratio 80 | trick_basic_realratio=pd.read_csv("feature_basic_30days_realratio_train.csv").append(pd.read_csv("feature_basic_30days_realratio_test.csv")) 81 | trick_basic_realratio.columns=["basicroomid","orderdate","trick_basic_30days_realratio"] 82 | trick_basic_realratio["orderdate"]=trick_basic_realratio["orderdate"]-1 83 | #trick_room_realratio 84 | trick_room_realratio=pd.read_csv("feature_room_30days_realratio_train.csv").append(pd.read_csv("feature_room_30days_realratio_test.csv")) 85 | trick_room_realratio.columns=["roomid","orderdate","trick_room_30days_realratio"] 86 | trick_room_realratio["orderdate"]=trick_room_realratio["orderdate"]-1 87 | 88 | 89 | #构造特征 90 | #for j in range(10): 91 | for j in [0,2,4,6,8]: 92 | #for j in [1,3,5,7,9]: 93 | print j 94 | all=pd.read_csv("try/offline_%s.csv"%j) 95 | 96 | # test中的basic_30days_ordnumratio数据异常,需要重新设置 97 | del all["basic_30days_ordnumratio"] 98 | all = all.merge(trick_basic_30d_fill, on=["hotelid", "basicroomid", "orderdate"], how="left") 99 | # 使用穿越特征 100 | all = all.merge(trick_1, on=["hotelid", "basicroomid", "orderdate"], how="left") 101 | all = all.merge(trick_2, on=["hotelid", "basicroomid", "orderdate"], how="left") 102 | all = all.merge(trick_3, on=["hotelid", "basicroomid", "orderdate"], how="left") 103 | all["basic_trick_today"] = all["trick_1"] - all["basic_recent3_ordernum_ratio"] 104 | 105 | all = all.merge(trick_room, on=["hotelid", "roomid", "orderdate"], how="left") 106 | all["room_trick_today"] = all["trick_room"] - all["room_30days_ordnumratio"] 107 | 108 | all = all.merge(trick_basic_7d, on=["hotelid", "basicroomid", "orderdate"], how="left") 109 | all["basic_7d_trick_today"] = all["trick_basic_7d"] - all["basic_week_ordernum_ratio"] 110 | 111 | all = all.merge(trick_basic_30d, on=["hotelid", "basicroomid", "orderdate"], how="left") 112 | all["basic_30d_trick_today"] = all["trick_basic_30d"] - all["basic_30days_ordnumratio"] 113 | 114 | all = all.merge(trick_basic_realratio, on=["basicroomid", "orderdate"], how="left") 115 | all["basic_30days_realratio_today"] = all["trick_basic_30days_realratio"] - all["basic_30days_realratio"] 116 | all = all.merge(trick_room_realratio, on=["roomid", "orderdate"], how="left") 117 | all["room_30days_realratio_today"] = all["trick_room_30days_realratio"] - all["room_30days_realratio"] 118 | 119 | all["roomid_ori"] = all["roomid"] 120 | all["roomid"] = map(lambda x, y: int(str(x)[:-len(str(y))]), all["roomid"], all["rank"]) 121 | 122 | all = all.merge(basicroom_mean, on="basicroomid", how="left").fillna(0) 123 | all = all.merge(basicroom_sum, on="basicroomid", how="left").fillna(0) 124 | 125 | all=df_median(all) 126 | all=df_min(all) 127 | all=df_min_orderid(all) 128 | 129 | all["basicroomid_price_rank"] = all['price_deduct'].groupby([all['orderid'], all['basicroomid']]).rank() 130 | all["orderid_price_deduct_min_rank"] = all['orderid_price_deduct_min'].groupby(all['orderid']).rank() 131 | 132 | all = df_rank_mean(all) 133 | all = df_roomrank_mean(all) 134 | 135 | 136 | #添加新特征20170527 137 | #平均值 138 | all=merge_mean(all,["basicroomid"],"basic_week_ordernum_ratio","basic_week_ordernum_ratio_mean") 139 | all=merge_mean(all,["basicroomid"],"basic_recent3_ordernum_ratio","basic_recent3_ordernum_ratio_mean") 140 | all=merge_mean(all,["basicroomid"],"basic_comment_ratio","basic_comment_ratio_mean") 141 | all=merge_mean(all,["basicroomid"],"basic_30days_ordnumratio","basic_30days_ordnumratio_mean") 142 | all=merge_mean(all,["basicroomid"],"basic_30days_realratio","basic_30days_realratio_mean") 143 | all=merge_mean(all,["roomid"],"room_30days_ordnumratio","room_30days_ordnumratio_mean") 144 | all=merge_mean(all,["roomid"],"room_30days_realratio","room_30days_realratio_mean") 145 | 146 | 147 | all["city_num"]=all["user_ordernum"]/all["user_citynum"] 148 | all["area_price"]=all["user_avgprice"]/all["user_avgroomarea"] 149 | all["price_max_min_rt"]=all["user_maxprice"]/all["user_minprice"] 150 | all["basicroomid_price_deduct_min_minprice_rt"]=all["basicroomid_price_deduct_min"]/all["user_minprice"] 151 | 152 | all["price_dif"]=all["basicroomid_price_deduct_min"]-all["price_deduct"] 153 | all["price_dif_hotel"]=all["basicroomid_price_deduct_min"]-all["hotel_minprice_lastord"] 154 | all["price_dif_basic"]=all["basicroomid_price_deduct_min"]-all["basic_minprice_lastord"] 155 | 156 | all["price_dif_rt"]=all["basicroomid_price_deduct_min"]/all["price_deduct"] 157 | all["price_dif_hotel_rt"]=all["basicroomid_price_deduct_min"]/all["hotel_minprice_lastord"] 158 | all["price_dif_basic_rt"]=all["basicroomid_price_deduct_min"]/all["basic_minprice_lastord"] 159 | 160 | all["price_dif_hotel"]=all["orderid_price_deduct_min"]-all["price_deduct"] 161 | all["price_dif_hotel_hotel"]=all["orderid_price_deduct_min"]-all["hotel_minprice_lastord"] 162 | all["price_dif_basic_hotel"]=all["orderid_price_deduct_min"]-all["basic_minprice_lastord"] 163 | 164 | all["price_dif_hotel_rt"]=all["orderid_price_deduct_min"]/all["price_deduct"] 165 | all["price_dif_hotel_hotel_rt"]=all["orderid_price_deduct_min"]/all["hotel_minprice_lastord"] 166 | all["price_dif_basic_hotel_rt"]=all["orderid_price_deduct_min"]/all["basic_minprice_lastord"] 167 | 168 | #all["order_basic_minprice_dif"]=all["basicroomid_price_deduct_min"]-all["orderid_price_deduct_min"] 169 | all["order_basic_minprice_rt"]=all["basicroomid_price_deduct_min"]/all["orderid_price_deduct_min"] 170 | #all["hotel_basic_minprice_lastord_rt"]=all["basic_minprice_lastord"]/all["hotel_minprice_lastord"] 171 | 172 | #上次订购的价格和当时最低价的比 173 | all["hotel_last_price_min_rt"]=all["price_last_lastord"]/all["hotel_minprice_lastord"] 174 | all["basic_last_price_min_rt"]=all["price_last_lastord"]/all["basic_minprice_lastord"] 175 | all["hotel_last_price_min_dif"]=all["price_last_lastord"]-all["hotel_minprice_lastord"] 176 | all["basic_last_price_min_dif"]=all["price_last_lastord"]-all["basic_minprice_lastord"] 177 | 178 | 179 | all["price_tail1"]=all["price_deduct"]%10 180 | all["price_tail1"]=map(lambda x:1 if x==4 or x==7 else 0,all["price_tail1"]) 181 | #all["price_tail2"]=all["price_deduct"]%100 182 | all["basic_equal"]=map(lambda x,y:1 if x==y else 0,all["basicroomid"],all["basicroomid_lastord"]) 183 | #del all["basicroomid_lastord"] 184 | all["room_equal"]=map(lambda x,y:1 if x==y else 0,all["roomid"],all["roomid_lastord"]) 185 | #del all["roomid_lastord"] 186 | all["hotel_equal"]=map(lambda x,y:1 if x==y else 0,all["hotelid"],all["hotelid_lastord"]) 187 | #del all["hotelid_lastord"] 188 | all["rank_equal"]=map(lambda x,y:1 if x==y else 0,all["rank"],all["rank_lastord"]) 189 | 190 | #价格高低 191 | all["price_dx"] = map(lambda x, y: x-y, all["price_deduct"], all["price_last_lastord"]) 192 | 193 | all["return_dx"] = map(lambda x, y: x-y, all["returnvalue"], all["return_lastord"]) 194 | 195 | all["price_ori"] = map(lambda x, y:x+y, all["price_deduct"], all["returnvalue"]) 196 | 197 | 198 | for i in [2,3,4,5,6,8]: 199 | all["service_equal_%s"%i] = map(lambda x, y: 1 if x == y else 0, all["roomservice_%s"%i], all["roomservice_%s_lastord"%i]) 200 | del all["roomservice_%s_lastord"%i] 201 | 202 | for i in [2,3,4,5,6]: 203 | all["roomtag_equal_%s"%i] = map(lambda x, y: 1 if x == y else 0, all["roomtag_%s"%i], all["roomtag_%s_lastord"%i]) 204 | del all["roomtag_%s_lastord"%i] 205 | 206 | for i in [1,2,3,4,5,6,7,8,9,10,11]: 207 | all["ordertype_%s_num"%i] = map(lambda x, y:x*y, all["ordertype_%s_ratio"%i], all["user_ordernum"]) 208 | del all["ordertype_%s_ratio"%i] 209 | 210 | #所有的 211 | for c in ["orderbehavior_1_ratio","orderbehavior_2_ratio","orderbehavior_6_ratio","orderbehavior_7_ratio", 212 | #"user_roomservice_4_0ratio","user_roomservice_4_1ratio","user_roomservice_4_2ratio","user_roomservice_4_3ratio","user_roomservice_4_4ratio","user_roomservice_4_5ratio","user_roomservice_3_123ratio","user_roomservice_6_2ratio","user_roomservice_6_1ratio","user_roomservice_6_0ratio","user_roomservice_5_1ratio","user_roomservice_7_0ratio","user_roomservice_2_1ratio","user_roomservice_8_1ratio","user_roomservice_5_345ratio" 213 | 214 | ]: 215 | all[c]=map(lambda x,y:x*y,all[c],all["user_ordernum"]) 216 | 217 | #一周的 218 | for c in ["orderbehavior_3_ratio_1week","orderbehavior_4_ratio_1week","orderbehavior_5_ratio_1week", 219 | #"user_roomservice_3_123ratio_1week","user_roomservice_7_1ratio_1week","user_roomservice_7_0ratio_1week","user_roomservice_4_5ratio_1week","user_roomservice_4_4ratio_1week","user_roomservice_4_2ratio_1week","user_roomservice_4_3ratio_1week","user_roomservice_4_0ratio_1week" 220 | ]: 221 | all[c] = map(lambda x,y: x * y, all[c], all["user_ordnum_1week"]) 222 | 223 | #一个月的 224 | for c in ["orderbehavior_3_ratio_1month","orderbehavior_4_ratio_1month","orderbehavior_5_ratio_1month", 225 | #"user_roomservice_3_123ratio_1month", "user_roomservice_7_1ratio_1month", "user_roomservice_7_0ratio_1month","user_roomservice_4_5ratio_1month", "user_roomservice_4_4ratio_1month", "user_roomservice_4_2ratio_1month","user_roomservice_4_3ratio_1month", "user_roomservice_4_0ratio_1month" 226 | 227 | ]: 228 | all[c] = map(lambda x,y: x * y, all[c], all["user_ordnum_1month"]) 229 | 230 | #三个月的 231 | for c in ["orderbehavior_3_ratio_3month","orderbehavior_4_ratio_3month","orderbehavior_5_ratio_3month", 232 | #"user_roomservice_3_123ratio_3month", "user_roomservice_7_1ratio_3month", "user_roomservice_7_0ratio_3month","user_roomservice_4_5ratio_3month", "user_roomservice_4_4ratio_3month", "user_roomservice_4_2ratio_3month","user_roomservice_4_3ratio_3month", "user_roomservice_4_0ratio_3month" 233 | 234 | ]: 235 | all[c] = map(lambda x,y: x * y, all[c], all["user_ordnum_3month"]) 236 | 237 | 238 | all["price_star"]=all["price_deduct"]/(all["star"]-1) 239 | all["price_minarea"]=all["price_deduct"]/(all["basic_minarea"]-1) 240 | 241 | all["star_dif"]=all["user_avgstar"]-all["star"] 242 | 243 | all["price_ave_dif_rt"]=all["price_deduct"]/all["user_avgdealprice"] 244 | all["price_ave_star_dif"]=all["price_deduct"]/all["user_avgprice_star"] 245 | all["price_h_w_rt"]=all["user_avgdealpriceholiday"]/all["user_avgdealpriceworkday"] 246 | 247 | all["price_ave_dif"] = all["price_deduct"] - all["user_avgdealprice"] 248 | 249 | all["user_roomservice_4_32_rt"]=all["user_roomservice_4_3ratio"]/all["user_roomservice_4_2ratio"] 250 | all["user_roomservice_4_43_rt"]=all["user_roomservice_4_4ratio"]/all["user_roomservice_4_3ratio"] 251 | 252 | print all.shape 253 | 254 | if j==0 or j==1: 255 | train=all 256 | else: 257 | train=train.append(all) 258 | 259 | #算法测试 260 | train_y=train["orderlabel"].values 261 | del train["orderlabel"] 262 | 263 | 264 | print train.shape 265 | #lgb算法 266 | train = lgb.Dataset(train, label=train_y) 267 | params = { 268 | 'boosting_type': 'gbdt', 269 | 'objective': 'binary', 270 | 'metric': 'binary_logloss', 271 | 'min_child_weight': 1.5, 272 | 'num_leaves': 2 ** 5, 273 | 'lambda_l2': 10, 274 | 'subsample': 0.7, 275 | 'colsample_bytree': 0.7, 276 | 'colsample_bylevel': 0.7, 277 | 'learning_rate': 0.05, 278 | 'tree_method': 'exact', 279 | 'seed': 2017, 280 | 'nthread': 12, 281 | 'silent': True, 282 | } 283 | num_round = 1300 284 | model = lgb.train(params, train, num_round, 285 | ) 286 | 287 | 288 | for j in range(10): 289 | print j 290 | all=pd.read_csv("try/online_%s.csv"%j) 291 | 292 | # test中的basic_30days_ordnumratio数据异常,需要重新设置 293 | del all["basic_30days_ordnumratio"] 294 | all = all.merge(trick_basic_30d_fill, on=["hotelid", "basicroomid", "orderdate"], how="left") 295 | # 使用穿越特征 296 | all = all.merge(trick_1, on=["hotelid", "basicroomid", "orderdate"], how="left") 297 | all = all.merge(trick_2, on=["hotelid", "basicroomid", "orderdate"], how="left") 298 | all = all.merge(trick_3, on=["hotelid", "basicroomid", "orderdate"], how="left") 299 | all["basic_trick_today"] = all["trick_1"] - all["basic_recent3_ordernum_ratio"] 300 | 301 | all = all.merge(trick_room, on=["hotelid", "roomid", "orderdate"], how="left") 302 | all["room_trick_today"] = all["trick_room"] - all["room_30days_ordnumratio"] 303 | 304 | all = all.merge(trick_basic_7d, on=["hotelid", "basicroomid", "orderdate"], how="left") 305 | all["basic_7d_trick_today"] = all["trick_basic_7d"] - all["basic_week_ordernum_ratio"] 306 | 307 | all = all.merge(trick_basic_30d, on=["hotelid", "basicroomid", "orderdate"], how="left") 308 | all["basic_30d_trick_today"] = all["trick_basic_30d"] - all["basic_30days_ordnumratio"] 309 | 310 | all = all.merge(trick_basic_realratio, on=["basicroomid", "orderdate"], how="left") 311 | all["basic_30days_realratio_today"] = all["trick_basic_30days_realratio"] - all["basic_30days_realratio"] 312 | all = all.merge(trick_room_realratio, on=["roomid", "orderdate"], how="left") 313 | all["room_30days_realratio_today"] = all["trick_room_30days_realratio"] - all["room_30days_realratio"] 314 | 315 | all["roomid_ori"] = all["roomid"] 316 | all["roomid"] = map(lambda x, y: int(str(x)[:-len(str(y))]), all["roomid"], all["rank"]) 317 | 318 | all = all.merge(basicroom_mean, on="basicroomid", how="left").fillna(0) 319 | all = all.merge(basicroom_sum, on="basicroomid", how="left").fillna(0) 320 | 321 | all=df_median(all) 322 | all=df_min(all) 323 | all=df_min_orderid(all) 324 | 325 | all["basicroomid_price_rank"] = all['price_deduct'].groupby([all['orderid'], all['basicroomid']]).rank() 326 | all["orderid_price_deduct_min_rank"] = all['orderid_price_deduct_min'].groupby(all['orderid']).rank() 327 | 328 | all = df_rank_mean(all) 329 | all = df_roomrank_mean(all) 330 | 331 | #添加新特征20170527 332 | #平均值 333 | all=merge_mean(all,["basicroomid"],"basic_week_ordernum_ratio","basic_week_ordernum_ratio_mean") 334 | all=merge_mean(all,["basicroomid"],"basic_recent3_ordernum_ratio","basic_recent3_ordernum_ratio_mean") 335 | all=merge_mean(all,["basicroomid"],"basic_comment_ratio","basic_comment_ratio_mean") 336 | all=merge_mean(all,["basicroomid"],"basic_30days_ordnumratio","basic_30days_ordnumratio_mean") 337 | all=merge_mean(all,["basicroomid"],"basic_30days_realratio","basic_30days_realratio_mean") 338 | all=merge_mean(all,["roomid"],"room_30days_ordnumratio","room_30days_ordnumratio_mean") 339 | all=merge_mean(all,["roomid"],"room_30days_realratio","room_30days_realratio_mean") 340 | 341 | all["city_num"]=all["user_ordernum"]/all["user_citynum"] 342 | all["area_price"]=all["user_avgprice"]/all["user_avgroomarea"] 343 | all["price_max_min_rt"]=all["user_maxprice"]/all["user_minprice"] 344 | all["basicroomid_price_deduct_min_minprice_rt"]=all["basicroomid_price_deduct_min"]/all["user_minprice"] 345 | 346 | all["price_dif"]=all["basicroomid_price_deduct_min"]-all["price_deduct"] 347 | all["price_dif_hotel"]=all["basicroomid_price_deduct_min"]-all["hotel_minprice_lastord"] 348 | all["price_dif_basic"]=all["basicroomid_price_deduct_min"]-all["basic_minprice_lastord"] 349 | 350 | all["price_dif_rt"]=all["basicroomid_price_deduct_min"]/all["price_deduct"] 351 | all["price_dif_hotel_rt"]=all["basicroomid_price_deduct_min"]/all["hotel_minprice_lastord"] 352 | all["price_dif_basic_rt"]=all["basicroomid_price_deduct_min"]/all["basic_minprice_lastord"] 353 | 354 | all["price_dif_hotel"]=all["orderid_price_deduct_min"]-all["price_deduct"] 355 | all["price_dif_hotel_hotel"]=all["orderid_price_deduct_min"]-all["hotel_minprice_lastord"] 356 | all["price_dif_basic_hotel"]=all["orderid_price_deduct_min"]-all["basic_minprice_lastord"] 357 | 358 | all["price_dif_hotel_rt"]=all["orderid_price_deduct_min"]/all["price_deduct"] 359 | all["price_dif_hotel_hotel_rt"]=all["orderid_price_deduct_min"]/all["hotel_minprice_lastord"] 360 | all["price_dif_basic_hotel_rt"]=all["orderid_price_deduct_min"]/all["basic_minprice_lastord"] 361 | 362 | #all["order_basic_minprice_dif"]=all["basicroomid_price_deduct_min"]-all["orderid_price_deduct_min"] 363 | all["order_basic_minprice_rt"]=all["basicroomid_price_deduct_min"]/all["orderid_price_deduct_min"] 364 | #all["hotel_basic_minprice_lastord_rt"]=all["basic_minprice_lastord"]/all["hotel_minprice_lastord"] 365 | 366 | #上次订购的价格和当时最低价的比 367 | all["hotel_last_price_min_rt"]=all["price_last_lastord"]/all["hotel_minprice_lastord"] 368 | all["basic_last_price_min_rt"]=all["price_last_lastord"]/all["basic_minprice_lastord"] 369 | all["hotel_last_price_min_dif"]=all["price_last_lastord"]-all["hotel_minprice_lastord"] 370 | all["basic_last_price_min_dif"]=all["price_last_lastord"]-all["basic_minprice_lastord"] 371 | 372 | 373 | all["price_tail1"]=all["price_deduct"]%10 374 | all["price_tail1"]=map(lambda x:1 if x==4 or x==7 else 0,all["price_tail1"]) 375 | #all["price_tail2"]=all["price_deduct"]%100 376 | all["basic_equal"]=map(lambda x,y:1 if x==y else 0,all["basicroomid"],all["basicroomid_lastord"]) 377 | #del all["basicroomid_lastord"] 378 | all["room_equal"]=map(lambda x,y:1 if x==y else 0,all["roomid"],all["roomid_lastord"]) 379 | #del all["roomid_lastord"] 380 | all["hotel_equal"]=map(lambda x,y:1 if x==y else 0,all["hotelid"],all["hotelid_lastord"]) 381 | #del all["hotelid_lastord"] 382 | all["rank_equal"]=map(lambda x,y:1 if x==y else 0,all["rank"],all["rank_lastord"]) 383 | 384 | #价格高低 385 | all["price_dx"] = map(lambda x, y: x-y, all["price_deduct"], all["price_last_lastord"]) 386 | 387 | all["return_dx"] = map(lambda x, y: x-y, all["returnvalue"], all["return_lastord"]) 388 | 389 | all["price_ori"] = map(lambda x, y:x+y, all["price_deduct"], all["returnvalue"]) 390 | 391 | 392 | for i in [2,3,4,5,6,8]: 393 | all["service_equal_%s"%i] = map(lambda x, y: 1 if x == y else 0, all["roomservice_%s"%i], all["roomservice_%s_lastord"%i]) 394 | del all["roomservice_%s_lastord"%i] 395 | 396 | for i in [2,3,4,5,6]: 397 | all["roomtag_equal_%s"%i] = map(lambda x, y: 1 if x == y else 0, all["roomtag_%s"%i], all["roomtag_%s_lastord"%i]) 398 | del all["roomtag_%s_lastord"%i] 399 | 400 | for i in [1,2,3,4,5,6,7,8,9,10,11]: 401 | all["ordertype_%s_num"%i] = map(lambda x, y:x*y, all["ordertype_%s_ratio"%i], all["user_ordernum"]) 402 | del all["ordertype_%s_ratio"%i] 403 | 404 | #所有的 405 | for c in ["orderbehavior_1_ratio","orderbehavior_2_ratio","orderbehavior_6_ratio","orderbehavior_7_ratio", 406 | #"user_roomservice_4_0ratio","user_roomservice_4_1ratio","user_roomservice_4_2ratio","user_roomservice_4_3ratio","user_roomservice_4_4ratio","user_roomservice_4_5ratio","user_roomservice_3_123ratio","user_roomservice_6_2ratio","user_roomservice_6_1ratio","user_roomservice_6_0ratio","user_roomservice_5_1ratio","user_roomservice_7_0ratio","user_roomservice_2_1ratio","user_roomservice_8_1ratio","user_roomservice_5_345ratio" 407 | 408 | ]: 409 | all[c]=map(lambda x,y:x*y,all[c],all["user_ordernum"]) 410 | 411 | #一周的 412 | for c in ["orderbehavior_3_ratio_1week","orderbehavior_4_ratio_1week","orderbehavior_5_ratio_1week", 413 | #"user_roomservice_3_123ratio_1week","user_roomservice_7_1ratio_1week","user_roomservice_7_0ratio_1week","user_roomservice_4_5ratio_1week","user_roomservice_4_4ratio_1week","user_roomservice_4_2ratio_1week","user_roomservice_4_3ratio_1week","user_roomservice_4_0ratio_1week" 414 | ]: 415 | all[c] = map(lambda x,y: x * y, all[c], all["user_ordnum_1week"]) 416 | 417 | #一个月的 418 | for c in ["orderbehavior_3_ratio_1month","orderbehavior_4_ratio_1month","orderbehavior_5_ratio_1month", 419 | #"user_roomservice_3_123ratio_1month", "user_roomservice_7_1ratio_1month", "user_roomservice_7_0ratio_1month","user_roomservice_4_5ratio_1month", "user_roomservice_4_4ratio_1month", "user_roomservice_4_2ratio_1month","user_roomservice_4_3ratio_1month", "user_roomservice_4_0ratio_1month" 420 | 421 | ]: 422 | all[c] = map(lambda x,y: x * y, all[c], all["user_ordnum_1month"]) 423 | 424 | #三个月的 425 | for c in ["orderbehavior_3_ratio_3month","orderbehavior_4_ratio_3month","orderbehavior_5_ratio_3month", 426 | #"user_roomservice_3_123ratio_3month", "user_roomservice_7_1ratio_3month", "user_roomservice_7_0ratio_3month","user_roomservice_4_5ratio_3month", "user_roomservice_4_4ratio_3month", "user_roomservice_4_2ratio_3month","user_roomservice_4_3ratio_3month", "user_roomservice_4_0ratio_3month" 427 | 428 | ]: 429 | all[c] = map(lambda x,y: x * y, all[c], all["user_ordnum_3month"]) 430 | 431 | 432 | all["price_star"]=all["price_deduct"]/(all["star"]-1) 433 | all["price_minarea"]=all["price_deduct"]/(all["basic_minarea"]-1) 434 | 435 | all["star_dif"]=all["user_avgstar"]-all["star"] 436 | 437 | all["price_ave_dif_rt"]=all["price_deduct"]/all["user_avgdealprice"] 438 | all["price_ave_star_dif"]=all["price_deduct"]/all["user_avgprice_star"] 439 | all["price_h_w_rt"]=all["user_avgdealpriceholiday"]/all["user_avgdealpriceworkday"] 440 | 441 | all["price_ave_dif"] = all["price_deduct"] - all["user_avgdealprice"] 442 | 443 | all["user_roomservice_4_32_rt"]=all["user_roomservice_4_3ratio"]/all["user_roomservice_4_2ratio"] 444 | all["user_roomservice_4_43_rt"]=all["user_roomservice_4_4ratio"]/all["user_roomservice_4_3ratio"] 445 | 446 | print all.shape 447 | 448 | online = model.predict(all.values) 449 | online = pd.DataFrame(online) 450 | online.columns = ["prob"] 451 | online["orderid"] = all["orderid"].values 452 | online["basicroomid"] = all["basicroomid"].values 453 | online["predict_roomid"] = all["roomid_ori"].values 454 | online["roomid"] = all["roomid"].values 455 | 456 | if j==0: 457 | result=online 458 | else: 459 | result=result.append(online) 460 | 461 | result.to_csv("all_result_v29_24680_test_feature.csv",index=None) 462 | del result["basicroomid"] 463 | del result["roomid"] 464 | result = result.sort_values("prob") 465 | del result["prob"] 466 | result = result.drop_duplicates("orderid", keep="last") 467 | result["orderid"]=result["orderid"].apply(lambda x:"ORDER_"+str(x)) 468 | result["predict_roomid"]=result["predict_roomid"].apply(lambda x:"ROOM_"+str(x)) 469 | result.to_csv("sub_v29_24680.csv",index=None) 470 | 471 | 472 | for j in [1,3,5,7,9]: 473 | print j 474 | all=pd.read_csv("try/offline_%s.csv"%j) 475 | del all["orderlabel"] 476 | 477 | # test中的basic_30days_ordnumratio数据异常,需要重新设置 478 | del all["basic_30days_ordnumratio"] 479 | all = all.merge(trick_basic_30d_fill, on=["hotelid", "basicroomid", "orderdate"], how="left") 480 | # 使用穿越特征 481 | all = all.merge(trick_1, on=["hotelid", "basicroomid", "orderdate"], how="left") 482 | all = all.merge(trick_2, on=["hotelid", "basicroomid", "orderdate"], how="left") 483 | all = all.merge(trick_3, on=["hotelid", "basicroomid", "orderdate"], how="left") 484 | all["basic_trick_today"] = all["trick_1"] - all["basic_recent3_ordernum_ratio"] 485 | 486 | all = all.merge(trick_room, on=["hotelid", "roomid", "orderdate"], how="left") 487 | all["room_trick_today"] = all["trick_room"] - all["room_30days_ordnumratio"] 488 | 489 | all = all.merge(trick_basic_7d, on=["hotelid", "basicroomid", "orderdate"], how="left") 490 | all["basic_7d_trick_today"] = all["trick_basic_7d"] - all["basic_week_ordernum_ratio"] 491 | 492 | all = all.merge(trick_basic_30d, on=["hotelid", "basicroomid", "orderdate"], how="left") 493 | all["basic_30d_trick_today"] = all["trick_basic_30d"] - all["basic_30days_ordnumratio"] 494 | 495 | all = all.merge(trick_basic_realratio, on=["basicroomid", "orderdate"], how="left") 496 | all["basic_30days_realratio_today"] = all["trick_basic_30days_realratio"] - all["basic_30days_realratio"] 497 | all = all.merge(trick_room_realratio, on=["roomid", "orderdate"], how="left") 498 | all["room_30days_realratio_today"] = all["trick_room_30days_realratio"] - all["room_30days_realratio"] 499 | 500 | 501 | all["roomid_ori"] = all["roomid"] 502 | all["roomid"] = map(lambda x, y: int(str(x)[:-len(str(y))]), all["roomid"], all["rank"]) 503 | 504 | all = all.merge(basicroom_mean, on="basicroomid", how="left").fillna(0) 505 | all = all.merge(basicroom_sum, on="basicroomid", how="left").fillna(0) 506 | 507 | all=df_median(all) 508 | all=df_min(all) 509 | all=df_min_orderid(all) 510 | 511 | all["basicroomid_price_rank"] = all['price_deduct'].groupby([all['orderid'], all['basicroomid']]).rank() 512 | all["orderid_price_deduct_min_rank"] = all['orderid_price_deduct_min'].groupby(all['orderid']).rank() 513 | 514 | all = df_rank_mean(all) 515 | all = df_roomrank_mean(all) 516 | 517 | #添加新特征20170527 518 | #平均值 519 | all=merge_mean(all,["basicroomid"],"basic_week_ordernum_ratio","basic_week_ordernum_ratio_mean") 520 | all=merge_mean(all,["basicroomid"],"basic_recent3_ordernum_ratio","basic_recent3_ordernum_ratio_mean") 521 | all=merge_mean(all,["basicroomid"],"basic_comment_ratio","basic_comment_ratio_mean") 522 | all=merge_mean(all,["basicroomid"],"basic_30days_ordnumratio","basic_30days_ordnumratio_mean") 523 | all=merge_mean(all,["basicroomid"],"basic_30days_realratio","basic_30days_realratio_mean") 524 | all=merge_mean(all,["roomid"],"room_30days_ordnumratio","room_30days_ordnumratio_mean") 525 | all=merge_mean(all,["roomid"],"room_30days_realratio","room_30days_realratio_mean") 526 | 527 | all["city_num"]=all["user_ordernum"]/all["user_citynum"] 528 | all["area_price"]=all["user_avgprice"]/all["user_avgroomarea"] 529 | all["price_max_min_rt"]=all["user_maxprice"]/all["user_minprice"] 530 | all["basicroomid_price_deduct_min_minprice_rt"]=all["basicroomid_price_deduct_min"]/all["user_minprice"] 531 | 532 | all["price_dif"]=all["basicroomid_price_deduct_min"]-all["price_deduct"] 533 | all["price_dif_hotel"]=all["basicroomid_price_deduct_min"]-all["hotel_minprice_lastord"] 534 | all["price_dif_basic"]=all["basicroomid_price_deduct_min"]-all["basic_minprice_lastord"] 535 | 536 | all["price_dif_rt"]=all["basicroomid_price_deduct_min"]/all["price_deduct"] 537 | all["price_dif_hotel_rt"]=all["basicroomid_price_deduct_min"]/all["hotel_minprice_lastord"] 538 | all["price_dif_basic_rt"]=all["basicroomid_price_deduct_min"]/all["basic_minprice_lastord"] 539 | 540 | all["price_dif_hotel"]=all["orderid_price_deduct_min"]-all["price_deduct"] 541 | all["price_dif_hotel_hotel"]=all["orderid_price_deduct_min"]-all["hotel_minprice_lastord"] 542 | all["price_dif_basic_hotel"]=all["orderid_price_deduct_min"]-all["basic_minprice_lastord"] 543 | 544 | all["price_dif_hotel_rt"]=all["orderid_price_deduct_min"]/all["price_deduct"] 545 | all["price_dif_hotel_hotel_rt"]=all["orderid_price_deduct_min"]/all["hotel_minprice_lastord"] 546 | all["price_dif_basic_hotel_rt"]=all["orderid_price_deduct_min"]/all["basic_minprice_lastord"] 547 | 548 | #all["order_basic_minprice_dif"]=all["basicroomid_price_deduct_min"]-all["orderid_price_deduct_min"] 549 | all["order_basic_minprice_rt"]=all["basicroomid_price_deduct_min"]/all["orderid_price_deduct_min"] 550 | #all["hotel_basic_minprice_lastord_rt"]=all["basic_minprice_lastord"]/all["hotel_minprice_lastord"] 551 | 552 | #上次订购的价格和当时最低价的比 553 | all["hotel_last_price_min_rt"]=all["price_last_lastord"]/all["hotel_minprice_lastord"] 554 | all["basic_last_price_min_rt"]=all["price_last_lastord"]/all["basic_minprice_lastord"] 555 | all["hotel_last_price_min_dif"]=all["price_last_lastord"]-all["hotel_minprice_lastord"] 556 | all["basic_last_price_min_dif"]=all["price_last_lastord"]-all["basic_minprice_lastord"] 557 | 558 | 559 | all["price_tail1"]=all["price_deduct"]%10 560 | all["price_tail1"]=map(lambda x:1 if x==4 or x==7 else 0,all["price_tail1"]) 561 | #all["price_tail2"]=all["price_deduct"]%100 562 | all["basic_equal"]=map(lambda x,y:1 if x==y else 0,all["basicroomid"],all["basicroomid_lastord"]) 563 | #del all["basicroomid_lastord"] 564 | all["room_equal"]=map(lambda x,y:1 if x==y else 0,all["roomid"],all["roomid_lastord"]) 565 | #del all["roomid_lastord"] 566 | all["hotel_equal"]=map(lambda x,y:1 if x==y else 0,all["hotelid"],all["hotelid_lastord"]) 567 | #del all["hotelid_lastord"] 568 | all["rank_equal"]=map(lambda x,y:1 if x==y else 0,all["rank"],all["rank_lastord"]) 569 | 570 | #价格高低 571 | all["price_dx"] = map(lambda x, y: x-y, all["price_deduct"], all["price_last_lastord"]) 572 | 573 | all["return_dx"] = map(lambda x, y: x-y, all["returnvalue"], all["return_lastord"]) 574 | 575 | all["price_ori"] = map(lambda x, y:x+y, all["price_deduct"], all["returnvalue"]) 576 | 577 | 578 | for i in [2,3,4,5,6,8]: 579 | all["service_equal_%s"%i] = map(lambda x, y: 1 if x == y else 0, all["roomservice_%s"%i], all["roomservice_%s_lastord"%i]) 580 | del all["roomservice_%s_lastord"%i] 581 | 582 | for i in [2,3,4,5,6]: 583 | all["roomtag_equal_%s"%i] = map(lambda x, y: 1 if x == y else 0, all["roomtag_%s"%i], all["roomtag_%s_lastord"%i]) 584 | del all["roomtag_%s_lastord"%i] 585 | 586 | for i in [1,2,3,4,5,6,7,8,9,10,11]: 587 | all["ordertype_%s_num"%i] = map(lambda x, y:x*y, all["ordertype_%s_ratio"%i], all["user_ordernum"]) 588 | del all["ordertype_%s_ratio"%i] 589 | 590 | #所有的 591 | for c in ["orderbehavior_1_ratio","orderbehavior_2_ratio","orderbehavior_6_ratio","orderbehavior_7_ratio", 592 | #"user_roomservice_4_0ratio","user_roomservice_4_1ratio","user_roomservice_4_2ratio","user_roomservice_4_3ratio","user_roomservice_4_4ratio","user_roomservice_4_5ratio","user_roomservice_3_123ratio","user_roomservice_6_2ratio","user_roomservice_6_1ratio","user_roomservice_6_0ratio","user_roomservice_5_1ratio","user_roomservice_7_0ratio","user_roomservice_2_1ratio","user_roomservice_8_1ratio","user_roomservice_5_345ratio" 593 | 594 | ]: 595 | all[c]=map(lambda x,y:x*y,all[c],all["user_ordernum"]) 596 | 597 | #一周的 598 | for c in ["orderbehavior_3_ratio_1week","orderbehavior_4_ratio_1week","orderbehavior_5_ratio_1week", 599 | #"user_roomservice_3_123ratio_1week","user_roomservice_7_1ratio_1week","user_roomservice_7_0ratio_1week","user_roomservice_4_5ratio_1week","user_roomservice_4_4ratio_1week","user_roomservice_4_2ratio_1week","user_roomservice_4_3ratio_1week","user_roomservice_4_0ratio_1week" 600 | ]: 601 | all[c] = map(lambda x,y: x * y, all[c], all["user_ordnum_1week"]) 602 | 603 | #一个月的 604 | for c in ["orderbehavior_3_ratio_1month","orderbehavior_4_ratio_1month","orderbehavior_5_ratio_1month", 605 | #"user_roomservice_3_123ratio_1month", "user_roomservice_7_1ratio_1month", "user_roomservice_7_0ratio_1month","user_roomservice_4_5ratio_1month", "user_roomservice_4_4ratio_1month", "user_roomservice_4_2ratio_1month","user_roomservice_4_3ratio_1month", "user_roomservice_4_0ratio_1month" 606 | 607 | ]: 608 | all[c] = map(lambda x,y: x * y, all[c], all["user_ordnum_1month"]) 609 | 610 | #三个月的 611 | for c in ["orderbehavior_3_ratio_3month","orderbehavior_4_ratio_3month","orderbehavior_5_ratio_3month", 612 | #"user_roomservice_3_123ratio_3month", "user_roomservice_7_1ratio_3month", "user_roomservice_7_0ratio_3month","user_roomservice_4_5ratio_3month", "user_roomservice_4_4ratio_3month", "user_roomservice_4_2ratio_3month","user_roomservice_4_3ratio_3month", "user_roomservice_4_0ratio_3month" 613 | 614 | ]: 615 | all[c] = map(lambda x,y: x * y, all[c], all["user_ordnum_3month"]) 616 | 617 | 618 | all["price_star"]=all["price_deduct"]/(all["star"]-1) 619 | all["price_minarea"]=all["price_deduct"]/(all["basic_minarea"]-1) 620 | 621 | all["star_dif"]=all["user_avgstar"]-all["star"] 622 | 623 | all["price_ave_dif_rt"]=all["price_deduct"]/all["user_avgdealprice"] 624 | all["price_ave_star_dif"]=all["price_deduct"]/all["user_avgprice_star"] 625 | all["price_h_w_rt"]=all["user_avgdealpriceholiday"]/all["user_avgdealpriceworkday"] 626 | 627 | all["price_ave_dif"] = all["price_deduct"] - all["user_avgdealprice"] 628 | 629 | all["user_roomservice_4_32_rt"]=all["user_roomservice_4_3ratio"]/all["user_roomservice_4_2ratio"] 630 | all["user_roomservice_4_43_rt"]=all["user_roomservice_4_4ratio"]/all["user_roomservice_4_3ratio"] 631 | 632 | print all.shape 633 | 634 | online = model.predict(all.values) 635 | online = pd.DataFrame(online) 636 | online.columns = ["prob"] 637 | online["orderid"] = all["orderid"].values 638 | online["basicroomid"] = all["basicroomid"].values 639 | online["predict_roomid"] = all["roomid_ori"].values 640 | online["roomid"] = all["roomid"].values 641 | 642 | if j==0 or j==1: 643 | result=online 644 | else: 645 | result=result.append(online) 646 | 647 | result.to_csv("all_result_v29_24680_train_feature.csv",index=None) 648 | 649 | -------------------------------------------------------------------------------- /ctrip_test_all_data_v31_lgb_feature.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | import pandas as pd 3 | import lightgbm as lgb 4 | import numpy as np 5 | from com_uitl import * 6 | 7 | 8 | # 每个basicid价格的中位数 9 | def df_median(df): 10 | add = pd.DataFrame(df.groupby(["orderid", "basicroomid"]).price_deduct.median()).reset_index() 11 | add.columns = ["orderid", "basicroomid", "basicroomid_price_deduct_median"] 12 | df = df.merge(add, on=["orderid", "basicroomid"], how="left") 13 | return df 14 | 15 | # 每个basicid价格的最小值 16 | def df_min(df): 17 | add = pd.DataFrame(df.groupby(["orderid", "basicroomid"]).price_deduct.min()).reset_index() 18 | add.columns = ["orderid", "basicroomid", "basicroomid_price_deduct_min"] 19 | df = df.merge(add, on=["orderid", "basicroomid"], how="left") 20 | return df 21 | 22 | # 每个orderid价格的最小值 23 | def df_min_orderid(df): 24 | add = pd.DataFrame(df.groupby(["orderid"]).price_deduct.min()).reset_index() 25 | add.columns = ["orderid", "orderid_price_deduct_min"] 26 | df = df.merge(add, on=["orderid"], how="left") 27 | return df 28 | 29 | #排序特征 30 | def df_rank_mean(df): 31 | add = pd.DataFrame(df.groupby(["basicroomid"]).orderid_price_deduct_min_rank.mean()).reset_index() 32 | add.columns = ["basicroomid","orderid_price_deduct_min_rank_mean"] 33 | df = df.merge(add, on=["basicroomid"], how="left") 34 | return df 35 | 36 | def df_roomrank_mean(df): 37 | add = pd.DataFrame(df.groupby(["roomid"]).basicroomid_price_rank.mean()).reset_index() 38 | add.columns = ["roomid","basicroomid_price_rank_mean"] 39 | df = df.merge(add, on=["roomid"], how="left") 40 | return df 41 | 42 | 43 | #添加转化率特征 44 | leak=pd.read_csv("try/leak.csv") 45 | #提取basicroomid的转化率 46 | feature_df=leak[["orderid","basicroomid","orderlabel"]].copy() 47 | feature_df.sort_values("orderlabel") 48 | feature_df=feature_df.drop_duplicates(["orderid","basicroomid"],keep="last") 49 | basicroom_mean=pd.DataFrame(feature_df.groupby("basicroomid").orderlabel.mean()).reset_index() 50 | basicroom_mean.columns=["basicroomid","basicroomid_mean"] 51 | 52 | basicroom_sum=pd.DataFrame(feature_df.groupby("basicroomid").orderlabel.sum()).reset_index() 53 | basicroom_sum.columns=["basicroomid","basicroomid_sum"] 54 | 55 | del leak 56 | 57 | #basic的trick 58 | trick_1=pd.read_csv("feature_3days_train.csv").append(pd.read_csv("feature_3days_test.csv")) 59 | trick_2=trick_1.copy() 60 | trick_3=trick_1.copy() 61 | trick_1.columns=["hotelid","basicroomid","orderdate","trick_1"] 62 | trick_2.columns=["hotelid","basicroomid","orderdate","trick_2"] 63 | trick_3.columns=["hotelid","basicroomid","orderdate","trick_3"] 64 | trick_1["orderdate"]=trick_1["orderdate"]-1 65 | trick_2["orderdate"]=trick_2["orderdate"]-2 66 | trick_3["orderdate"]=trick_3["orderdate"]-3 67 | #room的trick 68 | trick_room=pd.read_csv("feature_room_30days_ordnumratio_train.csv").append(pd.read_csv("feature_room_30days_ordnumratio_test.csv")) 69 | trick_room.columns=["hotelid","roomid","orderdate","trick_room"] 70 | trick_room["orderdate"]=trick_room["orderdate"]-1 71 | #basic七天的的trick 72 | trick_basic_7d=pd.read_csv("feature_basic_7days_train.csv").append(pd.read_csv("feature_basic_7days_test.csv")) 73 | trick_basic_7d.columns=["hotelid","basicroomid","orderdate","trick_basic_7d"] 74 | trick_basic_7d["orderdate"]=trick_basic_7d["orderdate"]-1 75 | #basic30天的trick 76 | trick_basic_30d=pd.read_csv("feature_basic_30days_train.csv").append(pd.read_csv("feature_basic_30days_test.csv")) 77 | trick_basic_30d_fill=trick_basic_30d.copy() 78 | trick_basic_30d.columns=["hotelid","basicroomid","orderdate","trick_basic_30d"] 79 | trick_basic_30d["orderdate"]=trick_basic_30d["orderdate"]-1 80 | #trick_basic_realratio 81 | trick_basic_realratio=pd.read_csv("feature_basic_30days_realratio_train.csv").append(pd.read_csv("feature_basic_30days_realratio_test.csv")) 82 | trick_basic_realratio.columns=["basicroomid","orderdate","trick_basic_30days_realratio"] 83 | trick_basic_realratio["orderdate"]=trick_basic_realratio["orderdate"]-1 84 | #trick_room_realratio 85 | trick_room_realratio=pd.read_csv("feature_room_30days_realratio_train.csv").append(pd.read_csv("feature_room_30days_realratio_test.csv")) 86 | trick_room_realratio.columns=["roomid","orderdate","trick_room_30days_realratio"] 87 | trick_room_realratio["orderdate"]=trick_room_realratio["orderdate"]-1 88 | 89 | 90 | #构造特征 91 | #for j in range(10): 92 | for j in [0,2,4,6,8]: 93 | #for j in [1,3,5,7,9]: 94 | print j 95 | all=pd.read_csv("try/offline_%s.csv"%j) 96 | 97 | #20170620添加特征 98 | for i in ["basic_week_ordernum_ratio", "basic_recent3_ordernum_ratio", "basic_comment_ratio", 99 | "basic_30days_ordnumratio", "basic_30days_realratio"]: 100 | all = merge_max(all, ["orderid"], i, "%s_max" % i) 101 | for i in ["room_30days_ordnumratio", "room_30days_realratio"]: 102 | all = merge_max(all, ["orderid", "basicroomid"], i, "%s_max" % i) 103 | all["user_roomservice_8_345ratio"]=all["user_roomservice_5_345ratio"] 104 | del all["user_roomservice_5_345ratio"] 105 | all["user_roomservice_8_2ratio"]=1-all["user_roomservice_8_345ratio"]-all["user_roomservice_8_1ratio"] 106 | all["user_roomservice_4_1ratio_3month"] = 1 - all["user_roomservice_4_0ratio_3month"] - all["user_roomservice_4_2ratio_3month"] - all["user_roomservice_4_3ratio_3month"] - all["user_roomservice_4_4ratio_3month"] - all["user_roomservice_4_5ratio_3month"] 107 | all["user_roomservice_4_1ratio_1month"] = 1 - all["user_roomservice_4_0ratio_1month"] - all["user_roomservice_4_2ratio_1month"] - all["user_roomservice_4_3ratio_1month"] - all["user_roomservice_4_4ratio_1month"] - all["user_roomservice_4_5ratio_1month"] 108 | all["user_roomservice_4_1ratio_1week"] = 1 - all["user_roomservice_4_0ratio_1week"] - all["user_roomservice_4_2ratio_1week"] - all["user_roomservice_4_3ratio_1week"] - all["user_roomservice_4_4ratio_1week"] - all["user_roomservice_4_5ratio_1week"] 109 | all["user_roomservice_2_0ratio"]=1-all["user_roomservice_2_1ratio"] 110 | all["user_roomservice_3_0ratio"]=1-all["user_roomservice_3_123ratio"] 111 | all["user_roomservice_5_0ratio"]=1-all["user_roomservice_5_1ratio"] 112 | all["user_roomservice_7_1ratio"]=1-all["user_roomservice_7_0ratio"] 113 | all["user_roomservice_2_max"] = np.argmax(all[["user_roomservice_2_%sratio" % i for i in range(2)]].values, axis=1) 114 | all["user_roomservice_3_max"] = np.argmax(all[["user_roomservice_3_%sratio" % i for i in [0,123]]].values, axis=1) 115 | all["user_roomservice_5_max"] = np.argmax(all[["user_roomservice_5_%sratio" % i for i in range(2)]].values, axis=1) 116 | all["user_roomservice_7_max"] = np.argmax(all[["user_roomservice_7_%sratio" % i for i in range(2)]].values, axis=1) 117 | all["user_roomservice_4_max"]=np.argmax(all[["user_roomservice_4_%sratio"%i for i in range(6)]].values,axis=1) 118 | all["user_roomservice_6_max"]=np.argmax(all[["user_roomservice_6_%sratio"%i for i in range(3)]].values,axis=1) 119 | all["user_roomservice_8_max"]=np.argmax(all[["user_roomservice_8_%sratio"%i for i in [1,2,345]]].values,axis=1) 120 | all["user_roomservice_4_max_1week"]=np.argmax(all[["user_roomservice_4_%sratio_1month"%i for i in range(6)]].values,axis=1) 121 | all["user_roomservice_4_max_1month"]=np.argmax(all[["user_roomservice_4_%sratio_1month"%i for i in range(6)]].values,axis=1) 122 | all["user_roomservice_4_max_3month"]=np.argmax(all[["user_roomservice_4_%sratio_3month"%i for i in range(6)]].values,axis=1) 123 | all["roomservice_8"]=all["roomservice_8"].apply(lambda x:2 if x>2 else x-1) 124 | all["roomservice_3"]=all["roomservice_3"].apply(lambda x:1 if x>0 else 0) 125 | for i in range(2,9): 126 | all["service_equal_%s"%i] = map(lambda x, y: 1 if x == y else 0, all["roomservice_%s"%i], all["user_roomservice_%s_max"%i]) 127 | del all["user_roomservice_2_0ratio"] 128 | del all["user_roomservice_3_0ratio"] 129 | del all["user_roomservice_5_0ratio"] 130 | del all["user_roomservice_7_1ratio"] 131 | 132 | all["roomid_ori"] = all["roomid"] 133 | all["roomid"] = map(lambda x, y: int(str(x)[:-len(str(y))]), all["roomid"], all["rank"]) 134 | 135 | all = all.merge(basicroom_mean, on="basicroomid", how="left").fillna(0) 136 | all = all.merge(basicroom_sum, on="basicroomid", how="left").fillna(0) 137 | 138 | all=df_median(all) 139 | all=df_min(all) 140 | all=df_min_orderid(all) 141 | 142 | all["basicroomid_price_rank"] = all['price_deduct'].groupby([all['orderid'], all['basicroomid']]).rank() 143 | all["orderid_price_deduct_min_rank"] = all['orderid_price_deduct_min'].groupby(all['orderid']).rank() 144 | 145 | all = df_rank_mean(all) 146 | all = df_roomrank_mean(all) 147 | 148 | 149 | #添加新特征20170527 150 | #平均值 151 | all=merge_mean(all,["basicroomid"],"basic_week_ordernum_ratio","basic_week_ordernum_ratio_mean") 152 | all=merge_mean(all,["basicroomid"],"basic_recent3_ordernum_ratio","basic_recent3_ordernum_ratio_mean") 153 | all=merge_mean(all,["basicroomid"],"basic_comment_ratio","basic_comment_ratio_mean") 154 | all=merge_mean(all,["basicroomid"],"basic_30days_ordnumratio","basic_30days_ordnumratio_mean") 155 | all=merge_mean(all,["basicroomid"],"basic_30days_realratio","basic_30days_realratio_mean") 156 | all=merge_mean(all,["roomid"],"room_30days_ordnumratio","room_30days_ordnumratio_mean") 157 | all=merge_mean(all,["roomid"],"room_30days_realratio","room_30days_realratio_mean") 158 | 159 | 160 | all["city_num"]=all["user_ordernum"]/all["user_citynum"] 161 | all["area_price"]=all["user_avgprice"]/all["user_avgroomarea"] 162 | all["price_max_min_rt"]=all["user_maxprice"]/all["user_minprice"] 163 | all["basicroomid_price_deduct_min_minprice_rt"]=all["basicroomid_price_deduct_min"]/all["user_minprice"] 164 | 165 | all["price_dif"]=all["basicroomid_price_deduct_min"]-all["price_deduct"] 166 | all["price_dif_hotel"]=all["basicroomid_price_deduct_min"]-all["hotel_minprice_lastord"] 167 | all["price_dif_basic"]=all["basicroomid_price_deduct_min"]-all["basic_minprice_lastord"] 168 | 169 | all["price_dif_rt"]=all["basicroomid_price_deduct_min"]/all["price_deduct"] 170 | all["price_dif_hotel_rt"]=all["basicroomid_price_deduct_min"]/all["hotel_minprice_lastord"] 171 | all["price_dif_basic_rt"]=all["basicroomid_price_deduct_min"]/all["basic_minprice_lastord"] 172 | 173 | all["price_dif_hotel"]=all["orderid_price_deduct_min"]-all["price_deduct"] 174 | all["price_dif_hotel_hotel"]=all["orderid_price_deduct_min"]-all["hotel_minprice_lastord"] 175 | all["price_dif_basic_hotel"]=all["orderid_price_deduct_min"]-all["basic_minprice_lastord"] 176 | 177 | all["price_dif_hotel_rt"]=all["orderid_price_deduct_min"]/all["price_deduct"] 178 | all["price_dif_hotel_hotel_rt"]=all["orderid_price_deduct_min"]/all["hotel_minprice_lastord"] 179 | all["price_dif_basic_hotel_rt"]=all["orderid_price_deduct_min"]/all["basic_minprice_lastord"] 180 | 181 | #all["order_basic_minprice_dif"]=all["basicroomid_price_deduct_min"]-all["orderid_price_deduct_min"] 182 | all["order_basic_minprice_rt"]=all["basicroomid_price_deduct_min"]/all["orderid_price_deduct_min"] 183 | #all["hotel_basic_minprice_lastord_rt"]=all["basic_minprice_lastord"]/all["hotel_minprice_lastord"] 184 | 185 | #上次订购的价格和当时最低价的比 186 | all["hotel_last_price_min_rt"]=all["price_last_lastord"]/all["hotel_minprice_lastord"] 187 | all["basic_last_price_min_rt"]=all["price_last_lastord"]/all["basic_minprice_lastord"] 188 | all["hotel_last_price_min_dif"]=all["price_last_lastord"]-all["hotel_minprice_lastord"] 189 | all["basic_last_price_min_dif"]=all["price_last_lastord"]-all["basic_minprice_lastord"] 190 | 191 | 192 | all["price_tail1"]=all["price_deduct"]%10 193 | all["price_tail1"]=map(lambda x:1 if x==4 or x==7 else 0,all["price_tail1"]) 194 | #all["price_tail2"]=all["price_deduct"]%100 195 | all["basic_equal"]=map(lambda x,y:1 if x==y else 0,all["basicroomid"],all["basicroomid_lastord"]) 196 | #del all["basicroomid_lastord"] 197 | all["room_equal"]=map(lambda x,y:1 if x==y else 0,all["roomid"],all["roomid_lastord"]) 198 | #del all["roomid_lastord"] 199 | all["hotel_equal"]=map(lambda x,y:1 if x==y else 0,all["hotelid"],all["hotelid_lastord"]) 200 | #del all["hotelid_lastord"] 201 | all["rank_equal"]=map(lambda x,y:1 if x==y else 0,all["rank"],all["rank_lastord"]) 202 | 203 | #价格高低 204 | all["price_dx"] = map(lambda x, y: x-y, all["price_deduct"], all["price_last_lastord"]) 205 | 206 | all["return_dx"] = map(lambda x, y: x-y, all["returnvalue"], all["return_lastord"]) 207 | 208 | all["price_ori"] = map(lambda x, y:x+y, all["price_deduct"], all["returnvalue"]) 209 | 210 | 211 | for i in [2,3,4,5,6,8]: 212 | all["service_equal_%s"%i] = map(lambda x, y: 1 if x == y else 0, all["roomservice_%s"%i], all["roomservice_%s_lastord"%i]) 213 | del all["roomservice_%s_lastord"%i] 214 | 215 | for i in [2,3,4,5,6]: 216 | all["roomtag_equal_%s"%i] = map(lambda x, y: 1 if x == y else 0, all["roomtag_%s"%i], all["roomtag_%s_lastord"%i]) 217 | del all["roomtag_%s_lastord"%i] 218 | 219 | for i in [1,2,3,4,5,6,7,8,9,10,11]: 220 | all["ordertype_%s_num"%i] = map(lambda x, y:x*y, all["ordertype_%s_ratio"%i], all["user_ordernum"]) 221 | del all["ordertype_%s_ratio"%i] 222 | 223 | #所有的 224 | for c in ["orderbehavior_1_ratio","orderbehavior_2_ratio","orderbehavior_6_ratio","orderbehavior_7_ratio", 225 | #"user_roomservice_4_0ratio","user_roomservice_4_1ratio","user_roomservice_4_2ratio","user_roomservice_4_3ratio","user_roomservice_4_4ratio","user_roomservice_4_5ratio","user_roomservice_3_123ratio","user_roomservice_6_2ratio","user_roomservice_6_1ratio","user_roomservice_6_0ratio","user_roomservice_5_1ratio","user_roomservice_7_0ratio","user_roomservice_2_1ratio","user_roomservice_8_1ratio","user_roomservice_5_345ratio" 226 | 227 | ]: 228 | all[c]=map(lambda x,y:x*y,all[c],all["user_ordernum"]) 229 | 230 | #一周的 231 | for c in ["orderbehavior_3_ratio_1week","orderbehavior_4_ratio_1week","orderbehavior_5_ratio_1week", 232 | #"user_roomservice_3_123ratio_1week","user_roomservice_7_1ratio_1week","user_roomservice_7_0ratio_1week","user_roomservice_4_5ratio_1week","user_roomservice_4_4ratio_1week","user_roomservice_4_2ratio_1week","user_roomservice_4_3ratio_1week","user_roomservice_4_0ratio_1week" 233 | ]: 234 | all[c] = map(lambda x,y: x * y, all[c], all["user_ordnum_1week"]) 235 | 236 | #一个月的 237 | for c in ["orderbehavior_3_ratio_1month","orderbehavior_4_ratio_1month","orderbehavior_5_ratio_1month", 238 | #"user_roomservice_3_123ratio_1month", "user_roomservice_7_1ratio_1month", "user_roomservice_7_0ratio_1month","user_roomservice_4_5ratio_1month", "user_roomservice_4_4ratio_1month", "user_roomservice_4_2ratio_1month","user_roomservice_4_3ratio_1month", "user_roomservice_4_0ratio_1month" 239 | 240 | ]: 241 | all[c] = map(lambda x,y: x * y, all[c], all["user_ordnum_1month"]) 242 | 243 | #三个月的 244 | for c in ["orderbehavior_3_ratio_3month","orderbehavior_4_ratio_3month","orderbehavior_5_ratio_3month", 245 | #"user_roomservice_3_123ratio_3month", "user_roomservice_7_1ratio_3month", "user_roomservice_7_0ratio_3month","user_roomservice_4_5ratio_3month", "user_roomservice_4_4ratio_3month", "user_roomservice_4_2ratio_3month","user_roomservice_4_3ratio_3month", "user_roomservice_4_0ratio_3month" 246 | 247 | ]: 248 | all[c] = map(lambda x,y: x * y, all[c], all["user_ordnum_3month"]) 249 | 250 | 251 | all["price_star"]=all["price_deduct"]/(all["star"]-1) 252 | all["price_minarea"]=all["price_deduct"]/(all["basic_minarea"]-1) 253 | 254 | all["star_dif"]=all["user_avgstar"]-all["star"] 255 | 256 | all["price_ave_dif_rt"]=all["price_deduct"]/all["user_avgdealprice"] 257 | all["price_ave_star_dif"]=all["price_deduct"]/all["user_avgprice_star"] 258 | all["price_h_w_rt"]=all["user_avgdealpriceholiday"]/all["user_avgdealpriceworkday"] 259 | 260 | all["price_ave_dif"] = all["price_deduct"] - all["user_avgdealprice"] 261 | 262 | all["user_roomservice_4_32_rt"]=all["user_roomservice_4_3ratio"]/all["user_roomservice_4_2ratio"] 263 | all["user_roomservice_4_43_rt"]=all["user_roomservice_4_4ratio"]/all["user_roomservice_4_3ratio"] 264 | 265 | print all.shape 266 | 267 | if j==0 or j==1: 268 | train=all 269 | else: 270 | train=train.append(all) 271 | 272 | #算法测试 273 | train_y=train["orderlabel"].values 274 | del train["orderlabel"] 275 | 276 | 277 | print train.shape 278 | #lgb算法 279 | train = lgb.Dataset(train, label=train_y) 280 | params = { 281 | 'boosting_type': 'gbdt', 282 | 'objective': 'binary', 283 | 'metric': 'binary_logloss', 284 | 'min_child_weight': 1.5, 285 | 'num_leaves': 2 ** 5, 286 | 'lambda_l2': 10, 287 | 'subsample': 0.7, 288 | 'colsample_bytree': 0.7, 289 | 'colsample_bylevel': 0.7, 290 | 'learning_rate': 0.05, 291 | 'tree_method': 'exact', 292 | 'seed': 2017, 293 | 'nthread': 12, 294 | 'silent': True, 295 | } 296 | num_round = 1300 297 | model = lgb.train(params, train, num_round, 298 | ) 299 | 300 | 301 | for j in range(10): 302 | print j 303 | all=pd.read_csv("try/online_%s.csv"%j) 304 | 305 | #20170620添加特征 306 | for i in ["basic_week_ordernum_ratio", "basic_recent3_ordernum_ratio", "basic_comment_ratio", 307 | "basic_30days_ordnumratio", "basic_30days_realratio"]: 308 | all = merge_max(all, ["orderid"], i, "%s_max" % i) 309 | for i in ["room_30days_ordnumratio", "room_30days_realratio"]: 310 | all = merge_max(all, ["orderid", "basicroomid"], i, "%s_max" % i) 311 | all["user_roomservice_8_345ratio"]=all["user_roomservice_5_345ratio"] 312 | del all["user_roomservice_5_345ratio"] 313 | all["user_roomservice_8_2ratio"]=1-all["user_roomservice_8_345ratio"]-all["user_roomservice_8_1ratio"] 314 | all["user_roomservice_4_1ratio_3month"] = 1 - all["user_roomservice_4_0ratio_3month"] - all["user_roomservice_4_2ratio_3month"] - all["user_roomservice_4_3ratio_3month"] - all["user_roomservice_4_4ratio_3month"] - all["user_roomservice_4_5ratio_3month"] 315 | all["user_roomservice_4_1ratio_1month"] = 1 - all["user_roomservice_4_0ratio_1month"] - all["user_roomservice_4_2ratio_1month"] - all["user_roomservice_4_3ratio_1month"] - all["user_roomservice_4_4ratio_1month"] - all["user_roomservice_4_5ratio_1month"] 316 | all["user_roomservice_4_1ratio_1week"] = 1 - all["user_roomservice_4_0ratio_1week"] - all["user_roomservice_4_2ratio_1week"] - all["user_roomservice_4_3ratio_1week"] - all["user_roomservice_4_4ratio_1week"] - all["user_roomservice_4_5ratio_1week"] 317 | all["user_roomservice_2_0ratio"]=1-all["user_roomservice_2_1ratio"] 318 | all["user_roomservice_3_0ratio"]=1-all["user_roomservice_3_123ratio"] 319 | all["user_roomservice_5_0ratio"]=1-all["user_roomservice_5_1ratio"] 320 | all["user_roomservice_7_1ratio"]=1-all["user_roomservice_7_0ratio"] 321 | all["user_roomservice_2_max"] = np.argmax(all[["user_roomservice_2_%sratio" % i for i in range(2)]].values, axis=1) 322 | all["user_roomservice_3_max"] = np.argmax(all[["user_roomservice_3_%sratio" % i for i in [0,123]]].values, axis=1) 323 | all["user_roomservice_5_max"] = np.argmax(all[["user_roomservice_5_%sratio" % i for i in range(2)]].values, axis=1) 324 | all["user_roomservice_7_max"] = np.argmax(all[["user_roomservice_7_%sratio" % i for i in range(2)]].values, axis=1) 325 | all["user_roomservice_4_max"]=np.argmax(all[["user_roomservice_4_%sratio"%i for i in range(6)]].values,axis=1) 326 | all["user_roomservice_6_max"]=np.argmax(all[["user_roomservice_6_%sratio"%i for i in range(3)]].values,axis=1) 327 | all["user_roomservice_8_max"]=np.argmax(all[["user_roomservice_8_%sratio"%i for i in [1,2,345]]].values,axis=1) 328 | all["user_roomservice_4_max_1week"]=np.argmax(all[["user_roomservice_4_%sratio_1month"%i for i in range(6)]].values,axis=1) 329 | all["user_roomservice_4_max_1month"]=np.argmax(all[["user_roomservice_4_%sratio_1month"%i for i in range(6)]].values,axis=1) 330 | all["user_roomservice_4_max_3month"]=np.argmax(all[["user_roomservice_4_%sratio_3month"%i for i in range(6)]].values,axis=1) 331 | all["roomservice_8"]=all["roomservice_8"].apply(lambda x:2 if x>2 else x-1) 332 | all["roomservice_3"]=all["roomservice_3"].apply(lambda x:1 if x>0 else 0) 333 | for i in range(2,9): 334 | all["service_equal_%s"%i] = map(lambda x, y: 1 if x == y else 0, all["roomservice_%s"%i], all["user_roomservice_%s_max"%i]) 335 | del all["user_roomservice_2_0ratio"] 336 | del all["user_roomservice_3_0ratio"] 337 | del all["user_roomservice_5_0ratio"] 338 | del all["user_roomservice_7_1ratio"] 339 | 340 | all["roomid_ori"] = all["roomid"] 341 | all["roomid"] = map(lambda x, y: int(str(x)[:-len(str(y))]), all["roomid"], all["rank"]) 342 | 343 | all = all.merge(basicroom_mean, on="basicroomid", how="left").fillna(0) 344 | all = all.merge(basicroom_sum, on="basicroomid", how="left").fillna(0) 345 | 346 | all=df_median(all) 347 | all=df_min(all) 348 | all=df_min_orderid(all) 349 | 350 | all["basicroomid_price_rank"] = all['price_deduct'].groupby([all['orderid'], all['basicroomid']]).rank() 351 | all["orderid_price_deduct_min_rank"] = all['orderid_price_deduct_min'].groupby(all['orderid']).rank() 352 | 353 | all = df_rank_mean(all) 354 | all = df_roomrank_mean(all) 355 | 356 | #添加新特征20170527 357 | #平均值 358 | all=merge_mean(all,["basicroomid"],"basic_week_ordernum_ratio","basic_week_ordernum_ratio_mean") 359 | all=merge_mean(all,["basicroomid"],"basic_recent3_ordernum_ratio","basic_recent3_ordernum_ratio_mean") 360 | all=merge_mean(all,["basicroomid"],"basic_comment_ratio","basic_comment_ratio_mean") 361 | all=merge_mean(all,["basicroomid"],"basic_30days_ordnumratio","basic_30days_ordnumratio_mean") 362 | all=merge_mean(all,["basicroomid"],"basic_30days_realratio","basic_30days_realratio_mean") 363 | all=merge_mean(all,["roomid"],"room_30days_ordnumratio","room_30days_ordnumratio_mean") 364 | all=merge_mean(all,["roomid"],"room_30days_realratio","room_30days_realratio_mean") 365 | 366 | all["city_num"]=all["user_ordernum"]/all["user_citynum"] 367 | all["area_price"]=all["user_avgprice"]/all["user_avgroomarea"] 368 | all["price_max_min_rt"]=all["user_maxprice"]/all["user_minprice"] 369 | all["basicroomid_price_deduct_min_minprice_rt"]=all["basicroomid_price_deduct_min"]/all["user_minprice"] 370 | 371 | all["price_dif"]=all["basicroomid_price_deduct_min"]-all["price_deduct"] 372 | all["price_dif_hotel"]=all["basicroomid_price_deduct_min"]-all["hotel_minprice_lastord"] 373 | all["price_dif_basic"]=all["basicroomid_price_deduct_min"]-all["basic_minprice_lastord"] 374 | 375 | all["price_dif_rt"]=all["basicroomid_price_deduct_min"]/all["price_deduct"] 376 | all["price_dif_hotel_rt"]=all["basicroomid_price_deduct_min"]/all["hotel_minprice_lastord"] 377 | all["price_dif_basic_rt"]=all["basicroomid_price_deduct_min"]/all["basic_minprice_lastord"] 378 | 379 | all["price_dif_hotel"]=all["orderid_price_deduct_min"]-all["price_deduct"] 380 | all["price_dif_hotel_hotel"]=all["orderid_price_deduct_min"]-all["hotel_minprice_lastord"] 381 | all["price_dif_basic_hotel"]=all["orderid_price_deduct_min"]-all["basic_minprice_lastord"] 382 | 383 | all["price_dif_hotel_rt"]=all["orderid_price_deduct_min"]/all["price_deduct"] 384 | all["price_dif_hotel_hotel_rt"]=all["orderid_price_deduct_min"]/all["hotel_minprice_lastord"] 385 | all["price_dif_basic_hotel_rt"]=all["orderid_price_deduct_min"]/all["basic_minprice_lastord"] 386 | 387 | #all["order_basic_minprice_dif"]=all["basicroomid_price_deduct_min"]-all["orderid_price_deduct_min"] 388 | all["order_basic_minprice_rt"]=all["basicroomid_price_deduct_min"]/all["orderid_price_deduct_min"] 389 | #all["hotel_basic_minprice_lastord_rt"]=all["basic_minprice_lastord"]/all["hotel_minprice_lastord"] 390 | 391 | #上次订购的价格和当时最低价的比 392 | all["hotel_last_price_min_rt"]=all["price_last_lastord"]/all["hotel_minprice_lastord"] 393 | all["basic_last_price_min_rt"]=all["price_last_lastord"]/all["basic_minprice_lastord"] 394 | all["hotel_last_price_min_dif"]=all["price_last_lastord"]-all["hotel_minprice_lastord"] 395 | all["basic_last_price_min_dif"]=all["price_last_lastord"]-all["basic_minprice_lastord"] 396 | 397 | 398 | all["price_tail1"]=all["price_deduct"]%10 399 | all["price_tail1"]=map(lambda x:1 if x==4 or x==7 else 0,all["price_tail1"]) 400 | #all["price_tail2"]=all["price_deduct"]%100 401 | all["basic_equal"]=map(lambda x,y:1 if x==y else 0,all["basicroomid"],all["basicroomid_lastord"]) 402 | #del all["basicroomid_lastord"] 403 | all["room_equal"]=map(lambda x,y:1 if x==y else 0,all["roomid"],all["roomid_lastord"]) 404 | #del all["roomid_lastord"] 405 | all["hotel_equal"]=map(lambda x,y:1 if x==y else 0,all["hotelid"],all["hotelid_lastord"]) 406 | #del all["hotelid_lastord"] 407 | all["rank_equal"]=map(lambda x,y:1 if x==y else 0,all["rank"],all["rank_lastord"]) 408 | 409 | #价格高低 410 | all["price_dx"] = map(lambda x, y: x-y, all["price_deduct"], all["price_last_lastord"]) 411 | 412 | all["return_dx"] = map(lambda x, y: x-y, all["returnvalue"], all["return_lastord"]) 413 | 414 | all["price_ori"] = map(lambda x, y:x+y, all["price_deduct"], all["returnvalue"]) 415 | 416 | 417 | for i in [2,3,4,5,6,8]: 418 | all["service_equal_%s"%i] = map(lambda x, y: 1 if x == y else 0, all["roomservice_%s"%i], all["roomservice_%s_lastord"%i]) 419 | del all["roomservice_%s_lastord"%i] 420 | 421 | for i in [2,3,4,5,6]: 422 | all["roomtag_equal_%s"%i] = map(lambda x, y: 1 if x == y else 0, all["roomtag_%s"%i], all["roomtag_%s_lastord"%i]) 423 | del all["roomtag_%s_lastord"%i] 424 | 425 | for i in [1,2,3,4,5,6,7,8,9,10,11]: 426 | all["ordertype_%s_num"%i] = map(lambda x, y:x*y, all["ordertype_%s_ratio"%i], all["user_ordernum"]) 427 | del all["ordertype_%s_ratio"%i] 428 | 429 | #所有的 430 | for c in ["orderbehavior_1_ratio","orderbehavior_2_ratio","orderbehavior_6_ratio","orderbehavior_7_ratio", 431 | #"user_roomservice_4_0ratio","user_roomservice_4_1ratio","user_roomservice_4_2ratio","user_roomservice_4_3ratio","user_roomservice_4_4ratio","user_roomservice_4_5ratio","user_roomservice_3_123ratio","user_roomservice_6_2ratio","user_roomservice_6_1ratio","user_roomservice_6_0ratio","user_roomservice_5_1ratio","user_roomservice_7_0ratio","user_roomservice_2_1ratio","user_roomservice_8_1ratio","user_roomservice_5_345ratio" 432 | 433 | ]: 434 | all[c]=map(lambda x,y:x*y,all[c],all["user_ordernum"]) 435 | 436 | #一周的 437 | for c in ["orderbehavior_3_ratio_1week","orderbehavior_4_ratio_1week","orderbehavior_5_ratio_1week", 438 | #"user_roomservice_3_123ratio_1week","user_roomservice_7_1ratio_1week","user_roomservice_7_0ratio_1week","user_roomservice_4_5ratio_1week","user_roomservice_4_4ratio_1week","user_roomservice_4_2ratio_1week","user_roomservice_4_3ratio_1week","user_roomservice_4_0ratio_1week" 439 | ]: 440 | all[c] = map(lambda x,y: x * y, all[c], all["user_ordnum_1week"]) 441 | 442 | #一个月的 443 | for c in ["orderbehavior_3_ratio_1month","orderbehavior_4_ratio_1month","orderbehavior_5_ratio_1month", 444 | #"user_roomservice_3_123ratio_1month", "user_roomservice_7_1ratio_1month", "user_roomservice_7_0ratio_1month","user_roomservice_4_5ratio_1month", "user_roomservice_4_4ratio_1month", "user_roomservice_4_2ratio_1month","user_roomservice_4_3ratio_1month", "user_roomservice_4_0ratio_1month" 445 | 446 | ]: 447 | all[c] = map(lambda x,y: x * y, all[c], all["user_ordnum_1month"]) 448 | 449 | #三个月的 450 | for c in ["orderbehavior_3_ratio_3month","orderbehavior_4_ratio_3month","orderbehavior_5_ratio_3month", 451 | #"user_roomservice_3_123ratio_3month", "user_roomservice_7_1ratio_3month", "user_roomservice_7_0ratio_3month","user_roomservice_4_5ratio_3month", "user_roomservice_4_4ratio_3month", "user_roomservice_4_2ratio_3month","user_roomservice_4_3ratio_3month", "user_roomservice_4_0ratio_3month" 452 | 453 | ]: 454 | all[c] = map(lambda x,y: x * y, all[c], all["user_ordnum_3month"]) 455 | 456 | 457 | all["price_star"]=all["price_deduct"]/(all["star"]-1) 458 | all["price_minarea"]=all["price_deduct"]/(all["basic_minarea"]-1) 459 | 460 | all["star_dif"]=all["user_avgstar"]-all["star"] 461 | 462 | all["price_ave_dif_rt"]=all["price_deduct"]/all["user_avgdealprice"] 463 | all["price_ave_star_dif"]=all["price_deduct"]/all["user_avgprice_star"] 464 | all["price_h_w_rt"]=all["user_avgdealpriceholiday"]/all["user_avgdealpriceworkday"] 465 | 466 | all["price_ave_dif"] = all["price_deduct"] - all["user_avgdealprice"] 467 | 468 | all["user_roomservice_4_32_rt"]=all["user_roomservice_4_3ratio"]/all["user_roomservice_4_2ratio"] 469 | all["user_roomservice_4_43_rt"]=all["user_roomservice_4_4ratio"]/all["user_roomservice_4_3ratio"] 470 | 471 | print all.shape 472 | 473 | online = model.predict(all.values) 474 | online = pd.DataFrame(online) 475 | online.columns = ["prob"] 476 | online["orderid"] = all["orderid"].values 477 | online["basicroomid"] = all["basicroomid"].values 478 | online["predict_roomid"] = all["roomid_ori"].values 479 | online["roomid"] = all["roomid"].values 480 | 481 | if j==0: 482 | result=online 483 | else: 484 | result=result.append(online) 485 | 486 | result.to_csv("all_result_v31_24680_test_feature.csv",index=None) 487 | del result["basicroomid"] 488 | del result["roomid"] 489 | result = result.sort_values("prob") 490 | del result["prob"] 491 | result = result.drop_duplicates("orderid", keep="last") 492 | result["orderid"]=result["orderid"].apply(lambda x:"ORDER_"+str(x)) 493 | result["predict_roomid"]=result["predict_roomid"].apply(lambda x:"ROOM_"+str(x)) 494 | result.to_csv("sub_v31_24680.csv",index=None) 495 | 496 | 497 | for j in [1,3,5,7,9]: 498 | print j 499 | all=pd.read_csv("try/offline_%s.csv"%j) 500 | del all["orderlabel"] 501 | 502 | #20170620添加特征 503 | for i in ["basic_week_ordernum_ratio", "basic_recent3_ordernum_ratio", "basic_comment_ratio", 504 | "basic_30days_ordnumratio", "basic_30days_realratio"]: 505 | all = merge_max(all, ["orderid"], i, "%s_max" % i) 506 | for i in ["room_30days_ordnumratio", "room_30days_realratio"]: 507 | all = merge_max(all, ["orderid", "basicroomid"], i, "%s_max" % i) 508 | all["user_roomservice_8_345ratio"]=all["user_roomservice_5_345ratio"] 509 | del all["user_roomservice_5_345ratio"] 510 | all["user_roomservice_8_2ratio"]=1-all["user_roomservice_8_345ratio"]-all["user_roomservice_8_1ratio"] 511 | all["user_roomservice_4_1ratio_3month"] = 1 - all["user_roomservice_4_0ratio_3month"] - all["user_roomservice_4_2ratio_3month"] - all["user_roomservice_4_3ratio_3month"] - all["user_roomservice_4_4ratio_3month"] - all["user_roomservice_4_5ratio_3month"] 512 | all["user_roomservice_4_1ratio_1month"] = 1 - all["user_roomservice_4_0ratio_1month"] - all["user_roomservice_4_2ratio_1month"] - all["user_roomservice_4_3ratio_1month"] - all["user_roomservice_4_4ratio_1month"] - all["user_roomservice_4_5ratio_1month"] 513 | all["user_roomservice_4_1ratio_1week"] = 1 - all["user_roomservice_4_0ratio_1week"] - all["user_roomservice_4_2ratio_1week"] - all["user_roomservice_4_3ratio_1week"] - all["user_roomservice_4_4ratio_1week"] - all["user_roomservice_4_5ratio_1week"] 514 | all["user_roomservice_2_0ratio"]=1-all["user_roomservice_2_1ratio"] 515 | all["user_roomservice_3_0ratio"]=1-all["user_roomservice_3_123ratio"] 516 | all["user_roomservice_5_0ratio"]=1-all["user_roomservice_5_1ratio"] 517 | all["user_roomservice_7_1ratio"]=1-all["user_roomservice_7_0ratio"] 518 | all["user_roomservice_2_max"] = np.argmax(all[["user_roomservice_2_%sratio" % i for i in range(2)]].values, axis=1) 519 | all["user_roomservice_3_max"] = np.argmax(all[["user_roomservice_3_%sratio" % i for i in [0,123]]].values, axis=1) 520 | all["user_roomservice_5_max"] = np.argmax(all[["user_roomservice_5_%sratio" % i for i in range(2)]].values, axis=1) 521 | all["user_roomservice_7_max"] = np.argmax(all[["user_roomservice_7_%sratio" % i for i in range(2)]].values, axis=1) 522 | all["user_roomservice_4_max"]=np.argmax(all[["user_roomservice_4_%sratio"%i for i in range(6)]].values,axis=1) 523 | all["user_roomservice_6_max"]=np.argmax(all[["user_roomservice_6_%sratio"%i for i in range(3)]].values,axis=1) 524 | all["user_roomservice_8_max"]=np.argmax(all[["user_roomservice_8_%sratio"%i for i in [1,2,345]]].values,axis=1) 525 | all["user_roomservice_4_max_1week"]=np.argmax(all[["user_roomservice_4_%sratio_1month"%i for i in range(6)]].values,axis=1) 526 | all["user_roomservice_4_max_1month"]=np.argmax(all[["user_roomservice_4_%sratio_1month"%i for i in range(6)]].values,axis=1) 527 | all["user_roomservice_4_max_3month"]=np.argmax(all[["user_roomservice_4_%sratio_3month"%i for i in range(6)]].values,axis=1) 528 | all["roomservice_8"]=all["roomservice_8"].apply(lambda x:2 if x>2 else x-1) 529 | all["roomservice_3"]=all["roomservice_3"].apply(lambda x:1 if x>0 else 0) 530 | for i in range(2,9): 531 | all["service_equal_%s"%i] = map(lambda x, y: 1 if x == y else 0, all["roomservice_%s"%i], all["user_roomservice_%s_max"%i]) 532 | del all["user_roomservice_2_0ratio"] 533 | del all["user_roomservice_3_0ratio"] 534 | del all["user_roomservice_5_0ratio"] 535 | del all["user_roomservice_7_1ratio"] 536 | 537 | all["roomid_ori"] = all["roomid"] 538 | all["roomid"] = map(lambda x, y: int(str(x)[:-len(str(y))]), all["roomid"], all["rank"]) 539 | 540 | all = all.merge(basicroom_mean, on="basicroomid", how="left").fillna(0) 541 | all = all.merge(basicroom_sum, on="basicroomid", how="left").fillna(0) 542 | 543 | all=df_median(all) 544 | all=df_min(all) 545 | all=df_min_orderid(all) 546 | 547 | all["basicroomid_price_rank"] = all['price_deduct'].groupby([all['orderid'], all['basicroomid']]).rank() 548 | all["orderid_price_deduct_min_rank"] = all['orderid_price_deduct_min'].groupby(all['orderid']).rank() 549 | 550 | all = df_rank_mean(all) 551 | all = df_roomrank_mean(all) 552 | 553 | #添加新特征20170527 554 | #平均值 555 | all=merge_mean(all,["basicroomid"],"basic_week_ordernum_ratio","basic_week_ordernum_ratio_mean") 556 | all=merge_mean(all,["basicroomid"],"basic_recent3_ordernum_ratio","basic_recent3_ordernum_ratio_mean") 557 | all=merge_mean(all,["basicroomid"],"basic_comment_ratio","basic_comment_ratio_mean") 558 | all=merge_mean(all,["basicroomid"],"basic_30days_ordnumratio","basic_30days_ordnumratio_mean") 559 | all=merge_mean(all,["basicroomid"],"basic_30days_realratio","basic_30days_realratio_mean") 560 | all=merge_mean(all,["roomid"],"room_30days_ordnumratio","room_30days_ordnumratio_mean") 561 | all=merge_mean(all,["roomid"],"room_30days_realratio","room_30days_realratio_mean") 562 | 563 | all["city_num"]=all["user_ordernum"]/all["user_citynum"] 564 | all["area_price"]=all["user_avgprice"]/all["user_avgroomarea"] 565 | all["price_max_min_rt"]=all["user_maxprice"]/all["user_minprice"] 566 | all["basicroomid_price_deduct_min_minprice_rt"]=all["basicroomid_price_deduct_min"]/all["user_minprice"] 567 | 568 | all["price_dif"]=all["basicroomid_price_deduct_min"]-all["price_deduct"] 569 | all["price_dif_hotel"]=all["basicroomid_price_deduct_min"]-all["hotel_minprice_lastord"] 570 | all["price_dif_basic"]=all["basicroomid_price_deduct_min"]-all["basic_minprice_lastord"] 571 | 572 | all["price_dif_rt"]=all["basicroomid_price_deduct_min"]/all["price_deduct"] 573 | all["price_dif_hotel_rt"]=all["basicroomid_price_deduct_min"]/all["hotel_minprice_lastord"] 574 | all["price_dif_basic_rt"]=all["basicroomid_price_deduct_min"]/all["basic_minprice_lastord"] 575 | 576 | all["price_dif_hotel"]=all["orderid_price_deduct_min"]-all["price_deduct"] 577 | all["price_dif_hotel_hotel"]=all["orderid_price_deduct_min"]-all["hotel_minprice_lastord"] 578 | all["price_dif_basic_hotel"]=all["orderid_price_deduct_min"]-all["basic_minprice_lastord"] 579 | 580 | all["price_dif_hotel_rt"]=all["orderid_price_deduct_min"]/all["price_deduct"] 581 | all["price_dif_hotel_hotel_rt"]=all["orderid_price_deduct_min"]/all["hotel_minprice_lastord"] 582 | all["price_dif_basic_hotel_rt"]=all["orderid_price_deduct_min"]/all["basic_minprice_lastord"] 583 | 584 | #all["order_basic_minprice_dif"]=all["basicroomid_price_deduct_min"]-all["orderid_price_deduct_min"] 585 | all["order_basic_minprice_rt"]=all["basicroomid_price_deduct_min"]/all["orderid_price_deduct_min"] 586 | #all["hotel_basic_minprice_lastord_rt"]=all["basic_minprice_lastord"]/all["hotel_minprice_lastord"] 587 | 588 | #上次订购的价格和当时最低价的比 589 | all["hotel_last_price_min_rt"]=all["price_last_lastord"]/all["hotel_minprice_lastord"] 590 | all["basic_last_price_min_rt"]=all["price_last_lastord"]/all["basic_minprice_lastord"] 591 | all["hotel_last_price_min_dif"]=all["price_last_lastord"]-all["hotel_minprice_lastord"] 592 | all["basic_last_price_min_dif"]=all["price_last_lastord"]-all["basic_minprice_lastord"] 593 | 594 | 595 | all["price_tail1"]=all["price_deduct"]%10 596 | all["price_tail1"]=map(lambda x:1 if x==4 or x==7 else 0,all["price_tail1"]) 597 | #all["price_tail2"]=all["price_deduct"]%100 598 | all["basic_equal"]=map(lambda x,y:1 if x==y else 0,all["basicroomid"],all["basicroomid_lastord"]) 599 | #del all["basicroomid_lastord"] 600 | all["room_equal"]=map(lambda x,y:1 if x==y else 0,all["roomid"],all["roomid_lastord"]) 601 | #del all["roomid_lastord"] 602 | all["hotel_equal"]=map(lambda x,y:1 if x==y else 0,all["hotelid"],all["hotelid_lastord"]) 603 | #del all["hotelid_lastord"] 604 | all["rank_equal"]=map(lambda x,y:1 if x==y else 0,all["rank"],all["rank_lastord"]) 605 | 606 | #价格高低 607 | all["price_dx"] = map(lambda x, y: x-y, all["price_deduct"], all["price_last_lastord"]) 608 | 609 | all["return_dx"] = map(lambda x, y: x-y, all["returnvalue"], all["return_lastord"]) 610 | 611 | all["price_ori"] = map(lambda x, y:x+y, all["price_deduct"], all["returnvalue"]) 612 | 613 | 614 | for i in [2,3,4,5,6,8]: 615 | all["service_equal_%s"%i] = map(lambda x, y: 1 if x == y else 0, all["roomservice_%s"%i], all["roomservice_%s_lastord"%i]) 616 | del all["roomservice_%s_lastord"%i] 617 | 618 | for i in [2,3,4,5,6]: 619 | all["roomtag_equal_%s"%i] = map(lambda x, y: 1 if x == y else 0, all["roomtag_%s"%i], all["roomtag_%s_lastord"%i]) 620 | del all["roomtag_%s_lastord"%i] 621 | 622 | for i in [1,2,3,4,5,6,7,8,9,10,11]: 623 | all["ordertype_%s_num"%i] = map(lambda x, y:x*y, all["ordertype_%s_ratio"%i], all["user_ordernum"]) 624 | del all["ordertype_%s_ratio"%i] 625 | 626 | #所有的 627 | for c in ["orderbehavior_1_ratio","orderbehavior_2_ratio","orderbehavior_6_ratio","orderbehavior_7_ratio", 628 | #"user_roomservice_4_0ratio","user_roomservice_4_1ratio","user_roomservice_4_2ratio","user_roomservice_4_3ratio","user_roomservice_4_4ratio","user_roomservice_4_5ratio","user_roomservice_3_123ratio","user_roomservice_6_2ratio","user_roomservice_6_1ratio","user_roomservice_6_0ratio","user_roomservice_5_1ratio","user_roomservice_7_0ratio","user_roomservice_2_1ratio","user_roomservice_8_1ratio","user_roomservice_5_345ratio" 629 | 630 | ]: 631 | all[c]=map(lambda x,y:x*y,all[c],all["user_ordernum"]) 632 | 633 | #一周的 634 | for c in ["orderbehavior_3_ratio_1week","orderbehavior_4_ratio_1week","orderbehavior_5_ratio_1week", 635 | #"user_roomservice_3_123ratio_1week","user_roomservice_7_1ratio_1week","user_roomservice_7_0ratio_1week","user_roomservice_4_5ratio_1week","user_roomservice_4_4ratio_1week","user_roomservice_4_2ratio_1week","user_roomservice_4_3ratio_1week","user_roomservice_4_0ratio_1week" 636 | ]: 637 | all[c] = map(lambda x,y: x * y, all[c], all["user_ordnum_1week"]) 638 | 639 | #一个月的 640 | for c in ["orderbehavior_3_ratio_1month","orderbehavior_4_ratio_1month","orderbehavior_5_ratio_1month", 641 | #"user_roomservice_3_123ratio_1month", "user_roomservice_7_1ratio_1month", "user_roomservice_7_0ratio_1month","user_roomservice_4_5ratio_1month", "user_roomservice_4_4ratio_1month", "user_roomservice_4_2ratio_1month","user_roomservice_4_3ratio_1month", "user_roomservice_4_0ratio_1month" 642 | 643 | ]: 644 | all[c] = map(lambda x,y: x * y, all[c], all["user_ordnum_1month"]) 645 | 646 | #三个月的 647 | for c in ["orderbehavior_3_ratio_3month","orderbehavior_4_ratio_3month","orderbehavior_5_ratio_3month", 648 | #"user_roomservice_3_123ratio_3month", "user_roomservice_7_1ratio_3month", "user_roomservice_7_0ratio_3month","user_roomservice_4_5ratio_3month", "user_roomservice_4_4ratio_3month", "user_roomservice_4_2ratio_3month","user_roomservice_4_3ratio_3month", "user_roomservice_4_0ratio_3month" 649 | 650 | ]: 651 | all[c] = map(lambda x,y: x * y, all[c], all["user_ordnum_3month"]) 652 | 653 | 654 | all["price_star"]=all["price_deduct"]/(all["star"]-1) 655 | all["price_minarea"]=all["price_deduct"]/(all["basic_minarea"]-1) 656 | 657 | all["star_dif"]=all["user_avgstar"]-all["star"] 658 | 659 | all["price_ave_dif_rt"]=all["price_deduct"]/all["user_avgdealprice"] 660 | all["price_ave_star_dif"]=all["price_deduct"]/all["user_avgprice_star"] 661 | all["price_h_w_rt"]=all["user_avgdealpriceholiday"]/all["user_avgdealpriceworkday"] 662 | 663 | all["price_ave_dif"] = all["price_deduct"] - all["user_avgdealprice"] 664 | 665 | all["user_roomservice_4_32_rt"]=all["user_roomservice_4_3ratio"]/all["user_roomservice_4_2ratio"] 666 | all["user_roomservice_4_43_rt"]=all["user_roomservice_4_4ratio"]/all["user_roomservice_4_3ratio"] 667 | 668 | print all.shape 669 | 670 | online = model.predict(all.values) 671 | online = pd.DataFrame(online) 672 | online.columns = ["prob"] 673 | online["orderid"] = all["orderid"].values 674 | online["basicroomid"] = all["basicroomid"].values 675 | online["predict_roomid"] = all["roomid_ori"].values 676 | online["roomid"] = all["roomid"].values 677 | 678 | if j==0 or j==1: 679 | result=online 680 | else: 681 | result=result.append(online) 682 | 683 | result.to_csv("all_result_v31_24680_train_feature.csv",index=None) 684 | 685 | -------------------------------------------------------------------------------- /ctrip_test_all_data_v42.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | import pandas as pd 3 | import lightgbm as lgb 4 | import numpy as np 5 | from com_uitl import * 6 | 7 | 8 | # 每个basicid价格的中位数 9 | def df_median(df): 10 | add = pd.DataFrame(df.groupby(["orderid", "basicroomid"]).price_deduct.median()).reset_index() 11 | add.columns = ["orderid", "basicroomid", "basicroomid_price_deduct_median"] 12 | df = df.merge(add, on=["orderid", "basicroomid"], how="left") 13 | return df 14 | 15 | # 每个basicid价格的最小值 16 | def df_min(df): 17 | add = pd.DataFrame(df.groupby(["orderid", "basicroomid"]).price_deduct.min()).reset_index() 18 | add.columns = ["orderid", "basicroomid", "basicroomid_price_deduct_min"] 19 | df = df.merge(add, on=["orderid", "basicroomid"], how="left") 20 | return df 21 | 22 | # 每个orderid价格的最小值 23 | def df_min_orderid(df): 24 | add = pd.DataFrame(df.groupby(["orderid"]).price_deduct.min()).reset_index() 25 | add.columns = ["orderid", "orderid_price_deduct_min"] 26 | df = df.merge(add, on=["orderid"], how="left") 27 | return df 28 | 29 | #排序特征 30 | def df_rank_mean(df): 31 | add = pd.DataFrame(df.groupby(["basicroomid"]).orderid_price_deduct_min_rank.mean()).reset_index() 32 | add.columns = ["basicroomid","orderid_price_deduct_min_rank_mean"] 33 | df = df.merge(add, on=["basicroomid"], how="left") 34 | return df 35 | 36 | def df_roomrank_mean(df): 37 | add = pd.DataFrame(df.groupby(["roomid"]).basicroomid_price_rank.mean()).reset_index() 38 | add.columns = ["roomid","basicroomid_price_rank_mean"] 39 | df = df.merge(add, on=["roomid"], how="left") 40 | return df 41 | 42 | 43 | #添加转化率特征 44 | leak=pd.read_csv("try/leak.csv") 45 | #提取basicroomid的转化率 46 | feature_df=leak[["orderid","basicroomid","orderlabel"]].copy() 47 | feature_df.sort_values("orderlabel") 48 | feature_df=feature_df.drop_duplicates(["orderid","basicroomid"],keep="last") 49 | basicroom_mean=pd.DataFrame(feature_df.groupby("basicroomid").orderlabel.mean()).reset_index() 50 | basicroom_mean.columns=["basicroomid","basicroomid_mean"] 51 | 52 | basicroom_sum=pd.DataFrame(feature_df.groupby("basicroomid").orderlabel.sum()).reset_index() 53 | basicroom_sum.columns=["basicroomid","basicroomid_sum"] 54 | 55 | del leak 56 | feature_train=pd.read_csv("all_result_v31_24680_train_feature.csv")[["prob","orderid","basicroomid","predict_roomid"]] 57 | feature_train.columns=["prob","orderid","basicroomid","roomid"] 58 | feature_test=pd.read_csv("all_result_v31_24680_test_feature.csv")[["prob","orderid","basicroomid","predict_roomid"]] 59 | feature_test.columns=["prob","orderid","basicroomid","roomid"] 60 | #构造特征 61 | #for j in range(10): 62 | for j in [1,3,5,7,9]: 63 | print j 64 | all=pd.read_csv("try/offline_%s.csv"%j) 65 | all=all.merge(feature_train,on=["orderid","basicroomid","roomid"],how="left") 66 | 67 | #20170620添加特征 68 | for i in ["basic_week_ordernum_ratio", "basic_recent3_ordernum_ratio", "basic_comment_ratio", 69 | "basic_30days_ordnumratio", "basic_30days_realratio"]: 70 | all = merge_max(all, ["orderid"], i, "%s_max" % i) 71 | for i in ["room_30days_ordnumratio", "room_30days_realratio"]: 72 | all = merge_max(all, ["orderid", "basicroomid"], i, "%s_max" % i) 73 | all["user_roomservice_8_345ratio"]=all["user_roomservice_5_345ratio"] 74 | del all["user_roomservice_5_345ratio"] 75 | all["user_roomservice_8_2ratio"]=1-all["user_roomservice_8_345ratio"]-all["user_roomservice_8_1ratio"] 76 | all["user_roomservice_4_1ratio_3month"] = 1 - all["user_roomservice_4_0ratio_3month"] - all["user_roomservice_4_2ratio_3month"] - all["user_roomservice_4_3ratio_3month"] - all["user_roomservice_4_4ratio_3month"] - all["user_roomservice_4_5ratio_3month"] 77 | all["user_roomservice_4_1ratio_1month"] = 1 - all["user_roomservice_4_0ratio_1month"] - all["user_roomservice_4_2ratio_1month"] - all["user_roomservice_4_3ratio_1month"] - all["user_roomservice_4_4ratio_1month"] - all["user_roomservice_4_5ratio_1month"] 78 | all["user_roomservice_4_1ratio_1week"] = 1 - all["user_roomservice_4_0ratio_1week"] - all["user_roomservice_4_2ratio_1week"] - all["user_roomservice_4_3ratio_1week"] - all["user_roomservice_4_4ratio_1week"] - all["user_roomservice_4_5ratio_1week"] 79 | all["user_roomservice_2_0ratio"]=1-all["user_roomservice_2_1ratio"] 80 | all["user_roomservice_3_0ratio"]=1-all["user_roomservice_3_123ratio"] 81 | all["user_roomservice_5_0ratio"]=1-all["user_roomservice_5_1ratio"] 82 | all["user_roomservice_7_1ratio"]=1-all["user_roomservice_7_0ratio"] 83 | all["user_roomservice_2_max"] = np.argmax(all[["user_roomservice_2_%sratio" % i for i in range(2)]].values, axis=1) 84 | all["user_roomservice_3_max"] = np.argmax(all[["user_roomservice_3_%sratio" % i for i in [0,123]]].values, axis=1) 85 | all["user_roomservice_5_max"] = np.argmax(all[["user_roomservice_5_%sratio" % i for i in range(2)]].values, axis=1) 86 | all["user_roomservice_7_max"] = np.argmax(all[["user_roomservice_7_%sratio" % i for i in range(2)]].values, axis=1) 87 | all["user_roomservice_4_max"]=np.argmax(all[["user_roomservice_4_%sratio"%i for i in range(6)]].values,axis=1) 88 | all["user_roomservice_6_max"]=np.argmax(all[["user_roomservice_6_%sratio"%i for i in range(3)]].values,axis=1) 89 | all["user_roomservice_8_max"]=np.argmax(all[["user_roomservice_8_%sratio"%i for i in [1,2,345]]].values,axis=1) 90 | all["user_roomservice_4_max_1week"]=np.argmax(all[["user_roomservice_4_%sratio_1month"%i for i in range(6)]].values,axis=1) 91 | all["user_roomservice_4_max_1month"]=np.argmax(all[["user_roomservice_4_%sratio_1month"%i for i in range(6)]].values,axis=1) 92 | all["user_roomservice_4_max_3month"]=np.argmax(all[["user_roomservice_4_%sratio_3month"%i for i in range(6)]].values,axis=1) 93 | all["roomservice_8"]=all["roomservice_8"].apply(lambda x:2 if x>2 else x-1) 94 | all["roomservice_3"]=all["roomservice_3"].apply(lambda x:1 if x>0 else 0) 95 | for i in range(2,9): 96 | all["service_equal_%s"%i] = map(lambda x, y: 1 if x == y else 0, all["roomservice_%s"%i], all["user_roomservice_%s_max"%i]) 97 | del all["user_roomservice_2_0ratio"] 98 | del all["user_roomservice_3_0ratio"] 99 | del all["user_roomservice_5_0ratio"] 100 | del all["user_roomservice_7_1ratio"] 101 | 102 | 103 | 104 | all["roomid_ori"] = all["roomid"] 105 | all["roomid"] = map(lambda x, y: int(str(x)[:-len(str(y))]), all["roomid"], all["rank"]) 106 | 107 | #使用prob构造特征 108 | all=merge_max(all,["orderid","basicroomid"],"prob","basic_prob_max") 109 | all=merge_max(all,["orderid","basicroomid","roomid"],"prob","room_prob_max") 110 | all=merge_sum(all,["orderid","basicroomid"],"prob","basic_prob_sum") 111 | all=merge_sum(all,["orderid","basicroomid","roomid"],"prob","room_prob_sum") 112 | all=merge_mean(all,["orderid","basicroomid"],"prob","basic_prob_mean") 113 | all=merge_mean(all,["orderid","basicroomid","roomid"],"prob","room_prob_mean") 114 | all=merge_mean(all,["rank"],"prob","rank_prob_mean") 115 | all=merge_mean(all,["orderid","rank"],"prob","order_rank_prob_mean") 116 | all = merge_max(all, ["orderid"], "prob", "orderid_prob_max") 117 | all["orderid_prob_max_rt"]=all["prob"]/all["orderid_prob_max"] 118 | all["basic_prob_max_rt"]=all["prob"]/all["basic_prob_max"] 119 | all["room_prob_max_rt"]=all["prob"]/all["room_prob_max"] 120 | all["basic_prob_mean_rt"]=all["prob"]/all["basic_prob_mean"] 121 | all["room_prob_mean_rt"]=all["prob"]/all["room_prob_mean"] 122 | all["order_rank_prob_mean_rt"]=all["prob"]/all["order_rank_prob_mean"] 123 | #根据service统计 124 | for i in range(1, 9): 125 | all = merge_mean(all, ["orderid", "roomservice_%s" % i], "prob", "roomservice_prob_mean_%s" % i) 126 | 127 | all = all.merge(basicroom_mean, on="basicroomid", how="left").fillna(0) 128 | all = all.merge(basicroom_sum, on="basicroomid", how="left").fillna(0) 129 | 130 | all=df_median(all) 131 | all=df_min(all) 132 | all=df_min_orderid(all) 133 | 134 | all["basicroomid_price_rank"] = all['price_deduct'].groupby([all['orderid'], all['basicroomid']]).rank() 135 | all["orderid_price_deduct_min_rank"] = all['orderid_price_deduct_min'].groupby(all['orderid']).rank() 136 | 137 | all = df_rank_mean(all) 138 | all = df_roomrank_mean(all) 139 | 140 | 141 | #添加新特征20170527 142 | #平均值 143 | all=merge_mean(all,["basicroomid"],"basic_week_ordernum_ratio","basic_week_ordernum_ratio_mean") 144 | all=merge_mean(all,["basicroomid"],"basic_recent3_ordernum_ratio","basic_recent3_ordernum_ratio_mean") 145 | all=merge_mean(all,["basicroomid"],"basic_comment_ratio","basic_comment_ratio_mean") 146 | all=merge_mean(all,["basicroomid"],"basic_30days_ordnumratio","basic_30days_ordnumratio_mean") 147 | all=merge_mean(all,["basicroomid"],"basic_30days_realratio","basic_30days_realratio_mean") 148 | all=merge_mean(all,["roomid"],"room_30days_ordnumratio","room_30days_ordnumratio_mean") 149 | all=merge_mean(all,["roomid"],"room_30days_realratio","room_30days_realratio_mean") 150 | 151 | 152 | all["city_num"]=all["user_ordernum"]/all["user_citynum"] 153 | all["area_price"]=all["user_avgprice"]/all["user_avgroomarea"] 154 | all["price_max_min_rt"]=all["user_maxprice"]/all["user_minprice"] 155 | all["basicroomid_price_deduct_min_minprice_rt"]=all["basicroomid_price_deduct_min"]/all["user_minprice"] 156 | 157 | all["price_dif"]=all["basicroomid_price_deduct_min"]-all["price_deduct"] 158 | all["price_dif_hotel"]=all["basicroomid_price_deduct_min"]-all["hotel_minprice_lastord"] 159 | all["price_dif_basic"]=all["basicroomid_price_deduct_min"]-all["basic_minprice_lastord"] 160 | 161 | all["price_dif_rt"]=all["basicroomid_price_deduct_min"]/all["price_deduct"] 162 | all["price_dif_hotel_rt"]=all["basicroomid_price_deduct_min"]/all["hotel_minprice_lastord"] 163 | all["price_dif_basic_rt"]=all["basicroomid_price_deduct_min"]/all["basic_minprice_lastord"] 164 | 165 | all["price_dif_hotel"]=all["orderid_price_deduct_min"]-all["price_deduct"] 166 | all["price_dif_hotel_hotel"]=all["orderid_price_deduct_min"]-all["hotel_minprice_lastord"] 167 | all["price_dif_basic_hotel"]=all["orderid_price_deduct_min"]-all["basic_minprice_lastord"] 168 | 169 | all["price_dif_hotel_rt"]=all["orderid_price_deduct_min"]/all["price_deduct"] 170 | all["price_dif_hotel_hotel_rt"]=all["orderid_price_deduct_min"]/all["hotel_minprice_lastord"] 171 | all["price_dif_basic_hotel_rt"]=all["orderid_price_deduct_min"]/all["basic_minprice_lastord"] 172 | 173 | #all["order_basic_minprice_dif"]=all["basicroomid_price_deduct_min"]-all["orderid_price_deduct_min"] 174 | all["order_basic_minprice_rt"]=all["basicroomid_price_deduct_min"]/all["orderid_price_deduct_min"] 175 | #all["hotel_basic_minprice_lastord_rt"]=all["basic_minprice_lastord"]/all["hotel_minprice_lastord"] 176 | 177 | #上次订购的价格和当时最低价的比 178 | all["hotel_last_price_min_rt"]=all["price_last_lastord"]/all["hotel_minprice_lastord"] 179 | all["basic_last_price_min_rt"]=all["price_last_lastord"]/all["basic_minprice_lastord"] 180 | all["hotel_last_price_min_dif"]=all["price_last_lastord"]-all["hotel_minprice_lastord"] 181 | all["basic_last_price_min_dif"]=all["price_last_lastord"]-all["basic_minprice_lastord"] 182 | 183 | 184 | all["price_tail1"]=all["price_deduct"]%10 185 | all["price_tail1"]=map(lambda x:1 if x==4 or x==7 else 0,all["price_tail1"]) 186 | #all["price_tail2"]=all["price_deduct"]%100 187 | all["basic_equal"]=map(lambda x,y:1 if x==y else 0,all["basicroomid"],all["basicroomid_lastord"]) 188 | #del all["basicroomid_lastord"] 189 | all["room_equal"]=map(lambda x,y:1 if x==y else 0,all["roomid"],all["roomid_lastord"]) 190 | #del all["roomid_lastord"] 191 | all["hotel_equal"]=map(lambda x,y:1 if x==y else 0,all["hotelid"],all["hotelid_lastord"]) 192 | #del all["hotelid_lastord"] 193 | all["rank_equal"]=map(lambda x,y:1 if x==y else 0,all["rank"],all["rank_lastord"]) 194 | 195 | #价格高低 196 | all["price_dx"] = map(lambda x, y: x-y, all["price_deduct"], all["price_last_lastord"]) 197 | 198 | all["return_dx"] = map(lambda x, y: x-y, all["returnvalue"], all["return_lastord"]) 199 | 200 | all["price_ori"] = map(lambda x, y:x+y, all["price_deduct"], all["returnvalue"]) 201 | 202 | 203 | for i in [2,3,4,5,6,8]: 204 | all["service_equal_%s"%i] = map(lambda x, y: 1 if x == y else 0, all["roomservice_%s"%i], all["roomservice_%s_lastord"%i]) 205 | del all["roomservice_%s_lastord"%i] 206 | 207 | for i in [2,3,4,5,6]: 208 | all["roomtag_equal_%s"%i] = map(lambda x, y: 1 if x == y else 0, all["roomtag_%s"%i], all["roomtag_%s_lastord"%i]) 209 | del all["roomtag_%s_lastord"%i] 210 | 211 | for i in [1,2,3,4,5,6,7,8,9,10,11]: 212 | all["ordertype_%s_num"%i] = map(lambda x, y:x*y, all["ordertype_%s_ratio"%i], all["user_ordernum"]) 213 | del all["ordertype_%s_ratio"%i] 214 | 215 | #所有的 216 | for c in ["orderbehavior_1_ratio","orderbehavior_2_ratio","orderbehavior_6_ratio","orderbehavior_7_ratio", 217 | #"user_roomservice_4_0ratio","user_roomservice_4_1ratio","user_roomservice_4_2ratio","user_roomservice_4_3ratio","user_roomservice_4_4ratio","user_roomservice_4_5ratio","user_roomservice_3_123ratio","user_roomservice_6_2ratio","user_roomservice_6_1ratio","user_roomservice_6_0ratio","user_roomservice_5_1ratio","user_roomservice_7_0ratio","user_roomservice_2_1ratio","user_roomservice_8_1ratio","user_roomservice_5_345ratio" 218 | 219 | ]: 220 | all[c]=map(lambda x,y:x*y,all[c],all["user_ordernum"]) 221 | 222 | #一周的 223 | for c in ["orderbehavior_3_ratio_1week","orderbehavior_4_ratio_1week","orderbehavior_5_ratio_1week", 224 | #"user_roomservice_3_123ratio_1week","user_roomservice_7_1ratio_1week","user_roomservice_7_0ratio_1week","user_roomservice_4_5ratio_1week","user_roomservice_4_4ratio_1week","user_roomservice_4_2ratio_1week","user_roomservice_4_3ratio_1week","user_roomservice_4_0ratio_1week" 225 | ]: 226 | all[c] = map(lambda x,y: x * y, all[c], all["user_ordnum_1week"]) 227 | 228 | #一个月的 229 | for c in ["orderbehavior_3_ratio_1month","orderbehavior_4_ratio_1month","orderbehavior_5_ratio_1month", 230 | #"user_roomservice_3_123ratio_1month", "user_roomservice_7_1ratio_1month", "user_roomservice_7_0ratio_1month","user_roomservice_4_5ratio_1month", "user_roomservice_4_4ratio_1month", "user_roomservice_4_2ratio_1month","user_roomservice_4_3ratio_1month", "user_roomservice_4_0ratio_1month" 231 | 232 | ]: 233 | all[c] = map(lambda x,y: x * y, all[c], all["user_ordnum_1month"]) 234 | 235 | #三个月的 236 | for c in ["orderbehavior_3_ratio_3month","orderbehavior_4_ratio_3month","orderbehavior_5_ratio_3month", 237 | #"user_roomservice_3_123ratio_3month", "user_roomservice_7_1ratio_3month", "user_roomservice_7_0ratio_3month","user_roomservice_4_5ratio_3month", "user_roomservice_4_4ratio_3month", "user_roomservice_4_2ratio_3month","user_roomservice_4_3ratio_3month", "user_roomservice_4_0ratio_3month" 238 | 239 | ]: 240 | all[c] = map(lambda x,y: x * y, all[c], all["user_ordnum_3month"]) 241 | 242 | 243 | all["price_star"]=all["price_deduct"]/(all["star"]-1) 244 | all["price_minarea"]=all["price_deduct"]/(all["basic_minarea"]-1) 245 | 246 | all["star_dif"]=all["user_avgstar"]-all["star"] 247 | 248 | all["price_ave_dif_rt"]=all["price_deduct"]/all["user_avgdealprice"] 249 | all["price_ave_star_dif"]=all["price_deduct"]/all["user_avgprice_star"] 250 | all["price_h_w_rt"]=all["user_avgdealpriceholiday"]/all["user_avgdealpriceworkday"] 251 | 252 | all["price_ave_dif"] = all["price_deduct"] - all["user_avgdealprice"] 253 | 254 | all["user_roomservice_4_32_rt"]=all["user_roomservice_4_3ratio"]/all["user_roomservice_4_2ratio"] 255 | all["user_roomservice_4_43_rt"]=all["user_roomservice_4_4ratio"]/all["user_roomservice_4_3ratio"] 256 | 257 | print all.shape 258 | 259 | if j==0 or j==1: 260 | train=all 261 | else: 262 | train=train.append(all) 263 | 264 | #算法测试 265 | train_y=train["orderlabel"].values 266 | del train["orderlabel"] 267 | 268 | 269 | print train.shape 270 | #lgb算法 271 | train = lgb.Dataset(train, label=train_y) 272 | params = { 273 | 'boosting_type': 'gbdt', 274 | 'objective': 'binary', 275 | 'metric': 'binary_logloss', 276 | 'min_child_weight': 1.5, 277 | 'num_leaves': 2 ** 5, 278 | 'lambda_l2': 10, 279 | 'subsample': 0.7, 280 | 'colsample_bytree': 0.7, 281 | 'colsample_bylevel': 0.7, 282 | 'learning_rate': 0.05, 283 | 'tree_method': 'exact', 284 | 'seed': 2017, 285 | 'nthread': 12, 286 | 'silent': True, 287 | } 288 | num_round = 1300 289 | model = lgb.train(params, train, num_round, 290 | ) 291 | 292 | 293 | for j in range(10): 294 | print j 295 | all=pd.read_csv("try/online_%s.csv"%j) 296 | all = all.merge(feature_test, on=["orderid", "basicroomid", "roomid"], how="left") 297 | 298 | #20170620添加特征 299 | for i in ["basic_week_ordernum_ratio", "basic_recent3_ordernum_ratio", "basic_comment_ratio", 300 | "basic_30days_ordnumratio", "basic_30days_realratio"]: 301 | all = merge_max(all, ["orderid"], i, "%s_max" % i) 302 | for i in ["room_30days_ordnumratio", "room_30days_realratio"]: 303 | all = merge_max(all, ["orderid", "basicroomid"], i, "%s_max" % i) 304 | all["user_roomservice_8_345ratio"]=all["user_roomservice_5_345ratio"] 305 | del all["user_roomservice_5_345ratio"] 306 | all["user_roomservice_8_2ratio"]=1-all["user_roomservice_8_345ratio"]-all["user_roomservice_8_1ratio"] 307 | all["user_roomservice_4_1ratio_3month"] = 1 - all["user_roomservice_4_0ratio_3month"] - all["user_roomservice_4_2ratio_3month"] - all["user_roomservice_4_3ratio_3month"] - all["user_roomservice_4_4ratio_3month"] - all["user_roomservice_4_5ratio_3month"] 308 | all["user_roomservice_4_1ratio_1month"] = 1 - all["user_roomservice_4_0ratio_1month"] - all["user_roomservice_4_2ratio_1month"] - all["user_roomservice_4_3ratio_1month"] - all["user_roomservice_4_4ratio_1month"] - all["user_roomservice_4_5ratio_1month"] 309 | all["user_roomservice_4_1ratio_1week"] = 1 - all["user_roomservice_4_0ratio_1week"] - all["user_roomservice_4_2ratio_1week"] - all["user_roomservice_4_3ratio_1week"] - all["user_roomservice_4_4ratio_1week"] - all["user_roomservice_4_5ratio_1week"] 310 | all["user_roomservice_2_0ratio"]=1-all["user_roomservice_2_1ratio"] 311 | all["user_roomservice_3_0ratio"]=1-all["user_roomservice_3_123ratio"] 312 | all["user_roomservice_5_0ratio"]=1-all["user_roomservice_5_1ratio"] 313 | all["user_roomservice_7_1ratio"]=1-all["user_roomservice_7_0ratio"] 314 | all["user_roomservice_2_max"] = np.argmax(all[["user_roomservice_2_%sratio" % i for i in range(2)]].values, axis=1) 315 | all["user_roomservice_3_max"] = np.argmax(all[["user_roomservice_3_%sratio" % i for i in [0,123]]].values, axis=1) 316 | all["user_roomservice_5_max"] = np.argmax(all[["user_roomservice_5_%sratio" % i for i in range(2)]].values, axis=1) 317 | all["user_roomservice_7_max"] = np.argmax(all[["user_roomservice_7_%sratio" % i for i in range(2)]].values, axis=1) 318 | all["user_roomservice_4_max"]=np.argmax(all[["user_roomservice_4_%sratio"%i for i in range(6)]].values,axis=1) 319 | all["user_roomservice_6_max"]=np.argmax(all[["user_roomservice_6_%sratio"%i for i in range(3)]].values,axis=1) 320 | all["user_roomservice_8_max"]=np.argmax(all[["user_roomservice_8_%sratio"%i for i in [1,2,345]]].values,axis=1) 321 | all["user_roomservice_4_max_1week"]=np.argmax(all[["user_roomservice_4_%sratio_1month"%i for i in range(6)]].values,axis=1) 322 | all["user_roomservice_4_max_1month"]=np.argmax(all[["user_roomservice_4_%sratio_1month"%i for i in range(6)]].values,axis=1) 323 | all["user_roomservice_4_max_3month"]=np.argmax(all[["user_roomservice_4_%sratio_3month"%i for i in range(6)]].values,axis=1) 324 | all["roomservice_8"]=all["roomservice_8"].apply(lambda x:2 if x>2 else x-1) 325 | all["roomservice_3"]=all["roomservice_3"].apply(lambda x:1 if x>0 else 0) 326 | for i in range(2,9): 327 | all["service_equal_%s"%i] = map(lambda x, y: 1 if x == y else 0, all["roomservice_%s"%i], all["user_roomservice_%s_max"%i]) 328 | del all["user_roomservice_2_0ratio"] 329 | del all["user_roomservice_3_0ratio"] 330 | del all["user_roomservice_5_0ratio"] 331 | del all["user_roomservice_7_1ratio"] 332 | 333 | 334 | all["roomid_ori"] = all["roomid"] 335 | all["roomid"] = map(lambda x, y: int(str(x)[:-len(str(y))]), all["roomid"], all["rank"]) 336 | #使用prob构造特征 337 | all=merge_max(all,["orderid","basicroomid"],"prob","basic_prob_max") 338 | all=merge_max(all,["orderid","basicroomid","roomid"],"prob","room_prob_max") 339 | all=merge_sum(all,["orderid","basicroomid"],"prob","basic_prob_sum") 340 | all=merge_sum(all,["orderid","basicroomid","roomid"],"prob","room_prob_sum") 341 | all=merge_mean(all,["orderid","basicroomid"],"prob","basic_prob_mean") 342 | all=merge_mean(all,["orderid","basicroomid","roomid"],"prob","room_prob_mean") 343 | all=merge_mean(all,["rank"],"prob","rank_prob_mean") 344 | all=merge_mean(all,["orderid","rank"],"prob","order_rank_prob_mean") 345 | all = merge_max(all, ["orderid"], "prob", "orderid_prob_max") 346 | all["orderid_prob_max_rt"]=all["prob"]/all["orderid_prob_max"] 347 | all["basic_prob_max_rt"]=all["prob"]/all["basic_prob_max"] 348 | all["room_prob_max_rt"]=all["prob"]/all["room_prob_max"] 349 | all["basic_prob_mean_rt"]=all["prob"]/all["basic_prob_mean"] 350 | all["room_prob_mean_rt"]=all["prob"]/all["room_prob_mean"] 351 | all["order_rank_prob_mean_rt"]=all["prob"]/all["order_rank_prob_mean"] 352 | #根据service统计 353 | for i in range(1, 9): 354 | all = merge_mean(all, ["orderid", "roomservice_%s" % i], "prob", "roomservice_prob_mean_%s" % i) 355 | 356 | all = all.merge(basicroom_mean, on="basicroomid", how="left").fillna(0) 357 | all = all.merge(basicroom_sum, on="basicroomid", how="left").fillna(0) 358 | 359 | all=df_median(all) 360 | all=df_min(all) 361 | all=df_min_orderid(all) 362 | 363 | all["basicroomid_price_rank"] = all['price_deduct'].groupby([all['orderid'], all['basicroomid']]).rank() 364 | all["orderid_price_deduct_min_rank"] = all['orderid_price_deduct_min'].groupby(all['orderid']).rank() 365 | 366 | all = df_rank_mean(all) 367 | all = df_roomrank_mean(all) 368 | 369 | #添加新特征20170527 370 | #平均值 371 | all=merge_mean(all,["basicroomid"],"basic_week_ordernum_ratio","basic_week_ordernum_ratio_mean") 372 | all=merge_mean(all,["basicroomid"],"basic_recent3_ordernum_ratio","basic_recent3_ordernum_ratio_mean") 373 | all=merge_mean(all,["basicroomid"],"basic_comment_ratio","basic_comment_ratio_mean") 374 | all=merge_mean(all,["basicroomid"],"basic_30days_ordnumratio","basic_30days_ordnumratio_mean") 375 | all=merge_mean(all,["basicroomid"],"basic_30days_realratio","basic_30days_realratio_mean") 376 | all=merge_mean(all,["roomid"],"room_30days_ordnumratio","room_30days_ordnumratio_mean") 377 | all=merge_mean(all,["roomid"],"room_30days_realratio","room_30days_realratio_mean") 378 | 379 | all["city_num"]=all["user_ordernum"]/all["user_citynum"] 380 | all["area_price"]=all["user_avgprice"]/all["user_avgroomarea"] 381 | all["price_max_min_rt"]=all["user_maxprice"]/all["user_minprice"] 382 | all["basicroomid_price_deduct_min_minprice_rt"]=all["basicroomid_price_deduct_min"]/all["user_minprice"] 383 | 384 | all["price_dif"]=all["basicroomid_price_deduct_min"]-all["price_deduct"] 385 | all["price_dif_hotel"]=all["basicroomid_price_deduct_min"]-all["hotel_minprice_lastord"] 386 | all["price_dif_basic"]=all["basicroomid_price_deduct_min"]-all["basic_minprice_lastord"] 387 | 388 | all["price_dif_rt"]=all["basicroomid_price_deduct_min"]/all["price_deduct"] 389 | all["price_dif_hotel_rt"]=all["basicroomid_price_deduct_min"]/all["hotel_minprice_lastord"] 390 | all["price_dif_basic_rt"]=all["basicroomid_price_deduct_min"]/all["basic_minprice_lastord"] 391 | 392 | all["price_dif_hotel"]=all["orderid_price_deduct_min"]-all["price_deduct"] 393 | all["price_dif_hotel_hotel"]=all["orderid_price_deduct_min"]-all["hotel_minprice_lastord"] 394 | all["price_dif_basic_hotel"]=all["orderid_price_deduct_min"]-all["basic_minprice_lastord"] 395 | 396 | all["price_dif_hotel_rt"]=all["orderid_price_deduct_min"]/all["price_deduct"] 397 | all["price_dif_hotel_hotel_rt"]=all["orderid_price_deduct_min"]/all["hotel_minprice_lastord"] 398 | all["price_dif_basic_hotel_rt"]=all["orderid_price_deduct_min"]/all["basic_minprice_lastord"] 399 | 400 | #all["order_basic_minprice_dif"]=all["basicroomid_price_deduct_min"]-all["orderid_price_deduct_min"] 401 | all["order_basic_minprice_rt"]=all["basicroomid_price_deduct_min"]/all["orderid_price_deduct_min"] 402 | #all["hotel_basic_minprice_lastord_rt"]=all["basic_minprice_lastord"]/all["hotel_minprice_lastord"] 403 | 404 | #上次订购的价格和当时最低价的比 405 | all["hotel_last_price_min_rt"]=all["price_last_lastord"]/all["hotel_minprice_lastord"] 406 | all["basic_last_price_min_rt"]=all["price_last_lastord"]/all["basic_minprice_lastord"] 407 | all["hotel_last_price_min_dif"]=all["price_last_lastord"]-all["hotel_minprice_lastord"] 408 | all["basic_last_price_min_dif"]=all["price_last_lastord"]-all["basic_minprice_lastord"] 409 | 410 | 411 | all["price_tail1"]=all["price_deduct"]%10 412 | all["price_tail1"]=map(lambda x:1 if x==4 or x==7 else 0,all["price_tail1"]) 413 | #all["price_tail2"]=all["price_deduct"]%100 414 | all["basic_equal"]=map(lambda x,y:1 if x==y else 0,all["basicroomid"],all["basicroomid_lastord"]) 415 | #del all["basicroomid_lastord"] 416 | all["room_equal"]=map(lambda x,y:1 if x==y else 0,all["roomid"],all["roomid_lastord"]) 417 | #del all["roomid_lastord"] 418 | all["hotel_equal"]=map(lambda x,y:1 if x==y else 0,all["hotelid"],all["hotelid_lastord"]) 419 | #del all["hotelid_lastord"] 420 | all["rank_equal"]=map(lambda x,y:1 if x==y else 0,all["rank"],all["rank_lastord"]) 421 | 422 | #价格高低 423 | all["price_dx"] = map(lambda x, y: x-y, all["price_deduct"], all["price_last_lastord"]) 424 | 425 | all["return_dx"] = map(lambda x, y: x-y, all["returnvalue"], all["return_lastord"]) 426 | 427 | all["price_ori"] = map(lambda x, y:x+y, all["price_deduct"], all["returnvalue"]) 428 | 429 | 430 | for i in [2,3,4,5,6,8]: 431 | all["service_equal_%s"%i] = map(lambda x, y: 1 if x == y else 0, all["roomservice_%s"%i], all["roomservice_%s_lastord"%i]) 432 | del all["roomservice_%s_lastord"%i] 433 | 434 | for i in [2,3,4,5,6]: 435 | all["roomtag_equal_%s"%i] = map(lambda x, y: 1 if x == y else 0, all["roomtag_%s"%i], all["roomtag_%s_lastord"%i]) 436 | del all["roomtag_%s_lastord"%i] 437 | 438 | for i in [1,2,3,4,5,6,7,8,9,10,11]: 439 | all["ordertype_%s_num"%i] = map(lambda x, y:x*y, all["ordertype_%s_ratio"%i], all["user_ordernum"]) 440 | del all["ordertype_%s_ratio"%i] 441 | 442 | #所有的 443 | for c in ["orderbehavior_1_ratio","orderbehavior_2_ratio","orderbehavior_6_ratio","orderbehavior_7_ratio", 444 | #"user_roomservice_4_0ratio","user_roomservice_4_1ratio","user_roomservice_4_2ratio","user_roomservice_4_3ratio","user_roomservice_4_4ratio","user_roomservice_4_5ratio","user_roomservice_3_123ratio","user_roomservice_6_2ratio","user_roomservice_6_1ratio","user_roomservice_6_0ratio","user_roomservice_5_1ratio","user_roomservice_7_0ratio","user_roomservice_2_1ratio","user_roomservice_8_1ratio","user_roomservice_5_345ratio" 445 | 446 | ]: 447 | all[c]=map(lambda x,y:x*y,all[c],all["user_ordernum"]) 448 | 449 | #一周的 450 | for c in ["orderbehavior_3_ratio_1week","orderbehavior_4_ratio_1week","orderbehavior_5_ratio_1week", 451 | #"user_roomservice_3_123ratio_1week","user_roomservice_7_1ratio_1week","user_roomservice_7_0ratio_1week","user_roomservice_4_5ratio_1week","user_roomservice_4_4ratio_1week","user_roomservice_4_2ratio_1week","user_roomservice_4_3ratio_1week","user_roomservice_4_0ratio_1week" 452 | ]: 453 | all[c] = map(lambda x,y: x * y, all[c], all["user_ordnum_1week"]) 454 | 455 | #一个月的 456 | for c in ["orderbehavior_3_ratio_1month","orderbehavior_4_ratio_1month","orderbehavior_5_ratio_1month", 457 | #"user_roomservice_3_123ratio_1month", "user_roomservice_7_1ratio_1month", "user_roomservice_7_0ratio_1month","user_roomservice_4_5ratio_1month", "user_roomservice_4_4ratio_1month", "user_roomservice_4_2ratio_1month","user_roomservice_4_3ratio_1month", "user_roomservice_4_0ratio_1month" 458 | 459 | ]: 460 | all[c] = map(lambda x,y: x * y, all[c], all["user_ordnum_1month"]) 461 | 462 | #三个月的 463 | for c in ["orderbehavior_3_ratio_3month","orderbehavior_4_ratio_3month","orderbehavior_5_ratio_3month", 464 | #"user_roomservice_3_123ratio_3month", "user_roomservice_7_1ratio_3month", "user_roomservice_7_0ratio_3month","user_roomservice_4_5ratio_3month", "user_roomservice_4_4ratio_3month", "user_roomservice_4_2ratio_3month","user_roomservice_4_3ratio_3month", "user_roomservice_4_0ratio_3month" 465 | 466 | ]: 467 | all[c] = map(lambda x,y: x * y, all[c], all["user_ordnum_3month"]) 468 | 469 | 470 | all["price_star"]=all["price_deduct"]/(all["star"]-1) 471 | all["price_minarea"]=all["price_deduct"]/(all["basic_minarea"]-1) 472 | 473 | all["star_dif"]=all["user_avgstar"]-all["star"] 474 | 475 | all["price_ave_dif_rt"]=all["price_deduct"]/all["user_avgdealprice"] 476 | all["price_ave_star_dif"]=all["price_deduct"]/all["user_avgprice_star"] 477 | all["price_h_w_rt"]=all["user_avgdealpriceholiday"]/all["user_avgdealpriceworkday"] 478 | 479 | all["price_ave_dif"] = all["price_deduct"] - all["user_avgdealprice"] 480 | 481 | all["user_roomservice_4_32_rt"]=all["user_roomservice_4_3ratio"]/all["user_roomservice_4_2ratio"] 482 | all["user_roomservice_4_43_rt"]=all["user_roomservice_4_4ratio"]/all["user_roomservice_4_3ratio"] 483 | 484 | print all.shape 485 | 486 | online = model.predict(all.values) 487 | online = pd.DataFrame(online) 488 | online.columns = ["prob"] 489 | online["orderid"] = all["orderid"].values 490 | online["basicroomid"] = all["basicroomid"].values 491 | online["predict_roomid"] = all["roomid_ori"].values 492 | online["roomid"] = all["roomid"].values 493 | 494 | if j==0: 495 | result=online 496 | else: 497 | result=result.append(online) 498 | 499 | result.to_csv("all_result_v42_13579_test_feature.csv",index=None) 500 | del result["basicroomid"] 501 | del result["roomid"] 502 | result = result.sort_values("prob") 503 | del result["prob"] 504 | result = result.drop_duplicates("orderid", keep="last") 505 | result["orderid"]=result["orderid"].apply(lambda x:"ORDER_"+str(x)) 506 | result["predict_roomid"]=result["predict_roomid"].apply(lambda x:"ROOM_"+str(x)) 507 | result.to_csv("sub_v42_13579.csv",index=None) 508 | -------------------------------------------------------------------------------- /ctrip_test_all_data_v43.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | import pandas as pd 3 | import lightgbm as lgb 4 | from com_uitl import * 5 | import numpy as np 6 | 7 | 8 | # 每个basicid价格的中位数 9 | def df_median(df): 10 | add = pd.DataFrame(df.groupby(["orderid", "basicroomid"]).price_deduct.median()).reset_index() 11 | add.columns = ["orderid", "basicroomid", "basicroomid_price_deduct_median"] 12 | df = df.merge(add, on=["orderid", "basicroomid"], how="left") 13 | return df 14 | 15 | # 每个basicid价格的最小值 16 | def df_min(df): 17 | add = pd.DataFrame(df.groupby(["orderid", "basicroomid"]).price_deduct.min()).reset_index() 18 | add.columns = ["orderid", "basicroomid", "basicroomid_price_deduct_min"] 19 | df = df.merge(add, on=["orderid", "basicroomid"], how="left") 20 | return df 21 | 22 | # 每个orderid价格的最小值 23 | def df_min_orderid(df): 24 | add = pd.DataFrame(df.groupby(["orderid"]).price_deduct.min()).reset_index() 25 | add.columns = ["orderid", "orderid_price_deduct_min"] 26 | df = df.merge(add, on=["orderid"], how="left") 27 | return df 28 | 29 | #排序特征 30 | def df_rank_mean(df): 31 | add = pd.DataFrame(df.groupby(["basicroomid"]).orderid_price_deduct_min_rank.mean()).reset_index() 32 | add.columns = ["basicroomid","orderid_price_deduct_min_rank_mean"] 33 | df = df.merge(add, on=["basicroomid"], how="left") 34 | return df 35 | 36 | def df_roomrank_mean(df): 37 | add = pd.DataFrame(df.groupby(["roomid"]).basicroomid_price_rank.mean()).reset_index() 38 | add.columns = ["roomid","basicroomid_price_rank_mean"] 39 | df = df.merge(add, on=["roomid"], how="left") 40 | return df 41 | 42 | 43 | #添加转化率特征 44 | leak=pd.read_csv("try/leak.csv") 45 | #提取basicroomid的转化率 46 | feature_df=leak[["orderid","basicroomid","orderlabel"]].copy() 47 | feature_df.sort_values("orderlabel") 48 | feature_df=feature_df.drop_duplicates(["orderid","basicroomid"],keep="last") 49 | basicroom_mean=pd.DataFrame(feature_df.groupby("basicroomid").orderlabel.mean()).reset_index() 50 | basicroom_mean.columns=["basicroomid","basicroomid_mean"] 51 | 52 | basicroom_sum=pd.DataFrame(feature_df.groupby("basicroomid").orderlabel.sum()).reset_index() 53 | basicroom_sum.columns=["basicroomid","basicroomid_sum"] 54 | 55 | del leak 56 | feature_train=pd.read_csv("all_result_v29_24680_train_feature.csv")[["prob","orderid","basicroomid","predict_roomid"]] 57 | feature_train.columns=["prob","orderid","basicroomid","roomid"] 58 | feature_test=pd.read_csv("all_result_v29_24680_test_feature.csv")[["prob","orderid","basicroomid","predict_roomid"]] 59 | feature_test.columns=["prob","orderid","basicroomid","roomid"] 60 | 61 | feature_train_2=pd.read_csv("all_result_v31_24680_train_feature.csv")[["prob","orderid","basicroomid","predict_roomid"]] 62 | feature_train_2.columns=["prob_2","orderid","basicroomid","roomid"] 63 | feature_test_2=pd.read_csv("all_result_v31_24680_test_feature.csv")[["prob","orderid","basicroomid","predict_roomid"]] 64 | feature_test_2.columns=["prob_2","orderid","basicroomid","roomid"] 65 | 66 | #basic的trick 67 | trick_1=pd.read_csv("feature_3days_train.csv").append(pd.read_csv("feature_3days_test.csv")) 68 | trick_2=trick_1.copy() 69 | trick_3=trick_1.copy() 70 | trick_1.columns=["hotelid","basicroomid","orderdate","trick_1"] 71 | trick_2.columns=["hotelid","basicroomid","orderdate","trick_2"] 72 | trick_3.columns=["hotelid","basicroomid","orderdate","trick_3"] 73 | trick_1["orderdate"]=trick_1["orderdate"]-1 74 | trick_2["orderdate"]=trick_2["orderdate"]-2 75 | trick_3["orderdate"]=trick_3["orderdate"]-3 76 | #room的trick 77 | trick_room=pd.read_csv("feature_room_30days_ordnumratio_train.csv").append(pd.read_csv("feature_room_30days_ordnumratio_test.csv")) 78 | trick_room.columns=["hotelid","roomid","orderdate","trick_room"] 79 | trick_room["orderdate"]=trick_room["orderdate"]-1 80 | #basic七天的的trick 81 | trick_basic_7d=pd.read_csv("feature_basic_7days_train.csv").append(pd.read_csv("feature_basic_7days_test.csv")) 82 | trick_basic_7d.columns=["hotelid","basicroomid","orderdate","trick_basic_7d"] 83 | trick_basic_7d["orderdate"]=trick_basic_7d["orderdate"]-1 84 | #basic30天的trick 85 | trick_basic_30d=pd.read_csv("feature_basic_30days_train.csv").append(pd.read_csv("feature_basic_30days_test.csv")) 86 | trick_basic_30d_fill=trick_basic_30d.copy() 87 | trick_basic_30d.columns=["hotelid","basicroomid","orderdate","trick_basic_30d"] 88 | trick_basic_30d["orderdate"]=trick_basic_30d["orderdate"]-1 89 | #trick_basic_realratio 90 | trick_basic_realratio=pd.read_csv("feature_basic_30days_realratio_train.csv").append(pd.read_csv("feature_basic_30days_realratio_test.csv")) 91 | trick_basic_realratio.columns=["basicroomid","orderdate","trick_basic_30days_realratio"] 92 | trick_basic_realratio["orderdate"]=trick_basic_realratio["orderdate"]-1 93 | #trick_room_realratio 94 | trick_room_realratio=pd.read_csv("feature_room_30days_realratio_train.csv").append(pd.read_csv("feature_room_30days_realratio_test.csv")) 95 | trick_room_realratio.columns=["roomid","orderdate","trick_room_30days_realratio"] 96 | trick_room_realratio["orderdate"]=trick_room_realratio["orderdate"]-1 97 | #trick_basic_comment_ratio 98 | trick_basic_comment_ratio=pd.read_csv("feature_basic_comment_ratio_train.csv").append(pd.read_csv("feature_basic_comment_ratio_test.csv")) 99 | trick_basic_comment_ratio.columns=["basicroomid","orderdate","trick_basic_comment_ratio"] 100 | trick_basic_comment_ratio["orderdate"]=trick_basic_comment_ratio["orderdate"]-1 101 | 102 | #构造特征 103 | #for j in range(10): 104 | for j in [1,3,5,7,9]: 105 | print j 106 | all=pd.read_csv("try/offline_%s.csv"%j) 107 | all=all.merge(feature_train,on=["orderid","basicroomid","roomid"],how="left") 108 | all=all.merge(feature_train_2,on=["orderid","basicroomid","roomid"],how="left") 109 | #test中的basic_30days_ordnumratio数据异常,需要重新设置 110 | del all["basic_30days_ordnumratio"] 111 | all = all.merge(trick_basic_30d_fill, on=["hotelid", "basicroomid","orderdate"], how="left") 112 | #20170620添加特征 113 | for i in ["basic_week_ordernum_ratio", "basic_recent3_ordernum_ratio", "basic_comment_ratio", 114 | "basic_30days_ordnumratio", "basic_30days_realratio"]: 115 | all = merge_max(all, ["orderid"], i, "%s_max" % i) 116 | for i in ["room_30days_ordnumratio", "room_30days_realratio"]: 117 | all = merge_max(all, ["orderid", "basicroomid"], i, "%s_max" % i) 118 | all["user_roomservice_8_345ratio"]=all["user_roomservice_5_345ratio"] 119 | del all["user_roomservice_5_345ratio"] 120 | all["user_roomservice_8_2ratio"]=1-all["user_roomservice_8_345ratio"]-all["user_roomservice_8_1ratio"] 121 | all["user_roomservice_4_1ratio_3month"] = 1 - all["user_roomservice_4_0ratio_3month"] - all["user_roomservice_4_2ratio_3month"] - all["user_roomservice_4_3ratio_3month"] - all["user_roomservice_4_4ratio_3month"] - all["user_roomservice_4_5ratio_3month"] 122 | all["user_roomservice_4_1ratio_1month"] = 1 - all["user_roomservice_4_0ratio_1month"] - all["user_roomservice_4_2ratio_1month"] - all["user_roomservice_4_3ratio_1month"] - all["user_roomservice_4_4ratio_1month"] - all["user_roomservice_4_5ratio_1month"] 123 | all["user_roomservice_4_1ratio_1week"] = 1 - all["user_roomservice_4_0ratio_1week"] - all["user_roomservice_4_2ratio_1week"] - all["user_roomservice_4_3ratio_1week"] - all["user_roomservice_4_4ratio_1week"] - all["user_roomservice_4_5ratio_1week"] 124 | all["user_roomservice_2_0ratio"]=1-all["user_roomservice_2_1ratio"] 125 | all["user_roomservice_3_0ratio"]=1-all["user_roomservice_3_123ratio"] 126 | all["user_roomservice_5_0ratio"]=1-all["user_roomservice_5_1ratio"] 127 | all["user_roomservice_7_1ratio"]=1-all["user_roomservice_7_0ratio"] 128 | all["user_roomservice_2_max"] = np.argmax(all[["user_roomservice_2_%sratio" % i for i in range(2)]].values, axis=1) 129 | all["user_roomservice_3_max"] = np.argmax(all[["user_roomservice_3_%sratio" % i for i in [0,123]]].values, axis=1) 130 | all["user_roomservice_5_max"] = np.argmax(all[["user_roomservice_5_%sratio" % i for i in range(2)]].values, axis=1) 131 | all["user_roomservice_7_max"] = np.argmax(all[["user_roomservice_7_%sratio" % i for i in range(2)]].values, axis=1) 132 | all["user_roomservice_4_max"]=np.argmax(all[["user_roomservice_4_%sratio"%i for i in range(6)]].values,axis=1) 133 | all["user_roomservice_6_max"]=np.argmax(all[["user_roomservice_6_%sratio"%i for i in range(3)]].values,axis=1) 134 | all["user_roomservice_8_max"]=np.argmax(all[["user_roomservice_8_%sratio"%i for i in [1,2,345]]].values,axis=1) 135 | all["user_roomservice_4_max_1week"]=np.argmax(all[["user_roomservice_4_%sratio_1month"%i for i in range(6)]].values,axis=1) 136 | all["user_roomservice_4_max_1month"]=np.argmax(all[["user_roomservice_4_%sratio_1month"%i for i in range(6)]].values,axis=1) 137 | all["user_roomservice_4_max_3month"]=np.argmax(all[["user_roomservice_4_%sratio_3month"%i for i in range(6)]].values,axis=1) 138 | all["roomservice_8"]=all["roomservice_8"].apply(lambda x:2 if x>2 else x-1) 139 | all["roomservice_3"]=all["roomservice_3"].apply(lambda x:1 if x>0 else 0) 140 | for i in range(2,9): 141 | all["service_equal_%s"%i] = map(lambda x, y: 1 if x == y else 0, all["roomservice_%s"%i], all["user_roomservice_%s_max"%i]) 142 | del all["user_roomservice_2_0ratio"] 143 | del all["user_roomservice_3_0ratio"] 144 | del all["user_roomservice_5_0ratio"] 145 | del all["user_roomservice_7_1ratio"] 146 | 147 | #使用穿越特征 148 | all=all.merge(trick_1,on=["hotelid","basicroomid","orderdate"],how="left") 149 | all=all.merge(trick_2,on=["hotelid","basicroomid","orderdate"],how="left") 150 | all=all.merge(trick_3,on=["hotelid","basicroomid","orderdate"],how="left") 151 | all["basic_trick_today"]=all["trick_1"]-all["basic_recent3_ordernum_ratio"] 152 | 153 | all = all.merge(trick_room, on=["hotelid", "roomid", "orderdate"], how="left") 154 | all["room_trick_today"] = all["trick_room"] - all["room_30days_ordnumratio"] 155 | 156 | all = all.merge(trick_basic_7d, on=["hotelid", "basicroomid", "orderdate"], how="left") 157 | all["basic_7d_trick_today"] = all["trick_basic_7d"] - all["basic_week_ordernum_ratio"] 158 | 159 | all = all.merge(trick_basic_30d, on=["hotelid", "basicroomid", "orderdate"], how="left") 160 | all["basic_30d_trick_today"] = all["trick_basic_30d"] - all["basic_30days_ordnumratio"] 161 | 162 | all = all.merge(trick_basic_realratio, on=["basicroomid", "orderdate"], how="left") 163 | all["basic_30days_realratio_today"] = all["trick_basic_30days_realratio"] - all["basic_30days_realratio"] 164 | all = all.merge(trick_room_realratio, on=["roomid", "orderdate"], how="left") 165 | all["room_30days_realratio_today"] = all["trick_room_30days_realratio"] - all["room_30days_realratio"] 166 | all = all.merge(trick_basic_comment_ratio, on=["basicroomid", "orderdate"], how="left") 167 | all["basic_comment_ratio_today"] = all["trick_basic_comment_ratio"] - all["basic_comment_ratio"] 168 | 169 | all = merge_max(all, ["orderid"], "basic_trick_today", "basic_trick_today_max") 170 | all = merge_max(all, ["orderid"], "room_trick_today", "room_trick_today_max") 171 | all = merge_max(all, ["orderid"], "basic_30days_realratio_today", "basic_30days_realratio_today_max") 172 | all = merge_max(all, ["orderid"], "room_30days_realratio_today", "room_30days_realratio_today_max") 173 | 174 | all["roomid_ori"] = all["roomid"] 175 | all["roomid"] = map(lambda x, y: int(str(x)[:-len(str(y))]), all["roomid"], all["rank"]) 176 | 177 | #使用prob构造特征 178 | all=merge_max(all,["orderdate","basicroomid"],"prob","orderdate_basic_prob_max") 179 | all=merge_max(all,["orderdate","basicroomid","roomid"],"prob","orderdate_room_prob_max") 180 | all=merge_sum(all,["orderdate","basicroomid"],"prob","orderdate_basic_prob_sum") 181 | all=merge_sum(all,["orderdate","basicroomid","roomid"],"prob","orderdate_room_prob_sum") 182 | all=merge_mean(all,["orderdate","basicroomid"],"prob","orderdate_basic_prob_mean") 183 | all=merge_mean(all,["orderdate","basicroomid","roomid"],"prob","orderdate_room_prob_mean") 184 | 185 | all=merge_max(all,["orderid","basicroomid"],"prob","basic_prob_max") 186 | all=merge_max(all,["orderid","basicroomid","roomid"],"prob","room_prob_max") 187 | all=merge_sum(all,["orderid","basicroomid"],"prob","basic_prob_sum") 188 | all=merge_sum(all,["orderid","basicroomid","roomid"],"prob","room_prob_sum") 189 | all=merge_mean(all,["orderid","basicroomid"],"prob","basic_prob_mean") 190 | all=merge_mean(all,["orderid","basicroomid","roomid"],"prob","room_prob_mean") 191 | all=merge_mean(all,["rank"],"prob","rank_prob_mean") 192 | all=merge_mean(all,["orderid","rank"],"prob","order_rank_prob_mean") 193 | all = merge_max(all, ["orderid"], "prob", "orderid_prob_max") 194 | all["orderid_prob_max_rt"]=all["prob"]/all["orderid_prob_max"] 195 | all["basic_prob_max_rt"]=all["prob"]/all["basic_prob_max"] 196 | all["room_prob_max_rt"]=all["prob"]/all["room_prob_max"] 197 | all["basic_prob_mean_rt"]=all["prob"]/all["basic_prob_mean"] 198 | all["room_prob_mean_rt"]=all["prob"]/all["room_prob_mean"] 199 | all["order_rank_prob_mean_rt"]=all["prob"]/all["order_rank_prob_mean"] 200 | 201 | all = all.merge(basicroom_mean, on="basicroomid", how="left").fillna(0) 202 | all = all.merge(basicroom_sum, on="basicroomid", how="left").fillna(0) 203 | 204 | all=df_median(all) 205 | all=df_min(all) 206 | all=df_min_orderid(all) 207 | 208 | all["basicroomid_price_rank"] = all['price_deduct'].groupby([all['orderid'], all['basicroomid']]).rank() 209 | all["orderid_price_deduct_min_rank"] = all['orderid_price_deduct_min'].groupby(all['orderid']).rank() 210 | 211 | all = df_rank_mean(all) 212 | all = df_roomrank_mean(all) 213 | 214 | 215 | #添加新特征20170527 216 | #平均值 217 | all=merge_mean(all,["basicroomid"],"basic_week_ordernum_ratio","basic_week_ordernum_ratio_mean") 218 | all=merge_mean(all,["basicroomid"],"basic_recent3_ordernum_ratio","basic_recent3_ordernum_ratio_mean") 219 | all=merge_mean(all,["basicroomid"],"basic_comment_ratio","basic_comment_ratio_mean") 220 | all=merge_mean(all,["basicroomid"],"basic_30days_ordnumratio","basic_30days_ordnumratio_mean") 221 | all=merge_mean(all,["basicroomid"],"basic_30days_realratio","basic_30days_realratio_mean") 222 | all=merge_mean(all,["roomid"],"room_30days_ordnumratio","room_30days_ordnumratio_mean") 223 | all=merge_mean(all,["roomid"],"room_30days_realratio","room_30days_realratio_mean") 224 | 225 | 226 | all["city_num"]=all["user_ordernum"]/all["user_citynum"] 227 | all["area_price"]=all["user_avgprice"]/all["user_avgroomarea"] 228 | all["price_max_min_rt"]=all["user_maxprice"]/all["user_minprice"] 229 | all["basicroomid_price_deduct_min_minprice_rt"]=all["basicroomid_price_deduct_min"]/all["user_minprice"] 230 | 231 | all["price_dif"]=all["basicroomid_price_deduct_min"]-all["price_deduct"] 232 | all["price_dif_hotel"]=all["basicroomid_price_deduct_min"]-all["hotel_minprice_lastord"] 233 | all["price_dif_basic"]=all["basicroomid_price_deduct_min"]-all["basic_minprice_lastord"] 234 | 235 | all["price_dif_rt"]=all["basicroomid_price_deduct_min"]/all["price_deduct"] 236 | all["price_dif_hotel_rt"]=all["basicroomid_price_deduct_min"]/all["hotel_minprice_lastord"] 237 | all["price_dif_basic_rt"]=all["basicroomid_price_deduct_min"]/all["basic_minprice_lastord"] 238 | 239 | all["price_dif_hotel"]=all["orderid_price_deduct_min"]-all["price_deduct"] 240 | all["price_dif_hotel_hotel"]=all["orderid_price_deduct_min"]-all["hotel_minprice_lastord"] 241 | all["price_dif_basic_hotel"]=all["orderid_price_deduct_min"]-all["basic_minprice_lastord"] 242 | 243 | all["price_dif_hotel_rt"]=all["orderid_price_deduct_min"]/all["price_deduct"] 244 | all["price_dif_hotel_hotel_rt"]=all["orderid_price_deduct_min"]/all["hotel_minprice_lastord"] 245 | all["price_dif_basic_hotel_rt"]=all["orderid_price_deduct_min"]/all["basic_minprice_lastord"] 246 | 247 | #all["order_basic_minprice_dif"]=all["basicroomid_price_deduct_min"]-all["orderid_price_deduct_min"] 248 | all["order_basic_minprice_rt"]=all["basicroomid_price_deduct_min"]/all["orderid_price_deduct_min"] 249 | #all["hotel_basic_minprice_lastord_rt"]=all["basic_minprice_lastord"]/all["hotel_minprice_lastord"] 250 | 251 | #上次订购的价格和当时最低价的比 252 | all["hotel_last_price_min_rt"]=all["price_last_lastord"]/all["hotel_minprice_lastord"] 253 | all["basic_last_price_min_rt"]=all["price_last_lastord"]/all["basic_minprice_lastord"] 254 | all["hotel_last_price_min_dif"]=all["price_last_lastord"]-all["hotel_minprice_lastord"] 255 | all["basic_last_price_min_dif"]=all["price_last_lastord"]-all["basic_minprice_lastord"] 256 | 257 | 258 | all["price_tail1"]=all["price_deduct"]%10 259 | all["price_tail1"]=map(lambda x:1 if x==4 or x==7 else 0,all["price_tail1"]) 260 | #all["price_tail2"]=all["price_deduct"]%100 261 | all["basic_equal"]=map(lambda x,y:1 if x==y else 0,all["basicroomid"],all["basicroomid_lastord"]) 262 | #del all["basicroomid_lastord"] 263 | all["room_equal"]=map(lambda x,y:1 if x==y else 0,all["roomid"],all["roomid_lastord"]) 264 | #del all["roomid_lastord"] 265 | all["hotel_equal"]=map(lambda x,y:1 if x==y else 0,all["hotelid"],all["hotelid_lastord"]) 266 | #del all["hotelid_lastord"] 267 | all["rank_equal"]=map(lambda x,y:1 if x==y else 0,all["rank"],all["rank_lastord"]) 268 | 269 | #价格高低 270 | all["price_dx"] = map(lambda x, y: x-y, all["price_deduct"], all["price_last_lastord"]) 271 | 272 | all["return_dx"] = map(lambda x, y: x-y, all["returnvalue"], all["return_lastord"]) 273 | 274 | all["price_ori"] = map(lambda x, y:x+y, all["price_deduct"], all["returnvalue"]) 275 | 276 | 277 | for i in [2,3,4,5,6,8]: 278 | all["service_equal_%s"%i] = map(lambda x, y: 1 if x == y else 0, all["roomservice_%s"%i], all["roomservice_%s_lastord"%i]) 279 | del all["roomservice_%s_lastord"%i] 280 | 281 | for i in [2,3,4,5,6]: 282 | all["roomtag_equal_%s"%i] = map(lambda x, y: 1 if x == y else 0, all["roomtag_%s"%i], all["roomtag_%s_lastord"%i]) 283 | del all["roomtag_%s_lastord"%i] 284 | 285 | for i in [1,2,3,4,5,6,7,8,9,10,11]: 286 | all["ordertype_%s_num"%i] = map(lambda x, y:x*y, all["ordertype_%s_ratio"%i], all["user_ordernum"]) 287 | del all["ordertype_%s_ratio"%i] 288 | 289 | #所有的 290 | for c in ["orderbehavior_1_ratio","orderbehavior_2_ratio","orderbehavior_6_ratio","orderbehavior_7_ratio", 291 | #"user_roomservice_4_0ratio","user_roomservice_4_1ratio","user_roomservice_4_2ratio","user_roomservice_4_3ratio","user_roomservice_4_4ratio","user_roomservice_4_5ratio","user_roomservice_3_123ratio","user_roomservice_6_2ratio","user_roomservice_6_1ratio","user_roomservice_6_0ratio","user_roomservice_5_1ratio","user_roomservice_7_0ratio","user_roomservice_2_1ratio","user_roomservice_8_1ratio","user_roomservice_5_345ratio" 292 | 293 | ]: 294 | all[c]=map(lambda x,y:x*y,all[c],all["user_ordernum"]) 295 | 296 | #一周的 297 | for c in ["orderbehavior_3_ratio_1week","orderbehavior_4_ratio_1week","orderbehavior_5_ratio_1week", 298 | #"user_roomservice_3_123ratio_1week","user_roomservice_7_1ratio_1week","user_roomservice_7_0ratio_1week","user_roomservice_4_5ratio_1week","user_roomservice_4_4ratio_1week","user_roomservice_4_2ratio_1week","user_roomservice_4_3ratio_1week","user_roomservice_4_0ratio_1week" 299 | ]: 300 | all[c] = map(lambda x,y: x * y, all[c], all["user_ordnum_1week"]) 301 | 302 | #一个月的 303 | for c in ["orderbehavior_3_ratio_1month","orderbehavior_4_ratio_1month","orderbehavior_5_ratio_1month", 304 | #"user_roomservice_3_123ratio_1month", "user_roomservice_7_1ratio_1month", "user_roomservice_7_0ratio_1month","user_roomservice_4_5ratio_1month", "user_roomservice_4_4ratio_1month", "user_roomservice_4_2ratio_1month","user_roomservice_4_3ratio_1month", "user_roomservice_4_0ratio_1month" 305 | 306 | ]: 307 | all[c] = map(lambda x,y: x * y, all[c], all["user_ordnum_1month"]) 308 | 309 | #三个月的 310 | for c in ["orderbehavior_3_ratio_3month","orderbehavior_4_ratio_3month","orderbehavior_5_ratio_3month", 311 | #"user_roomservice_3_123ratio_3month", "user_roomservice_7_1ratio_3month", "user_roomservice_7_0ratio_3month","user_roomservice_4_5ratio_3month", "user_roomservice_4_4ratio_3month", "user_roomservice_4_2ratio_3month","user_roomservice_4_3ratio_3month", "user_roomservice_4_0ratio_3month" 312 | 313 | ]: 314 | all[c] = map(lambda x,y: x * y, all[c], all["user_ordnum_3month"]) 315 | 316 | 317 | all["price_star"]=all["price_deduct"]/(all["star"]-1) 318 | all["price_minarea"]=all["price_deduct"]/(all["basic_minarea"]-1) 319 | 320 | all["star_dif"]=all["user_avgstar"]-all["star"] 321 | 322 | all["price_ave_dif_rt"]=all["price_deduct"]/all["user_avgdealprice"] 323 | all["price_ave_star_dif"]=all["price_deduct"]/all["user_avgprice_star"] 324 | all["price_h_w_rt"]=all["user_avgdealpriceholiday"]/all["user_avgdealpriceworkday"] 325 | 326 | all["price_ave_dif"] = all["price_deduct"] - all["user_avgdealprice"] 327 | 328 | all["user_roomservice_4_32_rt"]=all["user_roomservice_4_3ratio"]/all["user_roomservice_4_2ratio"] 329 | all["user_roomservice_4_43_rt"]=all["user_roomservice_4_4ratio"]/all["user_roomservice_4_3ratio"] 330 | 331 | print all.shape 332 | 333 | if j==0 or j==1: 334 | train=all 335 | else: 336 | train=train.append(all) 337 | 338 | #算法测试 339 | train_y=train["orderlabel"].values 340 | del train["orderlabel"] 341 | 342 | 343 | print train.shape 344 | #lgb算法 345 | train = lgb.Dataset(train, label=train_y) 346 | params = { 347 | 'boosting_type': 'gbdt', 348 | 'objective': 'binary', 349 | 'metric': 'binary_logloss', 350 | 'min_child_weight': 1.5, 351 | 'num_leaves': 2 ** 5, 352 | 'lambda_l2': 10, 353 | 'subsample': 0.7, 354 | 'colsample_bytree': 0.7, 355 | 'colsample_bylevel': 0.7, 356 | 'learning_rate': 0.05, 357 | 'tree_method': 'exact', 358 | 'seed': 2017, 359 | 'nthread': 12, 360 | 'silent': True, 361 | } 362 | num_round = 1300 363 | model = lgb.train(params, train, num_round, 364 | ) 365 | 366 | 367 | for j in range(10): 368 | print j 369 | all=pd.read_csv("try/online_%s.csv"%j) 370 | all = all.merge(feature_test, on=["orderid", "basicroomid", "roomid"], how="left") 371 | all = all.merge(feature_test_2, on=["orderid", "basicroomid", "roomid"], how="left") 372 | #test中的basic_30days_ordnumratio数据异常,需要重新设置 373 | del all["basic_30days_ordnumratio"] 374 | all = all.merge(trick_basic_30d_fill, on=["hotelid", "basicroomid","orderdate"], how="left") 375 | #20170620添加特征 376 | for i in ["basic_week_ordernum_ratio", "basic_recent3_ordernum_ratio", "basic_comment_ratio", 377 | "basic_30days_ordnumratio", "basic_30days_realratio"]: 378 | all = merge_max(all, ["orderid"], i, "%s_max" % i) 379 | for i in ["room_30days_ordnumratio", "room_30days_realratio"]: 380 | all = merge_max(all, ["orderid", "basicroomid"], i, "%s_max" % i) 381 | all["user_roomservice_8_345ratio"]=all["user_roomservice_5_345ratio"] 382 | del all["user_roomservice_5_345ratio"] 383 | all["user_roomservice_8_2ratio"]=1-all["user_roomservice_8_345ratio"]-all["user_roomservice_8_1ratio"] 384 | all["user_roomservice_4_1ratio_3month"] = 1 - all["user_roomservice_4_0ratio_3month"] - all["user_roomservice_4_2ratio_3month"] - all["user_roomservice_4_3ratio_3month"] - all["user_roomservice_4_4ratio_3month"] - all["user_roomservice_4_5ratio_3month"] 385 | all["user_roomservice_4_1ratio_1month"] = 1 - all["user_roomservice_4_0ratio_1month"] - all["user_roomservice_4_2ratio_1month"] - all["user_roomservice_4_3ratio_1month"] - all["user_roomservice_4_4ratio_1month"] - all["user_roomservice_4_5ratio_1month"] 386 | all["user_roomservice_4_1ratio_1week"] = 1 - all["user_roomservice_4_0ratio_1week"] - all["user_roomservice_4_2ratio_1week"] - all["user_roomservice_4_3ratio_1week"] - all["user_roomservice_4_4ratio_1week"] - all["user_roomservice_4_5ratio_1week"] 387 | all["user_roomservice_2_0ratio"]=1-all["user_roomservice_2_1ratio"] 388 | all["user_roomservice_3_0ratio"]=1-all["user_roomservice_3_123ratio"] 389 | all["user_roomservice_5_0ratio"]=1-all["user_roomservice_5_1ratio"] 390 | all["user_roomservice_7_1ratio"]=1-all["user_roomservice_7_0ratio"] 391 | all["user_roomservice_2_max"] = np.argmax(all[["user_roomservice_2_%sratio" % i for i in range(2)]].values, axis=1) 392 | all["user_roomservice_3_max"] = np.argmax(all[["user_roomservice_3_%sratio" % i for i in [0,123]]].values, axis=1) 393 | all["user_roomservice_5_max"] = np.argmax(all[["user_roomservice_5_%sratio" % i for i in range(2)]].values, axis=1) 394 | all["user_roomservice_7_max"] = np.argmax(all[["user_roomservice_7_%sratio" % i for i in range(2)]].values, axis=1) 395 | all["user_roomservice_4_max"]=np.argmax(all[["user_roomservice_4_%sratio"%i for i in range(6)]].values,axis=1) 396 | all["user_roomservice_6_max"]=np.argmax(all[["user_roomservice_6_%sratio"%i for i in range(3)]].values,axis=1) 397 | all["user_roomservice_8_max"]=np.argmax(all[["user_roomservice_8_%sratio"%i for i in [1,2,345]]].values,axis=1) 398 | all["user_roomservice_4_max_1week"]=np.argmax(all[["user_roomservice_4_%sratio_1month"%i for i in range(6)]].values,axis=1) 399 | all["user_roomservice_4_max_1month"]=np.argmax(all[["user_roomservice_4_%sratio_1month"%i for i in range(6)]].values,axis=1) 400 | all["user_roomservice_4_max_3month"]=np.argmax(all[["user_roomservice_4_%sratio_3month"%i for i in range(6)]].values,axis=1) 401 | all["roomservice_8"]=all["roomservice_8"].apply(lambda x:2 if x>2 else x-1) 402 | all["roomservice_3"]=all["roomservice_3"].apply(lambda x:1 if x>0 else 0) 403 | for i in range(2,9): 404 | all["service_equal_%s"%i] = map(lambda x, y: 1 if x == y else 0, all["roomservice_%s"%i], all["user_roomservice_%s_max"%i]) 405 | del all["user_roomservice_2_0ratio"] 406 | del all["user_roomservice_3_0ratio"] 407 | del all["user_roomservice_5_0ratio"] 408 | del all["user_roomservice_7_1ratio"] 409 | 410 | #使用穿越特征 411 | all=all.merge(trick_1,on=["hotelid","basicroomid","orderdate"],how="left") 412 | all=all.merge(trick_2,on=["hotelid","basicroomid","orderdate"],how="left") 413 | all=all.merge(trick_3,on=["hotelid","basicroomid","orderdate"],how="left") 414 | all["basic_trick_today"]=all["trick_1"]-all["basic_recent3_ordernum_ratio"] 415 | 416 | all = all.merge(trick_room, on=["hotelid", "roomid", "orderdate"], how="left") 417 | all["room_trick_today"] = all["trick_room"] - all["room_30days_ordnumratio"] 418 | 419 | all = all.merge(trick_basic_7d, on=["hotelid", "basicroomid", "orderdate"], how="left") 420 | all["basic_7d_trick_today"] = all["trick_basic_7d"] - all["basic_week_ordernum_ratio"] 421 | 422 | all = all.merge(trick_basic_30d, on=["hotelid", "basicroomid", "orderdate"], how="left") 423 | all["basic_30d_trick_today"] = all["trick_basic_30d"] - all["basic_30days_ordnumratio"] 424 | 425 | all = all.merge(trick_basic_realratio, on=["basicroomid", "orderdate"], how="left") 426 | all["basic_30days_realratio_today"] = all["trick_basic_30days_realratio"] - all["basic_30days_realratio"] 427 | all = all.merge(trick_room_realratio, on=["roomid", "orderdate"], how="left") 428 | all["room_30days_realratio_today"] = all["trick_room_30days_realratio"] - all["room_30days_realratio"] 429 | all = all.merge(trick_basic_comment_ratio, on=["basicroomid", "orderdate"], how="left") 430 | all["basic_comment_ratio_today"] = all["trick_basic_comment_ratio"] - all["basic_comment_ratio"] 431 | 432 | all = merge_max(all, ["orderid"], "basic_trick_today", "basic_trick_today_max") 433 | all = merge_max(all, ["orderid"], "room_trick_today", "room_trick_today_max") 434 | all = merge_max(all, ["orderid"], "basic_30days_realratio_today", "basic_30days_realratio_today_max") 435 | all = merge_max(all, ["orderid"], "room_30days_realratio_today", "room_30days_realratio_today_max") 436 | 437 | all["roomid_ori"] = all["roomid"] 438 | all["roomid"] = map(lambda x, y: int(str(x)[:-len(str(y))]), all["roomid"], all["rank"]) 439 | 440 | #使用prob构造特征 441 | all=merge_max(all,["orderdate","basicroomid"],"prob","orderdate_basic_prob_max") 442 | all=merge_max(all,["orderdate","basicroomid","roomid"],"prob","orderdate_room_prob_max") 443 | all=merge_sum(all,["orderdate","basicroomid"],"prob","orderdate_basic_prob_sum") 444 | all=merge_sum(all,["orderdate","basicroomid","roomid"],"prob","orderdate_room_prob_sum") 445 | all=merge_mean(all,["orderdate","basicroomid"],"prob","orderdate_basic_prob_mean") 446 | all=merge_mean(all,["orderdate","basicroomid","roomid"],"prob","orderdate_room_prob_mean") 447 | 448 | all=merge_max(all,["orderid","basicroomid"],"prob","basic_prob_max") 449 | all=merge_max(all,["orderid","basicroomid","roomid"],"prob","room_prob_max") 450 | all=merge_sum(all,["orderid","basicroomid"],"prob","basic_prob_sum") 451 | all=merge_sum(all,["orderid","basicroomid","roomid"],"prob","room_prob_sum") 452 | all=merge_mean(all,["orderid","basicroomid"],"prob","basic_prob_mean") 453 | all=merge_mean(all,["orderid","basicroomid","roomid"],"prob","room_prob_mean") 454 | all=merge_mean(all,["rank"],"prob","rank_prob_mean") 455 | all=merge_mean(all,["orderid","rank"],"prob","order_rank_prob_mean") 456 | all = merge_max(all, ["orderid"], "prob", "orderid_prob_max") 457 | all["orderid_prob_max_rt"]=all["prob"]/all["orderid_prob_max"] 458 | all["basic_prob_max_rt"]=all["prob"]/all["basic_prob_max"] 459 | all["room_prob_max_rt"]=all["prob"]/all["room_prob_max"] 460 | all["basic_prob_mean_rt"]=all["prob"]/all["basic_prob_mean"] 461 | all["room_prob_mean_rt"]=all["prob"]/all["room_prob_mean"] 462 | all["order_rank_prob_mean_rt"]=all["prob"]/all["order_rank_prob_mean"] 463 | 464 | all = all.merge(basicroom_mean, on="basicroomid", how="left").fillna(0) 465 | all = all.merge(basicroom_sum, on="basicroomid", how="left").fillna(0) 466 | 467 | all=df_median(all) 468 | all=df_min(all) 469 | all=df_min_orderid(all) 470 | 471 | all["basicroomid_price_rank"] = all['price_deduct'].groupby([all['orderid'], all['basicroomid']]).rank() 472 | all["orderid_price_deduct_min_rank"] = all['orderid_price_deduct_min'].groupby(all['orderid']).rank() 473 | 474 | all = df_rank_mean(all) 475 | all = df_roomrank_mean(all) 476 | 477 | #添加新特征20170527 478 | #平均值 479 | all=merge_mean(all,["basicroomid"],"basic_week_ordernum_ratio","basic_week_ordernum_ratio_mean") 480 | all=merge_mean(all,["basicroomid"],"basic_recent3_ordernum_ratio","basic_recent3_ordernum_ratio_mean") 481 | all=merge_mean(all,["basicroomid"],"basic_comment_ratio","basic_comment_ratio_mean") 482 | all=merge_mean(all,["basicroomid"],"basic_30days_ordnumratio","basic_30days_ordnumratio_mean") 483 | all=merge_mean(all,["basicroomid"],"basic_30days_realratio","basic_30days_realratio_mean") 484 | all=merge_mean(all,["roomid"],"room_30days_ordnumratio","room_30days_ordnumratio_mean") 485 | all=merge_mean(all,["roomid"],"room_30days_realratio","room_30days_realratio_mean") 486 | 487 | all["city_num"]=all["user_ordernum"]/all["user_citynum"] 488 | all["area_price"]=all["user_avgprice"]/all["user_avgroomarea"] 489 | all["price_max_min_rt"]=all["user_maxprice"]/all["user_minprice"] 490 | all["basicroomid_price_deduct_min_minprice_rt"]=all["basicroomid_price_deduct_min"]/all["user_minprice"] 491 | 492 | all["price_dif"]=all["basicroomid_price_deduct_min"]-all["price_deduct"] 493 | all["price_dif_hotel"]=all["basicroomid_price_deduct_min"]-all["hotel_minprice_lastord"] 494 | all["price_dif_basic"]=all["basicroomid_price_deduct_min"]-all["basic_minprice_lastord"] 495 | 496 | all["price_dif_rt"]=all["basicroomid_price_deduct_min"]/all["price_deduct"] 497 | all["price_dif_hotel_rt"]=all["basicroomid_price_deduct_min"]/all["hotel_minprice_lastord"] 498 | all["price_dif_basic_rt"]=all["basicroomid_price_deduct_min"]/all["basic_minprice_lastord"] 499 | 500 | all["price_dif_hotel"]=all["orderid_price_deduct_min"]-all["price_deduct"] 501 | all["price_dif_hotel_hotel"]=all["orderid_price_deduct_min"]-all["hotel_minprice_lastord"] 502 | all["price_dif_basic_hotel"]=all["orderid_price_deduct_min"]-all["basic_minprice_lastord"] 503 | 504 | all["price_dif_hotel_rt"]=all["orderid_price_deduct_min"]/all["price_deduct"] 505 | all["price_dif_hotel_hotel_rt"]=all["orderid_price_deduct_min"]/all["hotel_minprice_lastord"] 506 | all["price_dif_basic_hotel_rt"]=all["orderid_price_deduct_min"]/all["basic_minprice_lastord"] 507 | 508 | #all["order_basic_minprice_dif"]=all["basicroomid_price_deduct_min"]-all["orderid_price_deduct_min"] 509 | all["order_basic_minprice_rt"]=all["basicroomid_price_deduct_min"]/all["orderid_price_deduct_min"] 510 | #all["hotel_basic_minprice_lastord_rt"]=all["basic_minprice_lastord"]/all["hotel_minprice_lastord"] 511 | 512 | #上次订购的价格和当时最低价的比 513 | all["hotel_last_price_min_rt"]=all["price_last_lastord"]/all["hotel_minprice_lastord"] 514 | all["basic_last_price_min_rt"]=all["price_last_lastord"]/all["basic_minprice_lastord"] 515 | all["hotel_last_price_min_dif"]=all["price_last_lastord"]-all["hotel_minprice_lastord"] 516 | all["basic_last_price_min_dif"]=all["price_last_lastord"]-all["basic_minprice_lastord"] 517 | 518 | 519 | all["price_tail1"]=all["price_deduct"]%10 520 | all["price_tail1"]=map(lambda x:1 if x==4 or x==7 else 0,all["price_tail1"]) 521 | #all["price_tail2"]=all["price_deduct"]%100 522 | all["basic_equal"]=map(lambda x,y:1 if x==y else 0,all["basicroomid"],all["basicroomid_lastord"]) 523 | #del all["basicroomid_lastord"] 524 | all["room_equal"]=map(lambda x,y:1 if x==y else 0,all["roomid"],all["roomid_lastord"]) 525 | #del all["roomid_lastord"] 526 | all["hotel_equal"]=map(lambda x,y:1 if x==y else 0,all["hotelid"],all["hotelid_lastord"]) 527 | #del all["hotelid_lastord"] 528 | all["rank_equal"]=map(lambda x,y:1 if x==y else 0,all["rank"],all["rank_lastord"]) 529 | 530 | #价格高低 531 | all["price_dx"] = map(lambda x, y: x-y, all["price_deduct"], all["price_last_lastord"]) 532 | 533 | all["return_dx"] = map(lambda x, y: x-y, all["returnvalue"], all["return_lastord"]) 534 | 535 | all["price_ori"] = map(lambda x, y:x+y, all["price_deduct"], all["returnvalue"]) 536 | 537 | 538 | for i in [2,3,4,5,6,8]: 539 | all["service_equal_%s"%i] = map(lambda x, y: 1 if x == y else 0, all["roomservice_%s"%i], all["roomservice_%s_lastord"%i]) 540 | del all["roomservice_%s_lastord"%i] 541 | 542 | for i in [2,3,4,5,6]: 543 | all["roomtag_equal_%s"%i] = map(lambda x, y: 1 if x == y else 0, all["roomtag_%s"%i], all["roomtag_%s_lastord"%i]) 544 | del all["roomtag_%s_lastord"%i] 545 | 546 | for i in [1,2,3,4,5,6,7,8,9,10,11]: 547 | all["ordertype_%s_num"%i] = map(lambda x, y:x*y, all["ordertype_%s_ratio"%i], all["user_ordernum"]) 548 | del all["ordertype_%s_ratio"%i] 549 | 550 | #所有的 551 | for c in ["orderbehavior_1_ratio","orderbehavior_2_ratio","orderbehavior_6_ratio","orderbehavior_7_ratio", 552 | #"user_roomservice_4_0ratio","user_roomservice_4_1ratio","user_roomservice_4_2ratio","user_roomservice_4_3ratio","user_roomservice_4_4ratio","user_roomservice_4_5ratio","user_roomservice_3_123ratio","user_roomservice_6_2ratio","user_roomservice_6_1ratio","user_roomservice_6_0ratio","user_roomservice_5_1ratio","user_roomservice_7_0ratio","user_roomservice_2_1ratio","user_roomservice_8_1ratio","user_roomservice_5_345ratio" 553 | 554 | ]: 555 | all[c]=map(lambda x,y:x*y,all[c],all["user_ordernum"]) 556 | 557 | #一周的 558 | for c in ["orderbehavior_3_ratio_1week","orderbehavior_4_ratio_1week","orderbehavior_5_ratio_1week", 559 | #"user_roomservice_3_123ratio_1week","user_roomservice_7_1ratio_1week","user_roomservice_7_0ratio_1week","user_roomservice_4_5ratio_1week","user_roomservice_4_4ratio_1week","user_roomservice_4_2ratio_1week","user_roomservice_4_3ratio_1week","user_roomservice_4_0ratio_1week" 560 | ]: 561 | all[c] = map(lambda x,y: x * y, all[c], all["user_ordnum_1week"]) 562 | 563 | #一个月的 564 | for c in ["orderbehavior_3_ratio_1month","orderbehavior_4_ratio_1month","orderbehavior_5_ratio_1month", 565 | #"user_roomservice_3_123ratio_1month", "user_roomservice_7_1ratio_1month", "user_roomservice_7_0ratio_1month","user_roomservice_4_5ratio_1month", "user_roomservice_4_4ratio_1month", "user_roomservice_4_2ratio_1month","user_roomservice_4_3ratio_1month", "user_roomservice_4_0ratio_1month" 566 | 567 | ]: 568 | all[c] = map(lambda x,y: x * y, all[c], all["user_ordnum_1month"]) 569 | 570 | #三个月的 571 | for c in ["orderbehavior_3_ratio_3month","orderbehavior_4_ratio_3month","orderbehavior_5_ratio_3month", 572 | #"user_roomservice_3_123ratio_3month", "user_roomservice_7_1ratio_3month", "user_roomservice_7_0ratio_3month","user_roomservice_4_5ratio_3month", "user_roomservice_4_4ratio_3month", "user_roomservice_4_2ratio_3month","user_roomservice_4_3ratio_3month", "user_roomservice_4_0ratio_3month" 573 | 574 | ]: 575 | all[c] = map(lambda x,y: x * y, all[c], all["user_ordnum_3month"]) 576 | 577 | 578 | all["price_star"]=all["price_deduct"]/(all["star"]-1) 579 | all["price_minarea"]=all["price_deduct"]/(all["basic_minarea"]-1) 580 | 581 | all["star_dif"]=all["user_avgstar"]-all["star"] 582 | 583 | all["price_ave_dif_rt"]=all["price_deduct"]/all["user_avgdealprice"] 584 | all["price_ave_star_dif"]=all["price_deduct"]/all["user_avgprice_star"] 585 | all["price_h_w_rt"]=all["user_avgdealpriceholiday"]/all["user_avgdealpriceworkday"] 586 | 587 | all["price_ave_dif"] = all["price_deduct"] - all["user_avgdealprice"] 588 | 589 | all["user_roomservice_4_32_rt"]=all["user_roomservice_4_3ratio"]/all["user_roomservice_4_2ratio"] 590 | all["user_roomservice_4_43_rt"]=all["user_roomservice_4_4ratio"]/all["user_roomservice_4_3ratio"] 591 | 592 | print all.shape 593 | 594 | online = model.predict(all.values) 595 | online = pd.DataFrame(online) 596 | online.columns = ["prob"] 597 | online["orderid"] = all["orderid"].values 598 | online["basicroomid"] = all["basicroomid"].values 599 | online["predict_roomid"] = all["roomid_ori"].values 600 | online["roomid"] = all["roomid"].values 601 | 602 | if j==0: 603 | result=online 604 | else: 605 | result=result.append(online) 606 | 607 | result.to_csv("all_result_v43_13579_test_feature.csv",index=None) 608 | del result["basicroomid"] 609 | del result["roomid"] 610 | result = result.sort_values("prob") 611 | del result["prob"] 612 | result = result.drop_duplicates("orderid", keep="last") 613 | result["orderid"]=result["orderid"].apply(lambda x:"ORDER_"+str(x)) 614 | result["predict_roomid"]=result["predict_roomid"].apply(lambda x:"ROOM_"+str(x)) 615 | result.to_csv("sub_v43_13579.csv",index=None) 616 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | 存放原始数据 2 | -------------------------------------------------------------------------------- /stage_1_数据预处理.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import re 3 | 4 | def remove(x): 5 | try: 6 | return re.search("\d+", x).group() 7 | except: 8 | return 0 9 | 10 | 11 | #数据切割--将train和test分别切割成10份 12 | all = pd.read_table("data/competition_train.txt") 13 | for i in range(10): 14 | all[(800000*i):(800000*i+800000)].to_csv("data/train_%s.csv"%i,index=None) 15 | all = pd.read_table("data/competition_test.txt") 16 | for i in range(10): 17 | all[(800000*i):(800000*i+800000)].to_csv("data/test_%s.csv"%i,index=None) 18 | 19 | #数据处理--去除字母等文字 20 | for i in range(10): 21 | all=pd.read_csv("data/train_%s.csv"%i) 22 | dt1=pd.to_datetime(all["orderdate"]) 23 | dt2=pd.to_datetime(all["orderdate_lastord"]) 24 | all["orderdate"]=dt1.dt.dayofyear 25 | all["orderdate_lastord"]=dt2.dt.dayofyear 26 | for c in ["orderid","uid","hotelid","basicroomid","roomid", 27 | "orderid_lastord","hotelid_lastord","roomid_lastord","basicroomid_lastord", 28 | ]: 29 | all[c] = all[c].apply(remove) 30 | all.to_csv("try/offline_%s.csv"%i,index=None) 31 | for i in range(10): 32 | all=pd.read_csv("data/test_%s.csv"%i) 33 | dt1=pd.to_datetime(all["orderdate"]) 34 | dt2=pd.to_datetime(all["orderdate_lastord"]) 35 | all["orderdate"]=dt1.dt.dayofyear 36 | all["orderdate_lastord"]=dt2.dt.dayofyear 37 | for c in ["orderid","uid","hotelid","basicroomid","roomid", 38 | "orderid_lastord","hotelid_lastord","roomid_lastord","basicroomid_lastord", 39 | ]: 40 | all[c] = all[c].apply(remove) 41 | all.to_csv("try/online_%s.csv"%i,index=None) 42 | 43 | #构造数据--构造每天的数据 44 | #==============================================basic_comment_ratio======================================== 45 | for i in range(10): 46 | if i==0: 47 | data=pd.read_csv("data/train_%s.csv"%i)[["basicroomid","orderdate","basic_comment_ratio"]] 48 | else: 49 | data=data.append(pd.read_csv("data/train_%s.csv"%i)[["basicroomid","orderdate","basic_comment_ratio"]]) 50 | data=data.sort_values("basic_comment_ratio") 51 | data=data.drop_duplicates(["basicroomid","orderdate"],keep="first") 52 | print data.shape 53 | data["orderdate"]=pd.to_datetime(data["orderdate"]).dt.dayofyear 54 | for c in ["basicroomid" 55 | ]: 56 | data[c] = data[c].apply(remove) 57 | data.to_csv("feature_basic_comment_ratio_train.csv",index=None) 58 | 59 | for i in range(10): 60 | if i==0: 61 | data=pd.read_csv("data/test_%s.csv"%i)[["basicroomid","orderdate","basic_comment_ratio"]] 62 | else: 63 | data=data.append(pd.read_csv("data/test_%s.csv"%i)[["basicroomid","orderdate","basic_comment_ratio"]]) 64 | data=data.sort_values("basic_comment_ratio") 65 | data=data.drop_duplicates(["basicroomid","orderdate"],keep="first") 66 | print data.shape 67 | data["orderdate"]=pd.to_datetime(data["orderdate"]).dt.dayofyear 68 | for c in ["basicroomid" 69 | ]: 70 | data[c] = data[c].apply(remove) 71 | data.to_csv("feature_basic_comment_ratio_test.csv",index=None) 72 | 73 | #==============================================basic_30days_realratio======================================== 74 | for i in range(10): 75 | if i==0: 76 | data=pd.read_csv("data/train_%s.csv"%i)[["basicroomid","orderdate","basic_30days_realratio"]] 77 | else: 78 | data=data.append(pd.read_csv("data/train_%s.csv"%i)[["basicroomid","orderdate","basic_30days_realratio"]]) 79 | 80 | data=data.sort_values("basic_30days_realratio") 81 | data=data.drop_duplicates(["basicroomid","orderdate"],keep="first") 82 | print data.shape 83 | data["orderdate"]=pd.to_datetime(data["orderdate"]).dt.dayofyear 84 | 85 | for c in ["basicroomid" 86 | ]: 87 | data[c] = data[c].apply(remove) 88 | 89 | data.to_csv("feature_basic_30days_realratio_train.csv",index=None) 90 | 91 | for i in range(10): 92 | if i==0: 93 | data=pd.read_csv("data/test_%s.csv"%i)[["basicroomid","orderdate","basic_30days_realratio"]] 94 | else: 95 | data=data.append(pd.read_csv("data/test_%s.csv"%i)[["basicroomid","orderdate","basic_30days_realratio"]]) 96 | 97 | data=data.sort_values("basic_30days_realratio") 98 | data=data.drop_duplicates(["basicroomid","orderdate"],keep="first") 99 | print data.shape 100 | data["orderdate"]=pd.to_datetime(data["orderdate"]).dt.dayofyear 101 | 102 | for c in ["basicroomid" 103 | ]: 104 | data[c] = data[c].apply(remove) 105 | 106 | data.to_csv("feature_basic_30days_realratio_test.csv",index=None) 107 | #==============================================room_30days_realratio======================================== 108 | for i in range(10): 109 | if i==0: 110 | data=pd.read_csv("data/train_%s.csv"%i)[["roomid","orderdate","room_30days_realratio"]] 111 | else: 112 | data=data.append(pd.read_csv("data/train_%s.csv"%i)[["roomid","orderdate","room_30days_realratio"]]) 113 | 114 | data=data.sort_values("room_30days_realratio") 115 | data=data.drop_duplicates(["roomid","orderdate"],keep="first") 116 | print data.shape 117 | data["orderdate"]=pd.to_datetime(data["orderdate"]).dt.dayofyear 118 | 119 | for c in ["roomid" 120 | ]: 121 | data[c] = data[c].apply(remove) 122 | 123 | data.to_csv("feature_room_30days_realratio_train.csv",index=None) 124 | 125 | for i in range(10): 126 | if i==0: 127 | data=pd.read_csv("data/test_%s.csv"%i)[["roomid","orderdate","room_30days_realratio"]] 128 | else: 129 | data=data.append(pd.read_csv("data/test_%s.csv"%i)[["roomid","orderdate","room_30days_realratio"]]) 130 | 131 | data=data.sort_values("room_30days_realratio") 132 | data=data.drop_duplicates(["roomid","orderdate"],keep="first") 133 | print data.shape 134 | data["orderdate"]=pd.to_datetime(data["orderdate"]).dt.dayofyear 135 | 136 | for c in ["roomid" 137 | ]: 138 | data[c] = data[c].apply(remove) 139 | 140 | data.to_csv("feature_room_30days_realratio_test.csv",index=None) 141 | #==============================================basic_recent3_ordernum_ratio======================================== 142 | for i in range(10): 143 | if i==0: 144 | data=pd.read_csv("data/train_%s.csv"%i)[["hotelid","basicroomid","orderdate","basic_recent3_ordernum_ratio"]] 145 | else: 146 | data=data.append(pd.read_csv("data/train_%s.csv"%i)[["hotelid","basicroomid","orderdate","basic_recent3_ordernum_ratio"]]) 147 | 148 | data=data.drop_duplicates() 149 | print data.shape 150 | data["orderdate"]=pd.to_datetime(data["orderdate"]).dt.dayofyear 151 | 152 | for c in ["hotelid", "basicroomid" 153 | ]: 154 | data[c] = data[c].apply(remove) 155 | 156 | data.to_csv("feature_3days_train.csv",index=None) 157 | 158 | for i in range(10): 159 | if i==0: 160 | data=pd.read_csv("data/test_%s.csv"%i)[["hotelid","basicroomid","orderdate","basic_recent3_ordernum_ratio"]] 161 | else: 162 | data=data.append(pd.read_csv("data/test_%s.csv"%i)[["hotelid","basicroomid","orderdate","basic_recent3_ordernum_ratio"]]) 163 | 164 | data=data.drop_duplicates() 165 | print data.shape 166 | data["orderdate"]=pd.to_datetime(data["orderdate"]).dt.dayofyear 167 | 168 | for c in ["hotelid", "basicroomid" 169 | ]: 170 | data[c] = data[c].apply(remove) 171 | 172 | data.to_csv("feature_3days_test.csv",index=None) 173 | #==============================================basic_week_ordernum_ratio======================================== 174 | for i in range(10): 175 | if i==0: 176 | data=pd.read_csv("data/train_%s.csv"%i)[["hotelid","basicroomid","orderdate","basic_week_ordernum_ratio"]] 177 | else: 178 | data=data.append(pd.read_csv("data/train_%s.csv"%i)[["hotelid","basicroomid","orderdate","basic_week_ordernum_ratio"]]) 179 | 180 | data=data.drop_duplicates() 181 | print data.shape 182 | data["orderdate"]=pd.to_datetime(data["orderdate"]).dt.dayofyear 183 | 184 | for c in ["hotelid", "basicroomid" 185 | ]: 186 | data[c] = data[c].apply(remove) 187 | 188 | data.to_csv("feature_basic_7days_train.csv",index=None) 189 | 190 | for i in range(10): 191 | if i==0: 192 | data=pd.read_csv("data/test_%s.csv"%i)[["hotelid","basicroomid","orderdate","basic_week_ordernum_ratio"]] 193 | else: 194 | data=data.append(pd.read_csv("data/test_%s.csv"%i)[["hotelid","basicroomid","orderdate","basic_week_ordernum_ratio"]]) 195 | 196 | data=data.drop_duplicates() 197 | print data.shape 198 | data["orderdate"]=pd.to_datetime(data["orderdate"]).dt.dayofyear 199 | 200 | for c in ["hotelid", "basicroomid" 201 | ]: 202 | data[c] = data[c].apply(remove) 203 | 204 | data.to_csv("feature_basic_7days_test.csv",index=None) 205 | #==============================================basic_30days_ordnumratio======================================== 206 | for i in range(10): 207 | if i==0: 208 | data=pd.read_csv("data/train_%s.csv"%i)[["hotelid","basicroomid","orderdate","basic_30days_ordnumratio"]] 209 | else: 210 | data=data.append(pd.read_csv("data/train_%s.csv"%i)[["hotelid","basicroomid","orderdate","basic_30days_ordnumratio"]]) 211 | 212 | data=data.sort_values("basic_30days_ordnumratio") 213 | data=data.drop_duplicates(["hotelid","basicroomid","orderdate"],keep="first") 214 | print data.shape 215 | data["orderdate"]=pd.to_datetime(data["orderdate"]).dt.dayofyear 216 | 217 | for c in ["hotelid", "basicroomid" 218 | ]: 219 | data[c] = data[c].apply(remove) 220 | 221 | data.to_csv("feature_basic_30days_train.csv",index=None) 222 | 223 | for i in range(10): 224 | if i==0: 225 | data=pd.read_csv("data/test_%s.csv"%i)[["hotelid","basicroomid","orderdate","basic_30days_ordnumratio"]] 226 | else: 227 | data=data.append(pd.read_csv("data/test_%s.csv"%i)[["hotelid","basicroomid","orderdate","basic_30days_ordnumratio"]]) 228 | 229 | data=data.sort_values("basic_30days_ordnumratio") 230 | data=data.drop_duplicates(["hotelid","basicroomid","orderdate"],keep="first") 231 | print data.shape 232 | data["orderdate"]=pd.to_datetime(data["orderdate"]).dt.dayofyear 233 | 234 | for c in ["hotelid", "basicroomid" 235 | ]: 236 | data[c] = data[c].apply(remove) 237 | 238 | data.to_csv("feature_basic_30days_test.csv",index=None) 239 | #==============================================room_30days_ordnumratio======================================== 240 | for i in range(10): 241 | if i==0: 242 | data=pd.read_csv("data/train_%s.csv"%i)[["hotelid","roomid","orderdate","room_30days_ordnumratio"]] 243 | else: 244 | data=data.append(pd.read_csv("data/train_%s.csv"%i)[["hotelid","roomid","orderdate","room_30days_ordnumratio"]]) 245 | 246 | data=data.drop_duplicates() 247 | print data.shape 248 | data["orderdate"]=pd.to_datetime(data["orderdate"]).dt.dayofyear 249 | 250 | for c in ["hotelid", "roomid" 251 | ]: 252 | data[c] = data[c].apply(remove) 253 | 254 | data.to_csv("feature_room_30days_ordnumratio_train.csv",index=None) 255 | 256 | for i in range(10): 257 | if i==0: 258 | data=pd.read_csv("data/test_%s.csv"%i)[["hotelid","roomid","orderdate","room_30days_ordnumratio"]] 259 | else: 260 | data=data.append(pd.read_csv("data/test_%s.csv"%i)[["hotelid","roomid","orderdate","room_30days_ordnumratio"]]) 261 | 262 | data=data.drop_duplicates() 263 | print data.shape 264 | data["orderdate"]=pd.to_datetime(data["orderdate"]).dt.dayofyear 265 | 266 | for c in ["hotelid", "roomid" 267 | ]: 268 | data[c] = data[c].apply(remove) 269 | 270 | data.to_csv("feature_room_30days_ordnumratio_test.csv",index=None) -------------------------------------------------------------------------------- /stage_2_特征构造及预测.bat: -------------------------------------------------------------------------------- 1 | python ctrip_test_all_data_v29_lgb_feature.py 2 | python ctrip_test_all_data_v31_lgb_feature.py 3 | python ctrip_test_all_data_v42.py 4 | python ctrip_test_all_data_v43.py -------------------------------------------------------------------------------- /stage_3_融合提交.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | 3 | import pandas as pd 4 | ''' 5 | for j in range(10): 6 | print j 7 | if j==0: 8 | all=pd.read_csv("try/online_%s.csv"%j)[["orderid","orderdate"]] 9 | else: 10 | all=all.append(pd.read_csv("try/online_%s.csv"%j)[["orderid","orderdate"]]) 11 | 12 | all=all.drop_duplicates() 13 | all.to_csv("select_sub.csv",index=None) 14 | ''' 15 | select_sub=pd.read_csv("select_sub.csv") 16 | select_sub["orderid"]=select_sub["orderid"].apply(lambda x:"ORDER_"+str(x)) 17 | lastday=select_sub[select_sub.orderdate==117].copy()[["orderid"]] 18 | 19 | otherday=select_sub[select_sub.orderdate!=117].copy()[["orderid"]] 20 | 21 | #最后一天提交v42的 22 | test=pd.read_csv("sub_v42_13579.csv") 23 | lastday=lastday.merge(test,on="orderid",how="left") 24 | #其他天提交v43的 25 | test=pd.read_csv("sub_v43_13579.csv") 26 | otherday=otherday.merge(test,on="orderid",how="left") 27 | 28 | result=otherday.append(lastday) 29 | result.to_csv("result.csv",index=None) -------------------------------------------------------------------------------- /try/README.md: -------------------------------------------------------------------------------- 1 | 存放处理后的数据 2 | 3 | leak.zip为存在leak的数据,解压出来可以使用 4 | -------------------------------------------------------------------------------- /try/leak.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plantsgo/kesci_ctrip/600fd535188e0070f0ff3579ae7faaed3957ee37/try/leak.zip -------------------------------------------------------------------------------- /代码说明.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plantsgo/kesci_ctrip/600fd535188e0070f0ff3579ae7faaed3957ee37/代码说明.txt --------------------------------------------------------------------------------