├── README.md
├── code
│   ├── gen_samples.py
│   └── model.py
└── data
    └── README.md

/README.md:
--------------------------------------------------------------------------------
# Accurately locating the shop a user is in within a mall: [competition page](https://tianchi.aliyun.com/competition/introduction.htm?raceId=231620)

## Code notes
* Preliminary-round code of the 7th-place team. The approach in the final round was essentially the same as in the preliminary round. The code was reorganized from my notebooks and has not been re-run, so feedback on any issues is welcome.
* The feature names are fairly self-explanatory, so they are not documented one by one.
* The multi-class part is basically the same as 麦芽's code and scored 0.905 on the preliminary A leaderboard. A commented-out alternative parameter set is provided in the code; it was not run due to time constraints, but offline results suggest it would be roughly one percentage point better.
* The final model is a ranking model (lambdarank). Apart from requiring group information, it is used in the same way as the other models.

## Usage
* gen_samples.py: generates candidate samples with the multi-class model
* model.py: extracts features and trains the model
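Both scripts read the competition CSVs from `../data/` (`user_shop_behavior.csv`, `shop_info.csv`, `test.csv`), and gen_samples.py must be run before model.py, since model.py loads the candidate pickles it produces. For the last bullet above, setting groups is the only lambdarank-specific step; the toy sketch below (made-up data, simply mirroring what model.py later does with the real features) shows what that means in LightGBM:

```python
import numpy as np
import lightgbm as lgb

# Toy data: 2 records (query groups) with 4 and 3 candidate shops respectively.
# The label is 1 for the candidate that is the shop actually visited, 0 otherwise,
# which is the same layout model.py builds from the multi-class candidates.
X = np.random.rand(7, 5)              # 7 candidate rows, 5 features
y = np.array([1, 0, 0, 0, 0, 1, 0])   # relevance label per candidate row
group = [4, 3]                        # candidates per record, in row order

dtrain = lgb.Dataset(X, label=y)
dtrain.set_group(group)               # the only lambdarank-specific step
model = lgb.train({'objective': 'lambdarank', 'learning_rate': 0.1},
                  dtrain, num_boost_round=10)
scores = model.predict(X)             # pick the highest-scoring candidate per group
```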

--------------------------------------------------------------------------------
/code/gen_samples.py:
--------------------------------------------------------------------------------
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import pickle, os, re, operator, gc
from tqdm import tqdm
from multiprocessing import Pool
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import xgboost as xgb

behavior = pd.read_csv('../data/user_shop_behavior.csv')
behavior.loc[:, 'time_stamp'] = pd.to_datetime(behavior['time_stamp'])
shop = pd.read_csv('../data/shop_info.csv')
train = behavior.merge(shop[['shop_id', 'mall_id']], how='left', on='shop_id')
train['row_id'] = train.index
test = pd.read_csv('../data/test.csv')

# train one multi-class xgboost model per mall and keep the most probable shops
# of every record as candidate samples
train_samples = []
test_samples = []
for mall_id in tqdm(train.mall_id.unique()):
    sub_train = train[train.mall_id == mall_id]
    sub_test = test[test.mall_id == mall_id]

    # wifi_infos entries look like 'bssid|signal|connected_flag', separated by ';'
    train_set = []
    for index, row in sub_train.iterrows():
        wifi_dict = {}
        for wifi in row.wifi_infos.split(';'):
            bssid, signal, flag = wifi.split('|')
            wifi_dict[bssid] = int(signal)
        train_set.append(wifi_dict)

    test_set = []
    for index, row in sub_test.iterrows():
        wifi_dict = {}
        for wifi in row.wifi_infos.split(';'):
            bssid, signal, flag = wifi.split('|')
            wifi_dict[bssid] = int(signal)
        test_set.append(wifi_dict)

    # one column per bssid; bssids missing from a record become 0 and are turned into NaN
    v = DictVectorizer(sparse=False, sort=False)
    train_set = v.fit_transform(train_set)
    test_set = v.transform(test_set)
    train_set[train_set==0] = np.NaN
    test_set[test_set==0] = np.NaN
    sub_train = pd.concat([sub_train.reset_index(), pd.DataFrame(train_set)], axis=1)
    sub_test = pd.concat([sub_test.reset_index(), pd.DataFrame(test_set)], axis=1)

    lbl = LabelEncoder()
    lbl.fit(list(sub_train['shop_id'].values))
    sub_train['label'] = lbl.transform(list(sub_train['shop_id'].values))
    num_class = sub_train['label'].max()+1
    feature = [x for x in sub_train.columns if x not in ['user_id','label','shop_id','time_stamp','mall_id','wifi_infos','row_id']]
    params = {
        'objective': 'multi:softprob',
        'eta': 0.1,
        'max_depth': 9,
        'eval_metric': 'merror',
        'seed': 0,
        'num_class': num_class,
        'silent': 1
    }
    num_rounds = 60

    # alternative parameter set mentioned in the README (not run due to time constraints)
    # params = {
    #     'objective': 'multi:softprob',
    #     'eta': 0.05,
    #     'max_depth': 7,
    #     'eval_metric': 'merror',
    #     'colsample_bytree': 0.6,
    #     'subsample': 0.6,
    #     'colsample_bylevel': 0.6,
    #     'seed': 0,
    #     'num_class': num_class,
    #     'silent': 1
    # }
    # num_rounds = 300

    xgbtrain = xgb.DMatrix(sub_train[feature], sub_train['label'])
    xgbtest = xgb.DMatrix(sub_test[feature])
    # note: both watchlist entries are the training set, so early stopping is
    # effectively monitored on training error
    watchlist = [(xgbtrain, 'train'), (xgbtrain, 'test')]
    model = xgb.train(params, xgbtrain, num_rounds, watchlist, early_stopping_rounds=15, verbose_eval=False)

    # keep up to 10 candidates per test record, stopping early once a shop is
    # predicted with probability > 0.99
    preds = model.predict(xgbtest)
    for row, pred in enumerate(preds):
        row_id = sub_test['row_id'].iloc[row]
        predSorted = (-pred).argsort()
        for i in range(10):
            # note: newer sklearn versions expect an array here, e.g. lbl.inverse_transform([predSorted[i]])[0]
            test_samples.append({'row_id': row_id, 'shop_id': lbl.inverse_transform(predSorted[i]), 'prob': pred[predSorted[i]]})
            if pred[predSorted[i]] > 0.99:
                break

    # keep the top 10 candidates for every training record
    preds = model.predict(xgbtrain)
    for row, pred in enumerate(preds):
        row_id = sub_train['row_id'].iloc[row]
        predSorted = (-pred).argsort()
        for i in range(10):
            train_samples.append({'row_id': row_id, 'shop_id': lbl.inverse_transform(predSorted[i]), 'prob': pred[predSorted[i]]})

train_samples = pd.DataFrame(train_samples)
test_samples = pd.DataFrame(test_samples)

# DataFrame.to_pickle expects a path, not an open file handle
train_samples.to_pickle('../data/train_samples_top10.pkl')
test_samples.to_pickle('../data/test_samples_top10.pkl')
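# Optional check (added, not part of the original script): each pickle written
# above is a plain DataFrame with columns row_id, shop_id and prob, ordered from
# the most to the least likely shop within each row_id (up to 10 candidates per
# record; fewer on the test side when the >0.99 early break fires).
print(test_samples.groupby('row_id').size().describe())   # candidates kept per record
print(test_samples.head())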

--------------------------------------------------------------------------------
/code/model.py:
--------------------------------------------------------------------------------
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import pickle, os, re, operator, gc
from collections import Counter
from tqdm import tqdm
from multiprocessing import Pool
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb

train = pd.read_csv('../data/user_shop_behavior.csv')
train.loc[:, 'time_stamp'] = pd.to_datetime(train['time_stamp'])
train['hour'] = train.time_stamp.dt.hour
train['day'] = train.time_stamp.dt.day
train['weekday'] = train.time_stamp.dt.weekday

shop = pd.read_csv('../data/shop_info.csv')
le = LabelEncoder()
shop['category_id'] = le.fit_transform(shop['category_id'])
train['row_id'] = train.index

test = pd.read_csv('../data/test.csv')
test.loc[:, 'time_stamp'] = pd.to_datetime(test['time_stamp'])
test['hour'] = test.time_stamp.dt.hour
test['day'] = test.time_stamp.dt.day
test['weekday'] = test.time_stamp.dt.weekday

num_partitions = 15  # number of partitions to split the dataframe into
num_cores = 15       # number of cores on your machine

# parallel apply over a dataframe (may raise errors on Windows)
def parallelize_dataframe(df, func):
    df_split = np.array_split(df, num_partitions)
    pool = Pool(num_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

def best_wifi_parallelize(data):
    data[['best_wifi','best_wifi_strength']] = data.apply(get_best_wifi, axis=1)
    return data

# strongest wifi seen in a record and its signal strength
def get_best_wifi(row):
    best_wifi_strength = -112
    for wifi in row.wifi_infos.split(';'):
        bssid, signal, flag = wifi.split('|')
        if int(signal) > best_wifi_strength:
            best_wifi = bssid
            best_wifi_strength = int(signal)
    return pd.Series([best_wifi, best_wifi_strength])

train = parallelize_dataframe(train, best_wifi_parallelize)
test = parallelize_dataframe(test, best_wifi_parallelize)

# features are extracted from the 21 days preceding the samples:
# days 1-21 for the offline training samples (days 22-31), days 11-31 for the online test set
train_data = train[train.time_stamp.dt.day<=21]
train_data = train_data.merge(shop, how='left', on='shop_id')
test_data = train[(train.time_stamp.dt.day>10) & (train.time_stamp.dt.day<=31)]
test_data = test_data.merge(shop, how='left', on='shop_id')
train = train[(train.time_stamp.dt.day>21) & (train.time_stamp.dt.day<=31)]

# merge the candidate samples: 4 candidates per training record, 9 per test record;
# target is 1 when the candidate shop is the shop actually visited
train.rename(columns={'shop_id': 'shop_id_true'}, inplace=True)
samples = pickle.load(open('../data/train_samples_top10.pkl', 'rb'))
samples = samples.groupby('row_id').head(4)
train = train.merge(samples, 'left', 'row_id')
train = train.merge(shop, how='left', on='shop_id')
train['target'] = np.where(train['shop_id_true']==train['shop_id'], 1, 0)

samples = pickle.load(open('../data/test_samples_top10.pkl', 'rb'))
samples = samples.groupby('row_id').head(9)
test = test.merge(samples, 'left', 'row_id')
del test['mall_id']
test = test.merge(shop, how='left', on='shop_id')
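# Optional sanity check (added, not part of the original pipeline): the rank model
# can only be right when the true shop survives candidate generation, so the share
# of training row_ids whose 4 merged candidates contain shop_id_true is an upper
# bound on offline accuracy.
print('candidate recall:', train.groupby('row_id')['target'].max().mean())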
def haversine_array(lat1, lng1, lat2, lng2):
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
    AVG_EARTH_RADIUS = 6371000  # in m
    lat = lat2 - lat1
    lng = lng2 - lng1
    d = np.sin(lat * 0.5) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(lng * 0.5) ** 2
    h = 2 * AVG_EARTH_RADIUS * np.arcsin(np.sqrt(d))
    return h

# build shop/user/wifi statistics from the feature-window data and merge them
# onto the candidate rows in df
def gen_feature(data, df):
    # shop
    shop_count = data.groupby(['shop_id']).size().reset_index().rename(columns={0:'shop_count'})
    shop_weekday_count = data.groupby(['shop_id','weekday']).size().reset_index().rename(columns={0:'shop_weekday_count'})
    shop_day_count = data.groupby(['shop_id','day']).size().reset_index()
    shop_day_count_var = shop_day_count.groupby(['shop_id'])[0].var().reset_index().rename(columns={0:'shop_day_count_var'})
    shop_hour_count = data.groupby(['shop_id','hour']).size().reset_index().rename(columns={0:'shop_hour_count'})

    # GPS
    data['la_dist'] = np.abs(data['latitude_x'] - data['latitude_y'])
    data['lo_dist'] = np.abs(data['longitude_x'] - data['longitude_y'])
    gps_hist = data[['shop_id', 'longitude_x', 'latitude_x', 'lo_dist', 'la_dist']].groupby('shop_id').mean().reset_index()
    gps_hist.columns = ['shop_id', 'hist_lo', 'hist_la', 'hist_lo_dist', 'hist_la_dist']

    # category
    cate_count = data.groupby(['category_id']).size().reset_index().rename(columns={0:'cate_count'})
    cate_hour_count = data.groupby(['category_id','hour']).size().reset_index().rename(columns={0:'cate_hour_count'})

    # wifi
    data['shop_wifi_count'] = data.wifi_infos.str.count(';')+1
    shop_wifi_count_mean = data[['shop_id','shop_wifi_count']].groupby('shop_id').mean().reset_index().rename(columns={'shop_wifi_count':'shop_wifi_count_mean'})
    shop_wifi_count_sum = data[['shop_id','shop_wifi_count']].groupby('shop_id').sum().reset_index().rename(columns={'shop_wifi_count':'shop_wifi_count_sum'})
    best_wifi_count = data.groupby(['shop_id','best_wifi']).size().reset_index().rename(columns={0:'best_wifi_count'})

    # user
    user_ave_price = data[['user_id','price']].groupby('user_id').mean().reset_index().rename(columns={'price':'user_ave_price'})
    user_count = data.groupby('user_id').size().reset_index().rename(columns={0:'user_count'})
    user_shop_count = data.groupby(['user_id','shop_id']).size().reset_index().rename(columns={0:'user_shop_count'})

    # most recent record of each shop
    last = data.sort_values(['shop_id','time_stamp'],ascending=False).drop_duplicates('shop_id')
    last = last[['shop_id', 'longitude_x', 'latitude_x', 'wifi_infos']]
    last.columns = ['shop_id', 'last_longitude', 'last_latitude', 'last_wifi_infos']

    # merge features
    df = df.merge(shop_count, 'left', 'shop_id')
    df = df.merge(shop_day_count_var, 'left', 'shop_id')
    df = df.merge(shop_weekday_count, 'left', ['shop_id','weekday'])
    df = df.merge(shop_hour_count, 'left', ['shop_id','hour'])

    df = df.merge(gps_hist, 'left', 'shop_id')
    df['real_lo_dist'] = np.abs(df.longitude_y-df.longitude_x)
    df['real_la_dist'] = np.abs(df.latitude_y-df.latitude_x)
    df.loc[:, 'real_haversine_dist'] = haversine_array(df['latitude_x'].values, df['longitude_x'].values, df['latitude_y'].values, df['longitude_y'].values)
    df.loc[:, 'hist_haversine_dist'] = haversine_array(df['latitude_x'].values, df['longitude_x'].values, df['hist_la'].values, df['hist_lo'].values)

    df = df.merge(cate_count, 'left', 'category_id')
    df = df.merge(cate_hour_count, 'left', ['category_id', 'hour'])
    df['cate_hour_rate'] = df.cate_hour_count/df.cate_count

    df['wifi_count'] = df.wifi_infos.str.count(';')+1
    df = df.merge(shop_wifi_count_mean, 'left', 'shop_id')
    df = df.merge(shop_wifi_count_sum, 'left', 'shop_id')
    df['shop_wifi_count_rate'] = df.wifi_count/df.shop_wifi_count_mean
    df = df.merge(best_wifi_count, 'left', ['shop_id', 'best_wifi'])
    df['best_wifi_rate1'] = df.best_wifi_count/df.shop_count
    df['best_wifi_rate2'] = (df.best_wifi_count+100000)/(df.shop_count+200000)

    df = df.merge(user_count, 'left', 'user_id')
    df = df.merge(user_shop_count, 'left', ['user_id','shop_id'])
    df = df.merge(user_ave_price, 'left', 'user_id')
    df['price_rate'] = df.user_ave_price/df.price

    df = df.merge(last, 'left', 'shop_id')
    df.loc[:, 'last_dist_diff'] = haversine_array(df['latitude_x'].values, df['longitude_x'].values, df['last_latitude'].values, df['last_longitude'].values)

    return df

train = gen_feature(train_data, train)
test = gen_feature(test_data, test)

# extract each shop's historical wifi statistics (cached under ../data/)
def extract_wifi_dict(data, tag):
    path = '../data/{}_wifi_dict'.format(tag)
    if os.path.exists(path):
        wifi_dict = pickle.load(open(path, 'rb'))
        return wifi_dict
    else:
        wifi_strength_mean = {}
        wifi_strength_max = {}
        wifi_sum_dict = {}
        mall_wifi_set = {}
        for shop_id in tqdm(data.shop_id.unique()):
            sub_data = data[data.shop_id == shop_id]
            wifi_strength_mean[shop_id] = {}
            wifi_strength_max[shop_id] = {}
            wifi_sum_dict[shop_id] = {}
            t = {}
            for i, row in sub_data.iterrows():
                if row.mall_id not in mall_wifi_set:
                    mall_wifi_set[row.mall_id] = set()
                for wifi in row.wifi_infos.split(';'):
                    bssid, signal, flag = wifi.split('|')
                    mall_wifi_set[row.mall_id].add(bssid)
                    if bssid not in t:
                        t[bssid] = []
                    t[bssid].append(int(signal))
            for bssid in t:
                wifi_strength_mean[shop_id][bssid] = np.mean(t[bssid])
                wifi_strength_max[shop_id][bssid] = np.max(t[bssid])
                wifi_sum_dict[shop_id][bssid] = len(t[bssid])

        wifi_dict = {}
        wifi_dict['wifi_sum_dict'] = wifi_sum_dict
        wifi_dict['wifi_strength_mean'] = wifi_strength_mean
        wifi_dict['wifi_strength_max'] = wifi_strength_max
        wifi_dict['mall_wifi_set'] = mall_wifi_set
        pickle.dump(wifi_dict, open(path, 'wb'))
        return wifi_dict
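# Illustration (added): extract_wifi_dict returns plain nested dicts, e.g.
#   wifi_dict['wifi_strength_mean'][shop_id][bssid] -> mean historical signal of that bssid at that shop
#   wifi_dict['wifi_strength_max'][shop_id][bssid]  -> strongest signal ever recorded there
#   wifi_dict['wifi_sum_dict'][shop_id][bssid]      -> number of records in which shop and bssid co-occur
#   wifi_dict['mall_wifi_set'][mall_id]             -> set of every bssid ever seen in that mall
# so a lookup such as wifi_dict['wifi_strength_mean']['s_123']['b_456'] (made-up ids)
# answers "how strong is this wifi usually when someone is in this shop".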
# wifi features computed per (record, candidate shop) pair
apply_features = ['wifi_sum','wifi_num','wifi_connect_sum',
                  'wifi_dist_2_mean','wifi_dist_2_max','last_wifi_dist',
                  'best_wifi_dist_2_mean','best_wifi_dist_2_max',
                  'wifi_large_than_mean','wifi_large_than_max',
                  'wifi_dist_median','wifi_dist_mean','wifi_dist_std',
                  'top1_diff','top2_diff','top3_diff','top4_diff','top5_diff','top6_diff','top7_diff','top8_diff',]

def wifi_info_parallelize(data):
    data[apply_features] = data.apply(wifi_info, axis=1)
    return data

def wifi_info(row):
    wifi_sum = 0; wifi_num = 0; wifi_connect_sum = 0
    wifi_dist_2_mean = 0; wifi_dist_2_max = 0; last_wifi_dist = 0
    wifi_large_than_mean = 0; wifi_large_than_max = 0

    if row.shop_id not in wifi_sum_dict:
        return pd.Series([np.NaN]*len(apply_features))

    last_wifi = {}
    for wifi in row.last_wifi_infos.split(';'):
        bssid, signal, flag = wifi.split('|')
        last_wifi[bssid] = int(signal)

    wifi_dist_list = []
    wifi_dict = {}
    for wifi in row.wifi_infos.split(';'):
        bssid, signal, flag = wifi.split('|')
        signal = int(signal)
        last_wifi_dist += np.abs(signal-last_wifi.get(bssid,-130))
        if bssid in mall_wifi_set[row.mall_id]:
            if flag == 'true':
                wifi_connect_sum = wifi_sum_dict[row.shop_id].get(bssid,0)
            if bssid in wifi_sum_dict[row.shop_id]:
                wifi_num += 1
                wifi_sum += wifi_sum_dict[row.shop_id][bssid]
                wifi_dict[bssid] = signal
                s = wifi_strength_mean[row.shop_id][bssid]
                wifi_dist_2_mean += np.abs(signal-s)
                wifi_dist_list.append(signal-s)
                if signal > s:
                    wifi_large_than_mean += 1

                s = wifi_strength_max[row.shop_id][bssid]
                wifi_dist_2_max += np.abs(signal-s)
                if signal > s:
                    wifi_large_than_max += 1
            else:
                # bssid seen in the mall but never at this shop: add a fixed penalty
                wifi_dist_2_mean += np.abs(signal+130)
                wifi_dist_2_max += np.abs(signal+130)
                wifi_dist_list.append(np.abs(signal+130))

    best_wifi_dist_2_mean = np.abs(wifi_strength_mean[row.shop_id].get(row.best_wifi, np.NaN) - row.best_wifi_strength)
    best_wifi_dist_2_max = np.abs(wifi_strength_max[row.shop_id].get(row.best_wifi, np.NaN) - row.best_wifi_strength)
    wifi_dict = sorted(wifi_dict.items(), key=lambda b: b[1], reverse=True)
    x = {}
    for i in range(8):
        if i >= len(wifi_dict):
            x['top{}_diff'.format(i+1)] = np.NaN
        else:
            bssid, signal = wifi_dict[i]
            x['top{}_diff'.format(i+1)] = signal-wifi_strength_mean[row.shop_id][bssid]

    if len(wifi_dist_list) == 0:
        wifi_dist_median = np.NaN
        wifi_dist_mean = np.NaN
        wifi_dist_std = np.NaN
    else:
        wifi_dist_median = np.median(wifi_dist_list)
        wifi_dist_mean = np.mean(wifi_dist_list)
        wifi_dist_std = np.std(wifi_dist_list)

    return_list = [wifi_sum,wifi_num,wifi_connect_sum,
                   wifi_dist_2_mean,wifi_dist_2_max,last_wifi_dist,
                   best_wifi_dist_2_mean,best_wifi_dist_2_max,
                   wifi_large_than_mean,wifi_large_than_max,
                   wifi_dist_median,wifi_dist_mean,wifi_dist_std,
                   x['top1_diff'],x['top2_diff'],x['top3_diff'],x['top4_diff'],x['top5_diff'],
                   x['top6_diff'],x['top7_diff'],x['top8_diff'],]

    return pd.Series(return_list)
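# Worked example (added, made-up numbers): suppose the candidate shop's history gives
# wifi_strength_mean = {'b_1': -55} and the current record observes {'b_1': -50, 'b_2': -80},
# with both bssids known in the mall.
#   b_1 is in the shop's history: wifi_num += 1, |-50 - (-55)| = 5 is added to
#   wifi_dist_2_mean, and since -50 > -55, wifi_large_than_mean += 1.
#   b_2 is unknown for this shop: the penalty |-80 + 130| = 50 is added instead.
# wifi_dist_2_mean is therefore 55; small values mean the observed wifi fingerprint
# matches the shop's historical fingerprint well.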
# offline (training) side
wifi_dict = extract_wifi_dict(train_data, 'train')
wifi_sum_dict = wifi_dict['wifi_sum_dict']
wifi_strength_max = wifi_dict['wifi_strength_max']
wifi_strength_mean = wifi_dict['wifi_strength_mean']
mall_wifi_set = wifi_dict['mall_wifi_set']
train = parallelize_dataframe(train, wifi_info_parallelize)
train['wifi_sum_rate'] = train['wifi_sum'] / train['shop_count']

# online (test) side: the same wifi features, computed from the day 11-31 window
wifi_dict = extract_wifi_dict(test_data, 'test')
wifi_sum_dict = wifi_dict['wifi_sum_dict']
wifi_strength_max = wifi_dict['wifi_strength_max']
wifi_strength_mean = wifi_dict['wifi_strength_mean']
mall_wifi_set = wifi_dict['mall_wifi_set']
test = parallelize_dataframe(test, wifi_info_parallelize)
test['wifi_sum_rate'] = test['wifi_sum'] / test['shop_count']

le = LabelEncoder()
train['mall_id_labeled'] = le.fit_transform(train['mall_id'])
test['mall_id_labeled'] = le.transform(test['mall_id'])


# rank model (lambdarank): all candidates of one row_id form a query group;
# candidates of the same row_id are consecutive rows, so group sizes follow row order
train_group = train.groupby('row_id', sort=False).size().reset_index()
test_group = test.groupby('row_id', sort=False).size().reset_index()
features = ['latitude_x','longitude_x','real_la_dist','real_lo_dist','hist_haversine_dist','hist_la_dist','hist_lo_dist','real_haversine_dist',
            'category_id','cate_count','cate_hour_count', 'cate_hour_rate','wifi_num','wifi_connect_sum','wifi_sum_rate',
            'wifi_dist_2_mean','wifi_sum','best_wifi_rate1','best_wifi_rate2','best_wifi_strength','shop_wifi_count_sum','best_wifi_count',
            'shop_wifi_count_mean','user_count','user_shop_count','user_ave_price','price_rate','mall_id_labeled',
            'shop_day_count_var','shop_weekday_count','best_wifi_dist_2_mean','last_dist_diff','last_wifi_dist','shop_hour_count','best_wifi_dist_2_max',
            'wifi_dist_median','wifi_dist_mean','wifi_dist_std','wifi_large_than_max','wifi_dist_2_max','wifi_large_than_mean',
            'top2_diff','top3_diff','top4_diff','top5_diff','top6_diff','top7_diff','top8_diff',
            ]
dtrain = lgb.Dataset(train[features].values, label=train['target'].values)
dtrain.set_group(train_group[0])
# dtest is not actually used below; prediction is done directly on the feature matrix
dtest = lgb.Dataset(test[features].values)
dtest.set_group(test_group[0])
params = {'learning_rate': 0.1,
          'max_depth': -1,
          'objective': 'lambdarank',
          'metric': 'binary_error',
          }
model = lgb.train(params, dtrain, num_boost_round=1500, verbose_eval=200)

# blend the rank score with the multi-class probability and keep the best
# candidate per row_id
pred = model.predict(test[features].values)
res = test.copy()
res['pred'] = pred
res['score'] = res['pred']*1.2 + res['prob']
res = res.sort_values(['row_id', 'score'], ascending=False).drop_duplicates('row_id')
res[['row_id', 'shop_id']].to_csv('../data/sub.csv', index=False)

--------------------------------------------------------------------------------
/data/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/totoruo/TianChi-Shop-Location-Competition/9a60f4923269399b34fcfbb7b8f87972594bd986/data/README.md
--------------------------------------------------------------------------------
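The rank model is easiest to sanity-check through its feature importances. The snippet below is an addition (not part of the repository); it assumes it is appended to the end of model.py, where the trained `model` booster and the `features` list are in scope:

```python
# Print the 15 features the lambdarank model uses most often for splits.
importance = model.feature_importance(importance_type='split')
for name, score in sorted(zip(features, importance), key=lambda t: -t[1])[:15]:
    print('{:<25s} {}'.format(name, score))
```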