├── session_2
│   ├── all_xgb.bat
│   ├── change_type.py
│   ├── make_featuer_.py
│   ├── simple_merage.py
│   ├── online.py
│   ├── 3_get_Feat_Aug.py
│   ├── getpara.py
│   ├── lightgbm_8.py
│   ├── get_Feat_Aug_8.py
│   ├── get_Feat_Aug_18.py
│   ├── lightgbm_18.py
│   ├── lightgbm_15.py
│   └── get_Feat_Aug_15.py
├── session_1
│   ├── test.py
│   ├── plot_net.py
│   ├── find_data.py
│   ├── opensource.py
│   └── get_feat.py
├── README.md
└── get_Feat_Aug.py

/session_2/all_xgb.bat:
--------------------------------------------------------------------------------
1 | python ./lightgbm_8.py
2 | python ./lightgbm_15.py
3 | python ./lightgbm_18.py
--------------------------------------------------------------------------------
/session_1/test.py:
--------------------------------------------------------------------------------
1 | from odps import ODPS
2 |
3 |
4 | odps = ODPS('**your-access-id**', '**your-secret-access-key**', '**your-default-project**',
5 |             endpoint='**your-end-point**')
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Code for the Tianchi Guizhou traffic competition: rank 53 in season 1, rank 13 in season 2.
2 |
3 | Task: given each road link's average travel time, aggregated per 2-minute interval over historical hours, predict each link's average travel time for every 2-minute interval of the following hour.
4 |
5 | Modeling approach:
6 |  Rule: use statistics of a link's historical travel time at the corresponding time of day as the target for the hour to be predicted.
7 |  Model: gather as many records from similar time slots as possible and use them to predict the next hour's target.
8 |  Semifinal: group the periods to be predicted into classes and build a separate model for each class.
9 |
10 | Possible improvements:
11 | 1. Data selection and cleaning
12 | 2. The upstream/downstream link topology is not used
13 |
14 |
15 | Data: https://pan.baidu.com/s/1yIPuLhA_XbSQBbYqoEWz7w?pwd=s6gp  (extraction code: s6gp)
16 | May 25, 2023
17 |
18 |
--------------------------------------------------------------------------------
/session_1/plot_net.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import matplotlib.pyplot as plt
3 | import networkx as nx
4 |
5 | linktop=pd.read_csv('./gy_contest_link_top.txt',sep=';')
6 |
7 | linklist=[]
8 |
9 | for i in range(len(linktop)):
10 |
11 |     temp=linktop['in_links'].astype('str').iloc[i]
12 |
13 |     if temp.lower()=='nan':
14 |
15 |         continue
16 |
17 |     else:
18 |
19 |         temp2=temp.split('#')
20 |
21 |         for item in temp2:
22 |
23 |             linklist.append((linktop['link_ID'].iloc[i],item))
24 |
25 | for i in range(len(linktop)):
26 |
27 |     temp=linktop['out_links'].astype('str').iloc[i]
28 |
29 |     if temp.lower()=='nan':
30 |
31 |         continue
32 |
33 |     else:
34 |
35 |         temp2=temp.split('#')
36 |
37 |         for item in temp2:
38 |
39 |             linklist.append((item,linktop['link_ID'].iloc[i]))
40 |
41 | G=nx.DiGraph()
42 |
43 | for node in set(linktop['link_ID']):
44 |
45 |     G.add_node(node)
46 |
47 | for edge in linklist:
48 |
49 |     G.add_edge(*edge)
50 |
51 | nx.draw(G,pos=nx.spring_layout(G),alpha=0.3)
52 | plt.show()
--------------------------------------------------------------------------------
/session_2/change_type.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import pandas as pd
4 | import numpy as np
5 | # my_para08 = pd.read_csv('./analy_data_08.txt')
6 | # my_para15 = pd.read_csv('./analy_data_15.txt')
7 | # my_para18 = pd.read_csv('./analy_data_18.txt')
8 | # xx = pd.read_csv('./pre_data/feature_data_1.csv',dtype='str')
9 | #
10 | # x1 = xx[(xx['time_interval_month']=='06')&(xx['time_interval_begin_hour']=='08')].reset_index()
11 | # x2 = xx[(xx['time_interval_month']=='06')&(xx['time_interval_begin_hour']=='15')].reset_index()
12 | # x3 = xx[(xx['time_interval_month']=='06')&(xx['time_interval_begin_hour']=='18')].reset_index()
13 | #
14 | # x1 = pd.concat([x1[['link_ID','time_interval_begin_hour']],my_para08],axis=1)
15 | # x2 = 
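# --- Illustrative sketch (not part of the original scripts) ---------------------------------------
# The "rule" baseline described in the README: take per-link statistics of the historical travel
# time at the same time of day and use them directly as the next hour's prediction. The helper
# below is a minimal, hypothetical version of that idea; it assumes a history frame and a
# submission template that both carry the link_ID / time_interval_minutes / travel_time columns
# used throughout this repo (cf. find_data.py in session_1).
import pandas as pd

def rule_baseline(history, template):
    # per-link, per-minute-of-hour median travel time over the historical window
    rule = (history.groupby(['link_ID', 'time_interval_minutes'])['travel_time']
                   .median().reset_index().rename(columns={'travel_time': 'pred'}))
    # attach the statistic to every row of the submission template
    return template.merge(rule, on=['link_ID', 'time_interval_minutes'], how='left')
# ---------------------------------------------------------------------------------------------------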
pd.concat([x2[['link_ID','time_interval_begin_hour']],my_para15],axis=1) 16 | # x3 = pd.concat([x3[['link_ID','time_interval_begin_hour']],my_para18],axis=1) 17 | # 18 | # print x1 19 | # print x2 20 | # print x3 21 | # del x1['Unnamed: 0'] 22 | # del x2['Unnamed: 0'] 23 | # del x3['Unnamed: 0'] 24 | # 25 | # x1.to_csv('./06.txt') 26 | # x2.to_csv('./15.txt') 27 | # x3.to_csv('./18.txt') 28 | 29 | my_para08 = pd.read_table('./quaterfinal_gy_cmp_training_traveltime.txt',sep=';') 30 | my_para08['link_ID'] = my_para08['link_ID'].apply(str) 31 | print my_para08['link_ID'].unique -------------------------------------------------------------------------------- /session_2/make_featuer_.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import pandas as pd 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | 7 | def AddBaseTimeFeature(df): 8 | 9 | df['time_interval_begin'] = pd.to_datetime(df['time_interval'].map(lambda x: x[1:20])) 10 | df = df.drop(['date', 'time_interval'], axis=1) 11 | df['time_interval_month'] = df['time_interval_begin'].map(lambda x: x.strftime('%m')) 12 | # df['time_interval_year'] = df['time_interval_begin'].map(lambda x: x.strftime('%Y')) 13 | df['time_interval_day'] = df['time_interval_begin'].map(lambda x: x.day) 14 | df['time_interval_begin_hour'] = df['time_interval_begin'].map(lambda x: x.strftime('%H')) 15 | df['time_interval_minutes'] = df['time_interval_begin'].map(lambda x: x.strftime('%M')) 16 | # Monday=1, Sunday=7 17 | df['time_interval_week'] = df['time_interval_begin'].map(lambda x: x.weekday() + 1) 18 | return df 19 | 20 | 21 | # txt => csv 22 | # link_info = pd.read_table('./gy_contest_link_info.txt',sep=';') 23 | # link_info = link_info.sort_values('link_ID') 24 | # print link_info.dtypes 25 | # link_info['link_ID'] = link_info['link_ID'].apply(str) 26 | 27 | # session_1_training_data = pd.read_table(u'./session1/[新-训练集]gy_contest_traveltime_training_data_second.txt',sep=';') 28 | # session_1_training_data.columns = ['link_ID', 'date', 'time_interval', 'travel_time'] 29 | # print session_1_training_data.dtypes 30 | # session_1_training_data['link_ID'] = session_1_training_data['link_ID'].apply(str) 31 | 32 | # session_1_training_data = session_1_training_data[(session_1_training_data['date']>='2017-03-01')&(session_1_training_data['date']<'2017-04-01')] 33 | # print session_1_training_data.shape 34 | # print session_1_training_data 35 | # training_data = pd.read_table(u'./quaterfinal_gy_cmp_training_traveltime.txt',sep=';') 36 | # training_data.columns = ['link_ID', 'date', 'time_interval', 'travel_time'] 37 | # print training_data.dtypes 38 | # training_data['link_ID'] = training_data['link_ID'].apply(str) 39 | # print training_data.head() 40 | # print training_data.shape 41 | 42 | # training_data = pd.concat([session_1_training_data,training_data],axis=0) 43 | # print training_data.head() 44 | # print training_data.shape 45 | 46 | # training_data = pd.merge(training_data,link_info,on='link_ID') 47 | 48 | # testing_data = pd.read_table(u'./semifinal_gy_cmp_testing_template_seg2.txt',sep=';',header=None) 49 | # testing_data.columns = ['link_ID', 'date', 'time_interval', 'travel_time'] 50 | # # testing_data.columns = ['link_ID', 'date', 'time_interval', 'travel_time'] 51 | # print testing_data.dtypes 52 | # testing_data['link_ID'] = testing_data['link_ID'].apply(str) 53 | # testing_data = pd.merge(testing_data,link_info,on='link_ID') 54 | # testing_data['travel_time'] = np.NaN 55 | # 
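# --- Illustrative note (not part of the original file) --------------------------------------------
# Why AddBaseTimeFeature slices time_interval with x[1:20]: the raw strings appear to be of the
# form "[2017-06-01 08:00:00,2017-06-01 08:02:00)", so characters 1-19 are the interval begin and
# characters 21 to the second-to-last are the interval end. A minimal, self-contained check:
import pandas as pd

sample = "[2017-06-01 08:00:00,2017-06-01 08:02:00)"
begin = pd.to_datetime(sample[1:20])     # Timestamp('2017-06-01 08:00:00')
end = pd.to_datetime(sample[21:-1])      # Timestamp('2017-06-01 08:02:00')
hour, minute = begin.strftime('%H'), begin.strftime('%M')   # '08', '00'
# ---------------------------------------------------------------------------------------------------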
print testing_data.head() 56 | # print testing_data.shape 57 | # print testing_data.isnull().sum() 58 | # feature_date = pd.concat([training_data,testing_data],axis=0) 59 | 60 | # feature_date = feature_date.sort_values(['link_ID','time_interval']) 61 | # print feature_date 62 | # print feature_date.dtypes 63 | # feature_date['link_ID'] = feature_date['link_ID'].apply(str) 64 | # feature_date.to_csv('./pre_data/feature_data.csv',index=False) 65 | # print feature_date.dtypes 66 | 67 | feature_data = pd.read_csv('./pre_data/feature_data.csv',dtype='str') 68 | print feature_data 69 | feature_data = feature_data[feature_data['date']>'2016-10-01'] 70 | print feature_data.dtypes 71 | feature_data['link_ID'] = feature_data['link_ID'].apply(str) 72 | print feature_data 73 | feature_data_date = AddBaseTimeFeature(feature_data) 74 | print feature_data_date 75 | feature_data_date.to_csv('./pre_data/feature_data.csv',index=False) 76 | 77 | 78 | # test 79 | # feature_data = pd.read_csv('./pre_data/feature_data.csv') 80 | # test = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour==8),: ] 81 | # test.to_csv('./pre_data/test.csv',index=False) -------------------------------------------------------------------------------- /session_2/simple_merage.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import pandas as pd 4 | 5 | 6 | # a_p = pd.read_table('./mapodoufu_2017-09-15_a_lgb.txt',sep="#") 7 | # print a_p[['link_ID','date','time_interval','travel_time']].shape 8 | # print a_p[['link_ID','date','time_interval','travel_time']].isnull().sum() 9 | 10 | # b_p = pd.read_table('./mapodoufu_2017-09-15_b_lgb.txt',sep="#") 11 | # print b_p[['link_ID','date','time_interval','travel_time']].shape 12 | # print b_p[['link_ID','date','time_interval','travel_time']].isnull().sum() 13 | 14 | # c_p = pd.read_table('./mapodoufu_2017-09-15_c_lgb.txt',sep="#") 15 | # print c_p[['link_ID','date','time_interval','travel_time']].shape 16 | # print c_p[['link_ID','date','time_interval','travel_time']].isnull().sum() 17 | 18 | # a = pd.read_table('./mapodoufu_2017-09-15_a_xgb.txt',sep="#") 19 | # print a[['link_ID','date','time_interval','travel_time']].shape 20 | # print a[['link_ID','date','time_interval','travel_time']].isnull().sum() 21 | 22 | # b = pd.read_table('./mapodoufu_2017-09-15_b_xgb.txt',sep="#") 23 | # print b[['link_ID','date','time_interval','travel_time']].shape 24 | # print b[['link_ID','date','time_interval','travel_time']].isnull().sum() 25 | 26 | # c = pd.read_table('./mapodoufu_2017-09-15_c_xgb.txt',sep="#") 27 | # print c[['link_ID','date','time_interval','travel_time']].shape 28 | # print c[['link_ID','date','time_interval','travel_time']].isnull().sum() 29 | 30 | 31 | # new_a = pd.merge(a,a_p,on=['link_ID','date','time_interval'],how='left') 32 | # new_b = pd.merge(b,b_p,on=['link_ID','date','time_interval'],how='left') 33 | # new_c = pd.merge(c,c_p,on=['link_ID','date','time_interval'],how='left') 34 | 35 | # new_a['travel_time'] = 0.5 * new_a['travel_time_x'] + 0.5 * new_a['travel_time_y'] 36 | # new_b['travel_time'] = 0.5 * new_b['travel_time_x'] + 0.5 * new_b['travel_time_y'] 37 | # new_c['travel_time'] = 0.5 * new_c['travel_time_x'] + 0.5 * new_c['travel_time_y'] 38 | 39 | # new_a[['link_ID','date','time_interval','travel_time']].to_csv('./mapoudoufu_2017_09_15_a_xgb_lgb.txt',sep='#',index=False) 40 | # print new_a[['link_ID','date','time_interval','travel_time']].shape 41 | # print 
new_a[['link_ID','date','time_interval','travel_time']].isnull().sum() 42 | # print new_a 43 | # new_b[['link_ID','date','time_interval','travel_time']].to_csv('./mapoudoufu_2017_09_15_b_xgb_lgb.txt',sep='#',index=False) 44 | # print new_b[['link_ID','date','time_interval','travel_time']].shape 45 | # print new_b[['link_ID','date','time_interval','travel_time']].isnull().sum() 46 | # print new_b 47 | # new_c[['link_ID','date','time_interval','travel_time']].to_csv('./mapoudoufu_2017_09_15_c_xgb_lgb.txt',sep='#',index=False) 48 | # print new_c[['link_ID','date','time_interval','travel_time']].shape 49 | # print new_c[['link_ID','date','time_interval','travel_time']].isnull().sum() 50 | # print new_c 51 | 52 | import numpy as np 53 | 54 | a_p = pd.read_table('./mapoudoufu_2017_09_15_a_xgb_lgb.txt',sep="#") 55 | b_p = pd.read_table('./mapoudoufu_2017_09_15_b_xgb_lgb.txt',sep="#") 56 | c_p = pd.read_table('./mapoudoufu_2017_09_15_c_xgb_lgb.txt',sep="#") 57 | 58 | a = pd.read_table('./frame08final.txt',sep=";") 59 | b = pd.read_table('./frame15final.txt',sep=";") 60 | c = pd.read_table('./frame18final.txt',sep=";") 61 | print a[['link_ID','date','time_interval','travel_time']].shape 62 | print a[['link_ID','date','time_interval','travel_time']].isnull().sum() 63 | print b[['link_ID','date','time_interval','travel_time']].shape 64 | print b[['link_ID','date','time_interval','travel_time']].isnull().sum() 65 | print c[['link_ID','date','time_interval','travel_time']].shape 66 | print c[['link_ID','date','time_interval','travel_time']].isnull().sum() 67 | 68 | new_a = pd.merge(a,a_p,on=['link_ID','date','time_interval'],how='left') 69 | new_b = pd.merge(b,b_p,on=['link_ID','date','time_interval'],how='left') 70 | new_c = pd.merge(c,c_p,on=['link_ID','date','time_interval'],how='left') 71 | 72 | new_a['travel_time'] = 0.5 * new_a['travel_time_x'] + 0.5 * new_a['travel_time_y'] 73 | new_b['travel_time'] = 0.5 * new_b['travel_time_x'] + 0.5 * new_b['travel_time_y'] 74 | new_c['travel_time'] = 0.5 * new_c['travel_time_x'] + 0.5 * new_c['travel_time_y'] 75 | 76 | new_c['travel_time'] = np.round(new_c['travel_time'],6) 77 | new_b['travel_time'] = np.round(new_b['travel_time'],6) 78 | new_a['travel_time'] = np.round(new_a['travel_time'],6) 79 | 80 | new_a[['link_ID','date','time_interval','travel_time']].to_csv('./mapoudoufu_2017_09_15_a_sub.txt',sep='#',index=False) 81 | print new_a[['link_ID','date','time_interval','travel_time']].shape 82 | print new_a[['link_ID','date','time_interval','travel_time']].isnull().sum() 83 | print new_a.head() 84 | new_b[['link_ID','date','time_interval','travel_time']].to_csv('./mapoudoufu_2017_09_15_b_sub.txt',sep='#',index=False) 85 | print new_b[['link_ID','date','time_interval','travel_time']].shape 86 | print new_b[['link_ID','date','time_interval','travel_time']].isnull().sum() 87 | print new_b.head() 88 | new_c[['link_ID','date','time_interval','travel_time']].to_csv('./mapoudoufu_2017_09_15_c_sub.txt',sep='#',index=False) 89 | print new_c[['link_ID','date','time_interval','travel_time']].shape 90 | print new_c[['link_ID','date','time_interval','travel_time']].isnull().sum() 91 | print new_c.head() -------------------------------------------------------------------------------- /session_1/find_data.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | import pandas as pd 3 | import numpy as np 4 | 5 | # 大赛提供132条link的静态信息,以及这些link之间的上下游拓扑结构。 6 | # 同时,大赛提供2016年3月至2016年5月每条link每天的旅行时间,以及2016年6月早上[6:00- 
8:00)每条link的平均旅行时间。
7 | ########################################################################################################################
8 | # link_ID       unique identifier of each road link
9 | # length        link length
10 | # width         link width
11 | # link_class    link class (road grade)
12 | # gy_contest_link_info = pd.read_table('./gy_contest_link_info.txt',sep=';')
13 | # print gy_contest_link_info
14 | ########################################################################################################################
15 | #link_ID      unique identifier of each road link
16 | #in_links     direct upstream links of this link
17 | #out_links    direct downstream links of this link
18 | # gy_contest_link_top = pd.read_table('./gy_contest_link_top.txt',sep=';')
19 | # print gy_contest_link_top['in_links'].map(lambda x:str(x).split('#'))
20 | ########################################################################################################################
21 | #link_ID        unique identifier of each road link
22 | #date           date
23 | #time_interval  time interval
24 | #travel_time    travel time
25 | ########################################################################################################################
26 | from scipy.stats import mode
27 | #
28 | def mode_function(df):
29 |     counts = mode(df)
30 |     return counts[0][0]
31 |
32 | # 1.1 Read the file in the new data format and derive the time fields
33 | gy_contest_link_traveltime_training_data = pd.read_table(u'./[新-训练集]gy_contest_traveltime_training_data_second.txt',sep=';')
34 | gy_contest_link_traveltime_training_data.columns = ['link_ID', 'date', 'time_interval', 'travel_time']
35 | sub = gy_contest_link_traveltime_training_data.sort_values(['link_ID','time_interval'])
36 | Select_data = sub
37 | # ########## convert time_interval into begin/end ##########
38 | Select_data['time_interval_begin'] = pd.to_datetime(Select_data.time_interval.map(lambda x : x[1:20]))
39 | # Select_data['time_interval_end'] = pd.to_datetime(Select_data.time_interval.map(lambda x : x[21:-1]))
40 | Select_data['time_interval_month'] = Select_data['time_interval_begin'].map(lambda x: x.strftime('%m'))
41 | Select_data['time_interval_hour'] = Select_data['time_interval_begin'].map(lambda x : x.strftime('%H'))
42 | Select_data['time_interval_minutes'] = Select_data['time_interval_begin'].map(lambda x : x.strftime('%M'))
43 | Select_data['time_interval_day'] = Select_data['time_interval_begin'].map(lambda x : x.day)
44 | print Select_data.head()
45 | # Change the '04' / '05' below to select which month's data to read here (keep the output filename consistent)
46 | Select_data = Select_data[(Select_data['time_interval_month']=='04')]
47 | print Select_data.date.unique()
48 | Select_data.to_csv('./ruler_5_feat_data.csv',index=False)
49 |
50 |
51 | # 1.2 Compute the median and mode statistics
52 | Select_data = pd.read_csv('./ruler_4_feat_data.csv')
53 | print Select_data.head()
54 | # tmp = Select_data[Select_data['time_interval_month']==5]
55 | Select_data['link_ID'] = Select_data['link_ID'].astype('str')
56 | tmp_count = Select_data.groupby(['link_ID','time_interval_minutes'])['travel_time'].agg([('mean_',np.mean),('median_',np.median),('std_',np.std),('mode_',mode_function),('max_',np.max)]).reset_index()
57 | tmp_count.to_csv('./count_4_feat.csv',index=False)
58 |
59 |
60 | # Offline evaluation of the statistics
61 | train_org_feat = pd.read_csv('./count_4_feat.csv')
62 | train_org_feat = train_org_feat.fillna(0)
63 | # count_train = train_org_feat[train_org_feat['time_interval_month']==4]
64 | count_train = train_org_feat[['link_ID','time_interval_minutes','mean_','median_','mode_','std_','max_']]
65 | count_train['mode_median_'] = 0.50 * count_train['median_'] + 0.50 * count_train['mode_']
66 | count_train.to_csv('./x.csv',index=False)
67 |
68 | # Read back the data produced in the previous step
69 | count_train = pd.read_csv('./x.csv')
70 
| count_train['link_ID'] = count_train['link_ID'].astype('str') 71 | print count_train 72 | 73 | 74 | 75 | # 这个步骤需要先使用上面 1.1 1.2 更改为5月份 目的是提取 路段 - 分钟 76 | test = pd.read_csv('./ruler_5_feat_data.csv') 77 | test['link_ID'] = test['link_ID'].astype('str') 78 | print test 79 | train = pd.merge(test,count_train,on=['link_ID','time_interval_minutes'],how='left') 80 | result=np.sum(np.abs(train['mode_median_']-train['travel_time'].values)/train['travel_time'].values)/len(train['travel_time'].values) 81 | result1=np.sum(np.abs(train['mode_']-train['travel_time'].values)/train['travel_time'].values)/len(train['travel_time'].values) 82 | result2=np.sum(np.abs(train['median_']-train['travel_time'].values)/train['travel_time'].values)/len(train['travel_time'].values) 83 | result3=np.sum(np.abs(train['mean_']-train['travel_time'].values)/train['travel_time'].values)/len(train['travel_time'].values) 84 | print result,result1,result2,result3 85 | print train 86 | 87 | 88 | 89 | # 这个部分是读取提交的数据的格式 90 | 91 | sub_demo = pd.read_table(u'[新-答案模板]gy_contest_result_template.txt',header=None,sep='#') 92 | 93 | sub_demo.columns = ['link_ID','date','time_interval','travel_time'] 94 | sub_demo = sub_demo.sort_values(['link_ID','time_interval']).reset_index() 95 | del sub_demo['index'] 96 | del sub_demo['travel_time'] 97 | 98 | sub_demo['time_interval_begin'] = pd.to_datetime(sub_demo.time_interval.map(lambda x : x[1:20])) 99 | sub_demo['time_interval_minutes'] = sub_demo['time_interval_begin'].map(lambda x : x.strftime('%M')) 100 | sub_demo['time_interval_minutes'] = sub_demo['time_interval_minutes'].astype(int) 101 | print sub_demo 102 | sub_demo['link_ID'] = sub_demo['link_ID'].astype('str') 103 | del sub_demo['time_interval_begin'] 104 | 105 | sub_demo.to_csv('./tmp_sub.csv',index=False) 106 | 107 | 108 | 109 | # 这个部分是进行拼接 拼接6月和5月的统计数据 110 | tmp_sub = pd.read_csv('./tmp_sub.csv') 111 | tmp_sub = tmp_sub.drop_duplicates(['link_ID','time_interval']) 112 | print tmp_sub[tmp_sub['link_ID']=='4377906289813600514'] 113 | tmp_count = pd.read_csv('./count_5_feat.csv') 114 | print tmp_count[tmp_count['link_ID']=='4377906289813600514'] 115 | tmp_count['link_ID'] = tmp_count['link_ID'].astype('str') 116 | tmp_count['time_interval_minutes'] = tmp_count['time_interval_minutes'].astype(int) 117 | 118 | sub_demo= pd.merge(tmp_sub,tmp_count,on=['link_ID','time_interval_minutes'],how='left') 119 | print sub_demo 120 | 121 | 122 | sub_demo['t'] = 0.50 * sub_demo['median_'] + 0.50 * sub_demo['mode_'] + 0.001 * sub_demo['max_'] - 0.001 123 | sub_demo = sub_demo[['link_ID','date','time_interval','median_','mode_','t','std_','mean_']] 124 | sub_demo[['link_ID','date','time_interval','t']].to_csv('./siyueshinidehuangyan_2017-08-11.txt',sep='#',index=False,header=False) 125 | print sub_demo[['link_ID','date','time_interval','t']].shape 126 | print sub_demo[['link_ID','date','time_interval','t']].isnull().sum() 127 | -------------------------------------------------------------------------------- /session_1/opensource.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import pandas as pd 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | def mape_object(y,d): 7 | 8 | g=1.0*np.sign(y-d)/d 9 | h=1.0/d 10 | return -g,h 11 | 12 | # 评价函数 13 | def mape(y,d): 14 | c=d.get_label() 15 | result=np.sum(np.abs(y-c)/c)/len(c) 16 | return "mape",result 17 | 18 | # 评价函数ln形式 19 | def mape_ln(y,d): 20 | c=d.get_label() 21 | 
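    # (The targets used with this metric are log1p(travel_time) -- see the np.log1p calls further
    # down -- so np.expm1 maps predictions and labels back to the original scale before computing
    # MAPE = mean(|y_pred - y| / y), which is what early stopping monitors.)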
result=np.sum(np.abs(np.expm1(y)-np.abs(np.expm1(c)))/np.abs(np.expm1(c)))/len(c) 22 | return "mape",result 23 | 24 | def AddBaseTimeFeature(df): 25 | 26 | df['time_interval_begin'] = pd.to_datetime(df['time_interval'].map(lambda x: x[1:20])) 27 | df = df.drop(['date', 'time_interval'], axis=1) 28 | df['time_interval_month'] = df['time_interval_begin'].map(lambda x: x.strftime('%m')) 29 | df['time_interval_day'] = df['time_interval_begin'].map(lambda x: x.day) 30 | df['time_interval_begin_hour'] = df['time_interval_begin'].map(lambda x: x.strftime('%H')) 31 | df['time_interval_minutes'] = df['time_interval_begin'].map(lambda x: x.strftime('%M')) 32 | # Monday=1, Sunday=7 33 | df['time_interval_week'] = df['time_interval_begin'].map(lambda x: x.weekday() + 1) 34 | return df 35 | 36 | # txt => csv 37 | link_info = pd.read_table('./gy_contest_link_info.txt',sep=';') 38 | link_info = link_info.sort_values('link_ID') 39 | training_data = pd.read_table('./gy_contest_link_traveltime_training_data.txt',sep=';') 40 | print training_data.shape 41 | training_data = pd.merge(training_data,link_info,on='link_ID') 42 | testing_data = pd.read_table('./sub_demo.txt',sep='#',header=None) 43 | testing_data.columns = ['link_ID', 'date', 'time_interval', 'travel_time'] 44 | testing_data = pd.merge(testing_data,link_info,on='link_ID') 45 | testing_data['travel_time'] = np.NaN 46 | print testing_data.shape 47 | feature_date = pd.concat([training_data,testing_data],axis=0) 48 | feature_date = feature_date.sort_values(['link_ID','time_interval']) 49 | print feature_date 50 | feature_date.to_csv('./pre_data/feature_data.csv',index=False) 51 | 52 | feature_data = pd.read_csv('./pre_data/feature_data.csv') 53 | feature_data_date = AddBaseTimeFeature(feature_data) 54 | print feature_data_date 55 | feature_data_date.to_csv('./pre_data/feature_data.csv',index=False) 56 | 57 | 58 | 59 | from scipy.stats import mode 60 | # 中位数 61 | def mode_function(df): 62 | counts = mode(df) 63 | return counts[0][0] 64 | 65 | feature_data = pd.read_csv('./pre_data/feature_data.csv') 66 | 67 | week = pd.get_dummies(feature_data['time_interval_week'],prefix='week') 68 | # day = pd.get_dummies(feature_data['time_interval_day'],prefix='day') 69 | del feature_data['time_interval_week'] 70 | feature_data = pd.concat([feature_data,week],axis=1) 71 | print feature_data.head() 72 | 73 | 74 | 75 | train = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour==8),: ] 76 | for i in [58,48,38,28,18]: 77 | tmp = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour == 7)&(feature_data.time_interval_minutes >= i),:] 78 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 79 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 80 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max)]).reset_index() 81 | train = pd.merge(train,tmp,on=['link_ID','time_interval_day'],how='left') 82 | 83 | train_label = np.log1p(train.pop('travel_time')) 84 | # validation = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour==8),: ] 85 | 86 | 87 | test = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour==8),: ] 88 | for i in [58,48,38,28,18]: 89 | tmp = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour == 7)&(feature_data.time_interval_minutes >= i),:] 90 | tmp = tmp.groupby(['link_ID', 
'time_interval_day'])[ 91 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 92 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max)]).reset_index() 93 | test = pd.merge(test,tmp,on=['link_ID','time_interval_day'],how='left') 94 | 95 | test_label = np.log1p(test.pop('travel_time')) 96 | 97 | train.drop(['time_interval_begin_hour','time_interval_month','time_interval_begin'],inplace=True,axis=1) 98 | test.drop(['time_interval_begin_hour','time_interval_month','time_interval_begin'],inplace=True,axis=1) 99 | 100 | 101 | 102 | import xgboost as xgb 103 | 104 | xlf = xgb.XGBRegressor(max_depth=11, 105 | learning_rate=0.01, 106 | n_estimators=301, 107 | silent=True, 108 | objective=mape_object, 109 | gamma=0, 110 | min_child_weight=5, 111 | max_delta_step=0, 112 | subsample=0.8, 113 | colsample_bytree=0.8, 114 | colsample_bylevel=1, 115 | reg_alpha=1e0, 116 | reg_lambda=0, 117 | scale_pos_weight=1, 118 | seed=9, 119 | missing=None) 120 | 121 | 122 | xlf.fit(train.values, train_label.values, eval_metric=mape_ln, verbose=True, eval_set=[(test.values, test_label.values)],early_stopping_rounds=2) 123 | print xlf.get_params() 124 | 125 | 126 | sub = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour==8),: ] 127 | for i in [58,48,38,28,18]: 128 | tmp = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour == 7)&(feature_data.time_interval_minutes >= i),:] 129 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 130 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 131 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max)]).reset_index() 132 | sub = pd.merge(sub,tmp,on=['link_ID','time_interval_day'],how='left') 133 | 134 | sub_label = np.log1p(sub.pop('travel_time')) 135 | 136 | sub.drop(['time_interval_begin_hour','time_interval_month','time_interval_begin'],inplace=True,axis=1) 137 | 138 | result = xlf.predict(sub.values) 139 | 140 | travel_time = pd.DataFrame({'travel_time':list(result)}) 141 | sub_demo = pd.read_table('./sub_demo.txt',header=None,sep='#') 142 | sub_demo.columns = ['link_ID','date','time_interval','travel_time'] 143 | del sub_demo['travel_time'] 144 | tt = pd.concat([sub_demo,travel_time],axis=1) 145 | # tt = tt.fillna(0) 146 | tt['travel_time'] = np.round(np.expm1(tt['travel_time']),6) 147 | tt[['link_ID','date','time_interval','travel_time']].to_csv('./opensource_.txt',sep='#',index=False,header=False) 148 | print tt[['link_ID','date','time_interval','travel_time']].shape 149 | print tt[['link_ID','date','time_interval','travel_time']].isnull().sum() 150 | 151 | 152 | mapodoufu1 = pd.read_table('./opensource_.txt',header=None,sep='#') 153 | mapodoufu2 = pd.read_table('./mapodoufu_2017-08-02.txt',header=None,sep='#') 154 | 155 | print sum(mapodoufu1[0]==mapodoufu2[0]) 156 | print sum(mapodoufu1[1]==mapodoufu2[1]) 157 | print sum(mapodoufu1[2]==mapodoufu2[2]) 158 | print sum(mapodoufu1[3]==mapodoufu2[3]) 159 | result=np.sum(np.abs(mapodoufu1[3]-mapodoufu2[3])/mapodoufu2[3])/len(mapodoufu2[3]) 160 | print result -------------------------------------------------------------------------------- /session_2/online.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #!/usr/bin/env python 3 | from sklearn import preprocessing 4 | import math 5 | import matplotlib.pyplot as plt 6 | import seaborn as sns 7 | 
import pandas as pd 8 | import numpy as np 9 | from sklearn.ensemble import GradientBoostingRegressor 10 | from sklearn import neighbors 11 | from sklearn.ensemble import RandomForestRegressor 12 | import warnings 13 | import datetime 14 | import time 15 | 16 | #online 17 | data = pd.read_csv('traveltime_drop.txt',sep=';',low_memory=False) 18 | seg = pd.read_csv('semifinal_gy_cmp_testing_template_seg2.txt',sep=';',header=None) 19 | seg.columns = ['link_ID','date','time_interval','travel_time'] 20 | seg['time_interval_begin'] = pd.to_datetime(seg['time_interval'].map(lambda x: x[1:20])) 21 | seg['w'] = seg['time_interval_begin'].map(lambda x: x.weekday()) 22 | # link_ID;date;time_interval;travel_time;w 23 | del seg['time_interval_begin'] 24 | 25 | data['minut']=pd.Series(data['time_interval'].str.slice(15,17)) 26 | data['h']=pd.Series(data['time_interval'].str.slice(12,14)) 27 | data['slice']=pd.Series(data['time_interval'].str.slice(15,16)) 28 | data['1/travel_time']=1/data['travel_time'] 29 | data['1/travel_time2']=1/data['travel_time']/data['travel_time'] 30 | seg['h']=pd.Series(seg['time_interval'].str.slice(12,14)) 31 | seg['minut']=pd.Series(seg['time_interval'].str.slice(15,17)) 32 | seg['slice']=pd.Series(seg['time_interval'].str.slice(15,16)) 33 | #缺数据,不加实时 34 | #实时,历史 35 | nowh=data[(data['date']<'2017-07-01') &( (data['h']=='06')| (data['h']=='07') )] 36 | #历史 37 | history=data[(data['date']<'2017-07-01') &( data['h']=='08')] 38 | #模板 39 | seg=seg[(seg['date']>='2017-07-01') &( seg['date']<='2017-07-31')&(seg['h']=='08')] 40 | 41 | #link_ID date time_interval travel_time 42 | m_m=history.groupby(['link_ID','minut'],as_index=False)['travel_time'].median() 43 | s_m=history.groupby(['link_ID','slice'],as_index=False)['travel_time'].median() 44 | m=history.groupby(['link_ID'],as_index=False)['travel_time'].median() 45 | w_m=history.groupby(['link_ID','w'],as_index=False)['travel_time'].median() 46 | 47 | nh_m=nowh.groupby(['link_ID'],as_index=False)['travel_time'].median() 48 | nh_a=nowh.groupby(['link_ID'],as_index=False)['travel_time'].mean() 49 | 50 | s1=history.groupby(['link_ID','w'],as_index=False)['1/travel_time'].sum() 51 | s2=history.groupby(['link_ID','w'],as_index=False)['1/travel_time2'].sum() 52 | result=pd.merge(seg, s1, on=['link_ID','w'], how='left') 53 | result.rename(columns={'travel_time': 'true','1/travel_time': 's1'}, inplace=True) 54 | result=pd.merge(result, s2, on=['link_ID','w'], how='left') 55 | result.rename(columns={'1/travel_time2': 's2'}, inplace=True) 56 | result['preloss']=result['s1']/result['s2'] 57 | 58 | s3=history.groupby(['link_ID'],as_index=False)['1/travel_time'].sum() 59 | s4=history.groupby(['link_ID'],as_index=False)['1/travel_time2'].sum() 60 | result=pd.merge(result, s3, on=['link_ID'], how='left') 61 | result.rename(columns={'1/travel_time': 's3'}, inplace=True) 62 | result=pd.merge(result, s4, on=['link_ID'], how='left') 63 | result.rename(columns={'1/travel_time2': 's4'}, inplace=True) 64 | result['lloss']=result['s3']/result['s4'] 65 | 66 | s5=history.groupby(['link_ID','slice'],as_index=False)['1/travel_time'].sum() 67 | s6=history.groupby(['link_ID','slice'],as_index=False)['1/travel_time2'].sum() 68 | result=pd.merge(result, s5, on=['link_ID','slice'], how='left') 69 | result.rename(columns={'1/travel_time': 's5'}, inplace=True) 70 | result=pd.merge(result, s6, on=['link_ID','slice'], how='left') 71 | result.rename(columns={'1/travel_time2': 's6'}, inplace=True) 72 | result['sloss']=result['s5']/result['s6'] 73 | 74 | 
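# --- Illustrative note (not part of the original file) --------------------------------------------
# What the 1/travel_time and 1/travel_time2 sums are for: for a group of historical values y_i,
# the constant c that minimises the squared percentage error  sum_i ((c - y_i)/y_i)^2  satisfies
# sum_i (c - y_i)/y_i^2 = 0, i.e.  c = sum_i(1/y_i) / sum_i(1/y_i^2).  The preloss/lloss/sloss
# computed above and the mloss below are exactly this ratio (s1/s2, s3/s4, s5/s6, s7/s8), taken
# per (link, weekday), per link, per coarse minute slice and per minute -- an estimate geared to
# the MAPE-style metric rather than a plain mean. A tiny numeric check:
import numpy as np
y = np.array([30.0, 40.0, 60.0])
c = (1.0 / y).sum() / (1.0 / y ** 2).sum()   # about 37.2, pulled towards the smaller values
# ---------------------------------------------------------------------------------------------------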
s7=history.groupby(['link_ID','minut'],as_index=False)['1/travel_time'].sum() 75 | s8=history.groupby(['link_ID','minut'],as_index=False)['1/travel_time2'].sum() 76 | result=pd.merge(result, s7, on=['link_ID','minut'], how='left') 77 | result.rename(columns={'1/travel_time': 's7'}, inplace=True) 78 | result=pd.merge(result, s8, on=['link_ID','minut'], how='left') 79 | result.rename(columns={'1/travel_time2': 's8'}, inplace=True) 80 | result['mloss']=result['s7']/result['s8'] 81 | 82 | ns1=nowh.groupby(['link_ID'],as_index=False)['1/travel_time'].sum() 83 | ns2=nowh.groupby(['link_ID'],as_index=False)['1/travel_time2'].sum() 84 | result=pd.merge(result, ns1, on=['link_ID'], how='left') 85 | result.rename(columns={'1/travel_time': 'ns1'}, inplace=True) 86 | result=pd.merge(result, ns2, on=['link_ID'], how='left') 87 | result.rename(columns={'1/travel_time2': 'ns2'}, inplace=True) 88 | result['nloss']=result['ns1']/result['ns2'] 89 | 90 | result=pd.merge(result, m, on=['link_ID'], how='left') 91 | result.rename(columns={'travel_time': 'm'}, inplace=True) 92 | result=pd.merge(result, s_m, on=['link_ID','slice'], how='left') 93 | result.rename(columns={'travel_time': 's_m'}, inplace=True) 94 | result=pd.merge(result, m_m, on=['link_ID','minut'], how='left') 95 | result.rename(columns={'travel_time': 'm_m'}, inplace=True) 96 | result=pd.merge(result, w_m, on=['link_ID','w'], how='left') 97 | result.rename(columns={'travel_time': 'w_m'}, inplace=True) 98 | result=pd.merge(result, nh_m, on=['link_ID'], how='left') 99 | result.rename(columns={'travel_time': 'nh_m'}, inplace=True) 100 | result=pd.merge(result, nh_a, on=['link_ID'], how='left') 101 | result.rename(columns={'travel_time': 'nh_a'}, inplace=True) 102 | 103 | 104 | result['max1']=(1/result['w_m']+1/result['preloss'])/(1/result['w_m']/result['w_m']+1/result['preloss']/result['preloss']) 105 | result['max2']=(result['preloss']+result['w_m'])/2 106 | result['max3']=(result['preloss']+result['sloss'])/2 107 | result['max4']=(result['preloss']+result['mloss'])/2 108 | result['max5']=(result['sloss']+result['mloss'])/2 109 | result['max6']=(result['sloss']+result['w_m'])/2 110 | result['max7']=(result['mloss']+result['w_m'])/2 111 | result['max8']=(1/result['sloss']+1/result['preloss'])/(1/result['sloss']/result['sloss']+1/result['preloss']/result['preloss']) 112 | result['max9']=(1/result['mloss']+1/result['preloss'])/(1/result['mloss']/result['mloss']+1/result['preloss']/result['preloss']) 113 | result['max10']=(1/result['mloss']+1/result['sloss'])/(1/result['mloss']/result['mloss']+1/result['sloss']/result['sloss']) 114 | result['max11']=(1/result['w_m']+1/result['sloss'])/(1/result['w_m']/result['w_m']+1/result['sloss']/result['sloss']) 115 | result['max12']=(1/result['mloss']+1/result['w_m'])/(1/result['mloss']/result['mloss']+1/result['w_m']/result['w_m']) 116 | result['max13']=(1/result['mloss']+1/result['w_m']+1/result['preloss'])/(1/result['mloss']/result['mloss']+1/result['w_m']/result['w_m']+1/result['preloss']/result['preloss']) 117 | 118 | 119 | #mape.to_csv('mape8old.csv') 120 | #result.to_csv('online8.csv',index=False) 121 | 122 | para=pd.read_csv('para08.txt') 123 | print para.head(5) 124 | del para['1'] 125 | temp=pd.DataFrame() 126 | temp=result[['link_ID','preloss','nloss','lloss','sloss','mloss','m' ,'s_m','m_m','w_m','nh_m','nh_a','max1','max2','max3','max4','max5','max6','max7', 127 | 'max8','max9','max10','max11','max12','max13']] 128 | temp=pd.merge(temp,para,on=['link_ID'], how='left') 129 | print temp.head(5) 130 | 
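# --- Illustrative note (not part of the original file) --------------------------------------------
# How para08.txt is used below: after the merge, `temp` holds link_ID followed by the 24 candidate
# estimates (preloss ... max13) and, as its last column, the value read from para08.txt. The loop
# below treats that value as a column index into the same row, so para08.txt appears to store, per
# link, which candidate estimator performed best on the validation month (cf. getpara.py); the
# selected column becomes the submitted travel time for that link.
# ---------------------------------------------------------------------------------------------------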
temp=np.array(temp) 131 | 132 | best=np.zeros((len(temp),1)) 133 | for i in range(0,len(temp)): 134 | best[i,0]=temp[i,int(temp[i,25])] 135 | result['traveltime']=pd.DataFrame(best) 136 | 137 | result[['link_ID','date','time_interval','traveltime']].to_csv('2017_09_13_08_rule.txt',sep=';',index=False) 138 | 139 | print result[['link_ID','date','time_interval','traveltime']].shape 140 | print result[['link_ID','date','time_interval','traveltime']].isnull().sum() 141 | -------------------------------------------------------------------------------- /session_2/3_get_Feat_Aug.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import pandas as pd 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | def mape_object(y,d): 7 | 8 | g=1.0*np.sign(y-d)/d 9 | h=1.0/d 10 | return -g,h 11 | 12 | # 评价函数 13 | def mape(y,d): 14 | c=d.get_label() 15 | result=np.sum(np.abs(y-c)/c)/len(c) 16 | return "mape",result 17 | 18 | # 评价函数ln形式 19 | def mape_ln(y,d): 20 | c=d.get_label() 21 | result=np.sum(np.abs(np.expm1(y)-np.abs(np.expm1(c)))/np.abs(np.expm1(c)))/len(c) 22 | return "mape",result 23 | 24 | def AddBaseTimeFeature(df): 25 | 26 | df['time_interval_begin'] = pd.to_datetime(df['time_interval'].map(lambda x: x[1:20])) 27 | df = df.drop(['date', 'time_interval'], axis=1) 28 | df['time_interval_month'] = df['time_interval_begin'].map(lambda x: x.strftime('%m')) 29 | df['time_interval_day'] = df['time_interval_begin'].map(lambda x: x.day) 30 | df['time_interval_begin_hour'] = df['time_interval_begin'].map(lambda x: x.strftime('%H')) 31 | df['time_interval_minutes'] = df['time_interval_begin'].map(lambda x: x.strftime('%M')) 32 | # Monday=1, Sunday=7 33 | df['time_interval_week'] = df['time_interval_begin'].map(lambda x: x.weekday() + 1) 34 | return df 35 | 36 | # gy_contest_link_top.txt 37 | # link_top = pd.read_table('./gy_contest_link_top.txt',sep=';') 38 | # print link_top 39 | # 4377906284594800514 4377906284514600514 40 | 41 | ''' 42 | link_info = pd.read_table('./gy_contest_link_info.txt',sep=';') 43 | link_info = link_info.sort_values('link_ID') 44 | training_data = pd.read_table('./gy_contest_link_traveltime_training_data.txt',sep=';') 45 | print training_data.shape 46 | training_data = pd.merge(training_data,link_info,on='link_ID') 47 | testing_data = pd.read_table('./sub_demo.txt',sep='#',header=None) 48 | testing_data.columns = ['link_ID', 'date', 'time_interval', 'travel_time'] 49 | testing_data = pd.merge(testing_data,link_info,on='link_ID') 50 | testing_data['travel_time'] = np.NaN 51 | print testing_data.shape 52 | feature_date = pd.concat([training_data,testing_data],axis=0) 53 | feature_date = feature_date.sort_values(['link_ID','time_interval']) 54 | print feature_date 55 | feature_date.to_csv('./pre_data/feature_data.csv',index=False) 56 | 57 | feature_data = pd.read_csv('./pre_data/feature_data.csv') 58 | feature_data_date = AddBaseTimeFeature(feature_data) 59 | print feature_data_date 60 | feature_data_date.to_csv('./pre_data/feature_data.csv',index=False) 61 | ''' 62 | # txt => csv 63 | 64 | 65 | 66 | ''' 67 | # test 68 | feature_data = pd.read_csv('./pre_data/feature_data.csv') 69 | test = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour==8),: ] 70 | test.to_csv('./pre_data/test.csv',index=False) 71 | ''' 72 | 73 | 74 | from scipy.stats import mode 75 | # 中位数 76 | def mode_function(df): 77 | counts = mode(df) 78 | return counts[0][0] 79 | 80 | feature_data = 
pd.read_csv('./pre_data/feature_data.csv') 81 | feature_data['link_ID'] = feature_data['link_ID'].astype(str) 82 | week = pd.get_dummies(feature_data['time_interval_week'],prefix='week') 83 | del feature_data['time_interval_week'] 84 | feature_data = pd.concat([feature_data,week],axis=1) 85 | 86 | train = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour==8),: ] 87 | for i in [58,48,38,28,18,8,0]: 88 | tmp = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour == 7)&(feature_data.time_interval_minutes >= i),:] 89 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 90 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 91 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 92 | train = pd.merge(train,tmp,on=['link_ID','time_interval_day'],how='left') 93 | 94 | 95 | train_history = feature_data.loc[(feature_data.time_interval_month == 4),: ] 96 | train_history = train_history.groupby(['link_ID', 'time_interval_minutes'])[ 97 | 'travel_time'].agg([('mean_m', np.mean), ('median_m', np.median), 98 | ('mode_m', mode_function), ('std_m', np.std), ('max_m', np.max),('min_m', np.min)]).reset_index() 99 | 100 | train = pd.merge(train,train_history,on=['link_ID','time_interval_minutes'],how='left') 101 | # print train.head() 102 | 103 | train_label = np.log1p(train.pop('travel_time')) 104 | # validation = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour==8),: ] 105 | 106 | 107 | test = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour==8),: ] 108 | for i in [58,48,38,28,18,8,0]: 109 | tmp = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour == 7)&(feature_data.time_interval_minutes >= i),:] 110 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 111 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 112 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 113 | test = pd.merge(test,tmp,on=['link_ID','time_interval_day'],how='left') 114 | 115 | 116 | test_history = feature_data.loc[(feature_data.time_interval_month == 5),: ] 117 | test_history = test_history.groupby(['link_ID', 'time_interval_minutes'])[ 118 | 'travel_time'].agg([('mean_m', np.mean), ('median_m', np.median), 119 | ('mode_m', mode_function), ('std_m', np.std), ('max_m', np.max),('min_m', np.min)]).reset_index() 120 | 121 | test = pd.merge(test,test_history,on=['link_ID','time_interval_minutes'],how='left') 122 | # print test.head() 123 | 124 | test_label = np.log1p(test.pop('travel_time')) 125 | 126 | train.drop(['time_interval_begin_hour','time_interval_month','time_interval_begin'],inplace=True,axis=1) 127 | test.drop(['time_interval_begin_hour','time_interval_month','time_interval_begin'],inplace=True,axis=1) 128 | 129 | # validation_0-mape:0.346155 0.60 130 | 131 | import xgboost as xgb 132 | 133 | xlf = xgb.XGBRegressor(max_depth=11, 134 | learning_rate=0.01, 135 | n_estimators=1000, 136 | silent=True, 137 | objective=mape_object, 138 | gamma=0, 139 | min_child_weight=5, 140 | max_delta_step=0, 141 | subsample=0.8, 142 | colsample_bytree=0.8, 143 | colsample_bylevel=1, 144 | reg_alpha=1e0, 145 | reg_lambda=0, 146 | scale_pos_weight=1, 147 | seed=9, 148 | missing=None) 149 | 150 | 151 | 
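# --- Illustrative note (not part of the original file) --------------------------------------------
# The custom objective/metric pair used here: mape_object returns (-sign(y - d)/d, 1/d), i.e. a
# subgradient of an absolute-percentage-error style loss together with 1/d as a positive surrogate
# Hessian (an absolute value has no curvature of its own), so boosting updates are weighted towards
# rows with small travel-time values, mimicking MAPE. mape_ln is passed as eval_metric, so early
# stopping monitors MAPE on the original scale after np.expm1 undoes the log1p applied to the labels.
# ---------------------------------------------------------------------------------------------------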
xlf.fit(train.values, train_label.values, eval_metric=mape_ln, verbose=True, eval_set=[(test.values, test_label.values)],early_stopping_rounds=2) 152 | # print xlf.get_params() 153 | 154 | 155 | sub = feature_data.loc[(feature_data.time_interval_month == 7)&(feature_data.time_interval_begin_hour==8),: ] 156 | for i in [58,48,38,28,18,8,0]: 157 | tmp = feature_data.loc[(feature_data.time_interval_month == 7)&(feature_data.time_interval_begin_hour == 7)&(feature_data.time_interval_minutes >= i),:] 158 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 159 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 160 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 161 | sub = pd.merge(sub,tmp,on=['link_ID','time_interval_day'],how='left') 162 | 163 | sub_history = feature_data.loc[(feature_data.time_interval_month == 6),: ] 164 | sub_history = sub_history.groupby(['link_ID', 'time_interval_minutes'])[ 165 | 'travel_time'].agg([('mean_m', np.mean), ('median_m', np.median), 166 | ('mode_m', mode_function), ('std_m', np.std), ('max_m', np.max),('min_m', np.min)]).reset_index() 167 | 168 | sub = pd.merge(sub,sub_history,on=['link_ID','time_interval_minutes'],how='left') 169 | # print sub.head() 170 | 171 | sub_label = np.log1p(sub.pop('travel_time')) 172 | 173 | sub.drop(['time_interval_begin_hour','time_interval_month','time_interval_begin'],inplace=True,axis=1) 174 | 175 | result = xlf.predict(sub.values) 176 | 177 | travel_time = pd.DataFrame({'travel_time':list(result)}) 178 | sub_demo = pd.read_table(u'./quaterfinal_gy_cmp_testing_template_seg1(update).txt',header=None,sep=';') 179 | 180 | sub_demo.columns = ['link_ID','date','time_interval','travel_time'] 181 | sub_demo = sub_demo.sort_values(['link_ID','time_interval']).reset_index() 182 | del sub_demo['index'] 183 | 184 | del sub_demo['travel_time'] 185 | 186 | tt = pd.concat([sub_demo,travel_time],axis=1) 187 | # tt = tt.fillna(0) 188 | tt['travel_time'] = np.round(np.expm1(tt['travel_time']),6) 189 | tt[['link_ID','date','time_interval','travel_time']].to_csv('./ss_2017-09-05.txt',sep='#',index=False,header=False) 190 | print tt[['link_ID','date','time_interval','travel_time']].shape 191 | print tt[['link_ID','date','time_interval','travel_time']].isnull().sum() 192 | 193 | # mapodoufu1 = pd.read_table('./ss_2017-08-11.txt',header=None,sep='#') 194 | # mapodoufu2 = pd.read_table('./siyueshinidehuangyan_2017-08-10.txt',header=None,sep='#') 195 | # 196 | # 197 | # print sum(mapodoufu1[0]==mapodoufu2[0]) 198 | # print sum(mapodoufu1[1]==mapodoufu2[1]) 199 | # print sum(mapodoufu1[2]==mapodoufu2[2]) 200 | # print sum(mapodoufu1[3]==mapodoufu2[3]) 201 | # result=np.sum(np.abs(mapodoufu1[3]-mapodoufu2[3])/mapodoufu2[3])/len(mapodoufu2[3]) 202 | # print result 203 | 204 | # [374] validation_0-rmse:0.443417 validation_0-mape:0.295266 0.0795302055463 -------------------------------------------------------------------------------- /session_1/get_feat.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | import xgboost as xgb 7 | 8 | def mape_object(y,d): 9 | 10 | g=1.0*np.sign(y-d)/d 11 | h=1.0/d 12 | return -g,h 13 | 14 | # 评价函数 15 | def mape(y,d): 16 | c=d.get_label() 17 | result=np.sum(np.abs(y-c)/c)/len(c) 18 | return "mape",result 19 | 20 | # 评价函数ln形式 21 | def mape_ln(y,d): 22 | c=d.get_label() 23 
| result=np.sum(np.abs(np.expm1(y)-np.abs(np.expm1(c)))/np.abs(np.expm1(c)))/len(c) 24 | return "mape",result 25 | 26 | # 基础时间特征 27 | def AddBaseTimeFeature(df): 28 | df['time_interval_begin'] = pd.to_datetime(df['time_interval'].map(lambda x: x[1:20])) 29 | # train_data['time_interval_end'] = pd.to_datetime(train_data['time_interval'].map(lambda x : x[21:-1])) 30 | # 删除 data time_interval link_class 31 | df = df.drop(['date', 'time_interval', 'link_class'], axis=1) 32 | print df.columns 33 | # 小时 分钟 月 日 星期 34 | df['time_interval_month'] = df['time_interval_begin'].map(lambda x: x.strftime('%m')) 35 | df['time_interval_day'] = df['time_interval_begin'].map(lambda x: x.day) 36 | df['time_interval_begin_hour'] = df['time_interval_begin'].map(lambda x: x.strftime('%H')) 37 | df['time_interval_minutes'] = df['time_interval_begin'].map(lambda x: x.strftime('%M')) 38 | # Monday=1, Sunday=7 39 | df['time_interval_week'] = df['time_interval_begin'].map(lambda x: x.weekday() + 1) 40 | del df['time_interval_begin'] 41 | return df 42 | 43 | # 设置节假日的信息 44 | def AddHolidayAndWeekOneHotFeature(df): 45 | df['holiday'] = 0 46 | df.loc[(df['time_interval_month']==4)&(df['time_interval_day'].isin(list([2,3,4]))),'holiday'] = 1 47 | df.loc[(df['time_interval_month']==5)&(df['time_interval_day'].isin(list([1,2]))),'holiday'] = 1 48 | df.loc[(df['time_interval_month']==6)&(df['time_interval_day'].isin(list([9,10,11]))),'holiday'] = 1 49 | # 由于12日属于正常上班因此当作周一处理 50 | df.loc[(df['time_interval_month']==6)&(df['time_interval_day'].isin(list([12]))),'time_interval_week'] = 1 51 | df_week = pd.get_dummies(df['time_interval_week'],prefix='week') 52 | df.drop(['time_interval_week'],inplace=True,axis=1) 53 | df = pd.concat([df,df_week],axis=1) 54 | return df 55 | 56 | # 统计特征 57 | # 路段在当前月份一天内,30个时段的均值,方差,中位数,众数 58 | def Count_OneDay_Feat(df): 59 | count_feature = df.groupby( 60 | ['link_ID', 'time_interval_month', 'time_interval_day', 'time_interval_minutes'])[ 61 | 'travel_time'].agg([('mean_', np.mean), ('median_', np.median), ('mode_', mode_function),('std_',np.std),('max_',np.max),('min_',np.min)]).reset_index() 62 | 63 | df = pd.merge(df, count_feature, 64 | on=['link_ID', 'time_interval_month', 'time_interval_day', 'time_interval_minutes'], how='left') 65 | return df 66 | 67 | from scipy.stats import mode 68 | # 中位数 69 | def mode_function(df): 70 | counts = mode(df) 71 | return counts[0][0] 72 | # return np.argmax(counts) 73 | 74 | def All_month_count_feat(df): 75 | df_count = df.groupby(['link_ID', 'time_interval_month', 'time_interval_minutes'])[ 76 | 'travel_time'].agg([('mean_m', np.mean), ('median_m', np.median), ('mode_m', mode_function), ('max_m', np.max), 77 | ('min_m', np.min),('std_m',np.std)]).reset_index() 78 | df = pd.merge(df, df_count, 79 | on=['link_ID', 'time_interval_minutes', 'time_interval_month']) 80 | return df 81 | ''' 82 | # 已给出的数据中不存在nan值 83 | # link_class 全为 1 84 | # travel_time max 1.965600e+03 2.000000e-01 绘制分布图 log1p分布 85 | # 读取道路属性 86 | link_info = pd.read_table('./gy_contest_link_info.txt',sep=';') 87 | # 读取历史数据 88 | training_data = pd.read_table('./gy_contest_link_traveltime_training_data.txt',sep=';') 89 | # 历史数据 + 道路属性 90 | train_data = pd.merge(training_data,link_info,on='link_ID',how='left') 91 | print train_data.shape 92 | # 历史数据 + 道路属性 .csv 93 | train_data.to_csv('./data/train_data.csv',index=False) 94 | 95 | ''' 96 | 97 | ''' 98 | # 基础时间处理 99 | 100 | train_data = pd.read_csv('./data/train_data.csv') 101 | train_data = AddBaseTimeFeature(train_data) 102 | 
train_data.to_csv('./data/train_test.csv',index=False) 103 | 104 | ''' 105 | 106 | ''' 107 | 每个月份的记录个数 108 | 3 2502906 0-23点数据 109 | 4 2392776 0-23点数据 110 | 5 2567206 0-23点数据 111 | 6 199496 预测月份 给了 6-7 点数据 112 | count 7662384 113 | 114 | u'link_ID', u'travel_time', u'length', u'width', u'time_interval_month',u'time_interval_day', 115 | u'time_interval_begin_hour',u'time_interval_minutes', u'time_interval_week', u'log_travel_time' 116 | 计算法定节假日: 117 | 4月 2 3 4 118 | 5月 1 2 119 | 6月 9 10 11 6月12日上班 120 | 121 | AddHolidayAndWeekOneHotFeature() 122 | index list = [u'link_ID', u'travel_time', u'length', u'width', u'time_interval_month', 123 | u'time_interval_day', u'time_interval_begin_hour', 124 | u'time_interval_minutes', u'holiday', u'week_1', u'week_2', u'week_3', 125 | u'week_4', u'week_5', u'week_6', u'week_7'] 126 | 127 | train = pd.read_csv('./data/train.csv') 128 | train = AddHolidayAndWeekOneHotFeature(train) 129 | train = Count_OneDay_Feat(train) 130 | train.to_csv('./data/train_org_feat.csv',index=False) 131 | print train.shape 132 | 133 | ''' 134 | 135 | ''' 136 | Index([u'link_ID', u'travel_time', u'length', u'width', u'time_interval_month', 137 | u'time_interval_day', u'time_interval_begin_hour', 138 | u'time_interval_minutes', u'holiday', u'week_1', u'week_2', u'week_3', 139 | u'week_4', u'week_5', u'week_6', u'week_7', u'mean_', u'median_', 140 | u'mode_', u'std_'], 141 | 142 | 143 | ''' 144 | #########################构造提交数据################################## 145 | ''' 146 | # 读取道路属性 147 | link_info = pd.read_table('./gy_contest_link_info.txt',sep=';') 148 | # 读取历史数据 149 | testing_data = pd.read_table('./mapodoufu_2017-07-27.txt',sep='#',header=None) 150 | testing_data.columns = ['link_ID','date','time_interval','travel_time'] 151 | # 历史数据 + 道路属性 152 | test_data = pd.merge(testing_data,link_info,on='link_ID',how='left') 153 | print test_data.shape 154 | # 历史数据 + 道路属性 .csv 155 | test_data.to_csv('./data/test_data.csv',index=False) 156 | ''' 157 | 158 | ''' 159 | # 基础时间处理 160 | 161 | test_data = pd.read_csv('./data/test_data.csv') 162 | test_data = AddBaseTimeFeature(test_data) 163 | test_data.to_csv('./data/test.csv',index=False) 164 | ''' 165 | 166 | ''' 167 | test = pd.read_csv('./data/test.csv') 168 | test = AddHolidayAndWeekOneHotFeature(test) 169 | test = Count_OneDay_Feat(test) 170 | test.to_csv('./data/test_org_feat.csv',index=False) 171 | print test.shape 172 | ''' 173 | 174 | 175 | 176 | ''' 177 | # 构造整个月份的特征 178 | train = pd.read_csv('./data/train.csv') 179 | train = train.fillna(0) 180 | train = All_month_count_feat(train) 181 | print train 182 | train.to_csv('./data/train_org_feat_change.csv',index=False) 183 | ''' 184 | 185 | train_org_feat = pd.read_csv('./data/train_org_feat.csv') 186 | train_org_feat = train_org_feat.fillna(0) 187 | 188 | train_org_feat_change = pd.read_csv('./data/train_org_feat_change.csv') 189 | train_org_feat_change = train_org_feat_change.fillna(0) 190 | 191 | 192 | # # 选取5月份数据的8-9点时间段构造测试数据 5月份所有8点时段数据 193 | test = train_org_feat[train_org_feat['time_interval_month']==5] 194 | test = test[test['time_interval_begin_hour']==8] 195 | print test.shape 196 | test.drop(['mean_','median_','mode_','std_','max_','min_','time_interval_month','time_interval_begin_hour'],inplace=True,axis=1) 197 | # 验证的标签数据 198 | test_label = test.pop('travel_time') 199 | test_label = np.log1p(test_label) 200 | # 获取7-8点的统计特征数据 201 | count_test = train_org_feat_change[train_org_feat_change['time_interval_month']==4] 202 | count_test = 
count_test[(count_test['time_interval_begin_hour']==8)] 203 | # last_test = count_test[count_test['time_interval_minutes']==58]['travel_time'] 204 | count_test = count_test[['link_ID','travel_time','time_interval_minutes','time_interval_day','mean_m','median_m','mode_m','std_m','max_m','min_m' 205 | ]] 206 | count_test['mode_median_m'] = (count_test['median_m'] + count_test['mode_m'])/2.0 207 | count_test['travel_time'] = np.log1p(count_test['travel_time']) 208 | test = pd.merge(test,count_test,on=['link_ID','time_interval_minutes','time_interval_day'],how='left') 209 | 210 | count_test = train_org_feat[train_org_feat['time_interval_month']==4] 211 | count_test = count_test[(count_test['time_interval_begin_hour']==8)] 212 | count_test = count_test[['link_ID','time_interval_minutes','time_interval_day','mean_','median_','mode_','std_','max_','min_' 213 | ]] 214 | count_test['mode_median_'] = (count_test['mode_'] + count_test['median_'])/2.0 215 | test = pd.merge(test,count_test,on=['link_ID','time_interval_minutes','time_interval_day'],how='left') 216 | test = test.fillna(test.median()) 217 | # print test.head() 218 | # print test.shape 219 | 220 | 221 | 222 | train = train_org_feat[train_org_feat['time_interval_month']==4] 223 | train = train[train['time_interval_begin_hour']==8] 224 | print train.shape 225 | train.drop(['mean_','median_','mode_','std_','max_','min_','time_interval_month','time_interval_begin_hour'],inplace=True,axis=1) 226 | # 验证的标签数据 227 | train_label = train.pop('travel_time') 228 | train_label = np.log1p(train_label) 229 | # 获取7-8点的统计特征数据 230 | count_train = train_org_feat_change[train_org_feat_change['time_interval_month']==3] 231 | count_train = count_train[(count_train['time_interval_begin_hour']==8)] 232 | # last_train = count_train[count_train['time_interval_minutes']==58]['travel_time'] 233 | count_train = count_train[['link_ID','travel_time','time_interval_minutes','time_interval_day','mean_m','median_m','mode_m','std_m','max_m','min_m' 234 | ]] 235 | count_train['mode_median_m'] = (count_train['median_m'] + count_train['mode_m'])/2.0 236 | count_train['travel_time'] = np.log1p(count_train['travel_time']) 237 | train = pd.merge(train,count_train,on=['link_ID','time_interval_minutes','time_interval_day'],how='left') 238 | 239 | 240 | count_train = train_org_feat[train_org_feat['time_interval_month']==3] 241 | count_train = count_train[(count_train['time_interval_begin_hour']==8)] 242 | count_train = count_train[['link_ID','time_interval_minutes','time_interval_day','mean_','median_','mode_','std_','max_','min_' 243 | ]] 244 | count_train['mode_median_'] = (count_train['mean_'] + count_train['mode_'])/2.0 245 | train = pd.merge(train,count_train,on=['link_ID','time_interval_minutes','time_interval_day'],how='left') 246 | train = train.fillna(train.median()) 247 | # print train.head() 248 | # print train.shape 249 | 250 | feat = ['length','width','time_interval_minutes','travel_time', 251 | 'mean_m', 'median_m','mode_m','max_m','min_m','std_m', 252 | 'mean_','median_','mode_','std_','week_1','week_5','week_6','week_7'] 253 | 254 | # [29] validation_0-rmse:0.635224 validation_0-mape:0.376652 255 | # [29] validation_0-rmse:0.680168 validation_0-mape:0.378365 256 | # [29] validation_0-rmse:0.691687 validation_0-mape:0.375677 257 | # [309] validation_0-rmse:0.683436 validation_0-mape:0.36668 9000 0.239716016874 258 | # [304] validation_0-rmse:0.68838 validation_0-mape:0.367605 12 0.241281228054 259 | # [294] validation_0-rmse:0.700439 validation_0-mape:0.369061 
0.2386256583 260 | # model 模型 261 | from sklearn.ensemble import GradientBoostingRegressor 262 | 263 | alpha = 0.95 264 | 265 | clf = GradientBoostingRegressor(loss='quantile', alpha=alpha, 266 | n_estimators=1000, max_depth=11, 267 | learning_rate=.1, min_samples_leaf=9, 268 | min_samples_split=9) 269 | 270 | 271 | # clf.set_params(alpha=1.0 - alpha) 272 | clf.set_params(loss='ls') 273 | clf.fit(train[feat].values, train_label.values) 274 | pre = clf.predict(test[feat].values) 275 | result=np.sum(np.abs(np.expm1(pre)-np.abs(np.expm1(test_label.values)))/np.abs(np.expm1(test_label.values)))/len(test_label.values) 276 | print result 277 | 278 | 279 | # xlf = xgb.XGBRegressor(max_depth=11, 280 | # learning_rate=0.01, 281 | # n_estimators=1000, 282 | # silent=True, 283 | # objective=mape_object, 284 | # gamma=0, 285 | # min_child_weight=5, 286 | # max_delta_step=0, 287 | # subsample=0.8, 288 | # colsample_bytree=0.8, 289 | # colsample_bylevel=1, 290 | # reg_alpha=1e0, 291 | # reg_lambda=0, 292 | # scale_pos_weight=1, 293 | # seed=666, 294 | # missing=None) 295 | # 296 | # xlf.fit(train[feat].values, train_label.values, eval_metric=mape_ln, verbose=True, eval_set=[(test[feat].values, test_label.values)],early_stopping_rounds=2) 297 | # limit = xlf.best_iteration + 1 298 | # print xlf.get_params() 299 | # 300 | # ################################################################## 301 | # sub_org_feat = pd.read_csv('./data/test_org_feat.csv') 302 | # sub_org_feat = sub_org_feat.fillna(0) 303 | # 304 | # # sub = sub_org_feat 305 | # sub = sub_org_feat[sub_org_feat['time_interval_month']==6] 306 | # sub = sub[sub['time_interval_begin_hour']==8] 307 | # print sub.shape 308 | # sub.drop(['mean_','median_','mode_','std_','max_','min_','time_interval_month','time_interval_begin_hour'],inplace=True,axis=1) 309 | # 310 | # # 验证的标签数据 311 | # sub_label = sub.pop('travel_time') 312 | # sub_label = np.log1p(sub_label) 313 | # # 获取7-8点的统计特征数据 314 | # count_sub = train_org_feat_change[train_org_feat_change['time_interval_month']==5] 315 | # count_sub = count_sub[(count_sub['time_interval_begin_hour']==8)] 316 | # 317 | # count_sub = count_sub[['link_ID','travel_time','time_interval_minutes','time_interval_day','mean_m','median_m','mode_m','std_m','max_m','min_m' 318 | # ]] 319 | # count_sub['mode_median_m'] = (count_sub['mean_m'] + count_sub['mode_m'])/2.0 320 | # count_sub['travel_time'] = np.log1p(count_sub['travel_time']) 321 | # 322 | # sub = pd.merge(sub,count_sub,on=['link_ID','time_interval_minutes','time_interval_day'],how='left') 323 | # 324 | # 325 | # count_sub = train_org_feat[train_org_feat['time_interval_month']==5] 326 | # count_sub = count_sub[(count_sub['time_interval_begin_hour']==8)] 327 | # count_sub = count_sub[['link_ID','time_interval_minutes','time_interval_day','mean_','median_','mode_','std_','max_','min_' 328 | # ]] 329 | # count_sub['mode_median_'] = (count_sub['mean_'] + count_sub['mode_'])/2.0 330 | # sub = pd.merge(sub,count_sub,on=['link_ID','time_interval_minutes','time_interval_day'],how='left') 331 | # sub = sub.fillna(train.median()) 332 | # print sub.head() 333 | # print sub 334 | # print sub.shape 335 | # 336 | # result = xlf.predict(sub[feat].values) 337 | # print len(list(result)) 338 | # sub_demo = pd.read_table('./sub_demo.txt',header=None,sep='#') 339 | # sub_demo.columns = ['link_ID','date','time_interval','travel_time'] 340 | # del sub_demo['travel_time'] 341 | # travel_time = pd.DataFrame({'travel_time':list(result)}) 342 | # tt = 
pd.concat([sub_demo,travel_time],axis=1) 343 | # # tt = tt.fillna(0) 344 | # tt['travel_time'] = np.round(np.expm1(tt['travel_time']),6) 345 | # tt[['link_ID','date','time_interval','travel_time']].to_csv('./mapodoufu_2017-08-01.txt',sep='#',index=False,header=False) 346 | # print tt[['link_ID','date','time_interval','travel_time']].shape 347 | # print tt[['link_ID','date','time_interval','travel_time']].isnull().sum() 348 | # 349 | # 350 | # # 该路段上游路段数量,该路段下游路段数量 351 | # 352 | # mapodoufu1 = pd.read_table('./mapodoufu_2017-08-01.txt',header=None,sep='#') 353 | # mapodoufu2 = pd.read_table('./mapodoufu_2017-07-27.txt',header=None,sep='#') 354 | # 355 | # print sum(mapodoufu1[0]==mapodoufu2[0]) 356 | # print sum(mapodoufu1[1]==mapodoufu2[1]) 357 | # print sum(mapodoufu1[2]==mapodoufu2[2]) 358 | # print sum(mapodoufu1[3]==mapodoufu2[3]) 359 | # result=np.sum(np.abs(mapodoufu1[3]-mapodoufu2[3])/mapodoufu2[3])/len(mapodoufu2[3]) 360 | # print result -------------------------------------------------------------------------------- /session_2/getpara.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #!/usr/bin/env python 3 | from sklearn import preprocessing 4 | import math 5 | import matplotlib.pyplot as plt 6 | import seaborn as sns 7 | import pandas as pd 8 | import numpy as np 9 | from sklearn.ensemble import GradientBoostingRegressor 10 | from sklearn import neighbors 11 | from sklearn.ensemble import RandomForestRegressor 12 | import warnings 13 | import datetime 14 | import time 15 | 16 | #online 17 | data = pd.read_csv('traveltime_drop.txt',sep=';',low_memory=False) 18 | seg1 = pd.read_csv('padding.txt',sep=';',low_memory=False) 19 | del seg1['travel_time'] 20 | data['minut']=pd.Series(data['time_interval'].str.slice(15,17)) 21 | data['h']=pd.Series(data['time_interval'].str.slice(12,14)) 22 | data['slice']=pd.Series(data['time_interval'].str.slice(15,16)) 23 | data['1/travel_time']=1/data['travel_time'] 24 | data['1/travel_time2']=1/data['travel_time']/data['travel_time'] 25 | seg1['h']=pd.Series(seg1['time_interval'].str.slice(12,14)) 26 | 27 | 28 | #缺数据,不加实时 29 | #实时,历史 30 | nowh=data[(data['date']<'2017-06-01') &((data['h']=='06')|(data['h']=='07'))] 31 | #历史 32 | history=data[(data['date']<'2017-06-01') &( data['h']=='08')] 33 | #模板 34 | seg1=seg1[(seg1['date']>='2017-06-01') &( seg1['date']<='2017-06-30')&( seg1['h']=='08')] 35 | seg=data[(data['date']>='2017-06-01') &( data['date']<='2017-06-30')&( data['h']=='08')] 36 | 37 | #link_ID date time_interval travel_time 38 | m_m=history.groupby(['link_ID','minut'],as_index=False)['travel_time'].median() 39 | s_m=history.groupby(['link_ID','slice'],as_index=False)['travel_time'].median() 40 | m=history.groupby(['link_ID'],as_index=False)['travel_time'].median() 41 | w_m=history.groupby(['link_ID','w'],as_index=False)['travel_time'].median() 42 | 43 | nh_m=nowh.groupby(['link_ID'],as_index=False)['travel_time'].median() 44 | nh_a=nowh.groupby(['link_ID'],as_index=False)['travel_time'].mean() 45 | 46 | s1=history.groupby(['link_ID','w'],as_index=False)['1/travel_time'].sum() 47 | s2=history.groupby(['link_ID','w'],as_index=False)['1/travel_time2'].sum() 48 | result=pd.merge(seg1, seg, on=['link_ID','time_interval'], how='left') 49 | 50 | result=pd.merge(result, s1, on=['link_ID','w'], how='left') 51 | 52 | result.rename(columns={'travel_time': 'true','1/travel_time_y': 's1'}, inplace=True) 53 | result=pd.merge(result, s2, on=['link_ID','w'], how='left') 54 | 
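# Note on the structure of this script: it builds, for every link, a set of
# candidate constant predictors from the pre-June history (medians overall,
# per weekday 'w', per 10-minute 'slice' and per exact 'minut', the
# sum(1/t)/sum(1/t**2) ratios assembled below, and blends of those), then
# scores each candidate by MAPE against the observed June 08:00 records and
# writes the per-link choice to para08.txt.  The groupbys on 'w' assume a
# weekday column named 'w' is already present in traveltime_drop.txt; it is
# not derived anywhere in this script.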
result.rename(columns={'1/travel_time2_y': 's2'}, inplace=True) 55 | result['preloss']=result['s1']/result['s2'] 56 | 57 | s3=history.groupby(['link_ID'],as_index=False)['1/travel_time'].sum() 58 | s4=history.groupby(['link_ID'],as_index=False)['1/travel_time2'].sum() 59 | result=pd.merge(result, s3, on=['link_ID'], how='left') 60 | result.rename(columns={'1/travel_time': 's3'}, inplace=True) 61 | result=pd.merge(result, s4, on=['link_ID'], how='left') 62 | result.rename(columns={'1/travel_time2': 's4'}, inplace=True) 63 | result['lloss']=result['s3']/result['s4'] 64 | 65 | s5=history.groupby(['link_ID','slice'],as_index=False)['1/travel_time'].sum() 66 | s6=history.groupby(['link_ID','slice'],as_index=False)['1/travel_time2'].sum() 67 | result=pd.merge(result, s5, on=['link_ID','slice'], how='left') 68 | result.rename(columns={'1/travel_time': 's5'}, inplace=True) 69 | result=pd.merge(result, s6, on=['link_ID','slice'], how='left') 70 | result.rename(columns={'1/travel_time2': 's6'}, inplace=True) 71 | result['sloss']=result['s5']/result['s6'] 72 | 73 | s7=history.groupby(['link_ID','minut'],as_index=False)['1/travel_time'].sum() 74 | s8=history.groupby(['link_ID','minut'],as_index=False)['1/travel_time2'].sum() 75 | result=pd.merge(result, s7, on=['link_ID','minut'], how='left') 76 | result.rename(columns={'1/travel_time': 's7'}, inplace=True) 77 | result=pd.merge(result, s8, on=['link_ID','minut'], how='left') 78 | result.rename(columns={'1/travel_time2': 's8'}, inplace=True) 79 | result['mloss']=result['s7']/result['s8'] 80 | 81 | 82 | 83 | ns1=nowh.groupby(['link_ID'],as_index=False)['1/travel_time'].sum() 84 | ns2=nowh.groupby(['link_ID'],as_index=False)['1/travel_time2'].sum() 85 | result=pd.merge(result, ns1, on=['link_ID'], how='left') 86 | result.rename(columns={'1/travel_time': 'ns1'}, inplace=True) 87 | result=pd.merge(result, ns2, on=['link_ID'], how='left') 88 | result.rename(columns={'1/travel_time2': 'ns2'}, inplace=True) 89 | result['nloss']=result['ns1']/result['ns2'] 90 | 91 | ns3=nowh.groupby(['link_ID','w'],as_index=False)['1/travel_time'].sum() 92 | ns4=nowh.groupby(['link_ID','w'],as_index=False)['1/travel_time2'].sum() 93 | result=pd.merge(result, ns3, on=['link_ID','w'], how='left') 94 | result.rename(columns={'1/travel_time': 'ns3'}, inplace=True) 95 | result=pd.merge(result, ns4, on=['link_ID','w'], how='left') 96 | result.rename(columns={'1/travel_time2': 'ns4'}, inplace=True) 97 | result['nwloss']=result['ns3']/result['ns4'] 98 | 99 | 100 | ns5=nowh.groupby(['link_ID','slice'],as_index=False)['1/travel_time'].sum() 101 | ns6=nowh.groupby(['link_ID','slice'],as_index=False)['1/travel_time2'].sum() 102 | result=pd.merge(result, ns5, on=['link_ID','slice'], how='left') 103 | result.rename(columns={'1/travel_time': 'ns5'}, inplace=True) 104 | result=pd.merge(result, ns6, on=['link_ID','slice'], how='left') 105 | result.rename(columns={'1/travel_time2': 'ns6'}, inplace=True) 106 | result['nsloss']=result['ns5']/result['ns6'] 107 | 108 | ns7=nowh.groupby(['link_ID','minut'],as_index=False)['1/travel_time'].sum() 109 | ns8=nowh.groupby(['link_ID','minut'],as_index=False)['1/travel_time2'].sum() 110 | result=pd.merge(result, ns7, on=['link_ID','minut'], how='left') 111 | result.rename(columns={'1/travel_time': 'ns7'}, inplace=True) 112 | result=pd.merge(result, ns8, on=['link_ID','minut'], how='left') 113 | result.rename(columns={'1/travel_time2': 'ns8'}, inplace=True) 114 | result['nmloss']=result['ns7']/result['ns8'] 115 | 116 | 117 | 118 | 119 | 
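# The '*loss' ratios above all follow the same recipe: for some grouping of
# the historical travel times t (per link overall, per weekday 'w', per
# 10-minute 'slice', per exact minute, or over the 06:00-08:00 window in
# nowh), compute sum(1/t) / sum(1/t**2).  That ratio is the constant c that
# minimises sum(((c - t_i) / t_i)**2), a weighted least-squares surrogate
# for the MAPE metric, so each column is a candidate constant predictor.
# The max1, max8 to max13 and max17 to max19 columns further below reapply
# the same formula to two (or, for max13, three) of the candidates, while
# max2 to max7 and max14 to max16 are plain pairwise averages.
# Illustrative helper only (the name is not part of the original code and it
# is not used anywhere else in this script):
def mape_ls_constant(t):
    # constant c minimising sum(((c - t) / t) ** 2) for positive times t
    t = np.asarray(t, dtype=float)
    return np.sum(1.0 / t) / np.sum(1.0 / t ** 2)
# e.g. mape_ls_constant([30.0, 40.0, 60.0]) gives roughly 37.2, below the
# arithmetic mean (43.3), because relative errors on the faster records
# weigh more under MAPE.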
result=pd.merge(result, m, on=['link_ID'], how='left') 120 | result.rename(columns={'travel_time': 'm'}, inplace=True) 121 | result=pd.merge(result, s_m, on=['link_ID','slice'], how='left') 122 | result.rename(columns={'travel_time': 's_m'}, inplace=True) 123 | result=pd.merge(result, m_m, on=['link_ID','minut'], how='left') 124 | result.rename(columns={'travel_time': 'm_m'}, inplace=True) 125 | result=pd.merge(result, w_m, on=['link_ID','w'], how='left') 126 | result.rename(columns={'travel_time': 'w_m'}, inplace=True) 127 | result=pd.merge(result, nh_m, on=['link_ID'], how='left') 128 | result.rename(columns={'travel_time': 'nh_m'}, inplace=True) 129 | result=pd.merge(result, nh_a, on=['link_ID'], how='left') 130 | result.rename(columns={'travel_time': 'nh_a'}, inplace=True) 131 | 132 | result['max1']=(1/result['w_m']+1/result['preloss'])/(1/result['w_m']/result['w_m']+1/result['preloss']/result['preloss']) 133 | result['max2']=(result['preloss']+result['w_m'])/2 134 | result['max3']=(result['preloss']+result['sloss'])/2 135 | result['max4']=(result['preloss']+result['mloss'])/2 136 | result['max5']=(result['sloss']+result['mloss'])/2 137 | result['max6']=(result['sloss']+result['w_m'])/2 138 | result['max7']=(result['mloss']+result['w_m'])/2 139 | 140 | result['max8']=(1/result['sloss']+1/result['preloss'])/(1/result['sloss']/result['sloss']+1/result['preloss']/result['preloss']) 141 | result['max9']=(1/result['mloss']+1/result['preloss'])/(1/result['mloss']/result['mloss']+1/result['preloss']/result['preloss']) 142 | result['max10']=(1/result['mloss']+1/result['sloss'])/(1/result['mloss']/result['mloss']+1/result['sloss']/result['sloss']) 143 | result['max11']=(1/result['w_m']+1/result['sloss'])/(1/result['w_m']/result['w_m']+1/result['sloss']/result['sloss']) 144 | result['max12']=(1/result['mloss']+1/result['w_m'])/(1/result['mloss']/result['mloss']+1/result['w_m']/result['w_m']) 145 | result['max13']=(1/result['mloss']+1/result['w_m']+1/result['preloss'])/(1/result['mloss']/result['mloss']+1/result['w_m']/result['w_m']+1/result['preloss']/result['preloss']) 146 | 147 | result['max14']=(result['preloss']+result['nwloss'])/2 148 | result['max15']=(result['preloss']+result['nsloss'])/2 149 | result['max16']=(result['preloss']+result['nmloss'])/2 150 | result['max17']=(1/result['preloss']+1/result['nwloss'])/(1/result['preloss']/result['preloss']+1/result['nwloss']/result['nwloss']) 151 | result['max18']=(1/result['preloss']+1/result['nsloss'])/(1/result['preloss']/result['preloss']+1/result['nsloss']/result['nsloss']) 152 | result['max19']=(1/result['preloss']+1/result['nmloss'])/(1/result['preloss']/result['preloss']+1/result['nmloss']/result['nmloss']) 153 | 154 | 155 | #link_ID date time_interval preloss nloss m s_m m_m w_m nh_m nh_a 156 | 157 | result['mapepreloss']=abs(result['preloss']-result['true'])/result['true'] 158 | result['mapenloss']=abs(result['nloss']-result['true'])/result['true'] 159 | result['lloss']=abs(result['lloss']-result['true'])/result['true'] 160 | result['sloss']=abs(result['sloss']-result['true'])/result['true'] 161 | result['mloss']=abs(result['mloss']-result['true'])/result['true'] 162 | result['mapem']=abs(result['m']-result['true'])/result['true'] 163 | result['mapes_m']=abs(result['s_m']-result['true'])/result['true'] 164 | result['mapem_m']=abs(result['m_m']-result['true'])/result['true'] 165 | result['mapew_m']=abs(result['w_m']-result['true'])/result['true'] 166 | result['mapenh_m']=abs(result['nh_m']-result['true'])/result['true'] 167 | 
result['mapenh_a']=abs(result['nh_a']-result['true'])/result['true'] 168 | result['max1']=abs(result['max1']-result['true'])/result['true'] 169 | result['max2']=abs(result['max2']-result['true'])/result['true'] 170 | 171 | result['max3']=abs(result['max3']-result['true'])/result['true'] 172 | result['max4']=abs(result['max4']-result['true'])/result['true'] 173 | result['max5']=abs(result['max5']-result['true'])/result['true'] 174 | result['max6']=abs(result['max6']-result['true'])/result['true'] 175 | result['max7']=abs(result['max7']-result['true'])/result['true'] 176 | result['max8']=abs(result['max8']-result['true'])/result['true'] 177 | result['max9']=abs(result['max9']-result['true'])/result['true'] 178 | result['max10']=abs(result['max10']-result['true'])/result['true'] 179 | result['max11']=abs(result['max11']-result['true'])/result['true'] 180 | result['max12']=abs(result['max12']-result['true'])/result['true'] 181 | result['max13']=abs(result['max13']-result['true'])/result['true'] 182 | result['max14']=abs(result['max14']-result['true'])/result['true'] 183 | result['max15']=abs(result['max15']-result['true'])/result['true'] 184 | result['max16']=abs(result['max16']-result['true'])/result['true'] 185 | result['max17']=abs(result['max17']-result['true'])/result['true'] 186 | result['max18']=abs(result['max18']-result['true'])/result['true'] 187 | result['max19']=abs(result['max19']-result['true'])/result['true'] 188 | 189 | result['nwloss']=abs(result['nwloss']-result['true'])/result['true'] 190 | result['nsloss']=abs(result['nsloss']-result['true'])/result['true'] 191 | result['nmloss']=abs(result['nmloss']-result['true'])/result['true'] 192 | 193 | mapepreloss=result.groupby(['link_ID'],as_index=False)['mapepreloss'].mean() 194 | mapenloss=result.groupby(['link_ID'],as_index=False)['mapenloss'].mean() 195 | lloss=result.groupby(['link_ID'],as_index=False)['lloss'].mean() 196 | sloss=result.groupby(['link_ID'],as_index=False)['sloss'].mean() 197 | mloss=result.groupby(['link_ID'],as_index=False)['mloss'].mean() 198 | mapem=result.groupby(['link_ID'],as_index=False)['mapem'].mean() 199 | mapes_m=result.groupby(['link_ID'],as_index=False)['mapes_m'].mean() 200 | mapem_m=result.groupby(['link_ID'],as_index=False)['mapem_m'].mean() 201 | mapew_m=result.groupby(['link_ID'],as_index=False)['mapew_m'].mean() 202 | mapenh_m=result.groupby(['link_ID'],as_index=False)['mapenh_m'].mean() 203 | mapenh_a=result.groupby(['link_ID'],as_index=False)['mapenh_a'].mean() 204 | max1=result.groupby(['link_ID'],as_index=False)['max1'].mean() 205 | max2=result.groupby(['link_ID'],as_index=False)['max2'].mean() 206 | max3=result.groupby(['link_ID'],as_index=False)['max3'].mean() 207 | max4=result.groupby(['link_ID'],as_index=False)['max4'].mean() 208 | max5=result.groupby(['link_ID'],as_index=False)['max5'].mean() 209 | max6=result.groupby(['link_ID'],as_index=False)['max6'].mean() 210 | max7=result.groupby(['link_ID'],as_index=False)['max7'].mean() 211 | max8=result.groupby(['link_ID'],as_index=False)['max8'].mean() 212 | max9=result.groupby(['link_ID'],as_index=False)['max9'].mean() 213 | max10=result.groupby(['link_ID'],as_index=False)['max10'].mean() 214 | max11=result.groupby(['link_ID'],as_index=False)['max11'].mean() 215 | max12=result.groupby(['link_ID'],as_index=False)['max12'].mean() 216 | max13=result.groupby(['link_ID'],as_index=False)['max13'].mean() 217 | 218 | max14=result.groupby(['link_ID'],as_index=False)['max14'].mean() 219 | 
max15=result.groupby(['link_ID'],as_index=False)['max15'].mean() 220 | max16=result.groupby(['link_ID'],as_index=False)['max16'].mean() 221 | max17=result.groupby(['link_ID'],as_index=False)['max17'].mean() 222 | max18=result.groupby(['link_ID'],as_index=False)['max18'].mean() 223 | max19=result.groupby(['link_ID'],as_index=False)['max19'].mean() 224 | 225 | nwloss=result.groupby(['link_ID'],as_index=False)['nwloss'].mean() 226 | nsloss=result.groupby(['link_ID'],as_index=False)['nsloss'].mean() 227 | nmloss=result.groupby(['link_ID'],as_index=False)['nmloss'].mean() 228 | 229 | 230 | 231 | temp=pd.DataFrame() 232 | temp=pd.concat([mapepreloss['mapepreloss'],mapenloss['mapenloss'],lloss['lloss'],sloss['sloss'],mloss['mloss'], 233 | mapem['mapem'] ,mapes_m['mapes_m'],mapem_m['mapem_m'],mapew_m['mapew_m'],mapenh_m['mapenh_m'],mapenh_a['mapenh_a'],max1['max1'] ,max2['max2'], 234 | max3['max3'], max4['max4'],max5['max5'] ,max6['max6'],max7['max7'] ,max8['max8'],max9['max9'] ,max10['max10'],max11['max11'] ,max12['max12'],max13['max13'] ],axis=1) 235 | print temp.head(5) 236 | mape=np.array(temp) 237 | best=np.zeros((len(temp),2)) 238 | for i in range(0,len(temp)): 239 | best[i,0]=1 240 | best[i,1]=mape[i,0] #temp mape 241 | if mape[i,1]0] 313 | print testmape 314 | print testmape.mean() 315 | para=pd.concat([mapepreloss['link_ID'],pd.DataFrame(best.astype(int))],axis=1) 316 | 317 | para.to_csv('para08.txt',index=False) 318 | 319 | # 15 0.273884567108 320 | # 18 0.269130923414 321 | # 08 0.287181842916 -------------------------------------------------------------------------------- /session_2/lightgbm_8.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # pylint: disable = invalid-name, C0111 3 | import lightgbm as lgb 4 | import pandas as pd 5 | import numpy as np 6 | import gc 7 | 8 | from scipy.stats import mode 9 | def add_constact(df): 10 | return np.sum(1.0/df) / np.sum(1.0/df/df) 11 | # customes 12 | def mape_object(y,d): 13 | d = d.get_label() 14 | g = 1.0*np.sign(y-d)/d 15 | h = 1.0/d 16 | return g,h 17 | # 评价函数ln形式 18 | def mape_ln(y,d): 19 | c=d.get_label() 20 | result= np.sum(np.abs(np.expm1(y)-np.abs(np.expm1(c)))/np.abs(np.expm1(c)))/len(c) 21 | return "mape",result,False 22 | # 中位数 23 | def mode_function(df): 24 | # print df 25 | df = df.astype(int) 26 | # print df 27 | counts = mode(df) 28 | return counts[0][0] 29 | # load or create your dataset 30 | 31 | print('Load data...') 32 | 33 | print u'8' 34 | feature_data = pd.read_csv('./pre_data/feature_data.csv') 35 | feature_data['link_ID'] = feature_data['link_ID'].astype(str) 36 | week = pd.get_dummies(feature_data['time_interval_week'],prefix='week') 37 | feature_data.drop(['time_interval_week','link_class'],inplace=True,axis=1) 38 | feature_data = pd.concat([feature_data,week],axis=1) 39 | print feature_data.head() 40 | # print feature_data[feature_data.time_interval_begin_hour== 8] 41 | 42 | train48 = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour==8),: ] 43 | for i in [58,48,38,28,18,8,0]: 44 | tmp = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour == 7)&(feature_data.time_interval_minutes >= i),:] 45 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 46 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 47 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std),('min_%d' % (i), np.min)]).reset_index() 48 | tmp['std_%d' % (i)] = 
tmp['std_%d' % (i)].fillna(0) 49 | train48 = pd.merge(train48,tmp,on=['link_ID','time_interval_day'],how='left') 50 | 51 | train58 = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour== 8),: ] 52 | for i in [58,48,38,28,18,8,0]: 53 | tmp = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour == 7)&(feature_data.time_interval_minutes >= i),:] 54 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 55 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 56 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std),('min_%d' % (i), np.min)]).reset_index() 57 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 58 | train58 = pd.merge(train58,tmp,on=['link_ID','time_interval_day'],how='left') 59 | 60 | train67 = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour==7),: ] 61 | for i in [58,48,38,28,18,8,0]: 62 | tmp = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour == 6)&(feature_data.time_interval_minutes >= i),:] 63 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 64 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 65 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std),('min_%d' % (i), np.min)]).reset_index() 66 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 67 | train67 = pd.merge(train67,tmp,on=['link_ID','time_interval_day'],how='left') 68 | 69 | train66 = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour== 6),: ] 70 | for i in [58,48,38,28,18,8,0]: 71 | tmp = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour == 5)&(feature_data.time_interval_minutes >= i),:] 72 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 73 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 74 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std),('min_%d' % (i), np.min)]).reset_index() 75 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 76 | train66 = pd.merge(train66,tmp,on=['link_ID','time_interval_day'],how='left') 77 | 78 | 79 | train56 = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour==6),: ] 80 | for i in [58,48,38,28,18,8,0]: 81 | tmp = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour == 5)&(feature_data.time_interval_minutes >= i),:] 82 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 83 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 84 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std),('min_%d' % (i), np.min)]).reset_index() 85 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 86 | train56 = pd.merge(train56,tmp,on=['link_ID','time_interval_day'],how='left') 87 | 88 | train57 = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour==7),: ] 89 | for i in [58,48,38,28,18,8,0]: 90 | tmp = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour == 6)&(feature_data.time_interval_minutes >= i),:] 91 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 92 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 93 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std),('min_%d' % (i), np.min)]).reset_index() 94 | tmp['std_%d' % (i)] = tmp['std_%d' % 
(i)].fillna(0) 95 | train57 = pd.merge(train57,tmp,on=['link_ID','time_interval_day'],how='left') 96 | 97 | train47 = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour==7),: ] 98 | for i in [58,48,38,28,18,8,0]: 99 | tmp = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour == 6)&(feature_data.time_interval_minutes >= i),:] 100 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 101 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 102 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std),('min_%d' % (i), np.min)]).reset_index() 103 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 104 | train47 = pd.merge(train47,tmp,on=['link_ID','time_interval_day'],how='left') 105 | 106 | train418 = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour==18),: ] 107 | for i in [58,48,38,28,18,8,0]: 108 | tmp = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour == 17)&(feature_data.time_interval_minutes >= i),:] 109 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 110 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 111 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std),('min_%d' % (i), np.min)]).reset_index() 112 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 113 | train418 = pd.merge(train418,tmp,on=['link_ID','time_interval_day'],how='left') 114 | 115 | train518 = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour==18),: ] 116 | for i in [58,48,38,28,18,8,0]: 117 | tmp = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour == 17)&(feature_data.time_interval_minutes >= i),:] 118 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 119 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 120 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std),('min_%d' % (i), np.min)]).reset_index() 121 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 122 | train518 = pd.merge(train518,tmp,on=['link_ID','time_interval_day'],how='left') 123 | 124 | train = pd.concat([train418,train518,train67,train57,train58,train47,train48,train66,train56],axis=0) 125 | 126 | ############################################################################################################################################################ 127 | 128 | train_history = feature_data.loc[(feature_data.time_interval_month == 4),: ] 129 | train_history = train_history.groupby(['link_ID', 'time_interval_minutes'])[ 130 | 'travel_time'].agg([('mean_m', np.mean), ('median_m', np.median), 131 | ('mode_m', mode_function), ('std_m', np.std), ('max_m', np.max),('min_m', np.min)]).reset_index() 132 | # train_history['median_mode'] = 0.5 * train_history['mode_m'] + 0.5 * train_history['median_m'] 133 | 134 | train = pd.merge(train,train_history,on=['link_ID','time_interval_minutes'],how='left') 135 | 136 | train_constacot = feature_data.loc[(feature_data.time_interval_month == 4),: ] 137 | train_constacot = train_constacot.groupby(['link_ID'])[ 138 | 'travel_time'].agg([('constatic_m_1', add_constact)]).reset_index() 139 | train = pd.merge(train,train_constacot,on=['link_ID'],how='left') 140 | 141 | # train['speed_max'] = train['length'] / train['min_m'] 142 | # train['speed_min'] = train['length'] / train['max_m'] 143 | train['speed_mode'] = 
train['length'] / train['mode_m'] 144 | train['speed_median'] = train['length'] / train['median_m'] 145 | 146 | # train['120_speed'] = train['length'] / 120.0 147 | train['mean_std'] = train['mean_m'] / train['std_m'] 148 | train['max_min_distance'] = train['max_m'] - train['min_m'] 149 | 150 | train_8 = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour == 8),: ] 151 | train_8 = train_8.groupby(['link_ID', 'time_interval_minutes'])[ 152 | 'travel_time'].agg([('median_8_', np.median)]).reset_index() 153 | 154 | train = pd.merge(train,train_8,on=['link_ID','time_interval_minutes'],how='left') 155 | train = train.fillna(-1) 156 | 157 | train.drop(['link_ID','time_interval_begin_hour','time_interval_month','time_interval_begin','std_m','std_58','max_m','mode_m'],inplace=True,axis=1) 158 | print train.shape 159 | 160 | train_label = np.log1p(train.pop('travel_time')) 161 | train_label = train_label.values 162 | train = train.values 163 | 164 | 165 | test = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour==8),: ] 166 | for i in [58,48,38,28,18,8,0]: 167 | tmp = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour == 7)&(feature_data.time_interval_minutes >= i),:] 168 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 169 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 170 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std),('min_%d' % (i), np.min)]).reset_index() 171 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 172 | test = pd.merge(test,tmp,on=['link_ID','time_interval_day'],how='left') 173 | 174 | 175 | test_history = feature_data.loc[(feature_data.time_interval_month == 5),: ] 176 | test_history = test_history.groupby(['link_ID', 'time_interval_minutes'])[ 177 | 'travel_time'].agg([('mean_m', np.mean), ('median_m', np.median), 178 | ('mode_m', mode_function), ('std_m', np.std), ('max_m', np.max),('min_m', np.min)]).reset_index() 179 | # test_history['median_mode'] = 0.5 * test_history['mode_m'] + 0.5 * test_history['median_m'] 180 | test = pd.merge(test,test_history,on=['link_ID','time_interval_minutes'],how='left') 181 | 182 | test_constacot = feature_data.loc[(feature_data.time_interval_month == 5),: ] 183 | test_constacot = test_constacot.groupby(['link_ID'])[ 184 | 'travel_time'].agg([('constatic_m_1', add_constact)]).reset_index() 185 | test = pd.merge(test,test_constacot,on=['link_ID'],how='left') 186 | # test['speed_max'] = test['length'] / test['min_m'] 187 | # test['speed_min'] = test['length'] / test['max_m'] 188 | test['speed_mode'] = test['length'] / test['mode_m'] 189 | test['speed_median'] = test['length'] / test['median_m'] 190 | 191 | # test['120_speed'] = test['length'] / 120.0 192 | test['mean_std'] = test['mean_m'] / test['std_m'] 193 | test['max_min_distance'] = test['max_m'] - test['min_m'] 194 | 195 | test_8 = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour == 8),: ] 196 | test_8 = test_8.groupby(['link_ID', 'time_interval_minutes'])[ 197 | 'travel_time'].agg([('median_8_', np.median)]).reset_index() 198 | 199 | test = pd.merge(test,test_8,on=['link_ID','time_interval_minutes'],how='left') 200 | 201 | print test.head() 202 | # analy_data_org = test.copy() 203 | # 缺失值的处理 204 | test.drop(['link_ID','time_interval_begin_hour','time_interval_month','time_interval_begin','std_m','std_58','max_m','mode_m'],inplace=True,axis=1) 205 | test = 
test.fillna(-1) 206 | test_label = np.log1p(test.pop('travel_time')) 207 | 208 | test_label = test_label.values 209 | test = test.values 210 | 211 | 212 | print('Start training...') 213 | # train 214 | lgb_train = lgb.Dataset(train, train_label) 215 | lgb_eval = lgb.Dataset(test, test_label, reference=lgb_train) 216 | 217 | params = { 218 | 'boosting_type': 'gbdt', 219 | 'objective': 'regression', 220 | 'metric': 'rmse', 221 | 'num_leaves': 128, 222 | 'learning_rate': 0.0025, 223 | 'feature_fraction': 0.8, 224 | 'bagging_fraction': 0.8, 225 | 'bagging_freq': 5, 226 | 'verbose': 0 227 | } 228 | 229 | gbm = lgb.train(params, 230 | lgb_train, 231 | num_boost_round=5000, 232 | # init_model=gbm, 233 | fobj=mape_object, 234 | feval=mape_ln, 235 | valid_sets=lgb_eval, 236 | early_stopping_rounds = 20) 237 | 238 | print('Start predicting...') 239 | # predict 240 | sub = feature_data.loc[(feature_data.time_interval_month == 7)&(feature_data.time_interval_begin_hour==8),: ] 241 | for i in [58,48,38,28,18,8,0]: 242 | tmp = feature_data.loc[(feature_data.time_interval_month == 7)&(feature_data.time_interval_begin_hour == 7)&(feature_data.time_interval_minutes >= i),:] 243 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 244 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 245 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std),('min_%d' % (i), np.min)]).reset_index() 246 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 247 | sub = pd.merge(sub,tmp,on=['link_ID','time_interval_day'],how='left') 248 | 249 | sub_history = feature_data.loc[(feature_data.time_interval_month == 5),: ] 250 | sub_history = sub_history.groupby(['link_ID', 'time_interval_minutes'])[ 251 | 'travel_time'].agg([('mean_m', np.mean), ('median_m', np.median), 252 | ('mode_m', mode_function), ('std_m', np.std), ('max_m', np.max),('min_m', np.min)]).reset_index() 253 | # sub_history['median_mode'] = 0.5 * sub_history['mode_m'] + 0.5 * sub_history['median_m'] 254 | 255 | sub = pd.merge(sub,sub_history,on=['link_ID','time_interval_minutes'],how='left') 256 | 257 | sub_constacot = feature_data.loc[(feature_data.time_interval_month == 5),: ] 258 | sub_constacot = sub_constacot.groupby(['link_ID'])[ 259 | 'travel_time'].agg([('constatic_m_1', add_constact)]).reset_index() 260 | sub = pd.merge(sub,sub_constacot,on=['link_ID'],how='left') 261 | 262 | # sub['speed_max'] = sub['length'] / sub['min_m'] 263 | # sub['speed_min'] = sub['length'] / sub['max_m'] 264 | sub['speed_mode'] = sub['length'] / sub['mode_m'] 265 | sub['speed_median'] = sub['length'] / sub['median_m'] 266 | 267 | # sub['120_speed'] = sub['length'] / 120.0 268 | 269 | sub['mean_std'] = sub['mean_m'] / sub['std_m'] 270 | sub['max_min_distance'] = sub['max_m'] - sub['min_m'] 271 | 272 | sub_history_8 = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour == 8),: ] 273 | sub_history_8 = sub_history_8.groupby(['link_ID', 'time_interval_minutes'])[ 274 | 'travel_time'].agg([('median_8_', np.median)]).reset_index() 275 | 276 | sub = pd.merge(sub,sub_history_8,on=['link_ID','time_interval_minutes'],how='left') 277 | 278 | sub_label = np.log1p(sub.pop('travel_time')) 279 | 280 | sub.drop(['link_ID','time_interval_begin_hour','time_interval_month','time_interval_begin','std_m','std_58','max_m','mode_m'],inplace=True,axis=1) 281 | sub = sub.values 282 | 283 | result = gbm.predict(sub, num_iteration=gbm.best_iteration) 284 | 285 | travel_time = pd.DataFrame({'travel_time':list(result)}) 286 | 
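# Note on the submission assembly below: the model was trained on
# np.log1p(travel_time), so np.expm1 is applied before the file is written,
# and pd.concat(..., axis=1) pairs the sorted template with the predictions
# purely by row position.  That positional alignment relies on `sub` having
# been built in the same (link_ID, time_interval) order as the sorted
# template; that appears to hold here because feature_data.csv was saved
# pre-sorted on those keys, but the isnull().sum() print at the end only
# catches length mismatches, not reordering, so a key-based merge on the
# identifiers would make the assumption explicit.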
sub_demo = pd.read_table(u'./semifinal_gy_cmp_testing_template_seg2.txt',header=None,sep=';') 287 | 288 | sub_demo.columns = ['link_ID','date','time_interval','travel_time'] 289 | sub_demo = sub_demo.sort_values(['link_ID','time_interval']).reset_index() 290 | del sub_demo['index'] 291 | del sub_demo['travel_time'] 292 | tt = pd.concat([sub_demo,travel_time],axis=1) 293 | # tt = tt.fillna(0) 294 | tt['travel_time'] = np.round(np.expm1(tt['travel_time']),6) 295 | tt[['link_ID','date','time_interval','travel_time']].to_csv('./2017-09-16_08.txt',sep='#',index=False,header=False) 296 | print tt[['link_ID','date','time_interval','travel_time']].shape 297 | print tt[['link_ID','date','time_interval','travel_time']].isnull().sum() 298 | 299 | ########################################################################################################################### 300 | # analy_data = gbm.predict(test, num_iteration=gbm.best_iteration) 301 | # analy_data = pd.DataFrame({'pre_travel_time':list(analy_data)}) 302 | # analy_data_sub = pd.concat([analy_data,analy_data_org],axis=1) 303 | # analy_data_sub['pre_travel_time'] = np.round(np.expm1(analy_data_sub['pre_travel_time']),6) 304 | # analy_data_sub.to_csv('./analy_data_08.txt') 305 | # print analy_data_sub.shape 306 | 307 | # [901] valid_0's rmse: 0.485679 valid_0's mape: 0.292658 308 | # [937] valid_0's rmse: 0.495004 valid_0's mape: 0.291705 309 | # [942] valid_0's rmse: 0.494127 valid_0's mape: 0.291573 310 | # [1660] valid_0's rmse: 0.49364 valid_0's mape: 0.291354 311 | 312 | 313 | # 4 5 6 7 314 | # [1879] valid_0's rmse: 0.494375 valid_0's mape: 0.291137 -------------------------------------------------------------------------------- /session_2/get_Feat_Aug_8.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import pandas as pd 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | 7 | def mape_object(y,d): 8 | 9 | g=1.0*np.sign(y-d)/d 10 | h=1.0/d 11 | return -g,h 12 | def add_constact(df): 13 | return np.sum(1.0/df) / np.sum(1.0/df/df) 14 | # 评价函数 15 | def mape(y,d): 16 | c=d.get_label() 17 | result= np.sum(np.abs(y-c)/c)/len(c) 18 | return "mape",result 19 | 20 | # 评价函数ln形式 21 | def mape_ln(y,d): 22 | c=d.get_label() 23 | result= np.sum(np.abs(np.expm1(y)-np.abs(np.expm1(c)))/np.abs(np.expm1(c)))/len(c) 24 | return "mape",result 25 | 26 | def AddBaseTimeFeature(df): 27 | 28 | df['time_interval_begin'] = pd.to_datetime(df['time_interval'].map(lambda x: x[1:20])) 29 | df = df.drop(['date', 'time_interval'], axis=1) 30 | df['time_interval_month'] = df['time_interval_begin'].map(lambda x: x.strftime('%m')) 31 | # df['time_interval_year'] = df['time_interval_begin'].map(lambda x: x.strftime('%Y')) 32 | df['time_interval_day'] = df['time_interval_begin'].map(lambda x: x.day) 33 | df['time_interval_begin_hour'] = df['time_interval_begin'].map(lambda x: x.strftime('%H')) 34 | df['time_interval_minutes'] = df['time_interval_begin'].map(lambda x: x.strftime('%M')) 35 | # Monday=1, Sunday=7 36 | df['time_interval_week'] = df['time_interval_begin'].map(lambda x: x.weekday() + 1) 37 | return df 38 | 39 | 40 | # txt => csv 41 | # link_info = pd.read_table('./gy_contest_link_info.txt',sep=';') 42 | # link_info = link_info.sort_values('link_ID') 43 | 44 | # training_data = pd.read_table(u'./quaterfinal_gy_cmp_training_traveltime.txt',sep=';') 45 | # training_data.columns = ['link_ID', 'date', 'time_interval', 'travel_time'] 46 | # print training_data.head() 47 | # print 
training_data.shape 48 | # training_data = pd.merge(training_data,link_info,on='link_ID') 49 | 50 | # testing_data = pd.read_table(u'./quaterfinal_gy_cmp_testing_template_seg1(update).txt',sep=';',header=None) 51 | # testing_data.columns = ['link_ID', 'date', 'time_interval', 'travel_time'] 52 | # testing_data = pd.merge(testing_data,link_info,on='link_ID') 53 | # testing_data['travel_time'] = np.NaN 54 | # print testing_data.head() 55 | # print testing_data.shape 56 | # feature_date = pd.concat([training_data,testing_data],axis=0) 57 | 58 | # feature_date = feature_date.sort_values(['link_ID','time_interval']) 59 | # print feature_date 60 | # feature_date.to_csv('./pre_data/feature_data.csv',index=False) 61 | 62 | # feature_data = pd.read_csv('./pre_data/feature_data.csv') 63 | # feature_data = feature_data[feature_data['date']>'2016-10-01'] 64 | # print feature_data 65 | # feature_data_date = AddBaseTimeFeature(feature_data) 66 | # print feature_data_date 67 | # feature_data_date.to_csv('./pre_data/feature_data.csv',index=False) 68 | 69 | 70 | # # test 71 | # feature_data = pd.read_csv('./pre_data/feature_data.csv') 72 | # test = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour==8),: ] 73 | # test.to_csv('./pre_data/test.csv',index=False) 74 | 75 | import gc 76 | 77 | from scipy.stats import mode 78 | # 中位数 79 | def mode_function(df): 80 | df = df.astype(int) 81 | counts = mode(df) 82 | return counts[0][0] 83 | 84 | print u'8' 85 | feature_data = pd.read_csv('./pre_data/feature_data.csv') 86 | feature_data['link_ID'] = feature_data['link_ID'].astype(str) 87 | # link_info_count = pd.read_csv('./pre_data/link_info_count.csv') 88 | # link_info_count['link_ID'] = link_info_count['link_ID'].astype(str) 89 | # feature_data = pd.merge(feature_data,link_info_count,on='link_ID',how='left') 90 | # link_class = pd.get_dummies(feature_data['link_class'],prefix='link_class') 91 | # int_count_onehot = pd.get_dummies(feature_data['in_count_'],prefix='in_count') 92 | # out_count_onehot = pd.get_dummies(feature_data['out_count_'],prefix='out_count') 93 | week = pd.get_dummies(feature_data['time_interval_week'],prefix='week') 94 | # time_interval_minutes = pd.get_dummies(feature_data['time_interval_minutes'],prefix='time_interval_minutes') 95 | # day = pd.get_dummies(feature_data['time_interval_day'],prefix='day') 96 | feature_data.drop(['time_interval_week','link_class'],inplace=True,axis=1) 97 | # linkId = pd.get_dummies(feature_data['link_ID'],prefix='link_id') 98 | feature_data = pd.concat([feature_data,week],axis=1) 99 | print feature_data.head() 100 | 101 | train48 = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour==8),: ] 102 | for i in [58,48,38,28,18,8,0]: 103 | tmp = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour == 7)&(feature_data.time_interval_minutes >= i),:] 104 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 105 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 106 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 107 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 108 | train48 = pd.merge(train48,tmp,on=['link_ID','time_interval_day'],how='left') 109 | 110 | train58 = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour==8),: ] 111 | for i in [58,48,38,28,18,8,0]: 112 | tmp = 
feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour == 7)&(feature_data.time_interval_minutes >= i),:] 113 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 114 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 115 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 116 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 117 | train58 = pd.merge(train58,tmp,on=['link_ID','time_interval_day'],how='left') 118 | 119 | train57 = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour==7),: ] 120 | for i in [58,48,38,28,18,8,0]: 121 | tmp = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour == 6)&(feature_data.time_interval_minutes >= i),:] 122 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 123 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 124 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 125 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 126 | train57 = pd.merge(train57,tmp,on=['link_ID','time_interval_day'],how='left') 127 | 128 | train518 = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour==18),: ] 129 | for i in [58,48,38,28,18,8,0]: 130 | tmp = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour == 17)&(feature_data.time_interval_minutes >= i),:] 131 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 132 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 133 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 134 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 135 | train518 = pd.merge(train518,tmp,on=['link_ID','time_interval_day'],how='left') 136 | 137 | train67 = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour==7),: ] 138 | for i in [58,48,38,28,18,8,0]: 139 | tmp = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour == 6)&(feature_data.time_interval_minutes >= i),:] 140 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 141 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 142 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 143 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 144 | train67 = pd.merge(train67,tmp,on=['link_ID','time_interval_day'],how='left') 145 | 146 | train418 = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour==18),: ] 147 | for i in [58,48,38,28,18,8,0]: 148 | tmp = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour == 17)&(feature_data.time_interval_minutes >= i),:] 149 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 150 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 151 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 152 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 153 | train418 = pd.merge(train418,tmp,on=['link_ID','time_interval_day'],how='left') 154 | 155 | # 
train47 = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour==18),: ] 156 | # for i in [58,48,38,28,18,8,0]: 157 | # tmp = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour == 17)&(feature_data.time_interval_minutes >= i),:] 158 | # tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 159 | # 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 160 | # ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 161 | # tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 162 | # train47 = pd.merge(train47,tmp,on=['link_ID','time_interval_day'],how='left') 163 | 164 | train = pd.concat([train418,train67,train518,train57,train58,train48],axis=0) 165 | 166 | train_history = feature_data.loc[(feature_data.time_interval_month == 4),: ] 167 | train_history = train_history.groupby(['link_ID', 'time_interval_minutes'])[ 168 | 'travel_time'].agg([('mean_m', np.mean), ('median_m', np.median), 169 | ('mode_m', mode_function), ('std_m', np.std), ('max_m', np.max),('min_m', np.min)]).reset_index() 170 | # train_history['median_mode'] = 0.5 * train_history['mode_m'] + 0.5 * train_history['median_m'] 171 | 172 | train = pd.merge(train,train_history,on=['link_ID','time_interval_minutes'],how='left') 173 | # train['speed_max'] = train['length'] / train['min_m'] 174 | # train['speed_min'] = train['length'] / train['max_m'] 175 | train['speed_mode'] = train['length'] / train['mode_m'] 176 | train['speed_median'] = train['length'] / train['median_m'] 177 | 178 | # train['120_speed'] = train['length'] / 120.0 179 | train['mean_std'] = train['mean_m'] / train['std_m'] 180 | train['max_min_distance'] = train['max_m'] - train['min_m'] 181 | 182 | train_8 = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour == 8),: ] 183 | train_8 = train_8.groupby(['link_ID', 'time_interval_minutes'])[ 184 | 'travel_time'].agg([('median_8_', np.median)]).reset_index() 185 | 186 | train = pd.merge(train,train_8,on=['link_ID','time_interval_minutes'],how='left') 187 | 188 | print train.shape 189 | train = train.fillna(-1) 190 | train_label = np.log1p(train.pop('travel_time')) 191 | # validation = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour==8),: ] 192 | 193 | 194 | test = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour==8),: ] 195 | for i in [58,48,38,28,18,8,0]: 196 | tmp = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour == 7)&(feature_data.time_interval_minutes >= i),:] 197 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 198 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 199 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 200 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 201 | test = pd.merge(test,tmp,on=['link_ID','time_interval_day'],how='left') 202 | 203 | 204 | test_history = feature_data.loc[(feature_data.time_interval_month == 5),: ] 205 | test_history = test_history.groupby(['link_ID', 'time_interval_minutes'])[ 206 | 'travel_time'].agg([('mean_m', np.mean), ('median_m', np.median), 207 | ('mode_m', mode_function), ('std_m', np.std), ('max_m', np.max),('min_m', np.min)]).reset_index() 208 | # test_history['median_mode'] = 0.5 
* test_history['mode_m'] + 0.5 * test_history['median_m'] 209 | test = pd.merge(test,test_history,on=['link_ID','time_interval_minutes'],how='left') 210 | 211 | # test['speed_max'] = test['length'] / test['min_m'] 212 | # test['speed_min'] = test['length'] / test['max_m'] 213 | test['speed_mode'] = test['length'] / test['mode_m'] 214 | test['speed_median'] = test['length'] / test['median_m'] 215 | 216 | # test['120_speed'] = test['length'] / 120.0 217 | test['mean_std'] = test['mean_m'] / test['std_m'] 218 | test['max_min_distance'] = test['max_m'] - test['min_m'] 219 | 220 | test_8 = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour == 8),: ] 221 | test_8 = test_8.groupby(['link_ID', 'time_interval_minutes'])[ 222 | 'travel_time'].agg([('median_8_', np.median)]).reset_index() 223 | 224 | test = pd.merge(test,test_8,on=['link_ID','time_interval_minutes'],how='left') 225 | 226 | print test.head() 227 | 228 | # 缺失值的处理 229 | 230 | test = test.fillna(-1) 231 | test_label = np.log1p(test.pop('travel_time')) 232 | 233 | train.drop(['link_ID','time_interval_begin_hour','time_interval_month','time_interval_begin','std_m','std_58','max_m'],inplace=True,axis=1) 234 | test.drop(['link_ID','time_interval_begin_hour','time_interval_month','time_interval_begin','std_m','std_58','max_m'],inplace=True,axis=1) 235 | 236 | 237 | import xgboost as xgb 238 | # print xgb.__version__ 239 | 240 | xlf = xgb.XGBRegressor(max_depth=11, 241 | learning_rate=0.005, 242 | n_estimators=3000, 243 | silent=True, 244 | objective=mape_object, 245 | gamma=0, 246 | min_child_weight=5, 247 | max_delta_step=0, 248 | subsample=0.8, 249 | colsample_bytree=0.8, 250 | colsample_bylevel=1, 251 | reg_alpha=1e0, 252 | reg_lambda=0, 253 | scale_pos_weight=1, 254 | seed=9, 255 | missing=None) 256 | 257 | xlf.fit(train.values, train_label.values, eval_metric=mape_ln, verbose=True, eval_set=[(test.values, test_label.values)],early_stopping_rounds=3) 258 | # xlf.fit(train.values, train_label.values, eval_metric=mape_ln, verbose=True, eval_set=[(test.values, test_label.values)],early_stopping_rounds=2) 259 | print xlf.get_params() 260 | 261 | 262 | sub = feature_data.loc[(feature_data.time_interval_month == 7)&(feature_data.time_interval_begin_hour==8),: ] 263 | for i in [58,48,38,28,18,8,0]: 264 | tmp = feature_data.loc[(feature_data.time_interval_month == 7)&(feature_data.time_interval_begin_hour == 7)&(feature_data.time_interval_minutes >= i),:] 265 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 266 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 267 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 268 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 269 | sub = pd.merge(sub,tmp,on=['link_ID','time_interval_day'],how='left') 270 | 271 | sub_history = feature_data.loc[(feature_data.time_interval_month == 5),: ] 272 | sub_history = sub_history.groupby(['link_ID', 'time_interval_minutes'])[ 273 | 'travel_time'].agg([('mean_m', np.mean), ('median_m', np.median), 274 | ('mode_m', mode_function), ('std_m', np.std), ('max_m', np.max),('min_m', np.min)]).reset_index() 275 | # sub_history['median_mode'] = 0.5 * sub_history['mode_m'] + 0.5 * sub_history['median_m'] 276 | 277 | sub = pd.merge(sub,sub_history,on=['link_ID','time_interval_minutes'],how='left') 278 | # sub['speed_max'] = sub['length'] / sub['min_m'] 279 | # sub['speed_min'] = sub['length'] / sub['max_m'] 280 | 
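# The derived columns below mirror the ones built for the train/test frames
# earlier in this script:
#   speed_mode       = length / mode_m    (typical speed from the modal travel time)
#   speed_median     = length / median_m  (same idea with the median)
#   mean_std         = mean_m / std_m     (inverse coefficient of variation;
#                                          large when the link is stable)
#   max_min_distance = max_m - min_m      (historical spread of travel time)
# The raw length and travel_time units come straight from the contest data,
# so these "speeds" are only meaningful as relative features, not as
# calibrated metres per second.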
sub['speed_mode'] = sub['length'] / sub['mode_m'] 281 | sub['speed_median'] = sub['length'] / sub['median_m'] 282 | 283 | # sub['120_speed'] = sub['length'] / 120.0 284 | 285 | sub['mean_std'] = sub['mean_m'] / sub['std_m'] 286 | sub['max_min_distance'] = sub['max_m'] - sub['min_m'] 287 | 288 | sub_history_8 = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour == 8),: ] 289 | sub_history_8 = sub_history_8.groupby(['link_ID', 'time_interval_minutes'])[ 290 | 'travel_time'].agg([('median_8_', np.median)]).reset_index() 291 | 292 | sub = pd.merge(sub,sub_history_8,on=['link_ID','time_interval_minutes'],how='left') 293 | 294 | print sub.head() 295 | 296 | sub_label = np.log1p(sub.pop('travel_time')) 297 | 298 | sub.drop(['link_ID','time_interval_begin_hour','time_interval_month','time_interval_begin','std_m','std_58','max_m'],inplace=True,axis=1) 299 | 300 | result = xlf.predict(sub.values) 301 | 302 | travel_time = pd.DataFrame({'travel_time':list(result)}) 303 | sub_demo = pd.read_table(u'./semifinal_gy_cmp_testing_template_seg2.txt',header=None,sep=';') 304 | 305 | sub_demo.columns = ['link_ID','date','time_interval','travel_time'] 306 | sub_demo = sub_demo.sort_values(['link_ID','time_interval']).reset_index() 307 | del sub_demo['index'] 308 | del sub_demo['travel_time'] 309 | tt = pd.concat([sub_demo,travel_time],axis=1) 310 | # tt = tt.fillna(0) 311 | tt['travel_time'] = np.round(np.expm1(tt['travel_time']),6) 312 | tt[['link_ID','date','time_interval','travel_time']].to_csv('./2017-09-15_08_xgb.txt',sep='#',index=False,header=False) 313 | print tt[['link_ID','date','time_interval','travel_time']].shape 314 | print tt[['link_ID','date','time_interval','travel_time']].isnull().sum() 315 | 316 | 317 | # [408] validation_0-rmse:0.47515 validation_0-mape:0.298994 318 | # [572] validation_0-rmse:0.474525 validation_0-mape:0.298064 319 | # [797] validation_0-rmse:0.475478 validation_0-mape:0.297869 320 | # 4 5 6 7 321 | # [797] validation_0-rmse:0.475478 validation_0-mape:0.297869 -------------------------------------------------------------------------------- /get_Feat_Aug.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import pandas as pd 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | def mape_object(y,d): 7 | 8 | g=1.0*np.sign(y-d)/d 9 | h=1.0/d 10 | return -g,h 11 | 12 | # 评价函数 13 | def mape(y,d): 14 | c=d.get_label() 15 | result=np.sum(np.abs(y-c)/c)/len(c) 16 | return "mape",result 17 | 18 | # 评价函数ln形式 19 | def mape_ln(y,d): 20 | c=d.get_label() 21 | result=np.sum(np.abs(np.expm1(y)-np.abs(np.expm1(c)))/np.abs(np.expm1(c)))/len(c) 22 | return "mape",result 23 | 24 | def AddBaseTimeFeature(df): 25 | 26 | df['time_interval_begin'] = pd.to_datetime(df['time_interval'].map(lambda x: x[1:20])) 27 | df = df.drop(['date', 'time_interval'], axis=1) 28 | df['time_interval_month'] = df['time_interval_begin'].map(lambda x: x.strftime('%m')) 29 | df['time_interval_day'] = df['time_interval_begin'].map(lambda x: x.day) 30 | df['time_interval_begin_hour'] = df['time_interval_begin'].map(lambda x: x.strftime('%H')) 31 | df['time_interval_minutes'] = df['time_interval_begin'].map(lambda x: x.strftime('%M')) 32 | # Monday=1, Sunday=7 33 | df['time_interval_week'] = df['time_interval_begin'].map(lambda x: x.weekday() + 1) 34 | return df 35 | 36 | # gy_contest_link_top.txt 37 | # link_top = pd.read_table('./gy_contest_link_top.txt',sep=';') 38 | # print link_top 39 | # 
4377906284594800514 4377906284514600514 40 | 41 | 42 | # txt => csv 43 | link_info = pd.read_table('./gy_contest_link_info.txt',sep=';') 44 | link_info = link_info.sort_values('link_ID') 45 | 46 | training_data = pd.read_table(u'./[新-训练集]gy_contest_traveltime_training_data_second.txt',sep=';') 47 | training_data.columns = ['link_ID', 'date', 'time_interval', 'travel_time'] 48 | print training_data.head() 49 | print training_data.shape 50 | training_data = pd.merge(training_data,link_info,on='link_ID') 51 | 52 | testing_data = pd.read_table(u'./[新-答案模板]gy_contest_result_template.txt',sep='#',header=None) 53 | testing_data.columns = ['link_ID', 'date', 'time_interval', 'travel_time'] 54 | testing_data = pd.merge(testing_data,link_info,on='link_ID') 55 | testing_data['travel_time'] = np.NaN 56 | print testing_data.shape 57 | feature_date = pd.concat([training_data,testing_data],axis=0) 58 | 59 | feature_date = feature_date.sort_values(['link_ID','time_interval']) 60 | print feature_date 61 | feature_date.to_csv('./pre_data/feature_data.csv',index=False) 62 | 63 | feature_data = pd.read_csv('./pre_data/feature_data.csv') 64 | feature_data_date = AddBaseTimeFeature(feature_data) 65 | print feature_data_date 66 | feature_data_date.to_csv('./pre_data/feature_data.csv',index=False) 67 | 68 | 69 | # test 70 | feature_data = pd.read_csv('./pre_data/feature_data.csv') 71 | test = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour==8),: ] 72 | test.to_csv('./pre_data/test.csv',index=False) 73 | 74 | import gc 75 | 76 | from scipy.stats import mode 77 | # 中位数 78 | def mode_function(df): 79 | counts = mode(df) 80 | return counts[0][0] 81 | # [378] validation_0-rmse:0.453285 validation_0-mape:0.291181 82 | 83 | feature_data = pd.read_csv('./pre_data/feature_data.csv') 84 | feature_data['link_ID'] = feature_data['link_ID'].astype(str) 85 | # link_info_count = pd.read_csv('./pre_data/link_info_count.csv') 86 | # link_info_count['link_ID'] = link_info_count['link_ID'].astype(str) 87 | # feature_data = pd.merge(feature_data,link_info_count,on='link_ID',how='left') 88 | # link_class = pd.get_dummies(feature_data['link_class'],prefix='link_class') 89 | # int_count_onehot = pd.get_dummies(feature_data['in_count_'],prefix='in_count') 90 | # out_count_onehot = pd.get_dummies(feature_data['out_count_'],prefix='out_count') 91 | week = pd.get_dummies(feature_data['time_interval_week'],prefix='week') 92 | # time_interval_minutes = pd.get_dummies(feature_data['time_interval_minutes'],prefix='time_interval_minutes') 93 | # day = pd.get_dummies(feature_data['time_interval_day'],prefix='day') 94 | feature_data.drop(['time_interval_week','link_class'],inplace=True,axis=1) 95 | # linkId = pd.get_dummies(feature_data['link_ID'],prefix='link_id') 96 | feature_data = pd.concat([feature_data,week],axis=1) 97 | print feature_data.head() 98 | 99 | train = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour==8),: ] 100 | for i in [58,48,38,28,18,8,0]: 101 | tmp = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour == 7)&(feature_data.time_interval_minutes >= i),:] 102 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 103 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 104 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 105 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 106 | 
train = pd.merge(train,tmp,on=['link_ID','time_interval_day'],how='left') 107 | 108 | train_ = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour==18),: ] 109 | for i in [58,48,38,28,18,8,0]: 110 | tmp = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour == 17)&(feature_data.time_interval_minutes >= i),:] 111 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 112 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 113 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 114 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 115 | train_ = pd.merge(train_,tmp,on=['link_ID','time_interval_day'],how='left') 116 | 117 | ############################################################################################################################################################# 118 | 119 | train__ = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour==18),: ] 120 | for i in [58,48,38,28,18,8,0]: 121 | tmp = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour == 17)&(feature_data.time_interval_minutes >= i),:] 122 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 123 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 124 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 125 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 126 | train__ = pd.merge(train__,tmp,on=['link_ID','time_interval_day'],how='left') 127 | 128 | train___ = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour==9),: ] 129 | for i in [58,48,38,28,18,8,0]: 130 | tmp = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour == 8)&(feature_data.time_interval_minutes >= i),:] 131 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 132 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 133 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 134 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 135 | train___ = pd.merge(train___,tmp,on=['link_ID','time_interval_day'],how='left') 136 | 137 | # train_5 = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour==7),: ] 138 | # for i in [58,48,38,28,18,8,0]: 139 | # tmp = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour == 6)&(feature_data.time_interval_minutes >= i),:] 140 | # tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 141 | # 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 142 | # ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 143 | # tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 144 | # train_5 = pd.merge(train_5,tmp,on=['link_ID','time_interval_day'],how='left') 145 | 146 | train____ = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour==19),: ] 147 | for i in [58,48,38,28,18,8,0]: 148 | tmp = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour == 18)&(feature_data.time_interval_minutes >= i),:] 
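# Same pattern as every other train*/test block in these scripts: for each
# cutoff i in [58, 48, ..., 0], keep only the records of the hour just
# before the hour being predicted whose minute is >= i (the last 2, 12,
# 22, ... minutes of that hour, down to the full hour at i = 0), then
# aggregate them per (link_ID, time_interval_day) into mean_i, median_i,
# mode_i, std_i, max_i and min_i, and merge the result back onto the
# target-hour rows at the end of the loop.  This yields "recent history"
# features at several look-back horizons.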
149 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 150 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 151 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 152 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 153 | train____ = pd.merge(train____,tmp,on=['link_ID','time_interval_day'],how='left') 154 | 155 | train_____ = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour==7),: ] 156 | for i in [58,48,38,28,18,8,0]: 157 | tmp = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour == 6)&(feature_data.time_interval_minutes >= i),:] 158 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 159 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 160 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 161 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 162 | train_____ = pd.merge(train_____,tmp,on=['link_ID','time_interval_day'],how='left') 163 | 164 | # train_3 = feature_data.loc[(feature_data.time_interval_month == 3)&(feature_data.time_interval_begin_hour==18),: ] 165 | # for i in [58,48,38,28,18,8,0]: 166 | # tmp = feature_data.loc[(feature_data.time_interval_month == 3)&(feature_data.time_interval_begin_hour == 17)&(feature_data.time_interval_minutes >= i),:] 167 | # tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 168 | # 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 169 | # ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 170 | # tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 171 | # train_3 = pd.merge(train_3,tmp,on=['link_ID','time_interval_day'],how='left') 172 | print '123' 173 | train = pd.concat([train,train_,train_____,train___,train__,train____],axis=0) 174 | 175 | train_history = feature_data.loc[(feature_data.time_interval_month == 3),: ] 176 | train_history = train_history.groupby(['link_ID', 'time_interval_minutes'])[ 177 | 'travel_time'].agg([('mean_m', np.mean), ('median_m', np.median), 178 | ('mode_m', mode_function), ('std_m', np.std), ('max_m', np.max),('min_m', np.min)]).reset_index() 179 | # train_history['median_mode'] = 0.5 * train_history['mode_m'] + 0.5 * train_history['median_m'] 180 | 181 | train = pd.merge(train,train_history,on=['link_ID','time_interval_minutes'],how='left') 182 | # train['speed_max'] = train['length'] / train['min_m'] 183 | # train['speed_min'] = train['length'] / train['max_m'] 184 | train['speed_mode'] = train['length'] / train['mode_m'] 185 | train['speed_median'] = train['length'] / train['median_m'] 186 | 187 | # train['120_speed'] = train['length'] / 120.0 188 | train['mean_std'] = train['mean_m'] / train['std_m'] 189 | train['max_min_distance'] = train['max_m'] - train['min_m'] 190 | 191 | train_8 = feature_data.loc[(feature_data.time_interval_month == 3)&(feature_data.time_interval_begin_hour == 8),: ] 192 | train_8 = train_8.groupby(['link_ID', 'time_interval_minutes'])[ 193 | 'travel_time'].agg([('median_8_', np.median),('mode_8_', mode_function)]).reset_index() 194 | 195 | train = pd.merge(train,train_8,on=['link_ID','time_interval_minutes'],how='left') 196 | 197 | print train.shape 198 | 199 | train_label = np.log1p(train.pop('travel_time')) 200 | # validation = 
feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour==8),: ] 201 | 202 | 203 | test = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour==8),: ] 204 | for i in [58,48,38,28,18,8,0]: 205 | tmp = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour == 7)&(feature_data.time_interval_minutes >= i),:] 206 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 207 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 208 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 209 | # tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 210 | test = pd.merge(test,tmp,on=['link_ID','time_interval_day'],how='left') 211 | 212 | 213 | test_history = feature_data.loc[(feature_data.time_interval_month == 4),: ] 214 | test_history = test_history.groupby(['link_ID', 'time_interval_minutes'])[ 215 | 'travel_time'].agg([('mean_m', np.mean), ('median_m', np.median), 216 | ('mode_m', mode_function), ('std_m', np.std), ('max_m', np.max),('min_m', np.min)]).reset_index() 217 | # test_history['median_mode'] = 0.5 * test_history['mode_m'] + 0.5 * test_history['median_m'] 218 | test = pd.merge(test,test_history,on=['link_ID','time_interval_minutes'],how='left') 219 | 220 | # test['speed_max'] = test['length'] / test['min_m'] 221 | # test['speed_min'] = test['length'] / test['max_m'] 222 | test['speed_mode'] = test['length'] / test['mode_m'] 223 | test['speed_median'] = test['length'] / test['median_m'] 224 | 225 | # test['120_speed'] = test['length'] / 120.0 226 | test['mean_std'] = test['mean_m'] / test['std_m'] 227 | test['max_min_distance'] = test['max_m'] - test['min_m'] 228 | 229 | test_8 = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour == 8),: ] 230 | test_8 = test_8.groupby(['link_ID', 'time_interval_minutes'])[ 231 | 'travel_time'].agg([('median_8_', np.median),('mode_8_', mode_function)]).reset_index() 232 | 233 | test = pd.merge(test,test_8,on=['link_ID','time_interval_minutes'],how='left') 234 | 235 | print test.head() 236 | 237 | test_label = np.log1p(test.pop('travel_time')) 238 | 239 | train.drop(['time_interval_begin_hour','time_interval_month','time_interval_begin','std_m'],inplace=True,axis=1) 240 | test.drop(['time_interval_begin_hour','time_interval_month','time_interval_begin','std_m'],inplace=True,axis=1) 241 | 242 | 243 | 244 | import xgboost as xgb 245 | 246 | xlf = xgb.XGBRegressor(max_depth=11, 247 | learning_rate=0.01, 248 | n_estimators=1000, 249 | silent=True, 250 | objective=mape_object, 251 | gamma=0, 252 | min_child_weight=5, 253 | max_delta_step=0, 254 | subsample=0.8, 255 | colsample_bytree=0.8, 256 | colsample_bylevel=1, 257 | reg_alpha=1e0, 258 | reg_lambda=0, 259 | scale_pos_weight=1, 260 | seed=9, 261 | missing=None) 262 | 263 | 264 | xlf.fit(train.values, train_label.values, eval_metric=mape_ln, verbose=True, eval_set=[(test.values, test_label.values)],early_stopping_rounds=2) 265 | print xlf.get_params() 266 | 267 | 268 | sub = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour==8),: ] 269 | for i in [58,48,38,28,18,8,0]: 270 | tmp = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour == 7)&(feature_data.time_interval_minutes >= i),:] 271 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 272 | 
'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 273 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 274 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 275 | sub = pd.merge(sub,tmp,on=['link_ID','time_interval_day'],how='left') 276 | 277 | sub_history = feature_data.loc[(feature_data.time_interval_month == 5),: ] 278 | sub_history = sub_history.groupby(['link_ID', 'time_interval_minutes'])[ 279 | 'travel_time'].agg([('mean_m', np.mean), ('median_m', np.median), 280 | ('mode_m', mode_function), ('std_m', np.std), ('max_m', np.max),('min_m', np.min)]).reset_index() 281 | # sub_history['median_mode'] = 0.5 * sub_history['mode_m'] + 0.5 * sub_history['median_m'] 282 | 283 | sub = pd.merge(sub,sub_history,on=['link_ID','time_interval_minutes'],how='left') 284 | # sub['speed_max'] = sub['length'] / sub['min_m'] 285 | # sub['speed_min'] = sub['length'] / sub['max_m'] 286 | sub['speed_mode'] = sub['length'] / sub['mode_m'] 287 | sub['speed_median'] = sub['length'] / sub['median_m'] 288 | 289 | # sub['120_speed'] = sub['length'] / 120.0 290 | 291 | sub['mean_std'] = sub['mean_m'] / sub['std_m'] 292 | sub['max_min_distance'] = sub['max_m'] - sub['min_m'] 293 | 294 | sub_history_8 = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour == 8),: ] 295 | sub_history_8 = sub_history_8.groupby(['link_ID', 'time_interval_minutes'])[ 296 | 'travel_time'].agg([('median_8_', np.median),('mode_8_', mode_function)]).reset_index() 297 | 298 | sub = pd.merge(sub,sub_history_8,on=['link_ID','time_interval_minutes'],how='left') 299 | 300 | print sub.head() 301 | 302 | sub_label = np.log1p(sub.pop('travel_time')) 303 | 304 | sub.drop(['time_interval_begin_hour','time_interval_month','time_interval_begin','std_m'],inplace=True,axis=1) 305 | 306 | result = xlf.predict(sub.values) 307 | 308 | travel_time = pd.DataFrame({'travel_time':list(result)}) 309 | sub_demo = pd.read_table(u'./[新-答案模板]gy_contest_result_template.txt',header=None,sep='#') 310 | 311 | sub_demo.columns = ['link_ID','date','time_interval','travel_time'] 312 | sub_demo = sub_demo.sort_values(['link_ID','time_interval']).reset_index() 313 | del sub_demo['index'] 314 | 315 | del sub_demo['travel_time'] 316 | tt = pd.concat([sub_demo,travel_time],axis=1) 317 | # tt = tt.fillna(0) 318 | tt['travel_time'] = np.round(np.expm1(tt['travel_time']),6) 319 | tt[['link_ID','date','time_interval','travel_time']].to_csv('./mapodoufu_2017-08-12.txt',sep='#',index=False,header=False) 320 | print tt[['link_ID','date','time_interval','travel_time']].shape 321 | print tt[['link_ID','date','time_interval','travel_time']].isnull().sum() 322 | 323 | 324 | -------------------------------------------------------------------------------- /session_2/get_Feat_Aug_18.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import pandas as pd 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | 7 | def mape_object(y,d): 8 | 9 | g=1.0*np.sign(y-d)/d 10 | h=1.0/d 11 | return -g,h 12 | def add_constact(df): 13 | return np.sum(1.0/df) / np.sum(1.0/df/df) 14 | # 评价函数 15 | def mape(y,d): 16 | c=d.get_label() 17 | result= np.sum(np.abs(y-c)/c)/len(c) 18 | return "mape",result 19 | 20 | # 评价函数ln形式 21 | def mape_ln(y,d): 22 | c=d.get_label() 23 | result= np.sum(np.abs(np.expm1(y)-np.abs(np.expm1(c)))/np.abs(np.expm1(c)))/len(c) 24 | return "mape",result 25 | 26 | 
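# The helpers above implement the competition's MAPE loss on the log1p scale.
# For a prediction p and true travel time t the per-sample loss is |p - t| / t;
# its (sub)gradient with respect to p is sign(p - t) / t and its true second
# derivative is zero, so 1.0 / t is used as a positive stand-in hessian (the
# extra minus sign returned by mape_object presumably compensates for the
# argument order of the sklearn wrapper used further down). Because the models
# are fit on log1p(travel_time), mape_ln maps predictions and labels back with
# expm1 before measuring the relative error. The function below is only an
# illustrative sketch and is never called by the pipeline; the DummyData class
# is hypothetical, standing in for a booster data object exposing get_label().
def _mape_sanity_check():
    class DummyData(object):
        def __init__(self, label):
            self._label = np.asarray(label, dtype=float)
        def get_label(self):
            return self._label
    t = np.array([10.0, 20.0, 40.0])   # true travel times on the raw scale
    p = np.array([12.0, 18.0, 40.0])   # predictions on the raw scale
    grad = np.sign(p - t) / t          # gradient of |p - t| / t w.r.t. p
    hess = 1.0 / t                     # surrogate second derivative
    raw_mape = np.mean(np.abs(p - t) / t)                       # 0.1 here
    log_mape = mape_ln(np.log1p(p), DummyData(np.log1p(t)))[1]  # also ~0.1
    return grad, hess, raw_mape, log_mape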
import gc 27 | 28 | from scipy.stats import mode 29 | # 中位数 30 | def mode_function(df): 31 | df = df.astype(int) 32 | counts = mode(df) 33 | return counts[0][0] 34 | 35 | 36 | feature_data = pd.read_csv('./pre_data/feature_data.csv') 37 | feature_data['link_ID'] = feature_data['link_ID'].astype(str) 38 | # link_info_count = pd.read_csv('./pre_data/link_info_count.csv') 39 | # link_info_count['link_ID'] = link_info_count['link_ID'].astype(str) 40 | # feature_data = pd.merge(feature_data,link_info_count,on='link_ID',how='left') 41 | # link_class = pd.get_dummies(feature_data['link_class'],prefix='link_class') 42 | # int_count_onehot = pd.get_dummies(feature_data['in_count_'],prefix='in_count') 43 | # out_count_onehot = pd.get_dummies(feature_data['out_count_'],prefix='out_count') 44 | week = pd.get_dummies(feature_data['time_interval_week'],prefix='week') 45 | # time_interval_minutes = pd.get_dummies(feature_data['time_interval_minutes'],prefix='time_interval_minutes') 46 | # day = pd.get_dummies(feature_data['time_interval_day'],prefix='day') 47 | feature_data.drop(['time_interval_week','link_class'],inplace=True,axis=1) 48 | # linkId = pd.get_dummies(feature_data['link_ID'],prefix='link_id') 49 | feature_data = pd.concat([feature_data,week],axis=1) 50 | print feature_data.head() 51 | 52 | 53 | print u'make feature' 54 | 55 | 56 | 57 | train418 = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour==18),: ] 58 | for i in [58,48,38,28,18,8,0]: 59 | tmp = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour == 17)&(feature_data.time_interval_minutes >= i),:] 60 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 61 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 62 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 63 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 64 | train418 = pd.merge(train418,tmp,on=['link_ID','time_interval_day'],how='left') 65 | 66 | train517 = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour==17),: ] 67 | for i in [58,48,38,28,18,8,0]: 68 | tmp = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour == 16)&(feature_data.time_interval_minutes >= i),:] 69 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 70 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 71 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 72 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 73 | train517 = pd.merge(train517,tmp,on=['link_ID','time_interval_day'],how='left') 74 | 75 | train518 = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour==18),: ] 76 | for i in [58,48,38,28,18,8,0]: 77 | tmp = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour == 17)&(feature_data.time_interval_minutes >= i),:] 78 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 79 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 80 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 81 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 82 | train518 = 
pd.merge(train518,tmp,on=['link_ID','time_interval_day'],how='left') 83 | 84 | train519 = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour==19),: ] 85 | for i in [58,48,38,28,18,8,0]: 86 | tmp = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour == 18)&(feature_data.time_interval_minutes >= i),:] 87 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 88 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 89 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 90 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 91 | train519 = pd.merge(train519,tmp,on=['link_ID','time_interval_day'],how='left') 92 | 93 | train419 = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour == 19),: ] 94 | for i in [58,48,38,28,18,8,0]: 95 | tmp = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour == 18)&(feature_data.time_interval_minutes >= i),:] 96 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 97 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 98 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 99 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 100 | train419 = pd.merge(train419,tmp,on=['link_ID','time_interval_day'],how='left') 101 | ############################################################################################################################################################# 102 | train68 = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour==8),: ] 103 | for i in [58,48,38,28,18,8,0]: 104 | tmp = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour == 7)&(feature_data.time_interval_minutes >= i),:] 105 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 106 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 107 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 108 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 109 | train68 = pd.merge(train68,tmp,on=['link_ID','time_interval_day'],how='left') 110 | 111 | train617 = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour==17),: ] 112 | for i in [58,48,38,28,18,8,0]: 113 | tmp = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour == 16)&(feature_data.time_interval_minutes >= i),:] 114 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 115 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 116 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 117 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 118 | train617 = pd.merge(train617,tmp,on=['link_ID','time_interval_day'],how='left') 119 | 120 | train67 = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour==7),: ] 121 | for i in [58,48,38,28,18,8,0]: 122 | tmp = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour == 6)&(feature_data.time_interval_minutes >= i),:] 123 | tmp = tmp.groupby(['link_ID', 
'time_interval_day'])[ 124 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 125 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 126 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 127 | train67 = pd.merge(train67,tmp,on=['link_ID','time_interval_day'],how='left') 128 | 129 | train57 = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour==7),: ] 130 | for i in [58,48,38,28,18,8,0]: 131 | tmp = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour == 6)&(feature_data.time_interval_minutes >= i),:] 132 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 133 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 134 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 135 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 136 | train57 = pd.merge(train57,tmp,on=['link_ID','time_interval_day'],how='left') 137 | 138 | train318 = feature_data.loc[(feature_data.time_interval_month == 3)&(feature_data.time_interval_begin_hour==18),: ] 139 | for i in [58,48,38,28,18,8,0]: 140 | tmp = feature_data.loc[(feature_data.time_interval_month == 3)&(feature_data.time_interval_begin_hour == 17)&(feature_data.time_interval_minutes >= i),:] 141 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 142 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 143 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 144 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 145 | train318 = pd.merge(train318,tmp,on=['link_ID','time_interval_day'],how='left') 146 | # train619 = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour==19),: ] 147 | # for i in [58,48,38,28,18,8,0]: 148 | # tmp = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour == 18)&(feature_data.time_interval_minutes >= i),:] 149 | # tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 150 | # 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 151 | # ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 152 | # tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 153 | # train619 = pd.merge(train619,tmp,on=['link_ID','time_interval_day'],how='left') 154 | # 合并训练数据 155 | # train = pd.concat([train517,train518,train519,train419,train617,train68,train619,train418],axis=0) 156 | 157 | # train = pd.concat([train517,train518,train519,train617,train68,train418,train419],axis=0) 158 | train = pd.concat([train517,train518,train519,train68,train418,train419,train617,train67,train57,train318],axis=0) 159 | print u'合并结束' 160 | 161 | train_history = feature_data.loc[(feature_data.time_interval_month == 4),: ] 162 | train_history = train_history.groupby(['link_ID', 'time_interval_minutes'])[ 163 | 'travel_time'].agg([('mean_m', np.mean), ('median_m', np.median), 164 | ('mode_m', mode_function), ('std_m', np.std), ('max_m', np.max),('min_m', np.min)]).reset_index() 165 | # train_history['median_mode'] = 0.5 * train_history['mode_m'] + 0.5 * train_history['median_m'] 166 | 167 | train = pd.merge(train,train_history,on=['link_ID','time_interval_minutes'],how='left') 168 | # 
train['speed_max'] = train['length'] / train['min_m'] 169 | # train['speed_min'] = train['length'] / train['max_m'] 170 | train['speed_mode'] = train['length'] / train['mode_m'] 171 | train['speed_median'] = train['length'] / train['median_m'] 172 | 173 | # train['120_speed'] = train['length'] / 120.0 174 | train['mean_std'] = train['mean_m'] / train['std_m'] 175 | train['max_min_distance'] = train['max_m'] - train['min_m'] 176 | 177 | train_8 = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour == 18),: ] 178 | train_8 = train_8.groupby(['link_ID', 'time_interval_minutes'])[ 179 | 'travel_time'].agg([('median_8_', np.median)]).reset_index() 180 | 181 | train = pd.merge(train,train_8,on=['link_ID','time_interval_minutes'],how='left') 182 | 183 | print train.shape 184 | train = train.fillna(-1) 185 | train_label = np.log1p(train.pop('travel_time')) 186 | # validation = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour==8),: ] 187 | 188 | 189 | test = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour==18),: ] 190 | for i in [58,48,38,28,18,8,0]: 191 | tmp = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour == 17)&(feature_data.time_interval_minutes >= i),:] 192 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 193 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 194 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 195 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 196 | test = pd.merge(test,tmp,on=['link_ID','time_interval_day'],how='left') 197 | 198 | 199 | test_history = feature_data.loc[(feature_data.time_interval_month == 5),: ] 200 | test_history = test_history.groupby(['link_ID', 'time_interval_minutes'])[ 201 | 'travel_time'].agg([('mean_m', np.mean), ('median_m', np.median), 202 | ('mode_m', mode_function), ('std_m', np.std), ('max_m', np.max),('min_m', np.min)]).reset_index() 203 | # test_history['median_mode'] = 0.5 * test_history['mode_m'] + 0.5 * test_history['median_m'] 204 | test = pd.merge(test,test_history,on=['link_ID','time_interval_minutes'],how='left') 205 | 206 | # test['speed_max'] = test['length'] / test['min_m'] 207 | # test['speed_min'] = test['length'] / test['max_m'] 208 | test['speed_mode'] = test['length'] / test['mode_m'] 209 | test['speed_median'] = test['length'] / test['median_m'] 210 | 211 | # test['120_speed'] = test['length'] / 120.0 212 | test['mean_std'] = test['mean_m'] / test['std_m'] 213 | test['max_min_distance'] = test['max_m'] - test['min_m'] 214 | 215 | test_8 = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour == 18),: ] 216 | test_8 = test_8.groupby(['link_ID', 'time_interval_minutes'])[ 217 | 'travel_time'].agg([('median_8_', np.median)]).reset_index() 218 | 219 | test = pd.merge(test,test_8,on=['link_ID','time_interval_minutes'],how='left') 220 | 221 | print test.head() 222 | 223 | # 缺失值的处理 224 | 225 | test = test.fillna(-1) 226 | test_label = np.log1p(test.pop('travel_time')) 227 | 228 | train.drop(['link_ID','time_interval_begin_hour','time_interval_month','time_interval_begin','std_m','std_58'],inplace=True,axis=1) 229 | test.drop(['link_ID','time_interval_begin_hour','time_interval_month','time_interval_begin','std_m','std_58'],inplace=True,axis=1) 230 | 231 | 232 | import xgboost as xgb 233 | 
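# Every feature block above follows the same recipe: take one (month, hour)
# slice as the target rows, then, for each link and day, aggregate travel_time
# over the trailing 2/12/22/32/42/52/60 minutes of the preceding hour
# (minutes >= 58, 48, ..., 0 with 2-minute bins) and left-join the statistics
# back on. The helper below is a reusable sketch of that pattern; it is
# hypothetical (not used by this script) and assumes the hour column is a
# plain integer, as the comparisons above do, and that the target hour is not 0.
def lagged_window_stats(df, month, target_hour):
    base = df.loc[(df.time_interval_month == month) &
                  (df.time_interval_begin_hour == target_hour), :]
    for i in [58, 48, 38, 28, 18, 8, 0]:
        tmp = df.loc[(df.time_interval_month == month) &
                     (df.time_interval_begin_hour == target_hour - 1) &
                     (df.time_interval_minutes >= i), :]
        tmp = tmp.groupby(['link_ID', 'time_interval_day'])['travel_time'].agg(
            [('mean_%d' % i, np.mean), ('median_%d' % i, np.median),
             ('mode_%d' % i, mode_function), ('std_%d' % i, np.std),
             ('max_%d' % i, np.max), ('min_%d' % i, np.min)]).reset_index()
        tmp['std_%d' % i] = tmp['std_%d' % i].fillna(0)
        base = pd.merge(base, tmp, on=['link_ID', 'time_interval_day'], how='left')
    return base
# e.g. the train_ slice above is equivalent to lagged_window_stats(feature_data, 4, 18).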
# print xgb.__version__ 234 | 235 | xlf = xgb.XGBRegressor(max_depth=11, 236 | learning_rate=0.008, 237 | n_estimators=4000, 238 | silent=True, 239 | objective=mape_object, 240 | gamma=0, 241 | min_child_weight=10, 242 | max_delta_step=0, 243 | subsample=0.8, 244 | colsample_bytree=0.8, 245 | colsample_bylevel=1, 246 | reg_alpha=1e0, 247 | reg_lambda=0, 248 | scale_pos_weight=1, 249 | seed=9, 250 | missing=None) 251 | 252 | xlf.fit(train.values, train_label.values, eval_metric=mape_ln, verbose=True, eval_set=[(test.values, test_label.values)],early_stopping_rounds=2) 253 | # xlf.fit(train.values, train_label.values, eval_metric=mape_ln, verbose=True, eval_set=[(test.values, test_label.values)],early_stopping_rounds=2) 254 | print xlf.get_params() 255 | 256 | 257 | sub = feature_data.loc[(feature_data.time_interval_month == 7)&(feature_data.time_interval_begin_hour==18),: ] 258 | for i in [58,48,38,28,18,8,0]: 259 | tmp = feature_data.loc[(feature_data.time_interval_month == 7)&(feature_data.time_interval_begin_hour == 17)&(feature_data.time_interval_minutes >= i),:] 260 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 261 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 262 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 263 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 264 | sub = pd.merge(sub,tmp,on=['link_ID','time_interval_day'],how='left') 265 | 266 | sub_history = feature_data.loc[(feature_data.time_interval_month == 5),: ] 267 | sub_history = sub_history.groupby(['link_ID', 'time_interval_minutes'])[ 268 | 'travel_time'].agg([('mean_m', np.mean), ('median_m', np.median), 269 | ('mode_m', mode_function), ('std_m', np.std), ('max_m', np.max),('min_m', np.min)]).reset_index() 270 | # sub_history['median_mode'] = 0.5 * sub_history['mode_m'] + 0.5 * sub_history['median_m'] 271 | 272 | sub = pd.merge(sub,sub_history,on=['link_ID','time_interval_minutes'],how='left') 273 | # sub['speed_max'] = sub['length'] / sub['min_m'] 274 | # sub['speed_min'] = sub['length'] / sub['max_m'] 275 | sub['speed_mode'] = sub['length'] / sub['mode_m'] 276 | sub['speed_median'] = sub['length'] / sub['median_m'] 277 | 278 | # sub['120_speed'] = sub['length'] / 120.0 279 | 280 | sub['mean_std'] = sub['mean_m'] / sub['std_m'] 281 | sub['max_min_distance'] = sub['max_m'] - sub['min_m'] 282 | 283 | sub_history_8 = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour == 18),: ] 284 | sub_history_8 = sub_history_8.groupby(['link_ID', 'time_interval_minutes'])[ 285 | 'travel_time'].agg([('median_8_', np.median)]).reset_index() 286 | 287 | sub = pd.merge(sub,sub_history_8,on=['link_ID','time_interval_minutes'],how='left') 288 | 289 | print sub.head() 290 | sub = sub.fillna(-1) 291 | sub_label = np.log1p(sub.pop('travel_time')) 292 | 293 | sub.drop(['link_ID','time_interval_begin_hour','time_interval_month','time_interval_begin','std_m','std_58'],inplace=True,axis=1) 294 | 295 | result = xlf.predict(sub.values) 296 | 297 | travel_time = pd.DataFrame({'travel_time':list(result)}) 298 | sub_demo = pd.read_table(u'./semifinal_gy_cmp_testing_template_seg2.txt',header=None,sep=';') 299 | 300 | sub_demo.columns = ['link_ID','date','time_interval','travel_time'] 301 | sub_demo = sub_demo.sort_values(['link_ID','time_interval']).reset_index() 302 | del sub_demo['index'] 303 | del sub_demo['travel_time'] 304 | tt = pd.concat([sub_demo,travel_time],axis=1) 305 | 
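# Note that the column-wise concat above pairs each prediction with a template
# row purely by position: it is correct only if the rows of `sub` were produced
# in exactly the (link_ID, time_interval) order that `sub_demo` has just been
# sorted into. A more defensive variant (a sketch only, assuming a copy of the
# key columns is kept aside before the drop/predict step above) would attach
# the predictions to those keys and join on them instead of relying on row order:
#   keys = sub_with_keys[['link_ID', 'time_interval_day', 'time_interval_minutes']].copy()
#   keys['travel_time'] = result
#   ... then merge back to the template on the same key columns.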
# tt = tt.fillna(0) 306 | tt['travel_time'] = np.round(np.expm1(tt['travel_time']),6) 307 | tt[['link_ID','date','time_interval','travel_time']].to_csv('./2017-09-15_18_xgb.txt',sep='#',index=False,header=False) 308 | print tt[['link_ID','date','time_interval','travel_time']].shape 309 | print tt[['link_ID','date','time_interval','travel_time']].isnull().sum() 310 | 311 | # [383] validation_0-rmse:0.443618 validation_0-mape:0.286548 312 | # [392] validation_0-rmse:0.440936 validation_0-mape:0.286057 313 | # [392] validation_0-rmse:0.440936 validation_0-mape:0.286057 314 | 315 | # 3 4 5 6 7 316 | # [488] validation_0-rmse:0.443788 validation_0-mape:0.282757 -------------------------------------------------------------------------------- /session_2/lightgbm_18.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # pylint: disable = invalid-name, C0111 3 | import lightgbm as lgb 4 | import pandas as pd 5 | import numpy as np 6 | import gc 7 | 8 | from scipy.stats import mode 9 | def add_constact(df): 10 | return np.sum(1.0/df) / np.sum(1.0/df/df) 11 | # custom objective: MAPE gradient and a surrogate hessian 12 | def mape_object(y,d): 13 | d = d.get_label() 14 | g = 1.0*np.sign(y-d)/d 15 | h = 1.0/d 16 | return g,h 17 | # evaluation metric: labels are log1p-transformed, so compare on the expm1 scale 18 | def mape_ln(y,d): 19 | c=d.get_label() 20 | result= np.sum(np.abs(np.expm1(y)-np.abs(np.expm1(c)))/np.abs(np.expm1(c)))/len(c) 21 | return "mape",result,False 22 | # mode (most frequent value) of the integer-cast travel times 23 | def mode_function(df): 24 | df = df.astype(int) 25 | counts = mode(df) 26 | return counts[0][0] 27 | # load or create your dataset 28 | 29 | print('Load data...') 30 | 31 | print u'18' 32 | feature_data = pd.read_csv('./pre_data/feature_data.csv') 33 | feature_data['link_ID'] = feature_data['link_ID'].astype(str) 34 | week = pd.get_dummies(feature_data['time_interval_week'],prefix='week') 35 | feature_data.drop(['time_interval_week','link_class'],inplace=True,axis=1) 36 | feature_data = pd.concat([feature_data,week],axis=1) 37 | print feature_data.head() 38 | 39 | 40 | train418 = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour==18),: ] 41 | for i in [58,48,38,28,18,8,0]: 42 | tmp = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour == 17)&(feature_data.time_interval_minutes >= i),:] 43 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 44 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 45 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 46 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 47 | train418 = pd.merge(train418,tmp,on=['link_ID','time_interval_day'],how='left') 48 | 49 | train518 = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour==18),: ] 50 | for i in [58,48,38,28,18,8,0]: 51 | tmp = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour == 17)&(feature_data.time_interval_minutes >= i),:] 52 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 53 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 54 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 55 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 56 | train518 = pd.merge(train518,tmp,on=['link_ID','time_interval_day'],how='left') 57 | 58 | train517 = feature_data.loc[(feature_data.time_interval_month == 
5)&(feature_data.time_interval_begin_hour==17),: ] 59 | for i in [58,48,38,28,18,8,0]: 60 | tmp = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour == 16)&(feature_data.time_interval_minutes >= i),:] 61 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 62 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 63 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 64 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 65 | train517 = pd.merge(train517,tmp,on=['link_ID','time_interval_day'],how='left') 66 | 67 | train519 = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour==19),: ] 68 | for i in [58,48,38,28,18,8,0]: 69 | tmp = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour == 18)&(feature_data.time_interval_minutes >= i),:] 70 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 71 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 72 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 73 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 74 | train519 = pd.merge(train519,tmp,on=['link_ID','time_interval_day'],how='left') 75 | 76 | train616 = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour==16),: ] 77 | for i in [58,48,38,28,18,8,0]: 78 | tmp = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour == 15)&(feature_data.time_interval_minutes >= i),:] 79 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 80 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 81 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 82 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 83 | train616 = pd.merge(train616,tmp,on=['link_ID','time_interval_day'],how='left') 84 | 85 | ############################################################################################################################################################ 86 | 87 | train68 = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour== 8),: ] 88 | for i in [58,48,38,28,18,8,0]: 89 | tmp = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour == 7)&(feature_data.time_interval_minutes >= i),:] 90 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 91 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 92 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 93 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 94 | train68 = pd.merge(train68,tmp,on=['link_ID','time_interval_day'],how='left') 95 | 96 | train617 = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour== 17),: ] 97 | for i in [58,48,38,28,18,8,0]: 98 | tmp = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour == 16)&(feature_data.time_interval_minutes >= i),:] 99 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 100 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 101 | ('mode_%d' % (i), mode_function), ('std_%d' % 
(i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 102 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 103 | train617 = pd.merge(train617,tmp,on=['link_ID','time_interval_day'],how='left') 104 | 105 | train419 = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour==19),: ] 106 | for i in [58,48,38,28,18,8,0]: 107 | tmp = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour == 18)&(feature_data.time_interval_minutes >= i),:] 108 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 109 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 110 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 111 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 112 | train419 = pd.merge(train419,tmp,on=['link_ID','time_interval_day'],how='left') 113 | 114 | train67 = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour==7),: ] 115 | for i in [58,48,38,28,18,8,0]: 116 | tmp = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour == 6)&(feature_data.time_interval_minutes >= i),:] 117 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 118 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 119 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 120 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 121 | train67 = pd.merge(train67,tmp,on=['link_ID','time_interval_day'],how='left') 122 | 123 | train57 = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour==7),: ] 124 | for i in [58,48,38,28,18,8,0]: 125 | tmp = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour == 6)&(feature_data.time_interval_minutes >= i),:] 126 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 127 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 128 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 129 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 130 | train57 = pd.merge(train57,tmp,on=['link_ID','time_interval_day'],how='left') 131 | 132 | train318 = feature_data.loc[(feature_data.time_interval_month == 3)&(feature_data.time_interval_begin_hour==18),: ] 133 | for i in [58,48,38,28,18,8,0]: 134 | tmp = feature_data.loc[(feature_data.time_interval_month == 3)&(feature_data.time_interval_begin_hour == 17)&(feature_data.time_interval_minutes >= i),:] 135 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 136 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 137 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 138 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 139 | train318 = pd.merge(train318,tmp,on=['link_ID','time_interval_day'],how='left') 140 | 141 | train = pd.concat([train517,train518,train519,train68,train418,train419,train617,train67,train57,train318],axis=0) 142 | 143 | ############################################################################################################################################################ 144 | 145 | train_history = 
feature_data.loc[(feature_data.time_interval_month == 4),: ] 146 | train_history = train_history.groupby(['link_ID', 'time_interval_minutes'])[ 147 | 'travel_time'].agg([('mean_m', np.mean), ('median_m', np.median), 148 | ('mode_m', mode_function), ('std_m', np.std), ('max_m', np.max),('min_m', np.min)]).reset_index() 149 | # train_history['median_mode'] = 0.5 * train_history['mode_m'] + 0.5 * train_history['median_m'] 150 | 151 | train = pd.merge(train,train_history,on=['link_ID','time_interval_minutes'],how='left') 152 | 153 | train_constacot = feature_data.loc[(feature_data.time_interval_month == 4),: ] 154 | train_constacot = train_constacot.groupby(['link_ID'])[ 155 | 'travel_time'].agg([('constatic_m_1', add_constact)]).reset_index() 156 | train = pd.merge(train,train_constacot,on=['link_ID'],how='left') 157 | 158 | # train['speed_max'] = train['length'] / train['min_m'] 159 | # train['speed_min'] = train['length'] / train['max_m'] 160 | train['speed_mode'] = train['length'] / train['mode_m'] 161 | train['speed_median'] = train['length'] / train['median_m'] 162 | 163 | # train['120_speed'] = train['length'] / 120.0 164 | train['mean_std'] = train['mean_m'] / train['std_m'] 165 | train['max_min_distance'] = train['max_m'] - train['min_m'] 166 | 167 | train_8 = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour == 18),: ] 168 | train_8 = train_8.groupby(['link_ID', 'time_interval_minutes'])[ 169 | 'travel_time'].agg([('median_8_', np.median)]).reset_index() 170 | 171 | train = pd.merge(train,train_8,on=['link_ID','time_interval_minutes'],how='left') 172 | 173 | 174 | train = train.fillna(-1) 175 | train.drop(['link_ID','time_interval_begin_hour','time_interval_month','time_interval_begin','std_m','std_58'],inplace=True,axis=1) 176 | 177 | print train.shape 178 | 179 | 180 | train_label = np.log1p(train.pop('travel_time')) 181 | train_label = train_label.values 182 | train = train.values 183 | 184 | 185 | test = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour==18),: ] 186 | for i in [58,48,38,28,18,8,0]: 187 | tmp = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour == 17)&(feature_data.time_interval_minutes >= i),:] 188 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 189 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 190 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 191 | # tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 192 | test = pd.merge(test,tmp,on=['link_ID','time_interval_day'],how='left') 193 | 194 | 195 | test_history = feature_data.loc[(feature_data.time_interval_month == 5),: ] 196 | test_history = test_history.groupby(['link_ID', 'time_interval_minutes'])[ 197 | 'travel_time'].agg([('mean_m', np.mean), ('median_m', np.median), 198 | ('mode_m', mode_function), ('std_m', np.std), ('max_m', np.max),('min_m', np.min)]).reset_index() 199 | # test_history['median_mode'] = 0.5 * test_history['mode_m'] + 0.5 * test_history['median_m'] 200 | test = pd.merge(test,test_history,on=['link_ID','time_interval_minutes'],how='left') 201 | 202 | test_constacot = feature_data.loc[(feature_data.time_interval_month == 5),: ] 203 | test_constacot = test_constacot.groupby(['link_ID'])[ 204 | 'travel_time'].agg([('constatic_m_1', add_constact)]).reset_index() 205 | test = pd.merge(test,test_constacot,on=['link_ID'],how='left') 206 | 
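# The 'constatic_m_1' column merged in above is add_constact() applied to each
# link's previous-month travel times (April for train, May for test). That
# expression, sum(1/t) / sum(1/t**2), is the constant c that minimizes the
# squared relative error sum_i ((c - t_i) / t_i)**2: setting the derivative
# 2 * sum_i (c - t_i) / t_i**2 to zero gives exactly that ratio, so the feature
# is a per-link constant baseline tuned for a MAPE-like loss. The function
# below is an illustrative sketch only (never called here); it checks the
# closed form against a brute-force grid search.
def _check_add_constact():
    t = np.array([10.0, 20.0, 40.0])
    c_star = add_constact(t)                     # sum(1/t) / sum(1/t^2) = 13.33...
    grid = np.linspace(5.0, 45.0, 4001)          # 0.01 spacing
    errs = np.array([np.sum(((c - t) / t) ** 2) for c in grid])
    return c_star, grid[int(np.argmin(errs))]    # the two agree to ~0.01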
207 | # test['speed_max'] = test['length'] / test['min_m'] 208 | # test['speed_min'] = test['length'] / test['max_m'] 209 | test['speed_mode'] = test['length'] / test['mode_m'] 210 | test['speed_median'] = test['length'] / test['median_m'] 211 | 212 | # test['120_speed'] = test['length'] / 120.0 213 | test['mean_std'] = test['mean_m'] / test['std_m'] 214 | test['max_min_distance'] = test['max_m'] - test['min_m'] 215 | 216 | test_8 = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour == 18),: ] 217 | test_8 = test_8.groupby(['link_ID', 'time_interval_minutes'])[ 218 | 'travel_time'].agg([('median_8_', np.median)]).reset_index() 219 | 220 | test = pd.merge(test,test_8,on=['link_ID','time_interval_minutes'],how='left') 221 | 222 | print test.head() 223 | # analy_data_org = test.copy() 224 | # 缺失值的处理 225 | test.drop(['link_ID','time_interval_begin_hour','time_interval_month','time_interval_begin','std_m','std_58'],inplace=True,axis=1) 226 | test = test.fillna(-1) 227 | test_label = np.log1p(test.pop('travel_time')) 228 | 229 | test_label = test_label.values 230 | test = test.values 231 | 232 | 233 | print('Start training...') 234 | # train 235 | lgb_train = lgb.Dataset(train, train_label) 236 | lgb_eval = lgb.Dataset(test, test_label, reference=lgb_train) 237 | 238 | params = { 239 | 'boosting_type': 'gbdt', 240 | 'objective': 'regression', 241 | 'metric': 'rmse', 242 | 'num_leaves': 128, 243 | 'learning_rate': 0.0025, 244 | 'feature_fraction': 0.8, 245 | 'bagging_fraction': 0.8, 246 | 'bagging_freq': 3, 247 | 'verbose': 0 248 | } 249 | 250 | gbm = lgb.train(params, 251 | lgb_train, 252 | num_boost_round=3000, 253 | # init_model=gbm, 254 | fobj=mape_object, 255 | feval=mape_ln, 256 | valid_sets=lgb_eval, 257 | early_stopping_rounds = 15) 258 | 259 | print('Start predicting...') 260 | # predict 261 | sub = feature_data.loc[(feature_data.time_interval_month == 7)&(feature_data.time_interval_begin_hour==18),: ] 262 | for i in [58,48,38,28,18,8,0]: 263 | tmp = feature_data.loc[(feature_data.time_interval_month == 7)&(feature_data.time_interval_begin_hour == 17)&(feature_data.time_interval_minutes >= i),:] 264 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 265 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 266 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 267 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 268 | sub = pd.merge(sub,tmp,on=['link_ID','time_interval_day'],how='left') 269 | 270 | sub_history = feature_data.loc[(feature_data.time_interval_month == 5),: ] 271 | sub_history = sub_history.groupby(['link_ID', 'time_interval_minutes'])[ 272 | 'travel_time'].agg([('mean_m', np.mean), ('median_m', np.median), 273 | ('mode_m', mode_function), ('std_m', np.std), ('max_m', np.max),('min_m', np.min)]).reset_index() 274 | # sub_history['median_mode'] = 0.5 * sub_history['mode_m'] + 0.5 * sub_history['median_m'] 275 | 276 | sub = pd.merge(sub,sub_history,on=['link_ID','time_interval_minutes'],how='left') 277 | 278 | 279 | sub_constacot = feature_data.loc[(feature_data.time_interval_month == 5),: ] 280 | sub_constacot = sub_constacot.groupby(['link_ID'])[ 281 | 'travel_time'].agg([('constatic_m_1', add_constact)]).reset_index() 282 | sub = pd.merge(sub,sub_constacot,on=['link_ID'],how='left') 283 | 284 | # sub['speed_max'] = sub['length'] / sub['min_m'] 285 | # sub['speed_min'] = sub['length'] / sub['max_m'] 286 | 
sub['speed_mode'] = sub['length'] / sub['mode_m'] 287 | sub['speed_median'] = sub['length'] / sub['median_m'] 288 | 289 | # sub['120_speed'] = sub['length'] / 120.0 290 | 291 | sub['mean_std'] = sub['mean_m'] / sub['std_m'] 292 | sub['max_min_distance'] = sub['max_m'] - sub['min_m'] 293 | 294 | sub_history_8 = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour == 18),: ] 295 | sub_history_8 = sub_history_8.groupby(['link_ID', 'time_interval_minutes'])[ 296 | 'travel_time'].agg([('median_8_', np.median)]).reset_index() 297 | 298 | sub = pd.merge(sub,sub_history_8,on=['link_ID','time_interval_minutes'],how='left') 299 | 300 | print sub.head() 301 | 302 | sub_label = np.log1p(sub.pop('travel_time')) 303 | 304 | sub.drop(['link_ID','time_interval_begin_hour','time_interval_month','time_interval_begin','std_m','std_58'],inplace=True,axis=1) 305 | sub = sub.values 306 | 307 | result = gbm.predict(sub, num_iteration=gbm.best_iteration) 308 | 309 | travel_time = pd.DataFrame({'travel_time':list(result)}) 310 | sub_demo = pd.read_table(u'./semifinal_gy_cmp_testing_template_seg2.txt',header=None,sep=';') 311 | 312 | sub_demo.columns = ['link_ID','date','time_interval','travel_time'] 313 | sub_demo = sub_demo.sort_values(['link_ID','time_interval']).reset_index() 314 | del sub_demo['index'] 315 | del sub_demo['travel_time'] 316 | tt = pd.concat([sub_demo,travel_time],axis=1) 317 | # tt = tt.fillna(0) 318 | tt['travel_time'] = np.round(np.expm1(tt['travel_time']),6) 319 | tt[['link_ID','date','time_interval','travel_time']].to_csv('./2017-09-16_18.txt',sep='#',index=False,header=False) 320 | print tt[['link_ID','date','time_interval','travel_time']].shape 321 | print tt[['link_ID','date','time_interval','travel_time']].isnull().sum() 322 | 323 | 324 | # analy_data = gbm.predict(test, num_iteration=gbm.best_iteration) 325 | # analy_data = pd.DataFrame({'pre_travel_time':list(analy_data)}) 326 | # analy_data_sub = pd.concat([analy_data,analy_data_org],axis=1) 327 | # analy_data_sub['pre_travel_time'] = np.round(np.expm1(analy_data_sub['pre_travel_time']),6) 328 | # analy_data_sub.to_csv('./analy_data_18.txt') 329 | # print analy_data_sub.shape 330 | 331 | # [874] valid_0's rmse: 0.447699 valid_0's mape: 0.280259 332 | # [840] valid_0's rmse: 0.454924 valid_0's mape: 0.278944 333 | # [1515] valid_0's rmse: 0.451254 valid_0's mape: 0.278442 334 | 335 | # [1704] valid_0's rmse: 0.453357 valid_0's mape: 0.278397 336 | 337 | # 3 4 5 6 7 338 | # [1776] valid_0's rmse: 0.452663 valid_0's mape: 0.275271 -------------------------------------------------------------------------------- /session_2/lightgbm_15.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # pylint: disable = invalid-name, C0111 3 | import lightgbm as lgb 4 | import pandas as pd 5 | import numpy as np 6 | import gc 7 | 8 | from scipy.stats import mode 9 | # custom objective: MAPE gradient and a surrogate hessian 10 | def mape_object(y,d): 11 | d = d.get_label() 12 | g = 1.0*np.sign(y-d)/d 13 | h = 1.0/d 14 | return g,h 15 | # evaluation metric: labels are log1p-transformed, so compare on the expm1 scale 16 | def mape_ln(y,d): 17 | c=d.get_label() 18 | result= np.sum(np.abs(np.expm1(y)-np.abs(np.expm1(c)))/np.abs(np.expm1(c)))/len(c) 19 | return "mape",result,False 20 | 21 | def add_constact(df): 22 | return np.sum(1.0/df) / np.sum(1.0/df/df) 23 | # mode (most frequent value) of the integer-cast travel times 24 | def mode_function(df): 25 | df = df.astype(int) 26 | counts = mode(df) 27 | return counts[0][0] 28 | # load or create your dataset 29 | 30 | print('Load data...') 31 | 32 | print u'15' 33 | feature_data = 
pd.read_csv('./pre_data/feature_data.csv') 34 | feature_data['link_ID'] = feature_data['link_ID'].astype(str) 35 | week = pd.get_dummies(feature_data['time_interval_week'],prefix='week') 36 | feature_data.drop(['time_interval_week','link_class'],inplace=True,axis=1) 37 | feature_data = pd.concat([feature_data,week],axis=1) 38 | print feature_data.head() 39 | 40 | 41 | train415 = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour==15),: ] 42 | for i in [58,48,38,28,18,8,0]: 43 | tmp = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour == 14)&(feature_data.time_interval_minutes >= i),:] 44 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 45 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 46 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 47 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 48 | train415 = pd.merge(train415,tmp,on=['link_ID','time_interval_day'],how='left') 49 | 50 | train515 = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour==15),: ] 51 | for i in [58,48,38,28,18,8,0]: 52 | tmp = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour == 14)&(feature_data.time_interval_minutes >= i),:] 53 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 54 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 55 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 56 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 57 | train515 = pd.merge(train515,tmp,on=['link_ID','time_interval_day'],how='left') 58 | 59 | train516 = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour==16),: ] 60 | for i in [58,48,38,28,18,8,0]: 61 | tmp = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour == 15)&(feature_data.time_interval_minutes >= i),:] 62 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 63 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 64 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 65 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 66 | train516 = pd.merge(train516,tmp,on=['link_ID','time_interval_day'],how='left') 67 | 68 | train518 = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour==18),: ] 69 | for i in [58,48,38,28,18,8,0]: 70 | tmp = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour == 17)&(feature_data.time_interval_minutes >= i),:] 71 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 72 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 73 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 74 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 75 | train518 = pd.merge(train518,tmp,on=['link_ID','time_interval_day'],how='left') 76 | 77 | ############################################################################################################################################################ 78 | 79 | train613 = feature_data.loc[(feature_data.time_interval_month == 
6)&(feature_data.time_interval_begin_hour== 13),: ] 80 | for i in [58,48,38,28,18,8,0]: 81 | tmp = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour == 12)&(feature_data.time_interval_minutes >= i),:] 82 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 83 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 84 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 85 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 86 | train613 = pd.merge(train613,tmp,on=['link_ID','time_interval_day'],how='left') 87 | 88 | train616 = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour== 16),: ] 89 | for i in [58,48,38,28,18,8,0]: 90 | tmp = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour == 15)&(feature_data.time_interval_minutes >= i),:] 91 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 92 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 93 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 94 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 95 | train616 = pd.merge(train616,tmp,on=['link_ID','time_interval_day'],how='left') 96 | 97 | train418 = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour==18),: ] 98 | for i in [58,48,38,28,18,8,0]: 99 | tmp = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour == 17)&(feature_data.time_interval_minutes >= i),:] 100 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 101 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 102 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 103 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 104 | train418 = pd.merge(train418,tmp,on=['link_ID','time_interval_day'],how='left') 105 | 106 | train68 = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour==8),: ] 107 | for i in [58,48,38,28,18,8,0]: 108 | tmp = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour == 7)&(feature_data.time_interval_minutes >= i),:] 109 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 110 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 111 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 112 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 113 | train68 = pd.merge(train68,tmp,on=['link_ID','time_interval_day'],how='left') 114 | 115 | train58 = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour==8),: ] 116 | for i in [58,48,38,28,18,8,0]: 117 | tmp = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour == 7)&(feature_data.time_interval_minutes >= i),:] 118 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 119 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 120 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 121 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 122 | train58 
= pd.merge(train58,tmp,on=['link_ID','time_interval_day'],how='left') 123 | 124 | train48 = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour==8),: ] 125 | for i in [58,48,38,28,18,8,0]: 126 | tmp = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour == 7)&(feature_data.time_interval_minutes >= i),:] 127 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 128 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 129 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 130 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 131 | train48 = pd.merge(train48,tmp,on=['link_ID','time_interval_day'],how='left') 132 | 133 | train315 = feature_data.loc[(feature_data.time_interval_month == 3)&(feature_data.time_interval_begin_hour==15),: ] 134 | for i in [58,48,38,28,18,8,0]: 135 | tmp = feature_data.loc[(feature_data.time_interval_month == 3)&(feature_data.time_interval_begin_hour == 14)&(feature_data.time_interval_minutes >= i),:] 136 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 137 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 138 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 139 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 140 | train315 = pd.merge(train315,tmp,on=['link_ID','time_interval_day'],how='left') 141 | 142 | 143 | train = pd.concat([train515,train518,train415,train616,train613,train516,train418,train68,train58,train48,train315],axis=0) 144 | 145 | ############################################################################################################################################################ 146 | 147 | train_history = feature_data.loc[(feature_data.time_interval_month == 4),: ] 148 | train_history = train_history.groupby(['link_ID', 'time_interval_minutes'])[ 149 | 'travel_time'].agg([('mean_m', np.mean), ('median_m', np.median), 150 | ('mode_m', mode_function), ('std_m', np.std), ('max_m', np.max),('min_m', np.min)]).reset_index() 151 | # train_history['median_mode'] = 0.5 * train_history['mode_m'] + 0.5 * train_history['median_m'] 152 | 153 | train = pd.merge(train,train_history,on=['link_ID','time_interval_minutes'],how='left') 154 | 155 | 156 | train_constacot = feature_data.loc[(feature_data.time_interval_month == 4),: ] 157 | train_constacot = train_constacot.groupby(['link_ID'])[ 158 | 'travel_time'].agg([('constatic_m_1', add_constact)]).reset_index() 159 | train = pd.merge(train,train_constacot,on=['link_ID'],how='left') 160 | 161 | # train['speed_max'] = train['length'] / train['min_m'] 162 | # train['speed_min'] = train['length'] / train['max_m'] 163 | train['speed_mode'] = train['length'] / train['mode_m'] 164 | train['speed_median'] = train['length'] / train['median_m'] 165 | 166 | # train['120_speed'] = train['length'] / 120.0 167 | train['mean_std'] = train['mean_m'] / train['std_m'] 168 | train['max_min_distance'] = train['max_m'] - train['min_m'] 169 | 170 | train_8 = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour == 15),: ] 171 | train_8 = train_8.groupby(['link_ID', 'time_interval_minutes'])[ 172 | 'travel_time'].agg([('median_8_', np.median)]).reset_index() 173 | 174 | train = pd.merge(train,train_8,on=['link_ID','time_interval_minutes'],how='left') 175 | 176 | 177 | train = 
train.fillna(-1) 178 | train.drop(['link_ID','time_interval_begin_hour','time_interval_month','time_interval_begin','std_m','std_58','max_m'],inplace=True,axis=1) 179 | print train.shape 180 | train_label = np.log1p(train.pop('travel_time')) 181 | train_label = train_label.values 182 | train = train.values 183 | 184 | 185 | test = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour==15),: ] 186 | for i in [58,48,38,28,18,8,0]: 187 | tmp = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour == 14)&(feature_data.time_interval_minutes >= i),:] 188 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 189 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 190 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 191 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 192 | test = pd.merge(test,tmp,on=['link_ID','time_interval_day'],how='left') 193 | 194 | 195 | test_history = feature_data.loc[(feature_data.time_interval_month == 5),: ] 196 | test_history = test_history.groupby(['link_ID', 'time_interval_minutes'])[ 197 | 'travel_time'].agg([('mean_m', np.mean), ('median_m', np.median), 198 | ('mode_m', mode_function), ('std_m', np.std), ('max_m', np.max),('min_m', np.min)]).reset_index() 199 | # test_history['median_mode'] = 0.5 * test_history['mode_m'] + 0.5 * test_history['median_m'] 200 | test = pd.merge(test,test_history,on=['link_ID','time_interval_minutes'],how='left') 201 | 202 | 203 | test_constacot = feature_data.loc[(feature_data.time_interval_month == 5),: ] 204 | test_constacot = test_constacot.groupby(['link_ID'])[ 205 | 'travel_time'].agg([('constatic_m_1', add_constact)]).reset_index() 206 | test = pd.merge(test,test_constacot,on=['link_ID'],how='left') 207 | 208 | 209 | # test['speed_max'] = test['length'] / test['min_m'] 210 | # test['speed_min'] = test['length'] / test['max_m'] 211 | test['speed_mode'] = test['length'] / test['mode_m'] 212 | test['speed_median'] = test['length'] / test['median_m'] 213 | 214 | # test['120_speed'] = test['length'] / 120.0 215 | test['mean_std'] = test['mean_m'] / test['std_m'] 216 | test['max_min_distance'] = test['max_m'] - test['min_m'] 217 | 218 | test_8 = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour == 15),: ] 219 | test_8 = test_8.groupby(['link_ID', 'time_interval_minutes'])[ 220 | 'travel_time'].agg([('median_8_', np.median)]).reset_index() 221 | 222 | test = pd.merge(test,test_8,on=['link_ID','time_interval_minutes'],how='left') 223 | 224 | print test.head() 225 | # analy_data_org = test.copy() 226 | # Handle missing values 227 | test.drop(['link_ID','time_interval_begin_hour','time_interval_month','time_interval_begin','std_m','std_58','max_m'],inplace=True,axis=1) 228 | test = test.fillna(-1) 229 | test_label = np.log1p(test.pop('travel_time')) 230 | 231 | test_label = test_label.values 232 | test = test.values 233 | 234 | 235 | print('Start training...') 236 | # train 237 | lgb_train = lgb.Dataset(train, train_label) 238 | lgb_eval = lgb.Dataset(test, test_label, reference=lgb_train) 239 | 240 | params = { 241 | 'boosting_type': 'gbdt', 242 | 'objective': 'regression', 243 | 'metric': 'rmse', 244 | 'num_leaves': 128, 245 | 'learning_rate': 0.002, 246 | 'feature_fraction': 0.8, 247 | 'bagging_fraction': 0.8, 248 | 'bagging_freq': 5, 249 | 'verbose': 0 250 | } 251 | 252 | gbm = lgb.train(params, 253 | 
lgb_train, 254 | num_boost_round=5000, 255 | # init_model=gbm, 256 | fobj=mape_object, 257 | feval=mape_ln, 258 | valid_sets=lgb_eval, 259 | early_stopping_rounds = 5) 260 | 261 | print('Start predicting...') 262 | # predict 263 | sub = feature_data.loc[(feature_data.time_interval_month == 7)&(feature_data.time_interval_begin_hour==15),: ] 264 | for i in [58,48,38,28,18,8,0]: 265 | tmp = feature_data.loc[(feature_data.time_interval_month == 7)&(feature_data.time_interval_begin_hour == 14)&(feature_data.time_interval_minutes >= i),:] 266 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 267 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 268 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 269 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 270 | sub = pd.merge(sub,tmp,on=['link_ID','time_interval_day'],how='left') 271 | 272 | sub_history = feature_data.loc[(feature_data.time_interval_month == 5),: ] 273 | sub_history = sub_history.groupby(['link_ID', 'time_interval_minutes'])[ 274 | 'travel_time'].agg([('mean_m', np.mean), ('median_m', np.median), 275 | ('mode_m', mode_function), ('std_m', np.std), ('max_m', np.max),('min_m', np.min)]).reset_index() 276 | # sub_history['median_mode'] = 0.5 * sub_history['mode_m'] + 0.5 * sub_history['median_m'] 277 | 278 | sub = pd.merge(sub,sub_history,on=['link_ID','time_interval_minutes'],how='left') 279 | 280 | 281 | sub_constacot = feature_data.loc[(feature_data.time_interval_month == 5),: ] 282 | sub_constacot = sub_constacot.groupby(['link_ID'])[ 283 | 'travel_time'].agg([('constatic_m_1', add_constact)]).reset_index() 284 | sub = pd.merge(sub,sub_constacot,on=['link_ID'],how='left') 285 | 286 | 287 | # sub['speed_max'] = sub['length'] / sub['min_m'] 288 | # sub['speed_min'] = sub['length'] / sub['max_m'] 289 | sub['speed_mode'] = sub['length'] / sub['mode_m'] 290 | sub['speed_median'] = sub['length'] / sub['median_m'] 291 | 292 | # sub['120_speed'] = sub['length'] / 120.0 293 | 294 | sub['mean_std'] = sub['mean_m'] / sub['std_m'] 295 | sub['max_min_distance'] = sub['max_m'] - sub['min_m'] 296 | 297 | sub_history_8 = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour == 15),: ] 298 | sub_history_8 = sub_history_8.groupby(['link_ID', 'time_interval_minutes'])[ 299 | 'travel_time'].agg([('median_8_', np.median)]).reset_index() 300 | 301 | sub = pd.merge(sub,sub_history_8,on=['link_ID','time_interval_minutes'],how='left') 302 | 303 | print sub.head() 304 | 305 | sub_label = np.log1p(sub.pop('travel_time')) 306 | 307 | sub.drop(['link_ID','time_interval_begin_hour','time_interval_month','time_interval_begin','std_m','std_58','max_m'],inplace=True,axis=1) 308 | sub = sub.values 309 | 310 | result = gbm.predict(sub, num_iteration=gbm.best_iteration) 311 | 312 | travel_time = pd.DataFrame({'travel_time':list(result)}) 313 | sub_demo = pd.read_table(u'./semifinal_gy_cmp_testing_template_seg2.txt',header=None,sep=';') 314 | 315 | sub_demo.columns = ['link_ID','date','time_interval','travel_time'] 316 | sub_demo = sub_demo.sort_values(['link_ID','time_interval']).reset_index() 317 | del sub_demo['index'] 318 | del sub_demo['travel_time'] 319 | tt = pd.concat([sub_demo,travel_time],axis=1) 320 | # tt = tt.fillna(0) 321 | tt['travel_time'] = np.round(np.expm1(tt['travel_time']),6) 322 | 
tt[['link_ID','date','time_interval','travel_time']].to_csv('./2017-09-16_15.txt',sep='#',index=False,header=False) 323 | print tt[['link_ID','date','time_interval','travel_time']].shape 324 | print tt[['link_ID','date','time_interval','travel_time']].isnull().sum() 325 | 326 | # analy_data = gbm.predict(test, num_iteration=gbm.best_iteration) 327 | # analy_data = pd.DataFrame({'pre_travel_time':list(analy_data)}) 328 | # analy_data_sub = pd.concat([analy_data,analy_data_org],axis=1) 329 | # analy_data_sub['pre_travel_time'] = np.round(np.expm1(analy_data_sub['pre_travel_time']),6) 330 | # analy_data_sub.to_csv('./analy_data_15.txt') 331 | # print analy_data_sub.shape 332 | 333 | 334 | # [912] valid_0's rmse: 0.458532 valid_0's mape: 0.271816 335 | # [1025] valid_0's rmse: 0.456003 valid_0's mape: 0.271218 336 | # [1004] valid_0's rmse: 0.457181 valid_0's mape: 0.271113 337 | # [1068] valid_0's rmse: 0.455887 valid_0's mape: 0.270535 338 | # [1796] valid_0's rmse: 0.455726 valid_0's mape: 0.270582 339 | 340 | # [2615] valid_0's rmse: 0.456423 valid_0's mape: 0.270494 341 | 342 | # 3 4 5 6 7 343 | # [2479] valid_0's rmse: 0.454904 valid_0's mape: 0.270447 344 | 345 | -------------------------------------------------------------------------------- /session_2/get_Feat_Aug_15.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import pandas as pd 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | 7 | def mape_object(y,d): 8 | 9 | g=1.0*np.sign(y-d)/d 10 | h=1.0/d 11 | return -g,h 12 | def add_constact(df): 13 | return np.sum(1.0/df) / np.sum(1.0/df/df) 14 | # Evaluation metric: MAPE 15 | def mape(y,d): 16 | c=d.get_label() 17 | result= np.sum(np.abs(y-c)/c)/len(c) 18 | return "mape",result 19 | 20 | # Evaluation metric: MAPE after undoing the log1p transform with expm1 21 | def mape_ln(y,d): 22 | c=d.get_label() 23 | result= np.sum(np.abs(np.expm1(y)-np.abs(np.expm1(c)))/np.abs(np.expm1(c)))/len(c) 24 | return "mape",result 25 | 26 | def AddBaseTimeFeature(df): 27 | 28 | df['time_interval_begin'] = pd.to_datetime(df['time_interval'].map(lambda x: x[1:20])) 29 | df = df.drop(['date', 'time_interval'], axis=1) 30 | df['time_interval_month'] = df['time_interval_begin'].map(lambda x: x.strftime('%m')) 31 | # df['time_interval_year'] = df['time_interval_begin'].map(lambda x: x.strftime('%Y')) 32 | df['time_interval_day'] = df['time_interval_begin'].map(lambda x: x.day) 33 | df['time_interval_begin_hour'] = df['time_interval_begin'].map(lambda x: x.strftime('%H')) 34 | df['time_interval_minutes'] = df['time_interval_begin'].map(lambda x: x.strftime('%M')) 35 | # Monday=1, Sunday=7 36 | df['time_interval_week'] = df['time_interval_begin'].map(lambda x: x.weekday() + 1) 37 | return df 38 | 39 | 40 | # txt => csv 41 | # link_info = pd.read_table('./gy_contest_link_info.txt',sep=';') 42 | # link_info = link_info.sort_values('link_ID') 43 | 44 | # training_data = pd.read_table(u'./quaterfinal_gy_cmp_training_traveltime.txt',sep=';') 45 | # training_data.columns = ['link_ID', 'date', 'time_interval', 'travel_time'] 46 | # print training_data.head() 47 | # print training_data.shape 48 | # training_data = pd.merge(training_data,link_info,on='link_ID') 49 | 50 | # testing_data = pd.read_table(u'./quaterfinal_gy_cmp_testing_template_seg1(update).txt',sep=';',header=None) 51 | # testing_data.columns = ['link_ID', 'date', 'time_interval', 'travel_time'] 52 | # testing_data = pd.merge(testing_data,link_info,on='link_ID') 53 | # testing_data['travel_time'] = np.NaN 54 | # print testing_data.head() 55 | # print 
testing_data.shape 56 | # feature_date = pd.concat([training_data,testing_data],axis=0) 57 | 58 | # feature_date = feature_date.sort_values(['link_ID','time_interval']) 59 | # print feature_date 60 | # feature_date.to_csv('./pre_data/feature_data.csv',index=False) 61 | 62 | # feature_data = pd.read_csv('./pre_data/feature_data.csv') 63 | # feature_data = feature_data[feature_data['date']>'2016-10-01'] 64 | # print feature_data 65 | # feature_data_date = AddBaseTimeFeature(feature_data) 66 | # print feature_data_date 67 | # feature_data_date.to_csv('./pre_data/feature_data.csv',index=False) 68 | 69 | 70 | # # test 71 | # feature_data = pd.read_csv('./pre_data/feature_data.csv') 72 | # test = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour==8),: ] 73 | # test.to_csv('./pre_data/test.csv',index=False) 74 | 75 | import gc 76 | 77 | from scipy.stats import mode 78 | # Mode (most frequent value) of the integer-cast travel times, not the median 79 | def mode_function(df): 80 | df = df.astype(int) 81 | counts = mode(df) 82 | return counts[0][0] 83 | 84 | print u'15' 85 | feature_data = pd.read_csv('./pre_data/feature_data.csv') 86 | feature_data['link_ID'] = feature_data['link_ID'].astype(str) 87 | # link_info_count = pd.read_csv('./pre_data/link_info_count.csv') 88 | # link_info_count['link_ID'] = link_info_count['link_ID'].astype(str) 89 | # feature_data = pd.merge(feature_data,link_info_count,on='link_ID',how='left') 90 | # link_class = pd.get_dummies(feature_data['link_class'],prefix='link_class') 91 | # int_count_onehot = pd.get_dummies(feature_data['in_count_'],prefix='in_count') 92 | # out_count_onehot = pd.get_dummies(feature_data['out_count_'],prefix='out_count') 93 | week = pd.get_dummies(feature_data['time_interval_week'],prefix='week') 94 | # time_interval_minutes = pd.get_dummies(feature_data['time_interval_minutes'],prefix='time_interval_minutes') 95 | # day = pd.get_dummies(feature_data['time_interval_day'],prefix='day') 96 | feature_data.drop(['time_interval_week','link_class'],inplace=True,axis=1) 97 | # linkId = pd.get_dummies(feature_data['link_ID'],prefix='link_id') 98 | feature_data = pd.concat([feature_data,week],axis=1) 99 | print feature_data.head() 100 | 101 | train415 = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour==15),: ] 102 | for i in [58,48,38,28,18,8,0]: 103 | tmp = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour == 14)&(feature_data.time_interval_minutes >= i),:] 104 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 105 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 106 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 107 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 108 | train415 = pd.merge(train415,tmp,on=['link_ID','time_interval_day'],how='left') 109 | 110 | train515 = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour==15),: ] 111 | for i in [58,48,38,28,18,8,0]: 112 | tmp = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour == 14)&(feature_data.time_interval_minutes >= i),:] 113 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 114 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 115 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 116 | tmp['std_%d' % (i)] 
= tmp['std_%d' % (i)].fillna(0) 117 | train515 = pd.merge(train515,tmp,on=['link_ID','time_interval_day'],how='left') 118 | 119 | train516 = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour==16),: ] 120 | for i in [58,48,38,28,18,8,0]: 121 | tmp = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour == 15)&(feature_data.time_interval_minutes >= i),:] 122 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 123 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 124 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 125 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 126 | train516 = pd.merge(train516,tmp,on=['link_ID','time_interval_day'],how='left') 127 | 128 | train518 = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour==18),: ] 129 | for i in [58,48,38,28,18,8,0]: 130 | tmp = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour == 17)&(feature_data.time_interval_minutes >= i),:] 131 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 132 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 133 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 134 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 135 | train518 = pd.merge(train518,tmp,on=['link_ID','time_interval_day'],how='left') 136 | 137 | ############################################################################################################################################################# 138 | 139 | train613 = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour== 13),: ] 140 | for i in [58,48,38,28,18,8,0]: 141 | tmp = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour == 12)&(feature_data.time_interval_minutes >= i),:] 142 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 143 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 144 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 145 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 146 | train613 = pd.merge(train613,tmp,on=['link_ID','time_interval_day'],how='left') 147 | 148 | train616 = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour== 16),: ] 149 | for i in [58,48,38,28,18,8,0]: 150 | tmp = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour == 15)&(feature_data.time_interval_minutes >= i),:] 151 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 152 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 153 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 154 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 155 | train616 = pd.merge(train616,tmp,on=['link_ID','time_interval_day'],how='left') 156 | 157 | train418 = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour==18),: ] 158 | for i in [58,48,38,28,18,8,0]: 159 | tmp = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour == 
17)&(feature_data.time_interval_minutes >= i),:] 160 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 161 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 162 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 163 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 164 | train418 = pd.merge(train418,tmp,on=['link_ID','time_interval_day'],how='left') 165 | 166 | train68 = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour==8),: ] 167 | for i in [58,48,38,28,18,8,0]: 168 | tmp = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour == 7)&(feature_data.time_interval_minutes >= i),:] 169 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 170 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 171 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 172 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 173 | train68 = pd.merge(train68,tmp,on=['link_ID','time_interval_day'],how='left') 174 | 175 | train58 = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour==8),: ] 176 | for i in [58,48,38,28,18,8,0]: 177 | tmp = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour == 7)&(feature_data.time_interval_minutes >= i),:] 178 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 179 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 180 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 181 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 182 | train58 = pd.merge(train58,tmp,on=['link_ID','time_interval_day'],how='left') 183 | 184 | train48 = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour==8),: ] 185 | for i in [58,48,38,28,18,8,0]: 186 | tmp = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour == 7)&(feature_data.time_interval_minutes >= i),:] 187 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 188 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 189 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 190 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 191 | train48 = pd.merge(train48,tmp,on=['link_ID','time_interval_day'],how='left') 192 | 193 | train315 = feature_data.loc[(feature_data.time_interval_month == 3)&(feature_data.time_interval_begin_hour==15),: ] 194 | for i in [58,48,38,28,18,8,0]: 195 | tmp = feature_data.loc[(feature_data.time_interval_month == 3)&(feature_data.time_interval_begin_hour == 14)&(feature_data.time_interval_minutes >= i),:] 196 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 197 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 198 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 199 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 200 | train315 = pd.merge(train315,tmp,on=['link_ID','time_interval_day'],how='left') 201 | 202 | # train = pd.concat([train515,train518,train415,train616,train613,train516,train418],axis=0) 203 | 
train = pd.concat([train515,train518,train415,train616,train613,train516,train418,train68,train58,train48,train315],axis=0) 204 | 205 | train_history = feature_data.loc[(feature_data.time_interval_month == 4),: ] 206 | train_history = train_history.groupby(['link_ID', 'time_interval_minutes'])[ 207 | 'travel_time'].agg([('mean_m', np.mean), ('median_m', np.median), 208 | ('mode_m', mode_function), ('std_m', np.std), ('max_m', np.max),('min_m', np.min)]).reset_index() 209 | # train_history['median_mode'] = 0.5 * train_history['mode_m'] + 0.5 * train_history['median_m'] 210 | 211 | train = pd.merge(train,train_history,on=['link_ID','time_interval_minutes'],how='left') 212 | # train['speed_max'] = train['length'] / train['min_m'] 213 | # train['speed_min'] = train['length'] / train['max_m'] 214 | train['speed_mode'] = train['length'] / train['mode_m'] 215 | train['speed_median'] = train['length'] / train['median_m'] 216 | 217 | # train['120_speed'] = train['length'] / 120.0 218 | train['mean_std'] = train['mean_m'] / train['std_m'] 219 | train['max_min_distance'] = train['max_m'] - train['min_m'] 220 | 221 | train_8 = feature_data.loc[(feature_data.time_interval_month == 4)&(feature_data.time_interval_begin_hour == 15),: ] 222 | train_8 = train_8.groupby(['link_ID', 'time_interval_minutes'])[ 223 | 'travel_time'].agg([('median_8_', np.median)]).reset_index() 224 | 225 | train = pd.merge(train,train_8,on=['link_ID','time_interval_minutes'],how='left') 226 | 227 | print train.shape 228 | train = train.fillna(-1) 229 | train_label = np.log1p(train.pop('travel_time')) 230 | # validation = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour==8),: ] 231 | 232 | 233 | test = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour==15),: ] 234 | for i in [58,48,38,28,18,8,0]: 235 | tmp = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour == 14)&(feature_data.time_interval_minutes >= i),:] 236 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 237 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 238 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 239 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 240 | test = pd.merge(test,tmp,on=['link_ID','time_interval_day'],how='left') 241 | 242 | 243 | test_history = feature_data.loc[(feature_data.time_interval_month == 5),: ] 244 | test_history = test_history.groupby(['link_ID', 'time_interval_minutes'])[ 245 | 'travel_time'].agg([('mean_m', np.mean), ('median_m', np.median), 246 | ('mode_m', mode_function), ('std_m', np.std), ('max_m', np.max),('min_m', np.min)]).reset_index() 247 | # test_history['median_mode'] = 0.5 * test_history['mode_m'] + 0.5 * test_history['median_m'] 248 | test = pd.merge(test,test_history,on=['link_ID','time_interval_minutes'],how='left') 249 | 250 | # test['speed_max'] = test['length'] / test['min_m'] 251 | # test['speed_min'] = test['length'] / test['max_m'] 252 | test['speed_mode'] = test['length'] / test['mode_m'] 253 | test['speed_median'] = test['length'] / test['median_m'] 254 | 255 | # test['120_speed'] = test['length'] / 120.0 256 | test['mean_std'] = test['mean_m'] / test['std_m'] 257 | test['max_min_distance'] = test['max_m'] - test['min_m'] 258 | 259 | test_8 = feature_data.loc[(feature_data.time_interval_month == 5)&(feature_data.time_interval_begin_hour == 15),: ] 260 | 
test_8 = test_8.groupby(['link_ID', 'time_interval_minutes'])[ 261 | 'travel_time'].agg([('median_8_', np.median)]).reset_index() 262 | 263 | test = pd.merge(test,test_8,on=['link_ID','time_interval_minutes'],how='left') 264 | 265 | print test.head() 266 | 267 | # Handle missing values 268 | 269 | test = test.fillna(-1) 270 | test_label = np.log1p(test.pop('travel_time')) 271 | 272 | train.drop(['link_ID','time_interval_begin_hour','time_interval_month','time_interval_begin','std_m','std_58','max_m'],inplace=True,axis=1) 273 | test.drop(['link_ID','time_interval_begin_hour','time_interval_month','time_interval_begin','std_m','std_58','max_m'],inplace=True,axis=1) 274 | 275 | 276 | import xgboost as xgb 277 | print xgb.__version__ 278 | 279 | xlf = xgb.XGBRegressor(max_depth=11, 280 | learning_rate=0.005, 281 | n_estimators=4000, 282 | silent=True, 283 | objective=mape_object, 284 | gamma=0, 285 | min_child_weight=5, 286 | max_delta_step=0, 287 | subsample=0.8, 288 | colsample_bytree=0.8, 289 | colsample_bylevel=1, 290 | reg_alpha=1e0, 291 | reg_lambda=0, 292 | scale_pos_weight=1, 293 | seed=9, 294 | missing=None) 295 | 296 | xlf.fit(train.values, train_label.values, eval_metric=mape_ln, verbose=True, eval_set=[(test.values, test_label.values)],early_stopping_rounds=2) 297 | # xlf.fit(train.values, train_label.values, eval_metric=mape_ln, verbose=True, eval_set=[(test.values, test_label.values)],early_stopping_rounds=2) 298 | print xlf.get_params() 299 | 300 | 301 | sub = feature_data.loc[(feature_data.time_interval_month == 7)&(feature_data.time_interval_begin_hour==15),: ] 302 | for i in [58,48,38,28,18,8,0]: 303 | tmp = feature_data.loc[(feature_data.time_interval_month == 7)&(feature_data.time_interval_begin_hour == 14)&(feature_data.time_interval_minutes >= i),:] 304 | tmp = tmp.groupby(['link_ID', 'time_interval_day'])[ 305 | 'travel_time'].agg([('mean_%d' % (i), np.mean), ('median_%d' % (i), np.median), 306 | ('mode_%d' % (i), mode_function), ('std_%d' % (i), np.std), ('max_%d' % (i), np.max),('min_%d' % (i), np.min)]).reset_index() 307 | tmp['std_%d' % (i)] = tmp['std_%d' % (i)].fillna(0) 308 | sub = pd.merge(sub,tmp,on=['link_ID','time_interval_day'],how='left') 309 | 310 | sub_history = feature_data.loc[(feature_data.time_interval_month == 5),: ] 311 | sub_history = sub_history.groupby(['link_ID', 'time_interval_minutes'])[ 312 | 'travel_time'].agg([('mean_m', np.mean), ('median_m', np.median), 313 | ('mode_m', mode_function), ('std_m', np.std), ('max_m', np.max),('min_m', np.min)]).reset_index() 314 | # sub_history['median_mode'] = 0.5 * sub_history['mode_m'] + 0.5 * sub_history['median_m'] 315 | 316 | sub = pd.merge(sub,sub_history,on=['link_ID','time_interval_minutes'],how='left') 317 | # sub['speed_max'] = sub['length'] / sub['min_m'] 318 | # sub['speed_min'] = sub['length'] / sub['max_m'] 319 | sub['speed_mode'] = sub['length'] / sub['mode_m'] 320 | sub['speed_median'] = sub['length'] / sub['median_m'] 321 | 322 | # sub['120_speed'] = sub['length'] / 120.0 323 | 324 | sub['mean_std'] = sub['mean_m'] / sub['std_m'] 325 | sub['max_min_distance'] = sub['max_m'] - sub['min_m'] 326 | 327 | sub_history_8 = feature_data.loc[(feature_data.time_interval_month == 6)&(feature_data.time_interval_begin_hour == 15),: ] 328 | sub_history_8 = sub_history_8.groupby(['link_ID', 'time_interval_minutes'])[ 329 | 'travel_time'].agg([('median_8_', np.median)]).reset_index() 330 | 331 | sub = pd.merge(sub,sub_history_8,on=['link_ID','time_interval_minutes'],how='left') 332 | 333 | print sub.head() 334 | 335 | 
sub_label = np.log1p(sub.pop('travel_time')) 336 | sub = sub.fillna(-1) 337 | sub.drop(['link_ID','time_interval_begin_hour','time_interval_month','time_interval_begin','std_m','std_58','max_m'],inplace=True,axis=1) 338 | 339 | result = xlf.predict(sub.values) 340 | 341 | travel_time = pd.DataFrame({'travel_time':list(result)}) 342 | sub_demo = pd.read_table(u'./semifinal_gy_cmp_testing_template_seg2.txt',header=None,sep=';') 343 | 344 | sub_demo.columns = ['link_ID','date','time_interval','travel_time'] 345 | sub_demo = sub_demo.sort_values(['link_ID','time_interval']).reset_index() 346 | del sub_demo['index'] 347 | del sub_demo['travel_time'] 348 | tt = pd.concat([sub_demo,travel_time],axis=1) 349 | # tt = tt.fillna(0) 350 | tt['travel_time'] = np.round(np.expm1(tt['travel_time']),6) 351 | tt[['link_ID','date','time_interval','travel_time']].to_csv('./2017-09-15_15_xgb.txt',sep='#',index=False,header=False) 352 | print tt[['link_ID','date','time_interval','travel_time']].shape 353 | print tt[['link_ID','date','time_interval','travel_time']].isnull().sum() 354 | 355 | 356 | # [430] validation_0-rmse:0.443893 validation_0-mape:0.273708 357 | # [851] validation_0-rmse:0.443482 validation_0-mape:0.273409 358 | 359 | # [700] validation_0-rmse:0.444018 validation_0-mape:0.273479 360 | # 3 4 5 6 7 361 | # [853] validation_0-rmse:0.440541 validation_0-mape:0.273385 --------------------------------------------------------------------------------