├── analysis
│   ├── __init__.py
│   ├── analysis.py
│   └── read_item.py
├── README.md
├── tools.py
├── main2.py
├── merge.py
├── model_stacking1.py
├── model_stacking.py
├── model.py
└── gen.py

/analysis/__init__.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -- coding:utf-8 --
--------------------------------------------------------------------------------

/analysis/analysis.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -- coding:utf-8 --

import pandas as pd
import numpy as np

frame = pd.read_csv('../dataset/2016-07-01/v2.csv')
frame = frame[['name', 'type', 'province', 'market', '_first_sell']].drop_duplicates()
print len(frame[frame._first_sell > 500]), len(frame)

# frame = pd.read_csv('../data/farming.csv')
# f1 = frame[['name', 'type', 'market', 'province']].drop_duplicates()
# print f1.groupby(['market'], as_index=False)['name'].agg(len)

# Inspect the worst offline predictions: squared relative error per record,
# aggregated per (name, market, type).
frame = pd.read_csv('../current.csv')
frame['diff'] = ((frame['y'] - frame['predictY']) / frame['y']) ** 2
frame.sort_values(by=['diff'], inplace=True, ascending=False)
frame = frame[0:5000]
print frame.groupby(['name', 'market', 'type'], as_index=False)['diff'].agg(
    {'size': np.size, 'avg': np.mean}).sort_values(
    by=['size'], ascending=False)
--------------------------------------------------------------------------------

/analysis/read_item.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -- coding:utf-8 --

import pandas as pd
import matplotlib.pyplot as plt

NAME = 'F95E90C9764A4FCCA8FA3648DBCDBE1B'
MARKET = 'F84FFE619392149018384D16BE6FF525'
TYPE = '配花类'

# Plot the price history of a single (name, market, type) item.
frame = pd.read_csv('../data/farming.csv')
tmp = \
    frame[(frame.name == NAME) & (frame.market == MARKET) & (
        frame.type == TYPE)][
        ['avgprice', 'time']]
tmp = tmp.sort_values(by=['time'])
for index, row in tmp.iterrows():
    print row['time'], row['avgprice']

plt.plot(xrange(len(tmp['time'])), tmp['avgprice'], 'b')
plt.show()

'''
len1 = len(tmp)

frame = pd.read_csv('../current.csv')
tmp = \
    frame[(frame.name == NAME) & (frame.market == MARKET) & (
        frame.type == TYPE)][
        ['predictY', 'time']]
plt.plot(xrange(len1 - len(tmp), len1), tmp['predictY'], 'g')

frame = pd.read_csv('../submit_12_13_2.csv', header=None)
frame.columns = ['market', 'type', 'name', 'time', 'predictY']
tmp = \
    frame[(frame.name == NAME) & (frame.market == MARKET) & (
        frame.type == TYPE)][
        ['predictY', 'time']]

print len1, len1 + len(tmp)
plt.plot(xrange(len1, len1 + len(tmp), 1), tmp['predictY'], 'r')
'''
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
### CCF - Agricultural Product Price Forecasting

Competition description - [problem details](http://datafountain.cn/data/science/player/competition/detail/description/244)

#### Task
- Use agricultural product price data from before June 2016 to predict prices for July 2016.

#### Result
- 2nd place in the final round. We are the `xjboost` team...

#### Code overview
- gen.py
  - feature generation
- model.py
  - the individual base models
- model_stacking.py
  - stacking

Take the code as it is... The dataset is too large to upload here.

#### Approach
- Training set
  - All data from the 30 days after 2016-06-01 plus the month after 2016-05-25.

- Test set
  - The online test set is July 2016.

- Features
  - Window statistics (over the previous 1, 2, 3, 4, 7, 14, 21, 30 and 60 days)
    - mean of the product's price over the x days before 2016-06-01 (2016-05-25)
    - minimum of the product's price over the x days before 2016-06-01 (2016-05-25)
    - standard deviation of the product's price over the x days before 2016-06-01 (2016-05-25)
  - Because some records are missing, we sort each product's records by time and add
    - mean of the last x non-missing prices before 2016-06-01 (2016-05-25)
    - minimum of the last x non-missing prices before 2016-06-01 (2016-05-25)
    - standard deviation of the last x non-missing prices before 2016-06-01 (2016-05-25)
    - number of days with a missing price in the x days before 2016-06-01 (2016-05-25), i.e. the missing rate
  - Date features (barely used)
    - day of the month
    - day of the week
    - day of the year
  - Product features
    - days between 2016-06-01 (2016-05-25) and the product's first price record
    - days between 2016-06-01 (2016-05-25) and the product's most recent price record
    - the product's last observed price in the whole dataset

- Algorithm notes
  - Some simple rules already work well
    - Submitting each product's last observed price scores **0.06088** online.
    - A model trained directly scores only about **0.12**, worse than the simple rule.
  - Since the metric is **MPSE**, we first apply a **log transformation** to the label, $y' = \log(y + 1)$, and map predictions back with $y^{predict} = e^{y'} - 1$.
    - The gain was small.
  - Most regressors optimize **MSE**, where samples with large labels dominate the loss; under **MPSE**, samples with small labels contribute the largest errors.
  - We therefore weighted the training samples with $w_i = \frac{1}{y_i^2}$, which clearly improved the online score (0.05); see the sketch after this README.

- Model fusion
  - We blend the LinearRegression, XGBoost and RandomForest predictions with fixed weights.
  - This gives a further improvement (0.047).

- Simple rules
  - Some products have extremely stable prices (almost no fluctuation); for those we directly use the last non-missing average price as the prediction for every day of July.
  - This also helps (0.045).
--------------------------------------------------------------------------------
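A minimal, self-contained sketch (synthetic data, not the competition code) of the two tricks described above: train on $\log(1 + y)$ and weight each sample by $1/y^2$, so that a plain MSE objective approximates the metric $MPSE = \frac{1}{N}\sum_i \left(\frac{\hat{y}_i - y_i}{y_i}\right)^2$ computed by `calculate()` throughout this repo:

```python
# Sketch only: synthetic positive "prices", a single linear model.
import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.RandomState(0)
X = rng.rand(1000, 3)
y = np.exp(X.dot(np.array([0.5, 1.0, 2.0]))) + 0.1  # strictly positive labels

model = LinearRegression()
model.fit(X, np.log1p(y), sample_weight=1.0 / (y * y))  # 1/y^2 weighting
pred = np.expm1(model.predict(X))  # map log-scale predictions back to prices

print(np.mean(((pred - y) / y) ** 2))  # the MPSE-style score used in this repo
```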
/tools.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -- coding:utf-8 --

import datetime
import math
import pandas as pd
import time


def date_range(begin, end, time_regex='%Y-%m-%d'):
    '''
    Build a list containing every day from begin to end.
    :param
        begin: str, start date
        end: str, end date
        time_regex: str, date format string
    :argument
        begin must be less than or equal to end
    :return:
        day_range: list
    --------
    e.g. date_range('20151220', '20151223', '%Y%m%d') returns
    ['20151220', '20151221', '20151222', '20151223']
    '''
    day_range = []
    day = datetime.datetime.strptime(begin, time_regex).date()
    while True:
        day_str = datetime.datetime.strftime(day, time_regex)
        day_range.append(day_str)
        if day_str == end:
            break
        day = day + datetime.timedelta(days=1)
    return day_range


def move_day(day_str, offset, time_regex='%Y-%m-%d'):
    '''
    Shift day_str by offset days.
    :param
        day_str: str, original date
        offset: int, number of days to shift
        time_regex: str, date format string
    :return:
        day_str: str, the shifted date, returned in the same time_regex format
    --------
    e.g. move_day('20151228', 1, '%Y%m%d') returns '20151229'
    '''
    day = datetime.datetime.strptime(day_str, time_regex).date()
    day = day + datetime.timedelta(days=offset)
    day_str = datetime.datetime.strftime(day, time_regex)
    return day_str


def move_hours(day_str, offset, time_regex='%Y/%m/%d %H:%M:%S'):
    '''
    Shift day_str by offset hours.
    :param
        day_str: str, original time
        offset: int, number of hours to shift
        time_regex: str, time format string
    :return:
        day_str: str, the shifted time, returned in the same time_regex format
    '''
    t = datetime.datetime.strptime(day_str, time_regex)
    t = t + datetime.timedelta(hours=offset)
    day_str = datetime.datetime.strftime(t, time_regex)
    return day_str


def time_diff(day_str1, day_str2, time_regex='%Y-%m-%d'):
    '''
    Absolute difference in days between day_str1 and day_str2.
    '''
    day_str1 = str(day_str1)
    day_str2 = str(day_str2)
    day1 = datetime.datetime.strptime(day_str1, time_regex).date()
    day2 = datetime.datetime.strptime(day_str2, time_regex).date()
    return math.fabs((day1 - day2).days)


def str2time_stamp(_str, re='%Y%m%d %H'):
    return int(time.mktime(time.strptime(_str, re)))


def time_stamp2str(stamp, re='%Y%m%d %H:%M:%S'):
    return time.strftime(re, time.localtime(stamp))


def cross_join(frame1, frame2):
    '''
    Cartesian product of two DataFrames, via a constant temporary join key.
    :param frame1:
    :param frame2:
    :return:
    '''
    frame1['_tmpkey'] = 0
    frame2['_tmpkey'] = 0
    frame = pd.merge(frame1, frame2, how='outer', on='_tmpkey')
    frame.drop('_tmpkey', axis=1, inplace=True)
    frame1.drop('_tmpkey', axis=1, inplace=True)
    frame2.drop('_tmpkey', axis=1, inplace=True)
    return frame


def merge_table(artist_frame, date_frame, Y_frame):
    frame1 = pd.merge(Y_frame, date_frame, how='left', on=['date', 'artist_id'])
    return pd.merge(frame1, artist_frame, how='left', on=['artist_id'])


def get_week(date, re='%Y-%m-%d'):
    day = datetime.datetime.strptime(str(date), re).date()
    return int(day.strftime("%w"))


__BEFORE_HOLIDAY_WEIGHT = {
    '20150403': 1, '20150430': 1, '20150619': 1,
    '20150925': 1,
}
__HOLIDAY_WEIGHT = {
    '20150404': 1, '20150405': 1, '20150406': 1,
    '20150501': 1, '20150502': 1, '20150503': 1,
    '20150620': 1, '20150621': 1, '20150622': 1,
    '20150926': 1, '20150927': 1, '20150928': 1,
    '20150929': 1, '20150930': 1, '20151001': 1,
    '20151002': 1, '20151003': 1, '20151004': 1,
    '20151005': 1, '20151006': 1, '20151007': 1,
}


def get_holiday_weight(date):
    if str(date) in __HOLIDAY_WEIGHT:
        return __HOLIDAY_WEIGHT[str(date)]
    return 0


def get_before_holiday_weight(date):
    if str(date) in __BEFORE_HOLIDAY_WEIGHT:
        return __BEFORE_HOLIDAY_WEIGHT[str(date)]
    return 0
--------------------------------------------------------------------------------
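A short usage demo for the tools.py helpers above (a sketch with illustrative values only; assumes the module is on the import path):

```python
import pandas as pd
import tools

print(tools.date_range('2016-06-28', '2016-07-02'))  # five consecutive dates
print(tools.move_day('2016-06-01', -7))              # '2016-05-25'
print(tools.time_diff('2016-06-01', '2016-05-25'))   # 7.0

# cross_join: every product paired with every target date.
products = pd.DataFrame({'name': ['a', 'b']})
days = pd.DataFrame({'time': tools.date_range('2016-07-01', '2016-07-03')})
print(tools.cross_join(products, days))              # 2 x 3 = 6 rows
```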
/main2.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -- coding:utf-8 --

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression


def calculate(test_set):
    # Offline score: mean squared relative error (the MPSE-style metric).
    return np.mean(
        test_set[test_set.y != -1].apply(
            lambda x: ((x['predictY'] - x['y']) / x['y']) ** 2,
            axis=1))


def merge(x):
    # Rule-based post-processing of the blended prediction.
    if x['unique_size'] == 1 and x['_1day_exists_avg'] != -1:
        return x['_1day_exists_avg']
    if x['predictY'] < 0:
        return x['predictY2'] * 0.5 + 0.5 * x['predictY4']
    # If the linear-regression prediction is very large, use it directly.
    if x['predictY1'] > 100:
        return x['predictY1']

    return x['predictY']


# Online (July) predictions of the level-1 models.
f1 = pd.read_csv('result/feature_set9/model1_online.csv')
f1 = f1.rename(columns={'predictY': 'predictY1'})
f2 = pd.read_csv('result/feature_set9/model2_online.csv')
f2 = f2.rename(columns={'predictY': 'predictY2'})
f4 = pd.read_csv('result/feature_set9/model4_online.csv')
f4 = f4.rename(columns={'predictY': 'predictY4'})
f5 = pd.read_csv('result/feature_set9/model5_online.csv')
f5 = f5.rename(columns={'predictY': 'predictY5'})
f6 = pd.read_csv('result/feature_set9/model6_online.csv')
f6 = f6.rename(columns={'predictY': 'predictY6'})
f1_1 = pd.read_csv('result/feature_set1/model1_online.csv')
f1_1 = f1_1.rename(columns={'predictY': 'predictY1_1'})
f2_1 = pd.read_csv('result/feature_set1/model2_online.csv')
f2_1 = f2_1.rename(columns={'predictY': 'predictY2_1'})
f4_1 = pd.read_csv('result/feature_set1/model4_online.csv')
f4_1 = f4_1.rename(columns={'predictY': 'predictY4_1'})

print len(f1), len(f2), len(f4), len(f5), len(f6), len(f1_1), len(f2_1), len(f4_1)

new_test = f1
new_test = pd.merge(new_test, f2, how='left')
new_test = pd.merge(new_test, f4, how='left')
new_test = pd.merge(new_test, f5, how='left')
new_test = pd.merge(new_test, f6, how='left')
new_test = pd.merge(new_test, f1_1, how='left')
new_test = pd.merge(new_test, f2_1, how='left')
new_test = pd.merge(new_test, f4_1, how='left')

# Out-of-fold predictions on the first stacking fold.
f1 = pd.read_csv('result/feature_set9/model1_online_stacking1.csv')
f1 = f1.rename(columns={'predictY': 'predictY1'})
f2 = pd.read_csv('result/feature_set9/model2_online_stacking1.csv')
f2 = f2.rename(columns={'predictY': 'predictY2'})
f4 = pd.read_csv('result/feature_set9/model4_online_stacking1.csv')
f4 = f4.rename(columns={'predictY': 'predictY4'})
f5 = pd.read_csv('result/feature_set9/model5_online_stacking1.csv')
f5 = f5.rename(columns={'predictY': 'predictY5'})
f6 = pd.read_csv('result/feature_set9/model6_online_stacking1.csv')
f6 = f6.rename(columns={'predictY': 'predictY6'})
f1_1 = pd.read_csv('result/feature_set1/model1_online_stacking1.csv')
f1_1 = f1_1.rename(columns={'predictY': 'predictY1_1'})
f2_1 = pd.read_csv('result/feature_set1/model2_online_stacking1.csv')
f2_1 = f2_1.rename(columns={'predictY': 'predictY2_1'})
f4_1 = pd.read_csv('result/feature_set1/model4_online_stacking1.csv')
f4_1 = f4_1.rename(columns={'predictY': 'predictY4_1'})

new_train1 = f1
new_train1 = pd.merge(new_train1, f2, how='left')
new_train1 = pd.merge(new_train1, f4, how='left')
new_train1 = pd.merge(new_train1, f5, how='left')
new_train1 = pd.merge(new_train1, f6, how='left')
new_train1 = pd.merge(new_train1, f1_1, how='left')
new_train1 = pd.merge(new_train1, f2_1, how='left')
new_train1 = pd.merge(new_train1, f4_1, how='left')
print len(f1), len(new_train1)

# Out-of-fold predictions on the second stacking fold.
f1 = pd.read_csv('result/feature_set9/model1_online_stacking2.csv')
f1 = f1.rename(columns={'predictY': 'predictY1'})
f2 = pd.read_csv('result/feature_set9/model2_online_stacking2.csv')
f2 = f2.rename(columns={'predictY': 'predictY2'})
f4 = pd.read_csv('result/feature_set9/model4_online_stacking2.csv')
f4 = f4.rename(columns={'predictY': 'predictY4'})
f5 = pd.read_csv('result/feature_set9/model5_online_stacking2.csv')
f5 = f5.rename(columns={'predictY': 'predictY5'})
f6 = pd.read_csv('result/feature_set9/model6_online_stacking2.csv')
f6 = f6.rename(columns={'predictY': 'predictY6'})
f1_1 = pd.read_csv('result/feature_set1/model1_online_stacking2.csv')
f1_1 = f1_1.rename(columns={'predictY': 'predictY1_1'})
f2_1 = pd.read_csv('result/feature_set1/model2_online_stacking2.csv')
f2_1 = f2_1.rename(columns={'predictY': 'predictY2_1'})
f4_1 = pd.read_csv('result/feature_set1/model4_online_stacking2.csv')
f4_1 = f4_1.rename(columns={'predictY': 'predictY4_1'})

new_train2 = f1
new_train2 = pd.merge(new_train2, f2, how='left')
new_train2 = pd.merge(new_train2, f4, how='left')
new_train2 = pd.merge(new_train2, f5, how='left')
new_train2 = pd.merge(new_train2, f6, how='left')
new_train2 = pd.merge(new_train2, f1_1, how='left')
new_train2 = pd.merge(new_train2, f2_1, how='left')
new_train2 = pd.merge(new_train2, f4_1, how='left')

print len(f1), len(new_train2)

new_train = pd.concat([new_train1, new_train2])

# Level-2 model: linear regression on the level-1 predictions,
# again weighted by 1/y^2 to match the MPSE metric.
model1 = LinearRegression(normalize=True)
feature_set = ['predictY2', 'predictY4', '_last_price_all']
model1.fit(new_train[feature_set].as_matrix(), new_train['y'].as_matrix(),
           sample_weight=map(lambda x: 1.0 / x / x, new_train['y'].as_matrix())
           )

print model1.coef_

new_test['predictY'] = new_test['predictY2_1']
print len(new_test)
# print calculate(new_test)

new_test['predictY'] = model1.predict(new_test[feature_set].as_matrix())
unique_size = pd.read_csv('unique_size.csv')
new_test = pd.merge(new_test, unique_size, how='left')
new_test['predictY'] = new_test.apply(merge, axis=1)

print new_test['_1day_exists_avg'].min()

new_test[['market', 'type', 'name', 'time', 'predictY']].to_csv('submit_12_15_2.csv', header=None, index=False)

print calculate(new_test)
--------------------------------------------------------------------------------
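main2.py is the level-2 blender: it fits a linear model on the out-of-fold predictions written by model_stacking.py and model_stacking1.py (the *_stacking1 / *_stacking2 files) and applies it to the online predictions. A minimal sketch of that two-fold stacking pattern on synthetic data, with a RandomForest standing in for the level-1 models:

```python
# Sketch only: generic 2-fold stacking with synthetic data.
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

rng = np.random.RandomState(0)
X = rng.rand(400, 5)
y = rng.rand(400) + 1.0
in_fold1 = rng.rand(400) < 0.5  # plays the role of the time_diff <= 15 split

oof = np.empty_like(y)
for train_mask, test_mask in [(in_fold1, ~in_fold1), (~in_fold1, in_fold1)]:
    base = RandomForestRegressor(n_estimators=50, random_state=0)
    base.fit(X[train_mask], y[train_mask])
    oof[test_mask] = base.predict(X[test_mask])  # out-of-fold level-1 predictions

level2 = LinearRegression()
level2.fit(oof.reshape(-1, 1), y, sample_weight=1.0 / (y * y))  # MPSE weighting
print(level2.coef_)
```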
/merge.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -- coding:utf-8 --

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np
import tools


def dfs(l, r, feature_file, folder):
    # Recursively left-join the feature CSVs feature_file[l..r] by splitting
    # the list in half, so that each merge stays small.
    if l == r:
        return pd.read_csv(folder + '/' + feature_file[l] + '.csv')
    mid = (l + r) / 2
    l_frame = dfs(l, mid, feature_file, folder)
    r_frame = dfs(mid + 1, r, feature_file, folder)
    return pd.merge(l_frame, r_frame, how='left')


def calculate(test_set):
    return np.mean(
        test_set[test_set.y != -1].apply(
            lambda x: ((x['predictY'] - x['y']) / x['y']) ** 2,
            axis=1))


def diff1(x):
    if x['_3day_exists_avg'] is None:
        return None
    return x['_3day_exists_avg'] - x['_7day_exists_avg']


def diff2(x):
    if x['_7day_avg'] is None:
        return None
    return x['_7day_avg'] - x['_30day_avg']


def run(feature_files, training_dates, feature_set_folder):
    train_set = pd.concat(
        [dfs(0, len(feature_files), feature_files + ['y'], 'dataset1/' + date) for date in training_dates])
    test_set = dfs(0, len(feature_files), feature_files + ['y'], 'dataset1/2016-06-01')
    test1_set = dfs(0, len(feature_files), feature_files + ['y'], 'dataset1/2016-05-25')

    train_set = train_set.fillna(-1, downcast='infer')
    test_set = test_set.fillna(-1, downcast='infer')
    test1_set = test1_set.fillna(-1, downcast='infer')

    train_set['y_log'] = train_set['y'].apply(lambda x: np.log(1 + x))
    test_set['y_log'] = test_set['y'].apply(lambda x: np.log(1 + x))
    test1_set['y_log'] = test1_set['y'].apply(lambda x: np.log(1 + x))

    feature_set = filter(lambda x: x not in ['y', 'time', 'province', 'market', 'name', 'type', 'y_log'],
                         train_set.columns)

    scaler = StandardScaler()
    scaler.fit(train_set[feature_set].as_matrix())

    # model1
    model1 = LinearRegression(normalize=True)
    model1.fit(scaler.transform(train_set[feature_set].as_matrix()), train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x, train_set['y'].as_matrix())
               )
    test_set['predictY'] = model1.predict(scaler.transform(test_set[feature_set].as_matrix()))
    test_set.to_csv('result1/' + feature_set_folder + '/model1_offline.csv')
    test1_set['predictY'] = model1.predict(scaler.transform(test1_set[feature_set].as_matrix()))
    test1_set.to_csv('result1/' + feature_set_folder + '/model1_offline1.csv')

    # model2
    model2 = XGBRegressor(n_estimators=500, learning_rate=0.02, max_depth=5, colsample_bytree=0.7, subsample=0.8)
    model2.fit(train_set[feature_set].as_matrix(), train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x, train_set['y'].as_matrix())
               )
    test_set['predictY'] = model2.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result1/' + feature_set_folder + '/model2_offline.csv')
    test1_set['predictY'] = model2.predict(test1_set[feature_set].as_matrix())
    test1_set.to_csv('result1/' + feature_set_folder + '/model2_offline1.csv')

    # model3
    model3 = LinearSVR(tol=1e-7)
    model3.fit(scaler.transform(train_set[feature_set].as_matrix()), train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x, train_set['y'].as_matrix())
               )
    test_set['predictY'] = model3.predict(scaler.transform(test_set[feature_set].as_matrix()))
    test_set.to_csv('result1/' + feature_set_folder + '/model3_offline.csv')
    test1_set['predictY'] = model3.predict(scaler.transform(test1_set[feature_set].as_matrix()))
    test1_set.to_csv('result1/' + feature_set_folder + '/model3_offline1.csv')

    # model4
    model4 = RandomForestRegressor(n_estimators=500, max_depth=6, max_features=0.3, max_leaf_nodes=60)
    model4.fit(train_set[feature_set].as_matrix(), train_set['y'].as_matrix(),
               sample_weight=np.array(map(lambda x: 1.0 / x / x, train_set['y'].as_matrix()))
               )
    test_set['predictY'] = model4.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result1/' + feature_set_folder + '/model4_offline.csv')
    test1_set['predictY'] = model4.predict(test1_set[feature_set].as_matrix())
    test1_set.to_csv('result1/' + feature_set_folder + '/model4_offline1.csv')

    # model5: trained on the log-transformed label, no sample weights.
    model5 = XGBRegressor(n_estimators=500, learning_rate=0.02, max_depth=6, colsample_bytree=0.7, subsample=0.8)
    model5.fit(train_set[feature_set].as_matrix(), train_set['y_log'].as_matrix())
    test_set['predictY'] = model5.predict(test_set[feature_set].as_matrix())
    test_set['predictY'] = test_set['predictY'].apply(lambda x: np.exp(x) - 1)
    test_set.to_csv('result1/' + feature_set_folder + '/model5_offline.csv')
    test1_set['predictY'] = model5.predict(test1_set[feature_set].as_matrix())
    test1_set['predictY'] = test1_set['predictY'].apply(lambda x: np.exp(x) - 1)
    test1_set.to_csv('result1/' + feature_set_folder + '/model5_offline1.csv')

    pass


# run(['v1', 'v2', 'v3'], tools.date_range('2016-04-01', '2016-05-01'), 'feature_set1')
# submit(['v1', 'v2', 'v3', 'v7'])


def merge(x):
    if x['_1day_exists_avg'] == -1:
        return x['predictY']
    if x['unique_size'] == 1 and x['_1day_exists_avg'] != -1:
        return x['_1day_exists_avg']
    # If the linear-regression prediction is very large, use it directly.
    if x['predictY1'] > 100:
        return x['predictY1']

    if x['predictY1'] < 0:
        return x['predictY'] * 0.5 + 0.5 * x['predictY4']

    return x['predictY'] * 0.4 + x['predictY1'] * 0.2 + x['predictY4'] * 0.4


# Offline blend and evaluation.
f1 = pd.read_csv('result/feature_set1/model1_offline.csv')
f = pd.read_csv('result/feature_set1/model2_offline.csv')
unique_size = pd.read_csv('unique_size.csv')
f = pd.merge(f, unique_size, how='left')
print calculate(f)
f['predictY1'] = f1['predictY']
f4 = pd.read_csv('result/feature_set1/model4_offline.csv')
f['predictY4'] = f4['predictY']

f['predictY'] = f.apply(merge, axis=1)
print calculate(f)

f.to_csv('current.csv')

# Online blend for submission.
f1 = pd.read_csv('result/feature_set2/model1_online.csv')
f = pd.read_csv('result/feature_set2/model2_online.csv')
f2 = pd.read_csv('result/feature_set2/model4_online.csv')
unique_size = pd.read_csv('unique_size.csv')
f = pd.merge(f, unique_size, how='left')
f['predictY1'] = f1['predictY']
f['predictY4'] = f2['predictY']
f['predictY'] = f.apply(merge, axis=1)

f[['market', 'type', 'name', 'time', 'predictY']].to_csv('submit_12_16_1.csv', header=None, index=False)
--------------------------------------------------------------------------------
/model_stacking1.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -- coding:utf-8 --

# Second stacking fold: train the level-1 models on rows with
# time_diff <= 15 and predict the rows with time_diff > 15
# (model_stacking.py does the reverse split).

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np


def dfs(l, r, feature_file, folder):
    if l == r:
        return pd.read_csv(folder + '/' + feature_file[l] + '.csv')
    mid = (l + r) / 2
    l_frame = dfs(l, mid, feature_file, folder)
    r_frame = dfs(mid + 1, r, feature_file, folder)
    return pd.merge(l_frame, r_frame, how='left')


def calculate(test_set):
    return np.mean(
        test_set[test_set.y != -1].apply(
            lambda x: ((x['predictY'] - x['y']) / x['y']) ** 2,
            axis=1))


def diff1(x):
    if x['_3day_exists_avg'] is None:
        return None
    return x['_3day_exists_avg'] - x['_7day_exists_avg']


def diff2(x):
    if x['_7day_avg'] is None:
        return None
    return x['_7day_avg'] - x['_30day_avg']


def run(feature_files, training_dates, feature_set_folder):
    train_set1 = pd.concat(
        [dfs(0, len(feature_files), feature_files + ['y'], 'dataset/' + date) for date in training_dates])

    train_set = train_set1[train_set1.time_diff <= 15]
    test_set = train_set1[train_set1.time_diff > 15]

    train_set = train_set.fillna(-1, downcast='infer')
    test_set = test_set.fillna(-1, downcast='infer')

    train_set['y_log'] = train_set['y'].apply(lambda x: np.log(1 + x))
    test_set['y_log'] = test_set['y'].apply(lambda x: np.log(1 + x))

    feature_set = filter(lambda x: x not in ['y', 'time', 'province', 'market', 'name', 'type', 'y_log'],
                         train_set.columns)

    scaler = StandardScaler()
    scaler.fit(train_set[feature_set].as_matrix())

    # model1
    # model1 = LinearRegression(normalize=True)
    # model1.fit(scaler.transform(train_set[feature_set].as_matrix()), train_set['y'].as_matrix(),
    #            sample_weight=map(lambda x: 1.0 / x / x, train_set['y'].as_matrix())
    #            )
    # print zip(feature_set, model1.coef_)
    # test_set['predictY'] = model1.predict(scaler.transform(test_set[feature_set].as_matrix()))
    # test_set.to_csv('result/' + feature_set_folder + '/model1_offline_stacking2.csv')

    # model2
    # model2 = XGBRegressor(n_estimators=600, learning_rate=0.01, max_depth=6, colsample_bytree=0.7, subsample=0.7,
    #                       colsample_bylevel=0.7)
    # model2.fit(train_set[feature_set].as_matrix(), train_set['y'].as_matrix(),
    #            sample_weight=map(lambda x: 1.0 / x / x, train_set['y'].as_matrix())
    #            )
    # test_set['predictY'] = model2.predict(test_set[feature_set].as_matrix())
    # test_set.to_csv('result/' + feature_set_folder + '/model2_offline_stacking2.csv')

    # model3
    # model3 = LinearSVR(tol=1e-7)
    # model3.fit(scaler.transform(train_set[feature_set].as_matrix()), train_set['y'].as_matrix(),
    #            sample_weight=map(lambda x: 1.0 / x / x, train_set['y'].as_matrix())
    #            )
    # test_set['predictY'] = model3.predict(scaler.transform(test_set[feature_set].as_matrix()))
    # test_set.to_csv('result/' + feature_set_folder + '/model3_offline.csv')

    # model4
    model4 = RandomForestRegressor(n_estimators=1000, max_depth=7, max_features=0.2, max_leaf_nodes=100)
    model4.fit(train_set[feature_set].as_matrix(), train_set['y'].as_matrix(),
               sample_weight=np.array(map(lambda x: 1.0 / x / x, train_set['y'].as_matrix()))
               )
    test_set['predictY'] = model4.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder + '/model4_offline_stacking2.csv')

    # model5
    model5 = XGBRegressor(n_estimators=600, learning_rate=0.01, max_depth=6, colsample_bytree=0.7, subsample=0.7,
                          colsample_bylevel=0.7, seed=10000)
    model5.fit(train_set[feature_set].as_matrix(), train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x, train_set['y'].as_matrix())
               )
    test_set['predictY'] = model5.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder + '/model5_offline_stacking2.csv')

    # model6
    model6 = XGBRegressor(n_estimators=600, learning_rate=0.01, max_depth=5, colsample_bytree=0.7, subsample=0.7,
                          colsample_bylevel=0.7)
    model6.fit(train_set[feature_set].as_matrix(), train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x, train_set['y'].as_matrix())
               )
    test_set['predictY'] = model6.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder + '/model6_offline_stacking2.csv')
    pass


def submit(feature_files, training_dates, feature_set_folder):
    train_set1 = pd.concat(
        [dfs(0, len(feature_files), feature_files + ['y'], 'dataset/' + date) for date in training_dates])

    train_set = train_set1[train_set1.time_diff <= 15]
    test_set = train_set1[train_set1.time_diff > 15]

    train_set = train_set.fillna(-1, downcast='infer')
    test_set = test_set.fillna(-1, downcast='infer')

    train_set['y_log'] = train_set['y'].apply(lambda x: np.log(1 + x))
    test_set['y_log'] = test_set['y'].apply(lambda x: np.log(1 + x))

    feature_set = filter(lambda x: x not in ['y', 'time', 'province', 'market', 'name', 'type', 'y_log'],
                         train_set.columns)

    scaler = StandardScaler()
    scaler.fit(train_set[feature_set].as_matrix())

    # model1
    model1 = LinearRegression(normalize=True)
    model1.fit(scaler.transform(train_set[feature_set].as_matrix()), train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x, train_set['y'].as_matrix())
               )
    print zip(feature_set, model1.coef_)
    test_set['predictY'] = model1.predict(scaler.transform(test_set[feature_set].as_matrix()))
    test_set.to_csv('result/' + feature_set_folder + '/model1_online_stacking2.csv')

    # model2
    model2 = XGBRegressor(n_estimators=600, learning_rate=0.01, max_depth=6, colsample_bytree=0.7, subsample=0.7,
                          colsample_bylevel=0.7)
    model2.fit(train_set[feature_set].as_matrix(), train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x, train_set['y'].as_matrix())
               )
    test_set['predictY'] = model2.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder + '/model2_online_stacking2.csv')

    # model3
    model3 = LinearSVR(tol=1e-7)
    model3.fit(scaler.transform(train_set[feature_set].as_matrix()), train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x, train_set['y'].as_matrix())
               )
    test_set['predictY'] = model3.predict(scaler.transform(test_set[feature_set].as_matrix()))
    test_set.to_csv('result/' + feature_set_folder + '/model3_offline.csv')

    # model4
    model4 = RandomForestRegressor(n_estimators=1000, max_depth=7, max_features=0.2, max_leaf_nodes=100)
    model4.fit(train_set[feature_set].as_matrix(), train_set['y'].as_matrix(),
               sample_weight=np.array(map(lambda x: 1.0 / x / x, train_set['y'].as_matrix()))
               )
    test_set['predictY'] = model4.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder + '/model4_online_stacking2.csv')

    # model5
    model5 = XGBRegressor(n_estimators=600, learning_rate=0.01, max_depth=6, colsample_bytree=0.7, subsample=0.7,
                          colsample_bylevel=0.7, seed=10000)
    model5.fit(train_set[feature_set].as_matrix(), train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x, train_set['y'].as_matrix())
               )
    test_set['predictY'] = model5.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder + '/model5_online_stacking2.csv')

    # model6
    model6 = XGBRegressor(n_estimators=600, learning_rate=0.01, max_depth=5, colsample_bytree=0.7, subsample=0.7,
                          colsample_bylevel=0.7)
    model6.fit(train_set[feature_set].as_matrix(), train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x, train_set['y'].as_matrix())
               )
    test_set['predictY'] = model6.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder + '/model6_online_stacking2.csv')

    pass


run(['v1', 'v3', 'v11'], ['2016-05-01', '2016-04-25'], 'feature_set1')
# run(['v1', 'v3', 'v11', 'v14'], ['2016-05-01', '2016-04-25'], 'feature_set9')
# submit(['v1', 'v3', 'v11', 'v14'], ['2016-06-01', '2016-05-25'], 'feature_set9')
# run(['v1', 'v2', 'v3', 'v11'], ['2016-05-01', '2016-04-25'], 'feature_set4')
# run(['v1', 'v2', 'v3', 'v11'], ['2016-05-01', '2016-04-25', '2016-04-20', '2016-04-15'], 'feature_set6')
# run(['v1', 'v2', 'v3', 'v10'], ['2016-05-01', '2016-04-25'], 'feature_set3')
submit(['v1', 'v3', 'v11'], ['2016-06-01', '2016-05-25'], 'feature_set1')
# submit(['v1', 'v14', 'v3', 'v11'], ['2016-06-01', '2016-05-25'], 'feature_set9')
# submit(['v1', 'v2', 'v3', 'v11', 'v14'], ['2016-06-01', '2016-05-25'], 'feature_set6')


f = pd.read_csv('result/feature_set1/model1_offline.csv')
print calculate(f)
--------------------------------------------------------------------------------
/model_stacking.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -- coding:utf-8 --

# First stacking fold: train the level-1 models on rows with
# time_diff > 15 and predict the rows with time_diff <= 15
# (model_stacking1.py does the reverse split).

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np


def dfs(l, r, feature_file, folder):
    if l == r:
        return pd.read_csv(folder + '/' + feature_file[l] + '.csv')
    mid = (l + r) / 2
    l_frame = dfs(l, mid, feature_file, folder)
    r_frame = dfs(mid + 1, r, feature_file, folder)
    return pd.merge(l_frame, r_frame, how='left')


def calculate(test_set):
    return np.mean(
        test_set[test_set.y != -1].apply(
            lambda x: ((x['predictY'] - x['y']) / x['y']) ** 2,
            axis=1))


def diff1(x):
    if x['_3day_exists_avg'] is None:
        return None
    return x['_3day_exists_avg'] - x['_7day_exists_avg']


def diff2(x):
    if x['_7day_avg'] is None:
        return None
    return x['_7day_avg'] - x['_30day_avg']


def run(feature_files, training_dates, feature_set_folder):
    train_set1 = pd.concat(
        [dfs(0, len(feature_files), feature_files + ['y'], 'dataset/' + date) for date in training_dates])

    train_set = train_set1[train_set1.time_diff > 15]
    test_set = train_set1[train_set1.time_diff <= 15]

    train_set = train_set.fillna(-1, downcast='infer')
    test_set = test_set.fillna(-1, downcast='infer')

    train_set['y_log'] = train_set['y'].apply(lambda x: np.log(1 + x))
    test_set['y_log'] = test_set['y'].apply(lambda x: np.log(1 + x))

    feature_set = filter(lambda x: x not in ['y', 'time', 'province', 'market', 'name', 'type', 'y_log'],
                         train_set.columns)

    scaler = StandardScaler()
    scaler.fit(train_set[feature_set].as_matrix())

    # model1
    # model1 = LinearRegression(normalize=True)
    # model1.fit(scaler.transform(train_set[feature_set].as_matrix()), train_set['y'].as_matrix(),
    #            sample_weight=map(lambda x: 1.0 / x / x, train_set['y'].as_matrix())
    #            )
    # print zip(feature_set, model1.coef_)
    # test_set['predictY'] = model1.predict(scaler.transform(test_set[feature_set].as_matrix()))
    # test_set.to_csv('result/' + feature_set_folder + '/model1_offline_stacking1.csv')

    # model2
    # model2 = XGBRegressor(n_estimators=600, learning_rate=0.01, max_depth=6, colsample_bytree=0.7, subsample=0.7,
    #                       colsample_bylevel=0.7)
    # model2.fit(train_set[feature_set].as_matrix(), train_set['y'].as_matrix(),
    #            sample_weight=map(lambda x: 1.0 / x / x, train_set['y'].as_matrix())
    #            )
    # test_set['predictY'] = model2.predict(test_set[feature_set].as_matrix())
    # test_set.to_csv('result/' + feature_set_folder + '/model2_offline_stacking1.csv')

    # model3
    # model3 = LinearSVR(tol=1e-7)
    # model3.fit(scaler.transform(train_set[feature_set].as_matrix()), train_set['y'].as_matrix(),
    #            sample_weight=map(lambda x: 1.0 / x / x, train_set['y'].as_matrix())
    #            )
    # test_set['predictY'] = model3.predict(scaler.transform(test_set[feature_set].as_matrix()))
    # test_set.to_csv('result/' + feature_set_folder + '/model3_offline.csv')

    # model4
    model4 = RandomForestRegressor(n_estimators=1000, max_depth=7, max_features=0.2, max_leaf_nodes=100)
    model4.fit(train_set[feature_set].as_matrix(), train_set['y'].as_matrix(),
               sample_weight=np.array(map(lambda x: 1.0 / x / x, train_set['y'].as_matrix()))
               )
    test_set['predictY'] = model4.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder + '/model4_offline_stacking1.csv')

    # model5
    model5 = XGBRegressor(n_estimators=600, learning_rate=0.01, max_depth=6, colsample_bytree=0.7, subsample=0.7,
                          colsample_bylevel=0.7, seed=10000)
    model5.fit(train_set[feature_set].as_matrix(), train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x, train_set['y'].as_matrix())
               )
    test_set['predictY'] = model5.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder + '/model5_offline_stacking1.csv')

    # model6
    model6 = XGBRegressor(n_estimators=600, learning_rate=0.01, max_depth=5, colsample_bytree=0.7, subsample=0.7,
                          colsample_bylevel=0.7)
    model6.fit(train_set[feature_set].as_matrix(), train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x, train_set['y'].as_matrix())
               )
    test_set['predictY'] = model6.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder + '/model6_offline_stacking1.csv')

    pass


def submit(feature_files, training_dates, feature_set_folder):
    train_set1 = pd.concat(
        [dfs(0, len(feature_files), feature_files + ['y'], 'dataset/' + date) for date in training_dates])

    train_set = train_set1[train_set1.time_diff > 15]
    test_set = train_set1[train_set1.time_diff <= 15]

    train_set = train_set.fillna(-1, downcast='infer')
    test_set = test_set.fillna(-1, downcast='infer')

    train_set['y_log'] = train_set['y'].apply(lambda x: np.log(1 + x))
    test_set['y_log'] = test_set['y'].apply(lambda x: np.log(1 + x))

    feature_set = filter(lambda x: x not in ['y', 'time', 'province', 'market', 'name', 'type', 'y_log'],
                         train_set.columns)

    scaler = StandardScaler()
    scaler.fit(train_set[feature_set].as_matrix())

    # model1
    model1 = LinearRegression(normalize=True)
    model1.fit(scaler.transform(train_set[feature_set].as_matrix()), train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x, train_set['y'].as_matrix())
               )
    print zip(feature_set, model1.coef_)
    test_set['predictY'] = model1.predict(scaler.transform(test_set[feature_set].as_matrix()))
    test_set.to_csv('result/' + feature_set_folder + '/model1_online_stacking1.csv')
    print test_set

    # model2
    model2 = XGBRegressor(n_estimators=600, learning_rate=0.01, max_depth=6, colsample_bytree=0.7, subsample=0.7,
                          colsample_bylevel=0.7)
    model2.fit(train_set[feature_set].as_matrix(), train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x, train_set['y'].as_matrix())
               )
    test_set['predictY'] = model2.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder + '/model2_online_stacking1.csv')

    # model3
    model3 = LinearSVR(tol=1e-7)
    model3.fit(scaler.transform(train_set[feature_set].as_matrix()), train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x, train_set['y'].as_matrix())
               )
    test_set['predictY'] = model3.predict(scaler.transform(test_set[feature_set].as_matrix()))
    test_set.to_csv('result/' + feature_set_folder + '/model3_offline.csv')

    # model4
    model4 = RandomForestRegressor(n_estimators=1000, max_depth=7, max_features=0.2, max_leaf_nodes=100)
    model4.fit(train_set[feature_set].as_matrix(), train_set['y'].as_matrix(),
               sample_weight=np.array(map(lambda x: 1.0 / x / x, train_set['y'].as_matrix()))
               )
    test_set['predictY'] = model4.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder + '/model4_online_stacking1.csv')

    # model5
    model5 = XGBRegressor(n_estimators=600, learning_rate=0.01, max_depth=6, colsample_bytree=0.7, subsample=0.7,
                          colsample_bylevel=0.7, seed=10000)
    model5.fit(train_set[feature_set].as_matrix(), train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x, train_set['y'].as_matrix())
               )
    test_set['predictY'] = model5.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder + '/model5_online_stacking1.csv')

    # model6
    model6 = XGBRegressor(n_estimators=600, learning_rate=0.01, max_depth=5, colsample_bytree=0.7, subsample=0.7,
                          colsample_bylevel=0.7)
    model6.fit(train_set[feature_set].as_matrix(), train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x, train_set['y'].as_matrix())
               )
    test_set['predictY'] = model6.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder + '/model6_online_stacking1.csv')

    pass


# run(['v1', 'v3', 'v11'], ['2016-05-01', '2016-04-25'], 'feature_set1')
# run(['v1', 'v3', 'v11', 'v14'], ['2016-05-01', '2016-04-25'], 'feature_set9')
# submit(['v1', 'v3', 'v11', 'v14'], ['2016-06-01', '2016-05-25'], 'feature_set9')
# run(['v1', 'v2', 'v3', 'v11'], ['2016-05-01', '2016-04-25'], 'feature_set4')
# run(['v1', 'v2', 'v3', 'xxv11'], ['2016-05-01', '2016-04-25', '2016-04-20', '2016-04-15'], 'feature_set6')
# run(['v1', 'v2', 'v3', 'v10'], ['2016-05-01', '2016-04-25'], 'feature_set3')
# submit(['v1', 'v14', 'v3', 'v11'], ['2016-06-01', '2016-05-25'], 'feature_set9')
submit(['v1', 'v3', 'v11'], ['2016-06-01', '2016-05-25'], 'feature_set1')
# submit(['v1', 'v2', 'v3', 'v11', 'v14'], ['2016-06-01', '2016-05-25'], 'feature_set6')


f = pd.read_csv('result/feature_set1/model1_offline.csv')
print calculate(f)
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -- coding:utf-8 --

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.svm import LinearSVR
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np


def dfs(l, r, feature_file, folder):
    if l == r:
        return pd.read_csv(folder + '/' + feature_file[l] + '.csv')
    mid = (l + r) / 2
    l_frame = dfs(l, mid, feature_file, folder)
    r_frame = dfs(mid + 1, r, feature_file, folder)
    return pd.merge(l_frame, r_frame, how='left')


def calculate(test_set):
    return np.mean(
        test_set[test_set.y != -1].apply(
            lambda x: ((x['predictY'] - x['y']) / x['y']) ** 2,
            axis=1))


def diff1(x):
    if x['_3day_exists_avg'] is None:
        return None
    return x['_3day_exists_avg'] - x['_7day_exists_avg']


def diff2(x):
    if x['_7day_avg'] is None:
        return None
    return x['_7day_avg'] - x['_30day_avg']


def run(feature_files, training_dates, feature_set_folder):
    # Offline evaluation: train on the given dates, test on the
    # 2016-06-01 and 2016-05-25 windows.
    train_set = pd.concat(
        [dfs(0, len(feature_files), feature_files + ['y'], 'dataset/' + date) for date in training_dates])
    test_set = dfs(0, len(feature_files), feature_files + ['y'], 'dataset/2016-06-01')
    test1_set = dfs(0, len(feature_files), feature_files + ['y'], 'dataset/2016-05-25')
    # train_set.to_csv('train_set.csv', index=False)
    # test_set.to_csv('test_set.csv', index=False)

    '''
    unique_size = pd.read_csv('unique_size.csv')
    train_set = pd.merge(train_set, unique_size, how='left')
    train_set = train_set[train_set.unique_size > 1]
    train_set.drop(['unique_size'], axis=1, inplace=True)
    '''

    train_set = train_set.fillna(-1, downcast='infer')
    test_set = test_set.fillna(-1, downcast='infer')
    test1_set = test1_set.fillna(-1, downcast='infer')

    train_set['y_log'] = train_set['y'].apply(lambda x: np.log(1 + x))
    test_set['y_log'] = test_set['y'].apply(lambda x: np.log(1 + x))
    test1_set['y_log'] = test1_set['y'].apply(lambda x: np.log(1 + x))

    feature_set = filter(lambda x: x not in ['y', 'time', 'province', 'market', 'name', 'type', 'y_log'],
                         train_set.columns)

    scaler = StandardScaler()
    scaler.fit(train_set[feature_set].as_matrix())

    # model1
    model1 = LinearRegression(normalize=True)
    model1.fit(scaler.transform(train_set[feature_set].as_matrix()), train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x, train_set['y'].as_matrix())
               )
    print zip(feature_set, model1.coef_)
    test_set['predictY'] = model1.predict(scaler.transform(test_set[feature_set].as_matrix()))
    test_set.to_csv('result/' + feature_set_folder + '/model1_offline.csv')
    test1_set['predictY'] = model1.predict(scaler.transform(test1_set[feature_set].as_matrix()))
    test1_set.to_csv('result/' + feature_set_folder + '/model1_offline1.csv')

    # model2
    model2 = XGBRegressor(n_estimators=600, learning_rate=0.01, max_depth=6, colsample_bytree=0.7, subsample=0.7,
                          colsample_bylevel=0.7)
    model2.fit(train_set[feature_set].as_matrix(), train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x, train_set['y'].as_matrix())
               )
    test_set['predictY'] = model2.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder + '/model2_offline.csv')
    test1_set['predictY'] = model2.predict(test1_set[feature_set].as_matrix())
    test1_set.to_csv('result/' + feature_set_folder + '/model2_offline1.csv')

    # model3
    model3 = LinearSVR(tol=1e-7)
    model3.fit(scaler.transform(train_set[feature_set].as_matrix()), train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x, train_set['y'].as_matrix())
               )
    test_set['predictY'] = model3.predict(scaler.transform(test_set[feature_set].as_matrix()))
    test_set.to_csv('result/' + feature_set_folder + '/model3_offline.csv')
    test1_set['predictY'] = model3.predict(scaler.transform(test1_set[feature_set].as_matrix()))
    test1_set.to_csv('result/' + feature_set_folder + '/model3_offline1.csv')

    # model4
    model4 = RandomForestRegressor(n_estimators=1000, max_depth=7, max_features=0.2, max_leaf_nodes=100)
    model4.fit(train_set[feature_set].as_matrix(), train_set['y'].as_matrix(),
               sample_weight=np.array(map(lambda x: 1.0 / x / x, train_set['y'].as_matrix()))
               )
    test_set['predictY'] = model4.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder + '/model4_offline.csv')
    test1_set['predictY'] = model4.predict(test1_set[feature_set].as_matrix())
    test1_set.to_csv('result/' + feature_set_folder + '/model4_offline1.csv')

    # model15
    model15 = ExtraTreesRegressor(n_estimators=1000, max_depth=12, max_features=0.3, max_leaf_nodes=400)
    model15.fit(train_set[feature_set].as_matrix(), train_set['y'].as_matrix(),
                sample_weight=np.array(map(lambda x: 1.0 / x / x, train_set['y'].as_matrix()))
                )
    test_set['predictY'] = model15.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder + '/model15_offline.csv')
    test1_set['predictY'] = model15.predict(test1_set[feature_set].as_matrix())
    test1_set.to_csv('result/' + feature_set_folder + '/model15_offline1.csv')

    # model5
    model5 = XGBRegressor(n_estimators=600, learning_rate=0.01, max_depth=6, colsample_bytree=0.7, subsample=0.7,
                          colsample_bylevel=0.7, seed=10000)
    model5.fit(train_set[feature_set].as_matrix(), train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x, train_set['y'].as_matrix())
               )
    test_set['predictY'] = model5.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder + '/model5_offline.csv')
    test1_set['predictY'] = model5.predict(test1_set[feature_set].as_matrix())
    test1_set.to_csv('result/' + feature_set_folder + '/model5_offline1.csv')

    # model6
    model6 = XGBRegressor(n_estimators=600, learning_rate=0.01, max_depth=5, colsample_bytree=0.7, subsample=0.7,
                          colsample_bylevel=0.7)
    model6.fit(train_set[feature_set].as_matrix(), train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x, train_set['y'].as_matrix())
               )
    test_set['predictY'] = model6.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder + '/model6_offline.csv')
    test1_set['predictY'] = model6.predict(test1_set[feature_set].as_matrix())
    test1_set.to_csv('result/' + feature_set_folder + '/model6_offline1.csv')

    pass


def submit(feature_files, training_dates, feature_set_folder):
    # Online run: train on the 2016-06-01 / 2016-05-25 windows, predict July.
    train_set = pd.concat(
        [dfs(0, len(feature_files), feature_files + ['y'], 'dataset/' + date) for date in training_dates])
    test_set = dfs(0, len(feature_files), feature_files + ['y'], 'dataset/2016-07-01')

    '''
    unique_size = pd.read_csv('unique_size.csv')
    train_set = pd.merge(train_set, unique_size, how='left')
    train_set = train_set[train_set.unique_size > 1]
    train_set.drop(['unique_size'], axis=1, inplace=True)
    '''

    train_set = train_set.fillna(-1, downcast='infer')
    test_set = test_set.fillna(-1, downcast='infer')

    train_set['y_log'] = train_set['y'].apply(lambda x: np.log(1 + x))
    test_set['y_log'] = test_set['y'].apply(lambda x: np.log(1 + x))

    feature_set = filter(
        lambda x: x not in ['y', 'time', 'province', 'market', 'name', 'type', 'y_log'],
        train_set.columns)

    scaler = StandardScaler()
    scaler.fit(train_set[feature_set].as_matrix())

    # model1
    model1 = LinearRegression(normalize=True)
    model1.fit(scaler.transform(train_set[feature_set].as_matrix()), train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x, train_set['y'].as_matrix())
               )
    print zip(feature_set, model1.coef_)
    test_set['predictY'] = model1.predict(scaler.transform(test_set[feature_set].as_matrix()))
    test_set.to_csv('result/' + feature_set_folder + '/model1_online.csv')

    # model2
    model2 = XGBRegressor(n_estimators=600, learning_rate=0.01, max_depth=6, colsample_bytree=0.7, subsample=0.7,
                          colsample_bylevel=0.7)
    model2.fit(train_set[feature_set].as_matrix(), train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x, train_set['y'].as_matrix())
               )
    test_set['predictY'] = model2.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder + '/model2_online.csv')

    # model3
    model3 = LinearSVR(tol=1e-7)
    model3.fit(scaler.transform(train_set[feature_set].as_matrix()), train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x, train_set['y'].as_matrix())
               )
    test_set['predictY'] = model3.predict(scaler.transform(test_set[feature_set].as_matrix()))
    test_set.to_csv('result/' + feature_set_folder + '/model3_offline.csv')

    # model4
    model4 = RandomForestRegressor(n_estimators=1000, max_depth=7, max_features=0.2, max_leaf_nodes=100)
    model4.fit(train_set[feature_set].as_matrix(), train_set['y'].as_matrix(),
               sample_weight=np.array(map(lambda x: 1.0 / x / x, train_set['y'].as_matrix()))
               )
    test_set['predictY'] = model4.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder + '/model4_online.csv')

    # model5
    model5 = XGBRegressor(n_estimators=600, learning_rate=0.01, max_depth=6, colsample_bytree=0.7, subsample=0.7,
                          colsample_bylevel=0.7, seed=10000)
    model5.fit(train_set[feature_set].as_matrix(), train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x, train_set['y'].as_matrix())
               )
    test_set['predictY'] = model5.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder + '/model5_online.csv')

    # model6
    model6 = XGBRegressor(n_estimators=600, learning_rate=0.01, max_depth=5, colsample_bytree=0.7, subsample=0.7,
                          colsample_bylevel=0.7)
    model6.fit(train_set[feature_set].as_matrix(), train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x, train_set['y'].as_matrix())
               )
    test_set['predictY'] = model6.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder + '/model6_online.csv')

    pass


run(['v1', 'v3', 'v11'], ['2016-05-01', '2016-04-25'], 'feature_set1')
# run(['v1', 'v2','v3', 'v11'], ['2016-05-01', '2016-04-25'], 'feature_set2')
# submit(['v1', 'v3', 'v11'], ['2016-06-01', '2016-05-25'], 'feature_set1')
submit(['v1', 'v2', 'v3', 'v11'], ['2016-06-01', '2016-05-25'], 'feature_set2')
--------------------------------------------------------------------------------
frame.sort_values(by=['time'], inplace=True) 77 | frame_final = pd.read_csv(folder + '/' + 'y.csv') 78 | frame = frame[frame.time < date_begin] 79 | frame_group = frame.groupby(['province', 'market', 'type', 'name'], as_index=False) 80 | for day in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]: 81 | frame1 = frame_group['avgprice'].agg( 82 | {'_' + str(day) + 'day_exists_avg': lambda x: np.mean(x.as_matrix()[-day:]), 83 | '_' + str(day) + 'day_exists_std': lambda x: np.std(x.as_matrix()[-day:]), 84 | '_' + str(day) + 'day_exists_min': lambda x: np.min(x.as_matrix()[-day:]), 85 | }) 86 | frame_final = pd.merge(frame_final, frame1, how='left', on=['province', 'market', 'type', 'name']) 87 | frame_final.drop(['y'], axis=1, inplace=True) 88 | frame_final.to_csv(folder + '/' + 'v3.csv', index=False) 89 | pass 90 | 91 | 92 | def makeV4(date_begin, folder): 93 | frame = pd.read_csv('data/farming.csv') 94 | frame_final = pd.read_csv(folder + '/' + 'y.csv') 95 | for day in [1, 2, 3, 4, 7, 14, 21, 30, 60]: 96 | date1 = tools.move_day(date_begin, -day) 97 | frame1 = frame[(frame.time >= date1) & (frame.time < date_begin)] 98 | frame1 = frame1.groupby(['province', 'market', 'type', 'name'], as_index=False)['avgprice'].agg( 99 | {'_' + str(day) + 'day_size': lambda x: len(x), 100 | }) 101 | frame_final = pd.merge(frame_final, frame1, how='left', on=['province', 'market', 'type', 'name']) 102 | frame_final.drop(['y'], axis=1, inplace=True) 103 | frame_final.to_csv(folder + '/' + 'v4.csv', index=False) 104 | pass 105 | 106 | 107 | def makeV5(date_begin, folder): 108 | frame = pd.read_csv('data/farming.csv') 109 | frame.sort_values(by=['time'], inplace=True) 110 | frame['minprice'] = frame['minprice'].apply(lambda x: None if x == 0.0 else x) 111 | frame_final = pd.read_csv(folder + '/' + 'y.csv') 112 | frame = frame[frame.time < date_begin] 113 | frame_group = frame.groupby(['province', 'market', 'type', 'name'], as_index=False) 114 | for day in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]: 115 | frame1 = frame_group['minprice'].agg( 116 | {'_' + str(day) + 'day_exists_avg_min': lambda x: np.mean(x.as_matrix()[-day:]), 117 | '_' + str(day) + 'day_exists_std_min': lambda x: np.std(x.as_matrix()[-day:]), 118 | '_' + str(day) + 'day_exists_min_min': lambda x: np.min(x.as_matrix()[-day:]), 119 | }) 120 | frame_final = pd.merge(frame_final, frame1, how='left', on=['province', 'market', 'type', 'name']) 121 | frame_final.drop(['y'], axis=1, inplace=True) 122 | frame_final.to_csv(folder + '/' + 'v5.csv', index=False) 123 | pass 124 | 125 | 126 | def makeV6(date_begin, folder): 127 | frame = pd.read_csv('data/farming.csv') 128 | frame.sort_values(by=['time'], inplace=True) 129 | frame['maxprice'] = frame['maxprice'].apply(lambda x: None if x == 0.0 else x) 130 | frame_final = pd.read_csv(folder + '/' + 'y.csv') 131 | frame = frame[frame.time < date_begin] 132 | frame_group = frame.groupby(['province', 'market', 'type', 'name'], as_index=False) 133 | for day in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]: 134 | frame1 = frame_group['maxprice'].agg( 135 | {'_' + str(day) + 'day_exists_avg_max': lambda x: np.mean(x.as_matrix()[-day:]), 136 | '_' + str(day) + 'day_exists_std_max': lambda x: np.std(x.as_matrix()[-day:]), 137 | '_' + str(day) + 'day_exists_min_max': lambda x: np.min(x.as_matrix()[-day:]), 138 | }) 139 | frame_final = pd.merge(frame_final, frame1, how='left', on=['province', 'market', 'type', 'name']) 140 | frame_final.drop(['y'], axis=1, inplace=True) 141 | frame_final.to_csv(folder + '/' + 'v6.csv', index=False) 142 | pass 143 | 144 | 145 
# Feature V7: per calendar window, the avgprice range (max - min) and the
# second / third smallest values (via the _second_min / _third_min helpers).
def makeV7(date_begin, folder):
    frame = pd.read_csv('data/farming.csv')
    frame_final = pd.read_csv(folder + '/' + 'y.csv')
    for day in [1, 2, 3, 4, 7, 14, 21, 30, 60]:
        date1 = tools.move_day(date_begin, -day)
        frame1 = frame[(frame.time >= date1) & (frame.time < date_begin)]
        frame1 = frame1.groupby(['province', 'market', 'type', 'name'], as_index=False)['avgprice'].agg(
            {'_' + str(day) + 'day_offset': lambda x: np.max(x) - np.min(x),
             '_' + str(day) + 'day_min2': _second_min,
             '_' + str(day) + 'day_min3': _third_min,
             })
        frame_final = pd.merge(frame_final, frame1, how='left', on=['province', 'market', 'type', 'name'])
    frame_final.drop(['y'], axis=1, inplace=True)
    frame_final.to_csv(folder + '/' + 'v7.csv', index=False)
    pass


# Feature V8: windowed avgprice mean grouped only by (name, type), i.e.
# pooled across all markets and provinces.
def makeV8(date_begin, folder):
    frame = pd.read_csv('data/farming.csv')
    frame_final = pd.read_csv(folder + '/' + 'y.csv')
    for day in [1, 2, 3, 4, 7, 14, 21, 30, 60]:
        date1 = tools.move_day(date_begin, -day)
        frame1 = frame[(frame.time >= date1) & (frame.time < date_begin)]
        frame1 = frame1.groupby(['name', 'type'], as_index=False)['avgprice'].agg(
            {'_' + str(day) + 'day_avg': np.mean,
             })
        frame_final = pd.merge(frame_final, frame1, how='left', on=['type', 'name'])
    frame_final.drop(['y'], axis=1, inplace=True)
    frame_final.to_csv(folder + '/' + 'v8.csv', index=False)
    pass


# Feature V9: mean avgprice over two earlier windows.
def makeV9(date_begin, folder):
    frame = pd.read_csv('data/farming.csv')
    frame_final = pd.read_csv(folder + '/' + 'y.csv')

    # mean over days [-30, -15) relative to date_begin
    date1 = tools.move_day(date_begin, -30)
    date2 = tools.move_day(date_begin, -15)
    frame1 = frame[(frame.time >= date1) & (frame.time < date2)]
    frame1 = frame1.groupby(['province', 'market', 'type', 'name'], as_index=False)['avgprice'].agg(
        {'_half_month_ago_day_avg': np.mean, })
    frame_final = pd.merge(frame_final, frame1, how='left', on=['province', 'market', 'type', 'name'])

    # mean over days [-60, -30) relative to date_begin
    date1 = tools.move_day(date_begin, -60)
    date2 = tools.move_day(date_begin, -30)
    frame1 = frame[(frame.time >= date1) & (frame.time < date2)]
    frame1 = frame1.groupby(['province', 'market', 'type', 'name'], as_index=False)['avgprice'].agg(
        {'_one_month_ago_day_avg': np.mean, })
    frame_final = pd.merge(frame_final, frame1, how='left', on=['province', 'market', 'type', 'name'])
    frame_final.drop(['y'], axis=1, inplace=True)
    frame_final.to_csv(folder + '/' + 'v9.csv', index=False)


# Feature V10: one-hot encoding of the product type.
def makeV10(date_begin, folder):
    frame = pd.read_csv('data/farming.csv')
    frame_final = pd.read_csv(folder + '/' + 'y.csv')
    TYPES = frame['type'].drop_duplicates().as_matrix()
    for i in xrange(len(TYPES)):
        frame_final['_is_type' + str(i)] = frame_final['type'].apply(lambda x: 1 if x == TYPES[i] else 0)
    frame_final.drop(['y'], axis=1, inplace=True)
    frame_final.to_csv(folder + '/' + 'v10.csv', index=False)
    pass

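# NOTE: makeV10 is a hand-rolled one-hot encoding of `type`. Assuming the
# exact `_is_type<i>` column names are not load-bearing downstream, the same
# columns could be built in one call (sketch only):
#
#     dummies = pd.get_dummies(frame_final['type'], prefix='_is_type')
#     frame_final = pd.concat([frame_final, dummies], axis=1)
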
# Feature V11: how many days each target date lies after date_begin.
def makeV11(date_begin, folder):
    frame_final = pd.read_csv(folder + '/' + 'y.csv')
    frame_final['time_diff'] = frame_final['time'].apply(lambda x: tools.time_diff(x, date_begin))
    frame_final.drop(['y'], axis=1, inplace=True)
    frame_final.to_csv(folder + '/' + 'v11.csv', index=False)
    pass


# Feature V12: log-scaled version of the V11 day offset.
def makeV12(date_begin, folder):
    frame_final = pd.read_csv(folder + '/' + 'y.csv')
    frame_final['time_diff'] = frame_final['time'].apply(lambda x: np.log(tools.time_diff(x, date_begin)))
    frame_final.drop(['y'], axis=1, inplace=True)
    frame_final.to_csv(folder + '/' + 'v12.csv', index=False)
    pass


# Feature V13: fill missing short-window stats in v1 with the value of the
# next longer window.
def makeV13(date_begin, folder):
    frame_final = pd.read_csv(folder + '/' + 'v1.csv')
    DAYS = [60, 30, 21, 14, 7, 4, 3, 2, 1]
    for i in xrange(len(DAYS) - 1):
        frame_final['_' + str(DAYS[i + 1]) + 'day_avg'] = frame_final.apply(
            lambda x: x['_' + str(DAYS[i]) + 'day_avg'] if np.isnan(x['_' + str(DAYS[i + 1]) + 'day_avg']) else
            x['_' + str(DAYS[i + 1]) + 'day_avg'], axis=1
        )
        frame_final['_' + str(DAYS[i + 1]) + 'day_min'] = frame_final.apply(
            lambda x: x['_' + str(DAYS[i]) + 'day_min'] if np.isnan(x['_' + str(DAYS[i + 1]) + 'day_min']) else
            x['_' + str(DAYS[i + 1]) + 'day_min'], axis=1
        )
    frame_final.to_csv(folder + '/' + 'v13.csv', index=False)
    pass


# Build the feature files for every cut-off date; each date gets its own
# dataset/<date>/ folder.
DATES = ['2016-06-01', '2016-05-25', '2016-05-01', '2016-04-25', '2016-07-01', '2015-06-01', '2015-07-01',
         '2016-04-01', '2016-03-01', '2016-05-15', '2016-05-20', '2016-04-15', '2016-04-20']
FOLDER = []

for i in xrange(len(DATES)):
    FOLDER.append('dataset/' + DATES[i])
    if not os.path.exists(FOLDER[i]):
        os.mkdir(FOLDER[i])
    makeY(DATES[i], FOLDER[i])
    makeV1(DATES[i], FOLDER[i])
    makeV2(DATES[i], FOLDER[i])
    makeV3(DATES[i], FOLDER[i])
    # makeV4(DATES[i], FOLDER[i])
    # makeV5(DATES[i], FOLDER[i])
    # makeV6(DATES[i], FOLDER[i])
    # makeV7(DATES[i], FOLDER[i])
    # makeV8(DATES[i], FOLDER[i])
    # makeV9(DATES[i], FOLDER[i])
    # makeV10(DATES[i], FOLDER[i])
    makeV11(DATES[i], FOLDER[i])
    # makeV12(DATES[i], FOLDER[i])
    # makeV13(DATES[i], FOLDER[i])
--------------------------------------------------------------------------------