├── README.md
├── model.py
└── 榜4冠军.png

/README.md:
--------------------------------------------------------------------------------
# Fengjr (凤凰金融) Quantitative Investment Competition

### Code of the Stage 4 champion of the preliminary round

![](./榜4冠军.png)

### How to run
```bash
python model.py
```
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Run in Python 3.6
# Author: Gu Quan
# All rights reserved.

import numpy as np
import pandas as pd
import os
import lightgbm as lgb


# -----------------------------------
# 1. Read all daily files of a round in one batch
def readRoundFiles(round_dir):
    file_list = os.listdir(round_dir)
    def readDayFile(filename):
        data = pd.read_csv(round_dir + filename)
        data["day"] = int(filename.split(".")[0])
        return data
    data = pd.concat([readDayFile(f) for f in file_list]).reset_index(drop=True)
    data = data.sort_values(["code","day"]).reset_index(drop=True)
    # Treat a one-day drop of 11% or more as an ex-right (dividend/split) day and
    # rebuild an adjusted close series "close_ori" via a cumulative adjustment factor.
    data["yesterday"] = data.groupby("code")["close"].shift(1)
    data["ex_right"] = (data.close / data.yesterday - 1) <= -0.11
    data["ex_prod"] = np.where(data.ex_right == True, data.yesterday / data.close, 1)
    data.ex_prod = data.groupby("code")["ex_prod"].cumprod()
    data["close_ori"] = data.close * data.ex_prod
    return data


# -----------------------------------
# 2. Build rawX for the training window defined by start_day and end_day
def getRawX(data, start_day, end_day, min_days_cnt, target_day=None):
    '''
    target_day: used by later functions; for each stock the return label is
    computed from the close prices on end_day and target_day
    '''
    stockX1 = data.code[data.day == end_day].unique()
    if target_day:
        stockX2 = data.code[data.day == target_day].unique()
        stockX = np.intersect1d(stockX1, stockX2)
    else:
        stockX = stockX1
    dataX = data[(data.code.isin(stockX)) & (data.day >= start_day) & (data.day <= end_day)]
    dataX.loc[:, "day_cnt"] = dataX.groupby("code")["day"].transform("count")
    dataX = dataX[dataX.day_cnt >= min_days_cnt]
    # Cross join codes x days so every stock has one row per trading day,
    # then fill the missing rows by interpolation / backfill.
    stocks = pd.DataFrame({"key":1, "code": dataX.code.unique()})
    days = pd.DataFrame({"key":1, "day": np.array(range(start_day, end_day+1))})
    stock_day = pd.merge(stocks, days, how="outer").drop(columns="key")
    dataX = pd.merge(dataX, stock_day, how="right", on=["code","day"])
    dataX = dataX[["code","day","close","close_ori","yesterday","ex_right","day_cnt"] +
                  ["f"+str(i) for i in range(1, 87+1)]]
    dataX = dataX.sort_values(["code","day"]).reset_index(drop=True)
    dataX = dataX.groupby("code").apply(lambda g: g.interpolate(method='linear'))
    dataX = dataX.fillna(method="backfill")
    # Replace inf values with the largest finite value of the same column within the same stock.
    def replaceInf(col):
        repl = col[col!=np.inf].max()
        col[col==np.inf] = repl
        return col
    inf_cols = (dataX==np.inf).sum()
    inf_cols = list(inf_cols[inf_cols>0].index)
    inf_rows = False
    for col in inf_cols:
        inf_rows = (dataX[col]==np.inf) | inf_rows
    inf_code = dataX.code[inf_rows].unique()
    dataX.loc[dataX.code.isin(inf_code), inf_cols] = \
        dataX.loc[dataX.code.isin(inf_code), ["code"] + inf_cols].\
        groupby("code").apply(lambda g: g[inf_cols].apply(replaceInf))
    return dataX


# Each round's data yields two training sets, which enlarges the sample
def getTrainData(round_dir):
    data = readRoundFiles(round_dir)
    rawX1 = getRawX(data, start_day=1, end_day=244, min_days_cnt=135, target_day=366)
    rawX2 = getRawX(data, start_day=123, end_day=366, min_days_cnt=135, target_day=488)
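    # Window 1 spans days 1-244 with labels taken at day 366; window 2 spans
    # days 123-366 with labels taken at day 488, i.e. two overlapping, roughly
    # one-year windows that each predict the return about 122 trading days ahead.
    # Features and labels are built per window and joined on the stock code below.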
    feas_X1 = quickFeaX(rawX1, end_day=244)
    feas_X2 = quickFeaX(rawX2, end_day=366)
    y1_df = getY(data, rawX1.code.unique(), X_end_day=244, target_day=366)
    y2_df = getY(data, rawX2.code.unique(), X_end_day=366, target_day=488)
    train1 = y1_df.merge(feas_X1, on="code").rename(columns={"rate": "y"})
    train2 = y2_df.merge(feas_X2, on="code").rename(columns={"rate": "y"})
    return train1, train2


# -----------------------------------
# 3. Feature engineering
def joinFeas(*args):
    assert len(args)>=2
    features = args[0]
    for fea in args[1:]:
        features = features.merge(fea, how="left", on="code")
    return features

# Constructors for each family of features
def closeMean(rawX, end_day, last_days_num):
    rawX = rawX.loc[(rawX.day >= end_day-last_days_num+1) & (rawX.day <= end_day), ["code","day","close_ori"]]
    mean = rawX.groupby("code")["close_ori"].mean().to_frame()
    mean.loc[:, "code"] = mean.index.values
    mean.columns.values[0] = "mean_close_last" + str(last_days_num)
    res = mean.iloc[:,[1,0]].reset_index(drop=True)
    return res

def RSV(rawX, end_day, last_days_num, days_in_compute=244):
    rawX = rawX[["code","day","close_ori"]]
    fea = pd.DataFrame({"code": rawX.code.unique()})
    fea.loc[:, "min_close"] = rawX.loc[(rawX.day >= end_day-days_in_compute+1) & (rawX.day <= end_day), ].\
        groupby("code")["close_ori"].min().values
    fea.loc[:, "max_close"] = rawX.loc[(rawX.day >= end_day-days_in_compute+1) & (rawX.day <= end_day), ].\
        groupby("code")["close_ori"].max().values
    fea.loc[:, "mean_close"] = rawX.loc[(rawX.day >= end_day-last_days_num+1) & (rawX.day <= end_day), ].\
        groupby("code")["close_ori"].mean().values
    fea.loc[:, "rsv_last"+str(last_days_num)+"_in"+str(days_in_compute)] = \
        (fea.mean_close - fea.min_close) / (fea.max_close - fea.min_close)
    return fea[["code", "rsv_last"+str(last_days_num)+"_in"+str(days_in_compute)]]

def rateStat(rawX, end_day, last_days_num, method="mean", days_before=122):
    rawX = rawX[["code","day","close_ori"]]
    rawX.loc[:, "close_bf"] = rawX.groupby("code")["close_ori"].shift(days_before)
    rawX.loc[:, "rate"] = rawX.close_ori / rawX.close_bf - 1
    if method == "mean":
        res = rawX.loc[(rawX.day >= end_day-last_days_num+1) & (rawX.day <= end_day), :].\
            groupby("code")["rate"].mean().to_frame()
    if method == "std":
        res = rawX.loc[(rawX.day >= end_day-last_days_num+1) & (rawX.day <= end_day), :].\
            groupby("code")["rate"].std().to_frame()
    res.loc[:, "code"] = res.index.values
    res.columns.values[0] = method + "_rate_last" + str(last_days_num)
    res = res.iloc[:,[1,0]].reset_index(drop=True)
    return res

def rateRise(rawX, end_day, last_days_num):
    rawX = rawX[["code","day","close_ori"]]
    start = rawX[rawX.day == end_day-last_days_num+1].rename(columns={"close_ori":"start"}).reset_index(drop=True)
    end = rawX[rawX.day == end_day].drop(columns="day").rename(columns={"close_ori":"end"}).reset_index(drop=True)
    res = start.merge(end, on="code")
    res.loc[:, "rise_last"+str(last_days_num)] = res.end / res.start - 1
    res = res[["code", "rise_last"+str(last_days_num)]]
    return res

def RSI(rawX, end_day, last_days_num):
    rawX = rawX[["code","day","close_ori"]]
    rawX.loc[:, "ystday"] = rawX.groupby("code")["close_ori"].shift(1)
    rawX = rawX[(rawX.day >= end_day-last_days_num+1) & (rawX.day <= end_day)]
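    # Count-based RSI proxy: the ratio of up days to down days within the window
    # (a small constant avoids division by zero), rather than the classical
    # average-gain / average-loss RSI.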
    rawX.loc[:, "is_up"] = np.where(rawX.close_ori - rawX.ystday > 0, 1, 0)
    rawX.loc[:, "is_down"] = np.where(rawX.close_ori - rawX.ystday < 0, 1, 0)
    res = rawX.groupby("code")[["is_up","is_down"]].sum()
    res.loc[:, "code"] = res.index.values
    res.loc[:, "rsi_last"+str(last_days_num)] = res.is_up / (res.is_down + 1e-2)
    res = res[["code", "rsi_last"+str(last_days_num)]].reset_index(drop=True)
    return res

def exrightStat(rawX, end_day, days_in_compute=244):
    rawX = rawX[["code","day","close","yesterday","ex_right"]]
    rawX["ex_prod"] = np.where(rawX.ex_right == True, rawX.yesterday / rawX.close, 1)
    rawX = rawX[(rawX.day >= end_day-days_in_compute+1) & (rawX.day <= end_day)]
    fea = pd.DataFrame({"code": rawX.code.unique()})
    fea.loc[:, "cnt_ex_in"+str(days_in_compute)] = rawX.groupby("code")["ex_right"].sum().values
    fea.loc[:, "mean_exprod_in"+str(days_in_compute)] = rawX.groupby("code")["ex_prod"].mean().values
    return fea

def feaFi(rawX, end_day, last_days_num=244):
    rawX = rawX.loc[(rawX.day >= end_day-last_days_num+1) & (rawX.day <= end_day),
                    ["code","day"] + ["f"+str(i) for i in range(1,87+1)]]
    res = rawX.groupby("code").mean().drop(columns="day")
    res.loc[:, "code"] = res.index.values
    return res.reset_index(drop=True)

# Assemble all of the features
def quickFeaX(rawXi, end_day):
    fea_Xi_closeMean = joinFeas(closeMean(rawXi, end_day, 5), closeMean(rawXi, end_day, 20), closeMean(rawXi, end_day, 50),
                                closeMean(rawXi, end_day, 80), closeMean(rawXi, end_day, 122), closeMean(rawXi, end_day, 244))
    fea_Xi_RSV = joinFeas(RSV(rawXi, end_day, 1), RSV(rawXi, end_day, 5), RSV(rawXi, end_day, 10))
    fea_Xi_rateStat = joinFeas(rateStat(rawXi, end_day, 10, "mean"), rateStat(rawXi, end_day, 30, "mean"),
                               rateStat(rawXi, end_day, 60, "mean"), rateStat(rawXi, end_day, 122, "mean"),
                               rateStat(rawXi, end_day, 10, "std"), rateStat(rawXi, end_day, 30, "std"),
                               rateStat(rawXi, end_day, 60, "std"), rateStat(rawXi, end_day, 122, "std"))
    fea_Xi_rateRise = joinFeas(rateRise(rawXi, end_day, 20), rateRise(rawXi, end_day, 50), rateRise(rawXi, end_day, 80),
                               rateRise(rawXi, end_day, 122), rateRise(rawXi, end_day, 244))
    fea_Xi_RSI = joinFeas(RSI(rawXi, end_day, 20), RSI(rawXi, end_day, 50), RSI(rawXi, end_day, 80),
                          RSI(rawXi, end_day, 122), RSI(rawXi, end_day, 244))
    fea_Xi_ex = exrightStat(rawXi, end_day)
    fea_Xi_fi = feaFi(rawXi, end_day)
    feas_Xi = joinFeas(fea_Xi_closeMean, fea_Xi_RSV, fea_Xi_rateStat, fea_Xi_rateRise, fea_Xi_RSI, fea_Xi_ex, fea_Xi_fi)
    return feas_Xi


# -----------------------------------
# 5. Compute the labels: forward return from X_end_day to target_day
def getY(data, codes, X_end_day, target_day):
    dataY = data.loc[data.code.isin(codes), ["code","day","close_ori"]]
    dataY = dataY[dataY.day.isin([X_end_day, target_day])].sort_values(["code","day"]).reset_index(drop=True)
    dataY["close_tg"] = dataY.groupby("code")["close_ori"].shift(-1)
    dataY["rate"] = dataY.close_tg / dataY.close_ori - 1
    res = dataY.loc[dataY.day==X_end_day, ["code","rate"]].reset_index(drop=True)
    return res


# -----------------------------------
# 6. Offline model evaluation
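# myEval / myLgbEval score a prediction by how well its 20 top-ranked stocks do:
# take the mean true return of the 20 stocks ranked highest by the prediction and
# rescale it between the worst-possible 20 (score 0) and the best-possible 20 (score 1).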
def myEval(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    min20 = y_true[np.argsort(y_true)[:20]]
    max20 = y_true[np.argsort(y_true)[-20:][::-1]]
    pred20 = y_true[np.argsort(y_pred)[-20:][::-1]]
    print(min20.mean(), max20.mean(), pred20.mean())
    res = (pred20.mean() - min20.mean()) / (max20.mean() - min20.mean())
    return res


def myLgbEval(y_pred, train_data):
    y_true = train_data.get_label()
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    min20 = y_true[np.argsort(y_true)[:20]]
    max20 = y_true[np.argsort(y_true)[-20:][::-1]]
    pred20 = y_true[np.argsort(y_pred)[-20:][::-1]]
    res = (pred20.mean() - min20.mean()) / (max20.mean() - min20.mean())
    return 'eval', res, True  # (eval_name, eval_result, is_higher_better) as expected by LightGBM's feval

def lgbImp(model):
    assert "train1_rB" in globals()
    res = pd.DataFrame({"feature": train1_rB.columns[2:], "imp": model.feature_importance("split")})
    res = res.sort_values("imp", ascending=False).reset_index(drop=True)
    return res


# The whole pipeline runs fairly quickly
if __name__ == '__main__':

    # Make sure the current directory is the parent directory of "data" !!!
    # os.chdir(r"xx/xx/xx/凤凰金融")
    train1_r4, train2_r4 = getTrainData(r"./data/round4/")
    train1_rB, train2_rB = getTrainData(r"./data/roundB/")

    # The later training window (train2) gets double weight
    train_final = pd.concat([train1_r4, train1_rB, train2_r4, train2_rB]).reset_index(drop=True)
    weight_final = np.array([1]*len(train1_r4) + [1]*len(train1_rB) + [2]*len(train2_r4) + [2]*len(train2_rB))
    lgb_dat = lgb.Dataset(train_final.iloc[:,2:], train_final.y, weight=weight_final)

    lgb_params = {
        'boosting_type': 'gbdt',
        'application': 'regression',
        'metric': 'mae',
        'num_leaves': 61,
        'max_depth': 12,
        'learning_rate': 0.01,
        'verbose': 1,
        'seed': 2018,
    }
    # Train the LightGBM model
    lgb_model = lgb.train(lgb_params, lgb_dat, num_boost_round=100, verbose_eval=10,
                          valid_sets=lgb_dat, early_stopping_rounds=None, feval=myLgbEval)

    # Build the round-4 test features from the most recent window (days 245-488)
    data_r4 = readRoundFiles("./data/round4/")
    rawX3_r4 = getRawX(data_r4, start_day=245, end_day=488, min_days_cnt=135, target_day=None)
    test_r4 = quickFeaX(rawX3_r4, end_day=488)

    lgb_pred = lgb_model.predict(test_r4.iloc[:,1:])
    lgb_pred = pd.DataFrame({"code": test_r4.code, "rate": lgb_pred}).sort_values("rate", ascending=False)
    submit = lgb_pred.code[:20]  # Round 4 submission; results may differ slightly because of the random seed.
    print("--------- Submit Results ---------")
    print(submit)
--------------------------------------------------------------------------------
/榜4冠军.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZJUguquan/FengjrCompetitionPrelim/a3ebd32c1fd2599024a4336441107e28ee1568be/榜4冠军.png
--------------------------------------------------------------------------------