├── README.md ├── barra_generate.py ├── configs.py ├── portfolio_optimization.py ├── requirements.txt └── tools ├── __init__.py ├── barra_calculator.py ├── barra_optimizer.py ├── factor_utils.py ├── file_utils.py └── price_data.py /README.md: -------------------------------------------------------------------------------- 1 | # Barra 2 | 1. 计算Barra因子及其收益率 3 | 2. 进行持仓分析 4 | 3. 利用Barra进行投资组合优化 5 | -------------------------------------------------------------------------------- /barra_generate.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Mar 9 09:33:10 2022 4 | 5 | @author: PCXU 6 | """ 7 | 8 | 9 | import os 10 | 11 | from matplotlib.backends.backend_pdf import PdfPages 12 | from matplotlib import pyplot as plt 13 | from matplotlib import ticker 14 | from tqdm import tqdm 15 | import pandas as pd 16 | import numpy as np 17 | 18 | import configs as cfg 19 | from tools import * 20 | 21 | plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签 22 | plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号 23 | 24 | 25 | def get_pos_expose(pos_data: pd.DataFrame, 26 | asset_expose: pd.DataFrame, 27 | industry_df: pd.DataFrame = None) -> pd.DataFrame: 28 | """ 29 | 获取持仓在Barra风格因子上的暴露 30 | 31 | Parameters 32 | ---------- 33 | pos_data : pd.DataFrame, index=(datetime, asset) 34 | 持仓信息,至少包含weight一列 35 | asset_expose : pd.DataFrame, index=(datetime, asset), columns=style_factor 36 | 个股在Barra风格因子上的暴露信息 37 | industry_df: pd.DataFrame, optional, index=asset, columns=industry_name 38 | 行业分类信息,若不为None,则返回在国家、行业和风格因子上的暴露, 39 | 否则,仅返回风格因子暴露 40 | The default is None. 
def get_his_q(df: pd.DataFrame,
              min_window: int = 252) -> pd.DataFrame:
    """
    Return the expanding historical quantile of every value in ``df``.

    For the row at position ``i`` (0-based) and each column, the result is
    the fraction of the strictly earlier rows whose value is strictly below
    the current value, i.e. ``(history < current).sum() / i``.

    Parameters
    ----------
    df : pd.DataFrame, index=datetime
        Frame whose values are ranked against their own history.
    min_window : int, optional
        Rows at positions smaller than this are left empty (NaN).
        The default is 252.

    Returns
    -------
    pd.DataFrame
        Same index and columns as ``df``; cells are filled one at a time,
        so the frame keeps the object dtype it was created with.

    """
    dates = df.index
    columns = df.columns
    quantiles = pd.DataFrame(index=dates, columns=columns)

    for pos, today in enumerate(dates):
        # Not enough history yet: keep the row empty.
        if pos < min_window:
            continue
        previous_day = dates[pos - 1]
        for col in columns:
            # Label-based slice: everything up to and including yesterday.
            history = df.loc[:previous_day, col]
            current = df.loc[today, col]
            quantiles.loc[today, col] = (history < current).sum() / pos
    return quantiles
PriceDataBase(pd.read_pickle(os.path.join( 145 | cfg.DATA_ROOT_PATH, 'daily_info.pkl'))) 146 | trade_day = pd.read_pickle(cfg.DATA_ROOT_PATH+"trade_day.pkl").index 147 | 148 | # # 财报基本面数据 149 | # net_income, oper_revenue = load_quarter_data(cfg.QUARTER_DATA, pd.date_range( 150 | # start='20040331', end='20210930', freq='3M'), ticker_list) 151 | # net_income_TTM = get_TTM(net_income) 152 | # oper_revenue_TTM = get_TTM(oper_revenue) 153 | # quarter_data = pd.concat( 154 | # [oper_revenue_TTM.stack(), net_income_TTM.stack()], axis=1, sort=True) 155 | # quarter_data.columns = ['oper_revenue_TTM', 'net_income_TTM'] 156 | # quarter_data.index.names = ['datetime', 'asset'] 157 | # quarter_data.to_pickle(os.path.join( 158 | # cfg.DATA_ROOT_PATH, 'quarter_info.pkl')) 159 | 160 | quarter_data = PriceDataBase(pd.read_pickle(os.path.join( 161 | cfg.DATA_ROOT_PATH, 'quarter_info.pkl'))) 162 | 163 | market_value = daily_data.get_table('market_value') 164 | close = daily_data.get_table('close') 165 | # pb = daily_data.get_table('pb') 166 | # volume = daily_data.get_table('volume') 167 | # total_share = daily_data.get_table('total_share') 168 | 169 | ret = close.pct_change().replace([np.inf, -np.inf], 0) 170 | # net_income = quarter_data.get_table('net_income_TTM') 171 | # oper_revenue = quarter_data.get_table('oper_revenue_TTM') 172 | 173 | # # 计算二级Barra因子 174 | # BC = BarraCalculator(rf=cfg.Rf_daily, datetime=trade_day, start='20090401') 175 | # size = BC.get_size(market_value) 176 | # beta, hsigma = BC.get_beta_hsigma(ret, market_value, 252) 177 | # rstr = BC.get_rstr(ret, 504, 126, 21) 178 | # dastd = BC.get_dastd(ret, 252, 42) 179 | # cmra = BC.get_cmra(ret, 12) 180 | # nlsize = BC.get_nlsize(size**3, size) 181 | # btop = BC.get_btop(pb) 182 | # stom = BC.get_turnover(volume, total_share, 21) 183 | # stoq = BC.get_turnover(volume, total_share, 63) 184 | # stoa = BC.get_turnover(volume, total_share, 252) 185 | # egro = BC.get_growth_factor(net_income, ticker_list) 186 | # sgro 
= BC.get_growth_factor(oper_revenue, ticker_list) 187 | 188 | # factor_level2 = pd.DataFrame() 189 | # factor_level2['size'] = size.stack() 190 | # factor_level2['beta'] = beta.stack() 191 | # factor_level2['rstr'] = rstr.stack() 192 | # factor_level2['dastd'] = dastd.stack() 193 | # factor_level2['cmra'] = cmra.stack() 194 | # factor_level2['hsigma'] = hsigma.stack() 195 | # factor_level2['nlsize'] = nlsize.stack() 196 | # factor_level2['btop'] = btop.stack() 197 | # factor_level2['stom'] = stom.stack() 198 | # factor_level2['stoq'] = stoq.stack() 199 | # factor_level2['stoa'] = stoa.stack() 200 | # factor_level2['egro'] = egro.stack() 201 | # factor_level2['sgro'] = sgro.stack() 202 | # factor_level2.to_pickle(os.path.join(cfg.FACTOR_PATH, 'factor_level2.pkl')) 203 | 204 | # factor_level2 = PriceDataBase(pd.read_pickle( 205 | # os.path.join(cfg.FACTOR_PATH, 'factor_level2.pkl'))) 206 | 207 | # size = z_score(winsorize(factor_level2.get_table('size'), 3)) 208 | # beta = z_score(winsorize(factor_level2.get_table('beta'), 3)) 209 | # rstr = z_score(winsorize(factor_level2.get_table('rstr'), 3)) 210 | # dastd = z_score(winsorize(factor_level2.get_table('dastd'), 3)) 211 | # cmra = z_score(winsorize(factor_level2.get_table('cmra'), 3)) 212 | # hsigma = z_score(winsorize(factor_level2.get_table('hsigma'), 3)) 213 | # nlsize = z_score(winsorize(factor_level2.get_table('nlsize'), 3)) 214 | # btop = z_score(winsorize(factor_level2.get_table('btop'), 3)) 215 | # stom = z_score(winsorize(factor_level2.get_table('stom'), 3)) 216 | # stoq = z_score(winsorize(factor_level2.get_table('stoq'), 3)) 217 | # stoa = z_score(winsorize(factor_level2.get_table('stoa'), 3)) 218 | # egro = z_score(winsorize(factor_level2.get_table('egro'), 3)) 219 | # sgro = z_score(winsorize(factor_level2.get_table('sgro'), 3)) 220 | 221 | # # 利用二级Barra因子合成一级因子 222 | # factor_level1 = pd.DataFrame() 223 | # factor_level1['Size'] = size.stack() 224 | # factor_level1['Beta'] = beta.stack() 225 | # 
factor_level1['Momentum'] = rstr.stack() 226 | # factor_level1['Residual_Volatility'] = z_score( 227 | # 0.74*dastd+0.16*cmra+0.1*hsigma).stack() 228 | # factor_level1['Non-linear_Size'] = nlsize.stack() 229 | # factor_level1['Book-to-Price'] = btop.stack() 230 | # factor_level1['Liquidty'] = z_score(0.35*stom+0.35*stoq+0.3*stoa).stack() 231 | # factor_level1['Growth'] = z_score( 232 | # (0.24/(0.24+0.47))*egro+(0.47/(0.24+0.47))*sgro).stack() 233 | # factor_level1 = factor_level1.astype('float64') 234 | 235 | # factor_level1.to_pickle(os.path.join(cfg.FACTOR_PATH, 'factor_level1.pkl')) 236 | 237 | factor_level1 = PriceDataBase(pd.read_pickle( 238 | os.path.join(cfg.FACTOR_PATH, 'factor_level1.pkl'))) 239 | 240 | # # 分20组因子多空收益率 241 | # # 注:此处仅做空第1组,做多第20组,对因子的方向未做判断 242 | # factor_lst = ['Size', 'Beta', 'Momentum', 'Residual_Volatility', 243 | # 'Non-linear_Size', 'Book-to-Price', 'Liquidty', 'Growth'] 244 | # Barra_ret_20q = pd.DataFrame() 245 | # for f in factor_lst: 246 | # factor = factor_level1.get_table(f) 247 | # Barra_ret_20q[f] = get_factor_ret(factor, ret, market_value, quantile=20) 248 | 249 | # Barra_ret_20q.to_pickle(os.path.join(cfg.FACTOR_RETURN, 'Barra_ret_20q.pkl')) 250 | Barra_ret_20q = pd.read_pickle(os.path.join( 251 | cfg.FACTOR_RETURN, 'Barra_ret_20q.pkl')) 252 | 253 | # # 纯因子收益率 254 | # datetime = factor_level1.data.index.get_level_values( 255 | # 'datetime').unique().sort_values() 256 | # factor_name = ['country', '光電業', '其他業', '其他電子業', '化學工業', 257 | # '半導體業', '塑膠工業', '建材營造業', '橡膠工業', '水泥工業', 258 | # '汽車工業', '油電燃氣業', '玻璃陶瓷', '生技醫療業', '紡織纖維', 259 | # '航運業', '觀光事業', '貿易百貨業', '資訊服務業', '通信網路業', 260 | # '造紙工業', '金融保險業', '鋼鐵工業', '電器電纜', '電子通路業', 261 | # '電子零組件業', '電機機械', '電腦及週邊設備業', '食品工業', 262 | # 'Size', 'Beta', 'Momentum', 'Residual_Volatility', 263 | # 'Non-linear_Size', 'Book-to-Price', 'Liquidty', 'Growth'] 264 | # Barra_ret_net = pd.DataFrame(index=datetime, columns=factor_name) 265 | # for i in tqdm(datetime): 266 | # # Barra因子矩阵 267 | # 
style = factor_level1.data.loc[i, :, :] 268 | # style.index = style.index.droplevel(0) 269 | # industry = industry_df.loc[style.index, industry_df.columns[1:]] 270 | # country = pd.DataFrame( 271 | # np.ones((style.shape[0], 1)), index=style.index, columns=['country']) 272 | # X = pd.concat([country, industry, style], axis=1).values 273 | # X[~np.isfinite(X)] = 0. 274 | # # 权重矩阵 275 | # W = np.sqrt(market_value.loc[i, style.index].values) 276 | # W = np.diag(W/W.sum()) 277 | # # 约束矩阵 278 | # industry_value = industry.mul( 279 | # market_value.loc[i, style.index], axis=0).sum(axis=0) 280 | # adj_industry_weights = -1 * \ 281 | # (industry_value[:-1]/industry_value[-1]).values 282 | # C = np.eye(1+industry.shape[1]+style.shape[1]) 283 | # C = np.delete(C, industry.shape[1], axis=1) 284 | # C[industry.shape[1], 1:industry.shape[1]] = adj_industry_weights 285 | # # 纯因子投资组合权重矩阵 286 | # a = np.matmul(X, C) 287 | # b = np.matmul(np.matmul(a.T, W), a) 288 | # invb = np.linalg.inv(b) 289 | # omega = np.matmul(np.matmul(np.matmul(C, invb), a.T), W) 290 | # # 纯因子收益率 291 | # Barra_ret_net.loc[i, :] = np.matmul( 292 | # omega, ret.loc[i, style.index].fillna(0).values) 293 | 294 | # Barra_ret_net.to_pickle(os.path.join( 295 | # cfg.FACTOR_RETURN, 'Barra_ret_net.pkl')) 296 | Barra_ret_net = pd.read_pickle(os.path.join( 297 | cfg.FACTOR_RETURN, 'Barra_ret_net.pkl')) 298 | 299 | # # 20组多空分析 300 | # # 持仓信息 301 | # pos_data = pd.read_excel(os.path.join( 302 | # cfg.POSITION_PATH, 'positi...roup20.xlsx'), parse_dates=[0]) 303 | # pos_data['asset'] = pos_data['asset'].astype('str') 304 | # pos_data.index = pd.MultiIndex.from_frame(pos_data[['datetime', 'asset']]) 305 | # # 持仓暴露 306 | # pos_expose = get_pos_expose(pos_data, factor_level1.data) 307 | # pos_ret = pos_expose*Barra_ret_20q.loc[pos_expose.index] 308 | 309 | # hold_analysis = pd.DataFrame() 310 | # hold_analysis['expose'] = pos_expose.stack() 311 | # hold_analysis['return'] = pos_ret.stack() 312 | # 
hold_analysis.to_pickle(os.path.join( 313 | # cfg.POSITION_PATH, 'hold_analysis_20q.pkl')) 314 | 315 | # # 持仓分析 316 | # hold_analysis = PriceDataBase(pd.read_pickle( 317 | # os.path.join(cfg.POSITION_PATH, 'hold_analysis_20q.pkl'))) 318 | # factor_expose = hold_analysis.get_table( 319 | # 'expose').astype('float64').round(4) 320 | # factor_ret = hold_analysis.get_table('return').astype('float64').round(4) 321 | # factor_lst = factor_expose.columns 322 | # x_label = [t.strftime('%Y-%m-%d') for t in factor_expose.index] 323 | # factor_expose_q = get_his_q(factor_expose) 324 | # factor_ret_q = get_his_q(factor_ret) 325 | # with PdfPages(os.path.join(cfg.POSITION_PATH, 'hold_analysis_20q.pdf')) as pdf: 326 | # for f in factor_lst: 327 | # fig = plt.figure(figsize=(18, 6)) 328 | 329 | # ax1 = fig.add_subplot(221) 330 | # ax1.bar(x_label, factor_expose[f]) 331 | # ax1.xaxis.set_visible(False) 332 | # plt.xlim(-5, len(x_label)+5) 333 | # plt.grid(axis="y") 334 | # ax1.set_title('portfolio exposure on "{}" factor'.format(f)) 335 | 336 | # ax3 = fig.add_subplot(223) 337 | # ax3.scatter(x_label, factor_expose_q[f], s=1, c='r') 338 | # ax3.xaxis.set_major_locator( 339 | # ticker.MultipleLocator(int(len(x_label)/5))) 340 | # plt.xticks(rotation=-45) 341 | # plt.xlim(-5, len(x_label)+5) 342 | # plt.grid(axis="y") 343 | # ax3.set_title( 344 | # 'historical quantile of "{}" factor exposures'.format(f)) 345 | 346 | # ax2 = fig.add_subplot(222) 347 | # ax2.plot(x_label, (1+factor_ret[f]).cumprod()) 348 | # ax2.xaxis.set_visible(False) 349 | # plt.xlim(-5, len(x_label)+5) 350 | # plt.grid(axis="y") 351 | # ax2.set_title( 352 | # 'portfolio cumulative return on "{}" factor'.format(f)) 353 | 354 | # ax4 = fig.add_subplot(224) 355 | # ax4.scatter(x_label, factor_ret_q[f], s=1, c='r') 356 | # ax4.xaxis.set_major_locator( 357 | # ticker.MultipleLocator(int(len(x_label)/5))) 358 | # plt.xticks(rotation=-45) 359 | # plt.xlim(-5, len(x_label)+5) 360 | # plt.grid(axis="y") 361 | # 
ax4.set_title( 362 | # 'historical quantile of "{}" factor return'.format(f)) 363 | 364 | # pdf.savefig(fig) 365 | 366 | # # 纯因子收益率分析分析 367 | # # 持仓信息 368 | # pos_data = pd.read_excel(os.path.join( 369 | # cfg.POSITION_PATH, 'positi...roup20.xlsx'), parse_dates=[0]) 370 | # pos_data['asset'] = pos_data['asset'].astype('str') 371 | # pos_data.index = pd.MultiIndex.from_frame(pos_data[['datetime', 'asset']]) 372 | # # 持仓暴露 373 | # pos_expose = get_pos_expose( 374 | # pos_data, factor_level1.data, industry_df.drop('industry', axis=1)) 375 | # pos_ret = pos_expose*Barra_ret_net.loc[pos_expose.index] 376 | 377 | # hold_analysis = pd.DataFrame() 378 | # hold_analysis['expose'] = pos_expose.stack() 379 | # hold_analysis['return'] = pos_ret.stack() 380 | # hold_analysis.to_pickle(os.path.join( 381 | # cfg.POSITION_PATH, 'hold_analysis_net.pkl')) 382 | 383 | # 持仓分析 384 | hold_analysis = PriceDataBase(pd.read_pickle( 385 | os.path.join(cfg.POSITION_PATH, 'hold_analysis_net.pkl'))) 386 | factor_expose = hold_analysis.get_table( 387 | 'expose').astype('float64').round(4) 388 | factor_ret = hold_analysis.get_table('return').astype('float64').round(4) 389 | factor_lst = factor_expose.columns 390 | x_label = [t.strftime('%Y-%m-%d') for t in factor_expose.index] 391 | factor_expose_q = get_his_q(factor_expose) 392 | factor_ret_q = get_his_q(factor_ret) 393 | with PdfPages(os.path.join(cfg.POSITION_PATH, 'hold_analysis_net.pdf')) as pdf: 394 | for f in factor_lst: 395 | fig = plt.figure(figsize=(18, 6)) 396 | 397 | ax1 = fig.add_subplot(221) 398 | ax1.bar(x_label, factor_expose[f]) 399 | ax1.xaxis.set_visible(False) 400 | plt.xlim(-5, len(x_label)+5) 401 | plt.grid(axis="y") 402 | ax1.set_title('portfolio exposure on "{}" factor'.format(f)) 403 | 404 | ax3 = fig.add_subplot(223) 405 | ax3.scatter(x_label, factor_expose_q[f], s=1, c='r') 406 | ax3.xaxis.set_major_locator( 407 | ticker.MultipleLocator(int(len(x_label)/5))) 408 | plt.xticks(rotation=-45) 409 | plt.xlim(-5, 
len(x_label)+5) 410 | plt.grid(axis="y") 411 | ax3.set_title( 412 | 'historical quantile of "{}" factor exposures'.format(f)) 413 | 414 | ax2 = fig.add_subplot(222) 415 | ax2.plot(x_label, (1+factor_ret[f]).cumprod()) 416 | ax2.xaxis.set_visible(False) 417 | plt.xlim(-5, len(x_label)+5) 418 | plt.grid(axis="y") 419 | ax2.set_title( 420 | 'portfolio cumulative return on "{}" factor'.format(f)) 421 | 422 | ax4 = fig.add_subplot(224) 423 | ax4.scatter(x_label, factor_ret_q[f], s=1, c='r') 424 | ax4.xaxis.set_major_locator( 425 | ticker.MultipleLocator(int(len(x_label)/5))) 426 | plt.xticks(rotation=-45) 427 | plt.xlim(-5, len(x_label)+5) 428 | plt.grid(axis="y") 429 | ax4.set_title( 430 | 'historical quantile of "{}" factor return'.format(f)) 431 | 432 | pdf.savefig(fig) 433 | -------------------------------------------------------------------------------- /configs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Mar 9 09:22:24 2022 4 | 5 | @author: PCXU 6 | """ 7 | 8 | 9 | # 无风险利率 10 | Rf_annual = 0.02 11 | Rf_daily = (Rf_annual+1)**(1/252)-1 12 | 13 | # 根目录 14 | ROOT_PATH = '*****' 15 | # 数据根目录 16 | DATA_ROOT_PATH = ROOT_PATH+'data\\' 17 | # 价量数据 18 | PV_PATH = DATA_ROOT_PATH+'daily_prices_2005\\adj_prices.ftr' 19 | # 基本面数据 20 | FR_PATH = DATA_ROOT_PATH+'TWSE\\daily_fundamentals.pkl' 21 | # 行业数据 22 | INDUSTRY_PATH = DATA_ROOT_PATH+'industry_info.pkl' 23 | # 季度数据 24 | QUARTER_DATA = DATA_ROOT_PATH+'quarter_TWSE\\' 25 | # 因子值目录 26 | FACTOR_PATH = DATA_ROOT_PATH+'factor\\' 27 | # 因子收益率 28 | FACTOR_RETURN = FACTOR_PATH+'factor_return\\' 29 | -------------------------------------------------------------------------------- /portfolio_optimization.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Apr 7 17:28:08 2022 4 | 5 | @author: PCXU 6 | @contact: xupengchengmail@163.com 7 | """ 8 | 9 | 
if __name__ == '__main__':
    # `from tools import *` does not provide BarraOptimizer: tools/__init__.py
    # only re-exports price_data, factor_utils and file_utils. Import it
    # explicitly so the script does not fail with a NameError.
    from tools.barra_optimizer import BarraOptimizer

    # Load data.
    industry_df = pd.read_pickle(cfg.INDUSTRY_PATH)
    ticker_list = list(industry_df.index)

    daily_data = PriceDataBase(pd.read_pickle(os.path.join(
        cfg.DATA_ROOT_PATH, 'daily_info.pkl')))
    trade_day = pd.read_pickle(cfg.DATA_ROOT_PATH+"trade_day.pkl").index
    factor_level1 = PriceDataBase(pd.read_pickle(
        os.path.join(cfg.FACTOR_PATH, 'factor_level1.pkl')))

    # Expected return estimate: 21-day rolling mean of daily returns.
    close = daily_data.get_table('close')
    ret_M = close.pct_change().replace(
        [np.inf, -np.inf], 0).rolling(21).mean().dropna(axis=0, how='all')

    date = '20091001'
    BO = BarraOptimizer()
    ret = ret_M.loc[date, :]
    # Only assets with a return estimate on `date` enter the optimisation.
    assets_pool = ret[~ret.isna()].index

    # Style factors to neutralise (missing exposures are treated as 0).
    beta_style = pd.DataFrame(index=ticker_list)
    beta_style['Size'] = factor_level1.get_table('Size').loc[date, :].fillna(0)
    beta_style['Beta'] = factor_level1.get_table('Beta').loc[date, :].fillna(0)
    beta_style['Book-to-Price'] = factor_level1.get_table(
        'Book-to-Price').loc[date, :].fillna(0)

    # Pass the assembled style frame: get_weight indexes it with
    # `.loc[assets_pool, :]`, which a single-factor Series cannot satisfy.
    weight = BO.get_weight(ret=ret,
                           assets_pool=assets_pool,
                           beta_industry=industry_df.loc[:,
                                                         industry_df.columns[1:]],
                           beta_style=beta_style)
-------------------------------------------------------------------------------- /tools/barra_calculator.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Mar 18 10:18:45 2022 4 | 5 | @author: PCXU 6 | """ 7 | 8 | import statsmodels.api as sm 9 | from typing import List 10 | from tqdm import tqdm 11 | import pandas as pd 12 | import numpy as np 13 | 14 | 15 | class BarraCalculator: 16 | """ 17 | 这个类用来定义计算Barra因子的相关函数 18 | """ 19 | 20 | def __init__(self, 21 | rf: float, 22 | datetime: pd.DatetimeIndex, 23 | start: str = None, 24 | end: str = None, 25 | freq: int = 1): 26 | """ 27 | 28 | 29 | Parameters 30 | ---------- 31 | rf : float 32 | 日频无风险利率 33 | 34 | datetime : pd.DatetimeIndex 35 | 时间格式的交易日索引 36 | 37 | start : str, optional 38 | 格式:"YYYYMMdd"。The default is None. 39 | 生成因子值的开始时间,默认为尽可能充分利用datetime 40 | 41 | end : str, optional 42 | 格式:"YYYYMMdd"。The default is None. 43 | 生成因子值的结束时间,默认为尽可能充分利用datetime 44 | 45 | freq : int, optional 46 | 因子计算频率,间隔freq个交易日计算一次因子值 47 | The default is 1. 
def get_beta_hsigma(self,
                    ret: pd.DataFrame,
                    market_value: pd.DataFrame,
                    T: int) -> tuple:
    """
    Compute the beta and hsigma factors from rolling weighted regressions.

    For every date in ``self.datetime`` and every asset, the asset's excess
    return is regressed (WLS, with intercept) on the cap-weighted market
    excess return over the trailing ``T`` observations. ``beta`` is the
    slope; ``hsigma`` is the standard deviation of the regression residuals
    (no annualisation factor is applied here).

    Parameters
    ----------
    ret : pd.DataFrame
        index=datetime, columns=asset
        Raw daily asset returns. Must start at least T-1 observations
        before the first date in ``self.datetime``.
    market_value : pd.DataFrame
        index=datetime, columns=asset
        Market capitalisation, used to weight the market return.
    T : int
        Length of the regression window (number of observations).

    Returns
    -------
    tuple
        beta : pd.DataFrame
        hsigma : pd.DataFrame
        Both indexed by ``self.datetime`` with one column per asset.

    Raises
    ------
    ValueError
        If fewer than ``T`` observations are available up to a requested
        date.

    """
    # Cap-weighted market return, then excess returns over the risk-free rate.
    market_ret = (ret * market_value).sum(axis=1) / \
        market_value.sum(axis=1)
    Re = (ret - self.rf).dropna(axis=0, how='all')
    Rm = market_ret - self.rf

    # Dates on which both the asset and the market excess return exist.
    date_in_src = np.intersect1d(list(Re.index), list(Rm.index))
    beta = pd.DataFrame(index=self.datetime, columns=Re.columns)
    hsigma = pd.DataFrame(index=self.datetime, columns=Re.columns)

    # Exponentially decaying observation weights with a 63-observation
    # half-life; the most recent observation gets the largest weight.
    # The vector has exactly T entries so it always matches the window.
    alpha = 1 - np.exp(np.log(0.5) / 63)
    w = (1 - alpha) ** (T - np.arange(T))
    w = w / w.sum()

    for end in tqdm(self.datetime):
        ind = np.argwhere(date_in_src == end)[0, 0]
        first = ind - T + 1
        if first < 0:
            # A negative position would silently wrap around to the end of
            # the date array and produce a meaningless window.
            raise ValueError(
                'not enough history before {}: need {} observations, '
                'have {}'.format(end, T, ind + 1))
        # Use the same explicit dates for x and y so both always have
        # exactly T rows, even when Re and Rm do not share every date.
        window = date_in_src[first:ind + 1]
        # The regressor is identical for every asset: build it once.
        # has_constant='add' guarantees the column layout [const, market].
        X = sm.add_constant(Rm.loc[window], has_constant='add')
        for code in Re.columns:
            y = Re.loc[window, code]
            results = sm.WLS(y, X, weights=w).fit()
            # Position 1 is the market slope (position 0 is the intercept).
            beta.loc[end, code] = results.params.iloc[1]
            hsigma.loc[end, code] = results.resid.std()
    return beta, hsigma
def get_btop(self,
             pb: pd.DataFrame) -> pd.DataFrame:
    """
    Compute the book-to-price factor as the reciprocal of price-to-book.

    Parameters
    ----------
    pb : pd.DataFrame
        index=datetime, columns=asset
        Price-to-book ratios.

    Returns
    -------
    bp : pd.DataFrame
        ``1 / pb`` restricted to the dates in ``self.datetime``; infinite
        results (from a zero price-to-book) are replaced by 0.

    """
    on_schedule = pb.loc[self.datetime, :]
    book_to_price = on_schedule.rdiv(1.)
    infinities = [np.inf, -np.inf]
    return book_to_price.replace(infinities, 0)
def get_growth_factor(self,
                      f_data: pd.DataFrame,
                      asset: List) -> pd.DataFrame:
    """
    Compute a growth factor from quarterly fundamentals.

    The data is reduced to the last value of each year; over every rolling
    window of five yearly values an OLS line is fitted against 1..5 and the
    slope is divided by the window mean. The yearly result is forward-filled
    to month-end frequency, shifted by three monthly rows, expanded to a
    daily calendar and forward-filled again.

    Parameters
    ----------
    f_data : pd.DataFrame
        index=datetime, columns=asset
        Quarterly financial data (operating revenue or net income).
    asset : List
        Asset list used as the columns of the result.

    Returns
    -------
    factor : pd.DataFrame
        Growth factor on the dates in ``self.datetime``.

    """
    def slope_over_mean(window):
        # Regress the five yearly values on a constant and the trend 1..5.
        design = sm.add_constant(np.array([1, 2, 3, 4, 5]))
        fitted = sm.OLS(window, design).fit()
        return fitted.params[1]/window.mean()

    yearly = f_data.resample('Y').last()
    growth = yearly.rolling(5).apply(slope_over_mean)
    # Three-row shift at monthly frequency; presumably a publication lag
    # (NOTE(review): confirm the intended delay).
    monthly = growth.resample('M').ffill().shift(3)
    monthly = monthly.dropna(axis=0, how='all')

    # Daily calendar from the first available value to the last factor date.
    calendar = pd.date_range(start=monthly.index[0],
                             end=self.datetime[-1])
    template = pd.DataFrame(index=calendar, columns=asset)
    daily = monthly.reindex_like(template).ffill()
    return daily.loc[self.datetime, :]
def get_weight(self,
               ret: pd.Series,
               assets_pool: pd.Index,
               beta_industry: pd.DataFrame,
               beta_style: pd.DataFrame,
               sigma: pd.DataFrame = None) -> pd.DataFrame:
    """
    Find the dollar-, industry- and style-neutral weights with the highest
    expected return.

    Maximises ``ret @ w`` subject to ``sum(w) == 0``, zero exposure to every
    column of ``beta_industry`` and ``beta_style``, and
    ``-self.max_w <= w_i <= self.max_w``, using SLSQP.

    Parameters
    ----------
    ret : pd.Series
        Expected return of each asset.
    assets_pool : pd.Index
        Assets allowed in the portfolio.
    beta_industry : pd.DataFrame
        index=assets, columns=industry
        Industry exposures to neutralise.
    beta_style : pd.DataFrame
        index=assets, columns=style
        Style exposures to neutralise.
    sigma : pd.DataFrame, optional
        Asset covariance matrix. The default is None.
        Currently unused: neither ``sigma`` nor ``self.lambd`` enters the
        objective.

    Returns
    -------
    pd.DataFrame
        index=assets_pool, single column ``'weight'``. If the optimiser does
        not converge its message is printed and ``None`` is returned.

    """
    n_assets = len(assets_pool)
    if n_assets == 0:
        # Nothing to optimise.
        return pd.DataFrame(index=assets_pool, columns=['weight'],
                            dtype='float64')

    r = np.asarray(ret[assets_pool].values, dtype='float64').ravel()

    # One row per equality constraint `row @ w == 0`:
    # dollar neutrality first, then industry and style exposures.
    rows = np.hstack([
        np.ones((n_assets, 1)),
        beta_industry.loc[assets_pool, :].values,
        beta_style.loc[assets_pool, :].values,
    ]).astype('float64').T

    def independent_rows(matrix):
        # All constraints are homogeneous, so a row that is a linear
        # combination of earlier rows (e.g. a complete set of industry
        # dummies next to the sum-to-zero row, or an all-zero exposure)
        # does not change the feasible set but makes the SLSQP subproblem
        # singular. Keep only rows that increase the rank.
        if not np.isfinite(matrix).all():
            return matrix
        kept = []
        for row in matrix:
            trial = np.vstack(kept + [row])
            if np.linalg.matrix_rank(trial) == len(trial):
                kept.append(row)
        return np.vstack(kept)

    A = independent_rows(rows)

    # Per-asset position limits.
    bnds = tuple((-1*self.max_w, self.max_w) for _ in assets_pool)
    cons = ({'type': 'eq',
             'fun': lambda x: A @ x,
             'jac': lambda x: A},)

    # x0 must be one-dimensional; recent SciPy versions reject 2-D input.
    x_ini = np.zeros(n_assets)
    res = minimize(lambda x: -float(r @ x), x_ini,
                   jac=lambda x: -r,
                   method='SLSQP', bounds=bnds, constraints=cons)
    if res.success:
        return pd.DataFrame(res.x,
                            index=assets_pool, columns=['weight'])
    else:
        print(res.message)
def winsorize(df, n):
    """
    Clip each row to median +/- n scaled MADs.

    The median absolute deviation of every row is multiplied by 1.4826 and
    by ``n`` to obtain the half-width of the allowed band around the row
    median; values outside the band are clipped to its edges.

    :param df: pd.DataFrame, index = date, columns = assets
    :param n: int, number of scaled MADs kept on each side of the median
    :return: pd.DataFrame
    """
    row_median = df.median(axis=1)
    abs_deviation = df.sub(row_median, axis=0).abs()
    MAD = abs_deviation.median(axis=1)
    half_width = n * 1.4826 * MAD
    upper = row_median + half_width
    lower = row_median - half_width
    return df.clip(lower=lower, upper=upper, axis=0)
bottom 59 | return LS 60 | 61 | 62 | def stand_df(df, ticker_list, trade_day): 63 | """ 64 | 标准化df 65 | 66 | Parameters 67 | ---------- 68 | df : pd.DataFrame 69 | 要标准化的df 70 | ticker_list : array like 71 | 列名,资产列表 72 | trade_day : array like 73 | 索引,交易日序列 74 | 75 | Returns 76 | ------- 77 | refined_df : pd.DataFrame 78 | 标准化后的df 79 | 80 | """ 81 | stand_df = pd.DataFrame(index=trade_day, columns=ticker_list) 82 | df = df.reindex_like(stand_df[df.index[0]:df.index[-1]]) 83 | df = df.ffill().replace([np.inf, -np.inf], 0) 84 | return df 85 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /tools/file_utils.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import pandas as pd 3 | import numpy as np 4 | import os 5 | import re 6 | 7 | 8 | def load_daily_data(ticker_list, dir_path): 9 | dfs = [] 10 | for ticker in tqdm(ticker_list): 11 | df = pd.read_csv(dir_path + "{}.csv".format(str(ticker)), thousands=',' 12 | ).drop(columns=['Unnamed: 0']) 13 | df['asset'] = str(ticker) 14 | dfs.append(df) 15 | else: 16 | ndf = pd.concat(dfs).reset_index(drop=True) 17 | ndf['Date'] = ndf['Date'].astype(str) 18 | ndf['Date'] = pd.to_datetime(ndf['Date']) 19 | ndf['證券代號'] = ndf['證券代號'].astype('str') 20 | ndf.index = pd.MultiIndex.from_frame( 21 | ndf[['Date', '證券代號']], names=('datetime', 'asset')) 22 | ndf = ndf.dropna(how='all').sort_index() 23 | ndf = ndf.drop(['Date', '證券代號'], axis=1) 24 | return ndf 25 | 26 | 27 | def load_FR_pkl(path): 28 | df = pd.read_pickle(path) 29 | df = df[['證券名稱', '開盤價', '最高價', '最低價', '收盤價', 30 | '成交股數', '股價淨值比', '發行股數']] 31 | df.columns = ['asset_name', 'open', 'high', 'low', 'close', 32 | 'volume', 'pb', 'total_share'] 33 | df.index = df.index.set_names(['datetime', 'asset']) 34 | fields = ['open', 'high', 'low', 'close', 35 | 'volume', 'pb', 'total_share'] 36 | df[fields] = df[fields].astype('str') 37 | for f in fields: 38 | df[f] = 
df[f].apply(lambda x: x.replace( 39 | ',', '').replace('--', '0.0').replace('-', '0.0')) 40 | df[fields] = df[fields].astype('float64') 41 | return df 42 | 43 | 44 | def load_quarter_data(path, datetime, ticker_list): 45 | net_income = pd.DataFrame(index=datetime, columns=ticker_list) 46 | oper_revenue = pd.DataFrame(index=datetime, columns=ticker_list) 47 | 48 | file_lst = os.listdir(path) 49 | file_lst.sort() 50 | for i in tqdm(range(len(file_lst))): 51 | df = pd.read_excel(os.path.join( 52 | path, file_lst[i]), header=2, keep_default_na=False) 53 | df = df.iloc[:, [0, 2, 9]] 54 | df.columns = ['code', 'oper_revenue', 'net_income'] 55 | df['code'] = df['code'].astype('str') 56 | df['code'] = df['code'].apply(lambda x: x.replace(' ', '')) 57 | 58 | flag = '^[1-9]{1}[0-9]{3}$' 59 | drop_lst = [] 60 | for j in range(df.shape[0]): 61 | code = df.loc[j, 'code'] 62 | if code in ticker_list: 63 | if re.match(flag, code): 64 | continue 65 | drop_lst.append(j) 66 | df = df.drop(drop_lst, axis=0) 67 | df = df.replace('', np.nan) 68 | 69 | df[['oper_revenue', 'net_income']] = df[[ 70 | 'oper_revenue', 'net_income']].astype('float64')*1000 71 | df = df.set_index('code') 72 | 73 | net_income.loc[datetime[i], df.index] = df['net_income'].values 74 | oper_revenue.loc[datetime[i], df.index] = df['oper_revenue'].values 75 | 76 | return net_income.astype('float64'), oper_revenue.astype('float64') 77 | 78 | 79 | def get_TTM(df): 80 | # 计算单季度数据 81 | df_Q1 = df[df.index.month == 3] 82 | 83 | df_Q2 = df[df.index.month == 6] 84 | Q2_np = df_Q2.values-df_Q1.values 85 | Q2_np = pd.DataFrame(Q2_np, index=df_Q2.index, columns=df_Q2.columns) 86 | 87 | df_Q3 = df[df.index.month == 9] 88 | Q3_np = df_Q3.values-df_Q2.values 89 | Q3_np = pd.DataFrame(Q3_np, index=df_Q3.index, columns=df_Q3.columns) 90 | 91 | df_Q4 = df[df.index.month == 12] 92 | Q4_np = df_Q4.values-df_Q3.values[0:-1, :] 93 | Q4_np = pd.DataFrame(Q4_np, index=df_Q4.index, columns=df_Q4.columns) 94 | 95 | df_Q = 
pd.concat([df_Q1, Q2_np, Q3_np, Q4_np]) 96 | df_Q.sort_index(inplace=True) 97 | 98 | # 计算TTM 99 | df_TTM = df_Q.rolling(4).sum().dropna(how='all') 100 | return df_TTM 101 | 102 | 103 | def load_feather_file(file_path): 104 | df = pd.read_feather(file_path) 105 | return df 106 | 107 | 108 | def save_factor_to_pkl(factor_path, factor_tag, factor_name, factor_data): 109 | factor_path = os.path.join(factor_path, factor_tag) 110 | if not os.path.exists(factor_path): 111 | os.makedirs(factor_path) 112 | 113 | factor_data.to_pickle(os.path.join( 114 | factor_path, "{}.pkl".format(factor_name))) 115 | 116 | return None 117 | 118 | 119 | def load_factor_from_pkl(factor_path, factor_tag, factor_name): 120 | factor_path = os.path.join(factor_path, factor_tag) 121 | 122 | try: 123 | factor = pd.read_pickle(os.path.join( 124 | factor_path, "{}.pkl".format(factor_name))) 125 | return factor 126 | except Exception as e: 127 | print(factor_name, e) 128 | return None 129 | -------------------------------------------------------------------------------- /tools/price_data.py: -------------------------------------------------------------------------------- 1 | # %% 2 | from abc import ABCMeta 3 | import pandas as pd 4 | 5 | 6 | # %% 7 | 8 | 9 | class PriceDataBase(object, metaclass=ABCMeta): 10 | def __init__(self, data=None): 11 | self.data = data 12 | 13 | def split_data(self, time_constraint_from, time_constraint_to): 14 | self.data = self.data.loc[time_constraint_from:time_constraint_to, :, :] 15 | return None 16 | 17 | def get_table(self, field_name): 18 | return self.data[field_name].unstack() 19 | 20 | def stack_and_insert(self, table, field_name): 21 | self.data[field_name] = table.stack() 22 | return None 23 | 24 | def get_sub_data(self, columns): 25 | return self.data[columns] 26 | 27 | def __repr__(self): 28 | try: 29 | return str(self.data.head(10)) 30 | except: 31 | return str(self.data) 32 | --------------------------------------------------------------------------------