├── LICENSE ├── README.md ├── BoxPlot_Error.py ├── compute_vol.py ├── Summary_Results.py ├── data_subsample.py ├── Summary_Regime.py ├── MCS.py ├── GHAR.py └── GNNHAR.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Chao Zhang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Forecasting Realized Volatility with Spillover Effects: Perspectives from Graph Neural Networks 2 | 3 | This is the README file for the project [Forecasting Realized Volatility with Spillover Effects: Perspectives from Graph Neural Networks](https://www.sciencedirect.com/science/article/pii/S0169207024000967), published in [International Journal of Forecasting](https://www.sciencedirect.com/journal/international-journal-of-forecasting). 4 | It provides an overview of the project structure and instructions on how to use and contribute to the codebase. 5 | 6 | ## Table of Contents 7 | 8 | - [Project Structure](#project-structure) 9 | - [Usage](#usage) 10 | - [Data](#data) 11 | - [Computing Environment](#computing-environment) 12 | 13 | ## Project Structure 14 | 15 | The project is organized as follows: 16 | 17 | - `data_subsample.py`: Subsample the minutely data to 5 minutes and merge the data of all stocks in the stock list. 18 | - `compute_vol.py`: Compute the daily variance from 5-min return data. 19 | - `GHAR.py`: Linear models to forecast the realized volatility, including HAR and GHAR. HAR is a special case of GHAR, assuming the adjacency matrix is identity. 20 | - `GNNHAR.py`: Proposed GNNHAR models to forecast the realized volatility. 21 | - `MCS.py`: Implementation of Econometrica Paper: "The model confidence set." by Hansen, Peter R., Asger Lunde, and James M. Nason. 22 | - `Summary_Results.py`: Summarize the results of the forecast models, including the MSE, QLIKE, and the MCS tests. 23 | - `Summary_Regime.py`: Summarize the results of the forecast models, based on different regimes. 24 | - `BoxPlot_Error.py`: Plot the boxplot of the forecast error and ratio for different models 25 | 26 | ## Usage 27 | 28 | To use the project, follow these steps: 29 | 30 | 1. Download LOBSTER data (minutely or higher freq) and save to your local path 31 | 2. Run data_subsample.py and compute_vol.py sequentially 32 | 3. 
Run GHAR.py to obtain the baseline forecasts from linear regressions 33 | 4. Run GNNHAR.py to obtain the forecasts for proposed GNNHAR models 34 | 5. Compare their forecasts by using Summary_Results.py and Summary_Regime.py 35 | 6. Generate plots by BoxPlot_Error.py 36 | 37 | 38 | ## Data 39 | The data used in this reproducibility check is LOBSTER (https://lobsterdata.com/), which needs to be purchased by users. 40 | 41 | ## Computing Environment 42 | To run the reproducibility check, the following computing environment and package(s) are required: 43 | - Environment: These experiments were conducted on a system equipped with an Nvidia A100 GPU with 40 GB of GPU memory, an AMD EPYC 7713 64-Core Processor @ 1.80GHz with 128 cores, and 1.0TB of RAM, running Ubuntu 20.04.4 LTS. 44 | 45 | - Package(s): 46 | - Python 3.8.18 47 | - PyTorch 2.0.1+cu117 48 | - numpy 1.22.3 49 | - pandas 2.0.3 50 | - scikit-learn 1.3.0 51 | - matplotlib 3.7.2 52 | -------------------------------------------------------------------------------- /BoxPlot_Error.py: -------------------------------------------------------------------------------- 1 | """ 2 | Plot the boxplot of the forecast error and ratio for different models 3 | """ 4 | import os 5 | from os.path import * 6 | from MCS import * 7 | 8 | import numpy as np 9 | import pandas as pd 10 | 11 | import matplotlib 12 | matplotlib.use('Agg') 13 | import matplotlib.pyplot as plt 14 | import seaborn as sns 15 | from matplotlib.backends.backend_pdf import PdfPages 16 | 17 | path = 'your_local_path' 18 | sum_path = join(path, 'Var_Results_Sum') 19 | plot_path = join(path, 'Var_Results_Plot') 20 | 21 | 22 | def load_data(universe, horizon): 23 | var_df = pd.read_csv(join(path, 'Data', f'{universe}_var_FH{horizon}.csv'), index_col=0) 24 | var_df.fillna(method="ffill", inplace=True) 25 | vech_df = var_df[var_df.index <= '2021-07-01'] 26 | vech_df = vech_df.sort_index(axis=1) 27 | return vech_df 28 | 29 | 30 | def QLIKE(y_true, y_pred): 31 | y_true, y_pred = np.array(y_true), np.array(y_pred) 32 | return np.mean(y_true / y_pred - np.log(y_true / y_pred) - 1) 33 | 34 | 35 | def Loss(vech_df, test_pred_df): 36 | test_df = vech_df.loc[test_pred_df.index] 37 | ticker_l = vech_df.columns.tolist() 38 | test_pred_df.columns = ticker_l 39 | forecast_error = test_pred_df - test_df 40 | forecast_ratio = test_pred_df / test_df 41 | return forecast_error, forecast_ratio 42 | 43 | 44 | def Result(vech_df, version_name, universe, horizon): 45 | result_files = [i for i in files if 46 | ('_pred' in i) and version_name in i and '_' + universe + '_' in i and f'F{horizon}' in i and 'W22' in i] 47 | 48 | result_files.sort() 49 | for (i, item) in enumerate(result_files): 50 | print(i, item) 51 | 52 | error_dic = {} 53 | ratio_dic = {} 54 | 55 | for filename in result_files: 56 | test_pred_df = pd.read_csv(join(sum_path, filename), index_col=0) 57 | test_pred_df = test_pred_df.sort_index(axis=1) 58 | test_pred_df[test_pred_df<=0] = np.nan 59 | test_pred_df.fillna(method="ffill", inplace=True) 60 | 61 | forecast_error, forecast_ratio = Loss(vech_df/horizon, test_pred_df/horizon) 62 | 63 | file_key_name = filename.split('_')[2] + '_' + filename.split('_')[3] 64 | error_dic[file_key_name] = forecast_error 65 | ratio_dic[file_key_name] = forecast_ratio 66 | 67 | return error_dic, ratio_dic 68 | 69 | 70 | def BoxPlot_Error_Ratio(error_dic, ratio_dic, name, horizon): 71 | if name == 'Error': 72 | data_dic = error_dic 73 | else: 74 | data_dic = ratio_dic 75 | 76 | pdf_name = join(plot_path, 
'BoxPlot_%s_%d.pdf' % (name, horizon)) 77 | 78 | cmap = plt.get_cmap("tab10") 79 | 80 | pair_l = [['GHAR_iden', 'QLike_HAR'], ['GHAR_iden+glasso', 'QLike_GHAR'], ['MSE_GNNHAR1L', 'QLike_GNNHAR1L'], ['MSE_GNNHAR2L', 'QLike_GNNHAR2L'], ['MSE_GNNHAR3L', 'QLike_GNNHAR3L']] 81 | 82 | new_df_l = [] 83 | for pair in pair_l: 84 | df_mse = data_dic[pair[0]] 85 | df_qli = data_dic[pair[1]] 86 | new_df = pd.DataFrame([df_mse.values.reshape(-1), df_qli.values.reshape(-1)], index=['MSE', 'QLIKE']).T 87 | new_df_l.append(new_df) 88 | 89 | all_df = pd.concat(new_df_l, axis=1) 90 | all_df.columns = [r'HAR$_M$', r'HAR$_Q$', r'GHAR$_M$', r'GHAR$_Q$', r'GNNHAR1L$_M$', r'GNNHAR1L$_Q$', r'GNNHAR2L$_M$', r'GNNHAR2L$_Q$', r'GNNHAR3L$_M$', r'GNNHAR3L$_Q$'] 91 | 92 | with PdfPages(pdf_name) as pdf: 93 | f, ax = plt.subplots() 94 | box_plot = ax.boxplot(all_df, 0, '', vert=False, whis=0, positions=[1, 1.5, 2.5, 3., 4, 4.5, 5.5, 6., 7., 7.5]) 95 | for median in box_plot['medians'][::2]: 96 | median.set_color(cmap(0)) 97 | for median in box_plot['medians'][1::2]: 98 | median.set_color(cmap(1)) 99 | 100 | if name == 'Error': 101 | plt.axvline(x=0, color='grey', linestyle='--') 102 | else: 103 | plt.axvline(x=1, color='grey', linestyle='--') 104 | 105 | plt.yticks([1, 1.5, 2.5, 3., 4, 4.5, 5.5, 6., 7., 7.5], all_df.columns) 106 | plt.tight_layout() 107 | pdf.savefig() 108 | plt.close() 109 | 110 | 111 | if __name__ == '__main__': 112 | horizon = 1 113 | universe = 'DJIA' 114 | 115 | vech_df = load_data(universe, horizon) 116 | files = os.listdir(sum_path) 117 | files.sort() 118 | 119 | error_dic, ratio_dic = Result(vech_df, 'Forecast_Var', universe, horizon) 120 | BoxPlot_Error_Ratio(error_dic, ratio_dic, 'Error', horizon) 121 | BoxPlot_Error_Ratio(error_dic, ratio_dic, 'Ratio', horizon) 122 | -------------------------------------------------------------------------------- /compute_vol.py: -------------------------------------------------------------------------------- 1 | """ 2 | Compute the daily variance from 5-min return data 3 | Compute the variance data for multi-horizon and various universes 4 | """ 5 | 6 | import argparse 7 | import numpy as np 8 | import pandas as pd 9 | from sklearn.metrics import r2_score, mean_squared_error 10 | from scipy.stats.mstats import winsorize 11 | from sklearn.linear_model import HuberRegressor, LinearRegression, LassoCV, Ridge, ElasticNet 12 | from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor 13 | from multiprocessing import cpu_count, Pool 14 | from joblib import dump 15 | from os.path import join 16 | import os 17 | from datetime import datetime 18 | from sklearn import preprocessing 19 | from numpy import linalg as LA 20 | import scipy 21 | 22 | DJIA_stocks_l = ['MMM', 'AXP', 'AMGN', 'AAPL', 'BA', 'CAT', 'CVX', 'CSCO', 'KO', 'DIS', 'HD', 'HON', 'IBM', 'GS', 'NKE', 23 | 'INTC', 'JNJ', 'JPM', 'MCD', 'MRK', 'MSFT', 'PG', 'CRM', 'TRV', 'UNH', 'VZ', 'WMT'] 24 | DJIA_stocks_l.sort() 25 | 26 | SP100_stocks_l = ['AAPL', 'ABT', 'ACN', 'ADBE', 'ADP', 'AMGN', 'AMT', 'AMZN', 'AXP', 'BA', 'BAC', 'BDX', 'BMY', 27 | 'BSX', 'C', 'CAT', 'CB', 'CI', 'CMCSA', 'CME', 'COP', 'COST', 'CRM', 'CSCO', 'CVS', 'CVX', 'D', 28 | 'DHR', 'DIS', 'DUK', 'FIS', 'FISV', 'GE', 'GILD', 'GOOG', 'GS', 'HD', 'HON', 'IBM', 'INTC', 'INTU', 29 | 'ISRG', 'JNJ', 'JPM', 'KO', 'LLY', 'LMT', 'LOW', 'MA', 'MCD', 'MDT', 'MMM', 'MO', 'MRK', 'MS', 30 | 'MSFT', 'NFLX', 'NKE', 'NVDA', 'ORCL', 'PEP', 'PFE', 'PG', 'PNC', 'QCOM', 'SBUX', 'SO', 'SYK', 31 | 'T', 'TGT', 'TJX', 'TMO', 'TXN', 'UNH', 'UNP', 
'UPS', 'USB', 'VZ', 'WFC', 'WMT'] 32 | SP100_stocks_l.sort() 33 | 34 | 35 | data_name_dic = {'DJIA': DJIA_stocks_l, 'SP100': SP100_stocks_l} 36 | 37 | def load_data(path): 38 | ret_data = pd.read_csv(join(path, 'Data', 'data_5min.csv')) 39 | 40 | stocks_l = [i for i in ret_data.columns if i not in ['Date', 'Time']] 41 | ret_data[stocks_l] *= 100 42 | 43 | # winsorize the data to avoid measurement errors in LOBSTER 44 | up = 99.5 45 | low = 0.5 46 | for clm in ret_data.columns: 47 | if clm not in ['Date', 'Time']: 48 | max_p = np.nanpercentile(ret_data[clm], up) 49 | min_p = np.nanpercentile(ret_data[clm], low) 50 | 51 | ret_data.loc[ret_data[clm] > max_p, clm] = max_p 52 | ret_data.loc[ret_data[clm] < min_p, clm] = min_p 53 | 54 | return ret_data 55 | 56 | 57 | # compute the variance of the data 58 | def compute_variance(sub_data): 59 | stocks_l = [i for i in sub_data.columns if i not in ['Date', 'Time']] 60 | sq_data = sub_data[stocks_l] ** 2 61 | var_sum = sq_data.sum(min_count=1) 62 | var_sum = pd.DataFrame(var_sum).T 63 | return var_sum 64 | 65 | 66 | # compute the variance for different horizons and universes 67 | def Compute_Horizon(path, univese, ret_vol, horizon): 68 | if ret_vol == 'ret': 69 | daily_var_data = pd.read_csv(join(path, 'Data', 'daily_return.csv'), index_col=0) 70 | elif ret_vol == 'var': 71 | daily_var_data = pd.read_csv(join(path, 'Data', 'daily_variance.csv'), index_col=0) 72 | else: 73 | print('Please choose ret or var') 74 | return 75 | 76 | var_data = 0 77 | for i in range(horizon): 78 | var_data += daily_var_data.shift(-i) 79 | var_data.dropna(inplace=True) 80 | var_univ = var_data[data_name_dic[univese]] 81 | var_univ.to_csv(join(path, 'Data', f'{univese}_{ret_vol}_FH{horizon}.csv')) 82 | 83 | 84 | if __name__ == '__main__': 85 | path = 'your_local_path' 86 | 87 | ret_data = load_data(path) 88 | stocks_l = [i for i in ret_data.columns if i not in ['Date', 'Time']] 89 | date_l = list(set(ret_data['Date'].tolist())) 90 | date_l.sort() 91 | 92 | ### Compute daily return 93 | daily_return_data = ret_data.groupby(by='Date').sum(min_count=1) 94 | daily_return_data.index = list(daily_return_data.index) 95 | daily_return_data.to_csv(join(path, 'Data', 'daily_return.csv')) 96 | 97 | ### Compute daily variance 98 | var_df = ret_data.groupby(by='Date').apply(compute_variance) 99 | var_df.index = date_l 100 | 101 | var_df.columns = stocks_l 102 | var_df.to_csv(join(path, 'Data', 'daily_variance.csv')) 103 | 104 | ### Compute variance over different horizons 105 | horizon = 5 106 | for name in ['DJIA30', 'SP100']: 107 | Compute_Horizon(path, name, 'ret', horizon) 108 | Compute_Horizon(path, name, 'var', horizon) -------------------------------------------------------------------------------- /Summary_Results.py: -------------------------------------------------------------------------------- 1 | """ 2 | Summarize the results of the forecast models, including the MSE, QLIKE, and the MCS tests. 
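For each ticker, forecasts are scored with two losses: MSE and QLIKE, where
QLIKE(y, y_hat) = mean( y / y_hat - log(y / y_hat) - 1 ), as implemented below.
Per-ticker losses are averaged across tickers, normalized by the HAR benchmark
(column 'GHAR_iden'), and compared via the Model Confidence Set procedure from MCS.py.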
3 | """ 4 | 5 | import os 6 | from os.path import * 7 | from MCS import * 8 | 9 | import numpy as np 10 | import pandas as pd 11 | from sklearn.metrics import mean_squared_error 12 | 13 | path = 'your_local_path' 14 | sum_path = join(path, 'Var_Results_Sum') 15 | 16 | 17 | def load_data(universe, horizon): 18 | var_df = pd.read_csv(join(path, 'Data', f'{universe}_var_FH{horizon}.csv'), index_col=0) 19 | var_df.fillna(method="ffill", inplace=True) 20 | vech_df = var_df[var_df.index <= '2021-07-01'] 21 | vech_df = vech_df.sort_index(axis=1) 22 | return vech_df 23 | 24 | 25 | def QLIKE(y_true, y_pred): 26 | y_true, y_pred = np.array(y_true), np.array(y_pred) 27 | return np.mean(y_true / y_pred - np.log(y_true / y_pred) - 1) 28 | 29 | 30 | def Loss(vech_df, test_pred_df): 31 | test_df = vech_df.loc[test_pred_df.index] 32 | ticker_l = vech_df.columns.tolist() 33 | test_pred_df.columns = ticker_l 34 | df_l = [] 35 | 36 | for ticker in ticker_l: 37 | y_true = test_df[ticker].values 38 | y_pred = test_pred_df[ticker].values 39 | assert (y_pred > 0).all() 40 | mse = mean_squared_error(y_true, y_pred) 41 | qlike = QLIKE(y_true, y_pred) 42 | 43 | df_l.append([np.round(mse, 4), np.round(qlike, 4)]) 44 | 45 | df = pd.DataFrame(np.array(df_l), index=ticker_l, columns=['MSE', 'QLIKE']) 46 | return df 47 | 48 | 49 | def Result(vech_df, version_name, universe, horizon): 50 | result_files = [i for i in files if 51 | ('_pred' in i) and version_name in i and '_' + universe + '_' in i and f'F{horizon}' in i and 'W22' in i] 52 | 53 | result_files.sort() 54 | for (i, item) in enumerate(result_files): 55 | print(i, item) 56 | 57 | E_df_l = [] 58 | Q_df_l = [] 59 | 60 | files_l = [] 61 | for filename in result_files: 62 | test_pred_df = pd.read_csv(join(sum_path, filename), index_col=0) 63 | test_pred_df = test_pred_df.sort_index(axis=1) 64 | test_pred_df[test_pred_df<=0] = np.nan 65 | test_pred_df.fillna(method="ffill", inplace=True) 66 | 67 | df = Loss(vech_df, test_pred_df) 68 | 69 | E_df_l.append(df['MSE']) 70 | Q_df_l.append(df['QLIKE']) 71 | 72 | file_key_name = filename.split('_')[2] + '_' + filename.split('_')[3] 73 | # file_key_name = filename 74 | files_l.append(file_key_name) 75 | 76 | E_df = pd.concat(E_df_l, axis=1) 77 | E_df.columns = files_l 78 | Q_df = pd.concat(Q_df_l, axis=1) 79 | Q_df.columns = files_l 80 | return result_files, E_df, Q_df 81 | 82 | 83 | def norm_loss(df): 84 | return df.apply(lambda x: x/df['GHAR_iden'], axis=0) 85 | 86 | 87 | def rank_MCS(loss_df, pval_df): 88 | loss_mean_df = loss_df.mean(0) 89 | rank_df = loss_mean_df.rank() 90 | pval_df = pd.DataFrame(pval_df, columns=['p-value']) 91 | pval_df['loss'] = loss_mean_df 92 | pval_df['ratio'] = loss_mean_df / loss_mean_df.loc['GHAR_iden'] 93 | pval_df['rank'] = rank_df 94 | # model names, may need to modify according to the user's choices 95 | idx_l = ['GHAR_iden', 'GHAR_iden+glasso', 'MSE_GNNHAR1L', 'MSE_GNNHAR2L', 'MSE_GNNHAR3L', 'QLike_HAR', 'QLike_GHAR', 'QLike_GNNHAR1L', 'QLike_GNNHAR2L', 'QLike_GNNHAR3L'] 96 | 97 | return pval_df.loc[idx_l, ['ratio', 'p-value']] 98 | 99 | 100 | if __name__ == '__main__': 101 | horizon = 1 102 | universe = 'DJIA' 103 | 104 | vech_df = load_data(universe, horizon) 105 | files = os.listdir(sum_path) 106 | files.sort() 107 | 108 | result_files, E_df, Q_df = Result(vech_df, 'Forecast_Var', universe, horizon) 109 | print(E_df.mean(0)) 110 | print(Q_df.mean(0)) 111 | 112 | print(' * ' * 30) 113 | print(norm_loss(E_df).mean(0)) 114 | print(norm_loss(Q_df).mean(0)) 115 | 116 | mcs_E = 
ModelConfidenceSet(E_df, 0.05, 10000, 2).run() 117 | sum_E = rank_MCS(E_df, mcs_E.pvalues) 118 | 119 | mcs_Q = ModelConfidenceSet(Q_df, 0.05, 10000, 2).run() 120 | sum_Q = rank_MCS(Q_df, mcs_Q.pvalues) 121 | 122 | print(" * * * * * MCS of MSE * * * * * ") 123 | print(np.round(sum_E, 3)) 124 | 125 | print(" * * * * * MCS of QLIKE * * * * * ") 126 | print(np.round(sum_Q, 3)) 127 | 128 | # print(' * ' * 30) 129 | mse_ticker_model = norm_loss(E_df).T 130 | qlike_ticker_model = norm_loss(Q_df).T 131 | 132 | # save results to csv 133 | mse_ticker_model.to_csv(join(sum_path, 'mse_ticker_model.csv')) 134 | qlike_ticker_model.to_csv(join(sum_path, 'qlike_ticker_model.csv')) -------------------------------------------------------------------------------- /data_subsample.py: -------------------------------------------------------------------------------- 1 | """ 2 | Subsample the minutely data to 5 minutes and merge the data of all stocks in the stock list. 3 | The output data is of the shape (T, N), where T is the number of 5-minute intervals for all trading days and N is the number of stocks. 4 | """ 5 | 6 | import argparse 7 | import numpy as np 8 | import pandas as pd 9 | from scipy.stats.mstats import winsorize 10 | from multiprocessing import cpu_count, Pool 11 | from joblib import dump 12 | from os.path import join 13 | import os 14 | from datetime import datetime 15 | from sklearn import preprocessing 16 | import time 17 | 18 | # stock list 19 | stocks_l = ['AAPL', 'MSFT', 'AMZN', 'FB', 'BRK.B', 'JPM', 'GOOG', 'GOOGL', 'JNJ', 'V', 'PG', 'XOM', 'BAC', 'T', 20 | 'UNH', 'DIS', 'MA', 'INTC', 'VZ', 'HD', 'MRK', 'CVX', 'WFC', 'PFE', 'KO', 'CMCSA', 'CSCO', 'PEP', 21 | 'BA', 'C', 'WMT', 'ADBE', 'MDT', 'ABT', 'MCD', 'BMY', 'AMGN', 'CRM', 'NVDA', 'PM', 'NFLX', 'ABBV', 22 | 'ACN', 'COST', 'PYPL', 'TMO', 'AVGO', 'HON', 'UNP', 'NKE', 'UTX', 'ORCL', 'IBM', 'TXN', 'NEE', 'LIN', 23 | 'SBUX', 'LLY', 'QCOM', 'MMM', 'GE', 'CVS', 'DHR', 'LMT', 'AMT', 'MO', 'LOW', 'USB', 'BKNG', 'AXP', 24 | 'FIS', 'GILD', 'UPS', 'CAT', 'MDLZ', 'CHTR', 'TFC', 'ANTM', 'GS', 'CI', 'TJX', 'ADP', 'BDX', 'CME', 25 | 'CB', 'PNC', 'COP', 'INTU', 'ISRG', 'D', 'SPGI', 'FISV', 'DUK', 'SYK', 'SO', 'TGT', 'MS', 'BSX', 'AGN', 'RTN'] 26 | stocks_l.sort() 27 | 28 | 29 | def logret_data(data): 30 | data['Price'] = (data['ask_1'] + data['bid_1']) / 2 31 | data['ret'] = np.log(data['Price']).diff() 32 | return data[['time', 'ret']][1:] 33 | 34 | 35 | # sanity check for the data and save the good data 36 | def data_sanity(path, ticker): 37 | ticker_path = join(path, 'LOBData', ticker) 38 | files = os.listdir(ticker_path) 39 | files = [i for i in files if i.endswith('.csv')] 40 | files.sort() 41 | 42 | data_l = [] 43 | 44 | for file in files: 45 | date = file.split('_')[1] 46 | data = pd.read_csv(join(ticker_path, file)) 47 | # 07-03, 12-24 are half trading days, removed from the data 48 | if len(data) == 391: 49 | if '07-03' in date or '12-24' in date: 50 | pass 51 | elif (np.abs(data['ask_1'] / data['bid_1'] - 1) < 0.5).all(): 52 | ret_data = logret_data(data) 53 | ret_data['date'] = date 54 | data_l.append(ret_data) 55 | # if the spread is too large, print the data to check 56 | else: 57 | a = data['ask_1'] / data['bid_1'] 58 | idx = a.argmax() 59 | print(data[idx-5:idx+5]) 60 | print(data[-10:]) 61 | ret_data = logret_data(data) 62 | ret_data['date'] = date 63 | data_l.append(ret_data) 64 | else: 65 | print('- ' * 10 + date + ' Missing') 66 | print(len(data)) 67 | 68 | all_data = pd.concat(data_l) 69 | all_data.to_csv(join(path, 'Minute_Data', 
ticker+'.csv'), index=False) 70 | return all_data 71 | 72 | 73 | # merge the minute data of all stocks in the stock list 74 | def merge_data(path): 75 | data_l = [] 76 | tickers = os.listdir(join(path, 'Minute_Data')) 77 | tickers.sort() 78 | for i in tickers: 79 | ticker = i.split('.csv')[0] 80 | print(ticker) 81 | all_data = pd.read_csv(join(path, 'Minute_Data', ticker+'.csv')) 82 | print(all_data.shape) 83 | month_l = list(set([i[:7] for i in all_data['date']])) 84 | month_l.sort() 85 | print(len(month_l)) 86 | all_data.rename(columns={'ret':ticker}, inplace=True) 87 | data_l.append(all_data[['date', 'time', ticker]]) 88 | 89 | from functools import reduce 90 | df = reduce(lambda df1, df2: pd.merge(df1, df2, on=['date', 'time'], how='outer'), data_l) 91 | df.rename(columns={'date':'Date', 'time':'Time'}, inplace=True) 92 | print(df) 93 | os.makedirs(join(path, 'Data'), exist_ok=True) 94 | df.to_csv(join(path, 'Data', 'data_1min.csv'), index=False) 95 | 96 | 97 | # subsample the data to 5 minutes 98 | def data_subsample(path): 99 | df = pd.read_csv(join(path, 'Data', 'data_1min.csv')) 100 | df_gb = df.groupby(by=df.index // 5) 101 | df_5min = df_gb.sum(min_count=1) 102 | df_5min['Date'] = df['Date'].to_list()[4::5] 103 | df_5min['Time'] = df['Time'].to_list()[4::5] 104 | clms = [i for i in df_5min.columns if i not in ['Date', 'Time']] 105 | clms.sort() 106 | df_5min = df_5min[['Date', 'Time'] + clms] 107 | os.makedirs(join(path, 'Data'), exist_ok=True) 108 | df_5min.to_csv(join(path, 'Data', 'data_5min.csv'), index=False) 109 | 110 | 111 | if __name__ == '__main__': 112 | path = 'your local path for storing minutely LOBSTER data and processed data' 113 | # under the path, the code will create 3 folders: 114 | # 1. LOBSTER: the raw LOBSTER data, 115 | # 2. Minute_Data: the sanity checked minutely return data for each stock separately, 116 | # 3. Data: processed panel data, like 1-min, 5-min returns, realized variance 117 | 118 | for ticker in stocks_l: 119 | print(' * ' * 20 + ticker + ' * ' * 20) 120 | all_data = data_sanity(path, ticker) 121 | print(all_data) 122 | 123 | merge_data(path) 124 | data_subsample(path) -------------------------------------------------------------------------------- /Summary_Regime.py: -------------------------------------------------------------------------------- 1 | """ 2 | Summarize the results of the forecast models, based on different regimes. 3 | We split the testing period into 2 sub-periods, based on the volatility of the SPY. 4 | """ 5 | 6 | import os 7 | from os.path import * 8 | from MCS import * 9 | 10 | import numpy as np 11 | import pandas as pd 12 | from sklearn.metrics import mean_squared_error 13 | 14 | path = 'your_local_path' 15 | sum_path = join(path, 'Var_Results_Sum') 16 | 17 | 18 | def load_RV_SPY(horizon): 19 | spy_rv_df = pd.read_csv(join(path, 'Data', f'SPY_var_FH{horizon}.csv'), index_col=0) 20 | return spy_rv_df 21 | 22 | 23 | # split the testing period into 2 sub-periods, based on the volatility of the SPY 24 | # q is the percentile of the SPY volatility, default is 90%. 25 | # In other words, we consider the high-volatility period as the top 10% of the SPY volatility, and the rest as the low-volatility period. 
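# For example, with the default q=90.0 the threshold is the 90th percentile of the
# SPY realized variance over the evaluation dates; days at or above the threshold form
# the high-volatility regime, the remaining days form the low-volatility regime, and
# MSE/QLIKE are then computed separately on each set of dates (see Loss below).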
26 | def split_period(spy_rv_df, test_pred_df, q=90.0): 27 | spy_rv_df = spy_rv_df.loc[test_pred_df.index] 28 | perc_choice = np.percentile(spy_rv_df['SPY'], q) 29 | low_vol_dates = spy_rv_df[spy_rv_df['SPY'] < perc_choice].index.tolist() 30 | high_vol_dates = spy_rv_df[spy_rv_df['SPY'] >= perc_choice].index.tolist() 31 | return low_vol_dates, high_vol_dates 32 | 33 | 34 | def load_data(universe, horizon): 35 | var_df = pd.read_csv(join(path, 'Data', f'{universe}_var_FH{horizon}.csv'), index_col=0) 36 | var_df.fillna(method="ffill", inplace=True) 37 | vech_df = var_df[var_df.index <= '2021-07-01'] 38 | vech_df = vech_df.sort_index(axis=1) 39 | return vech_df 40 | 41 | 42 | def QLIKE(y_true, y_pred): 43 | y_true, y_pred = np.array(y_true), np.array(y_pred) 44 | return np.mean(y_true / y_pred - np.log(y_true / y_pred) - 1) 45 | 46 | 47 | def Loss(vech_df, test_pred_df, spy_rv_df): 48 | low_vol_dates, high_vol_dates = split_period(spy_rv_df, test_pred_df) 49 | test_df = vech_df.loc[test_pred_df.index] 50 | ticker_l = vech_df.columns.tolist() 51 | test_pred_df.columns = ticker_l 52 | all_df_l = [] 53 | for date_l in [low_vol_dates, high_vol_dates]: 54 | df_l = [] 55 | 56 | for ticker in ticker_l: 57 | y_true = test_df.loc[date_l, ticker].values 58 | y_pred = test_pred_df.loc[date_l, ticker].values 59 | assert (y_pred > 0).all() 60 | mse = mean_squared_error(y_true, y_pred) 61 | qlike = QLIKE(y_true, y_pred) 62 | 63 | df_l.append([np.round(mse, 4), np.round(qlike, 4)]) 64 | 65 | df = pd.DataFrame(np.array(df_l), index=ticker_l, columns=['MSE', 'QLIKE']) 66 | all_df_l.append(df) 67 | 68 | return all_df_l 69 | 70 | 71 | def Result(vech_df, version_name, universe, horizon): 72 | result_files = [i for i in files if 73 | ('_pred' in i) and version_name in i and '_' + universe + '_' in i and f'F{horizon}' in i and 'W22' in i] 74 | 75 | result_files.sort() 76 | for (i, item) in enumerate(result_files): 77 | print(i, item) 78 | 79 | spy_rv_df = load_RV_SPY(horizon) 80 | 81 | E_low_l = [] 82 | E_high_l = [] 83 | Q_low_l = [] 84 | Q_high_l = [] 85 | files_l = [] 86 | 87 | for filename in result_files: 88 | test_pred_df = pd.read_csv(join(sum_path, filename), index_col=0) 89 | test_pred_df = test_pred_df.sort_index(axis=1) 90 | # print(filename) 91 | # print(test_pred_df) 92 | test_pred_df[test_pred_df<=0] = np.nan 93 | test_pred_df.fillna(method="ffill", inplace=True) 94 | 95 | df_l = Loss(vech_df, test_pred_df, spy_rv_df) 96 | low_df, high_df = df_l 97 | 98 | E_low_l.append(low_df['MSE']) 99 | E_high_l.append(high_df['MSE']) 100 | 101 | Q_low_l.append(low_df['QLIKE']) 102 | Q_high_l.append(high_df['QLIKE']) 103 | 104 | file_key_name = filename.split('_')[2] + '_' + filename.split('_')[3] 105 | # file_key_name = filename 106 | files_l.append(file_key_name) 107 | 108 | E_df_low = pd.concat(E_low_l, axis=1) 109 | E_df_low.columns = files_l 110 | E_df_high = pd.concat(E_high_l, axis=1) 111 | E_df_high.columns = files_l 112 | 113 | Q_df_low = pd.concat(Q_low_l, axis=1) 114 | Q_df_low.columns = files_l 115 | Q_df_high = pd.concat(Q_high_l, axis=1) 116 | Q_df_high.columns = files_l 117 | return E_df_low, E_df_high, Q_df_low, Q_df_high 118 | 119 | 120 | def norm_loss(df): 121 | return df.apply(lambda x: x/df['GHAR_iden'], axis=0) 122 | 123 | 124 | def rank_MCS(loss_df, pval_df): 125 | loss_mean_df = loss_df.mean(0) 126 | rank_df = loss_mean_df.rank() 127 | pval_df = pd.DataFrame(pval_df, columns=['p-value']) 128 | pval_df['loss'] = loss_mean_df 129 | pval_df['ratio'] = loss_mean_df / 
loss_mean_df.loc['GHAR_iden'] 130 | pval_df['rank'] = rank_df 131 | # model names, may need to modify according to the user's choices 132 | idx_l = ['GHAR_iden', 'GHAR_iden+glasso', 'MSE_GNNHAR1L', 'MSE_GNNHAR2L', 'MSE_GNNHAR3L', 'QLike_HAR', 'QLike_GHAR', 'QLike_GNNHAR1L', 'QLike_GNNHAR2L', 'QLike_GNNHAR3L'] 133 | 134 | return pval_df.loc[idx_l, ['ratio', 'p-value']] 135 | 136 | 137 | if __name__ == '__main__': 138 | horizon = 1 139 | universe = 'DJIA' 140 | 141 | vech_df = load_data(universe, horizon) 142 | files = os.listdir(sum_path) 143 | files.sort() 144 | 145 | E_df_low, E_df_high, Q_df_low, Q_df_high = Result(vech_df, 'Forecast_Var', universe, horizon) 146 | print(' * ' * 10 + '| MSE |' + ' * ' * 10) 147 | print(E_df_low.mean(0)) 148 | print(E_df_high.mean(0)) 149 | 150 | print(' * ' * 10 + '| QLIKE |' + ' * ' * 10) 151 | print(Q_df_low.mean(0)) 152 | print(Q_df_high.mean(0)) 153 | 154 | for E_df in [(E_df_low), (E_df_high)]: 155 | mcs_E = ModelConfidenceSet(E_df, 0.05, 10000, 2).run() 156 | sum_E = rank_MCS(E_df, mcs_E.pvalues) 157 | print(" * * * * * MCS of MSE * * * * * ") 158 | print(np.round(sum_E, 3)) 159 | 160 | for Q_df in [(Q_df_low), (Q_df_high)]: 161 | mcs_Q = ModelConfidenceSet(Q_df, 0.05, 10000, 2).run() 162 | sum_Q = rank_MCS(Q_df, mcs_Q.pvalues) 163 | print(" * * * * * MCS of QLIKE * * * * * ") 164 | print(np.round(sum_Q, 3)) -------------------------------------------------------------------------------- /MCS.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of Econometrica Paper: 3 | Hansen, Peter R., Asger Lunde, and James M. Nason. "The model confidence set." Econometrica 79.2 (2011): 453-497. 4 | """ 5 | 6 | import numpy as np 7 | from numpy.random import rand 8 | from numpy import ix_ 9 | import pandas as pd 10 | np.random.seed(0) 11 | 12 | 13 | def bootstrap_sample(data, B, w): 14 | ''' 15 | Bootstrap the input data 16 | data: input numpy data array 17 | B: boostrap size 18 | w: block length of the boostrap 19 | ''' 20 | t = len(data) 21 | p = 1 / w 22 | indices = np.zeros((t, B), dtype=int) 23 | indices[0, :] = np.ceil(t * rand(1, B)) 24 | select = np.asfortranarray(rand(B, t).T < p) 25 | vals = np.ceil(rand(1, np.sum(np.sum(select))) * t).astype(int) 26 | indices_flat = indices.ravel(order="F") 27 | indices_flat[select.ravel(order="F")] = vals.ravel() 28 | indices = indices_flat.reshape([B, t]).T 29 | for i in range(1, t): 30 | indices[i, ~select[i, :]] = indices[i - 1, ~select[i, :]] + 1 31 | indices[indices > t] = indices[indices > t] - t 32 | indices -= 1 33 | return data[indices] 34 | 35 | 36 | def compute_dij(losses, bsdata): 37 | '''Compute the loss difference''' 38 | t, M0 = losses.shape 39 | B = bsdata.shape[1] 40 | dijbar = np.zeros((M0, M0)) 41 | for j in range(M0): 42 | dijbar[j, :] = np.mean(losses - losses[:, [j]], axis=0) 43 | 44 | dijbarstar = np.zeros((B, M0, M0)) 45 | for b in range(B): 46 | meanworkdata = np.mean(losses[bsdata[:, b], :], axis=0) 47 | for j in range(M0): 48 | dijbarstar[b, j, :] = meanworkdata - meanworkdata[j] 49 | 50 | vardijbar = np.mean((dijbarstar - np.expand_dims(dijbar, 0)) ** 2, axis=0) 51 | vardijbar += np.eye(M0) 52 | 53 | return dijbar, dijbarstar, vardijbar 54 | 55 | 56 | def calculate_PvalR(z, included, zdata0): 57 | '''Calculate the p-value of relative algorithm''' 58 | empdistTR = np.max(np.max(np.abs(z), 2), 1) 59 | zdata = zdata0[ix_(included - 1, included - 1)] 60 | TR = np.max(zdata) 61 | pval = np.mean(empdistTR > TR) 62 | return pval 63 | 64 | 
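# Note on the two test statistics used below:
# calculate_PvalR (above) uses the range statistic: over the surviving models it takes
# T_R = max over pairs (i, j) of the studentized average loss differential
# dbar_ij / sqrt(var(dbar_ij)), and the p-value is the share of bootstrap replicates
# whose (absolute) studentized maximum exceeds the sample T_R.
# calculate_PvalSQ (below) instead sums the squared studentized differentials over all
# pairs (the full double sum divided by two), with the p-value computed from the
# bootstrap distribution in the same way.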
65 | def calculate_PvalSQ(z, included, zdata0): 66 | '''Calculate the p-value of sequential algorithm''' 67 | empdistTSQ = np.sum(z ** 2, axis=1).sum(axis=1) / 2 68 | zdata = zdata0[ix_(included - 1, included - 1)] 69 | TSQ = np.sum(zdata ** 2) / 2 70 | pval = np.mean(empdistTSQ > TSQ) 71 | return pval 72 | 73 | 74 | def iterate(dijbar, dijbarstar, vardijbar, alpha, algorithm="R"): 75 | '''Iteratively excluding inferior model''' 76 | B, M0, _ = dijbarstar.shape 77 | z0 = (dijbarstar - np.expand_dims(dijbar, 0)) / np.sqrt( 78 | np.expand_dims(vardijbar, 0) 79 | ) 80 | zdata0 = dijbar / np.sqrt(vardijbar) 81 | 82 | excludedR = np.zeros([M0, 1], dtype=int) 83 | pvalsR = np.ones([M0, 1]) 84 | 85 | for i in range(M0 - 1): 86 | included = np.setdiff1d(np.arange(1, M0 + 1), excludedR) 87 | m = len(included) 88 | z = z0[ix_(range(B), included - 1, included - 1)] 89 | 90 | if algorithm == "R": 91 | pvalsR[i] = calculate_PvalR(z, included, zdata0) 92 | elif algorithm == "SQ": 93 | pvalsR[i] = calculate_PvalSQ(z, included, zdata0) 94 | 95 | scale = m / (m - 1) 96 | dibar = np.mean(dijbar[ix_(included - 1, included - 1)], 0) * scale 97 | dibstar = np.mean(dijbarstar[ix_(range(B), included - 1, included - 1)], 1) * ( 98 | m / (m - 1) 99 | ) 100 | vardi = np.mean((dibstar - dibar) ** 2, axis=0) 101 | t = dibar / np.sqrt(vardi) 102 | modeltoremove = np.argmax(t) 103 | excludedR[i] = included[modeltoremove] 104 | 105 | maxpval = pvalsR[0] 106 | for i in range(1, M0): 107 | if pvalsR[i] < maxpval: 108 | pvalsR[i] = maxpval 109 | else: 110 | maxpval = pvalsR[i] 111 | 112 | excludedR[-1] = np.setdiff1d(np.arange(1, M0 + 1), excludedR) 113 | pl = np.argmax(pvalsR > alpha) 114 | includedR = excludedR[pl:] 115 | excludedR = excludedR[:pl] 116 | return includedR - 1, excludedR - 1, pvalsR 117 | 118 | 119 | def MCS(losses, alpha, B, w, algorithm): 120 | '''Main function of the MCS''' 121 | t, M0 = losses.shape 122 | bsdata = bootstrap_sample(np.arange(t), B, w) 123 | dijbar, dijbarstar, vardijbar = compute_dij(losses, bsdata) 124 | includedR, excludedR, pvalsR = iterate( 125 | dijbar, dijbarstar, vardijbar, alpha, algorithm=algorithm 126 | ) 127 | return includedR, excludedR, pvalsR 128 | 129 | 130 | class ModelConfidenceSet(object): 131 | def __init__(self, data, alpha, B, w, algorithm="SQ", names=None): 132 | """ 133 | Input: 134 | data->pandas.DataFrame or numpy.ndarray: input data, columns are the losses of each model 135 | alpha->float: confidence level 136 | B->int: bootstrap size for computation covariance 137 | w->int: block size for bootstrap sampling 138 | algorithm->str: SQ or R, SQ is the first t-statistics in Hansen (2011) p.465, and R is the second t-statistics 139 | names->list: the name of each model (corresponding to each columns). 
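        Example (mirroring the call in Summary_Results.py, where each column of
        loss_df holds the losses of one model):
            mcs = ModelConfidenceSet(loss_df, 0.05, 10000, 2).run()
            print(mcs.pvalues)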
140 | 141 | Method: 142 | run(self): compute the MCS procedure 143 | 144 | Attributes: 145 | included: models that are in the model confidence sets at confidence level of alpha 146 | excluded: models that are NOT in the model confidence sets at confidence level of alpha 147 | pvalues: the bootstrap p-values of each models 148 | """ 149 | 150 | if isinstance(data, pd.DataFrame): 151 | self.data = data.values 152 | self.names = data.columns.values if names is None else names 153 | elif isinstance(data, np.ndarray): 154 | self.data = data 155 | self.names = np.arange(data.shape[1]) if names is None else names 156 | 157 | if alpha < 0 or alpha > 1: 158 | raise ValueError( 159 | f"alpha must be larger than zero and less than 1, found {alpha}" 160 | ) 161 | if not isinstance(B, int): 162 | try: 163 | B = int(B) 164 | except Exception as identifier: 165 | raise RuntimeError( 166 | f"Bootstrap size B must be a integer, fail to convert", identifier 167 | ) 168 | if B < 1: 169 | raise ValueError(f"Bootstrap size B must be larger than 1, found {B}") 170 | if not isinstance(w, int): 171 | try: 172 | w = int(w) 173 | except Exception as identifier: 174 | raise RuntimeError( 175 | f"Bootstrap block size w must be a integer, fail to convert", 176 | identifier, 177 | ) 178 | if w < 1: 179 | raise ValueError(f"Bootstrap block size w must be larger than 1, found {w}") 180 | 181 | if algorithm not in ["R", "SQ"]: 182 | raise TypeError(f"Only R and SQ algorithm supported, found {algorithm}") 183 | 184 | self.alpha = alpha 185 | self.B = B 186 | self.w = w 187 | self.algorithm = algorithm 188 | 189 | def run(self): 190 | included, excluded, pvals = MCS( 191 | self.data, self.alpha, self.B, self.w, self.algorithm 192 | ) 193 | 194 | self.included = self.names[included].ravel().tolist() 195 | self.excluded = self.names[excluded].ravel().tolist() 196 | self.pvalues = pd.Series(pvals.ravel(), index=self.excluded + self.included) 197 | return self -------------------------------------------------------------------------------- /GHAR.py: -------------------------------------------------------------------------------- 1 | """ 2 | Linear models to forecast the realized volatility, including HAR and GHAR. HAR is a special case of GHAR, assuming the adjacency matrix is identity. 
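For each stock the regressors are the 1-, 5-, and 22-day lagged averages of its
realized variance; for every adjacency matrix requested via --adj_name (default
'iden+glasso') the same lags are additionally aggregated over neighbouring stocks,
with the 'glasso' graph re-estimated on each window from a graphical-lasso precision
matrix of past daily returns. A single pooled OLS is fitted across all stocks on a
rolling window of up to 1000 past days and re-estimated every --window days, and
non-positive forecasts are floored at the ticker's in-sample minimum target.
Example: python GHAR.py --universe DJIA --horizon 1 --adj_name iden gives plain HAR
forecasts, while --adj_name iden+glasso gives GHAR.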
3 | """ 4 | 5 | import argparse 6 | import os 7 | from os.path import join 8 | 9 | import numpy as np 10 | import pandas as pd 11 | from sklearn.linear_model import LinearRegression 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--window", type=int, default=22, help="forward-looking period") 15 | parser.add_argument("--horizon", type=int, default=1, help="forecasting horizon") 16 | parser.add_argument("--model_name", type=str, default='GHAR', help="model name") 17 | parser.add_argument("--adj_name", type=str, default='iden+glasso', help="adj choices") 18 | parser.add_argument("--universe", type=str, default='DJIA', help="data name") 19 | parser.add_argument("--version", type=str, default='Forecast_Var', help="version name") 20 | 21 | opt = parser.parse_args() 22 | print(opt) 23 | 24 | # Specific version 25 | this_version = '_'.join( 26 | [opt.version, 27 | opt.model_name, 28 | opt.adj_name, 29 | opt.universe, 30 | 'W' + str(opt.window), 31 | 'F' + str(opt.horizon)]) 32 | 33 | 34 | path = 'your_local_path' 35 | model_save_path = join('your_model_storage_path', this_version) 36 | os.makedirs(model_save_path, exist_ok=True) 37 | 38 | 39 | def load_feature_data(universe): 40 | feature_df = pd.read_csv(join(path, 'Data', f'{universe}_var_FH1.csv'), index_col=0) 41 | feature_df.fillna(method="ffill", inplace=True) 42 | feature_df = feature_df[feature_df.index <= '2021-07-01'] 43 | feature_df = feature_df.sort_index(axis=1) 44 | return feature_df 45 | 46 | 47 | def load_data(universe, horizon): 48 | var_df = pd.read_csv(join(path, 'Data', f'{universe}_var_FH{horizon}.csv'), index_col=0) 49 | var_df.fillna(method="ffill", inplace=True) 50 | vech_df = var_df[var_df.index <= '2021-07-01'] 51 | vech_df = vech_df.sort_index(axis=1) 52 | return vech_df 53 | 54 | 55 | def load_ret(universe): 56 | ret_df = pd.read_csv(join(path, 'Data', f'{universe}_ret_FH1.csv'), index_col=0) 57 | ret_df.fillna(method="ffill", inplace=True) 58 | ret_df = ret_df[ret_df.index <= '2021-07-01'] 59 | ret_df = ret_df.sort_index(axis=1) 60 | return ret_df 61 | 62 | 63 | def preprocess_HAR(feature_df, vech_df): 64 | subdf_l = [] 65 | all_assets_l = [i for i in vech_df.columns if i not in ['Date', 'Time']] 66 | all_assets_l.sort() 67 | 68 | har_lags = [1, 5, 22] 69 | for target_var in vech_df: 70 | subdf = pd.DataFrame() 71 | subdf['Target'] = vech_df[target_var].copy() 72 | subdf['Date'] = vech_df.index 73 | subdf['Ticker'] = target_var 74 | indpt_df_l = [] 75 | for lag in har_lags: 76 | tmp_indpdt_df = 0 77 | for il in range(1, 1+lag): 78 | tmp_indpdt_df += feature_df[target_var].shift(il) 79 | 80 | indpt_df_l.append(tmp_indpdt_df / lag) 81 | 82 | # reverse the time order 83 | explain_df = pd.concat(indpt_df_l, axis=1) 84 | explain_df.columns = ['var+lag%d' % i for i in har_lags] 85 | 86 | subdf = pd.merge(subdf, explain_df, left_index=True, right_index=True) 87 | subdf.replace([np.inf, -np.inf], np.nan, inplace=True) 88 | subdf.dropna(inplace=True) 89 | subdf_l.append(subdf) 90 | 91 | df = pd.concat(subdf_l) 92 | df.reset_index(drop=True, inplace=True) 93 | 94 | date_l = list(set(df['Date'].tolist())) 95 | date_l.sort() 96 | 97 | subdf_dic = {} 98 | for date in date_l: 99 | subdf = df[df['Date'] == date] 100 | subdf_dic[date] = subdf 101 | 102 | print('Finish preparation!') 103 | return subdf_dic, date_l 104 | 105 | 106 | 107 | def preprocess_adj_l(date_l, subdf_dic, adj_df_l): 108 | new_subdf_l = [] 109 | for date in date_l: 110 | subdf = subdf_dic[date] 111 | # print(subdf) 112 | tmp_subdf_l = [] 113 
| clms = [i for i in subdf.columns if 'lag' in i] 114 | # print(clms) 115 | for k, adj_df in enumerate(adj_df_l): 116 | # print(adj_df) 117 | tmp_subdf = pd.DataFrame(np.dot(adj_df, subdf[clms]), columns=['sec'+str(k)+i for i in clms], index=subdf.index) 118 | tmp_subdf_l.append(tmp_subdf) 119 | new_subdf = pd.concat([subdf[['Target', 'Date', 'Ticker']]]+tmp_subdf_l, axis=1) 120 | new_subdf_l.append(new_subdf) 121 | 122 | df = pd.concat(new_subdf_l) 123 | df.reset_index(drop=True, inplace=True) 124 | print('Finish transformation!') 125 | return df 126 | 127 | 128 | def df2arr(df, vars_l): 129 | all_inputs = df[vars_l].values 130 | all_targets = df[['Target']].values 131 | return all_inputs, all_targets 132 | 133 | 134 | def GLASSO_Precision(subret): 135 | from sklearn.covariance import GraphicalLassoCV 136 | n = subret.shape[1] 137 | tickers = subret.columns 138 | cov = GraphicalLassoCV().fit(subret) 139 | print('Alpha in GLASSO: %.3f' % cov.alpha_) 140 | corr = cov.precision_ != 0 141 | print('Sparsity of Adj: %.3f' % corr.mean()) 142 | corr_adj = corr - np.identity(n) 143 | d_sqrt_inv = np.diag(np.sqrt(1/(corr_adj.sum(1)+1e-8))) 144 | adj_df = pd.DataFrame(np.dot(np.dot(d_sqrt_inv, corr_adj), d_sqrt_inv), columns=tickers, index=tickers) 145 | return adj_df 146 | 147 | 148 | def Train(ret_df, vech_df, subdf_dic, date, date_l): 149 | timestamp = date_l.index(date) 150 | # split time 151 | s_p = max(timestamp-1000, 0) 152 | f_p = min(timestamp + opt.window, len(date_l)-1) 153 | 154 | s_date = date_l[s_p] 155 | f_date = date_l[f_p] 156 | 157 | subret = ret_df[ret_df.index < date] 158 | subret = subret[subret.index >= s_date] 159 | 160 | subdata = vech_df[vech_df.index < date] 161 | subdata = subdata[subdata.index >= s_date] 162 | tickers = subret.columns 163 | 164 | n = vech_df.shape[1] 165 | adj_name_l = opt.adj_name.split('+') 166 | adj_df_l = [] 167 | for adj_name in adj_name_l: 168 | if adj_name == 'iden': 169 | adj_df = pd.DataFrame(np.identity(n), index=tickers, columns=tickers) 170 | elif adj_name == 'glasso': 171 | adj_df = GLASSO_Precision(subret) 172 | else: 173 | adj_df = pd.DataFrame(np.zeros((n, n)), index=tickers, columns=tickers) 174 | 175 | adj_df_l.append(adj_df) 176 | 177 | df = preprocess_adj_l(date_l[s_p:f_p+1], subdf_dic, adj_df_l) 178 | 179 | vars_l = [i for i in df.columns if 'lag' in i] 180 | # split data 181 | train_df = df[df['Date'] >= s_date] 182 | train_df = train_df[train_df['Date'] < date] 183 | print(train_df) 184 | 185 | test_df = df[df['Date'] >= date] 186 | test_df = test_df[test_df['Date'] < f_date] 187 | print(test_df) 188 | 189 | train_x, train_y = df2arr(train_df, vars_l) 190 | test_x, test_y = df2arr(test_df, vars_l) 191 | 192 | best_model = LinearRegression() 193 | best_model.fit(train_x, train_y) 194 | print(best_model.coef_) 195 | 196 | test_pred_df = test_df[['Ticker', 'Date']] 197 | test_pred_df['Pred_VHAR'] = best_model.predict(test_x) 198 | test_pred_df = test_pred_df.pivot(index='Date', columns='Ticker', values='Pred_VHAR') 199 | 200 | test_pred_df.columns = list(test_pred_df.columns) 201 | test_pred_df.index = list(test_pred_df.index) 202 | 203 | print('Before: %.3f' % test_pred_df.min().min()) 204 | 205 | # adjust the negative forecasts to the minimum of the training data 206 | for clm in test_pred_df.columns: 207 | clm_pred_df = test_pred_df[clm] 208 | clm_train_df = train_df[train_df['Ticker'] == clm]['Target'] 209 | clm_pred_df[clm_pred_df <= 0] = clm_train_df.min() 210 | test_pred_df[clm] = clm_pred_df 211 | 212 | print('After: %.3f' % 
test_pred_df.min().min()) 213 | 214 | save_path = join(path, 'Var_Pred_Results', this_version) 215 | os.makedirs(save_path, exist_ok=True) 216 | 217 | test_pred_df.to_csv(join(save_path, 'Pred_%s.csv' % date)) 218 | 219 | 220 | def connect_pred(): 221 | save_path = join(path, 'Var_Pred_Results', this_version) 222 | files_l = os.listdir(save_path) 223 | pred_files = [i for i in files_l if 'Pred_' in i] 224 | pred_files.sort() 225 | test_pred_df_l = [] 226 | for i in pred_files: 227 | test_pred_df = pd.read_csv(join(save_path, i), index_col=0) 228 | test_pred_df_l.append(test_pred_df) 229 | 230 | test_pred_df = pd.concat(test_pred_df_l) 231 | print(test_pred_df) 232 | 233 | sum_path = join(path, 'Var_Results_Sum') 234 | os.makedirs(sum_path, exist_ok=True) 235 | test_pred_df.to_csv(join(sum_path, this_version + '_pred.csv')) 236 | 237 | 238 | if __name__ == '__main__': 239 | feature_df = load_feature_data(opt.universe) 240 | vech_df = load_data(opt.universe, opt.horizon) 241 | ret_df = load_ret(opt.universe) 242 | 243 | n = vech_df.shape[1] 244 | 245 | subdf_dic, date_l = preprocess_HAR(feature_df, vech_df) 246 | 247 | print('Training Starts Now ...') 248 | idx = date_l.index('2011-07-01') 249 | 250 | for date in date_l[idx::opt.window]: 251 | print(' * ' * 20 + date + ' * ' * 20) 252 | Train(ret_df, vech_df, subdf_dic, date, date_l) 253 | 254 | connect_pred() 255 | -------------------------------------------------------------------------------- /GNNHAR.py: -------------------------------------------------------------------------------- 1 | """ 2 | Proposed GNNHAR models to forecast the realized volatility. 3 | Include HAR, GHAR, GNNHAR1L, GNNHAR2L, and GNNHAR3L, with different loss functions, implemented in PyTorch. 4 | For linear regressions with MSE loss, we also provide another implementation in GHAR.py, through the LinearRegression class in sklearn. 5 | """ 6 | 7 | import argparse 8 | import os 9 | from os.path import join 10 | 11 | import numpy as np 12 | import pandas as pd 13 | from torch.autograd import Variable 14 | import torch 15 | import torch.nn as nn 16 | from torch.utils.data import DataLoader, Dataset, TensorDataset, Subset 17 | import torch.optim as optim 18 | 19 | cuda = True if torch.cuda.is_available() else False 20 | Tensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor 21 | 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument("--window", type=int, default=22, help="moving window") 24 | parser.add_argument("--horizon", type=int, default=1, help="forecasting horizon") 25 | parser.add_argument("--valid_len", type=int, default=22, help="validation period") 26 | parser.add_argument("--model_name", type=str, default='GNNHAR1L', help="model name") 27 | parser.add_argument("--adj_name", type=str, default='glasso', help="adj choices") 28 | parser.add_argument("--universe", type=str, default='DJIA', help="data name") 29 | parser.add_argument("--loss", type=str, default='MSE', help="loss function") 30 | parser.add_argument("--n_epochs", type=int, default=5000, help="epochs for training") 31 | parser.add_argument("--n_hid", type=int, default=9, help="hidden neurons") 32 | parser.add_argument("--batch_size", type=int, default=128, help="size of the batches") 33 | parser.add_argument("--lr", type=float, default=1e-3, help="learning rate") 34 | parser.add_argument("--ens", type=int, default=0, help="No. 
Ensemble") 35 | parser.add_argument("--numNN", type=int, default=1, help="number of NNs") 36 | parser.add_argument("--version", type=str, default='Forecast_Var', help="version name") 37 | 38 | opt = parser.parse_args() 39 | print(opt) 40 | 41 | # Specific version 42 | this_version = '_'.join( 43 | [opt.version, 44 | opt.loss, 45 | opt.model_name, 46 | opt.adj_name, 47 | opt.universe, 48 | 'E' + str(opt.n_epochs), 49 | 'H' + str(opt.n_hid), 50 | 'BS' + str(opt.batch_size), 51 | 'LR' + str(opt.lr), 52 | 'W' + str(opt.window), 53 | 'F' + str(opt.horizon), 54 | 'Val' + str(opt.valid_len)]) 55 | 56 | path = 'your_local_path' 57 | model_save_path = join('your_model_storage_path', this_version) 58 | os.makedirs(model_save_path, exist_ok=True) 59 | 60 | 61 | def load_feature_data(universe): 62 | feature_df = pd.read_csv(join(path, 'Data', f'{universe}_var_FH1.csv'), index_col=0) 63 | feature_df.fillna(method="ffill", inplace=True) 64 | feature_df = feature_df[feature_df.index <= '2021-07-01'] 65 | feature_df = feature_df.sort_index(axis=1) 66 | return feature_df 67 | 68 | 69 | def load_data(universe, horizon): 70 | var_df = pd.read_csv(join(path, 'Data', f'{universe}_var_FH{horizon}.csv'), index_col=0) 71 | var_df.fillna(method="ffill", inplace=True) 72 | vech_df = var_df[var_df.index <= '2021-07-01'] 73 | vech_df = vech_df.sort_index(axis=1) 74 | return vech_df 75 | 76 | 77 | def load_ret(universe): 78 | ret_df = pd.read_csv(join(path, 'Data', f'{universe}_ret_FH1.csv'), index_col=0) 79 | ret_df.fillna(method="ffill", inplace=True) 80 | ret_df = ret_df[ret_df.index <= '2021-07-01'] 81 | ret_df = ret_df.sort_index(axis=1) 82 | return ret_df 83 | 84 | 85 | def get_lag_avg(df, lag): 86 | res = pd.DataFrame(columns=df.columns, index=df.index).fillna(0) 87 | for l in range(1, lag + 1): 88 | res += (1 / lag) * df.shift(l) 89 | return res 90 | 91 | 92 | def preprocess_adj_l(date_l, subdf_dic, adj_df_l): 93 | new_subdf_l = [] 94 | for date in date_l: 95 | subdf = subdf_dic[date] 96 | # print(subdf) 97 | tmp_subdf_l = [] 98 | clms = [i for i in subdf.columns if 'lag' in i] 99 | # print(clms) 100 | for k, adj_df in enumerate(adj_df_l): 101 | # print(adj_df) 102 | tmp_subdf = pd.DataFrame(np.dot(adj_df, subdf[clms]), columns=['sec'+str(k)+i for i in clms], index=subdf.index) 103 | tmp_subdf_l.append(tmp_subdf) 104 | new_subdf = pd.concat([subdf[['Target', 'Date', 'Ticker']]]+tmp_subdf_l, axis=1) 105 | new_subdf_l.append(new_subdf) 106 | 107 | df = pd.concat(new_subdf_l) 108 | df.reset_index(drop=True, inplace=True) 109 | print('Finish transformation!') 110 | return df 111 | 112 | 113 | def GLASSO_Precision(subret): 114 | from sklearn.covariance import GraphicalLassoCV 115 | n = subret.shape[1] 116 | tickers = subret.columns 117 | cov = GraphicalLassoCV().fit(subret) 118 | print('Alpha in GLASSO: %.3f' % cov.alpha_) 119 | corr = cov.precision_ != 0 120 | print('Sparsity of Adj: %.3f' % corr.mean()) 121 | corr_adj = corr - np.identity(n) 122 | # adj_df = pd.DataFrame(corr_adj / (corr_adj.sum(1)[:, np.newaxis] + 1e-8), columns=tickers, index=tickers) 123 | d_sqrt_inv = np.diag(np.sqrt(1/(corr_adj.sum(1)+1e-8))) 124 | adj_df = pd.DataFrame(np.dot(np.dot(d_sqrt_inv, corr_adj), d_sqrt_inv), columns=tickers, index=tickers) 125 | return adj_df 126 | 127 | 128 | class GraphConvLayer(nn.Module): 129 | def __init__(self, in_features, out_features, bias=True): 130 | super(GraphConvLayer, self).__init__() 131 | 132 | self.weight = nn.Parameter(torch.FloatTensor(in_features, out_features)) 133 | 
nn.init.xavier_uniform_(self.weight, gain=nn.init.calculate_gain('relu')) 134 | 135 | if bias is True: 136 | self.bias = nn.Parameter(torch.FloatTensor(1, out_features)) 137 | nn.init.ones_(self.bias) 138 | else: 139 | self.bias = None 140 | 141 | def forward(self, node_feature, adj): 142 | h = torch.matmul(node_feature, self.weight) 143 | output = torch.matmul(adj, h) 144 | if self.bias is not None: 145 | return output + self.bias 146 | return output 147 | 148 | # HAR model 149 | class HAR(nn.Module): 150 | def __init__(self): 151 | super(HAR, self).__init__() 152 | 153 | self.linear1 = nn.Linear(3, 1, bias=True) 154 | self.relu = nn.ReLU() 155 | 156 | def forward(self, node_feat, adj): 157 | # node_feat: (batch_size, N, 3) 158 | # adj: (N, N) 159 | 160 | H1 = self.linear1(node_feat) 161 | res = self.relu(H1) 162 | 163 | return res.squeeze(-1) 164 | 165 | 166 | # GHAR model 167 | class GHAR(nn.Module): 168 | def __init__(self, n_hid): 169 | super(GHAR, self).__init__() 170 | 171 | self.linear1 = nn.Linear(3, 1, bias=True) 172 | 173 | self.gcn1 = GraphConvLayer(3, n_hid, bias=False) 174 | self.relu = nn.ReLU() 175 | 176 | def forward(self, node_feat, adj): 177 | # node_feat: (batch_size, N, 3) 178 | # adj: (N, N) 179 | 180 | H1 = self.linear1(node_feat) 181 | 182 | H2 = self.gcn1(node_feat, adj) 183 | res = H1 + H2 184 | res = self.relu(res) 185 | 186 | return res.squeeze(-1) 187 | 188 | # 1-layer GNNHAR 189 | class GNNHAR1L(nn.Module): 190 | def __init__(self, n_hid): 191 | super(GNNHAR1L, self).__init__() 192 | 193 | self.linear1 = nn.Linear(3, 1, bias=True) 194 | 195 | self.gcn1 = GraphConvLayer(3, n_hid, bias=False) 196 | self.mlp1 = nn.Linear(n_hid, 1, bias = False) 197 | self.relu = nn.ReLU() 198 | 199 | def forward(self, node_feat, adj): 200 | # node_feat: (batch_size, N, 3) 201 | # adj: (N, N) 202 | 203 | H1 = self.linear1(node_feat) 204 | 205 | H2 = self.gcn1(node_feat, adj) 206 | H2 = self.relu(H2) 207 | H2 = self.mlp1(H2) # (batch_size, N, 1) 208 | 209 | res = H1 + H2 210 | res = self.relu(res) 211 | 212 | return res.squeeze(-1) 213 | 214 | 215 | class GNNHAR2L(nn.Module): 216 | def __init__(self, nhid): 217 | super(GNNHAR2L, self).__init__() 218 | 219 | self.linear1 = nn.Linear(3, 1, bias=True) 220 | 221 | self.gcn1 = GraphConvLayer(3, nhid, bias=False) 222 | self.gcn2 = GraphConvLayer(nhid, nhid, bias = False) 223 | 224 | self.mlp1 = nn.Linear(nhid, 1, bias = False) 225 | self.relu = nn.ReLU() 226 | 227 | def forward(self, node_feat, adj): 228 | # node_feat: (batch_size, N, 3) 229 | # adj: (N, N) 230 | 231 | H1 = self.linear1(node_feat) 232 | 233 | # 2-layer of GCN: 234 | H2 = self.relu(self.gcn1(node_feat, adj)) 235 | H2 = self.relu(self.gcn2(H2, adj)) 236 | 237 | # dimension: nhid to 1 238 | H2 = self.mlp1(H2) # (batch_size, N, 1) 239 | 240 | res = H1 + H2 241 | res = self.relu(res) 242 | 243 | return res.squeeze(-1) 244 | 245 | 246 | class GNNHAR3L(nn.Module): 247 | def __init__(self, nhid): 248 | super(GNNHAR3L, self).__init__() 249 | 250 | self.linear1 = nn.Linear(3, 1, bias=True) 251 | 252 | self.gcn1 = GraphConvLayer(3, nhid, bias=False) 253 | self.gcn2 = GraphConvLayer(nhid, nhid, bias = False) 254 | self.gcn3 = GraphConvLayer(nhid, nhid, bias = False) 255 | 256 | self.mlp1 = nn.Linear(nhid, 1, bias = False) 257 | self.relu = nn.ReLU() 258 | 259 | def forward(self, node_feat, adj): 260 | # node_feat: (batch_size, N, 3) 261 | # adj: (N, N) 262 | 263 | H1 = self.linear1(node_feat) 264 | 265 | # 2-layer of GCN: 266 | H2 = self.relu(self.gcn1(node_feat, adj)) 267 | H2 = 
self.relu(self.gcn2(H2, adj)) 268 | H2 = self.relu(self.gcn3(H2, adj)) 269 | 270 | # dimension: nhid to 1 271 | H2 = self.mlp1(H2) # (batch_size, N, 1) 272 | 273 | res = H1 + H2 274 | res = self.relu(res) 275 | 276 | return res.squeeze(-1) 277 | 278 | 279 | def Compute_Adj(ret_df, vech_df, date, date_l): 280 | timestamp = date_l.index(date) 281 | # split time 282 | s_p = max(timestamp-1000, 0) 283 | v_p = timestamp - opt.valid_len 284 | f_p = min(timestamp + opt.window, len(date_l)-1) 285 | 286 | s_date = date_l[s_p] 287 | v_date = date_l[v_p] 288 | f_date = date_l[f_p] 289 | 290 | subret = ret_df[ret_df.index < date] 291 | subret = subret[subret.index >= s_date] 292 | 293 | subdata = vech_df[vech_df.index < date] 294 | subdata = subdata[subdata.index >= s_date] 295 | 296 | n = vech_df.shape[1] 297 | adj_name = opt.adj_name 298 | tickers = subret.columns 299 | 300 | if adj_name == 'glasso': 301 | adj_df = GLASSO_Precision(subret) 302 | else: 303 | adj_df = pd.DataFrame(np.zeros((n, n)), columns=tickers, index=tickers) 304 | 305 | print((s_date, v_date, f_date)) 306 | adj_df = Tensor(adj_df.values) 307 | return adj_df, s_p, v_p, timestamp, f_p 308 | 309 | 310 | def df2arr(df, vars_l): 311 | all_inputs = Tensor(df[vars_l].values) 312 | all_targets = Tensor(df[['Target']].values) 313 | return all_inputs, all_targets 314 | 315 | 316 | class Loss(nn.Module): 317 | def __init__(self): 318 | super().__init__() 319 | 320 | def forward(self, outputs, forecast_y): 321 | if opt.loss == 'QLike': 322 | true_fore = outputs / (forecast_y + 1e-4) # stablize the training 323 | l_v = torch.mean(true_fore - torch.log(true_fore)) 324 | else: 325 | mseloss = nn.MSELoss() 326 | l_v = mseloss(outputs, forecast_y) 327 | return l_v 328 | 329 | 330 | # Train a single model 331 | def Train_Single(train_loader, valid_loader, model_index, seed, date): 332 | torch.manual_seed(seed) 333 | print("------ Model %d Starts with Random Seed %d " % (model_index, seed)) 334 | if opt.model_name == 'HAR': 335 | model = HAR() 336 | elif opt.model_name == 'GHAR': 337 | model = GHAR(opt.n_hid) 338 | elif opt.model_name == 'GNNHAR1L': 339 | model = GNNHAR1L(opt.n_hid) 340 | elif opt.model_name == 'GNNHAR2L': 341 | model = GNNHAR2L(opt.n_hid) 342 | elif opt.model_name == 'GNNHAR3L': 343 | model = GNNHAR3L(opt.n_hid) 344 | else: 345 | print('Please choose the correct model') 346 | return 347 | 348 | if cuda: 349 | model.cuda() 350 | 351 | for parameter in model.parameters(): 352 | print(parameter) 353 | 354 | # optimizer 355 | loss_function = Loss() 356 | optimizer = optim.Adam(model.parameters(), lr=opt.lr, weight_decay=1e-5) 357 | best_val_mse = 1e8 358 | 359 | train_loss = [] 360 | valid_loss = [] 361 | for epoch in range(opt.n_epochs): # loop over the dataset multiple times 362 | epoch_loss_train = [] 363 | epoch_loss_valid = [] 364 | 365 | model.train() 366 | for _, (train_X, train_y) in enumerate(train_loader): 367 | train_X, train_y = Variable(train_X), Variable(train_y) 368 | 369 | # zero the parameter gradients 370 | optimizer.zero_grad() 371 | 372 | # forward + backward + optimize 373 | forecast_y = model(train_X, adj_df) 374 | loss = loss_function(train_y, forecast_y) 375 | loss.backward() 376 | optimizer.step() 377 | epoch_loss_train.append(loss.item()) 378 | 379 | # validation data 380 | model.eval() 381 | for _, (val_X, val_y) in enumerate(valid_loader): 382 | val_X, val_y = Variable(val_X), Variable(val_y) 383 | 384 | val_out = model(val_X, adj_df) 385 | loss = loss_function(val_y, val_out) 386 | 
# Train a single model
def Train_Single(train_loader, valid_loader, model_index, seed, date):
    torch.manual_seed(seed)
    print("------ Model %d Starts with Random Seed %d " % (model_index, seed))
    if opt.model_name == 'HAR':
        model = HAR()
    elif opt.model_name == 'GHAR':
        model = GHAR(opt.n_hid)
    elif opt.model_name == 'GNNHAR1L':
        model = GNNHAR1L(opt.n_hid)
    elif opt.model_name == 'GNNHAR2L':
        model = GNNHAR2L(opt.n_hid)
    elif opt.model_name == 'GNNHAR3L':
        model = GNNHAR3L(opt.n_hid)
    else:
        print('Please choose the correct model')
        return

    if cuda:
        model.cuda()

    for parameter in model.parameters():
        print(parameter)

    # loss and optimizer
    loss_function = Loss()
    optimizer = optim.Adam(model.parameters(), lr=opt.lr, weight_decay=1e-5)
    best_val_mse = 1e8

    train_loss = []
    valid_loss = []
    for epoch in range(opt.n_epochs):  # loop over the dataset multiple times
        epoch_loss_train = []
        epoch_loss_valid = []

        model.train()
        for _, (train_X, train_y) in enumerate(train_loader):
            train_X, train_y = Variable(train_X), Variable(train_y)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize (adj_df is the module-level adjacency set in the main loop)
            forecast_y = model(train_X, adj_df)
            loss = loss_function(train_y, forecast_y)
            loss.backward()
            optimizer.step()
            epoch_loss_train.append(loss.item())

        # validation data
        model.eval()
        for _, (val_X, val_y) in enumerate(valid_loader):
            val_X, val_y = Variable(val_X), Variable(val_y)

            val_out = model(val_X, adj_df)
            loss = loss_function(val_y, val_out)
            epoch_loss_valid.append(loss.item())

        train_loss_epoch = np.mean(epoch_loss_train)
        valid_loss_epoch = np.mean(epoch_loss_valid)
        train_loss.append(train_loss_epoch)
        valid_loss.append(valid_loss_epoch)

        if epoch % int(opt.n_epochs/10) == 0:
            print("[Epoch %d] [Train Loss: %.4f] [Valid Loss: %.4f]" % (epoch, train_loss_epoch, valid_loss_epoch))

        # if the epoch-level validation loss decreases, save the model parameters
        if valid_loss_epoch < best_val_mse:
            best_val_mse = valid_loss_epoch
            torch.save(model.state_dict(), join(model_save_path, 'Best_Model' + '_' + date + '_index%d' % model_index))

    train_loss_arr = np.array(train_loss)
    valid_loss_arr = np.array(valid_loss)
    loss_arr = np.stack([train_loss_arr, valid_loss_arr], axis=1)
    loss_df = pd.DataFrame(loss_arr, columns=['Train', 'Valid'])
    loss_df.to_csv(join(model_save_path, 'loss_%s_index%d.csv' % (date, model_index)), index=False)
    return loss_df
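# Illustrative call with hypothetical arguments (not executed here):
#     loss_df = Train_Single(train_loader, valid_loader, model_index=0, seed=1234, date='2011-07-01')
# It returns a DataFrame with one row per epoch and columns ['Train', 'Valid'], writes that history
# to 'loss_<date>_index0.csv', and checkpoints the best parameters to 'Best_Model_<date>_index0'
# under model_save_path.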
def Train(dataset, adj_df, s_p, v_p, timestamp, f_p, targets, date):
    train_idx = range(s_p, v_p)
    val_idx = range(v_p, timestamp)
    test_idx = range(timestamp, f_p)

    train_dataset = Subset(dataset, train_idx)
    val_dataset = Subset(dataset, val_idx)
    test_dataset = Subset(dataset, test_idx)

    train_loader = DataLoader(train_dataset, batch_size=opt.batch_size, shuffle=True)
    valid_loader = DataLoader(val_dataset, batch_size=opt.batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=len(test_idx), shuffle=False)

    # Train multiple models with different random seeds
    for iii in range(opt.ens*opt.numNN, (opt.ens+1)*opt.numNN):
        seed = np.random.randint(low=1, high=10000)
        loss_df = Train_Single(train_loader, valid_loader, model_index=iii, seed=seed, date=date)

        # restart training with a new seed if the validation loss is mostly flat or still very large
        while (np.abs(loss_df['Valid'].diff()) < 1e-6).mean() > 0.5 or loss_df['Valid'].iloc[-1] > 100:
            print(' * ' * 20)
            print(' Attention!!! Restart Training!!! ')
            print(' * ' * 20)
            seed = np.random.randint(low=1, high=10000)
            loss_df = Train_Single(train_loader, valid_loader, model_index=iii, seed=seed, date=date)

    # Forecast the testing period
    for iii in range(opt.ens*opt.numNN, (opt.ens+1)*opt.numNN):
        with torch.no_grad():
            if opt.model_name == 'HAR':
                model = HAR()
            elif opt.model_name == 'GHAR':
                model = GHAR(opt.n_hid)
            elif opt.model_name == 'GNNHAR1L':
                model = GNNHAR1L(opt.n_hid)
            elif opt.model_name == 'GNNHAR2L':
                model = GNNHAR2L(opt.n_hid)
            elif opt.model_name == 'GNNHAR3L':
                model = GNNHAR3L(opt.n_hid)
            else:
                print('Please choose the correct model')
                return

            model.load_state_dict(torch.load(join(model_save_path, 'Best_Model' + '_' + date + '_index%d' % iii)))
            model.eval()

            if cuda:
                model.cuda()

            for _, (test_X, test_y) in enumerate(test_loader):
                test_X, test_y = Variable(test_X), Variable(test_y)
                forecast_test_y = model(test_X, adj_df)

            y_pred = forecast_test_y.cpu().detach().numpy()
            test_pred_df = pd.DataFrame(y_pred, index=targets.index[test_idx], columns=targets.columns)

            print('Min: %.3f' % test_pred_df.min().min())

            save_path = join(path, 'Var_Pred_Results', this_version)
            os.makedirs(save_path, exist_ok=True)

            test_pred_df.to_csv(join(save_path, 'Pred_%s_Ens%d.csv' % (date, iii)))


# Some trained models may not converge well; we only use the forecasts from the models with good convergence.
# The selection is based on the training and validation data only, so there is no look-ahead bias.
def Screen_Ensemble(date, thres_perc=50):
    loss_l = []
    for j in range(opt.numNN):
        loss_df = pd.read_csv(join(model_save_path, 'loss_%s_index%d.csv' % (date, j)))
        loss_l.append(loss_df['Valid'].iloc[-1])

    threshold_loss = np.percentile(loss_l, thres_perc)
    select_l = []
    for j in range(opt.numNN):
        if loss_l[j] <= threshold_loss:
            select_l.append(j)
    return select_l


# Concatenate the forecasts of all sub-periods into one forecast matrix of shape T_t * N,
# where T_t is the length of the entire testing period
def connect_pred():
    save_path = join(path, 'Var_Pred_Results', this_version)
    files_l = os.listdir(save_path)
    dates_l = [i.split('_')[1] for i in files_l if 'Pred_' in i and '_Ens0.csv' in i]
    dates_l = list(set(dates_l))
    dates_l.sort()

    test_pred_df_l = []
    for date in dates_l:
        tmp_pred_df_l = []
        select_l = Screen_Ensemble(date)
        for j in select_l:
            tmp_test_pred_df = pd.read_csv(join(save_path, '_'.join(['Pred', date, 'Ens%d.csv' % j])), index_col=0)
            tmp_pred_df_l.append(tmp_test_pred_df)

        test_pred_df = pd.DataFrame(np.stack(tmp_pred_df_l).mean(0), index=tmp_test_pred_df.index, columns=tmp_test_pred_df.columns)
        test_pred_df_l.append(test_pred_df)

    test_pred_df = pd.concat(test_pred_df_l) * opt.horizon
    print(test_pred_df)

    sum_path = join(path, 'Var_Results_Sum')
    os.makedirs(sum_path, exist_ok=True)
    test_pred_df.to_csv(join(sum_path, this_version + '_pred.csv'))
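# Illustrative example of the screening step, with made-up numbers: if opt.numNN = 10 and
# thres_perc = 50, Screen_Ensemble keeps roughly the half of the runs whose final validation losses
# are at or below the median, and connect_pred averages only those runs' forecasts for each
# sub-period before concatenating them over time (the final forecasts are multiplied by opt.horizon
# to undo the Y /= opt.horizon scaling applied in the main block below).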
if __name__ == '__main__':
    feature_df = load_feature_data(opt.universe)
    vech_df = load_data(opt.universe, opt.horizon)
    ret_df = load_ret(opt.universe)

    n = vech_df.shape[1]

    # HAR-style predictors: lagged averages of the features over the past 1, 5, and 22 days
    if opt.horizon == 1:
        lag1 = get_lag_avg(feature_df, 1).iloc[22:]
        lag5 = get_lag_avg(feature_df, 5).iloc[22:]
        lag22 = get_lag_avg(feature_df, 22).iloc[22:]
    else:
        # for multi-step horizons, drop the last (opt.horizon - 1) feature rows
        e_idx = -opt.horizon + 1
        lag1 = get_lag_avg(feature_df, 1).iloc[22:e_idx]
        lag5 = get_lag_avg(feature_df, 5).iloc[22:e_idx]
        lag22 = get_lag_avg(feature_df, 22).iloc[22:e_idx]

    targets = vech_df.iloc[22:]

    Y, lag1, lag5, lag22 = np.array(targets), np.array(lag1), np.array(lag5), np.array(lag22)

    # scale the target by the forecast horizon (undone in connect_pred)
    Y /= opt.horizon

    X = [lag1, lag5, lag22]

    X = np.stack(X, axis=-1)
    X, Y = Tensor(X), Tensor(Y)

    dataset = TensorDataset(X, Y)

    print('Training Starts Now ...')
    date_l = targets.index.tolist()
    # first out-of-sample forecast date
    idx = date_l.index('2011-07-01')

    # rolling scheme: re-estimate the models every opt.window days over the testing period
    for date in date_l[idx::opt.window]:
        print(' * ' * 20 + date + ' * ' * 20)
        adj_df, s_p, v_p, timestamp, f_p = Compute_Adj(ret_df, vech_df, date, date_l)
        Train(dataset, adj_df, s_p, v_p, timestamp, f_p, targets, date)

    connect_pred()
--------------------------------------------------------------------------------