├── economic_data.xlsx
├── FactorAugmentedVAR.py
├── Functions.py
└── copper_price_forecast.ipynb
/economic_data.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HoagieT/Factor-Augmented-Vector-Autoregression/HEAD/economic_data.xlsx
--------------------------------------------------------------------------------
/FactorAugmentedVAR.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sat Nov 14 23:00:48 2020
4 |
5 | @author: Hogan Tong
6 | """
7 |
8 | from Functions import *
9 | from scipy.interpolate import interp1d
10 | import numpy as np
11 | import matplotlib.pyplot as plt
12 | import pandas as pd
13 | import os
14 | from datetime import datetime
15 | import tkinter.filedialog
16 | from datetime import timedelta
17 | import math
18 | import time
19 | import calendar
20 | from numba import jit
21 | from sklearn.metrics import mean_squared_error
22 | from statsmodels.tsa.api import VAR
23 | import scipy
24 | import statsmodels.tsa.stattools as ts
25 | import statsmodels.tsa as tsa
26 | import matplotlib.pyplot as plt
27 | import numpy as np
28 | import sklearn
29 | from sklearn.metrics import mean_squared_error, r2_score
30 | import statsmodels.api as sm
31 | from datetime import datetime,timedelta
32 | from statsmodels.tsa.stattools import adfuller
33 | from numpy.linalg import cholesky
34 |
35 | def Factor(Y, X_slow, n_factors, X_fast='None'):
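    # Two-step factor extraction (Bernanke-Boivin-Eliasz style):
    #   step 1: PCA on all observables [Y, X_slow, X_fast] -> common components C and their loadings
    #   step 2: PCA on the slow-moving variables only -> F_minus
    #   step 3: regress C on [Y, F_minus] and back out F = Lambda_f^{-1} (C - Lambda_y * Y)
    # X_fast is optional; leave the default string 'None' to omit fast-moving variables.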
36 | #n_time = len(Y.index)
37 | n_var = len(Y.columns)
38 | if isinstance(X_fast,str)==True:
39 | hist = Y.join(X_slow)
40 | else:
41 | hist = Y.join(X_slow).join(X_fast)
42 |
43 | hist=hist.dropna(axis=0,how='any')
44 |
45 | "step 1 - PCA on all observable variables"
46 | x = np.mat(hist - hist.mean())
47 | z = np.mat((hist - hist.mean())/hist.std())
48 | D, V, S = calculate_pca(hist, n_factors + n_var)
49 | Psi = np.mat(np.diag(np.diag(S - V.dot(D).dot(V.T))))
50 | factors = V.T.dot(z.T).T
51 | C = pd.DataFrame(data=factors, index=hist.index, columns=['C' + str(i+1) for i in range(n_factors+n_var)])
52 | Loadings_C = calculate_factor_loadings(hist, C)
53 |
54 | "step 2 - PCA on slow moving variables"
55 | x = np.mat(X_slow-X_slow.mean())
56 | z = np.mat((X_slow-X_slow.mean())/X_slow.std())
57 | D, V, S = calculate_pca(X_slow, n_factors)
58 | Psi = np.mat(np.diag(np.diag(S - V.dot(D).dot(V.T))))
59 | factors = V.T.dot(z.T).T
60 | F_minus = pd.DataFrame(data=factors, index=X_slow.index, columns=['F_minus' + str(i+1) for i in range(n_factors)])
61 | Loadings_F_slow = calculate_factor_loadings(X_slow, F_minus)
62 |
63 | "step 3 - C_t = b1*Y_t + b2*F_t"
64 | X = Y.join(F_minus)
65 | B = calculate_factor_loadings(C, X)
66 | Lambda_y, Lambda_f = B[:,0:n_var], B[:,n_var:]
67 | # F_t= Lambda_f^-1*(C_t-Lambda_y*Y_t)
68 | F = Lambda_f.I.dot((np.mat(C).T - Lambda_y.dot(Y.T))).T
69 | F = pd.DataFrame(data=F, index=X_slow.index, columns=['F' + str(i+1) for i in range(n_factors)])
70 |
71 | return FactorResultsWrapper(C=C, Lambda_c=Loadings_C, F_minus=F_minus, F=F)
72 |
73 | class FactorResultsWrapper():
74 | def __init__(self, C, Lambda_c, F_minus, F):
75 | self.C = C
76 | self.Lambda_c = Lambda_c
77 | self.F_minus = F_minus
78 | self.F = F
79 |
80 | def FAVAR(Factor, Y, lag):
81 | hist = Y.join(Factor)
82 | model=VAR(hist,missing='drop').fit(lag,trend='nc')
83 |
84 | return FAVARResultsWrapper(VAR=model)
85 |
86 | class FAVARResultsWrapper():
87 | def __init__(self, VAR):
88 | self.VAR = VAR
89 |
90 | def summary(self):
91 | print(self.VAR.summary())
92 | return
93 |
94 | def predict(self, Factor, Y, step, freq='M', alpha=0.05):
95 | hist = Y.join(Factor)
96 | [forecast_mean,forecast_low,forecast_up] = self.VAR.forecast_interval(hist.values, step, alpha)
97 | mean = np.concatenate((hist.values, forecast_mean), axis=0)
98 | up = np.concatenate((hist.values, forecast_up), axis=0)
99 | low = np.concatenate((hist.values, forecast_low), axis=0)
100 |
101 |
102 | dates = pd.date_range(Y.index[0], periods=len(Y.index)+step,freq=freq)
103 |
104 | mean = pd.DataFrame(data=mean[:,0:len(Y.columns)], columns=Y.columns.tolist(), index=dates)
105 | low = pd.DataFrame(data=low[:,0:len(Y.columns)], columns=Y.columns.tolist(), index=dates)
106 | up = pd.DataFrame(data=up[:,0:len(Y.columns)], columns=Y.columns.tolist(), index=dates)
107 |
108 | return [mean,low,up]
109 |
110 | def predict_plot(self, Factor, Y, step, freq='M', alpha=0.05, figure_size=[18,12],line_width=3.0,font_size='xx-large', actural='None'):
111 | mean, low, up = self.predict(Factor, Y, step, freq, alpha)
112 | n_var = len(mean.columns)
113 | n_act = len(Y.index)
114 |
115 | plt.rcParams['figure.figsize'] = (figure_size[0], figure_size[1])
116 | plt.rcParams['lines.markersize'] = 6
117 | plt.rcParams['image.cmap'] = 'gray'
118 |
119 | for i in range(n_var):
120 | plt.figure()
121 | plt.plot(mean.index[n_act-1:],mean.iloc[n_act-1:,i],color='r',label='forecast', linewidth=line_width)
122 | plt.plot(mean.index[:n_act],mean.iloc[:n_act,i],color='k',label='observed',linewidth=line_width)
123 | plt.plot(mean.index[n_act-1:],low.iloc[n_act-1:,i],color='r', linestyle = '--', label='lower - '+str(int(100-alpha*100))+'%',linewidth=line_width)
124 | plt.plot(mean.index[n_act-1:],up.iloc[n_act-1:,i],color='r', linestyle = ':', label='upper - '+str(int(100-alpha*100))+'%',linewidth=line_width)
125 | plt.legend()
126 | if isinstance(actural,str)!=True:
127 | plt.plot(mean.index[n_act-1:],actural.iloc[:,i],color='k',label='observed', linewidth=line_width)
128 | plt.title(mean.columns[i], fontweight='bold', fontsize=font_size)
129 | #plt.xlabel('Date')
130 | #plt.ylabel('Value')
131 | plt.show()
132 |
133 | return
134 |
--------------------------------------------------------------------------------
/Functions.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri Nov 13 21:45:00 2020
4 |
5 | @author: Hogan Tong
6 | """
7 | from scipy.interpolate import interp1d
8 | import numpy as np
9 | import matplotlib.pyplot as plt
10 | import pandas as pd
11 | import os
12 | from datetime import datetime
13 | import tkinter.filedialog
14 | from datetime import timedelta
15 | import math
16 | import time
17 | import calendar
18 | from numba import jit
19 | from sklearn.metrics import mean_squared_error
20 | import scipy
21 | import statsmodels.tsa.stattools as ts
22 | import statsmodels.tsa as tsa
23 | import matplotlib.pyplot as plt
24 | import numpy as np
25 | import sklearn
26 | from sklearn.metrics import mean_squared_error, r2_score
27 | import statsmodels.api as sm
28 | from datetime import datetime,timedelta
29 |
30 | def import_data(file_name, sheet_name, start=0, interpolation=False, encoding='gb18030'):
31 | Temp = pd.read_excel(file_name, sheet_name, encoding = encoding)
32 | res = Temp.iloc[start:,1:]
33 | res.index = Temp.iloc[start:,0]
34 | if interpolation==True:
35 | res = DataInterpolation(res, 0, len(res.index), 'cubic').dropna(axis=0,how='any')
36 | return res
37 |
38 | def DataInterpolation(data, start, end, method):
39 | # data must be a time series dataframe
40 | n_row = len(data.index)
41 | n_col = len(data.columns)
42 | res = np.array(np.zeros(shape=(n_row,n_col)))
43 |
44 | for i in range(n_col):
45 | res[:,i] = np.array(data.iloc[:,i]).T
46 | y=data.iloc[start:end,i]
47 | location = np.where(y.notnull())[0]
48 | upper_bound=max(location)
49 | lower_bound=min(location)
50 | f2 = interp1d(location, y[y.notnull()], kind=method)
51 | x = np.linspace(lower_bound, upper_bound, num=upper_bound-lower_bound, endpoint=False)
52 | res[lower_bound:upper_bound,i]=np.array(f2(x)).T
53 |
54 | res = pd.DataFrame(res, index=data.index, columns=data.columns)
55 |
56 | return res
57 |
58 | def rand_Matrix(n_row, n_col):
59 | randArr = np.random.randn(n_row, n_col)
60 | randMat = np.mat(randArr)
61 | return randMat
62 |
63 |
64 | def calculate_factor_loadings(observables, factors):
65 |     # Both dataframes must share the same time index; returns the OLS loadings Lambda = (sum_t x_t' F_t)(sum_t F_t' F_t)^{-1}
66 | n_time = len(observables.index)
67 | x = np.mat(observables-observables.mean())
68 | F=np.mat(factors)
69 | temp = F[0].T.dot(F[0])
70 | for i in range(1,n_time):
71 | temp = temp + F[i].T.dot(F[i])
72 |
73 | Lambda = x[0].T.dot(F[0]).dot(temp.I)
74 | for i in range(1,n_time):
75 | Lambda = Lambda + x[i].T.dot(F[i]).dot(temp.I)
76 |
77 | return Lambda
78 |
79 | def calculate_prediction_matrix(factors):
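    # Least-squares estimate of the VAR(1) transition matrix A in F_t = A F_{t-1} + u_t,
    # i.e. A = (sum_t F_t' F_{t-1}) (sum_t F_{t-1}' F_{t-1})^{-1}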
80 | n_time = len(factors.index)
81 | F=np.mat(factors)
82 |
83 | temp = F[0].T.dot(F[0])
84 | for i in range(2,n_time):
85 | temp = temp + F[i-1].T.dot(F[i-1])
86 |
87 | A = F[1].T.dot(F[0]).dot(temp.I)
88 | for i in range(2,n_time):
89 | A = A + F[i].T.dot(F[i-1]).dot(temp.I)
90 |
91 | return A
92 |
93 | def calculate_shock_matrix(factors, prediction_matrix, n_shocks):
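    # Sigma: residual covariance of the factor VAR(1) implied by prediction_matrix A;
    # B: shock-loading matrix built from the n_shocks largest eigenvalues/eigenvectors of Sigma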
94 | n_time = len(factors.index)
95 | F = np.mat(factors)
96 | A = prediction_matrix
97 |
98 | temp = F[0].T.dot(F[0])
99 | for i in range(2,n_time):
100 | temp = temp + F[i-1].T.dot(F[i-1])
101 |
102 | term1 = F[1].T.dot(F[1])
103 | for i in range(2,n_time):
104 | term1 = term1 + F[i].T.dot(F[i])
105 | term1 = term1/(n_time-1)
106 | term2 = A.dot(temp/(n_time-1)).dot(A.T)
107 | Sigma = term1 - term2
108 |
109 | eigenvalues, eigenvectors = np.linalg.eig(Sigma)
110 | sorted_indices = np.argsort(eigenvalues)
111 | evalues = eigenvalues[sorted_indices[:-n_shocks-1:-1]]
112 | M = eigenvectors[:,sorted_indices[:-n_shocks-1:-1]]
113 | B = M.dot(np.diag(pow(evalues,0.5)))
114 |
115 | return B, Sigma
116 |
117 | def calculate_pca(observables, n_factors):
118 |     # PCA on the standardized data: S = sum_t z_t' z_t; returns D (diagonal of the top n_factors eigenvalues), V (eigenvectors), S
119 | n_time = len(observables.index)
120 | x = np.mat(observables-observables.mean())
121 | z = np.mat((observables-observables.mean())/observables.std())
122 |
123 | S = z[0].T.dot(z[0])
124 | for i in range(1,n_time):
125 | S = S + z[i].T.dot(z[i])
126 |
127 | eigenvalues, eigenvectors = np.linalg.eig(S)
128 | sorted_indices = np.argsort(eigenvalues)
129 | evalues = eigenvalues[sorted_indices[:-n_factors-1:-1]]
130 | V = np.mat(eigenvectors[:,sorted_indices[:-n_factors-1:-1]])
131 | D = np.diag(evalues)
132 |
133 | return D, V, S
134 |
135 | def calculate_covariance(factors):
136 | n_time = len(factors.index)
137 | F = np.mat(factors)
138 | temp = [factors.iloc[:,i] for i in range(len(factors.columns))]
139 | return np.cov(temp)
140 |
141 | def plot_double_axis(x1,x2, label1, label2, title=False, shift1=0,shift2=0, fig_size=[24,16], line_width=3.0,font_size='xx-large', legend_size=12, grid=False, legend_loc=(1,1)):
142 | plt.rcParams['figure.figsize'] = (fig_size[0], fig_size[1])
143 | plt.rcParams['lines.markersize'] = 6
144 | plt.rcParams['legend.fontsize'] = legend_size
145 | plt.rcParams['font.sans-serif']=['SimHei']
146 | plt.rcParams['axes.unicode_minus'] = False
147 |
148 | fig = plt.figure()
149 | plt.grid(grid)
150 | ax1 = fig.add_subplot(111)
151 | Label1 = label1
152 | if shift1 > 0:
153 | Label1 = label1 + '-lag'+ str(shift1)
154 | line1=ax1.plot(x1.index, x1.shift(shift1).values,color='k',label=Label1, linewidth=line_width,zorder=-1)
155 | #ax1.legend(loc=2)
156 |
157 | ax2 = ax1.twinx()
158 | Label2 = label2
159 | if shift2 > 0:
160 | Label2 = label2 + '-lag'+ str(shift2)
161 | line2=ax2.plot(x2.index, x2.shift(shift2).values,color='r',label=Label2, linewidth=line_width,zorder=1)
162 | #ax2.legend(loc=1)
163 | #plt.legend(loc=1)
164 | if title!=False:
165 | plt.title(title, fontweight='bold', fontsize=font_size)
166 |
167 | #ax1.legend(loc=2)
168 | fig.legend( bbox_to_anchor=legend_loc, bbox_transform=ax2.transAxes)
169 | plt.show()
170 |
171 | return
172 |
173 | def data_transform(data, method):
174 | # 0 - no transformation
175 | # 1 - first difference
176 | # 2 - 12th difference
177 | # 3 - logarithm
178 | # 4 - first difference of logarithm
179 | if len(data.columns) != len(method):
180 |         return print('Number of transformation codes does not match number of columns')
181 |
182 | data_transformed = pd.DataFrame(data=np.nan, index=data.index, columns=data.columns)
183 | columns = data.columns.tolist()
184 |
185 | for i in range(len(data.columns)):
186 | if method[i] == 0:
187 | data_transformed.iloc[:,i] = data.iloc[:,i]
188 | if method[i] == 1:
189 | data_transformed.iloc[:,i] = data.iloc[:,i].diff(1)
190 | columns[i] = columns[i] +'.D1'
191 | if method[i] == 2:
192 | data_transformed.iloc[:,i] = data.iloc[:,i].diff(12)
193 | columns[i] = columns[i] +'.D12'
194 | if method[i] == 3:
195 | data_transformed.iloc[:,i] = np.log(data.iloc[:,i])
196 | columns[i] = 'Log.' + columns[i]
197 | if method[i] == 4:
198 | data_transformed.iloc[:,i] = np.log(data.iloc[:,i]).diff(1)
199 | columns[i] = 'Log.' + columns[i] + '.D1'
200 |
201 | data_transformed.columns = columns
202 |
203 | return data_transformed
204 |
205 |
--------------------------------------------------------------------------------
/copper_price_forecast.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 |     "# 1. Methodology Description\n",
8 | "\n",
9 |     "In this script, I used a factor-augmented vector autoregression (FAVAR) model to forecast copper prices. FAVAR was first developed by Bernanke, Boivin and Eliasz (2004) to estimate the impact of monetary policy. Vanguard later applied this model in its Vanguard Capital Market Model to forecast asset prices. Please refer to https://www.nber.org/system/files/working_papers/w10220/w10220.pdf for a detailed explanation of the model.\n",
10 | "\n",
11 |     "The mechanism is simple: the math behind FAVAR and classic VAR is the same. But unlike VAR, which can only handle a few variables, FAVAR can incorporate a very large data set by reducing it to a few factors using principal component analysis.\n",
12 | "\n",
13 |     "Principal component analysis is an algorithm that finds a few common factors that explain the most variation in the underlying data. It is like taking a two-dimensional photo of a three-dimensional person: we can take the photo from many angles. Some angles clearly identify the person, e.g. the front angle, while others lose a lot of critical information, e.g. the back angle. Principal component analysis finds the best shooting angle when we take an x-dimensional photo of an N-dimensional data set (x<N).\n",
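    "\n",
    "To make this concrete, here is a minimal sketch of the two-step FAVAR idea, not the exact routine implemented in FactorAugmentedVAR.py: extract a handful of principal-component factors from a large indicator panel, then run an ordinary VAR on the target variable joined with those factors. `indicator_panel` and `copper` are placeholder DataFrames.\n",
    "\n",
    "```python\n",
    "import pandas as pd\n",
    "from sklearn.decomposition import PCA\n",
    "from statsmodels.tsa.api import VAR\n",
    "\n",
    "# indicator_panel: large (T x N) DataFrame of standardized monthly indicators\n",
    "# copper: (T x 1) DataFrame with the (transformed) copper price\n",
    "pca = PCA(n_components=3)                      # keep x = 3 factors out of N columns\n",
    "factors = pd.DataFrame(pca.fit_transform(indicator_panel),\n",
    "                       index=indicator_panel.index, columns=['F1', 'F2', 'F3'])\n",
    "\n",
    "favar_data = copper.join(factors).dropna()     # VAR runs on [Y_t, F_t]\n",
    "favar_fit = VAR(favar_data).fit(2)\n",
    "point_forecast = favar_fit.forecast(favar_data.values[-2:], steps=12)\n",
    "```\n",
    "\n",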
    "# 2. Data Description\n",
40 |     "\n",
41 |     "The four largest metal consumers are the US, China, Germany and Japan. Therefore, I included some of the most important economic indicators of these four countries in the data set.\n",
42 | "\n",
43 |     "I removed China's January and February readings because of the Spring Festival anomaly and used interpolation instead. This is common practice when studying China's economic indicators, because the Spring Festival can fall in either January or February.\n",
44 | "\n",
45 |     "Missing data are all replaced with interpolation. Alternatively, we could use a Kalman filter to estimate the missing data; I included a KalmanFilter.py in the files, but for simplicity I did not use it in this exercise.\n",
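    "\n",
    "A minimal sketch of that masking step, assuming the index is a DatetimeIndex and that the Chinese series share a 'CN' column prefix (both assumptions are illustrative):\n",
    "\n",
    "```python\n",
    "import numpy as np\n",
    "\n",
    "china_cols = [c for c in vintage_transformed.columns if c.startswith('CN')]  # assumed naming\n",
    "jan_feb = vintage_transformed.index.month.isin([1, 2])\n",
    "vintage_transformed.loc[jan_feb, china_cols] = np.nan   # drop Spring-Festival-distorted months\n",
    "# the NaNs are then filled by DataInterpolation() in the import cell below\n",
    "```\n",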
46 | "\n",
47 |     "All data were transformed to make the time series stationary.\n",
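    "\n",
    "For reference, Functions.py ships a data_transform helper whose per-column codes are 0 = level, 1 = first difference, 2 = 12th difference, 3 = log, 4 = first difference of log. A hypothetical call (raw_data is a placeholder monthly DataFrame):\n",
    "\n",
    "```python\n",
    "codes = [4, 1, 0]                       # one code per column, illustrative\n",
    "stationary = data_transform(raw_data.iloc[:, :3], codes)\n",
    "# columns are renamed accordingly, e.g. 'Log.<col1>.D1', '<col2>.D1', '<col3>'\n",
    "```"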
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 3,
53 | "metadata": {},
54 | "outputs": [],
55 | "source": [
56 | "\"Data Import\"\n",
57 | "file_name='economic_data.xlsx'\n",
58 | "sheet_name='transformed_vintage'\n",
59 | "Temp = pd.read_excel(file_name, sheet_name, encoding = 'gb18030')\n",
60 | "vintage_transformed = Temp.iloc[:,1:]\n",
61 | "vintage_transformed.index = Temp.iloc[:,0]\n",
62 | "\n",
63 | "\"Data Interpolation\"\n",
64 | "vintage_intrpl = DataInterpolation(vintage_transformed[1:], 0, len(vintage_transformed.index), 'slinear').dropna(axis=0,how='any')\n",
65 | "\n",
66 | "# vintage_intrpl"
67 | ]
68 | },
69 | {
70 | "cell_type": "markdown",
71 | "metadata": {},
72 | "source": [
73 | "