├── output
│   ├── ehat_py.xlsx
│   ├── pred_py.xlsx
│   └── fred_factors_py.xlsx
├── __pycache__
│   ├── mrsq.cpython-36.pyc
│   ├── factors_em.cpython-36.pyc
│   ├── prepare_missing.cpython-36.pyc
│   └── remove_outliers.cpython-36.pyc
├── download_data.py
├── README.md
├── remove_outliers.py
├── mrsq.py
├── prepare_missing.py
├── fredfactors.py
└── factors_em.py
/output/ehat_py.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoluna/FactorModels/HEAD/output/ehat_py.xlsx
--------------------------------------------------------------------------------
/output/pred_py.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoluna/FactorModels/HEAD/output/pred_py.xlsx
--------------------------------------------------------------------------------
/output/fred_factors_py.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoluna/FactorModels/HEAD/output/fred_factors_py.xlsx
--------------------------------------------------------------------------------
/__pycache__/mrsq.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoluna/FactorModels/HEAD/__pycache__/mrsq.cpython-36.pyc
--------------------------------------------------------------------------------
/__pycache__/factors_em.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoluna/FactorModels/HEAD/__pycache__/factors_em.cpython-36.pyc
--------------------------------------------------------------------------------
/__pycache__/prepare_missing.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoluna/FactorModels/HEAD/__pycache__/prepare_missing.cpython-36.pyc
--------------------------------------------------------------------------------
/__pycache__/remove_outliers.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoluna/FactorModels/HEAD/__pycache__/remove_outliers.cpython-36.pyc
--------------------------------------------------------------------------------
/download_data.py:
--------------------------------------------------------------------------------
1 | import requests
2 |
3 | # data links available from
4 | # https://research.stlouisfed.org/econ/mccracken/fred-databases/
5 |
6 | url = 'https://s3.amazonaws.com/files.fred.stlouisfed.org/fred-md/monthly/current.csv'
7 | r = requests.get(url, allow_redirects=True)
 8 | r.raise_for_status()  # Fail loudly if the download did not succeed
 9 | with open('data/current.csv', 'wb') as f:  # assumes the data/ directory exists
10 |     f.write(r.content)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | README
 2 |
 3 | This is a Python implementation of the McCracken & Ng (2017) Matlab code, which is
 4 | used to estimate factor models and make predictions based on the FRED-MD (monthly)
 5 | and FRED-QD (quarterly) macroeconomic databases.
6 |
7 | For details regarding the data, and the original Matlab codes, see
8 | http://research.stlouisfed.org/econ/mccracken/fred-databases/
9 |
10 | The code loads in the data, transforms each series to be stationary,
11 | removes outliers, estimates factors, and computes the R-squared and
12 | marginal R-squared values from the estimated factors and factor loadings.
13 |
14 | ===================================================
15 | List of files:
16 |
17 | 1. fredfactors.py - Performs all the tasks mentioned above using the auxiliary functions described below
18 |
19 | 2. prepare_missing.py - Transforms the raw data into stationary form
20 |
21 | 3. remove_outliers.py - Removes outliers from the data. A data point x is considered an outlier if |x-median|>10*interquartile_range.
22 |
23 | 4. factors_em.py - Estimates a set of factors for a given dataset using principal component analysis.
24 | The number of factors estimated is determined by an information criterion specified by the user.
25 | Missing values in the original dataset are handled using an iterative
26 | expectation-maximization algorithm.
27 |
28 | 5. mrsq.py - Computes the R-squared and marginal R-squared values from estimated factors and factor loadings.
29 | ===================================================
30 |
31 | * prepare_missing -> transforms data according to the rules given in the first row of the data spreadsheet
32 | * remove_outliers -> sets outliers to NaN -> data still contains missing observations
33 | * factors_em
34 | -> first set missing values to unconditional mean
35 | a) transform_data -> standardise based on DEMEAN method (pandas -> numpy)
36 | b) baing -> compute the number of factors (numpy <-> numpy)
37 | c) pc2 -> compute factors & make a prediction
38 |
39 | ===================================================
40 | Code ported to Python 3 by George Milunovich
41 | george.milunovich@mq.edu.au
42 | ===================================================
--------------------------------------------------------------------------------
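A minimal sketch of the pipeline the README describes, assuming the repository modules are importable and data/current.csv has been downloaded with download_data.py (parameter values mirror fredfactors.py):

    import pandas as pd
    import prepare_missing as pm
    import remove_outliers as ro
    import factors_em as fem

    dum = pd.read_csv('data/current.csv').dropna(how='all')
    tcode = dum.iloc[0, :]                    # transformation codes from the first row
    rawdata = dum.iloc[1:, :].copy()
    rawdata.set_index('sasdate', inplace=True)

    yt = pm.prepare_missing(rawdata, tcode).iloc[2:, :]  # stationary transforms; drop first two months
    data, n = ro.remove_outliers(yt)                     # set outliers to NaN
    pred, ehat, Fhat, lamhat, ve2, x2 = fem.factors_em(data, kmax=8, jj=2, DEMEAN=2)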
/remove_outliers.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | '''
4 | =========================================================================
5 | Author (ported to python): George Milunovich
6 | Date: 5 November 2019
7 |
8 |
9 | Based on Matlab code by: Michael W. McCracken and Serena Ng
10 | Date: 6/7/2017
11 | Version: MATLAB 2014a
12 | Required Toolboxes: None
13 | =========================================================================
14 | '''
15 |
16 |
17 | def remove_outliers(X):
18 | '''
19 | =========================================================================
20 | DESCRIPTION:
21 | This function takes a set of series aligned in the columns of a matrix
22 | and replaces outliers with the value nan.
23 |
24 | -------------------------------------------------------------------------
25 | INPUT:
26 | X = dataset (one series per column)
27 |
28 | OUTPUT:
29 | Y = dataset with outliers replaced with NaN
30 | n = number of outliers found in each series
31 |
32 | -------------------------------------------------------------------------
33 | NOTES:
34 | 1) Outlier definition: a data point x of a series X[:,i] is
35 | considered an outlier if abs(x-median)>10*interquartile_range.
36 |
37 | 2) This function ignores values of nan and thus is capable of
38 | replacing outliers for series that have missing values.
39 |
40 | =========================================================================
41 | '''
42 |
43 | median_X = X.median(axis=0) # Calculate median of each series
44 | median_X_mat = X*0 + median_X # Substitute all values of each series in X with their median
45 |
46 |     IQR = X.quantile(0.75) - X.quantile(0.25)  # Calculate interquartile range (IQR) of each series
47 |     IQR_mat = X*0 + IQR  # Substitute all values of each series in X with their IQR
48 |
49 |     Z = abs(X - median_X_mat)  # Compute distance from median
50 |     outliers = Z > (10*IQR_mat)  # Determine outliers given distance
51 |
52 |     Y = X[~outliers]  # Replace outliers with NaN (boolean mask keeps non-outliers only)
53 | n = outliers.sum() # Count the number of outliers
54 | return Y, n
55 |
56 |
57 |
58 | # if __name__ == "__main__":
59 | # data = pd.read_csv('../../data/2019-07-transformed.csv', index_col=0) # read in data
60 | # data_removed_outliers, count_outliers = remove_outliers(data)
61 | # data_removed_outliers.to_csv('../../data/2019-07-transformed-removed-outliers.csv')
62 | # count_outliers.to_csv('../../data/2019-07-count-outliers.csv')
63 |
--------------------------------------------------------------------------------
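A quick numeric check of the outlier rule |x - median| > 10*IQR, with made-up values:

    import pandas as pd
    from remove_outliers import remove_outliers

    df = pd.DataFrame({'x': [1.0, 2.0, 2.0, 3.0, 500.0]})
    # median = 2 and IQR = 1, so only |500 - 2| = 498 exceeds 10*IQR
    Y, n = remove_outliers(df)
    print(int(n['x']))    # 1 outlier found
    print(Y['x'].values)  # [1. 2. 2. 3. nan]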
/mrsq.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | '''
4 | =========================================================================
5 | Author (ported to python): George Milunovich
6 | Date: 5 November 2019
7 |
8 |
9 | Based on Matlab code by: Michael W. McCracken and Serena Ng
10 | Date: 6/7/2017
11 | Version: MATLAB 2014a
12 | Required Toolboxes: None
13 | =========================================================================
14 | '''
15 |
16 |
17 | def mrsq(Fhat,lamhat,ve2,series):
18 | ''' =========================================================================
19 | DESCRIPTION
20 | This function computes the R-squared and marginal R-squared from
21 | estimated factors and factor loadings.
22 |
23 | -------------------------------------------------------------------------
24 | INPUTS
25 | Fhat = estimated factors (one factor per column)
26 | lamhat = factor loadings (one factor per column)
27 | ve2 = eigenvalues of covariance matrix
28 | series = series names
29 |
30 | OUTPUTS
31 | R2 = R-squared for each series for each factor
32 | mR2 = marginal R-squared for each series for each factor
33 | mR2_F = marginal R-squared for each factor
34 | R2_T = total variation explained by all factors
35 | t10_s = top 10 series that load most heavily on each factor
36 | t10_mR2 = marginal R-squared corresponding to top 10 series
37 | that load most heavily on each factor
38 |
39 | '''
40 |
41 | N, ic = lamhat.shape # N = number of series, ic = number of factors
42 | Fhat = Fhat.values
43 |
44 |     print(f'mrsq: {N} series, {ic} factors')  # Report the dimensions used below
45 |
46 | # Preallocate memory for output
47 | R2 = np.full((N, ic), np.nan)
48 | mR2 = np.full((N, ic), np.nan)
49 | t10_mR2 = np.full((10, ic), np.nan)
50 | t10_s = []
51 |
52 |
53 | # Compute R-squared and marginal R-squared for each series for each factor
54 | for i in range(ic):
55 | R2[:, i] = (np.var(Fhat[:, :i+1]@lamhat[:, :i+1].T, axis=0))
56 | mR2[:, i] = (np.var(Fhat[:, i:i+1]@lamhat[:, i:i+1].T, axis=0))
57 |
58 | # Compute marginal R-squared for each factor
59 | mR2_F = ve2/np.sum(ve2)
60 | mR2_F = mR2_F[0:ic]
61 |
62 | # Compute total variation explained by all factors
63 | R2_T = np.sum(mR2_F)
64 |
65 | # Sort series by marginal R-squared for each factor
66 | ind = mR2.argsort(axis=0)[::-1]
67 | vals = mR2[ind, np.arange(ind.shape[1])]
68 |
69 | # Get top 10 series that load most heavily on each factor and the
70 | # corresponding marginal R-squared values
71 |
72 | for i in range(ic):
73 | t10_s.append(series[ind[0:10, i]])
74 | t10_mR2[:, i] = vals[0:10, i]
75 |
76 | t10_s = list(map(list, zip(*t10_s))) # transpose list
77 | return R2, mR2, mR2_F, R2_T, t10_s, t10_mR2
--------------------------------------------------------------------------------
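A self-contained smoke test of mrsq() on synthetic inputs (hypothetical data; note that Fhat must be a pandas DataFrame, and at least 10 series are needed because of the top-10 extraction):

    import numpy as np
    import pandas as pd
    from mrsq import mrsq

    rng = np.random.default_rng(0)
    T, N, k = 200, 12, 2
    Fhat = pd.DataFrame(rng.standard_normal((T, k)))   # synthetic factors
    lamhat = rng.standard_normal((N, k))               # synthetic loadings
    ve2 = np.sort(rng.uniform(size=N))[::-1]           # stand-in eigenvalues, descending
    series = np.array(['S%02d' % i for i in range(N)])

    R2, mR2, mR2_F, R2_T, t10_s, t10_mR2 = mrsq(Fhat, lamhat, ve2, series)
    print(R2.shape, mR2.shape)                         # (12, 2) (12, 2)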
/prepare_missing.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 |
4 | '''
5 | =========================================================================
6 | Author (ported to python): George Milunovich
7 | Date: 5 November 2019
8 |
9 |
10 | Based on Matlab code by: Michael W. McCracken and Serena Ng
11 | Date: 6/7/2017
12 | Version: MATLAB 2014a
13 | Required Toolboxes: None
14 | =========================================================================
15 | '''
16 |
17 |
18 | def transxf(x, tcode):
19 | '''
20 | =========================================================================
21 | DESCRIPTION:
22 | This function transforms a SINGLE SERIES (in a column vector) as specified
23 | by a given transformation code.
24 |
25 | -------------------------------------------------------------------------
26 | INPUT:
27 | x = series (in a column vector) to be transformed
28 | tcode = transformation code (1-7)
29 |
30 | OUTPUT:
31 | y = transformed series (as a column vector)
32 | -------------------------------------------------------------------------
33 | '''
34 | assert x.shape[1] == 1, 'x must contain one column'
35 |
36 | name = x.columns.values[0]
37 | x.rename(columns={name:'original'}, inplace=True)
38 |
39 | small = 1e-6 # Value close to zero
40 |
41 |     if tcode == 1:    # Level (i.e. no transformation): x(t)
42 |         x[name] = x['original']
43 |
44 |     elif tcode == 2:  # First difference: x(t)-x(t-1)
45 |         x[name] = x['original'].diff()
46 |
47 |     elif tcode == 3:  # Second difference: (x(t)-x(t-1))-(x(t-1)-x(t-2))
48 |         x[name] = x['original'].diff().diff()
49 |
50 |     elif tcode == 4:  # Natural log: ln(x)
51 |         # Take logs only if the series is strictly positive; otherwise return NaN
52 |         x[name] = np.log(x['original']) if x['original'].min() > small else np.nan
53 |
54 |     elif tcode == 5:  # First difference of natural log: ln(x)-ln(x-1)
55 |         # Take logs only if the series is strictly positive; otherwise return NaN
56 |         x[name] = np.log(x['original']).diff() if x['original'].min() > small else np.nan
57 |
58 |     elif tcode == 6:  # Second difference of natural log: (ln(x)-ln(x-1))-(ln(x-1)-ln(x-2))
59 |         # Take logs only if the series is strictly positive; otherwise return NaN
60 |         x[name] = np.log(x['original']).diff().diff() if x['original'].min() > small else np.nan
61 |
62 |     elif tcode == 7:  # First difference of percent change: (x(t)/x(t-1)-1)-(x(t-1)/x(t-2)-1)
63 |         x[name] = x['original'].pct_change().diff()
64 |
65 |     else:             # Unknown transformation code: return NaN
66 |         x[name] = np.nan
67 |
68 |     return x[name]
69 |
70 |
71 | def prepare_missing(rawdata, tcode):
72 | ''' =========================================================================
73 | DESCRIPTION:
74 | This function transforms raw data based on each series' transformation
75 | code.
76 |
77 | -------------------------------------------------------------------------
78 | INPUT:
79 | rawdata = raw data
80 | tcode = transformation codes for each series
81 |
82 | OUTPUT:
83 | yt = transformed data
84 |
85 | -------------------------------------------------------------------------
86 | SUBFUNCTION:
87 |     transxf: transforms a single series as specified by a
88 |              given transformation code
89 |
90 | ========================================================================='''
91 |
92 | transformed_data = pd.DataFrame()
93 | variables = rawdata.columns.values # get variable names
94 |
95 | for var in variables:
96 | x = rawdata[[var]].copy()
97 | transformed_data[var] = transxf(x, int(tcode[var]))
98 |
99 | return transformed_data
100 |
101 |
102 | # if __name__ == "__main__":
103 | # data = pd.read_csv('../../data/2019-07.csv') # read in data
104 | # tcode = data.iloc[0, :] # get transformation for each variable
105 | #
106 | # rawdata = data.iloc[1:, :] # set data
107 | # rawdata.set_index('sasdate', inplace=True, drop=True)
108 | # rawdata.index.name = 'date'
109 | #
110 | # transformed_data = prepare_missing(rawdata, tcode)
111 | # print(transformed_data)
112 | # transformed_data.to_csv('../../data/2019-07-transformed.csv')
113 |
114 |
115 |
116 |
117 |
--------------------------------------------------------------------------------
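For instance, transformation code 5 (first difference of logs) turns an index level into an approximate growth rate; a small illustration with made-up numbers:

    import pandas as pd
    from prepare_missing import prepare_missing

    raw = pd.DataFrame({'INDPRO': [100.0, 101.0, 102.0, 103.0]})
    tcodes = pd.Series({'INDPRO': 5})     # ln(x(t)) - ln(x(t-1))
    print(prepare_missing(raw, tcodes))   # first observation becomes NaN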
/fredfactors.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import prepare_missing as pm
3 | import remove_outliers as ro
4 | import factors_em as fem
5 | import mrsq
6 | # np.set_printoptions(precision=12, suppress=True)
7 |
8 |
9 | ''' =========================================================================
10 | DESCRIPTION
11 | This script loads in a FRED-MD dataset, processes the dataset, and then
12 | estimates factors.
13 |
14 | -------------------------------------------------------------------------
15 | BREAKDOWN OF THE SCRIPT
16 |
17 | Part 1: Load and label FRED-MD data.
18 |
19 | Part 2: Process data -- transform each series to be stationary and remove
20 | outliers.
21 |
22 | Part 3: Estimate factors and compute R-squared and marginal R-squared.
23 |
24 | -------------------------------------------------------------------------
25 | AUXILIARY FUNCTIONS
26 | List of auxiliary functions to be saved in same folder as this script.
27 |
28 | prepare_missing() - transforms series based on given transformation
29 | numbers
30 |
31 | remove_outliers() - removes outliers
32 |
33 | factors_em() - estimates factors
34 |
35 | mrsq() - computes R-squared and marginal R-squared from factor
36 | estimates and factor loadings
37 |
38 | =========================================================================
39 | Author (ported to python): George Milunovich
40 | Date: 5 November 2019
41 |
42 |
43 | Based on Matlab code by: Michael W. McCracken and Serena Ng
44 | Date: 6/7/2017
45 | Version: MATLAB 2014a
46 | Required Toolboxes: None
47 | =========================================================================
48 | '''
49 |
50 |
51 | # PARAMETERS TO BE CHANGED
52 |
53 | csv_in = 'data/current.csv' # File name of desired FRED-MD vintage
54 |
55 |
56 | # Type of transformation performed on each series before factors are estimated
57 | # 0 --> no transformation
58 | # 1 --> demean only
59 | # 2 --> demean and standardize
60 | # 3 --> recursively demean and then standardize
61 |
62 | DEMEAN = 2
63 |
64 | # Information criterion used to select number of factors; for more details,
65 | # see auxiliary function factors_em()
66 | # 1 --> information criterion PC_p1
67 | # 2 --> information criterion PC_p2
68 | # 3 --> information criterion PC_p3
69 |
70 | jj = 2
71 |
72 | # Maximum number of factors to be estimated; if set to 99, the number of
73 | # factors selected is forced to equal 8
74 | kmax = 8
75 |
76 | # =========================================================================
77 | # PART 1: LOAD AND LABEL DATA
78 |
79 |
80 | dum = pd.read_csv(csv_in).dropna(how='all') # Load data from CSV file
81 |
82 | series = dum.columns.values # Variable names
83 | tcode = dum.iloc[0, :] # Transformation numbers
84 | rawdata = dum.iloc[1:, :].copy()  # Raw data (copy avoids SettingWithCopy issues)
85 | rawdata.set_index('sasdate', inplace=True, drop=True)
86 | rawdata.index.name = 'date'
87 | T = len(rawdata) # T = number of months in sample
88 |
89 |
90 | # =========================================================================
91 | # PART 2: PROCESS DATA
92 |
93 | # Transform raw data to be stationary using auxiliary function prepare_missing()
94 | yt = pm.prepare_missing(rawdata, tcode)
95 |
96 |
97 | # Reduce sample to usable dates: remove first two months because some
98 | # series have been first differenced
99 | yt = yt.iloc[2:,:]
100 |
101 | # Remove outliers using auxiliary function remove_outliers(); see the function
102 | # docstring or the README for the definition of an outlier
103 | #   data = matrix of transformed series with outliers removed
104 | #   n = number of outliers removed from each series
105 | # data, n = ro.remove_outliers(yt)  # NOTE: outlier removal is currently disabled
106 | data = yt
107 |
108 |
109 |
110 | # =========================================================================
111 | # PART 3: ESTIMATE FACTORS AND COMPUTE R-SQUARED
112 | #
113 | # Estimate factors using function factors_em()
114 | # pred = values of data predicted by the factors
115 | # ehat = difference between data and values of data predicted by the factors
116 | # Fhat = set of factors
117 | # lamhat = factor loadings
118 | # ve2 = eigenvalues of data'*data
119 | # x2 = data with missing values replaced from the EM algorithm
120 |
121 | pred, ehat, Fhat, lamhat, ve2, x2 = fem.factors_em(data, kmax, jj, DEMEAN)
122 |
123 |
124 | Fhat = pd.DataFrame(Fhat, index = data.index)
125 | ehat = pd.DataFrame(ehat, index = data.index)
126 | pred = pd.DataFrame(pred, index = data.index)
127 |
128 | Fhat.to_excel('output/fred_factors_py.xlsx')
129 | ehat.to_excel('output/ehat_py.xlsx')
130 | pred.to_excel('output/pred_py.xlsx')
131 |
132 | # Compute R-squared and marginal R-squared from estimated factors and
133 | # factor loadings using function mrsq()
134 | # R2 = R-squared for each series for each factor
135 | # mR2 = marginal R-squared for each series for each factor
136 | # mR2_F = marginal R-squared for each factor
137 | # R2_T = total variation explained by all factors
138 | # t10_s = top 10 series that load most heavily on each factor
139 | # t10_mR2 = marginal R-squared corresponding to top 10 series
140 | # that load most heavily on each factor
141 | #
142 | #
143 | R2, mR2, mR2_F, R2_T, t10_s, t10_mR2 = mrsq.mrsq(Fhat, lamhat, ve2, data.columns.values)
144 |
145 | print('R2', pd.DataFrame(R2).to_string())
146 | print('mR2', pd.DataFrame(mR2).to_string())
147 | print('mR2_F', mR2_F)
148 | print('R2_T', R2_T)
149 | print('t10_s', pd.DataFrame(t10_s).to_string())
150 | print('t10_mR2', pd.DataFrame(t10_mR2).to_string())
151 |
152 |
--------------------------------------------------------------------------------
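Once the script has run, the saved factor estimates can be inspected with pandas (assuming the output files above were written; pd.read_excel needs an engine such as openpyxl installed):

    import pandas as pd

    factors = pd.read_excel('output/fred_factors_py.xlsx', index_col=0)
    print(factors.head())   # one estimated factor per column, indexed by date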
/factors_em.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 |
4 | '''
5 | =========================================================================
6 | Author (ported to python): George Milunovich
7 | Date: 5 November 2019
8 |
9 |
10 | Based on Matlab code by: Michael W. McCracken and Serena Ng
11 | Date: 6/7/2017
12 | Version: MATLAB 2014a
13 | Required Toolboxes: None
14 | =========================================================================
15 | '''
16 |
17 | def transform_data(X2,DEMEAN):
18 | # take in pandas <-> return numpy
19 | '''
20 | =========================================================================
21 | DESCRIPTION
22 | This function transforms a given set of series based upon the input
23 | variable DEMEAN. The following transformations are possible:
24 |
25 | 1) No transformation.
26 |
27 | 2) Each series is demeaned only (i.e. each series is rescaled to have a
28 | mean of 0).
29 |
30 | 3) Each series is demeaned and standardized (i.e. each series is
31 | rescaled to have a mean of 0 and a standard deviation of 1).
32 |
33 | 4) Each series is recursively demeaned and then standardized. For a
34 | given series x(t), where t=1,...,T, the recursively demeaned series
35 | x'(t) is calculated as x'(t) = x(t) - mean(x(1:t)). After the
36 | recursively demeaned series x'(t) is calculated, it is standardized by
37 | dividing x'(t) by the standard deviation of the original series x. Note
38 | that this transformation does not rescale the original series to have a
39 | specified mean or standard deviation.
40 |
41 | -------------------------------------------------------------------------
42 | INPUTS
43 | X2 = set of series to be transformed (one series per
44 | column); no missing values;
45 | DEMEAN = an integer indicating the type of transformation
46 | performed on each series in x2; it can take on the
47 | following values:
48 | 0 (no transformation)
49 | 1 (demean only)
50 | 2 (demean and standardize)
51 | 3 (recursively demean and then standardize)
52 |
53 | OUTPUTS
54 | X22 = transformed dataset
55 | mut = matrix containing the values subtracted from X2
56 | during the transformation
57 | sdt = matrix containing the values that X2 was divided by
58 | during the transformation
59 |
60 | -------------------------------------------------------------------------
61 | '''
62 | assert DEMEAN in [0, 1, 2, 3], 'DEMEAN value incorrectly set, must be in [0, 1, 2, 3]'
63 |     mut = X2 * 0  # Initialize values at no transformation, i.e. DEMEAN = 0
64 |     std = X2 * 0 + 1
65 |
66 | if DEMEAN == 1: # Each series is demeaned only
67 | mut = X2*0 + X2.mean()
68 |
69 | elif DEMEAN == 2: # Each series is demeaned and standardized
70 | mut = X2 * 0 + X2.mean()
71 | std = X2 * 0 + X2.std()
72 |
73 | elif DEMEAN == 3: # Each series is recursively demeaned and then standardized
74 | for t in range(0, len(X2)):
75 | mut.loc[X2.index[t], X2.columns] = X2.iloc[:t+1, :].mean()
76 | std = X2 * 0 + X2.std()
77 |
78 | X22 = (X2 - mut) / std
79 |
80 | return X22.values, mut.values, std.values
81 |
82 |
83 |
84 | def minindc(X):
85 | ''' =========================================================================
86 | takes np <-> returns np
87 | DESCRIPTION
88 | This function finds the index of the minimum value for each column of a
89 | given matrix. The function assumes that the minimum value of each column
90 | occurs only once within that column. The function returns an error if
91 | this is not the case.
92 |
93 | -------------------------------------------------------------------------
94 | INPUT
95 | x = matrix
96 |
97 | OUTPUT
98 | pos = column vector with pos(i) containing the row number
99 | corresponding to the minimum value of x(:,i)
100 |
101 | ========================================================================= '''
102 |
103 |     mins = X.argmin(axis=0)  # Index of the minimum value (used here on 1-D arrays)
104 |     assert (X == X[mins]).sum() == 1, 'Minimum value occurs more than once.'
105 |     return mins
106 |
107 |
108 |
109 | def pc2(X,nfac):
110 |     ''' =========================================================================
111 | DESCRIPTION
112 | This function runs principal component analysis.
113 |
114 | -------------------------------------------------------------------------
115 | INPUTS
116 | X = dataset (one series per column)
117 | nfac = number of factors to be selected
118 |
119 | OUTPUTS
120 |     chat   = values of X predicted by the factors
121 |     fhat   = factors scaled by (1/sqrt(N)) where N is the number of
122 |              series
123 |     lambda = factor loadings scaled by sqrt(N)
124 | ss = eigenvalues of X'*X
125 |
126 | ========================================================================= '''
127 |
128 | N = X.shape[1] # Number of series in X (i.e. number of columns)
129 | # The rows of vh are the eigenvectors of A'A and the columns of u are the eigenvectors of AA'.
130 | # In both cases the corresponding (possibly non-zero) eigenvalues are given by s**2.
131 | U, S, Vh = np.linalg.svd(X.T@X) # Singular value decomposition: X'*X = U*S*V where V=U'
132 |
133 | lambda_ = U[:, :nfac]*np.sqrt(N) # Factor loadings scaled by sqrt(N)
134 | fhat = np.dot(X, lambda_)*(1/N) # Factors scaled by 1/sqrt(N) (note that lambda is scaled by sqrt(N))
135 | chat = np.dot(fhat, lambda_.T) # Estimate initial dataset X using the factors (note that U'=inv(U))
136 |     ss = S  # Singular values of X'*X; since X'*X is symmetric PSD, these equal its eigenvalues
137 |
138 | return chat, fhat, lambda_, ss
139 |
140 |
141 |
142 |
143 | def baing(X,kmax,jj):
144 | #take in and return numpy arrays
145 | ''' =========================================================================
146 | DESCRIPTION
147 | This function determines the number of factors to be selected for a given
148 | dataset using one of three information criteria specified by the user.
149 | The user also specifies the maximum number of factors to be selected.
150 |
151 | -------------------------------------------------------------------------
152 | INPUTS
153 | X = dataset (one series per column)
154 | kmax = an integer indicating the maximum number of factors
155 | to be estimated
156 | jj = an integer indicating the information criterion used
157 | for selecting the number of factors; it can take on
158 | the following values:
159 | 1 (information criterion PC_p1)
160 | 2 (information criterion PC_p2)
161 | 3 (information criterion PC_p3)
162 |
163 | OUTPUTS
164 | ic1 = number of factors selected
165 | chat = values of X predicted by the factors
166 | Fhat = factors
167 |     eigval = eigenvalues of X'*X (or X*X' if N>T)
168 |
169 | -------------------------------------------------------------------------
170 | SUBFUNCTIONS USED
171 |
172 | minindc() - finds the index of the minimum value for each column of a given matrix
173 |
174 | -------------------------------------------------------------------------
175 | BREAKDOWN OF THE FUNCTION
176 |
177 | Part 1: Setup.
178 |
179 | Part 2: Calculate the overfitting penalty for each possible number of
180 | factors to be selected (from 1 to kmax).
181 |
182 | Part 3: Select the number of factors that minimizes the specified
183 | information criterion by utilizing the overfitting penalties calculated in Part 2.
184 |
185 | Part 4: Save other output variables to be returned by the function (chat,
186 | Fhat, and eigval).
187 |
188 | ========================================================================= '''
189 |     assert (1 <= kmax <= X.shape[1] and np.floor(kmax) == kmax) or kmax == 99, 'kmax is specified incorrectly'
190 | assert jj in [1, 2, 3], 'jj is specified incorrectly'
191 |
192 |
193 | # PART 1: SETUP
194 |
195 | T = X.shape[0] # Number of observations per series (i.e. number of rows)
196 | N = X.shape[1] # Number of series (i.e. number of columns)
197 | NT = N * T # Total number of observations
198 | NT1 = N + T # Number of rows + columns
199 |
200 | # =========================================================================
201 | # PART 2: OVERFITTING PENALTY
202 | # Determine penalty for overfitting based on the selected information
203 | # criterion.
204 |
205 | CT = np.zeros(kmax) # overfitting penalty
206 | ii = np.arange(1, kmax + 1) # Array containing possible number of factors that can be selected (1 to kmax)
207 | GCT = min(N,T) # The smaller of N and T
208 |
209 | # Calculate penalty based on criterion determined by jj.
210 | if jj == 1: # Criterion PC_p1
211 | CT[:] = np.log(NT / NT1) * ii * (NT1 / NT)
212 |
213 | elif jj == 2: # Criterion PC_p2
214 | CT[:] = np.log(min(N, T)) * ii * (NT1 / NT)
215 |
216 | elif jj == 3: # Criterion PC_p3
217 | CT[:] = np.log(GCT) / GCT * ii
218 |
219 | # =========================================================================
220 | # PART 3: SELECT NUMBER OF FACTORS
221 | # Perform principal component analysis on the dataset and select the number
222 | # of factors that minimizes the specified information criterion.
223 | #
224 | # -------------------------------------------------------------------------
225 | # RUN PRINCIPAL COMPONENT ANALYSIS
226 | # Get components, loadings, and eigenvalues
227 |
228 | if T < N:
229 | ev, eigval, V = np.linalg.svd(np.dot(X, X.T)) # Singular value decomposition
230 | Fhat0 = ev*np.sqrt(T) # Components
231 | Lambda0 = np.dot(X.T, Fhat0) / T # Loadings
232 | else:
233 | ev, eigval, V = np.linalg.svd(np.dot(X.T, X)) # Singular value decomposition
234 | Lambda0 = ev*np.sqrt(N) # Loadings
235 | Fhat0 = np.dot(X, Lambda0) / N # Components
236 | # -------------------------------------------------------------------------
237 |
238 | # SELECT NUMBER OF FACTORS
239 | # Preallocate memory
240 | Sigma = np.zeros(kmax + 1) # sum of squared residuals divided by NT, kmax factors + no factor
241 | IC1 = np.zeros(kmax + 1) # information criterion value, kmax factors + no factor
242 |
243 |     for i in range(0, kmax):  # Loop through all possibilities for the number of factors
244 |         Fhat = Fhat0[:, :i+1]  # Identify factors as first i+1 components
245 |         lambda_ = Lambda0[:, :i+1]  # Identify factor loadings as first i+1 loadings
246 |
247 |         chat = np.dot(Fhat, lambda_.T)  # Predict X using i+1 factors
248 |         ehat = X - chat  # Residuals from predicting X using the factors
249 |         Sigma[i] = ((ehat*ehat/T).sum(axis=0)).mean()  # Sum of squared residuals divided by NT
250 |
251 |         IC1[i] = np.log(Sigma[i]) + CT[i]  # Value of the information criterion when using i+1 factors
252 |
253 |
254 |     Sigma[kmax] = (X*X/T).sum(axis=0).mean()  # Sum of squared residuals when using no factors (fitted values set to 0)
255 |
256 |     IC1[kmax] = np.log(Sigma[kmax])  # Value of the information criterion when using no factors
257 |
258 |     ic1 = minindc(IC1)  # Index (0-based) that minimizes the information criterion
259 |     # Convert the index to a factor count: index i corresponds to i+1 factors, and
260 |     # the last entry (index kmax) corresponds to no factors, so return 0 in that case
261 |     ic1 = (ic1 + 1) * (ic1 < kmax)
262 |
263 | # =========================================================================
264 | # PART 4: SAVE OTHER OUTPUT
265 | #
266 | # Factors and loadings when number of factors set to kmax
267 |
268 | Fhat = Fhat0[:, :kmax] # factors
269 | Lambda = Lambda0[:, :kmax] #factor loadings
270 |
271 | chat = np.dot(Fhat, Lambda.T) # Predict X using kmax factors
272 |
273 |     return ic1, chat, Fhat, eigval  # ic1 is the selected number of factors (0 = none)
274 | # =========================================================================
275 |
276 | def factors_em(X, kmax, jj, DEMEAN):
277 | ''' =========================================================================
278 | DESCRIPTION
279 | This program estimates a set of factors for a given dataset using
280 | principal component analysis. The number of factors estimated is
281 | determined by an information criterion specified by the user. Missing
282 | values in the original dataset are handled using an iterative
283 | expectation-maximization (EM) algorithm.
284 |
285 | -------------------------------------------------------------------------
286 | INPUTS
287 | x = dataset (one series per column)
288 | kmax = an integer indicating the maximum number of factors
289 | to be estimated; if set to 99, the number of factors
290 | selected is forced to equal 8
291 | jj = an integer indicating the information criterion used
292 | for selecting the number of factors; it can take on
293 | the following values:
294 | 1 (information criterion PC_p1)
295 | 2 (information criterion PC_p2)
296 | 3 (information criterion PC_p3)
297 | DEMEAN = an integer indicating the type of transformation
298 | performed on each series in x before the factors are
299 | estimated; it can take on the following values:
300 | 0 (no transformation)
301 | 1 (demean only)
302 | 2 (demean and standardize)
303 | 3 (recursively demean and then standardize)
304 |
305 | OUTPUTS
306 | ehat = difference between x and values of x predicted by
307 | the factors
308 | Fhat = set of factors
309 | lamhat = factor loadings
310 | ve2 = eigenvalues of x3'*x3 (where x3 is the dataset x post
311 | transformation and with missing values filled in)
312 | x2 = x with missing values replaced from the EM algorithm
313 |
314 | -------------------------------------------------------------------------
315 | SUBFUNCTIONS
316 |
317 | baing() - selects number of factors
318 | pc2() - runs principal component analysis
319 | minindc() - finds the index of the minimum value for each column of a
320 | given matrix
321 | transform_data() - performs data transformation
322 |
323 | -------------------------------------------------------------------------'''
324 |
325 | # BREAKDOWN OF THE FUNCTION
326 | # Part 1: Check that inputs are specified correctly.
327 | # Part 2: Setup.
328 | # Part 3: Initialize the EM algorithm -- fill in missing values with
329 | # unconditional mean and estimate factors using the updated
330 | # dataset.
331 |
332 | # Part 4: Perform the EM algorithm -- update missing values using factors,
333 | # construct a new set of factors from the updated dataset, and
334 | # repeat until the factor estimates do not change.
335 | #
336 | # -------------------------------------------------------------------------
337 |
338 | # Details for the three possible information criteria can be found in the
339 | # paper "Determining the Number of Factors in Approximate Factor Models" by
340 | # Bai and Ng (2002).
341 |
342 | # The EM algorithm is essentially the one given in the paper "Macroeconomic
343 | # Forecasting Using Diffusion Indexes" by Stock and Watson (2002). The
344 | # algorithm is initialized by filling in missing values with the
345 | # unconditional mean of the series, demeaning and standardizing the updated
346 | # dataset, estimating factors from this demeaned and standardized dataset,
347 | # and then using these factors to predict the dataset. The algorithm then
348 | # proceeds as follows: update missing values using values predicted by the
349 | # latest set of factors, demean and standardize the updated dataset,
350 | # estimate a new set of factors using the demeaned and standardized updated
351 | # dataset, and repeat the process until the factor estimates do not change.
352 |
353 | # =========================================================================
354 | # PART 1: CHECKS
355 |
356 | # Check that x is not missing values for an entire row
357 | assert (X.isna().sum(axis=1) == X.shape[1]).sum() == 0, 'X contains entire rows of missing values'
358 |
359 | # Check that x is not missing values for an entire column
360 | assert (X.isna().sum(axis=0) == X.shape[0]).sum() == 0, 'X contains entire columns of missing values'
361 |
362 | # Check that kmax is an integer between 1 and the number of columns of x, or 99
363 |     assert (1 <= kmax <= X.shape[1] and np.floor(kmax) == kmax) or kmax == 99, 'kmax is specified incorrectly'
364 |
365 | # Check that jj is one of 1, 2, 3
366 | assert jj in [1, 2, 3], 'jj is specified incorrectly'
367 |
368 | # Check that DEMEAN is one of 0, 1, 2, 3
369 | assert DEMEAN in [0, 1, 2, 3], 'DEMEAN value incorrectly set, must be in [0, 1, 2, 3]'
370 |
371 | # =========================================================================
372 | # PART 2: SETUP
373 |
374 |
375 | maxit = 50 # Maximum number of iterations for the EM algorithm
376 | T = X.shape[0] # Number of observations per series in x (i.e. number of rows)
377 | N = X.shape[1] # Number of series in x (i.e. number of columns)
378 |
379 |
380 | err = 99999 # Set error to arbitrarily high number
381 | it = 0 # Set iteration counter to 0
382 | X1 = X.isna() # Locate missing values in x
383 |
384 | # =========================================================================
385 | # PART 3: INITIALIZE EM ALGORITHM
386 | # Fill in missing values for each series with the unconditional mean of
387 | # that series. Demean and standardize the updated dataset. Estimate factors
388 | # using the demeaned and standardized dataset, and use these factors to
389 | # predict the original dataset.
390 | # Get unconditional mean of the non-missing values of each series
391 |
392 |
393 |
394 | mut = (X*0).fillna(0) + X.mean(axis = 0) # Get unconditional mean of the non-missing values of each series
395 | # mut has no missing values (na)
396 | X2 = X.fillna(mut) # Replace missing values with unconditional mean
397 |
398 | # Demean and standardize data using subfunction transform_data()
399 | # x3 = transformed dataset
400 | # mut = matrix containing the values subtracted from x2 during the
401 | # transformation, TYPE OF DEMEANING USED - CAN BE SIMPLE COLUMN MEAN
402 | # sdt = matrix containing the values that x2 was divided by during the
403 | # transformation
404 |
405 |     X3, mut, sdt = transform_data(X2, DEMEAN)  # numpy arrays; sdt/mut are reused when undoing the transformation
406 | # If input 'kmax' is not set to 99, use subfunction baing() to determine
407 | # the number of factors to estimate. Otherwise, set number of factors equal to 8
408 | if kmax != 99:
409 | icstar, _, _, _ = baing(X3, kmax, jj)
410 | else:
411 | icstar = 8
412 |
413 |
414 | # Run principal components on updated dataset using subfunction pc2()
415 | # chat = values of x3 predicted by the factors
416 | # Fhat = factors scaled by (1/sqrt(N)) where N is the number of series
417 |     # lamhat = factor loadings scaled by sqrt(N)
418 | # ve2 = eigenvalues of x3'*x3
419 |
420 | chat, Fhat, lamhat, ve2 = pc2(X3, icstar)
421 | chat0 = chat # Save predicted series values
422 |
423 | # =========================================================================
424 | # PART 4: PERFORM EM ALGORITHM
425 | # Update missing values using values predicted by the latest set of
426 | # factors. Demean and standardize the updated dataset. Estimate a new set
427 | # of factors using the updated dataset. Repeat the process until the factor
428 | # estimates do not change.
429 | #
430 | # Run while error is large and have yet to exceed maximum number of
431 | # iterations
432 |
433 | while (err > 0.000001) and (it < maxit):
434 |
435 | # ---------------------------------------------------------------------
436 | it += 1 # Increase iteration counter by 1
437 | print(f'Iteration {it}: obj {err} IC {icstar} \n') # Display iteration counter, error, and number of factors
438 |
439 | # ---------------------------------------------------------------------
440 | # UPDATE MISSING VALUES
441 | # Replace missing observations with latest values predicted by the
442 | # factors (after undoing any transformation)
443 |
444 |         temp = X.fillna(0)*0 + chat*sdt + mut  # temp must be NaN-free, otherwise fillna(temp) would keep the NaNs
445 | X2 = X.fillna(temp)
446 |
447 | # ---------------------------------------------------------------------
448 | # ESTIMATE FACTORS
449 | # Demean/standardize new dataset and recalculate mut and sdt using
450 | # subfunction transform_data()
451 | # x3 = transformed dataset
452 | # mut = matrix containing the values subtracted from x2 during the
453 | # transformation
454 | # sdt = matrix containing the values that x2 was divided by during
455 | # the transformation
456 |
457 | X3, mut, sdt = transform_data(X2, DEMEAN)
458 |
459 | # Determine number of factors to estimate for the new dataset using
460 | # subfunction baing() (or set to 8 if kmax equals 99)
461 |
462 | if kmax != 99:
463 | icstar, _, _, _ = baing(X3, kmax, jj)
464 | else:
465 | icstar = 8
466 |
467 | # Run principal components on the new dataset using subfunction pc2()
468 | # chat = values of x3 predicted by the factors
469 | # Fhat = factors scaled by (1/sqrt(N)) where N is the number of
470 | # series
471 |         # lamhat = factor loadings scaled by sqrt(N)
472 | # ve2 = eigenvalues of x3'*x3
473 |
474 | chat, Fhat, lamhat, ve2 = pc2(X3, icstar)
475 | # ---------------------------------------------------------------------
476 | # CALCULATE NEW ERROR VALUE
477 | # Calculate difference between the predicted values of the new dataset
478 | # and the predicted values of the previous dataset
479 | diff = chat - chat0
480 | # The error value is equal to the sum of the squared differences
481 | # between chat and chat0 divided by the sum of the squared values of chat0
482 |
483 | v1 = diff.flatten(order = 'F') # vectorise columns
484 | v2 = chat0.flatten(order = 'F')
485 |
486 | err = (np.dot(v1.T, v1) / np.dot(v2.T, v2))
487 | chat0 = chat # Set chat0 equal to the current chat
488 |
489 | if it == maxit: # Produce warning if maximum number of iterations is reached
490 | print('Maximum number of iterations reached in EM algorithm')
491 |
492 | # -------------------------------------------------------------------------
493 | # FINAL DIFFERENCE
494 | # Calculate the difference between the initial dataset and the values
495 | # predicted by the final set of factors
496 | pred = chat*sdt + mut
497 | ehat = X - pred
498 | # (not "X - chat*sdt + mut": without parentheses that would add mut rather than subtract the full prediction)
499 |
500 | return pred, ehat, Fhat, lamhat, ve2, X2
501 |
502 |
503 | # if __name__ == "__main__":
504 | # X = pd.read_csv('../../data/2019-07-transformed-removed-outliers.csv', index_col=0) # read in data
505 | # kmax = 7
506 | # jj = 2
507 | # DEMEAN = 2
508 | # pred, ehat, Fhat, lamhat, ve2, X2 = factors_em(X, kmax, jj, DEMEAN)
509 |
--------------------------------------------------------------------------------
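As a rough check of baing() on data with a known factor structure (a synthetic example, not part of the original code):

    import numpy as np
    from factors_em import baing

    rng = np.random.default_rng(0)
    T, N, k_true = 200, 50, 2
    F = rng.standard_normal((T, k_true))
    L = rng.standard_normal((N, k_true))
    X = F @ L.T + 0.1 * rng.standard_normal((T, N))  # strong two-factor structure
    X = (X - X.mean(axis=0)) / X.std(axis=0)         # standardize, as factors_em() would
    nfac, chat, Fhat, eigval = baing(X, 8, 2)        # PC_p2 criterion
    print(nfac)                                      # expected: 2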