├── output
│   ├── ehat_py.xlsx
│   ├── pred_py.xlsx
│   └── fred_factors_py.xlsx
├── __pycache__
│   ├── mrsq.cpython-36.pyc
│   ├── factors_em.cpython-36.pyc
│   ├── prepare_missing.cpython-36.pyc
│   └── remove_outliers.cpython-36.pyc
├── download_data.py
├── README.md
├── remove_outliers.py
├── mrsq.py
├── prepare_missing.py
├── fredfactors.py
└── factors_em.py

/output/ehat_py.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoluna/FactorModels/HEAD/output/ehat_py.xlsx
--------------------------------------------------------------------------------
/output/pred_py.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoluna/FactorModels/HEAD/output/pred_py.xlsx
--------------------------------------------------------------------------------
/output/fred_factors_py.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoluna/FactorModels/HEAD/output/fred_factors_py.xlsx
--------------------------------------------------------------------------------
/__pycache__/mrsq.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoluna/FactorModels/HEAD/__pycache__/mrsq.cpython-36.pyc
--------------------------------------------------------------------------------
/__pycache__/factors_em.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoluna/FactorModels/HEAD/__pycache__/factors_em.cpython-36.pyc
--------------------------------------------------------------------------------
/__pycache__/prepare_missing.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoluna/FactorModels/HEAD/__pycache__/prepare_missing.cpython-36.pyc
--------------------------------------------------------------------------------
/__pycache__/remove_outliers.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoluna/FactorModels/HEAD/__pycache__/remove_outliers.cpython-36.pyc
--------------------------------------------------------------------------------
/download_data.py:
--------------------------------------------------------------------------------
import requests

# data links available from
# https://research.stlouisfed.org/econ/mccracken/fred-databases/

url = 'https://s3.amazonaws.com/files.fred.stlouisfed.org/fred-md/monthly/current.csv'
r = requests.get(url, allow_redirects=True)
with open('data/current.csv', 'wb') as f:    # context manager ensures the file is closed
    f.write(r.content)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
README

This is a Python implementation of the McCracken & Ng (2017) Matlab code, which is used to
estimate factor models and make predictions on the basis of the FRED-MD (monthly)
and FRED-QD (quarterly) macroeconomic databases.

For details regarding the data, and the original Matlab codes, see
http://research.stlouisfed.org/econ/mccracken/fred-databases/

The code loads in the data, transforms each series to be stationary,
removes outliers, estimates factors, and computes the R-squared and
marginal R-squared values from the estimated factors and factor loadings.

===================================================
List of files:

1. fredfactors.py - Performs all the tasks mentioned above using the auxiliary functions described below

2. prepare_missing.py - Transforms the raw data into stationary form

3. remove_outliers.py - Removes outliers from the data. A data point x is considered an outlier
   if |x - median| > 10*interquartile_range (a small numeric illustration follows this list).

4. factors_em.py - Estimates a set of factors for a given dataset using principal component analysis.
   The number of factors estimated is determined by an information criterion specified by the user
   (the three criteria are written out at the end of this README).
   Missing values in the original dataset are handled using an iterative
   expectation-maximization algorithm.

5. mrsq.py - Computes the R-squared and marginal R-squared values from estimated factors and factor loadings.
===================================================
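Added illustration (not in the original README) of the outlier rule in item 3, using a hypothetical two-series DataFrame:

```python
import pandas as pd

# Hypothetical example of the |x - median| > 10*IQR rule used by remove_outliers.py.
X = pd.DataFrame({'a': [1.0, 2.0, 3.0, 4.0, 500.0],    # 500 lies far beyond 10*IQR
                  'b': [1.0, 1.1, 0.9, 1.0, 1.05]})

median = X.median()                            # per-series median
iqr = X.quantile(0.75) - X.quantile(0.25)      # per-series interquartile range
outliers = (X - median).abs() > 10 * iqr       # boolean mask of outliers

print(outliers.sum())    # series 'a' flags one outlier (500), series 'b' flags none
```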

* prepare_missing -> transforms the data according to the rules given in the first row of the data spreadsheet
* remove_outliers -> sets outliers to NaN -> the dataset still contains missing observations
* factors_em
    -> first sets missing values to the unconditional mean, then iterates:
    a) transform_data -> standardise based on the DEMEAN method (pandas -> numpy)
    b) baing -> compute the number of factors (numpy <-> numpy)
    c) pc2 -> compute factors & make a prediction
  (a condensed sketch of this pipeline follows below)

===================================================
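Added note (not in the original README): fredfactors.py runs this pipeline end to end; condensed to its essential calls it looks like the sketch below (parameter values are the script's defaults, file paths as in this repository):

```python
import pandas as pd
import prepare_missing as pm
import remove_outliers as ro
import factors_em as fem
import mrsq

dum = pd.read_csv('data/current.csv').dropna(how='all')
tcode = dum.iloc[0, :]                          # transformation codes (first row of the spreadsheet)
rawdata = dum.iloc[1:, :].set_index('sasdate')  # raw data, dated

yt = pm.prepare_missing(rawdata, tcode)         # make each series stationary
yt = yt.iloc[2:, :]                             # drop first two months (lost to differencing)
data, n = ro.remove_outliers(yt)                # set outliers to NaN

# kmax=8 factors at most, PC_p2 criterion (jj=2), demean & standardize (DEMEAN=2)
pred, ehat, Fhat, lamhat, ve2, x2 = fem.factors_em(data, 8, 2, 2)
R2, mR2, mR2_F, R2_T, t10_s, t10_mR2 = mrsq.mrsq(
    pd.DataFrame(Fhat, index=data.index), lamhat, ve2, data.columns.values)
```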
Code ported to Python 3 by George Milunovich
george.milunovich@mq.edu.au
===================================================
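
Addendum (not in the original README): the three information criteria selected by the jj parameter are implemented in baing() in factors_em.py, following Bai & Ng (2002). Transcribed from the code, with sigma-hat^2(k) the average squared residual SSR/(NT) from a k-factor fit:

```latex
% \hat\sigma^2(k) = SSR/(NT) from a k-factor fit; N series, T observations
PC_{p1}(k) = \ln\hat\sigma^2(k) + k\,\frac{N+T}{NT}\,\ln\!\Big(\frac{NT}{N+T}\Big)
PC_{p2}(k) = \ln\hat\sigma^2(k) + k\,\frac{N+T}{NT}\,\ln\big(\min(N,T)\big)
PC_{p3}(k) = \ln\hat\sigma^2(k) + k\,\frac{\ln\min(N,T)}{\min(N,T)}
```

The selected number of factors minimizes the chosen criterion over k = 0, 1, ..., kmax.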
--------------------------------------------------------------------------------
/remove_outliers.py:
--------------------------------------------------------------------------------
import pandas as pd

'''
=========================================================================
Author (ported to python): George Milunovich
Date: 5 November 2019


Based on Matlab code by: Michael W. McCracken and Serena Ng
Date: 6/7/2017
Version: MATLAB 2014a
Required Toolboxes: None
=========================================================================
'''


def remove_outliers(X):
    '''
    =========================================================================
    DESCRIPTION:
    This function takes a set of series aligned in the columns of a matrix
    and replaces outliers with the value NaN.

    -------------------------------------------------------------------------
    INPUT:
            X   = dataset (one series per column)

    OUTPUT:
            Y   = dataset with outliers replaced with NaN
            n   = number of outliers found in each series

    -------------------------------------------------------------------------
    NOTES:
            1) Outlier definition: a data point x of a series X[:,i] is
               considered an outlier if abs(x-median) > 10*interquartile_range.

            2) This function ignores values of NaN and thus is capable of
               replacing outliers for series that have missing values.

    =========================================================================
    '''

    median_X = X.median(axis=0)                  # Calculate median of each series
    median_X_mat = X*0 + median_X                # Substitute all values of each series in X with their median

    IQR = X.quantile(0.75) - X.quantile(0.25)    # Calculate interquartile range (IQR) of each series
    IQR_mat = X*0 + IQR                          # Substitute all values of each series in X with their IQR

    Z = abs(X - median_X_mat)                    # Compute distance from median
    outliers = Z > (10*IQR_mat)                  # Determine outliers given distance

    Y = X[~outliers]                             # Replace outliers with NaN
    n = outliers.sum()                           # Count the number of outliers
    return Y, n



# if __name__ == "__main__":
#     data = pd.read_csv('../../data/2019-07-transformed.csv', index_col=0)    # read in data
#     data_removed_outliers, count_outliers = remove_outliers(data)
#     data_removed_outliers.to_csv('../../data/2019-07-transformed-removed-outliers.csv')
#     count_outliers.to_csv('../../data/2019-07-count-outliers.csv')
--------------------------------------------------------------------------------
/mrsq.py:
--------------------------------------------------------------------------------
import numpy as np

'''
=========================================================================
Author (ported to python): George Milunovich
Date: 5 November 2019


Based on Matlab code by: Michael W. McCracken and Serena Ng
Date: 6/7/2017
Version: MATLAB 2014a
Required Toolboxes: None
=========================================================================
'''


def mrsq(Fhat, lamhat, ve2, series):
    '''=========================================================================
    DESCRIPTION
    This function computes the R-squared and marginal R-squared from
    estimated factors and factor loadings.

    -------------------------------------------------------------------------
    INPUTS
            Fhat    = estimated factors (one factor per column)
            lamhat  = factor loadings (one column of loadings per factor)
            ve2     = eigenvalues of covariance matrix
            series  = series names

    OUTPUTS
            R2      = R-squared for each series for each factor
            mR2     = marginal R-squared for each series for each factor
            mR2_F   = marginal R-squared for each factor
            R2_T    = total variation explained by all factors
            t10_s   = top 10 series that load most heavily on each factor
            t10_mR2 = marginal R-squared corresponding to top 10 series
                      that load most heavily on each factor

    '''

    N, ic = lamhat.shape    # N = number of series, ic = number of factors
    Fhat = Fhat.values

    # print(N, ic)    # debug output

    # Preallocate memory for output
    R2 = np.full((N, ic), np.nan)
    mR2 = np.full((N, ic), np.nan)
    t10_mR2 = np.full((10, ic), np.nan)
    t10_s = []


    # Compute R-squared and marginal R-squared for each series for each factor
    for i in range(ic):
        R2[:, i] = (np.var(Fhat[:, :i+1]@lamhat[:, :i+1].T, axis=0))
        mR2[:, i] = (np.var(Fhat[:, i:i+1]@lamhat[:, i:i+1].T, axis=0))

    # Compute marginal R-squared for each factor
    mR2_F = ve2/np.sum(ve2)
    mR2_F = mR2_F[0:ic]

    # Compute total variation explained by all factors
    R2_T = np.sum(mR2_F)

    # Sort series by marginal R-squared for each factor (in descending order)
    ind = mR2.argsort(axis=0)[::-1]
    vals = mR2[ind, np.arange(ind.shape[1])]

    # Get top 10 series that load most heavily on each factor and the
    # corresponding marginal R-squared values

    for i in range(ic):
        t10_s.append(series[ind[0:10, i]])
        t10_mR2[:, i] = vals[0:10, i]

    t10_s = list(map(list, zip(*t10_s)))    # transpose list
    return R2, mR2, mR2_F, R2_T, t10_s, t10_mR2
--------------------------------------------------------------------------------
/prepare_missing.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np

'''
=========================================================================
Author (ported to python): George Milunovich
Date: 5 November 2019


Based on Matlab code by: Michael W. McCracken and Serena Ng
Date: 6/7/2017
Version: MATLAB 2014a
Required Toolboxes: None
=========================================================================
'''


def transxf(x, tcode):
    '''
    =========================================================================
    DESCRIPTION:
    This function transforms a SINGLE SERIES (in a column vector) as specified
    by a given transformation code.

    -------------------------------------------------------------------------
    INPUT:
            x       = series (in a column vector) to be transformed
            tcode   = transformation code (1-7)

    OUTPUT:
            y       = transformed series (as a column vector)
    -------------------------------------------------------------------------
    '''
    assert x.shape[1] == 1, 'x must contain one column'

    name = x.columns.values[0]
    x.rename(columns={name: 'original'}, inplace=True)

    small = 1e-6                                # Value close to zero

    if tcode == 1:                              # Level (i.e. no transformation): x(t)
        x[name] = x['original']

    elif tcode == 2:                            # First difference: x(t)-x(t-1)
        x[name] = x['original'].diff()

    elif tcode == 3:                            # Second difference: (x(t)-x(t-1))-(x(t-1)-x(t-2))
        x[name] = x['original'].diff().diff()

    elif tcode == 4:                            # Natural log: ln(x)
        if x['original'].min() > small:
            x[name] = np.log(x['original'])
        else:                                   # series not strictly positive: return NaN
            x[name] = np.nan

    elif tcode == 5:                            # First difference of natural log: ln(x)-ln(x-1)
        if x['original'].min() > small:
            x[name] = np.log(x['original']).diff()
        else:
            x[name] = np.nan

    elif tcode == 6:                            # Second difference of natural log: (ln(x)-ln(x-1))-(ln(x-1)-ln(x-2))
        if x['original'].min() > small:
            x[name] = np.log(x['original']).diff().diff()
        else:
            x[name] = np.nan

    elif tcode == 7:                            # First difference of percent change: (x(t)/x(t-1)-1)-(x(t-1)/x(t-2)-1)
        x[name] = x['original'].pct_change().diff()

    else:
        x[name] = np.nan

    return x[name]


def prepare_missing(rawdata, tcode):
    '''=========================================================================
    DESCRIPTION:
    This function transforms raw data based on each series' transformation
    code.

    -------------------------------------------------------------------------
    INPUT:
            rawdata = raw data
            tcode   = transformation codes for each series

    OUTPUT:
            yt      = transformed data

    -------------------------------------------------------------------------
    SUBFUNCTION:
            transxf: transforms a single series as specified by a
                     given transformation code

    ========================================================================='''

    transformed_data = pd.DataFrame()
    variables = rawdata.columns.values          # get variable names

    for var in variables:
        x = rawdata[[var]].copy()
        transformed_data[var] = transxf(x, int(tcode[var]))

    return transformed_data


# if __name__ == "__main__":
#     data = pd.read_csv('../../data/2019-07.csv')    # read in data
#     tcode = data.iloc[0, :]                         # get transformation for each variable
#
#     rawdata = data.iloc[1:, :]                      # set data
#     rawdata.set_index('sasdate', inplace=True, drop=True)
#     rawdata.index.name = 'date'
#
#     transformed_data = prepare_missing(rawdata, tcode)
#     print(transformed_data)
#     transformed_data.to_csv('../../data/2019-07-transformed.csv')
--------------------------------------------------------------------------------
/fredfactors.py:
--------------------------------------------------------------------------------
import pandas as pd
import prepare_missing as pm
import remove_outliers as ro
import factors_em as fem
import mrsq
# np.set_printoptions(precision=12, suppress=True)


'''=========================================================================
DESCRIPTION
This script loads in a FRED-MD dataset, processes the dataset, and then
estimates factors.

-------------------------------------------------------------------------
BREAKDOWN OF THE SCRIPT

Part 1: Load and label FRED-MD data.

Part 2: Process data -- transform each series to be stationary and remove
        outliers.

Part 3: Estimate factors and compute R-squared and marginal R-squared.

-------------------------------------------------------------------------
AUXILIARY FUNCTIONS
List of auxiliary functions to be saved in the same folder as this script.

    prepare_missing() - transforms series based on given transformation
                        numbers

    remove_outliers() - removes outliers

    factors_em() - estimates factors

    mrsq() - computes R-squared and marginal R-squared from factor
             estimates and factor loadings

=========================================================================
Author (ported to python): George Milunovich
Date: 5 November 2019


Based on Matlab code by: Michael W. McCracken and Serena Ng
Date: 6/7/2017
Version: MATLAB 2014a
Required Toolboxes: None
=========================================================================
'''


# PARAMETERS TO BE CHANGED

csv_in = 'data/current.csv'    # File name of desired FRED-MD vintage


# Type of transformation performed on each series before factors are estimated
#   0 --> no transformation
#   1 --> demean only
#   2 --> demean and standardize
#   3 --> recursively demean and then standardize

DEMEAN = 2

# Information criterion used to select number of factors; for more details,
# see auxiliary function factors_em()
#   1 --> information criterion PC_p1
#   2 --> information criterion PC_p2
#   3 --> information criterion PC_p3

jj = 2

# Maximum number of factors to be estimated; if set to 99, the number of
# factors selected is forced to equal 8
kmax = 8

# =========================================================================
# PART 1: LOAD AND LABEL DATA


dum = pd.read_csv(csv_in).dropna(how='all')    # Load data from CSV file

series = dum.columns.values    # Variable names
tcode = dum.iloc[0, :]         # Transformation numbers
rawdata = dum.iloc[1:, :]      # Raw data
rawdata.set_index('sasdate', inplace=True, drop=True)
rawdata.index.name = 'date'
T = len(rawdata)               # T = number of months in sample


# =========================================================================
# PART 2: PROCESS DATA

# Transform raw data to be stationary using auxiliary function prepare_missing()
yt = pm.prepare_missing(rawdata, tcode)


# Reduce sample to usable dates: remove first two months because some
# series have been first differenced
yt = yt.iloc[2:, :]

# Remove outliers using auxiliary function remove_outliers(); see function
# or README.md for definition of outliers
#   data = matrix of transformed series with outliers removed
#   n = number of outliers removed from each series
# NOTE: outlier removal is currently disabled; uncomment the next line to enable it
# data, n = ro.remove_outliers(yt)
data = yt



# =========================================================================
# PART 3: ESTIMATE FACTORS AND COMPUTE R-SQUARED
#
# Estimate factors using function factors_em()
#   ehat   = difference between data and values of data predicted by the
#            factors
#   Fhat   = set of factors
#   lamhat = factor loadings
#   ve2    = eigenvalues of data'*data
#   x2     = data with missing values replaced from the EM algorithm

pred, ehat, Fhat, lamhat, ve2, x2 = fem.factors_em(data, kmax, jj, DEMEAN)


Fhat = pd.DataFrame(Fhat, index=data.index)
ehat = pd.DataFrame(ehat, index=data.index)
pred = pd.DataFrame(pred, index=data.index)

Fhat.to_excel('output/fred_factors_py.xlsx')
ehat.to_excel('output/ehat_py.xlsx')
pred.to_excel('output/pred_py.xlsx')

# Compute R-squared and marginal R-squared from estimated factors and
# factor loadings using function mrsq()
#   R2      = R-squared for each series for each factor
#   mR2     = marginal R-squared for each series for each factor
#   mR2_F   = marginal R-squared for each factor
#   R2_T    = total variation explained by all factors
#   t10_s   = top 10 series that load most heavily on each factor
#   t10_mR2 = marginal R-squared corresponding to top 10 series
#             that load most heavily on each factor
#
#
R2, mR2, mR2_F, R2_T, t10_s, t10_mR2 = mrsq.mrsq(Fhat, lamhat, ve2, data.columns.values)

print('R2', pd.DataFrame(R2).to_string())
print('mR2', pd.DataFrame(mR2).to_string())
print('mR2_F', mR2_F)
print('R2_T', R2_T)
print('t10_s', pd.DataFrame(t10_s).to_string())
print('t10_mR2', pd.DataFrame(t10_mR2).to_string())

--------------------------------------------------------------------------------
/factors_em.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np

'''
=========================================================================
Author (ported to python): George Milunovich
Date: 5 November 2019


Based on Matlab code by: Michael W. McCracken and Serena Ng
Date: 6/7/2017
Version: MATLAB 2014a
Required Toolboxes: None
=========================================================================
'''

def transform_data(X2, DEMEAN):
    # take in pandas <-> return numpy
    '''
    =========================================================================
    DESCRIPTION
    This function transforms a given set of series based upon the input
    variable DEMEAN. The following transformations are possible:

    1) No transformation.

    2) Each series is demeaned only (i.e. each series is rescaled to have a
    mean of 0).

    3) Each series is demeaned and standardized (i.e. each series is
    rescaled to have a mean of 0 and a standard deviation of 1).

    4) Each series is recursively demeaned and then standardized. For a
    given series x(t), where t=1,...,T, the recursively demeaned series
    x'(t) is calculated as x'(t) = x(t) - mean(x(1:t)). After the
    recursively demeaned series x'(t) is calculated, it is standardized by
    dividing x'(t) by the standard deviation of the original series x. Note
    that this transformation does not rescale the original series to have a
    specified mean or standard deviation.

    -------------------------------------------------------------------------
    INPUTS
            X2      = set of series to be transformed (one series per
                      column); no missing values;
            DEMEAN  = an integer indicating the type of transformation
                      performed on each series in x2; it can take on the
                      following values:
                            0 (no transformation)
                            1 (demean only)
                            2 (demean and standardize)
                            3 (recursively demean and then standardize)

    OUTPUTS
            X22     = transformed dataset
            mut     = matrix containing the values subtracted from X2
                      during the transformation
            sdt     = matrix containing the values that X2 was divided by
                      during the transformation

    -------------------------------------------------------------------------
    '''
    assert DEMEAN in [0, 1, 2, 3], 'DEMEAN value incorrectly set, must be in [0, 1, 2, 3]'
    mut = X2 * 0          # initialize values at no transformation, i.e. DEMEAN = 0
    sdt = X2 * 0 + 1

    if DEMEAN == 1:       # Each series is demeaned only
        mut = X2*0 + X2.mean()

    elif DEMEAN == 2:     # Each series is demeaned and standardized
        mut = X2 * 0 + X2.mean()
        sdt = X2 * 0 + X2.std()

    elif DEMEAN == 3:     # Each series is recursively demeaned and then standardized
        for t in range(0, len(X2)):
            mut.loc[X2.index[t], X2.columns] = X2.iloc[:t+1, :].mean()
        sdt = X2 * 0 + X2.std()

    X22 = (X2 - mut) / sdt

    return X22.values, mut.values, sdt.values



def minindc(X):
    '''=========================================================================
    takes np <-> returns np
    DESCRIPTION
    This function finds the index of the minimum value for each column of a
    given matrix. The function assumes that the minimum value of each column
    occurs only once within that column. The function returns an error if
    this is not the case.

    -------------------------------------------------------------------------
    INPUT
            x    = matrix

    OUTPUT
            pos  = column vector with pos(i) containing the row number
                   corresponding to the minimum value of x(:,i)

    ========================================================================='''

    mins = X.argmin(axis=0)    # note: as used in this file, X is a 1-D array
    assert sum(X == X[mins]) == 1, 'Minimum value occurs more than once.'
    return mins



def pc2(X, nfac):
    '''=========================================================================
    DESCRIPTION
    This function runs principal component analysis.

    -------------------------------------------------------------------------
    INPUTS
            X       = dataset (one series per column)
            nfac    = number of factors to be selected

    OUTPUTS
            chat    = values of X predicted by the factors
            fhat    = factors scaled by (1/sqrt(N)) where N is the number of
                      series
            lambda  = factor loadings scaled by sqrt(N) where N is the
                      number of series
            ss      = eigenvalues of X'*X

    ========================================================================='''

    N = X.shape[1]    # Number of series in X (i.e. number of columns)
    # The rows of vh are the eigenvectors of A'A and the columns of u are the eigenvectors of AA'.
    # In both cases the corresponding (possibly non-zero) eigenvalues are given by s**2.
    U, S, Vh = np.linalg.svd(X.T@X)    # Singular value decomposition: X'*X = U*diag(S)*Vh, where Vh = U' since X'*X is symmetric

    lambda_ = U[:, :nfac]*np.sqrt(N)    # Factor loadings scaled by sqrt(N)
    fhat = np.dot(X, lambda_)*(1/N)     # Factors scaled by 1/sqrt(N) (note that lambda is scaled by sqrt(N))
    chat = np.dot(fhat, lambda_.T)      # Estimate initial dataset X using the factors (note that U'=inv(U))
    ss = S    # singular values of X'*X; since X'*X is symmetric positive semi-definite,
              # these equal its eigenvalues (i.e. the squared singular values of X)

    return chat, fhat, lambda_, ss




def baing(X, kmax, jj):
    # take in and return numpy arrays
    '''=========================================================================
    DESCRIPTION
    This function determines the number of factors to be selected for a given
    dataset using one of three information criteria specified by the user.
    The user also specifies the maximum number of factors to be selected.

    -------------------------------------------------------------------------
    INPUTS
            X       = dataset (one series per column)
            kmax    = an integer indicating the maximum number of factors
                      to be estimated
            jj      = an integer indicating the information criterion used
                      for selecting the number of factors; it can take on
                      the following values:
                            1 (information criterion PC_p1)
                            2 (information criterion PC_p2)
                            3 (information criterion PC_p3)

    OUTPUTS
            ic1     = number of factors selected
            chat    = values of X predicted by the factors
            Fhat    = factors
            eigval  = eigenvalues of X'*X (or X*X' if N>T)

    -------------------------------------------------------------------------
    SUBFUNCTIONS USED

    minindc() - finds the index of the minimum value for each column of a given matrix

    -------------------------------------------------------------------------
    BREAKDOWN OF THE FUNCTION

    Part 1: Setup.

    Part 2: Calculate the overfitting penalty for each possible number of
            factors to be selected (from 1 to kmax).

    Part 3: Select the number of factors that minimizes the specified
            information criterion by utilizing the overfitting penalties
            calculated in Part 2.

    Part 4: Save other output variables to be returned by the function (chat,
            Fhat, and eigval).

    ========================================================================='''
    assert (kmax <= X.shape[1] and kmax >= 1 and np.floor(kmax) == kmax) or kmax == 99, 'kmax is specified incorrectly'
    assert jj in [1, 2, 3], 'jj is specified incorrectly'


    # PART 1: SETUP

    T = X.shape[0]    # Number of observations per series (i.e. number of rows)
    N = X.shape[1]    # Number of series (i.e. number of columns)
    NT = N * T        # Total number of observations
    NT1 = N + T       # Number of rows + columns

    # =========================================================================
    # PART 2: OVERFITTING PENALTY
    # Determine penalty for overfitting based on the selected information
    # criterion.

    CT = np.zeros(kmax)            # overfitting penalty
    ii = np.arange(1, kmax + 1)    # Array containing possible number of factors that can be selected (1 to kmax)
    GCT = min(N, T)                # The smaller of N and T

    # Calculate penalty based on criterion determined by jj.
    if jj == 1:      # Criterion PC_p1
        CT[:] = np.log(NT / NT1) * ii * (NT1 / NT)

    elif jj == 2:    # Criterion PC_p2
        CT[:] = np.log(min(N, T)) * ii * (NT1 / NT)

    elif jj == 3:    # Criterion PC_p3
        CT[:] = np.log(GCT) / GCT * ii

    # =========================================================================
    # PART 3: SELECT NUMBER OF FACTORS
    # Perform principal component analysis on the dataset and select the number
    # of factors that minimizes the specified information criterion.
    #
    # -------------------------------------------------------------------------
    # RUN PRINCIPAL COMPONENT ANALYSIS
    # Get components, loadings, and eigenvalues

    if T < N:
        ev, eigval, V = np.linalg.svd(np.dot(X, X.T))    # Singular value decomposition of X*X'
        Fhat0 = ev*np.sqrt(T)                            # Components
        Lambda0 = np.dot(X.T, Fhat0) / T                 # Loadings
    else:
        ev, eigval, V = np.linalg.svd(np.dot(X.T, X))    # Singular value decomposition of X'*X
        Lambda0 = ev*np.sqrt(N)                          # Loadings
        Fhat0 = np.dot(X, Lambda0) / N                   # Components
    # -------------------------------------------------------------------------

    # SELECT NUMBER OF FACTORS
    # Preallocate memory
    Sigma = np.zeros(kmax + 1)    # sum of squared residuals divided by NT, kmax factors + no factor
    IC1 = np.zeros(kmax + 1)      # information criterion value, kmax factors + no factor

    for i in range(0, kmax):              # Loop through all possibilities for the number of factors
        Fhat = Fhat0[:, :i+1]             # Identify factors as first i+1 components
        lambda_ = Lambda0[:, :i+1]        # Identify factor loadings as first i+1 loadings

        chat = np.dot(Fhat, lambda_.T)    # Predict X using i+1 factors
        ehat = X - chat                   # Residuals from predicting X using the factors
        Sigma[i] = ((ehat*ehat/T).sum(axis=0)).mean()    # Sum of squared residuals divided by NT

        IC1[i] = np.log(Sigma[i]) + CT[i]    # Value of the information criterion when using i+1 factors


    Sigma[kmax] = (X*X/T).sum(axis=0).mean()    # Sum of squared residuals when using no factors to predict X (i.e. fitted values are set to 0)

    IC1[kmax] = np.log(Sigma[kmax])    # Value of the information criterion when using no factors

    ic1 = minindc(IC1)    # Index that minimizes the information criterion
    # Index i of IC1 corresponds to i+1 factors, and the last index (kmax)
    # corresponds to using no factors. Map the index to the selected number of
    # factors, setting ic1 = 0 if the criterion is minimized when no factors
    # are used.
    ic1 = (ic1 + 1) * (ic1 < kmax)

    # =========================================================================
    # PART 4: SAVE OTHER OUTPUT
    #
    # Factors and loadings when number of factors set to kmax

    Fhat = Fhat0[:, :kmax]        # factors
    Lambda = Lambda0[:, :kmax]    # factor loadings

    chat = np.dot(Fhat, Lambda.T)    # Predict X using kmax factors

    return ic1, chat, Fhat, eigval
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

def factors_em(X, kmax, jj, DEMEAN):
    '''=========================================================================
    DESCRIPTION
    This program estimates a set of factors for a given dataset using
    principal component analysis. The number of factors estimated is
    determined by an information criterion specified by the user. Missing
    values in the original dataset are handled using an iterative
    expectation-maximization (EM) algorithm.

    -------------------------------------------------------------------------
    INPUTS
            x       = dataset (one series per column)
            kmax    = an integer indicating the maximum number of factors
                      to be estimated; if set to 99, the number of factors
                      selected is forced to equal 8
            jj      = an integer indicating the information criterion used
                      for selecting the number of factors; it can take on
                      the following values:
                            1 (information criterion PC_p1)
                            2 (information criterion PC_p2)
                            3 (information criterion PC_p3)
            DEMEAN  = an integer indicating the type of transformation
                      performed on each series in x before the factors are
                      estimated; it can take on the following values:
                            0 (no transformation)
                            1 (demean only)
                            2 (demean and standardize)
                            3 (recursively demean and then standardize)

    OUTPUTS
            pred    = values of x predicted by the factors (after undoing
                      the transformation)
            ehat    = difference between x and values of x predicted by
                      the factors
            Fhat    = set of factors
            lamhat  = factor loadings
            ve2     = eigenvalues of x3'*x3 (where x3 is the dataset x post
                      transformation and with missing values filled in)
            x2      = x with missing values replaced from the EM algorithm

    -------------------------------------------------------------------------
    SUBFUNCTIONS

    baing() - selects number of factors
    pc2() - runs principal component analysis
    minindc() - finds the index of the minimum value for each column of a
                given matrix
    transform_data() - performs data transformation

    -------------------------------------------------------------------------'''

    # BREAKDOWN OF THE FUNCTION
    # Part 1: Check that inputs are specified correctly.
    # Part 2: Setup.
    # Part 3: Initialize the EM algorithm -- fill in missing values with
    #         unconditional mean and estimate factors using the updated
    #         dataset.

    # Part 4: Perform the EM algorithm -- update missing values using factors,
    #         construct a new set of factors from the updated dataset, and
    #         repeat until the factor estimates do not change.
    #
    # -------------------------------------------------------------------------

    # Details for the three possible information criteria can be found in the
    # paper "Determining the Number of Factors in Approximate Factor Models" by
    # Bai and Ng (2002).

    # The EM algorithm is essentially the one given in the paper "Macroeconomic
    # Forecasting Using Diffusion Indexes" by Stock and Watson (2002). The
    # algorithm is initialized by filling in missing values with the
    # unconditional mean of the series, demeaning and standardizing the updated
    # dataset, estimating factors from this demeaned and standardized dataset,
    # and then using these factors to predict the dataset. The algorithm then
    # proceeds as follows: update missing values using values predicted by the
    # latest set of factors, demean and standardize the updated dataset,
    # estimate a new set of factors using the demeaned and standardized updated
    # dataset, and repeat the process until the factor estimates do not change.

    # =========================================================================
    # PART 1: CHECKS

    # Check that x is not missing values for an entire row
    assert (X.isna().sum(axis=1) == X.shape[1]).sum() == 0, 'X contains entire rows of missing values'

    # Check that x is not missing values for an entire column
    assert (X.isna().sum(axis=0) == X.shape[0]).sum() == 0, 'X contains entire columns of missing values'

    # Check that kmax is an integer between 1 and the number of columns of x, or 99
    assert (kmax <= X.shape[1] and kmax >= 1 and np.floor(kmax) == kmax) or kmax == 99, 'kmax is specified incorrectly'

    # Check that jj is one of 1, 2, 3
    assert jj in [1, 2, 3], 'jj is specified incorrectly'

    # Check that DEMEAN is one of 0, 1, 2, 3
    assert DEMEAN in [0, 1, 2, 3], 'DEMEAN value incorrectly set, must be in [0, 1, 2, 3]'

    # =========================================================================
    # PART 2: SETUP


    maxit = 50        # Maximum number of iterations for the EM algorithm
    T = X.shape[0]    # Number of observations per series in x (i.e. number of rows)
    N = X.shape[1]    # Number of series in x (i.e. number of columns)


    err = 99999       # Set error to arbitrarily high number
    it = 0            # Set iteration counter to 0
    X1 = X.isna()     # Locate missing values in x

    # =========================================================================
    # PART 3: INITIALIZE EM ALGORITHM
    # Fill in missing values for each series with the unconditional mean of
    # that series. Demean and standardize the updated dataset. Estimate factors
    # using the demeaned and standardized dataset, and use these factors to
    # predict the original dataset.
    # Get unconditional mean of the non-missing values of each series



    mut = (X*0).fillna(0) + X.mean(axis=0)    # Get unconditional mean of the non-missing values of each series
                                              # (mut has no missing values)
    X2 = X.fillna(mut)                        # Replace missing values with unconditional mean

    # Demean and standardize data using subfunction transform_data()
    #   x3  = transformed dataset
    #   mut = matrix containing the values subtracted from x2 during the
    #         transformation (the type of demeaning depends on DEMEAN; it can
    #         be a simple column mean)
    #   sdt = matrix containing the values that x2 was divided by during the
    #         transformation

    X3, mut, sdt = transform_data(X2, DEMEAN)    # these are numpy arrays
    # If input 'kmax' is not set to 99, use subfunction baing() to determine
    # the number of factors to estimate. Otherwise, set number of factors equal to 8
    if kmax != 99:
        icstar, _, _, _ = baing(X3, kmax, jj)
    else:
        icstar = 8


    # Run principal components on updated dataset using subfunction pc2()
    #   chat   = values of x3 predicted by the factors
    #   Fhat   = factors scaled by (1/sqrt(N)) where N is the number of series
    #   lamhat = factor loadings scaled by number of series
    #   ve2    = eigenvalues of x3'*x3

    chat, Fhat, lamhat, ve2 = pc2(X3, icstar)
    chat0 = chat    # Save predicted series values

    # =========================================================================
    # PART 4: PERFORM EM ALGORITHM
    # Update missing values using values predicted by the latest set of
    # factors. Demean and standardize the updated dataset. Estimate a new set
    # of factors using the updated dataset. Repeat the process until the
    # factor estimates do not change.
    #
    # Run while error is large and have yet to exceed maximum number of
    # iterations

    while (err > 0.000001) and (it < maxit):

        # ---------------------------------------------------------------------
        it += 1    # Increase iteration counter by 1
        print(f'Iteration {it}: obj {err} IC {icstar} \n')    # Display iteration counter, error, and number of factors

        # ---------------------------------------------------------------------
        # UPDATE MISSING VALUES
        # Replace missing observations with latest values predicted by the
        # factors (after undoing any transformation)

        temp = X.fillna(0)*0 + chat*sdt + mut    # temp must not have na's in the df as it will keep them
        X2 = X.fillna(temp)

        # ---------------------------------------------------------------------
        # ESTIMATE FACTORS
        # Demean/standardize new dataset and recalculate mut and sdt using
        # subfunction transform_data()
        #   x3  = transformed dataset
        #   mut = matrix containing the values subtracted from x2 during the
        #         transformation
        #   sdt = matrix containing the values that x2 was divided by during
        #         the transformation

        X3, mut, sdt = transform_data(X2, DEMEAN)

        # Determine number of factors to estimate for the new dataset using
        # subfunction baing() (or set to 8 if kmax equals 99)

        if kmax != 99:
            icstar, _, _, _ = baing(X3, kmax, jj)
        else:
            icstar = 8

        # Run principal components on the new dataset using subfunction pc2()
        #   chat   = values of x3 predicted by the factors
        #   Fhat   = factors scaled by (1/sqrt(N)) where N is the number of
        #            series
        #   lamhat = factor loadings scaled by number of series
        #   ve2    = eigenvalues of x3'*x3

        chat, Fhat, lamhat, ve2 = pc2(X3, icstar)
        # ---------------------------------------------------------------------
        # CALCULATE NEW ERROR VALUE
        # Calculate difference between the predicted values of the new dataset
        # and the predicted values of the previous dataset
        diff = chat - chat0
        # The error value is equal to the sum of the squared differences
        # between chat and chat0 divided by the sum of the squared values of chat0

        v1 = diff.flatten(order='F')    # vectorise columns
        v2 = chat0.flatten(order='F')

        err = (np.dot(v1.T, v1) / np.dot(v2.T, v2))
        chat0 = chat    # Set chat0 equal to the current chat

    if it == maxit:    # Produce warning if maximum number of iterations is reached
        print('Maximum number of iterations reached in EM algorithm')

    # -------------------------------------------------------------------------
    # FINAL DIFFERENCE
    # Calculate the difference between the initial dataset and the values
    # predicted by the final set of factors (after undoing the transformation)
    pred = chat*sdt + mut
    ehat = X - pred

    return pred, ehat, Fhat, lamhat, ve2, X2


# if __name__ == "__main__":
#     X = pd.read_csv('../../data/2019-07-transformed-removed-outliers.csv', index_col=0)    # read in data
#     kmax = 7
#     jj = 2
#     DEMEAN = 2
--------------------------------------------------------------------------------
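
Addendum (not part of the original repository): a minimal synthetic-data smoke test for factors_em(), handy for checking that the port runs end to end without downloading FRED-MD. The dimensions, noise level, and missing-value pattern below are arbitrary choices, not anything prescribed by the original code.

```python
import numpy as np
import pandas as pd
import factors_em as fem

rng = np.random.default_rng(0)
T, N, r = 200, 30, 2                       # sample size, number of series, true factors
F = rng.standard_normal((T, r))            # true factors
L = rng.standard_normal((N, r))            # true loadings
X = pd.DataFrame(F @ L.T + 0.5 * rng.standard_normal((T, N)))

X.iloc[5, 3] = np.nan                      # punch a few holes so the EM step has work to do
X.iloc[100, 17] = np.nan

pred, ehat, Fhat, lamhat, ve2, x2 = fem.factors_em(X, kmax=8, jj=2, DEMEAN=2)
print(Fhat.shape)                          # (T, number of selected factors); ideally close to r
```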