├── .gitignore
├── 0 - General functions.R
├── 0 - Portfolio choice functions.R
├── 0 - Return prediction functions.R
├── 1 - Prepare Data.R
├── 2 - Fit Models.R
├── 3 - Estimate Covariance Matrix.R
├── 4 - Prepare Portfolio Data.R
├── 5 - Base case.R
├── 5 - Feature importance IEF.R
├── 5 - Feature importance base.R
├── 5 - Feature importance ret.R
├── 6 - Base analysis.R
├── 6 - Economic intuition.R
├── 6 - Feature importance.R
├── 6 - Implementable efficient frontier.R
├── 6 - Performance across size distribution.R
├── 6 - RF example.R
├── 6 - Short selling fees.R
├── 7 - Figures.R
├── 7 - Numbers.R
├── 7 - Tables.R
├── Joblists
│   ├── joblist_models.txt
│   ├── joblist_pfchoice_all.txt
│   ├── joblist_pfchoice_base.txt
│   ├── joblist_pfchoice_fi.txt
│   ├── joblist_pfchoice_ief.txt
│   ├── joblist_pfchoice_size.txt
│   └── joblist_simulations.txt
├── Main.R
├── README.md
├── config_files
│   ├── config_models1.txt
│   ├── config_models10.txt
│   ├── config_models11.txt
│   ├── config_models12.txt
│   ├── config_models2.txt
│   ├── config_models3.txt
│   ├── config_models4.txt
│   ├── config_models5.txt
│   ├── config_models6.txt
│   ├── config_models7.txt
│   ├── config_models8.txt
│   ├── config_models9.txt
│   ├── config_pfchoice_all.txt
│   ├── config_pfchoice_base.txt
│   ├── config_pfchoice_fi_base.txt
│   ├── config_pfchoice_fi_ief.txt
│   ├── config_pfchoice_fi_ret.txt
│   ├── config_pfchoice_ief_w1g1.txt
│   ├── config_pfchoice_ief_w1g2.txt
│   ├── config_pfchoice_ief_w1g3.txt
│   ├── config_pfchoice_ief_w1g4.txt
│   ├── config_pfchoice_ief_w1g5.txt
│   ├── config_pfchoice_ief_w2g1.txt
│   ├── config_pfchoice_ief_w2g2.txt
│   ├── config_pfchoice_ief_w2g3.txt
│   ├── config_pfchoice_ief_w2g4.txt
│   ├── config_pfchoice_ief_w2g5.txt
│   ├── config_pfchoice_ief_w3g1.txt
│   ├── config_pfchoice_ief_w3g2.txt
│   ├── config_pfchoice_ief_w3g3.txt
│   ├── config_pfchoice_ief_w3g4.txt
│   ├── config_pfchoice_ief_w3g5.txt
│   ├── config_pfchoice_ief_w4g1.txt
│   ├── config_pfchoice_ief_w4g2.txt
│   ├── config_pfchoice_ief_w4g3.txt
│   ├── config_pfchoice_ief_w4g4.txt
│   ├── config_pfchoice_ief_w4g5.txt
│   ├── config_pfchoice_size1.txt
│   ├── config_pfchoice_size2.txt
│   ├── config_pfchoice_size3.txt
│   ├── config_pfchoice_size4.txt
│   ├── config_pfchoice_size5.txt
│   ├── config_sim1.txt
│   ├── config_sim10.txt
│   ├── config_sim11.txt
│   ├── config_sim12.txt
│   ├── config_sim13.txt
│   ├── config_sim14.txt
│   ├── config_sim15.txt
│   ├── config_sim2.txt
│   ├── config_sim3.txt
│   ├── config_sim4.txt
│   ├── config_sim5.txt
│   ├── config_sim6.txt
│   ├── config_sim7.txt
│   ├── config_sim8.txt
│   └── config_sim9.txt
├── ewma.cpp
├── ml-and-the-implementable-efficient-frontier.Rproj
├── separate_analysis_run.R
├── simulations
│   ├── sim_functions.R
│   ├── sim_results.R
│   └── simulations.R
├── slurm_build_portfolios.R
├── slurm_fit_models.R
└── sqrtm_cpp.cpp

/.gitignore:
--------------------------------------------------------------------------------
 1 | .Rproj.user
 2 | .Rhistory
 3 | .RData
 4 | .Ruserdata
 5 | ief_for_slides.R
 6 | Data/
 7 | Figures/
 8 | slurm_output/
 9 | simulations/results/
10 | simulations/Depreceated code/
11 | scribbles/

--------------------------------------------------------------------------------
/0 - General functions.R:
--------------------------------------------------------------------------------
 1 | # Read configuration file ---------------
 2 | read_config <- function(file) {
 3 |   lines <- readLines(file)
 4 |   lines <- lines[!grepl("^#", lines)] # Remove lines that are comments
 5 |   config <- list()
 6 |   for (line in lines) {
 7 |     key_value <- strsplit(line, "=")[[1]]
 8 |     key <- trimws(key_value[1])
 9 |     value <- trimws(key_value[2])
10 |     config[[key]] <- eval(parse(text = value))
11 |   }
12 |   return(config)
13
| } 14 | 15 | # Create cov -------------- 16 | create_cov <- function(x, ids=NULL) { 17 | # Extract the relevant loadings and ivol 18 | if (is.null(ids)) { 19 | load <- x$fct_load 20 | ivol <- x$ivol_vec 21 | } else { 22 | load <- x$fct_load[as.character(ids),] 23 | ivol <- x$ivol_vec[as.character(ids)] 24 | } 25 | # Create the covariance matrix 26 | (load %*% x$fct_cov %*% t(load) + diag(ivol)) 27 | } 28 | 29 | # Create lambda --------------- 30 | create_lambda <- function(x, ids) { 31 | x[ids] |> diag() 32 | } 33 | 34 | # Compute expected risk --------------- 35 | expected_risk_fun <- function(ws, dates, cov_list) { 36 | # Make sure that ids are sorted identically for each method 37 | ws |> setorder(type, eom, id) 38 | types <- unique(ws$type) 39 | w_list <- ws |> split(by = "eom") 40 | # Compute variance for each porfolios 41 | dates |> map(.progress = T, function(d) { 42 | w_sub <- w_list[[as.character(d)]] 43 | ids <- w_sub$id |> unique() 44 | sigma <- cov_list[[as.character(d)]] |> create_cov(ids=as.character(ids)) 45 | types |> map(function(x) { 46 | w <- w_sub[type==x]$w 47 | pf_var <- t(w) %*% sigma %*% w 48 | data.table(type=x, pf_var=drop(pf_var)) 49 | }) |> rbindlist() |> mutate(eom=d) 50 | }) |> rbindlist() 51 | } 52 | 53 | # Long horizon returns ------------------ 54 | long_horizon_ret <- function(data, h, impute) { # Impute in c("zero", "mean", "median") 55 | dates <- data[!is.na(ret_exc), .(eom, "merge_date"=eom)] %>% unique() 56 | ids <- data[!is.na(ret_exc), .( 57 | start = min(eom), 58 | end = max(eom) 59 | ), by = id] 60 | full_ret <- ids[dates, on = .(start<=merge_date, end>=merge_date), allow.cartesian=T][, c("start", "end") := NULL] 61 | full_ret <- data[, .(id, eom, ret_exc)][full_ret, on = .(id, eom)] 62 | full_ret %>% setorder(id, eom) 63 | cols <- paste0("ret_ld", 1:h) 64 | for(l in 1:h) { 65 | full_ret[, paste0("ret_ld", l) := dplyr::lead(ret_exc, l), by = id] 66 | } 67 | full_ret[, ret_exc := NULL] 68 | # Remove rows with all observations missing 69 | all_missing <- (is.na(full_ret[, cols, with=F]) %>% rowSums() == h) 70 | print(paste0("All missing excludes ", round(mean(all_missing)*100, 2), "% of the observations")) 71 | full_ret <- full_ret[all_missing==F] # Is equivalent to assuming that investors new the security would be delisted. Think it's reasonable since we include deliting returns. 
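  # The loop above creates the leads ret_ld1..ret_ldh with dplyr::lead() within
  # each id, so the last h months of every stock get NA leads; the imputation
  # block below fills those. A minimal toy sketch of that mechanic, assuming
  # data.table and dplyr are attached (the ids, dates, and returns are made up):
  if (FALSE) {
    toy <- data.table(id = 1L,
                      eom = as.Date(c("2020-01-31", "2020-02-29", "2020-03-31")),
                      ret_exc = c(0.01, -0.02, 0.03))
    for (l in 1:2) {
      toy[, paste0("ret_ld", l) := dplyr::lead(ret_exc, l), by = id]
    }
    toy # ret_ld1/ret_ld2 are NA at the sample end; "zero"/"mean"/"median" fills them below
  }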
72 | # Impute missing return 73 | if (impute == "zero") { 74 | full_ret[, (cols) := lapply(.SD, function(x) if_else(is.na(x), 0, x)), .SDcols=cols] 75 | } 76 | if (impute == "mean") { 77 | full_ret[, (cols) := lapply(.SD, function(x) if_else(is.na(x), mean(x, na.rm=T), x)), .SDcols=cols, by = eom] 78 | } 79 | if (impute == "median") { 80 | full_ret[, (cols) := lapply(.SD, function(x) if_else(is.na(x), median(x, na.rm=T), x)), .SDcols=cols, by = eom] 81 | } 82 | return(full_ret) 83 | } 84 | 85 | # Portfolio Function ------------------ 86 | sigma_gam_adj <- function(sigma_gam, g, cov_type) { 87 | if (cov_type=="cov_mult") { 88 | return(sigma_gam*g) 89 | } 90 | if (cov_type == "cov_add") { 91 | return(sigma_gam+diag(diag(sigma_gam)*g)) 92 | } 93 | if (cov_type == "cor_shrink") { 94 | stopifnot(abs(g) <= 1) 95 | sd_vec <- diag(sqrt(diag(sigma_gam))) 96 | sd_vec_inv <- solve(sd_vec) 97 | cor_mat <- sd_vec_inv %*% sigma_gam %*% sd_vec_inv 98 | cor_mat_adj <- cor_mat * (1-g) + diag(nrow(cor_mat))*g 99 | cov_adj <- sd_vec %*% cor_mat_adj %*% sd_vec 100 | rownames(cov_adj) <- rownames(sigma_gam) 101 | colnames(cov_adj) <- colnames(sigma_gam) 102 | return(cov_adj) 103 | } 104 | } 105 | 106 | initial_weights_new <- function(data, w_type, udf_weights=NULL) { 107 | if (w_type == "vw") { 108 | pf_w <- data[, .(id, w_start = me/sum(me)), by = eom] 109 | } 110 | if (w_type == "ew") { 111 | pf_w <- data[, .(id, w_start = 1/.N), by = eom] 112 | } 113 | if (w_type == "rand_pos") { 114 | pf_w <- data[, .(id, eom)][, w_start := runif(.N)][, w_start := w_start / sum(w_start), by = eom] 115 | } 116 | if (w_type == "udf") { 117 | pf_w <- udf_weights[data, on = .(id, eom)] 118 | } 119 | pf_w[eom != min(eom), w_start := NA_real_] 120 | pf_w[, w := NA_real_] 121 | } 122 | 123 | # Portfolio function -------------- 124 | pf_ts_fun <- function(weights, data, wealth, gam) { 125 | comb <- data[, .(id, eom, ret_ld1, pred_ld1, lambda)][weights, on = .(id, eom)] 126 | comb <- wealth[, .(eom, wealth)][comb, on = "eom"] 127 | comb[, .( 128 | inv = sum(abs(w)), 129 | shorting = sum(abs(w[w<0])), 130 | turnover = sum(abs(w-w_start)), 131 | r = sum(w*ret_ld1), 132 | tc = unique(wealth)/2*sum(lambda*t(w-w_start)^2) 133 | ), by = eom][, eom_ret := eom+1+months(1)-1][, eom := NULL] 134 | } 135 | 136 | # Size-based screen ---------------- 137 | # Important: Function modifies in place 138 | size_screen_fun <- function(chars, type) { 139 | count <- 0 # Count ensures that at least one screen is applied 140 | # All 141 | if (type=="all") { 142 | print("No size screen") 143 | chars[valid_data==T, valid_size := T] 144 | count <- count + 1 145 | } 146 | # Top 1000 147 | if (str_detect(type, "top")) { 148 | top_n <- type |> str_remove("top") |> as.integer() 149 | chars[valid_data==T, me_rank := frank(-me), by = eom] 150 | chars[, valid_size := (me_rank <= top_n & !is.na(me_rank))] 151 | chars[, me_rank := NULL] 152 | count <- count + 1 153 | } 154 | # Bottom N 155 | if (str_detect(type, "bottom")) { 156 | bot_n <- type |> str_remove("bottom") |> as.integer() 157 | chars[valid_data==T, me_rank := frank(me), by = eom] 158 | chars[, valid_size := (me_rank <= bot_n & !is.na(me_rank))] 159 | chars[, me_rank := NULL] 160 | count <- count + 1 161 | } 162 | # Size group 163 | if (str_detect(type, "size_grp_")) { 164 | size_grp_screen <- type |> str_remove("size_grp_") 165 | chars[, valid_size := (size_grp==size_grp_screen & valid_data==T)] 166 | count <- count + 1 167 | } 168 | # Percentile range - min N! 
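  # The percentile screen below parses its bounds and minimum stock count out of
  # the type string itself; a string of the form "perc_low50_high100_min50"
  # (inferred from the IEF output folder names used later in the pipeline) keeps
  # stocks between the 50th and 100th size percentile with at least 50 names per
  # month. Minimal parsing sketch, assuming stringr is attached:
  if (FALSE) {
    type_example <- "perc_low50_high100_min50"
    as.integer(str_extract(type_example, "(?<=low)\\d+"))  # 50
    as.integer(str_extract(type_example, "(?<=high)\\d+")) # 100
    as.integer(str_extract(type_example, "(?<=min)\\d+"))  # 50
  }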
169 | if (str_detect(type, "perc")) { 170 | low_p <- str_extract(type, "(?<=low)\\d+") |> as.integer() 171 | high_p <- str_extract(type, "(?<=high)\\d+") |> as.integer() 172 | min_n <- str_extract(type, "(?<=min)\\d+") |> as.integer() 173 | print(paste0("Percentile-based screening: Range ", low_p, "% - ", high_p, "%, min_n: ", min_n, " stocks")) 174 | # Extract relevant range 175 | chars[valid_data==T, me_perc := ecdf(me)(me), by = eom] 176 | chars[, valid_size := (me_perc > low_p/100 & me_perc <= high_p/100 & !is.na(me_perc))] # ecdf never returns 0, which is why lower bound is a strict inequality 177 | chars[, n_tot := sum(valid_data, na.rm=T), by = eom] 178 | chars[, n_size := sum(valid_size, na.rm=T), by = eom] 179 | chars[, n_less := sum(valid_data==T & me_perc<=low_p/100, na.rm=T), by = eom] 180 | chars[, n_more := sum(valid_data==T & me_perc>high_p/100, na.rm=T), by = eom] 181 | # How many additional stocks from each side? 182 | chars[, n_miss := pmax(min_n-n_size, 0)] 183 | chars[, n_below := ceiling(pmin(n_miss/2, n_less))] 184 | chars[, n_above := ceiling(pmin(n_miss/2, n_more))] 185 | chars[n_below+n_above < n_miss & n_above>n_below, n_above := n_above + n_miss-n_above-n_below] 186 | chars[n_below+n_above < n_miss & n_above<n_below, n_below := n_below + n_miss-n_above-n_below] 187 | chars[, valid_size := (me_perc > (low_p/100-n_below/n_tot) & me_perc <= high_p/100+n_above/n_tot & !is.na(me_perc))] 188 | if (FALSE) { 189 | # See that the method works 190 | chars[, n_size2 := sum(valid_size, na.rm=T), by = eom] 191 | chars[eom==as.Date("1979-12-31"), .(id, eom, me, me_perc, n_less, n_more, n_tot, n_size, n_size2, n_miss, n_below, n_above, valid_size)][order(me)] 192 | 193 | unique(chars[, .(eom, n_size, n_size2, n_tot)]) |> pivot_longer(-eom) |> ggplot(aes(eom, value, colour=name)) + geom_point() + geom_hline(yintercept = min_n, linetype="dotted") 194 | chars[valid_size==T, median(me_perc), by = eom] |> ggplot(aes(eom, V1)) + geom_point() + ylim(c(0,1)) 195 | } 196 | chars[, c("me_perc", "n_tot", "n_size", "n_less", "n_more", "n_miss", "n_below", "n_above") := NULL] 197 | count <- count + 1 198 | } 199 | if (count != 1) { 200 | stop("Invalid size screen applied!!!!") 201 | } 202 | } 203 | 204 | # Addition/deletion rule ---------------- 205 | # Helper 206 | investment_universe <- function(add, delete) { 207 | n <- length(add) 208 | included <- logical(n) 209 | state <- F 210 | for (i in 2:n) { 211 | # Include if stock has been valid for 12 months 212 | if (state==F & add[i]==T & add[i-1]==F) { 213 | state <- T 214 | } 215 | # Exclude if stock has not been valid in past 12 months 216 | if (state==T & delete[i]==T) { 217 | state <- F 218 | } 219 | included[i] <- state 220 | } 221 | return(included) 222 | } 223 | # Function 224 | addition_deletion_fun <- function(chars, addition_n, deletion_n) { 225 | chars[, valid_temp := (valid_data==T & valid_size==T)] # Valid without the addition/deletion rule 226 | chars %>% setorder(id, eom) 227 | chars[, addition_count := roll::roll_sum(valid_temp, addition_n), by = id] 228 | chars[, deletion_count := roll::roll_sum(valid_temp, deletion_n), by = id] 229 | chars[, add := (addition_count==addition_n)] 230 | chars[is.na(add), add := F] 231 | chars[, delete := (deletion_count==0)] 232 | chars[is.na(delete), delete := F] 233 | chars[, n := .N, by = id] 234 | chars[n>1, valid := investment_universe(add = add, delete = delete), by = id] 235 | chars[n==1, valid := F] 236 | # Ensure that data is valid 237 | chars[valid_data==F, valid := F] 238 | # Check Turnover 239 | chars[, chg_raw := (valid_temp != lag(valid_temp)), by = id] 240 | chars[,
chg_adj := (valid != lag(valid)), by = id] 241 | to <- chars[, .( 242 | raw_n = sum(valid_temp, na.rm=T), 243 | adj_n = sum(valid, na.rm=T), 244 | raw = sum(chg_raw, na.rm=T)/sum(valid_temp, na.rm=T), 245 | adj = sum(chg_adj, na.rm=T)/sum(valid, na.rm=T) 246 | ), by = eom][!is.nan(raw) & !is.nan(adj) & adj != 0][, .( 247 | n_months = .N, 248 | n_raw = mean(raw_n), 249 | n_adj = mean(adj_n), 250 | turnover_raw = mean(raw), 251 | turnover_adjusted = mean(adj) 252 | )] 253 | cat(paste0("Turnover wo addition/deletion rule: ", round(to$turnover_raw*100, 2), "%", 254 | "\n", 255 | "Turnover w addition/deletion rule: ", round(to$turnover_adjusted*100, 2), "%", "\n")) 256 | chars[, c("n", "addition_count", "deletion_count", "add", "delete", "valid_temp", "valid_data", "valid_size", "chg_raw", "chg_adj") := NULL] 257 | } 258 | -------------------------------------------------------------------------------- /0 - Return prediction functions.R: -------------------------------------------------------------------------------- 1 | # Split data 2 | data_split <- function(data, type, val_end, val_years, train_start, train_lookback, retrain_lookback, test_inc, test_end) { 3 | train_end <- val_end-years(val_years) 4 | train_start <- max(train_start, train_end - years(train_lookback)) 5 | op <- list() 6 | op$val <- data[eom >= train_end & eom_pred_last <= val_end] 7 | op$train <- data[eom >= train_start & eom_pred_last <= train_end] 8 | op$train_full <- data[eom >= val_end-years(retrain_lookback) & eom_pred_last <= val_end] 9 | op$test <- data[eom >= val_end & eom < min(val_end+years(test_inc), test_end)] 10 | return(op) 11 | } 12 | 13 | # OLS -------------- 14 | ols_fit <- function(data, feat) { 15 | fit <- lm(paste0("ret_pred~",paste0(feat, collapse = "+")), data=data$train_full) 16 | pred <- data$test[, .(id, eom, eom_pred_last, pred=predict(fit, newdata = data$test))] 17 | list("fit"=fit, "pred"=pred) 18 | } 19 | 20 | # Random Fourier Features -------------------- 21 | rff <- function(X, p=NULL, g=NULL, W=NULL) { # P=number of features [MUST BE DIVISIBLE WITH 2!] 
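  # Each column of W is drawn from N(0, g*I), and every input row x is mapped to
  # [cos(xW), sin(xW)]; together with the p^(-1/2) scaling applied by the callers
  # below, this approximates a Gaussian kernel whose bandwidth is governed by g
  # (Rahimi & Recht, 2007). Passing an existing W reuses the same random basis,
  # which is how the validation/test data are transformed consistently with the
  # training data. Minimal sketch, assuming MASS and magrittr are attached (the
  # toy matrix is made up):
  if (FALSE) {
    X_toy <- matrix(rnorm(20), nrow = 10, ncol = 2)
    rf_train <- rff(X_toy, p = 4, g = 1)  # draws a new 2 x 2 W
    rf_test <- rff(X_toy, W = rf_train$W) # reuses the same W on other data
    dim(rf_train$X_cos)                   # 10 x 2 (p/2 cosine features)
  }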
22 | # Draw random weights 23 | if (is.null(W)) { 24 | k <- ncol(X) 25 | W <- MASS::mvrnorm(n=p/2, mu=rep(0, k), Sigma = g*diag(k)) %>% t() 26 | } 27 | X_new <- as.matrix(X) %*% W 28 | # Output 29 | list(W=W, X_cos=cos(X_new), X_sin=sin(X_new)) 30 | } 31 | 32 | rff_hp_search <- function(data, feat, p_vec, g_vec, l_vec, seed) { 33 | # Search over g 34 | val_errors <- g_vec %>% lapply(function(g) { 35 | set.seed(seed) # Ensures that draw is always the same irrespective of g 36 | print(paste0("g: ", formatC(g, digits=2, format="f"), " (", match(g, g_vec), " out of ", length(g_vec), ")")) 37 | # Create random features in training sets 38 | rff_train <- data$train[, feat, with=F] %>% rff(p = max(p_vec), g = g) 39 | rff_val <- data$val[, feat, with=F] %>% rff(W=rff_train$W) 40 | # Search over p 41 | err <- p_vec %>% lapply(function(p) { 42 | print(paste0("--> p: ", p, " (", match(p, p_vec), " out of ", length(p_vec), ")")) 43 | X_train <- p^(-0.5)*cbind(rff_train$X_cos[, 1:(p/2)], rff_train$X_sin[, 1:(p/2)]) 44 | X_val <- p^(-0.5)*cbind(rff_val$X_cos[, 1:(p/2)], rff_val$X_sin[, 1:(p/2)]) 45 | # Ridge fit 46 | fit <- glmnet(x = X_train, y = data$train$ret_pred, 47 | family = "gaussian", alpha = 0, lambda = l_vec, standardize = F) 48 | pred <- fit %>% predict(newx=X_val, type = "response", s = l_vec) 49 | 1:length(l_vec) %>% lapply(function(i) { 50 | data.table(lambda=l_vec[i], mse=mean((pred[, i]-data$val$ret_pred)^2)) 51 | }) %>% 52 | rbindlist() %>% 53 | mutate(p = p) 54 | }) %>% rbindlist() %>% mutate(g=g) 55 | list("err"=err, W=rff_train$W) 56 | }) 57 | names(val_errors) <- g_vec 58 | # Optimal hps 59 | val_mse <- val_errors %>% map("err") %>% rbindlist() 60 | opt_hps <- val_mse[mse==min(mse)] 61 | opt_w <- val_errors[[as.character(opt_hps$g)]]$W[, 1:(opt_hps$p/2)] 62 | print(val_mse %>% 63 | mutate( 64 | lambda=if_else(log10(lambda)< -5, 0, lambda), 65 | g = paste0("g=", formatC(g, digits = 2, format = "f")), 66 | p = factor(p) 67 | ) %>% 68 | ggplot(aes(log10(lambda), mse, colour=p)) + 69 | geom_point(alpha=0.25) + 70 | geom_line() + 71 | # annotate(geom="label", x=Inf, y=Inf, label=paste0("Optimal: p=", opt_hps$p, ", g=", opt_hps$g, ", log10(l)=", round(log10(opt_hps$lambda), 2)), hjust=1, vjust=1) + 72 | facet_wrap(~g) + 73 | labs(y="Mean squared error")) 74 | # Re-fit on train-val data 75 | X_train_full <- data$train_full[, feat, with=F] %>% rff(W=opt_w) 76 | X_train_full <- opt_hps$p^(-0.5)*cbind(X_train_full$X_cos, X_train_full$X_sin) 77 | final_fit <- glmnet(x = X_train_full, y = data$train_full$ret_pred, 78 | family = "gaussian", alpha = 0, lambda = opt_hps$lambda, standardize = F) 79 | # Predict on test data 80 | X_test <- data$test[, feat, with=F] %>% rff(W=opt_w) 81 | X_test <- opt_hps$p^(-0.5)*cbind(X_test$X_cos, X_test$X_sin) 82 | pred_op <- data$test[, .(id, eom, eom_pred_last, pred=drop(predict(final_fit, newx = X_test, s=opt_hps$lambda)))] 83 | # Output 84 | list("fit"=final_fit, "pred"=pred_op, "hp_search"=val_mse, "W"=opt_w, "opt_hps"=opt_hps) 85 | } 86 | 87 | # Ridge HP search 88 | ridge_hp_search <- function(data, feat, vol_scale, lambdas) { 89 | fit <- glmnet(x = data$train[, feat, with=F] %>% as.matrix(), y = data$train$ret_pred, 90 | family = "gaussian", alpha = 0, lambda = lambdas, standardize = T) 91 | pred <- fit %>% predict(newx=data$val[, feat, with=F] %>% as.matrix(), type = "response", s = lambdas) 92 | lambda_search <- 1:length(lambdas) %>% lapply(function(i) { 93 | data.table(lambda=lambdas[i], mse=mean((pred[, i]-data$val$ret_pred)^2)) 94 | }) %>% 95 | 
rbindlist() 96 | print(lambda_search[log(lambda)!=-100] %>% ggplot(aes(log(lambda), mse)) + geom_point()) 97 | lambda_opt <- lambda_search[mse==min(mse)]$lambda 98 | pred_val_op <- data$val[, .(id, eom, eom_pred_last, pred=pred[, match(lambda_opt, lambdas)])] 99 | # Re-fit to all training data 100 | final_fit <- glmnet(x = data$train_full[, feat, with=F] %>% as.matrix(), y = data$train_full$ret_pred, 101 | family = "gaussian", alpha = 0, lambda = lambda_opt, standardize = T) # Maybe a bad idea to re-standardize 102 | pred <- final_fit %>% predict(newx=data$test[, feat, with=F] %>% as.matrix(), s=lambda_opt) # final_fit %>% broom::tidy() %>% filter(term %in% c(weekly_feat, "rvol_21d")) %>% arrange(-abs(estimate)) %>% ggplot(aes(reorder(term, abs(estimate)), abs(estimate))) + geom_col() + coord_flip() 103 | feat_imp <- final_fit %>% broom::tidy() %>% filter(term!="(Intercept)") %>% mutate(imp = frank(-abs(estimate))) 104 | pred_op <- data$test[, .(id, eom, eom_pred_last, pred=drop(pred))] 105 | if (vol_scale) { 106 | pred_val_op <- pred_val_op %>% mutate(pred_vol=pred, pred=pred_vol*data$val$rvol_m) 107 | pred_op <- pred_op %>% mutate(pred_vol=pred, pred=pred_vol*data$test$rvol_m) 108 | } 109 | # Output 110 | return(list("fit"=final_fit, "hp_search"=lambda_search, "l_opt"=lambda_opt, "pred"=pred_op, "pred_val"=pred_val_op, "feat_imp"=feat_imp)) 111 | } 112 | 113 | # XGB ------------------ 114 | fit_xgb <- function(train, val, params, iter, es, cores, seed) { # train and val should be xgb.Dmatrix objects 115 | set.seed(seed) 116 | 117 | params_all <- list( 118 | objective = "reg:squarederror", 119 | base_score = 0, 120 | eval_metric = "rmse", 121 | booster = "gbtree", 122 | max_depth = params$tree_depth, 123 | eta = params$learn_rate, 124 | gamma = params$loss_reduction, 125 | subsample = params$sample_size, # Row subsampling 126 | colsample_bytree = params$mtry, # Column Subsampling 127 | min_child_weight = params$min_n, 128 | lambda = params$penalty 129 | ) 130 | 131 | # Fit Model 132 | model <- xgb.train( 133 | data = train, 134 | params = params_all, 135 | watchlist = list(train=train, val=val), 136 | nrounds = iter, 137 | early_stopping_rounds = es, 138 | verbose = 0, 139 | maximize = F, 140 | nthread = cores 141 | ) 142 | return(model) 143 | } 144 | 145 | 146 | # XGB HP search 147 | xgb_hp_search <- function(data, feat, vol_scale, hp_grid, iter, es, cores, seed) { 148 | # Find hyperparameters 149 | train <- xgb.DMatrix(data=as.matrix(data$train[, feat, with=F]), label=data$train$ret_pred) 150 | val <- xgb.DMatrix(data=as.matrix(data$val[, feat, with=F]), label=data$val$ret_pred) 151 | xgb_search <- 1:nrow(hp_grid) %>% lapply(function(j) { 152 | print(paste0("HP: ", j)) 153 | xgb_fit <- fit_xgb(train = train, val = val, params = hp_grid[j, ], iter = iter, 154 | es = es, cores = cores, seed = seed) 155 | data.table(hp_no=j, val_rmse=xgb_fit$best_score, best_iter=xgb_fit$best_iteration) 156 | }) %>% rbindlist() 157 | print(xgb_search %>% ggplot(aes(hp_no, val_rmse)) + geom_point()) 158 | best_hp_no <- xgb_search[val_rmse==min(val_rmse)][1]$hp_no # Had one case with identical rmse(!), arbitrarily just choose the first 159 | best_hp <- hp_grid[best_hp_no, ] 160 | best_iter <- xgb_search[hp_no==best_hp_no]$best_iter 161 | print("Best HP:") 162 | print(best_hp) 163 | print(paste("With", best_iter, "iterations")) 164 | # Re-fit to all training data 165 | train_all <- xgb.DMatrix(data=as.matrix(data$train_full[, feat, with=F]), label=data$train_full$ret_pred) 166 | final_fit <- fit_xgb(train = 
train_all, val = train_all, params = best_hp, iter = best_iter, 167 | es = NULL, cores = cores, seed = seed) 168 | # Feature importance 169 | set.seed(seed) 170 | shap_contrib <- predict(object=final_fit, newdata=as.matrix(sample_n(data$train_full[, feat, with=F], 10000)),predcontrib=T) 171 | global_imp <- abs(shap_contrib) %>% colMeans() %>% as_tibble(rownames = "char") %>% 172 | filter(char != "BIAS") %>% 173 | mutate( 174 | rank = frank(-abs(value)) 175 | ) 176 | print(global_imp %>% 177 | filter(rank <= 20) %>% 178 | ggplot(aes(reorder(char, abs(value)), abs(value))) + coord_flip() + geom_col() + labs(y = "Global feature Importance") + theme(axis.title.y = element_blank())) 179 | 180 | # Predictions 181 | pred <- final_fit %>% predict(newdata=data$test[, feat, with=F] %>% as.matrix()) 182 | if (vol_scale) { 183 | pred_op <- data$test[, .(id, eom, pred_vol=drop(pred), pred=drop(pred)*rvol_m)] 184 | } else { 185 | pred_op <- data$test[, .(id, eom, pred=drop(pred))] 186 | } 187 | # Output 188 | list("fit"=final_fit, "best_hp"=best_hp, "best_iter"=best_iter, "hp_search"=xgb_search, "pred"=pred_op, "feat_imp"=global_imp) 189 | } -------------------------------------------------------------------------------- /1 - Prepare Data.R: -------------------------------------------------------------------------------- 1 | # Risk-free rate -------------------- 2 | risk_free <- fread("Data/ff3_m.csv", select = c("yyyymm", "RF")) %>% mutate(rf=RF/100, eom = paste0(yyyymm, "01") %>% as.Date("%Y%m%d") %>% ceiling_date(unit="month")-1) %>% select(eom, rf) 3 | 4 | # Market ----------------------- 5 | market <- fread("Data/market_returns.csv", colClasses = c("eom"="character")) 6 | market <- market[excntry == "USA", .(eom_ret = as.Date(eom, format="%Y%m%d"), mkt_vw_exc)] 7 | 8 | # Wealth: Assumed portfolio growth -------------------- 9 | wealth_func <- function(wealth_end, end, market, risk_free) { 10 | wealth <- risk_free[, .("eom_ret"=eom, rf)][market, on = .(eom_ret)][, tret := mkt_vw_exc+rf] 11 | wealth <- wealth[eom_ret <= end] 12 | wealth <- wealth[order(-eom_ret)][, wealth := cumprod(1-tret)*wealth_end] 13 | wealth[, .(eom = floor_date(eom_ret, unit = "month")-1, wealth, mu_ld1 = tret)] %>% 14 | rbind(data.table(eom=end, wealth = wealth_end, mu_ld1 = NA_real_)) %>% 15 | arrange(eom) 16 | } 17 | wealth <- wealth_func(wealth_end = pf_set$wealth, end = settings$split$test_end, market = market, risk_free = risk_free) 18 | 19 | if (FALSE) { 20 | wealth %>% ggplot(aes(eom, wealth)) + geom_point() + scale_y_log10() 21 | chars[!is.na(dolvol), .(id, eom, dolvol, me)][, .(vw = sum(dolvol*me)/sum(me), ew = mean(dolvol), median = median(dolvol)), by = eom] %>% ggplot(aes(eom, vw)) + geom_point() + scale_y_log10() 22 | } 23 | 24 | # Cluster Labels (for covariance estimation) ------------- 25 | cluster_labels <- fread("Data/Cluster Labels.csv") 26 | cluster_labels[, cluster := cluster %>% str_to_lower() %>% str_replace_all("\\s|-", "_")] 27 | factor_signs <- readxl::read_xlsx("Data/Factor Details.xlsx") %>% 28 | select("characteristic"=abr_jkp, direction) %>% 29 | filter(!is.na(characteristic)) %>% 30 | mutate(direction=direction %>% as.numeric) %>% 31 | setDT() 32 | cluster_labels <- factor_signs[cluster_labels, on = "characteristic"] 33 | cluster_labels <- cluster_labels %>% rbind(data.table(characteristic="rvol_252d", cluster="low_risk", direction=-1)) # Assign rvol_252d to Low Risk 34 | 35 | # Prepare matrix of future monthly returns --------------- 36 | monthly <- fread("Data/world_ret_monthly.csv", select = 
c("excntry", "id", "eom", "ret_exc"), colClasses = c("eom"="character")) 37 | monthly <- monthly[excntry == "USA" & id<=99999] # Only CRSP observations 38 | monthly[, eom := eom %>% fast_strptime(format = "%Y%m%d") %>% as.Date()] 39 | data_ret <- monthly %>% long_horizon_ret(h = settings$pf$hps$m1$K, impute = "zero") 40 | data_ret_ld1 <- data_ret[, .(id, eom, eom_ret = eom+1+months(1)-1, ret_ld1)] 41 | data_ret_ld1 <- risk_free[data_ret_ld1, on = "eom"] 42 | data_ret_ld1[, tr_ld1 := ret_ld1 + rf] # Total return 43 | data_ret_ld1[, rf := NULL] 44 | # Add total return at t-1 45 | data_ret_ld1 <- data_ret_ld1[, .(id, eom=eom+1+months(1)-1, "tr_ld0" = tr_ld1)][data_ret_ld1, on = .(id, eom)] 46 | rm(monthly) 47 | 48 | # Prepare data ----------------------------------- 49 | chars <- fread(paste0("Data/usa.csv"), 50 | select = unique(c("id", "eom", "sic", "ff49", "size_grp", "me", "crsp_exchcd", "rvol_252d", "dolvol_126d", features)), 51 | colClasses = c("eom" = "character", "sic"="character")) 52 | chars <- chars[id <= 99999] # Only CRSP observations 53 | chars[, eom := eom %>% lubridate::fast_strptime(format = "%Y%m%d") %>% as.Date()] 54 | # Add useful columns ------ 55 | chars[, dolvol := dolvol_126d] 56 | chars[, lambda := 2/dolvol*settings$pi] 57 | chars[, rvol_m := rvol_252d * sqrt(21)] 58 | # Add total return data 59 | chars <- data_ret_ld1[chars, on=.(id, eom)] 60 | # Add wealth change 61 | chars <- wealth[, .(eom=eom+1+months(1)-1, "mu_ld0" = mu_ld1)][chars, on = .(eom)] 62 | # Screens ---- 63 | # Exchange code screen 64 | if (settings$screens$nyse_stocks) { 65 | print(paste0(" NYSE stock screen excludes ", round(mean(chars$crsp_exchcd != 1) * 100, 2), "% of the observations")) 66 | chars <- chars[crsp_exchcd==1] 67 | } 68 | # Date screen 69 | print(paste0(" Date screen excludes ", round(mean(chars$eom < settings$screens$start | chars$eom > settings$screens$end) * 100, 2), "% of the observations")) 70 | chars <- chars[eom >= settings$screens$start & eom <= settings$screens$end] 71 | # Monitor screen impact 72 | n_start <- nrow(chars) 73 | me_start <- sum(chars$me, na.rm = T) 74 | # Require me 75 | print(paste0(" Non-missing me excludes ", round(mean(is.na(chars$me)) * 100, 2), "% of the observations")) 76 | chars <- chars[!is.na(me)] 77 | # Require non-missing return t and t+1 78 | print(paste0(" Valid return req excludes ", round(chars[, mean(is.na(tr_ld1) | is.na(tr_ld0))] * 100, 2), "% of the observations")) 79 | chars <- chars[!is.na(tr_ld0) & !is.na(tr_ld1)] 80 | # Require dolvol 81 | print(paste0(" Non-missing/non-zero dolvol excludes ", round(mean(is.na(chars$dolvol) | chars$dolvol==0) * 100, 2), "% of the observations")) 82 | chars <- chars[!is.na(dolvol) & dolvol > 0] 83 | # Require stock to have SIC code (for covariance estimation with industry) 84 | print(paste0(" Valid SIC code excludes ", round(mean(chars$sic=="") * 100, 2), "% of the observations")) 85 | chars <- chars[!is.na(sic)] 86 | 87 | # Feature screens 88 | feat_available <- chars %>% select(all_of(features)) %>% apply(1, function(x) sum(!is.na(x))) 89 | min_feat <- floor(length(features)*settings$screens$feat_pct) 90 | print(paste0(" At least ", settings$screens$feat_pct*100, "% of feature excludes ", round(mean(feat_available < min_feat)*100, 2), "% of the observations")) 91 | chars <- chars[feat_available >= min_feat] 92 | # Summary 93 | print(paste0(" In total, the final dataset has ", round( (nrow(chars) / n_start)*100, 2), "% of the observations and ", round((sum(chars$me) / me_start)*100, 2), "% of the market 
cap in the post ", settings$screens$start, " data")) 94 | # Screen out if running subset -------------------------------- 95 | if (run_sub) { 96 | set.seed(settings$seed) 97 | chars <- chars[id %in% sample(unique(chars$id), 2500, replace = F)] 98 | } 99 | # Feature standardization ----------------------------- 100 | if (settings$feat_prank) { 101 | chars[, (features) := lapply(.SD, as.double), .SDcols = features] # Convert feature columns to double to avoid loosing precision 102 | for(f in features) { 103 | if (match(f, features) %% 10 == 0) print(paste0("Feature ", match(f, features), " out of ", length(features))) 104 | chars[, zero := (get(f) == 0)] 105 | chars[!is.na(get(f)), (f) := ecdf(get(f))(get(f)), by = eom] # Didn't have by statement before!! 106 | chars[zero == T, (f) := 0][, zero := NULL] # Set exact zeros to 0 (ecdf always returns >0) 107 | } 108 | } 109 | # Feature Imputation ---------------------------------- 110 | if (settings$feat_impute) { 111 | if (settings$feat_prank) { 112 | chars[, (features) := lapply(.SD, function(x) if_else(is.na(x), 0.5, x)), .SDcols=features] 113 | } else { 114 | chars[, (features) := lapply(.SD, function(x) if_else(is.na(x), median(x, na.rm=T), x)), .SDcols=features, by=eom] 115 | } 116 | } 117 | # Industry classification ----------------------------- 118 | chars[, ff12 := case_when( 119 | sic %in% c(100:999, 2000:2399, 2700:2749, 2770:2799, 3100:3199, 3940:3989) ~ "NoDur", 120 | sic %in% c(2500:2519, 3630:3659, 3710:3711, 3714, 3716, 3750:3751, 3792, 3900:3939, 3990:3999) ~ "Durbl", 121 | sic %in% c(2520:2589, 2600:2699, 2750:2769, 3000:3099, 3200:3569, 3580:3629, 122 | 3700:3709, 3712:3713, 3715, 3717:3749, 3752:3791, 3793:3799, 3830:3839, 3860:3899) ~ "Manuf", 123 | sic %in% c(1200:1399, 2900:2999) ~ "Enrgy", 124 | sic %in% c(2800:2829, 2840:2899) ~ "Chems", 125 | sic %in% c(3570:3579, 3660:3692, 3694:3699, 3810:3829, 7370:7379) ~ "BusEq", 126 | sic %in% c(4800:4899) ~ "Telcm", 127 | sic %in% c(4900:4949) ~ "Utils", 128 | sic %in% c(5000:5999, 7200:7299, 7600:7699) ~ "Shops", 129 | sic %in% c(2830:2839, 3693, 3840:3859, 8000:8099) ~ "Hlth", 130 | sic %in% c(6000:6999) ~ "Money", 131 | TRUE ~ "Other" 132 | )] 133 | # Check which observations are valid ----------------- 134 | chars[, valid_data := T] 135 | # Check that stock is eligible for portfolio-ml 136 | chars %>% setorder(id, eom) 137 | lb <- pf_set$lb_hor+1 # Plus 1 to get the last signal of last periods portfolio for portfolio-ML 138 | chars[, eom_lag := shift(eom, n = lb, type="lag"), by = id] 139 | chars[, month_diff := interval(eom_lag, eom) %/% months(1)] 140 | print(paste0(" Valid lookback observation screen excludes ", round(chars[valid_data==T, mean(month_diff != lb | is.na(month_diff))] * 100, 2), "% of the observations")) 141 | chars[, valid_data := (valid_data==T & month_diff==lb & !is.na(month_diff))] 142 | chars[, c("eom_lag", "month_diff") := NULL] 143 | rm(lb) 144 | 145 | # Size based screen -------------------------------------- 146 | chars |> size_screen_fun(type=settings$screens$size_screen) 147 | 148 | # Addition/deletion rule ---------------------------------- 149 | chars |> addition_deletion_fun(addition_n=settings$addition_n, deletion_n=settings$deletion_n) 150 | 151 | # Show investable universe -------------------------------- 152 | chars[valid==T, .N, by = eom] %>% ggplot(aes(eom, N)) + geom_point() + labs(y = "Valid stocks") + geom_hline(yintercept = 0) 153 | 154 | # Valid summary 155 | print(paste0(" The valid_data subset has ", round(chars[, 
mean(valid==T)]*100, 2), "% of the observations and ", round((sum(chars[valid==T]$me) / sum(chars$me))*100, 2), "% of the market cap")) 156 | 157 | # Daily returns ---------------------------------- 158 | daily <- fread("Data/usa_dsf.csv", colClasses = c("date"="character"), select = c("id", "date", "ret_exc")) 159 | daily <- daily[!is.na(ret_exc) & id<=99999 & id %in% unique(chars[valid==T]$id)]; gc() # Only CRSP observations 160 | daily[, date := date %>% fast_strptime(format="%Y%m%d") %>% as.Date()]; gc() 161 | daily[, eom := date %>% ceiling_date(unit = "month")-1] -------------------------------------------------------------------------------- /2 - Fit Models.R: -------------------------------------------------------------------------------- 1 | start_time <- proc.time() 2 | 3 | models <- 1:nrow(search_grid) %>% lapply(function(i) { 4 | # Prepare y variable 5 | h <- search_grid[i, ]$horizon %>% unlist() 6 | pred_y <- data_ret[, paste0("ret_ld", h), with=F] %>% rowMeans() 7 | pred_y <- data_ret[, .(id, eom, eom_pred_last=eom+1+months(max(h))-1, ret_pred=pred_y)] 8 | data_pred <- pred_y[chars[valid==T], on = .(id, eom)] 9 | 10 | print(paste0("horizons: ", list(h))) 11 | 12 | # Fit model 13 | if (settings$split$model_update_freq == "once") { 14 | val_ends <- settings$split$train_end 15 | test_inc <- 1000 16 | } 17 | if (settings$split$model_update_freq == "yearly") { 18 | val_ends <- seq.Date(from = settings$split$train_end, to = settings$split$test_end, by = "1 year") 19 | test_inc <- 1 20 | } 21 | if (settings$split$model_update_freq == "decade") { 22 | val_ends <- seq.Date(from = settings$split$train_end, to = settings$split$test_end, by = "10 years") 23 | test_inc <- 10 24 | } 25 | system.time({op <- val_ends %>% sapply(simplify = F, USE.NAMES = T, function(val_end) { 26 | print(val_end) 27 | train_test_val <- data_pred %>% data_split(type = settings$split$model_update_freq, val_end = val_end, val_years = settings$split$val_years, train_start = settings$screens$start, 28 | train_lookback = settings$split$train_lookback, retrain_lookback = settings$split$retrain_lookback, 29 | test_inc = test_inc, test_end = settings$split$test_end) 30 | # Fit models 31 | print(system.time(model_op <- train_test_val %>% rff_hp_search(feat = features, p_vec=settings$rff$p_vec, g_vec=settings$rff$g_vec, l_vec=settings$rff$l, seed=settings$seed_no))) 32 | 33 | return(model_op) 34 | })}) 35 | op %>% saveRDS(paste0(output_path, "/model_", h,".RDS")) 36 | # op %>% saveRDS(paste0(output_path, "/models/model_", settings$split$model_update_freq, "_", min(h), "_", max(h), ".RDS")) 37 | return(op) 38 | }) 39 | # Save output ------ 40 | # names(models) <- search_grid$name 41 | 42 | # models %>% saveRDS(paste0(output_path, "/models/model_full.RDS")) 43 | 44 | proc.time()-start_time # 15 hours with all stocks, xgb, and yearly updates, 45 | 46 | -------------------------------------------------------------------------------- /3 - Estimate Covariance Matrix.R: -------------------------------------------------------------------------------- 1 | # Prepare cluster characteristics --------------- 2 | cluster_data_m <- chars[valid==T][, c("id", "eom", "size_grp", "ff12", features), with=F] 3 | clusters <- unique(cluster_labels$cluster) 4 | cluster_ranks <- clusters %>% lapply(function(cl) { 5 | chars_sub <- cluster_labels[cluster==cl & characteristic %in% features] 6 | # print(paste0(cl, ", n: ", nrow(chars_sub))) 7 | data_sub <- cluster_data_m[, chars_sub$characteristic, with=F] 8 | for (c in chars_sub$characteristic) { 
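      # Characteristics with direction == -1 (signs come from Factor Details.xlsx)
      # are flipped to 1 - rank below, so that all characteristics within a cluster
      # point the same way; the cluster score is then the row mean of these [0, 1]
      # ranks (the features were ecdf-ranked in "1 - Prepare Data.R").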
9 | dir <- chars_sub[characteristic == c, direction] 10 | if (dir == -1) { 11 | data_sub[, (c) := 1-get(c)] 12 | } 13 | } 14 | data.table(x=data_sub %>% rowMeans()) %>% setnames(old = "x", new = cl) 15 | }) %>% bind_cols() 16 | cluster_data_m <- cluster_data_m[, .(id, eom, "eom_ret"=ceiling_date(eom+1, unit = "month")-1, size_grp, ff12)] %>% cbind(cluster_ranks) 17 | # Add Industry/market dummies 18 | if (settings$cov_set$industries) { 19 | # Create industry dummies 20 | industries <- sort(unique(cluster_data_m$ff12)) 21 | for (ind in industries) { 22 | cluster_data_m[, (ind) := as.integer(ff12==ind)] 23 | } 24 | ind_factors <- industries 25 | } else { 26 | # Add market factor 27 | cluster_data_m[, mkt := 1] 28 | ind_factors <- "mkt" 29 | } 30 | # Standardize factors 31 | cluster_data_m[, (clusters) := lapply(.SD, function(x) (x-mean(x))/sd(x)), by=eom, .SDcols=clusters] 32 | # Add daily return data 33 | cluster_data_d <- cluster_data_m[daily[date>= min(cluster_data_m$eom)][, .(id, date, ret_exc, "eom_ret"=eom)], on = .(id, eom_ret)] 34 | # Omit observations without data 35 | cluster_data_d <- cluster_data_d %>% na.omit() 36 | 37 | # Create factor returns -------------------- 38 | fct_ret_est <- cluster_data_d %>% 39 | group_by(date) %>% 40 | nest() %>% 41 | mutate( 42 | fit = data %>% map(~lm(paste0("ret_exc~-1+", paste0(c(ind_factors, clusters), collapse = "+")), data=.x)), 43 | # fit = data %>% map(~lm(paste0("ret_exc~-1+mkt+", paste0(clusters, collapse = "+")), data=.x)), 44 | res = fit %>% map(residuals), 45 | tidied = fit %>% map(broom::tidy) 46 | ) 47 | fct_ret <- fct_ret_est %>% 48 | unnest(tidied) %>% 49 | select(date, term, estimate) %>% 50 | pivot_wider(names_from = term, values_from = estimate) %>% 51 | arrange(date) %>% 52 | setDT() 53 | 54 | # Factor Risk ----------------------------- 55 | w_cor <- (0.5^(1/settings$cov_set$hl_cor))^(settings$cov_set$obs:1) 56 | w_var <- (0.5^(1/settings$cov_set$hl_var))^(settings$cov_set$obs:1) 57 | 58 | fct_dates <- sort(fct_ret$date) 59 | calc_dates <- sort(unique(cluster_data_m[eom>=floor_date(fct_dates[settings$cov_set$obs], unit="m")-1]$eom)) # to ensure sufficient data for cov calculation 60 | factor_cov_est <- as.character(calc_dates) %>% map(.progress = T, function(d){ 61 | first_obs <- min(tail(fct_dates[fct_dates <= as.Date(d)], settings$cov_set$obs)) 62 | cov_data <- fct_ret[date >= first_obs & date <= as.Date(d)] 63 | t <- nrow(cov_data) 64 | if (t < settings$cov_set$obs-30) warning("INSUFFICIENT NUMBER OF OBSERVATIONS!!") # Only an issue with the first calc_date 65 | cov_est <- cov_data %>% select(-date) %>% cov.wt(wt = tail(w_cor, t), cor=T, center=T, method = "unbiased") 66 | cor_est <- cov_est$cor 67 | var_est <- cov_data %>% select(-date) %>% cov.wt(wt = tail(w_var, t), cor=F, center=T, method = "unbiased") # inefficient solution but super fast with few factors 68 | sd_diag <- diag(sqrt(diag(var_est$cov))) 69 | # Prepare cov 70 | cov_est <- sd_diag %*% cor_est %*% sd_diag 71 | rownames(cov_est) <- colnames(cov_est) <- colnames(cor_est) 72 | # Output 73 | return(cov_est) 74 | }) 75 | names(factor_cov_est) <- as.character(calc_dates) 76 | 77 | # Specific Risk --------------------------- 78 | spec_risk <- fct_ret_est %>% 79 | mutate(id = data %>% map(~.x$id)) %>% 80 | select(id, date, res) %>% 81 | unnest(c(id, res)) %>% 82 | arrange(id, date) %>% 83 | setDT() 84 | # EWMA variance 85 | spec_risk[, res_vol := ewma_c(res, lambda = 0.5^(1/settings$cov_set$hl_stock_var), start = settings$cov_set$initial_var_obs), by = id] # 
Lambda=exp(log(0.5)/half_life) 86 | # Require that the observation at least 200 non-missing observations out of the last 252 trading days 87 | td_range <- data.table(date=fct_dates, td_252d = lag(fct_dates, 252)) 88 | spec_risk <- td_range[spec_risk, on = "date"] 89 | spec_risk[, date_200d := lag(date, 200), by = id] 90 | spec_risk <- spec_risk[date_200d>=td_252d & !is.na(res_vol)][, .(id, date, res_vol)] 91 | # Extract specific risk by month end 92 | spec_risk[, eom_ret := date %>% ceiling_date(unit="month")-1] 93 | spec_risk[, max_date := max(date), by = .(id, eom_ret)] 94 | spec_risk_m <- spec_risk[date==max_date, .(id, "eom"=eom_ret, res_vol)] 95 | 96 | # Stock covariance matrix -------------------------- 97 | barra_cov <- as.character(calc_dates) %>% map(.progress = T, function(d){ 98 | char_data <- cluster_data_m[eom == as.Date(d)] 99 | # Add specific risk 100 | char_data <- spec_risk_m[char_data, on = .(id, eom)] 101 | # char_data <- char_data[!is.na(res_vol)] # Consider if we should do some imputation e.g. median res_vol? 102 | char_data[, med_res_vol := median(res_vol, na.rm=T), by = .(size_grp, eom)] 103 | if (any(is.na(char_data$med_res_vol))) { 104 | char_data[, med_res_vol_all := median(res_vol, na.rm=T), by = .(eom)] # Crazy edge case: in April 1996, there's only 1 micro stock and it has na for resvol.. 105 | char_data[is.na(med_res_vol), med_res_vol := med_res_vol_all] 106 | } 107 | char_data[is.na(res_vol), res_vol := med_res_vol] 108 | fct_cov <- factor_cov_est[[d]]*21 # Annualize 109 | char_data %>% setorder(id) # Important to align ids across different datasets 110 | X <- char_data[, colnames(fct_cov), with=F] %>% as.matrix() # Using colnames(fct_cov) is important so char and factor is aligned across x and fct_cov 111 | rownames(X) <- char_data$id %>% as.character() 112 | # Stock covariance matrix 113 | # cov_est <- X %*% fct_cov %*% t(X) + diag(char_data$res_vol^2) 114 | # rownames(cov_est) <- colnames(cov_est) <- as.character(char_data$id) 115 | # return(cov_est) 116 | ivol_vec <- char_data$res_vol^2*21 # Annualize. 
Also, would more accurately be called "ivar_vec" 117 | names(ivol_vec) <- char_data$id %>% as.character() 118 | list("fct_load"=X, "fct_cov"=fct_cov, "ivol_vec"=ivol_vec) 119 | }) 120 | names(barra_cov) <- as.character(calc_dates) 121 | 122 | # Sanity check 123 | if (FALSE) { 124 | # Check implied volatility of market factor over time 125 | check <- names(barra_cov) |> map(function(d) { 126 | data.table( 127 | eom = as.Date(d), 128 | # mkt_vol = barra_cov[[d]]$fct_cov["Hlth", "Hlth"]^0.5, 129 | mkt_vol = barra_cov[[d]]$fct_cov["mkt", "mkt"]^0.5, 130 | ivol = mean(barra_cov[[d]]$ivol_vec^0.5) 131 | ) 132 | }) |> rbindlist() 133 | 134 | check |> 135 | pivot_longer(-eom) |> 136 | ggplot(aes(eom, value*sqrt(12), color=name)) + 137 | geom_line() 138 | 139 | # Why we impute 140 | test <- calc_dates %>% sapply(simplify = F, USE.NAMES = T, function(d){ 141 | char_data <- cluster_data_m[eom == d] 142 | # Add specific risk 143 | char_data <- spec_risk_m[char_data, on = .(id, eom)] 144 | n_miss <- char_data[is.na(res_vol), .N] 145 | data.table(d=d, n_miss=n_miss) 146 | }) %>% rbindlist() 147 | test %>% ggplot(aes(d, n_miss)) + geom_point() 148 | # Sanity check 149 | pred_sd_avg <- names(barra_cov) %>% lapply(function(d) { 150 | pred_var <- diag(barra_cov[[d]]) 151 | data.table(eom = as.Date(d), n = length(pred_var), sd_avg = sqrt(mean(pred_var)*252)) 152 | }) %>% rbindlist() 153 | pred_sd_avg %>% ggplot(aes(eom, n)) + geom_point(size=1) + theme(axis.title.x = element_blank()) 154 | pred_sd_avg %>% ggplot(aes(eom, sd_avg)) + geom_point(size=1) + labs(y = "Average Predicted Volatility (Annualized)", title = "Covariance Sanity Check") + theme(axis.title.x = element_blank()) 155 | 156 | 157 | (output$cov_check <- pred_sd_avg %>% 158 | ggplot(aes(eom, sd_avg)) + 159 | geom_point(size=1) + 160 | labs(y = "Average Predicted Volatility (Annualized)") + 161 | theme(axis.title.x = element_blank())) 162 | 163 | me <- chars[!is.na(me), .(id, eom, me)] 164 | me %>% setorder(eom) 165 | me <- me %>% split(by="eom") 166 | mkt_vol <- names(barra_cov) %>% lapply(function(d) { 167 | sigma <- barra_cov[[d]] 168 | me_sub <- me[[d]] 169 | me_sub <- me_sub[id %in% colnames(sigma)] 170 | me_sub[, w := me/sum(me)] 171 | sigma <- sigma[as.character(me_sub$id), as.character(me_sub$id)] 172 | data.table(eom=as.Date(d), mkt_vol = sqrt(drop(t(me_sub$w) %*% sigma %*% me_sub$w))) 173 | }) %>% rbindlist() 174 | 175 | (output$mkt_vol <- mkt_vol %>% ggplot(aes(eom, mkt_vol*sqrt(252))) + geom_point() + geom_hline(yintercept = 0)) 176 | 177 | # Check: All valid stocks have covariance estimate 178 | valid_cov_est <- names(barra_cov) %>% lapply(function(d) { 179 | data.table(eom=as.Date(d), id = barra_cov[[d]] %>% colnames() %>% as.integer(), valid_cov = T) 180 | }) %>% rbindlist() 181 | test <- valid_cov_est[chars[, .(id, eom, valid)], on = .(id, eom)] 182 | test[valid==T & eom>=min(valid_cov_est$eom), .N, by = valid_cov] 183 | } -------------------------------------------------------------------------------- /4 - Prepare Portfolio Data.R: -------------------------------------------------------------------------------- 1 | # Add return predictions --------------------------------- 2 | for (h in 1:settings$pf$hps$m1$K) { 3 | pred_data <- readRDS(paste0(get_from_path_model, "/model_", h, ".RDS")) |> 4 | lapply(function(x) x$pred) |> 5 | rbindlist() |> 6 | select(id, eom, pred) |> 7 | rename_with(~ paste0("pred_ld", h), .cols = "pred") 8 | chars <- pred_data[chars, on = .(id, eom)] 9 | } 10 | 11 | # Create lambda list 
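# lambda is the per-stock price-impact parameter set in "1 - Prepare Data.R" as
# 2/dolvol * settings$pi, and it enters the quadratic trading-cost term (see
# pf_ts_fun in "0 - General functions.R"). Below it is stored as one named vector
# per month end, keyed by stock id; create_lambda() turns such a vector into a
# diagonal matrix. A minimal sketch with two ids from the data and made-up values:
if (FALSE) {
  lam_toy <- c("10107" = 2e-8, "14593" = 5e-8)      # named by id
  create_lambda(lam_toy, ids = c("14593", "10107")) # 2 x 2 diagonal matrix, in the requested id order
}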
------------------------------------- 12 | lambda_dates <- unique(chars$eom) 13 | lambda_list <- lambda_dates |> map(function(d) { 14 | x <- chars[eom==d, .(id, lambda)][order(id)] 15 | ids <- x$id 16 | x <- x$lambda 17 | names(x) <- ids 18 | return(x) 19 | }) |> setNames(as.character(lambda_dates)) 20 | rm(lambda_dates) 21 | 22 | # Important dates ---------------------- 23 | first_cov_date <- names(barra_cov) %>% as.Date() %>% min() 24 | hp_years <- seq(from=settings$pf$dates$start_year, to=settings$pf$dates$end_yr) 25 | start_oos <- settings$pf$dates$start_year+settings$pf$dates$split_years 26 | 27 | dates_m1 <- seq.Date(from = settings$split$train_end+1, to = settings$split$test_end+1-months(1), by = "1 month")-1 28 | dates_m2 <- seq.Date(from = first_cov_date+1+months(pf_set$lb_hor+1), to = settings$split$test_end+1-months(1), by = "1 month")-1 29 | dates_oos <- seq.Date(from = as.Date(paste0(start_oos, "-", "01-01")), to = settings$split$test_end+1-months(1), by = "1 month")-1 30 | dates_hp <- seq.Date(from = as.Date(paste0(min(hp_years), "-", "01-01")), to = settings$split$test_end+1-months(1), by = "1 month")-1 31 | -------------------------------------------------------------------------------- /5 - Base case.R: -------------------------------------------------------------------------------- 1 | # Benchmark portfolios ------------------------------------------ 2 | # Markowitz-ML 3 | tpf <- chars |> tpf_implement(cov_list = barra_cov, wealth = wealth, dates = dates_oos, gam = pf_set$gamma_rel) 4 | 5 | # Factor-ML 6 | factor_ml <- chars %>% factor_ml_implement(dates = dates_oos, n_pfs = settings$factor_ml$n_pfs, wealth = wealth, gam = pf_set$gamma_rel) 7 | 8 | # Market 9 | mkt <- chars %>% mkt_implement(dates = dates_oos, wealth = wealth) 10 | 11 | # 1/n 12 | ew <- chars %>% ew_implement(dates = dates_oos, wealth = wealth) 13 | 14 | # Fama-MacBeth / Rank weighted portfolios 15 | rw <- chars %>% rw_implement(dates = dates_oos, wealth = wealth) 16 | 17 | # Minimum variance 18 | mv <- chars %>% mv_implement(cov_list = barra_cov, dates = dates_oos, wealth = wealth) 19 | 20 | # Output 21 | bm_pfs <- rbind(tpf$pf, factor_ml$pf, ew$pf, mkt$pf, rw$pf, mv$pf) 22 | bm_pfs %>% fwrite(paste0(output_path, "/bms.csv")) 23 | 24 | # Static-ML ------------------------------------------------------ 25 | # Implement 26 | static <- static_implement( 27 | data_tc = chars, 28 | cov_list = barra_cov, 29 | lambda_list = lambda_list, 30 | rf = risk_free, # Data 31 | wealth = wealth, 32 | mu = pf_set$mu, 33 | gamma_rel = pf_set$gamma_rel, # Investor 34 | dates_full = dates_m1, 35 | dates_oos = dates_oos, 36 | dates_hp = dates_hp, 37 | hp_years = hp_years, # Dates 38 | k_vec = settings$pf$hps$static$k, 39 | u_vec = settings$pf$hps$static$u, 40 | g_vec = settings$pf$hps$static$g, 41 | cov_type = settings$pf$hps$cov_type, 42 | validation = NULL 43 | ) 44 | # Output 45 | static |> saveRDS(paste0(output_path, "/static-ml.RDS")) 46 | 47 | # Portfolio-ML --------------------------------------------------- 48 | # Implement 49 | pfml <- pfml_implement( 50 | data_tc = chars, 51 | cov_list = barra_cov, 52 | lambda_list = lambda_list, 53 | features = features, 54 | risk_free = risk_free, # Data 55 | wealth = wealth, 56 | mu = pf_set$mu, 57 | gamma_rel = pf_set$gamma_rel, # Investor 58 | dates_full = dates_m2, 59 | dates_oos = dates_oos, 60 | lb = pf_set$lb_hor, 61 | hp_years = hp_years, # Dates 62 | rff_feat = T, 63 | g_vec = settings$pf_ml$g_vec, 64 | p_vec = settings$pf_ml$p_vec, 65 | l_vec = settings$pf_ml$l_vec, 66 | 
scale = settings$pf_ml$scale, 67 | orig_feat = settings$pf_ml$orig_feat, # Hyperparameters 68 | iter = 10, 69 | hps = NULL, 70 | balanced = F, 71 | seed = settings$seed_no # Other 72 | ) 73 | # Output 74 | pfml |> saveRDS(paste0(output_path, "/portfolio-ml.RDS")) 75 | 76 | # Multiperiod-ML ------------------------------------------------ 77 | if (config_params$update_mp) { 78 | # Implement 79 | mp <- mp_implement( 80 | data_tc = chars, 81 | cov_list = barra_cov, 82 | lambda_list = lambda_list, 83 | rf = risk_free, # Data 84 | wealth = wealth, 85 | mu = pf_set$mu, 86 | gamma_rel = pf_set$gamma_rel, # Investor 87 | dates_full = dates_m1, 88 | dates_oos = dates_oos, 89 | dates_hp = dates_hp, 90 | hp_years = hp_years, # Dates 91 | k_vec = settings$pf$hps$m1$k, 92 | u_vec = settings$pf$hps$m1$u, 93 | g_vec = settings$pf$hps$m1$g, 94 | cov_type = settings$pf$hps$cov_type, 95 | validation = NULL, 96 | iter = 10, 97 | K = settings$pf$hps$m1$K 98 | ) 99 | # Save output 100 | mp |> saveRDS(paste0(output_path, "/multiperiod-ml.RDS")) 101 | } -------------------------------------------------------------------------------- /5 - Feature importance IEF.R: -------------------------------------------------------------------------------- 1 | # Portfolio-ML - IEF -------------------- 2 | # Only some clusters are implemented for full IEF 3 | ief_path <- "Data/Generated/Portfolios/IEF/" 4 | ief_cf_clusters <- c("quality", "value", "momentum", "short_term_reversal") 5 | pfml_cf_ief <- settings$ef$gamma_rel |> map(.progress = T, function(gamma_rel) { 6 | # Find hyper-parameters for specific gamma 7 | x <- paste0("20240514-20_WEALTH1e+10_GAMMA", gamma_rel, "_SIZEperc_low50_high100_min50_INDTRUE") 8 | pfml <- readRDS(paste0(ief_path, x, "/portfolio-ml.RDS")) 9 | # Implement for each cluster 10 | pfml_cf_base <- ief_cf_clusters |> map(.progress = T, function(cf_cluster) { 11 | chars[valid==T] |> pfml_cf_fun( 12 | cf_cluster = cf_cluster, 13 | pfml_base = pfml, 14 | dates = dates_oos, 15 | cov_list = barra_cov, 16 | scale = settings$pf_ml$scale, 17 | orig_feat = settings$pf_ml$orig_feat, 18 | gamma_rel = gamma_rel, 19 | wealth = wealth, 20 | risk_free = risk_free, 21 | mu = pf_set$mu, 22 | iter = 10, 23 | seed = settings$seed_no 24 | ) 25 | }) |> rbindlist() |> mutate(gamma_rel = gamma_rel) 26 | }) |> rbindlist() 27 | # Save 28 | pfml_cf_ief |> fwrite(paste0(output_path, "/pfml_cf_ief.csv")) 29 | -------------------------------------------------------------------------------- /5 - Feature importance base.R: -------------------------------------------------------------------------------- 1 | # Markowitz-ML - Base case --------------------- 2 | cf_clusters <- c("bm", clusters) 3 | tpf_cf_wealth <- wealth_func(wealth_end = 0, end = settings$split$test_end, market = market, risk_free = risk_free) 4 | er_models <- readRDS(paste0(get_from_path_model, "/model_1.RDS")) 5 | tpf_cf_base <- cf_clusters |> map(.progress = T, function(cf_cluster) { 6 | chars[valid==T] |> tpf_cf_fun( 7 | cf_cluster = cf_cluster, 8 | er_models = er_models, 9 | dates = dates_oos, 10 | cluster_labels = cluster_labels, 11 | cov_list = barra_cov, 12 | gamma_rel = 100, # Use higher gamma to make TPF less extreme (and increase realized utility) 13 | wealth = tpf_cf_wealth, 14 | seed = settings$seed_no 15 | ) 16 | }) |> rbindlist() 17 | # Save 18 | tpf_cf_base |> fwrite(paste0(output_path, "/tpf_cf_base.csv")) 19 | 20 | # tpf_cf_base[, .(mean = mean(r)*12, sd = sd(r)*sqrt(12), u = (mean(r)-pf_set$gamma_rel/2*var(r))*12), by = cluster][, sr := 
mean/sd][order(sr)] 21 | 22 | # Portfolio-ML - Base case -------------------- 23 | # Load PFML base case model, for its HPs 24 | base_path <- "Data/Generated/Portfolios/Base/" 25 | pfml <- readRDS(paste0(base_path, list.files(base_path), "/portfolio-ml.RDS")) 26 | # All clusters are implemented for the base case 27 | cf_clusters <- c("bm", clusters) 28 | pfml_cf_base <- cf_clusters |> map(.progress = T, function(cf_cluster) { 29 | chars[valid==T] |> pfml_cf_fun( 30 | cf_cluster = cf_cluster, 31 | pfml_base = pfml, 32 | dates = dates_oos, 33 | cov_list = barra_cov, 34 | scale = settings$pf_ml$scale, 35 | orig_feat = settings$pf_ml$orig_feat, 36 | gamma_rel = pf_set$gamma_rel, 37 | wealth = wealth, 38 | risk_free = risk_free, 39 | mu = pf_set$mu, 40 | iter = 10, 41 | seed = settings$seed_no 42 | ) 43 | }) |> rbindlist() 44 | # Save 45 | pfml_cf_base |> fwrite(paste0(output_path, "/pfml_cf_base.csv")) -------------------------------------------------------------------------------- /5 - Feature importance ret.R: -------------------------------------------------------------------------------- 1 | # Estimate counterfactual predictions 2 | ret_cf <- 1:12 %>% map(.progress = "model iteration", function(h) { 3 | model <- readRDS(paste0(get_from_path_model, "/model_", h, ".RDS")) 4 | # Prepare data 5 | pred_y <- data_ret[, .(id, eom, ret_pred=get(paste0("ret_ld", h)))] 6 | ret_cf_data <- pred_y[chars, on = .(id, eom)] 7 | preds <- model %>% map("pred") %>% rbindlist() %>% select(id, eom, pred) 8 | ret_cf_data <- ret_cf_data[preds, on = .(id, eom)][!is.na(ret_pred)] 9 | # Implement counterfactual 10 | c("bm", sort(clusters)) %>% map(progress=T, function(cls) { 11 | set.seed(settings$seed) # Set seed for reproducibility 12 | print(paste0(" ", cls)) 13 | if (cls=="bm") { 14 | cf <- copy(ret_cf_data) 15 | } else { 16 | cf <- copy(ret_cf_data) %>% select(-c(pred)) # Counterfactual 17 | cf[, id_shuffle := sample(id, .N, replace=F), by = eom] 18 | cls_chars <- cluster_labels[cluster==cls & characteristic %in% features, characteristic] 19 | chars_data <- cf %>% select(all_of(c("id", "eom", cls_chars))) %>% rename("id_shuffle"=id) 20 | cf <- chars_data[cf[, (cls_chars) := NULL], on = .(id_shuffle, eom)] 21 | # Expected Returns 22 | for (m_sub in model) { 23 | if (!is.null(m_sub$pred)) { 24 | sub_dates <- unique(m_sub$pred$eom) 25 | cf_x <- cf[eom %in% sub_dates, features, with=F] %>% as.matrix() 26 | cf_new_x <- cf_x %>% rff(W=m_sub$W) 27 | cf_new_x <- m_sub$opt_hps$p^(-0.5)*cbind(cf_new_x$X_cos, cf_new_x$X_sin) 28 | cf[eom %in% sub_dates, pred := drop(m_sub$fit %>% predict(newx = cf_new_x, s = m_sub$opt_hps$lambda))] 29 | } 30 | } 31 | } 32 | cf[, .(cluster = cls, n = .N, mse = mean((pred-ret_pred)^2)), by = eom] 33 | }) %>% rbindlist() %>% mutate(h = h) 34 | }) %>% rbindlist() 35 | # Save 36 | ret_cf |> fwrite(paste0(output_path, "/ret_cf.csv")) -------------------------------------------------------------------------------- /6 - Base analysis.R: -------------------------------------------------------------------------------- 1 | # Need to run everything from main to right before bm_pfs in 5 - base case 2 | # Load base case portfolios ----------- 3 | base_path <- "Data/Generated/Portfolios/Base/" 4 | base_folder <- list.files(base_path) 5 | mp <- readRDS(paste0(base_path, base_folder, "/multiperiod-ml.RDS")) 6 | pfml <- readRDS(paste0(base_path, base_folder, "/portfolio-ml.RDS")) 7 | static <- readRDS(paste0(base_path, base_folder, "/static-ml.RDS")) 8 | bm_pfs <- fread(paste0(base_path, base_folder, 
"/bms.csv")) |> 9 | mutate( 10 | eom_ret = as.Date(eom_ret), 11 | type = if_else(type == "Rank-Weighted", "Rank-ML", type) # Old naming convention 12 | ) 13 | 14 | # Final Portfolios --------------------- 15 | pfs <- rbind( 16 | mp$pf, 17 | pfml$pf, 18 | static$pf, 19 | bm_pfs, 20 | mp$hps[eom_ret %in% mp$pf$eom_ret & k==1 & g==0 & u==1, .(eom_ret=as.Date(eom_ret), inv, shorting, turnover, r, tc)][, type := "Multiperiod-ML"], 21 | static$hps[eom_ret %in% static$pf$eom_ret & k==1 & g==0 & u==1, .(eom_ret=as.Date(eom_ret), inv, shorting, turnover, r, tc)][, type := "Static-ML"] 22 | ) 23 | pfs[, type := type %>% factor(levels = pf_order)] 24 | pfs %>% setorder(type, eom_ret) 25 | pfs[, e_var_adj := (r-mean(r))^2, by=type] 26 | pfs[, utility_t := r-tc-0.5*e_var_adj*pf_set$gamma_rel] 27 | 28 | # Portfolio summary stats -------------- 29 | pf_summary <- pfs[, .( 30 | n = .N, 31 | inv = mean(inv), 32 | shorting = mean(shorting), 33 | turnover_notional = mean(turnover), 34 | r = mean(r)*12, 35 | sd = sd(r)*sqrt(12), 36 | sr_gross = mean(r)/sd(r)*sqrt(12), 37 | tc = mean(tc)*12, 38 | r_tc = mean((r-tc))*12, 39 | sr = mean(r-tc)/sd(r)*sqrt(12), 40 | obj = (mean(r)-0.5*var(r)*pf_set$gamma_rel-mean(tc))*12#, 41 | # obj_exp = (mean(e_r)-0.5*mean(e_var)*pf_set$gamma_rel-mean(tc))*12 42 | ), by = .(type)][order(type)] 43 | # Remove not essential types 44 | pfs <- pfs[type %in% main_types][, type := type %>% factor(levels = main_types)] 45 | 46 | # Performance Time-Series ----------- 47 | types <- unique(pfs$type) 48 | pfs[, cumret := cumsum(r), by=.(type)] 49 | pfs[, cumret_tc := cumsum(r-tc), by=.(type)] 50 | pfs[, cumret_tc_risk := cumsum(utility_t), by=.(type)] 51 | ts_data <- pfs[type %in% main_types, .(type, eom_ret, cumret, cumret_tc, cumret_tc_risk)] %>% 52 | pivot_longer(-c(type, eom_ret)) %>% 53 | bind_rows(tibble( 54 | eom_ret = ceiling_date(min(pfs$eom_ret), unit = "month")-1, 55 | type = rep(types, 3), 56 | value = rep(0, 3*length(types)), 57 | name = c(rep("cumret", length(types)), rep("cumret_tc", length(types)), rep("cumret_tc_risk", length(types))) 58 | )) %>% 59 | mutate( 60 | name_pretty = case_when( 61 | name == "cumret" ~ "Gross return", 62 | name == "cumret_tc" ~ "Return net of TC", 63 | name == "cumret_tc_risk" ~ "Return net of TC and Risk" 64 | ) %>% factor(levels = c("Gross return", "Return net of TC", "Return net of TC and Risk")) 65 | ) %>% 66 | setDT() 67 | 68 | ts_plots <- unique(ts_data$name) %>% lapply(function(x) { 69 | p <- ts_data[name == x] %>% 70 | ggplot(aes(eom_ret, value, colour=type, linetype=type)) + 71 | geom_line() + 72 | coord_cartesian(ylim=c(0, NA)) + 73 | facet_wrap(~name_pretty, scales = "free") + 74 | labs(y = "Cummulative performance", colour = "Method:", linetype="Method:") + 75 | scale_x_date(date_labels = "%Y", breaks = seq.Date(as.Date("1980-12-31"), as.Date("2020-12-31"), by = "20 years")) + 76 | theme( 77 | axis.title.x = element_blank(), 78 | strip.background = element_rect(fill = "white", color = "black"), 79 | text = element_text(size=11) 80 | ) 81 | if (x == "cumret") { 82 | y_range <- ts_data[name == x & type != "Markowitz-ML" & name=="cumret", .(max = max(value), min = min(value))] 83 | p <- p + coord_cartesian(ylim=c(y_range$min, y_range$max)) 84 | } 85 | p 86 | }) 87 | # ts_legend <- cowplot::get_legend( 88 | # ts_plots[[1]] + guides(color = guide_legend(nrow = 1)) + theme(legend.position = "top", legend.title = element_blank()) 89 | # ) 90 | ts_legend <- cowplot::get_plot_component( 91 | plot = ts_plots[[1]] + guides(color = 
guide_legend(nrow = 1)) + theme(legend.position = "top", legend.title = element_blank(), legend.justification = "center"), 92 | 'guide-box-top', return_all = TRUE) 93 | ts_plot <- cowplot::plot_grid( 94 | ts_plots[[1]] + theme(legend.position = "none"), 95 | ts_plots[[2]] + theme(axis.title.y = element_blank(), legend.position = "none"), 96 | ts_plots[[3]] + theme(axis.title.y = element_blank(), legend.position = "none"), 97 | nrow = 1, rel_widths = c(1.07, 1, 1) 98 | ) 99 | (output$ts <- cowplot::plot_grid(ts_legend, ts_plot, ncol=1, rel_heights = c(.1, 1))) 100 | 101 | # Test probability of outperformance --------------------------------------------------- 102 | pfs_wide <- pfs[, .(type, eom_ret, utility_t)] %>% dcast(eom_ret~type, value.var = "utility_t") 103 | 104 | prob_outperformance <- main_types %>% lapply(function(t) { 105 | x <- pfs_wide %>% 106 | melt(id.vars = c("eom_ret", t), variable.name = "alt") 107 | x[, diff := get(t)-value, by = alt] 108 | x[, .( 109 | prob_main_op = 1-pnorm(q = 0, mean=mean(diff), sd = sd(diff)/sqrt(length(diff))) 110 | ), by = alt] %>% mutate(main = t) 111 | }) %>% rbindlist() 112 | 113 | # Portfolio statistics over time -------------------- 114 | # Compute expected portfolio risk 115 | ws <- rbind( 116 | pfml$w |> mutate(type = "Portfolio-ML"), 117 | tpf$w |> mutate(type = "Markowitz-ML"), 118 | mp$w |> mutate(type = "Multiperiod-ML*"), 119 | static$w |> mutate(type = "Static-ML*") |> select(-pred_ld1), 120 | factor_ml$w |> mutate(type = "Factor-ML") 121 | ) |> mutate(type = type %>% factor(levels = pf_order)) 122 | pf_vars <- ws |> expected_risk_fun(dates=dates_oos, cov_list=barra_cov) 123 | # Compute portfolios statistics 124 | (output$comp_stats <- pf_vars[pfs[, .(type, "eom"=eom_ret+1-months(1)-1, inv, turnover)], on = .(type, eom)] %>% # , e_sd 125 | filter(type %in% main_types) |> 126 | mutate(e_sd = sqrt(pf_var*252)) |> 127 | select(-pf_var) |> 128 | pivot_longer(-c(type, eom)) %>% 129 | mutate( 130 | name = case_when( 131 | name == "e_sd" ~ "Ex-ante Volatility", 132 | name == "turnover" ~ "Turnover", 133 | name == "inv" ~ "Leverage" 134 | ) 135 | ) %>% 136 | ggplot(aes(eom, value, colour=type, linetype=type)) + 137 | geom_line() + 138 | scale_y_log10() + 139 | labs(y = "Value") + 140 | facet_wrap(~name, scales="free_y", ncol=1) + 141 | theme( 142 | legend.position = "top", 143 | legend.title = element_blank(), 144 | axis.title.x = element_blank(), 145 | axis.title.y = element_blank() 146 | )) 147 | 148 | # Correlation ---------------------------------- 149 | pf_cors <- pfs[, .(eom_ret, r, type)] %>% 150 | pivot_wider(names_from = type, values_from = r) %>% 151 | select(-eom_ret) %>% 152 | cor(method = "pearson") 153 | 154 | pf_cors %>% 155 | corrplot::corrplot(method = "number", type = "lower") 156 | 157 | # Apple vs. 
Xerox ---------------------------------------------------- 158 | liquid <- 22111 # 14593 159 | # liquid <- c(93436, 14593, 91103, 88352, 19561, 10107) 160 | illiquid <- 27983 # Xerox (illiquid stock with all observations: data_tc[valid==T & year(eom)>=2015][, n := .N, by = id][n==max(n)][, .(id, eom, n, me, dolvol)][eom==max(eom)][order(dolvol)]) 161 | # illiquid <- 26825 # Kellog (smallest stock with all observations: data_tc[valid==T & size_grp=="mega"][, n := .N, by = id][n == max(n)][, .(id, eom, n, me)][eom==max(eom)][order(me)]) 162 | position <- rbind( 163 | mp$w[id %in% c(liquid, illiquid)][, type := "Multiperiod-ML*"], 164 | pfml$w[id %in% c(liquid, illiquid)][, type := "Portfolio-ML"], 165 | static$w[id %in% c(liquid, illiquid)][, type := "Static-ML*"], 166 | tpf$w[id %in% c(liquid, illiquid)][, type := "Markowitz-ML"], 167 | factor_ml$w[id %in% c(liquid, illiquid)][, type := "Factor-ML"], 168 | mkt$w[id %in% c(liquid, illiquid)][, type := "Market"], 169 | fill=T 170 | ) %>% mutate( 171 | stock_type = case_when( 172 | id == 14593 ~ "Apple (liquid)", 173 | id == 27983 ~ "Xerox (illiquid)", 174 | id == 93436 ~ "Tesla", 175 | id == 91103 ~ "Visa", 176 | id == 19561 ~ "Boing", 177 | id == 10107 ~ "Microsoft", 178 | id == 22111 ~ "Johnson and Johnson (liquid)", 179 | id == 55976 ~ "Walmart (liquid)", 180 | TRUE ~ as.character(id) 181 | ) 182 | ) 183 | position <- position[year(eom)>=2015] 184 | position <- pfs[, .(type, "eom"=eom_ret %>% ceiling_date(unit="m")-1, inv)][position, on = .(type, eom)] 185 | position[, w_z := (w-mean(w))/sd(w), by = .(type, id)] 186 | 187 | (output$example <- position %>% 188 | mutate(type = type %>% factor(levels = pf_order)) %>% 189 | ggplot(aes(eom, w, colour=stock_type, linetype = stock_type)) + 190 | # geom_point(alpha=0.2) + 191 | geom_line() + 192 | geom_hline(yintercept = 0) + 193 | facet_wrap(~type, scales = "free_y") + 194 | labs(y = "Portfolio weight") + 195 | theme( 196 | legend.position = "top", 197 | legend.title = element_blank(), 198 | legend.justification = "center", 199 | strip.background = element_rect(fill = "white", color = "black"), 200 | axis.title.x = element_blank() 201 | )) 202 | 203 | if (FALSE) { 204 | position %>% 205 | mutate(tc_type = (type %in% c("Markowitz-ML", "Portfolio Sort"))) %>% 206 | filter(type != "Portfolio Sort") %>% 207 | ggplot(aes(eom, w_z, colour=type)) + # , alpha = tc_type 208 | # geom_point(alpha=0.2) + 209 | geom_line(alpha=0.6) + 210 | # scale_alpha_discrete(range = c(0.6, 0.35)) + 211 | geom_hline(yintercept = 0) + 212 | facet_wrap(~stock_type) 213 | } 214 | 215 | # Optimal Hyper-parameters ---------------------------- 216 | model_1 <- readRDS(paste0(get_from_path_model, "/model_1.RDS")) 217 | model_6 <- readRDS(paste0(get_from_path_model, "/model_6.RDS")) 218 | model_12 <- readRDS(paste0(get_from_path_model, "/model_12.RDS")) 219 | (output$er_tuning <- rbind( 220 | model_1 %>% lapply(function(x) { 221 | x$pred[, .(eom_ret = eom+1+months(1)-1)] %>% cbind(x$opt_hps[, .(lambda, p, g)]) %>% unique() 222 | }) %>% rbindlist() %>% filter(!is.na(eom_ret)) %>% mutate(horizon = "Return t+1"), 223 | model_6 %>% lapply(function(x) { 224 | x$pred[, .(eom_ret = eom+1+months(1)-1)] %>% cbind(x$opt_hps[, .(lambda, p, g)]) %>% unique() 225 | }) %>% rbindlist() %>% filter(!is.na(eom_ret)) %>% mutate(horizon = "Return t+6"), 226 | model_12 %>% lapply(function(x) { 227 | x$pred[, .(eom_ret = eom+1+months(1)-1)] %>% cbind(x$opt_hps[, .(lambda, p, g)]) %>% unique() 228 | }) %>% rbindlist() %>% filter(!is.na(eom_ret)) %>% 
mutate(horizon = "Return t+12") 229 | ) %>% 230 | mutate(lambda = log(lambda)) |> 231 | rename("eta"=g, "log(lambda)"=lambda) %>% 232 | pivot_longer(-c(horizon, eom_ret)) %>% 233 | mutate( 234 | comb_name = paste0("'", horizon, ":'~", name) %>% factor(levels = c( 235 | paste0("'Return t+1:'~", c("log(lambda)", "p", "eta")), 236 | paste0("'Return t+6:'~", c("log(lambda)", "p", "eta")), 237 | paste0("'Return t+12:'~", c("log(lambda)", "p", "eta"))) 238 | ) 239 | ) %>% 240 | ggplot(aes(eom_ret, value)) + 241 | geom_point(alpha=0.75, colour = colours_theme[10]) + 242 | facet_wrap(~comb_name, scales = "free_y", labeller = label_parsed) + 243 | labs(y = "Optimal hyper-parameter") + 244 | theme( 245 | legend.position = "none", 246 | axis.title.x = element_blank(), 247 | text = element_text(size=9), 248 | strip.background = element_rect(fill = "white", color = "black"), 249 | axis.text = element_text(size=8) 250 | )) 251 | 252 | (output$portfolio_tuning <- rbind( 253 | mp$hps[rank == 1 & year(eom_ret)>=1981 & month(eom_ret)==12, .(eom_ret, k, g, u)][, type := "Multiperiod-ML*"], 254 | static$hps[rank == 1 & year(eom_ret) >= 1981 & month(eom_ret)==12, .(eom_ret, k, g, u)][, type := "Static-ML*"] 255 | ) %>% 256 | rename("v"=g) %>% 257 | pivot_longer(-c(type, eom_ret)) %>% 258 | rbind(pfml$best_hps[eom_ret>=1981, .(type = "Portfolio-ML", eom_ret, "log(lambda)"=log(l), p, "eta"=g)] %>% pivot_longer(-c(type, eom_ret))) %>% 259 | mutate( 260 | comb_name = paste0("'", type, ":'~", name) %>% factor(levels = c(paste0("'Portfolio-ML:'~", c("log(lambda)", "p", "eta")), 261 | paste0("'Multiperiod-ML*:'~", c("k", "u", "v")), 262 | paste0("'Static-ML*:'~", c("k", "u", "v")))) 263 | ) %>% 264 | ggplot(aes(eom_ret, value, colour = type)) + 265 | geom_point(alpha=0.75) + 266 | facet_wrap(~comb_name, scales = "free_y", labeller = label_parsed) + 267 | labs(y = "Optimal hyper-parameter") + 268 | theme( 269 | legend.position = "none", 270 | axis.title.x = element_blank(), 271 | text = element_text(size=9), 272 | strip.background = element_rect(fill = "white", color = "black"), 273 | axis.text = element_text(size=8) 274 | )) 275 | 276 | # AR1 plot ------------------------------------------- 277 | ar1 <- chars[order(id, eom)] 278 | ar1[, lag_ok := (eom - lag(eom)) %in% c(28:31), by = id] 279 | 280 | ar1_ss <- features %>% map(.progress = T, function(x) { 281 | ar1[, var := get(x)] 282 | ar1[, var_l1 := lag(var), by = id] 283 | sub <- ar1[!is.na(var) & !is.na(var_l1) & lag_ok == T & var != 0.5 & var_l1 != 0.5 & valid==T][, n := .N, by = id][n >= 12*5] # 0.5 indicates missing 284 | sub[, .( 285 | n = .N, 286 | ar1 = cor(var, var_l1) 287 | ), by = id][is.na(ar1), ar1 := 1][, .( 288 | char = x, 289 | ar1 = mean(ar1) 290 | )] 291 | }) %>% 292 | rbindlist() 293 | temp_order <- cluster_labels[, .("char"=characteristic, cluster)][ar1_ss, on = "char"][, mean(ar1), by = cluster][order(V1)]$cluster %>% 294 | str_replace_all("_", " ") %>% str_replace("short term", "short-term") %>% str_to_title() 295 | (output$ar1 <- cluster_labels[, .("char"=characteristic, cluster)][ar1_ss, on = "char"] %>% 296 | group_by(cluster) %>% 297 | mutate(sort_var = mean(ar1)+ar1/100000) %>% 298 | mutate( 299 | pretty_name = cluster %>% str_replace_all("_", " ") %>% str_replace("short term", "short-term") %>% str_to_title() %>% 300 | factor(levels = temp_order) 301 | ) %>% 302 | ggplot(aes(reorder(char, sort_var), ar1, fill=pretty_name)) + 303 | geom_col() + 304 | coord_flip() + 305 | labs(y = "Average Monthly Autocorrelation", fill = "Theme") + 306 | 
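# Note on the AR(1) measure plotted here: for each characteristic, ar1_ss computes the
# first-order autocorrelation stock by stock, using only consecutive month-ends (lag_ok),
# dropping the 0.5 missing-value code, and requiring at least 5 years (12*5 months) of valid
# observations per stock; stocks with a constant characteristic (cor = NA) are assigned an
# AR(1) of 1, and the figure shows the cross-sectional average by characteristic, grouped
# into themes. Persistence presumably matters here because slow-moving characteristics imply
# slower-moving positions and hence lower turnover and trading costs.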
guides(fill = guide_legend(reverse = TRUE)) + 307 | theme( 308 | axis.title.y = element_blank() 309 | )) 310 | 311 | # Features with sufficient coverage ------------------ 312 | if (FALSE) { 313 | features_all <- c(features_m, feat_excl) 314 | 315 | ids <- c("source_crsp", "common", "obs_main", "primary_sec", "exch_main", "id", "eom", "sic", "ff49", "size_grp", "me", "rvol_21d", "dolvol_126d") 316 | data <- fread(paste0("../Data/usa.csv"), 317 | select = unique(c(ids, features_all)), #, "excntry" 318 | colClasses = c("eom" = "character", "sic"="character")) 319 | data[, eom := eom %>% lubridate::fast_strptime(format = "%Y%m%d") %>% as.Date()] 320 | data[, dolvol := dolvol_126d] 321 | data[, rvol_m := rvol_252d * sqrt(21)] 322 | data <- data[source_crsp == 1 & common==1 & obs_main==1 & primary_sec==1 & exch_main==1][, c("source_crsp", "common", "obs_main", "primary_sec", "exch_main") := NULL] 323 | # Screen Data ----------------- 324 | # Start Date Screens 325 | print(paste0(" Start date screen excludes ", round(mean(data$eom < settings$screens$start) * 100, 2), "% of the observations")) 326 | data <- data[eom >= settings$screens$start] 327 | # Monitor screen impact 328 | n_start <- nrow(data) 329 | me_start <- sum(data$me, na.rm = T) 330 | # Require me, dolvol and rvol 331 | print(paste0(" Non-missing me excludes ", round(mean(is.na(data$me)) * 100, 2), "% of the observations")) 332 | data <- data[!is.na(me)] 333 | if (settings$screens$require_rvol) { 334 | print(paste0(" Non-missing rvol_252d excludes ", round(mean(is.na(data$rvol_m)) * 100, 2), "% of the observations")) 335 | data <- data[!is.na(rvol_m)] 336 | } 337 | if (settings$screens$require_dolvol) { 338 | print(paste0(" Non-missing dolvol_126d excludes ", round(mean(is.na(data$dolvol)) * 100, 2), "% of the observations")) 339 | data <- data[!is.na(dolvol)] 340 | } 341 | # Size Screen 342 | print(paste0(" Size screen excludes ", round(mean(!(data$size_grp %in% settings$screens$size_grps)) * 100, 2), "% of the observations")) 343 | data <- data[size_grp %in% settings$screens$size_grps] 344 | # Feature Screens 345 | feat_available <- data %>% select(all_of(features_all)) %>% apply(1, function(x) sum(!is.na(x))) 346 | min_feat <- floor(length(features_all)*settings$screens$feat_pct) 347 | print(paste0(" At least ", settings$screens$feat_pct*100, "% of feature excludes ", round(mean(feat_available < min_feat)*100, 2), "% of the observations")) 348 | data <- data[feat_available >= min_feat] 349 | # Summary 350 | print(paste0(" In total, the final dataset has ", round( (nrow(data) / n_start)*100, 2), "% of the observations and ", round((sum(data$me) / me_start)*100, 2), "% of the market cap in the post ", settings$screens$start, " data")) 351 | 352 | # Check coverage by eom [Seems like we could include seas_16_20na and seas_16_20an] 353 | coverage <- data[, lapply(.SD, function(x) mean(!is.na(x))), by = eom, .SDcols=features_all] 354 | coverage[eom==min(eom)] %>% 355 | pivot_longer(-eom) %>% 356 | arrange(value) 357 | } 358 | -------------------------------------------------------------------------------- /6 - Economic intuition.R: -------------------------------------------------------------------------------- 1 | # How do weigths differ for PF-ML, Market, and Markowitz 2 | d <- as.Date("2020-11-30") 3 | w_ex <- rbind( 4 | static$w[eom==d, .(id, eom, w, type = "Static-ML*")], 5 | pfml$w[eom==d, .(id, eom, w, type = "Portfolio-ML")], 6 | mkt$w[eom==d, .(id, eom, w, type = "Market")], 7 | tpf$w[eom==d, .(id, eom, w, type = 
"Markowitz-ML")] 8 | ) 9 | w_ex <- chars[, .(id, eom, dolvol)][w_ex, on = .(id, eom)] 10 | 11 | w_ex[, n := .N, by = .(id, eom)] 12 | w_ex <- w_ex[n==max(n)] 13 | 14 | set.seed(settings$seed_no) 15 | sample_ids <- sample(unique(w_ex$id), 100) 16 | 17 | # w_ex %>% mutate(w = abs(w)) %>% pivot_wider(names_from = type, values_from = w) %>% select(-id, -n) %>% cor(method = "spearman") 18 | 19 | w_ex[, cor(dolvol, abs(w), method = "spearman"), by = type] 20 | 21 | w_ex %>% 22 | group_by(id) %>% 23 | # mutate(sort_var = w[type=="Market"]) %>% 24 | filter(id %in% sample_ids) %>% 25 | filter(type %in% c("Markowitz-ML", "Portfolio-ML")) %>% 26 | ggplot(aes(reorder(id, dolvol), abs(w))) + 27 | geom_col() + 28 | # geom_point() + 29 | coord_flip() + 30 | facet_wrap(~type, scales = "free_x") + 31 | labs(x = "Absolute portfolio weight", y = "Dollar volume rank") 32 | 33 | w_ex %>% 34 | group_by(type) %>% 35 | mutate(dolvol_rank = frank(dolvol)) %>% 36 | ggplot(aes(dolvol_rank, abs(w))) + 37 | geom_point() + 38 | geom_smooth(se=F) + 39 | facet_wrap(~type, scales = "free_y") 40 | 41 | 42 | 43 | # Portfolio analysis ---------- 44 | w_ex <- rbind( 45 | static$w[, .(id, eom, w, type = "Static-ML*")], 46 | pfml$w[, .(id, eom, w, type = "Portfolio-ML")], 47 | mkt$w[, .(id, eom, w, type = "Market")], 48 | tpf$w[, .(id, eom, w, type = "Markowitz-ML")] 49 | ) 50 | w_ex <- chars[, .(id, eom, dolvol)][w_ex, on = .(id, eom)] 51 | 52 | w_ex[, n := .N, by = .(id, eom)] 53 | w_ex <- w_ex[n==max(n)] 54 | 55 | w_ex[, dv_pf := ceiling(ecdf(dolvol)(dolvol)*10), by = .(type, eom)] 56 | 57 | w_ex[, .( 58 | w_abs = mean(abs(w)) 59 | ), by = .(type, eom, dv_pf)][, .( 60 | w_abs = mean(w_abs) 61 | ), by = .(type, dv_pf)] %>% 62 | filter(type %in% c("Markowitz-ML", "Portfolio-ML", "Market")) %>% 63 | ggplot(aes(factor(dv_pf), w_abs)) + 64 | geom_col(fill = colours_theme[1]) + 65 | facet_wrap(~type, scales = "free_y", ncol = 3) + 66 | labs(x = "Dollar volume sorted portfolios (1=low)") 67 | 68 | 69 | w_ex[, long := if_else(w>=0, "Long positions", "Short positions")] 70 | (output$w_liq <- w_ex[, .( 71 | w_abs = mean(w) 72 | ), by = .(type, eom, dv_pf, long)][, .( 73 | w_abs = mean(w_abs) 74 | ), by = .(type, dv_pf, long)] %>% 75 | filter(type %in% c("Markowitz-ML", "Portfolio-ML", "Market")) %>% 76 | ggplot(aes(factor(dv_pf), w_abs, fill=long)) + 77 | geom_col() + 78 | facet_wrap(~type, scales = "free_y", ncol = 3) + 79 | # scale_x_continuous(breaks = c(1, seq(5, 20, 5))) + 80 | labs(x = "Dollar volume sorted portfolios (1=low)", y = "Average stock weight") + 81 | theme( 82 | legend.position = "top", 83 | legend.title = element_blank() 84 | )) 85 | -------------------------------------------------------------------------------- /6 - Feature importance.R: -------------------------------------------------------------------------------- 1 | # FI in base case ----------------------- 2 | fi_path <- "Data/Generated/Portfolios/FI/" 3 | fi_folder <- paste0(fi_path, list.files(fi_path)) 4 | 5 | tpf_cf_base <- fread(paste0(fi_folder, "/tpf_cf_base.csv")) 6 | pfml_cf_base <- fread(paste0(fi_folder, "/pfml_cf_base.csv")) 7 | ret_cf <- fread(paste0(fi_folder, "/ret_cf.csv")) 8 | pfml_cf_ief <- fread(paste0(fi_folder, "/pfml_cf_ief.csv")) 9 | 10 | pfml_cf_ss <- pfml_cf_base %>% 11 | group_by(type, cluster) %>% 12 | summarise(cf_obj = (mean(r)-0.5*var(r)*pf_set$gamma_rel-mean(tc))*12) %>% 13 | ungroup() %>% 14 | mutate(fi = cf_obj[cluster=="bm"] - cf_obj) |> 15 | select(type, cluster, fi) 16 | 17 | 18 | tpf_cf_ss <- tpf_cf_base %>% 19 | 
group_by(type, cluster) %>% 20 | summarise(cf_obj = (mean(r)-0.5*var(r)*pf_set$gamma_rel-mean(tc))*12) %>% 21 | ungroup() %>% 22 | mutate(fi = cf_obj[cluster=="bm"]-cf_obj) |> 23 | select(type, cluster, fi) 24 | 25 | (output$feature_importance <- rbind( 26 | pfml_cf_ss %>% mutate(wealth = 1e10), 27 | tpf_cf_ss %>% mutate(wealth = 0)#, 28 | # shap_er1 %>% mutate(type = "Expected 1m Return") %>% select(type, cluster, fi) %>% mutate(wealth = 0) 29 | ) %>% 30 | filter(cluster != "bm") |> 31 | group_by(type) %>% 32 | # mutate(fi = fi / max(fi)) %>% 33 | group_by(cluster) %>% 34 | mutate( 35 | sort_var = sum(fi*(type == "Portfolio-ML")), 36 | # sort_var = sum(fi*(type == "Expected 1m Return")), 37 | cluster = cluster %>% str_replace_all("_", " ") %>% str_replace("short term", "short-term") %>% str_to_title(), 38 | type = type %>% factor(levels = c("Portfolio-ML", "Multiperiod-ML*", "Markowitz-ML", "Expected 1m Return")) 39 | ) %>% 40 | filter(type != "Expected 1m Return") %>% 41 | ggplot(aes(reorder(cluster, sort_var), fi, fill = type)) + 42 | geom_col(position = "dodge") + 43 | scale_fill_manual(values = c(colours_theme[1], colours_theme[2], colours_theme[5])) + 44 | coord_flip() + 45 | facet_wrap(~type, nrow = 1, scales = "free_x") + 46 | theme( 47 | legend.position = "none", 48 | axis.title.y = element_blank(), 49 | strip.background = element_rect(fill = "white", color = "black") 50 | ) + 51 | labs(y = "Drop in realized utility from permuting theme features")) 52 | 53 | 54 | # FI in IEF ------------------------- 55 | # Summary stats 56 | ef_cf_ss <- pfml_cf_ief[, .( 57 | obj = (mean(r)-0.5*var(r)*gamma_rel-mean(tc))*12, 58 | r_tc = mean(r-tc)*12, 59 | sd = sd(r)*sqrt(12) 60 | ), by = .(gamma_rel, cluster)][, sr := r_tc / sd] %>% rename("shuffled"=cluster) 61 | # Add benchmark 62 | ef_cf_ss <- ef_cf_ss %>% rbind(ef_ss[wealth_end==pf_set$wealth, .(gamma_rel, shuffled = "none", obj, r_tc, sd, sr)]) 63 | 64 | # With trading costs 65 | sub <- c("Quality", "Value", "Short-Term Reversal", "None") 66 | (output$cf_ef_tc <- ef_cf_ss %>% 67 | rbind(expand.grid(shuffled = unique(ef_cf_ss$shuffled), sd=0, r_tc=0), fill=T) %>% 68 | mutate( 69 | shuffled = shuffled %>% str_replace_all("_", " ") %>% str_replace_all("short term", "short-term") %>% str_to_title(), 70 | shuffled = shuffled %>% factor(levels = c("None", "Quality", "Value", "Short-Term Reversal", "Momentum")) 71 | ) %>% 72 | filter(shuffled %in% sub) %>% 73 | ggplot(aes(sd, r_tc, colour = factor(shuffled), group = factor(shuffled), shape = factor(gamma_rel), linetype=factor(shuffled))) + 74 | geom_line() + 75 | geom_point(size=2) + 76 | # coord_cartesian(xlim = c(0, 0.6), ylim = c(0, 0.5)) + 77 | coord_cartesian(xlim = c(0, 0.25), ylim = c(0, 0.28), expand=F) + 78 | # scale_x_continuous(breaks = seq(0, 0.3, 0.1)) + 79 | labs(x = "Volatility", y = "Excess returns (net of trading cost)", shape = "Relative Risk Aversion:", 80 | colour = "Theme permuted:", linetype = "Theme permuted:") + 81 | guides( 82 | colour = guide_legend(order = 1, override.aes = list(size=1, shape=NA)), 83 | linetype = guide_legend(order = 1), 84 | shape = "none" #guide_legend(order = 2, label.hjust = 1) 85 | ) + 86 | theme( 87 | legend.position = c(0.5, 0.97), 88 | legend.direction="horizontal", 89 | legend.justification = "center" 90 | )) 91 | 92 | # Counterfactual EF without TC -------------------------- 93 | tpf_cf_ss <- tpf_cf_base[, .( 94 | sr = mean(r)/sd(r)*sqrt(12) 95 | ), by = cluster][cluster=="bm", cluster := "none"] 96 | 97 | x_values <- data.table(sd = seq(0, 
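# The counterfactual frontier without trading costs is linear: with costless leverage,
# scaling the Markowitz-ML weights scales mean and volatility proportionally, so expected
# excess return is simply sd * sr at every volatility level. Crossing the per-cluster gross
# Sharpe ratios in tpf_cf_ss with the volatility grid defined below therefore traces out one
# straight line per permuted theme in the cf_ef_markowitz figure.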
0.35, 0.01)) 98 | 99 | (output$cf_ef_markowitz <- tidyr::crossing(tpf_cf_ss, x_values) %>% 100 | mutate( 101 | ret = sd*sr, 102 | shuffled = cluster %>% str_replace_all("_", " ") %>% str_replace_all("short term", "short-term") %>% str_to_title(), 103 | shuffled = shuffled %>% factor(levels = c("None", "Quality", "Value", "Short-Term Reversal", "Momentum")) 104 | ) %>% 105 | filter(shuffled %in% sub) %>% 106 | ggplot(aes(sd, ret, colour = factor(shuffled), group = factor(shuffled), linetype = factor(shuffled))) + 107 | geom_line() + 108 | # coord_cartesian(xlim = c(0.15, 0.3), ylim = c(0.25, 0.5)) + 109 | coord_cartesian(xlim = c(0, 0.25), ylim = c(0, 0.55), expand=F) + 110 | # coord_cartesian(xlim = c(0, 0.165), ylim = c(0, 0.35), expand=F) + 111 | labs(x = "Volatility", y = "Excess returns", colour = "Theme permuted:", linetype = "Theme permuted:") + 112 | guides( 113 | colour = guide_legend(order = 1, override.aes = list(size=1)), 114 | linetype = guide_legend(order = 1) 115 | ) + 116 | theme( 117 | legend.position = c(0.5, 0.97), 118 | legend.direction="horizontal" 119 | )) 120 | 121 | # Feature importance for return predictions models -------------------------------- 122 | ret_cf_ss <- ret_cf[, .(mse = mean(mse)), by = .(h, cluster)] 123 | bm <- ret_cf_ss[cluster == "bm", .(h, bm=mse)] 124 | ret_cf_ss <- bm[ret_cf_ss[cluster != "bm"], on = "h"] 125 | ret_cf_ss[, fi := mse-bm] 126 | 127 | if (TRUE) { 128 | (output$fi_returns <- ret_cf_ss %>% 129 | mutate( 130 | cluster = cluster %>% str_replace_all("_", " "), 131 | cluster = cluster %>% str_replace_all("short term", "short-term"), 132 | cluster = cluster %>% str_to_title() 133 | ) %>% 134 | group_by(h) %>% 135 | mutate(fi = fi / max(fi)) %>% 136 | group_by(cluster) %>% 137 | mutate(sort_var = fi[h==1]) %>% 138 | ggplot(aes(reorder(cluster, sort_var), fi, fill=factor(h))) + 139 | geom_col(position = "dodge") + 140 | # Scale_y_continuous with percentages 141 | scale_y_continuous(labels = scales::label_percent(), breaks = seq(-0.2, 1, 0.2)) + 142 | geom_hline(yintercept = 1, linetype = "dashed", color = "black") + 143 | labs(y = "Drop in MSE from permuting theme features (% of max)", fill = "Horizon") + # Otherwise difficult to visualize due to small x-axis values 144 | theme( 145 | axis.title.x = element_blank() 146 | )) 147 | } else { 148 | (output$fi_returns <- ret_cf_ss %>% 149 | mutate( 150 | title = paste0("Horizon: ", h, " month") |> factor(levels = paste0("Horizon: ", 1:12, " month")), 151 | cluster = cluster %>% str_replace_all("_", " "), 152 | cluster = cluster %>% str_replace_all("short term", "short-term"), 153 | cluster = cluster %>% str_to_title() 154 | ) %>% 155 | group_by(h) %>% 156 | mutate(fi = fi / max(fi) * 100) %>% 157 | group_by(cluster) %>% 158 | mutate(sort_var = fi[h==1]) %>% 159 | ggplot(aes(reorder(cluster, sort_var), fi)) + 160 | geom_col(fill = colours_theme[1]) + 161 | coord_flip() + 162 | facet_wrap(~title, scales = "free_x") + 163 | labs(y = "Drop in MSE from permuting theme features (% of max)") + # Otherwise difficult to visualize due to small x-axis values 164 | theme( 165 | axis.title.y = element_blank() 166 | )) 167 | } 168 | 169 | # Wierdly, ret_1_0 strongly predicts t+12? But note that it's with the opposite sign of t+1 (I know! 
It's the seasonality effect) 170 | melt(chars[!is.na(pred_ld1), c("id", "eom", paste0("pred_ld", 1:12), "ret_1_0", "ret_12_1", "be_me", "gp_at")], id.vars = c("id", "eom", paste0("pred_ld", 1:12)))[, lapply(.SD, function(x) cor(value, x)), .SDcols = paste0("pred_ld", 1:12), by = .(variable, eom)][, lapply(.SD, mean), by = variable] |> 171 | select(-eom) |> 172 | pivot_longer(-variable, names_to = "h", values_to = "cor") |> 173 | mutate(h = h |> str_remove("pred_ld") |> as.numeric()) |> 174 | ggplot(aes(h, cor, colour = variable)) + 175 | geom_hline(yintercept = 0) + 176 | geom_line() + 177 | geom_point() + 178 | theme( 179 | legend.position = "top" 180 | ) 181 | -------------------------------------------------------------------------------- /6 - Implementable efficient frontier.R: -------------------------------------------------------------------------------- 1 | # Generate HML/Markowitz for multiple volaility levels ----------------------------------- 2 | vol_range <- seq(0, 0.5, 0.01) 3 | 4 | # Factor-ML 5 | factor_base <- chars %>% factor_ml_implement(dates = dates_oos, n_pfs = settings$factor_ml$n_pfs, wealth = wealth, gam = pf_set$gamma_rel) 6 | factor_base_vol <- factor_base$pf[, sd(r)*sqrt(12)] 7 | factor_ef <- vol_range %>% map(.progress = T, function(vol_target) { 8 | scale <- vol_target / factor_base_vol # Target same vol as m2 9 | copy(factor_base$w)[, w := w*scale][, w_start:=w_start*scale] %>% pf_ts_fun(data = chars, wealth = wealth, gam = pf_set$gamma_rel) %>% mutate(vol_target = vol_target) 10 | }) %>% rbindlist() %>% mutate(type = "Factor-ML") 11 | factor_ss <- factor_ef[, .( 12 | n = .N, 13 | inv = mean(inv), 14 | to = mean(turnover), 15 | r = mean(r)*12, 16 | sd = sd(r)*sqrt(12), 17 | sr_gross = mean(r)/sd(r)*sqrt(12), 18 | tc = mean(tc)*12, 19 | r_tc = mean((r-tc))*12, 20 | sr = mean(r-tc)/sd(r)*sqrt(12), 21 | obj = (mean(r)-0.5*var(r)*pf_set$gamma_rel-mean(tc))*12 22 | ), by = .(vol_target)] 23 | 24 | # Markowitz-ML 25 | tpf_base <- chars |> tpf_implement(cov_list = barra_cov, wealth = wealth, dates = dates_oos, gam = pf_set$gamma_rel) 26 | tpf_base_vol <- tpf_base$pf[, sd(r)*sqrt(12)] 27 | tpf_ef <- vol_range %>% lapply(function(vol_target) { 28 | scale <- vol_target / tpf_base_vol # Target same vol as m2 29 | copy(tpf_base$w)[, w := w*scale][, w_start:=w_start*scale] %>% pf_ts_fun(data = chars, wealth = wealth, gam = pf_set$gamma_rel) %>% mutate(vol_target = vol_target) 30 | }) %>% rbindlist() %>% mutate(type = "Markowitz-ML") 31 | tpf_ss <- tpf_ef[, .( 32 | n = .N, 33 | inv = mean(inv), 34 | to = mean(turnover), 35 | r = mean(r)*12, 36 | sd = sd(r)*sqrt(12), 37 | sr_gross = mean(r)/sd(r)*sqrt(12), 38 | tc = mean(tc)*12, 39 | r_tc = mean((r-tc))*12, 40 | sr = mean(r-tc)/sd(r)*sqrt(12), 41 | obj = (mean(r)-0.5*var(r)*pf_set$gamma_rel-mean(tc))*12 42 | ), by = .(vol_target)] 43 | 44 | # Mean-variance efficient frontier of risky assets --------------------------------------------- 45 | u_vec <- c(seq(-0.5, 0.5, 0.05), 0.6, 0.75, 1, 2) / 12 46 | wealth_0 <- wealth_func(wealth_end = 0, end = settings$split$test_end, market = market, risk_free = risk_free) 47 | mv_risky_ef <- chars |> mv_risky_fun(cov_list=barra_cov, wealth=wealth_0, dates=dates_oos, gam=pf_set$gamma_rel, u_vec) 48 | mv_ss <- mv_risky_ef[, .( 49 | n = .N, 50 | inv = mean(inv), 51 | to = mean(turnover), 52 | r = mean(r)*12, 53 | sd = sd(r)*sqrt(12), 54 | sr_gross = mean(r)/sd(r)*sqrt(12), 55 | tc = mean(tc)*12, 56 | r_tc = mean((r-tc))*12, 57 | sr = mean(r-tc)/sd(r)*sqrt(12), 58 | obj = 
(mean(r)-0.5*var(r)*pf_set$gamma_rel-mean(tc))*12 59 | ), by = .(u)] 60 | 61 | # Get IEF portfolios --------------------------------------------------------------------------- 62 | ief_path <- "Data/Generated/Portfolios/IEF/" 63 | ief_pfs <- list.files(path = ief_path) |> map(.progress = T, function(x) { 64 | w <- x |> str_extract("(?<=WEALTH)[^_]+") |> as.numeric() 65 | g <- x |> str_extract("(?<=GAMMA)[^_]+") |> as.numeric() 66 | # Portfolios 67 | rbind( 68 | fread(paste0(ief_path, x, "/bms.csv")) |> mutate(eom_ret=as.Date(eom_ret)), 69 | readRDS(paste0(ief_path, x, "/static-ml.RDS"))$pf, 70 | readRDS(paste0(ief_path, x, "/portfolio-ml.RDS"))$pf, 71 | readRDS(paste0(ief_path, x, "/static-ml.RDS"))$hps[eom_ret %in% readRDS(paste0(ief_path, x, "/static-ml.RDS"))$pf$eom_ret & k==1 & g==0 & u==1, .(eom_ret=as.Date(eom_ret), inv, shorting, turnover, r, tc)][, type := "Static-ML"] 72 | ) |> mutate(wealth_end=w, gamma_rel=g) 73 | }) |> rbindlist() 74 | stopifnot(nrow(ief_pfs[, .N, by = .(wealth_end, gamma_rel, type, eom_ret)][N!=1])==0)# Check for duplicates 75 | 76 | ief_ss <- ief_pfs[, .( 77 | inv = mean(inv), 78 | to = mean(turnover), 79 | r = mean(r)*12, 80 | sd = sd(r)*sqrt(12), 81 | sr_gross = mean(r)/sd(r)*sqrt(12), 82 | tc = mean(tc)*12, 83 | r_tc = mean((r-tc))*12, 84 | sr = mean(r-tc)/sd(r)*sqrt(12), 85 | obj = (mean(r)-0.5*var(r)*gamma_rel-mean(tc))*12 86 | ), by = .(type, wealth_end, gamma_rel)] 87 | 88 | ef_ss <- ief_ss[type=="Portfolio-ML"] 89 | 90 | # Build inputs for IEF ----------------------------------------------------------------------- 91 | # Summary stats 92 | ef_all_ss <- rbind( 93 | ief_ss[type %in% c("Portfolio-ML", "Static-ML*", "Static-ML")], 94 | factor_ss |> mutate(gamma_rel = NA, type = "Factor-ML", wealth_end=pf_set$wealth), 95 | tpf_ss |> mutate(gamma_rel = NA, type = "Markowitz-ML", wealth_end=pf_set$wealth), 96 | fill=T 97 | ) 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | # Indifference curves ------- 106 | points <- ef_all_ss[ 107 | (type %in% c("Portfolio-ML") & wealth_end == pf_set$wealth & gamma_rel==pf_set$gamma_rel) | 108 | (type %in% c("Static-ML*") & wealth_end == pf_set$wealth & gamma_rel==pf_set$gamma_rel) | 109 | (type %in% c("Static-ML") & wealth_end == pf_set$wealth & gamma_rel==pf_set$gamma_rel) #| 110 | # (type %in% c("Factor-ML") & wealth_end == pf_set$wealth & vol_target==0.01) 111 | ] 112 | 113 | indifference_curves <- 1:nrow(points) %>% lapply(function(i) { 114 | u_target <- points[i, obj] 115 | sd_target <- points[i, sd] 116 | # vol_space <- c( 117 | # seq(from = floor(sd_target*100)/100, by = -0.01, length.out = 8), 118 | # sd_target, 119 | # seq(from = ceiling(sd_target*100)/100, by = 0.01, length.out = 8) 120 | # ) %>% sort() 121 | # vol_space <- pmax(vol_space, 0) %>% unique() 122 | seq(0, 0.4, 0.01) %>% lapply(function(v) { 123 | data.table(sd = v, r_tc = u_target + pf_set$gamma_rel/2*v^2) 124 | }) %>% rbindlist() %>% mutate(u = u_target) 125 | }) %>% rbindlist() 126 | 127 | # Figure 1A ---------------- 128 | tpf_slope <- ef_all_ss[type=="Markowitz-ML"]$sr_gross[2] # All the same (except first which is NAN) 129 | static_raw <- ef_all_ss[(type %in% c("Static-ML") & wealth_end == pf_set$wealth & gamma_rel==pf_set$gamma_rel)] 130 | # Plot settings 131 | ef_y_low <- -0.2 132 | tpf_col <- 10 133 | ef_xmax <- 0.35 134 | plot_txt_size <- 2.5 135 | sta_1l_x <- 0.265 136 | sta_1l_y <- static_raw$r_tc 137 | 138 | 139 | 140 | ef_all_ss <- ef_all_ss |> 141 | mutate(type = type %>% factor(levels = c(main_types, "Markowitz-ML (gross)"))) 142 | 143 | 
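# The indifference curves above come from the certainty-equivalent objective used throughout,
# obj = E[r] - E[tc] - (gamma/2)*Var[r] (annualized with *12 in the summary tables): holding
# utility fixed at u_target and solving for net-of-cost returns gives
# r_tc = u_target + (gamma/2)*sd^2, which is what `indifference_curves` tabulates on a
# volatility grid. Minimal sanity check of that identity (illustrative only; it uses only
# objects defined above and is not part of the original pipeline):
if (FALSE) {
  ic_dev <- indifference_curves[, abs(r_tc - (u + pf_set$gamma_rel/2*sd^2))]
  stopifnot(max(ic_dev) < 1e-12)
}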
(output$ef_all_wo_ic <- ef_all_ss %>% 144 | filter(type != "Static-ML") |> 145 | filter(wealth_end == pf_set$wealth) %>% 146 | rbind(expand.grid(type = c("Portfolio-ML", "Static-ML*"), r_tc=0, sd=0), fill=T) %>% 147 | rbind(data.table(type = "Markowitz-ML (gross)"), fill=T) %>% 148 | ggplot() + 149 | annotate("text", x=0.1, y=0.31, label = "Without TC", size=plot_txt_size, hjust=0.5) + 150 | annotate("segment", x=0.1, xend = 0.12, y=0.3, yend=0.2, arrow = arrow(length = unit(.1,"cm")), size=0.1, alpha=0.75) + 151 | annotate("segment", x=0.1, xend = 0.1, y=0.3, yend=0.22, arrow = arrow(length = unit(.1,"cm")), size=0.1, alpha=0.75) + 152 | geom_abline(intercept = 0, slope = tpf_slope, alpha = 1, colour = colours_theme[tpf_col]) + 153 | geom_line(aes(sd, r_tc, colour = type, group = type)) + 154 | coord_cartesian(xlim = c(0, ef_xmax), ylim = c(ef_y_low, 0.54), expand = F) + 155 | geom_point(aes(sd, r_tc, colour = type, shape = factor(gamma_rel), group = type), size=2) + 156 | geom_path(data = mv_ss[r_tc <= 0.7], aes(sd, r_tc), linetype = "dotted", colour = colours_theme[tpf_col]) + # colours_theme[match("Markowitz-ML", main_types)] 157 | scale_colour_manual(values = colours_theme[c(match(sort(unique(ef_all_ss$type)), main_types), tpf_col)]) + 158 | # annotate(geom = "point", y = static_raw$r_tc, x = static_raw$sd, shape = "square", size = 2) + 159 | # annotate("text", x=static_raw$sd-0.01, y=sta_1l_y, label = "Static-ML (one layer)", size=plot_txt_size, hjust=1, vjust=0.5) + 160 | # annotate("segment", x=static_raw$sd-0.008, xend = static_raw$sd-0.005, y=sta_1l_y, yend=sta_1l_y, arrow = arrow(length = unit(.1,"cm")), size=0.1, alpha=0.75) + 161 | annotate(geom = "point", y = static_raw$r_tc, x = static_raw$sd, shape = "square", size = 2) + 162 | annotate("text", x=static_raw$sd, y=0.05, label = "Static-ML (one layer)", size=plot_txt_size, hjust=0.5, vjust=0.5) + 163 | annotate("segment", x=static_raw$sd, xend = static_raw$sd, y=0.07, yend=static_raw$r_tc-0.02, arrow = arrow(length = unit(.1,"cm")), size=0.1, alpha=0.75) + 164 | labs(x = "Volatility", y = "Excess returns (net of trading cost)", shape = "Relative Risk Aversion:") + 165 | guides( 166 | colour = guide_legend(order = 1, override.aes = list(size=1, shape=NA)), 167 | shape = "none" #guide_legend(order = 2, label.hjust = 1) 168 | ) + 169 | theme( 170 | legend.title = element_blank(), 171 | legend.position = c(0.5, 0.97), 172 | legend.direction="horizontal", 173 | legend.text = element_text(size=8) 174 | ) + 175 | scale_x_continuous(expand = c(0, 0.01))) 176 | output$ef_all <- copy(output$ef_all_wo_ic) 177 | for (u_tgt in points$obj) { 178 | output$ef_all <- output$ef_all + geom_path(data = indifference_curves[u==u_tgt], aes(sd, r_tc), linetype = "dashed", alpha=0.40) 179 | } 180 | output$ef_all 181 | 182 | # Figure 1B -------- 183 | comb_data <- ef_ss %>% 184 | rbind(expand.grid(wealth_end=settings$ef$wealth, sd=0, r_tc=0), fill=T) %>% 185 | mutate( 186 | type = "tpf", 187 | wealth_end = if_else(wealth_end==1, 1, wealth_end) 188 | ) %>% 189 | rbind( 190 | mv_ss %>% mutate(wealth_end=1, type = "mv"), # %>% mutate(wealth_end=0, type = "mv", r_tc = r_tc+0.04)If we want risky frontier to be tangent for illustrative purposes 191 | # pf_summary %>% 192 | # filter(type %in% c("Portfolio Sort", "Markowitz")) %>% 193 | # mutate(wealth_end = pf_set$wealth, gamma_rel = pf_set$gamma_rel, r_tc = -Inf, sd = if_else(type == "Markowitz", Inf, sd), type = "bm"), 194 | fill=T 195 | ) %>% 196 | mutate(gamma_rel = gamma_rel %>% factor(levels = 
settings$ef$gamma_rel)) %>% 197 | filter(wealth_end %in% c(1, 1e9, 1e10, 1e11)) 198 | 199 | (output$ef <- comb_data %>% 200 | ggplot(aes(sd, r_tc, colour = factor(wealth_end), group = factor(wealth_end))) + 201 | geom_blank() + 202 | # geom_point(data = comb_data %>% filter(type == "bm"), aes(shape = gamma_rel), size=3) + 203 | geom_line(data = comb_data %>% filter(type == "tpf")) + 204 | geom_point(data = comb_data %>% filter(type == "tpf"), aes(shape = gamma_rel), size=2) + 205 | geom_path(data = comb_data %>% filter(type == "mv"), linetype = "dotted") + 206 | coord_cartesian(xlim = c(0, ef_xmax), ylim = c(0, 0.75), expand=F) + 207 | labs(x = "Volatility", y = "Excess returns (net of trading cost)", shape = "Relative Risk Aversion:", 208 | colour = "Wealth by 2020:") + 209 | guides( 210 | colour = guide_legend(order = 1, override.aes = list(size=1, shape=NA)), 211 | shape = "none" #guide_legend(order = 2, label.hjust = 1) 212 | ) + 213 | theme( 214 | legend.position = c(0.5, 0.97), 215 | legend.direction="horizontal", 216 | legend.title = element_text(size=8), 217 | legend.text = element_text(size=8) 218 | )) 219 | 220 | -------------------------------------------------------------------------------- /6 - Performance across size distribution.R: -------------------------------------------------------------------------------- 1 | # Download files -------------------------- 2 | size_cuts <- c("all", "low80_high100", "low60_high80", "low40_high60", "low20_high40", "low00_high20") 3 | size_path <- "Data/Generated/Portfolios/Size/" 4 | size_folders <- list.files(size_path) 5 | pf_by_size <- size_cuts |> map(function(x) { 6 | folder <- size_folders[str_detect(size_folders, x)] 7 | rbind( 8 | fread(paste0(size_path, folder, "/bms.csv")) |> mutate(eom_ret=as.Date(eom_ret)), 9 | readRDS(paste0(size_path, folder, "/static-ml.RDS"))$pf, 10 | readRDS(paste0(size_path, folder, "/multiperiod-ml.RDS"))$pf, 11 | readRDS(paste0(size_path, folder, "/portfolio-ml.RDS"))$pf 12 | ) |> mutate(size = x) 13 | }) |> 14 | rbindlist() |> 15 | mutate( 16 | size = case_when( 17 | size == "all" ~ "All", 18 | size == "low80_high100" ~ "Largest (80-100)", 19 | size == "low60_high80" ~ "Large (60-80)", 20 | size == "low40_high60" ~ "Mid (40-60)", 21 | size == "low20_high40" ~ "Small (20-40)", 22 | size == "low00_high20" ~ "Smallest (00-20)", 23 | ) |> factor(levels = c("All", "Largest (80-100)", "Large (60-80)", "Mid (40-60)", "Small (20-40)", "Smallest (00-20)")), 24 | type = if_else(type=="Rank-Weighted", "Rank-ML", type) 25 | ) 26 | 27 | # Performance ----------------------------- 28 | pf_by_size[, type := type %>% factor(levels = pf_order)] 29 | pf_by_size %>% setorder(size, type, eom_ret) 30 | pf_by_size[, e_var_adj := (r-mean(r))^2, by=.(size, type)] 31 | pf_by_size[, utility_t := r-tc-0.5*e_var_adj*pf_set$gamma_rel] 32 | pf_summary_size <- pf_by_size[, .( 33 | n = .N, 34 | inv = mean(inv), 35 | shorting = mean(shorting), 36 | turnover_notional = mean(turnover), 37 | r = mean(r)*12, 38 | sd = sd(r)*sqrt(12), 39 | sr_gross = mean(r)/sd(r)*sqrt(12), 40 | tc = mean(tc)*12, 41 | r_tc = mean((r-tc))*12, 42 | sr = mean(r-tc)/sd(r)*sqrt(12), 43 | obj = (mean(r)-0.5*var(r)*pf_set$gamma_rel-mean(tc))*12#, 44 | # obj_exp = (mean(e_r)-0.5*mean(e_var)*pf_set$gamma_rel-mean(tc))*12 45 | ), by = .(size, type)][order(size, type)] 46 | 47 | # Plots ----------------------------------- 48 | (output$by_size <- pf_summary_size |> 49 | mutate(obj = if_else(obj < -1000, -1000, obj)) |> # Helps iron out a bug in geom_col that makes 
Markowitz look weirds 50 | ggplot(aes(type, obj)) + 51 | geom_col(fill=colours_theme[1], position="dodge") + 52 | geom_hline(yintercept = 0) + 53 | facet_wrap(~size, ncol=3) + 54 | coord_cartesian(ylim = c(-0.05, NA)) + 55 | labs(y = "Realized Utility") + 56 | theme( 57 | axis.title.x = element_blank(), 58 | axis.text.x = element_text(angle = 45, hjust = 1), 59 | legend.position = "none" 60 | )) 61 | -------------------------------------------------------------------------------- /6 - RF example.R: -------------------------------------------------------------------------------- 1 | source("main.R") 2 | # Settings --- 3 | f <- function(x) { 4 | 10*x+0.5*x^2-0.01*x^3-1.08^x+100*sin(x) 5 | } 6 | set.seed(1) 7 | # Data 8 | n <- 1000 9 | x <- runif(n, -30, 70) 10 | y_true <- f(x) 11 | y <- y_true + rnorm(n, 0, 150) 12 | 13 | # True function and OLS approximation ---- 14 | tibble( 15 | x=seq(-30, 70, by = 1), 16 | y_true=f(x), 17 | y = y_true+rnorm(101, 0, 150) 18 | ) %>% 19 | ggplot(aes(x=x, y=y)) + 20 | geom_point() + 21 | geom_line(aes(y=y_true)) + 22 | geom_smooth(method = "lm", se=F) 23 | 24 | # Various RFF approximations ------------ 25 | set.seed(1) 26 | ps <- seq(2^1, 2^9, by = 2) 27 | lambdas <- c(exp(c(0:10))) 28 | # Train/test split 29 | train_idx <- sample(1:n, size = n/2, replace=F) 30 | # Predict multiple lambdas and collect in nice tibble 31 | pred_tidy <- function(fit, x_act, x, y, y_true, lambdas) { 32 | preds <- predict(fit, newx = x, s = lambdas) 33 | colnames(preds) <- log(lambdas) 34 | preds |> 35 | as_tibble() |> 36 | mutate(y = y, x = x_act, y_true=y_true) |> 37 | pivot_longer(-c(x,y, y_true), names_to = "log_lambda", values_to = "pred") 38 | } 39 | # Simulate 40 | log_gs <- -3:2 41 | data <- exp(log_gs) |> map(function(g) { 42 | rff_x <- as.matrix(x) %>% rff(p = max(ps), g = g) 43 | data <- ps %>% lapply(function(p) { 44 | rff_x_new <- p^(-0.5)*cbind(rff_x$X_cos[, 1:(p/2)], rff_x$X_sin[, 1:(p/2)]) 45 | # Fit on training data 46 | x_train <- rff_x_new[train_idx, ] 47 | y_train <- y[train_idx] 48 | fit <- glmnet(x=x_train, y=y_train, alpha=0, lambda = lambdas) 49 | # Predict on train data 50 | y_true_train <- y_true[train_idx] 51 | train_op <- fit |> 52 | pred_tidy(x = x_train, x_act = x[train_idx], y = y_train, y_true = y_true_train, lambdas = lambdas) |> 53 | mutate(split = "train") 54 | # Predict on test data (for R2) 55 | x_test <- rff_x_new[-train_idx, ] 56 | y_test <- y[-train_idx] 57 | y_true_test <- y_true[-train_idx] 58 | test_op <- fit |> pred_tidy(x = x_test, x_act = x[-train_idx], y = y_test, y_true = y_true_test, lambdas = lambdas) |> 59 | mutate(split = "test") 60 | # Output 61 | rbind(train_op, test_op) |> 62 | mutate(p = p) 63 | }) %>% bind_rows() |> mutate(g) 64 | }) |> 65 | bind_rows() |> 66 | mutate( 67 | log_lambda = log_lambda |> factor(levels = log(lambdas)), 68 | g_name = paste0("log(g): ", log(g)) |> factor(levels = paste0("log(g): ", log_gs)) 69 | ) 70 | 71 | # Figure: See fit ------ 72 | # By ridgeless 73 | (output$rff_specific0 <- data |> 74 | filter(split=="train") |> 75 | filter(log_lambda %in% c(0) & g %in% c(1), p %in% 2^(1:6)) |> 76 | pivot_longer(c(y_true, pred)) |> 77 | # filter(log_lambda %in% c(2:5) & g %in% c(1)) |> 78 | mutate(p = paste0("P=", p) %>% factor(levels = paste0("P=", ps))) |> 79 | mutate(y = if_else(name == "y_true", y, NA_real_)) |> 80 | mutate(name = case_when( 81 | name == "pred" ~ "RF prediction", 82 | name == "y_true" ~ "True function" 83 | )) |> 84 | ggplot(aes(x, y = value, colour = name)) + 85 | geom_point(aes(y=y), 
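# Background on the construction used in the simulation above (a sketch of the usual random
# Fourier features recipe; the exact sampling scheme lives in the repo's rff() helper,
# presumably in "0 - Return prediction functions.R", and is assumed rather than shown here):
# rff(p, g) draws random frequency vectors whose scale is governed by g and returns cosine
# and sine transforms of the projected inputs. Stacking the two blocks and scaling by
# p^(-0.5), as done above, yields p random features whose inner products approximate a
# smooth (Gaussian-type) kernel, so ridge regression on them (glmnet with alpha = 0)
# approximates kernel ridge regression: p controls the quality of the approximation and
# lambda the amount of shrinkage.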
colour = "grey", alpha=0.5) + 86 | geom_line() + 87 | # geom_line(aes(y=y_true), colour="black") + 88 | # geom_line(aes(y=pred)) + 89 | facet_wrap(~p) + 90 | labs(y="Value") + 91 | theme(legend.position = "top", legend.title = element_blank())) 92 | 93 | # By lambda 94 | (output$rff_specific1 <- data |> 95 | filter(split=="train") |> 96 | filter(log_lambda %in% c(0, 6, 10) & g %in% c(1), p %in% 2^(1:6)) |> 97 | # filter(log_lambda %in% c(2:5) & g %in% c(1)) |> 98 | mutate(p = paste0("P=", p) %>% factor(levels = paste0("P=", ps))) |> 99 | ggplot(aes(x, y = pred, colour = factor(log_lambda))) + 100 | geom_point(aes(y=y), colour = "grey", alpha=0.5) + 101 | geom_line(aes(y=y_true), colour="black") + 102 | geom_line(aes(y=pred)) + 103 | facet_wrap(~p) + 104 | labs(colour="Log(lambda)")) 105 | # By g 106 | (output$rff_specific2 <- data |> 107 | filter(split=="train") |> 108 | filter(log_lambda %in% c(0), p %in% 2^(1:6)) |> 109 | mutate(p = paste0("P=", p) %>% factor(levels = paste0("P=", ps))) |> 110 | ggplot(aes(x, y = pred, colour = factor(log(g)))) + 111 | geom_point(aes(y=y), colour = "grey", alpha=0.5) + 112 | geom_line(aes(y=y_true), colour="black") + 113 | geom_line(aes(y=pred)) + 114 | facet_wrap(~p) + 115 | labs(colour="Log(g)")) 116 | 117 | # Figure: R2 by g, p, and lambda ---- 118 | (output$rff_overview <- data |> 119 | filter(split=="test") |> 120 | group_by(g, p, log_lambda) |> 121 | summarise( 122 | r2 = 1 - sum((y - pred)^2) / sum((y - mean(y))^2) 123 | ) |> 124 | mutate(g = paste0("log(g): ", log(g)) |> factor(levels = paste0("log(g): ", log_gs))) |> 125 | ggplot(aes(x=p, y=r2, colour=factor(log_lambda))) + 126 | geom_point(size=1) + 127 | geom_line() + 128 | facet_wrap(~g) + 129 | labs(x = "Number of random features, p", colour = "Log(lambda)", y = "Out-of-sample R2")) 130 | 131 | 132 | 133 | 134 | # OLD SCRIBBLES ------------------------- 135 | if (FALSE) { 136 | # Simple example ------------------------------------------------- 137 | x <- seq(0, 1, 0.01) 138 | y <- case_when( 139 | x <= 0.25 ~ -0.005, 140 | x <= 0.50 ~ 0, 141 | x <= 0.75 ~ 0.005, 142 | x > 0.75 ~ 0.02, 143 | ) 144 | y <- case_when( 145 | x <= 0.75 ~ 0.005, 146 | x > 0.75 ~ 0.005 + (x-0.75)*0.1 147 | ) 148 | y <- -0.01+x*0.02 149 | y <- -0.01+0.01+x + 1*x^3 150 | y_true <- -0.01+20^x 151 | y_true <- -0.005+0.001*x+0.01*(1 / (1 + exp(-10*(x-0.1)))) 152 | # x <- seq(-30, 70, by = 1) 153 | # y_true <- 10*x+0.5*x^2-0.01*x^3-1.08^x#+100*sin(x) 154 | # y <- y_true + rnorm(length(y_true), 0, 150) 155 | 156 | # True function and OLS approximation 157 | y_true <- -10-100*x+300*x^2-1500*(x-0.5)^3+50^x 158 | y <- y_true + rnorm(length(y_true), 0, 10) 159 | tibble( 160 | x=x, 161 | y=y 162 | ) %>% 163 | ggplot(aes(x=x, y=y)) + 164 | geom_point() + 165 | geom_smooth(method = "lm", se=F) 166 | 167 | # RFF approximation 168 | set.seed(settings$seed_no) 169 | ps <- 1:6 170 | 171 | data <- c(settings$rff$g_vec[1], 1) |> map(function(g) { 172 | rff_x <- as.matrix(x) %>% rff(p = 2^max(ps), g = g) 173 | data <- ps %>% lapply(function(i) { 174 | p <- 2^i 175 | rff_x_new <- p^(-0.5)*cbind(rff_x$X_cos[, 1:(2^i/2)], rff_x$X_sin[, 1:(2^i/2)]) 176 | rff_reg <- lm(y~-1+rff_x_new) 177 | tibble( 178 | p = p, 179 | y_true = y_true, 180 | y = y, 181 | x = x, 182 | preds = drop(predict(rff_reg)), 183 | ols_pred = drop(predict(lm(y~x))) 184 | ) 185 | }) %>% bind_rows() |> mutate(g) 186 | }) |> bind_rows() 187 | data %>% 188 | pivot_longer(-c(g, p, x)) %>% 189 | mutate( 190 | p = paste0("P=", p) %>% factor(levels = paste0("P=", 2^ps)), 191 | 
name = case_when( 192 | name == "preds" ~ "RFF prediction", 193 | name == "y_true" ~ "Truth", 194 | name == "y" ~ "y", 195 | name == "ols_pred" ~ "OLS prediction" 196 | ) 197 | ) %>% 198 | filter(name != "OLS prediction") |> 199 | ggplot(aes(x, value, colour=name, linetype=factor(g))) + 200 | # geom_point(alpha=0.5) + 201 | geom_line() + 202 | facet_wrap(~p, scales = "free_y") + 203 | theme( 204 | legend.position = "top", 205 | legend.title = element_blank() 206 | ) 207 | 208 | data |> 209 | mutate(p = paste0("P=", p) %>% factor(levels = paste0("P=", 2^ps))) |> 210 | ggplot(aes(x, y = preds, colour = factor(round(g,2)))) + 211 | geom_point(aes(y=y), colour = "grey") + 212 | geom_line(aes(y=y_true), colour="black") + 213 | geom_line(aes(y=preds)) + 214 | facet_wrap(~p) + 215 | labs(colour="RFF with g") 216 | 217 | 218 | 219 | 220 | # Sequential plots 221 | rff_x_2 <- 2^(-0.5)*cbind(rff_x$X_cos[, 1], rff_x$X_sin[, 1]) 222 | rff_reg2 <- lm(y~-1+rff_x_2) 223 | rff_x_4 <- 4^(-0.5)*cbind(rff_x$X_cos[, 1:2], rff_x$X_sin[, 1:2]) 224 | rff_reg4 <- lm(y~-1+rff_x_4) 225 | 226 | 227 | 228 | rbind( 229 | tibble(x = x, y = rff_reg2$coefficients[1]*rff_x_2[, 1], name = "cos1", p = 2), 230 | tibble(x = x, y = rff_reg2$coefficients[2]*rff_x_2[, 2], name = "sin1", p = 2), 231 | tibble(x = x, y = rff_reg4$coefficients[1]*rff_x_4[, 1], name = "cos1", p = 4), 232 | tibble(x = x, y = rff_reg4$coefficients[2]*rff_x_4[, 2], name = "cos2", p = 4), 233 | tibble(x = x, y = rff_reg4$coefficients[3]*rff_x_4[, 3], name = "sin1", p = 4), 234 | tibble(x = x, y = rff_reg4$coefficients[4]*rff_x_4[, 4], name = "sin2", p = 4) 235 | ) %>% 236 | ggplot(aes(x, y, colour=name)) + 237 | geom_line() + 238 | facet_wrap(~p, scales = "free_y") 239 | 240 | rbind( 241 | tibble(x = x, y = rff_reg2$coefficients[1]*rff_x_2[, 1], name = "cos1", p = 2), 242 | tibble(x = x, y = rff_reg2$coefficients[2]*rff_x_2[, 2], name = "sin1", p = 2), 243 | tibble(x = x, y = rff_reg4$coefficients[1]*rff_x_4[, 1], name = "cos1", p = 4), 244 | tibble(x = x, y = rff_reg4$coefficients[2]*rff_x_4[, 2], name = "cos2", p = 4), 245 | tibble(x = x, y = rff_reg4$coefficients[3]*rff_x_4[, 3], name = "sin1", p = 4), 246 | tibble(x = x, y = rff_reg4$coefficients[4]*rff_x_4[, 4], name = "sin2", p = 4) 247 | ) %>% 248 | group_by(p, x) %>% 249 | summarise(pred_y = sum(y)) %>% 250 | ggplot(aes(x, pred_y, colour=factor(p))) + 251 | geom_line() 252 | 253 | test <- cbind(rff_x$X_cos, rff_x$X_sin) 254 | colnames(test) <- c(paste0("cos", 1:ncol(rff_x$X_cos)), paste0("sin", 1:ncol(rff_x$X_sin))) 255 | test <- cbind(x=x, test) 256 | 257 | as_tibble(test) %>% 258 | pivot_longer(-x) %>% 259 | mutate( 260 | type = substr(name, 1, 3), 261 | no = as.integer(substr(name, 4, 4)) 262 | ) %>% 263 | ggplot(aes(x, value, colour=factor(no))) + 264 | geom_line() + 265 | facet_wrap(~type, scales="free_y") 266 | 267 | tibble(x = x, cos_x = cos(-x), sin_x = sin(-x)) %>% 268 | pivot_longer(-x) %>% 269 | ggplot(aes(x, value,colour=name)) + 270 | geom_line() 271 | 272 | 273 | # One independent variable --------------------------------------- 274 | # True model ---------------------- 275 | x <- seq(-30, 70, by = 1) 276 | y <- 10*x+0.5*x^2-0.01*x^3-1.08^x#+100*sin(x) 277 | 278 | tibble(x=x, y=y) %>% 279 | ggplot(aes(x=x, y=y)) + 280 | geom_point() + 281 | geom_line() + 282 | geom_smooth(method = "lm", se=F) + 283 | labs(title = "y=10x+0.5x^2-0.01x^3-1.08^x+100*sin(x)") 284 | 285 | # RFF approximation -------------- 286 | set.seed(settings$seed_no) 287 | ps <- 1:6 288 | g_vec = exp(-4) 289 | data <- 
ps %>% lapply(function(p) { 290 | rff_x <- as.matrix(x) %>% rff(p = 2^p, g = 1) 291 | rff_x <- cbind(rff_x$X_cos, rff_x$X_sin) 292 | rff_reg <- lm(y~rff_x) 293 | tibble( 294 | p = p, 295 | y = y, 296 | x = x, 297 | preds = drop(predict(rff_reg)), 298 | ols_pred = drop(predict(lm(y~x))) 299 | ) 300 | }) %>% bind_rows() 301 | 302 | data %>% 303 | pivot_longer(-c(p, x)) %>% 304 | mutate( 305 | p = paste0("P=", 2^p) %>% factor(levels = paste0("P=", 2^ps)), 306 | name = case_when( 307 | name == "preds" ~ "RFF prediction", 308 | name == "y" ~ "Truth", 309 | name == "ols_pred" ~ "OLS prediction" 310 | ) 311 | ) %>% 312 | ggplot(aes(x, value, colour=name)) + 313 | # geom_point(alpha=0.5) + 314 | geom_line() + 315 | facet_wrap(~p, scales = "free_y") + 316 | theme( 317 | legend.position = "top", 318 | legend.title = element_blank() 319 | ) + 320 | labs(title = "y=10x+0.5x^2-0.01x^3-1.08^x+100*sin(x)") 321 | 322 | # Two variables ----------------------------- 323 | # True model ---------------------- 324 | x <- seq(-30, 70, by = 1) 325 | z <- c(rep(-1, length(x)), rep(1, length(x))) 326 | x <- rep(x, 2) 327 | y <- z*(10*x+0.5*x^2-0.01*x^3-1.08^x)#+100*sin(x)) 328 | X <- cbind(z, x) 329 | 330 | tibble(x=x, y=y, z=z) %>% 331 | ggplot(aes(x=x, y=y, colour = factor(z))) + 332 | geom_point() + 333 | geom_line() + 334 | geom_smooth(method = "lm", se=F) + 335 | labs(title = "y=10x+0.5x^2-0.01x^3-1.08^x+100*sin(x)") 336 | 337 | # RFF approximation -------------- 338 | set.seed(settings$seed_no) 339 | ps <- 1:7 340 | rff_x <- as.matrix(X) %>% rff(p = 2^max(ps), g = 1) 341 | data <- ps %>% lapply(function(i) { 342 | rff_x_new <- cbind(rff_x$X_cos[, 1:(2^i/2)], rff_x$X_sin[, 1:(2^i/2)]) 343 | tibble( 344 | p = 2^i, 345 | y = y, 346 | z = z, 347 | x = x, 348 | preds = drop(predict(lm(y~rff_x_new))), 349 | ols_pred = drop(predict(lm(y~X))) 350 | ) 351 | }) %>% bind_rows() 352 | 353 | (output$rff <- data %>% 354 | filter(p %in% 2^c(1, 5, 6)) %>% 355 | pivot_longer(-c(p, x, z)) %>% 356 | mutate( 357 | p = paste0("P=", p) %>% factor(levels = paste0("P=", 2^ps)), 358 | z = paste0("Z=", z), 359 | name = case_when( 360 | name == "preds" ~ "RF prediction", 361 | name == "y" ~ "Truth", 362 | name == "ols_pred" ~ "OLS prediction" 363 | ) 364 | ) %>% 365 | ggplot(aes(x, value, colour=name)) + 366 | # geom_point(alpha=0.5) + 367 | geom_line() + 368 | facet_wrap(factor(z)~p, ncol=3) + 369 | theme( 370 | legend.position = "top", 371 | legend.title = element_blank() 372 | ) + 373 | labs(x = "X", y = "Y"))#+ 374 | # labs(title = "y=z[10x+0.5x^2-0.01x^3-1.08^x+100*sin(x)]") 375 | 376 | # data %>% 377 | # filter(p %in% 2^(4:6)) %>% 378 | # pivot_wider(names_from = p, values_from = preds, names_prefix = "RFF") %>% 379 | # pivot_longer(-c(z, x)) %>% 380 | # ggplot(aes(x, value, colour=name)) + 381 | # geom_line() + 382 | # facet_wrap(~z) 383 | 384 | # Check RFF variation -------- 385 | x <- seq(0, 1, 0.01) 386 | rff_x <- as.matrix(x) %>% rff(p = 2^4, g = exp(4)) 387 | cbind(rff_x$X_cos, rff_x$X_sin) |> 388 | as_tibble() |> 389 | mutate(x = x) |> 390 | pivot_longer(-x) |> 391 | ggplot(aes(x, value, colour=name)) + 392 | geom_point() + 393 | facet_wrap(~name, scales = "free_y") 394 | 395 | x1 <- runif(n=115) 396 | sin(x1 %*% rnorm(n=115, mean=0, sd=exp(-4))) 397 | 398 | 399 | 400 | } 401 | -------------------------------------------------------------------------------- /6 - Short selling fees.R: -------------------------------------------------------------------------------- 1 | # Load shorting data 2 | short_fees <- 
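# The Markit securities-lending file loaded below reports, per stock-date, an indicative
# borrow fee and a dcbs bucket (Markit's Daily Cost of Borrow Score, 1 = cheapest to borrow
# through 10 = hardest); missing fees are later filled with the average fee of the stock's
# dcbs bucket.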
fread("Data/short_fees.csv") 3 | short_fees <- short_fees[!is.na(dcbs) & record_type==1 & market_area=="USA Equity" & !is.na(permno)] 4 | 5 | # Expand shorting data 6 | if (FALSE) { 7 | # Check that shorting fees are somewhat stable over time and DCBS group 8 | short_fees[!is.na(indicative_fee), .(mean=mean(indicative_fee), median=median(indicative_fee)), by = .(date, dcbs)] %>% 9 | pivot_longer(-c(date, dcbs)) %>% 10 | ggplot(aes(date, value, colour=name)) + 11 | geom_point(size=0.5, alpha = 0.5) + 12 | # geom_line() + 13 | facet_wrap(~factor(dcbs), scales = "free_y") # Declining trend is natural as Markit adds more stocks 14 | } 15 | # For stocks in the data -------- 16 | short_fees[, sfee := if_else(!is.na(indicative_fee), indicative_fee, mean(indicative_fee, na.rm=T)), by = dcbs] 17 | 18 | # Take the last observation in a month for each stock 19 | short_fees[, eom := ceiling_date(date, unit="m")-1] 20 | short_fees[, max_date := max(date), by = .(permno, eom)] 21 | short_fees <- short_fees[date==max_date, .(permno, eom, sfee, dcbs)] 22 | 23 | # Remove duplicates # Very rare: short_fees[, .N, by = .(permno, eom)][, table(N)] 24 | short_fees[, n := .N, by = .(permno, eom)] 25 | short_fees <- short_fees[n == 1][, n := NULL] 26 | 27 | # Compute shorting fees for stocks not in the sample --------------------- 28 | # Analysis 29 | if (FALSE) { 30 | short_fees2 <- short_fees[, .(id=permno, eom, sfee, dcbs)][chars[, .(id, eom, market_equity, rvol_252d, dolvol_126d)], on = .(id, eom)] 31 | # How many are non-missing? Very few 32 | short_fees2[, .(non_miss=mean(!is.na(sfee))), by = eom][non_miss != 0] %>% ggplot(aes(eom, non_miss)) + geom_point() 33 | # What is the typical dcbs group? 96% of the sample are in the "easiest to borrow" group 34 | short_fees2[!is.na(dcbs), table(dcbs)/.N] 35 | # Surprisingly difficult to predict 36 | felm(sfee~market_equity | eom | 0 | eom+id, data = short_fees2) %>% summary() 37 | felm(sfee~market_equity+rvol_252d | eom | 0 | eom+id, data = short_fees2) %>% summary() 38 | felm(sfee~market_equity+rvol_252d+dolvol_126d | eom | 0 | eom+id, data = short_fees2) %>% summary() 39 | # Maybe it's caused by outliers? It helps, but not a lot 40 | short_fees2[!is.na(sfee) & eom==max(eom)] %>% ggplot(aes(market_equity, sfee)) + geom_point() 41 | felm(sfee~market_equity | eom | 0 | eom+id, data = short_fees2[sfee <= quantile(short_fees2$sfee, probs=0.99, na.rm=T)]) %>% summary() 42 | felm(sfee~market_equity+rvol_252d | eom | 0 | eom+id, data = short_fees2[sfee <= quantile(short_fees2$sfee, probs=0.99, na.rm=T)]) %>% summary() 43 | felm(sfee~market_equity+rvol_252d+dolvol_126d | eom | 0 | eom+id, data = short_fees2[sfee <= quantile(short_fees2$sfee, probs=0.99, na.rm=T)]) %>% summary() 44 | # Maybe impute by median borrow fee for stocks in sample? 
45 | short_fees2[!is.na(sfee), .(n = .N, mean=mean(sfee), median=median(sfee))] 46 | short_fees2[!is.na(sfee), .(n = .N, mean=mean(sfee), median=median(sfee)), by = dcbs][, n := round(n / sum(n), 2)][order(dcbs)] 47 | } 48 | # Impute outside of sample using median 49 | short_fees <- short_fees[, .(id=permno, eom, sfee)][chars[, .(id, eom)], on = .(id, eom)] 50 | short_fees[, sfee := if_else(!is.na(sfee), sfee, median(sfee, na.rm=T))] 51 | 52 | # Incorporate shorting cost of each method ------------------------------------- 53 | list_short <- list( 54 | "Multiperiod-ML*" = mp$w, 55 | "Portfolio-ML" = pfml$w, 56 | "Static-ML*" = static$w, 57 | "Markowitz-ML" = tpf$w, 58 | "Factor-ML" = factor_ml$w, 59 | "1/N" = ew$w, 60 | "Market" = mkt$w, 61 | "Rank-ML" = rw$w, 62 | "Minimum Variance" = mv$w 63 | ) 64 | 65 | shorting_costs_ts <- names(list_short) %>% lapply(function(nm) { 66 | w <- list_short[[nm]] 67 | # Pay 100% of fee if short get 70% of fee if long [from p.10 in "anomalies and their short-sale cost"] 68 | short_cost <- short_fees[w, on = .(id, eom)][w<0, .(short_cost = sum(sfee*(-w))), by = eom] #[order(shorting_cost)][, mean(shorting_cost)] 69 | long_rev <- short_fees[w, on = .(id, eom)][w>0, .(long_rev = sum(sfee*w)), by = eom] #[order(shorting_cost)][, mean(shorting_cost)] 70 | sub <- w[, .(long = sum(w[w>0]), short = sum(w[w<0]), net_lev = sum(w)), by = eom][, type := nm] 71 | sub <- short_cost[sub, on = "eom"] 72 | long_rev[sub, on = .(eom)] 73 | }) %>% rbindlist() 74 | 75 | # Shorting costs 76 | shorting_costs <- shorting_costs_ts[, lapply(.SD, mean), .SDcols = c("long_rev", "short_cost", "long", "short", "net_lev"), by = type] 77 | shorting_costs[short==0, short_cost := 0] 78 | shorting_costs[, net_short_cost := short_cost-long_rev] 79 | 80 | # Output: Shorting costs are almost completely unimportant for our strategies [but this is ofcourse conditional on our ability to borroe] 81 | (test_tbl <- pf_summary[shorting_costs, on = .(type)][, .(type, long, short, short_cost, long_rev, net_short_cost, 82 | r_tc, 83 | r_tc_short = r_tc-short_cost, 84 | r_tc_short_net = r_tc-net_short_cost, 85 | sr, 86 | sr_short = (r_tc-short_cost)/sd, 87 | sr_short_net = (r_tc-net_short_cost)/sd)]) 88 | test_tbl[, .(type, sr, sr_short, sr_short_net)] 89 | 90 | # Alternative 1 91 | test_tbl %>% 92 | mutate(type = type %>% factor(levels = pf_order_new)) %>% 93 | arrange(type) %>% 94 | select(-c("r_tc", "r_tc_short", "r_tc_short_net")) %>% 95 | xtable(digits = c(0, 0, rep(2, 2), rep(4, 3), rep(2, 3))) %>% 96 | print(include.rownames=FALSE) 97 | 98 | # Alternative 2 99 | test_tbl %>% 100 | mutate(type = type %>% factor(levels = pf_order_new)) %>% 101 | arrange(type) %>% 102 | select(-c("sr", "sr_short", "sr_short_net")) %>% 103 | xtable(digits = c(0, 0, rep(2, 2), rep(4, 3), rep(2, 3))) %>% 104 | print(include.rownames=FALSE) 105 | 106 | # Figure ------------- 107 | (output$shorting <- pf_summary[shorting_costs, on = .(type)] |> 108 | filter(type %in% c("Portfolio-ML", "Multiperiod-ML*", "Static-ML*")) |> 109 | # select(type, shorting, r_tc, sd, obj, short_cost, long_rev) |> 110 | mutate( 111 | type = type %>% factor(levels = pf_order), 112 | obj_gross_short = obj - short_cost, 113 | obj_net_short = obj - short_cost + long_rev 114 | ) |> 115 | select(type, obj, obj_gross_short, obj_net_short) |> 116 | pivot_longer(-type) |> 117 | mutate( 118 | name = case_when( 119 | name == "obj" ~ "Utility", 120 | name == "obj_gross_short" ~ "Utility - short costs", 121 | name == "obj_net_short" ~ "Utility - short costs 
+ long revenue" 122 | ) 123 | ) |> 124 | ggplot(aes(type, value, fill=name)) + 125 | geom_col(position = position_dodge(width = 0.93)) + 126 | geom_text(aes(label = formatC(value, format="f", digits=3)), position = position_dodge(width = 0.93), vjust = -0.2) + 127 | coord_cartesian(ylim = c(0, 0.1)) + 128 | labs(y = "Utility") + 129 | theme( 130 | legend.position = "top", 131 | legend.justification = "center", 132 | legend.title = element_blank(), 133 | axis.title.x = element_blank() 134 | )) 135 | 136 | # Plot: Utility: without shorting costs, with only gross shorting costs, with net shorting costs 137 | 138 | 139 | -------------------------------------------------------------------------------- /7 - Figures.R: -------------------------------------------------------------------------------- 1 | # Figures ------------------------- 2 | output_fig <- function(path, name, format, width, height) { 3 | path <- paste0(path, "/", name) # <- change path as desired 4 | if (format == "tex") { 5 | tikz(str_c(path, ".tex"), width = width, height = height) 6 | } 7 | if (format == "pdf") { 8 | pdf(str_c(path, ".pdf"), width = width, height = height) 9 | } 10 | if (format == "jpg") { 11 | w_pix <- width / (2.54 / 96 / 2.54) 12 | h_pix <- height / (2.54 / 96 / 2.54) 13 | jpeg(str_c(path, ".jpg"), width = w_pix, height = h_pix) 14 | } 15 | if (format == "eps") { 16 | cairo_ps(filename = str_c(path, ".eps"), 17 | width = width, height = height) 18 | } 19 | if (format == "tiff") { 20 | tiff(filename = str_c(path, ".tiff"), units="in", width=width, height=height, res=500) 21 | } 22 | } 23 | # For Paper ----------------------------------------- 24 | # Efficient frontier 25 | output$ef + theme(text = element_text(size=txt_size)) 26 | ggsave("Figures/ief_by_wealth.pdf", width = fig_w, height = fig_h, units = "in") 27 | 28 | # Efficient frontier all methods 29 | output$ef_all + theme(text = element_text(size=txt_size)) 30 | ggsave("Figures/ief_by_method.pdf", width = fig_w, height = fig_h, units = "in") 31 | 32 | # Portfolios: Cumulative Performance 33 | output$ts 34 | ggsave("Figures/cumret_pf.pdf", width = fig_w, height = fig_h, units = "in") 35 | 36 | # Portfolios: Stats over time 37 | output$comp_stats + theme(text = element_text(size=txt_size)) 38 | ggsave("Figures/stats_ts.pdf", width = fig_w, height = fig_h*1.5, units = "in") 39 | 40 | # Example: Apple vs.
Xerox 41 | output$example + 42 | scale_y_continuous(labels = scales::number_format(accuracy = 0.01)) + 43 | theme( 44 | axis.text.y = element_text(size = 8) 45 | ) 46 | ggsave("Figures/example_weights.pdf", width = fig_w, height = fig_h*1.5, units = "in") 47 | 48 | # Feature Importance 49 | output$feature_importance + theme(text = element_text(size=txt_size)) 50 | ggsave("Figures/feat_imp.pdf", width = fig_w, height = fig_h, units = "in") 51 | 52 | # Optimal Hyper-parameters [Expected return] 53 | output$er_tuning 54 | ggsave("Figures/optimal_hps_er.pdf", width = fig_w, height = fig_h, units = "in") 55 | 56 | # Optimal Hyper-parameters 57 | output$portfolio_tuning 58 | ggsave("Figures/optimal_hps.pdf", width = fig_w, height = fig_h, units = "in") 59 | 60 | # Feature importance - Returns 61 | output$fi_returns + theme( 62 | axis.text.x = element_text(angle = 360-20, hjust = 0, vjust = 1), 63 | text = element_text(size = 8), 64 | axis.title = element_text(size = 7), 65 | legend.text = element_text(size = 5), # Adjust text size 66 | legend.title = element_text(size = 8), 67 | legend.key.size = unit(0.5, "cm") 68 | ) 69 | ggsave("Figures/fi_returns.pdf", width = fig_w, height = fig_h, units = "in") 70 | 71 | # Feature auto-correlation 72 | output$ar1 + theme(axis.text.y = element_text(size=4)) 73 | ggsave("Figures/feature_ar1.pdf", width = fig_w, height = fig_h*2, units = "in") 74 | 75 | # Efficient frontier with trading cost 76 | output$cf_ef_tc 77 | ggsave("Figures/ef_cf_tc.pdf", width = fig_w, height = fig_h, units = "in") 78 | 79 | # Efficient frontier without trading cost 80 | output$cf_ef_markowitz 81 | ggsave("Figures/ef_cf_no_tc.pdf", width = fig_w, height = fig_h, units = "in") 82 | 83 | # Performance across size distribution 84 | output$by_size 85 | ggsave("Figures/by_size.pdf", width = fig_w, height = fig_h*1.5, units = "in") 86 | 87 | # Shorting costs 88 | output$shorting 89 | ggsave("Figures/shorting.pdf", width = fig_w, height = fig_h, units = "in") 90 | 91 | # RFF example 92 | output_fig(path=output_path_fig, name = "rff_ex", format = format, width = fig_w, height = fig_h*1.5) 93 | output$rff_specific0 + theme(strip.text.x = element_text(size = 8)) 94 | dev.off() 95 | 96 | # Relation between liquidity and portfolio weight 97 | output$w_liq 98 | ggsave("Figures/w_liq.pdf", width = fig_w, height = fig_h, units = "in") 99 | 100 | # Simulations 101 | output$simulations + theme(text = element_text(size=txt_size)) 102 | ggsave("Figures/simulations.pdf", width = fig_w, height = fig_h, units = "in") -------------------------------------------------------------------------------- /7 - Numbers.R: -------------------------------------------------------------------------------- 1 | # Numbers mentioned in the text ----------------- 2 | # Number of stocks by the end of 2020: 3 | data[eom==as.Date("2020-12-31"), .N] 4 | # Dollar volume of Walmart and Xerox 5 | chars[id %in% c(22111, 27983),.(id, eom, dolvol=dolvol/1e6)][eom==as.Date("2020-12-31")] 6 | # Auto correlation of ret_1_0 and be_me 7 | ar1_ss[char %in% c("be_me", "ret_1_0")] 8 | cluster_labels[ar1_ss, on = .(characteristic=char)][cluster=="quality", median(ar1)] 9 | cluster_labels[ar1_ss, on = .(characteristic=char)][cluster=="momentum"] 10 | 11 | data_tc[valid==T & !is.na(be_me) & !is.na(ret_12_1) & be_me != 0.5 & ret_12_1 != 0.5, cor(be_me, ret_12_1), by = eom][, summary(V1)] 12 | data_tc[valid==T & !is.na(be_me) & !is.na(ret_12_1) & be_me != 0.5 & ret_12_1 != 0.5, cor(be_me, ret_12_1)] 13 | 14 | # Realized utility without second 
layer of portfolio tuning 15 | validation_m1[k==1 & g==0 & u==1, .(obj = (mean(r)-0.5*var(r)*pf_set$gamma_rel-mean(tc))*12, sr = mean(r-tc)/sd(r)*sqrt(12))] 16 | validation_static[k==1 & g==0 & u==1, .(obj = (mean(r)-0.5*var(r)*pf_set$gamma_rel-mean(tc))*12, sr = mean(r-tc)/sd(r)*sqrt(12))] 17 | validation_static[k==0.2 & g==0 & u==1, .(obj = (mean(r)-0.5*var(r)*pf_set$gamma_rel-mean(tc))*12, sr = mean(r-tc)/sd(r)*sqrt(12))] 18 | 19 | # Move investors from 10b to 1b at a relative risk aversion of 10 20 | large <- ef_ss[wealth_end==1e10 & gamma_rel == 10] 21 | # Large investors for a vol of 14% can get 22 | large[, r_tc] 23 | # Small investors for a vol of 14% can get 24 | (small_er <- ef_ss[wealth_end==1e9, r_tc[gamma_rel==100]+(large$sd-sd[gamma_rel==100])*(r_tc[gamma_rel==20]-r_tc[gamma_rel==100])/(sd[gamma_rel==20]-sd[gamma_rel==100])]) 25 | output$ef + annotate("point", x = large$sd, y = small_er) 26 | 27 | # Median dollar volume by end of sample (for simulations) 28 | chars[valid==T, median(dolvol/1e6), by = eom][eom==max(eom)] 29 | 30 | 31 | # Shorting fees ------------------------ 32 | short_fees[date==max(date)][,.(n=.N,fee=mean(indicative_fee)),by=dcbs][order(dcbs)][, pct := n/sum(n)][] 33 | short_sub <- short_fees[date==as.Date("2020-12-31")][chars[valid==T & eom==max(eom), .(id, eom, size_grp)], on = .(permno=id, date=eom)] 34 | short_sub[, mean(!is.na(indicative_fee))] 35 | short_sub$indicative_fee |> quantile(na.rm=T, probs = seq(0, 1, 0.01)) 36 | short_sub[,.(n=.N,fee=mean(indicative_fee)),by=dcbs][order(dcbs)][, pct := n/sum(n)][] 37 | -------------------------------------------------------------------------------- /7 - Tables.R: -------------------------------------------------------------------------------- 1 | # Tables ------------------- 2 | # Portfolio summary [Add "\\-1.2em" after Markowitz-ML and remove TC, R-TC, SR_new and Utility from Markowitz-ML] 3 | if (FALSE) { 4 | pf_summary %>% 5 | filter(type %in% c("Portfolio-ML", "Multiperiod-ML", "Static-ML", "Factor-ML", "Markowitz-ML", 6 | "Multiperiod-ML*", "Static-ML*")) %>% 7 | rbind( 8 | data.table(type="One tuning layer"), 9 | data.table(type="Two tuning layers"), fill=T 10 | ) %>% 11 | mutate( 12 | # tc = if_else(type == "Markowitz-ML", NA_real_, tc), 13 | # r_tc = if_else(type == "Markowitz-ML", NA_real_, r_tc), 14 | # sr = if_else(type == "Markowitz-ML", NA_real_, sr), 15 | # obj = if_else(type == "Markowitz-ML", NA_real_, obj), 16 | order = type %>% factor(levels = c("One tuning layer", "Portfolio-ML", "Multiperiod-ML", "Static-ML", "Factor-ML", "Markowitz-ML", 17 | "Two tuning layers", "Multiperiod-ML*", "Static-ML*")), 18 | Method = if_else(type %in% c("One tuning layer", "Two tuning layers"), paste0("\\textbf{", as.character(type), "}"), paste0("\\hspace{0.5em}", as.character(type))) 19 | ) %>% 20 | arrange(order) %>% 21 | select(Method, "R"=r, "Vol."=sd, "SR$_\\text{gross}$"=sr_gross, "TC"=tc, "R-TC"=r_tc, "SR$_\\text{net}$"=sr, "Utility"=obj, "Turnover"=turnover_notional, "Lev."=inv) %>% 22 | xtable(digits = c(0, 2, 2, 2, 2, 3, 2, 2, 3, 2, 2)) %>% 23 | print.xtable(sanitize.text.function = identity, include.rownames = F, caption.placement = "top") 24 | } else { 25 | # With all alternatives 26 | pf_summary %>% 27 | filter(type %in% c("Portfolio-ML", "Multiperiod-ML", "Static-ML", "Markowitz-ML", "Rank-ML", 28 | "Factor-ML", "Minimum Variance", "1/N", "Market", 29 | "Multiperiod-ML*", "Static-ML*")) %>% 30 | rbind( 31 | data.table(type="One tuning layer"), 32 | data.table(type="Two tuning layers"), fill=T 33 |
) %>% 34 | mutate( 35 | # tc = if_else(type == "Markowitz-ML", NA_real_, tc), 36 | # r_tc = if_else(type == "Markowitz-ML", NA_real_, r_tc), 37 | # sr = if_else(type == "Markowitz-ML", NA_real_, sr), 38 | # obj = if_else(type == "Markowitz-ML", NA_real_, obj), 39 | order = type %>% factor(levels = c("One tuning layer", "Portfolio-ML", "Multiperiod-ML", "Static-ML", "Markowitz-ML", "Factor-ML", 40 | "Rank-ML", "Minimum Variance", "1/N", "Market", 41 | "Two tuning layers", "Multiperiod-ML*", "Static-ML*")), 42 | Method = if_else(type %in% c("One tuning layer", "Two tuning layers"), paste0("\\textbf{", as.character(type), "}"), paste0("\\hspace{0.5em}", as.character(type))) 43 | ) %>% 44 | arrange(order) %>% 45 | select(Method, "R"=r, "Vol."=sd, "SR$_\\text{gross}$"=sr_gross, "TC"=tc, "R-TC"=r_tc, "SR$_\\text{net}$"=sr, "Utility"=obj, "Turnover"=turnover_notional, "Lev."=inv) %>% 46 | xtable(digits = c(0, 2, 2, 2, 2, 3, 2, 2, 3, 2, 2)) %>% 47 | print.xtable(sanitize.text.function = identity, include.rownames = F, caption.placement = "top") 48 | } 49 | 50 | 51 | 52 | 53 | # Probability of outperformance 54 | prob_outperformance %>% 55 | mutate( 56 | prob_main_op = paste0(formatC(prob_main_op*100, digits = 1, format = "f"), "\\%") 57 | ) %>% 58 | pivot_wider(names_from = alt, values_from = prob_main_op) %>% 59 | select(all_of(c("main", main_types))) %>% 60 | rename(" "=main) %>% 61 | xtable(align = "llrrrrr") %>% 62 | print.xtable(sanitize.text.function = identity, include.rownames = F, caption.placement = "top") 63 | 64 | # Correlation table 65 | pf_cors_op <- formatC(pf_cors, digits = 2, format="f") 66 | pf_cors_op[upper.tri(pf_cors_op, diag = F)]<-"" 67 | # pf_cors_op[-1, -ncol(pf_cors_op)] %>% 68 | pf_cors_op %>% 69 | xtable(align="lccccc") %>% 70 | print.xtable(include.rownames = T, caption.placement = "top") 71 | 72 | # Features information ----------------------------------------------- 73 | char_tbl <- cluster_labels[characteristic %in% features][order(cluster)][, .(n=1:.N,"Characteristic"=characteristic, "Cluster"=cluster)] 74 | start <- ceiling(nrow(char_tbl)/3) 75 | cbind(char_tbl[1:start], char_tbl[(start+1):(start*2)], rbind(char_tbl[(start*2+1):nrow(char_tbl)], data.table(n="", Characteristic="", Cluster = ""), data.table(n="", Characteristic="", Cluster = ""))) %>% 76 | xtable(align="llllllllll", caption = "tbl:chars") %>% 77 | print.xtable(include.rownames = F, caption.placement = "top") 78 | -------------------------------------------------------------------------------- /Joblists/joblist_models.txt: -------------------------------------------------------------------------------- 1 | module load R/4.3.0-foss-2020b; Rscript slurm_fit_models.R config_files/config_models1.txt 2 | module load R/4.3.0-foss-2020b; Rscript slurm_fit_models.R config_files/config_models2.txt 3 | module load R/4.3.0-foss-2020b; Rscript slurm_fit_models.R config_files/config_models3.txt 4 | module load R/4.3.0-foss-2020b; Rscript slurm_fit_models.R config_files/config_models4.txt 5 | module load R/4.3.0-foss-2020b; Rscript slurm_fit_models.R config_files/config_models5.txt 6 | module load R/4.3.0-foss-2020b; Rscript slurm_fit_models.R config_files/config_models6.txt 7 | module load R/4.3.0-foss-2020b; Rscript slurm_fit_models.R config_files/config_models7.txt 8 | module load R/4.3.0-foss-2020b; Rscript slurm_fit_models.R config_files/config_models8.txt 9 | module load R/4.3.0-foss-2020b; Rscript slurm_fit_models.R config_files/config_models9.txt 10 | module load R/4.3.0-foss-2020b; Rscript 
slurm_fit_models.R config_files/config_models10.txt 11 | module load R/4.3.0-foss-2020b; Rscript slurm_fit_models.R config_files/config_models11.txt 12 | module load R/4.3.0-foss-2020b; Rscript slurm_fit_models.R config_files/config_models12.txt -------------------------------------------------------------------------------- /Joblists/joblist_pfchoice_all.txt: -------------------------------------------------------------------------------- 1 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_all.txt -------------------------------------------------------------------------------- /Joblists/joblist_pfchoice_base.txt: -------------------------------------------------------------------------------- 1 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_base.txt -------------------------------------------------------------------------------- /Joblists/joblist_pfchoice_fi.txt: -------------------------------------------------------------------------------- 1 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_fi_base.txt 2 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_fi_ief.txt 3 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_fi_ret.txt -------------------------------------------------------------------------------- /Joblists/joblist_pfchoice_ief.txt: -------------------------------------------------------------------------------- 1 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_ief_w1g1.txt 2 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_ief_w1g2.txt 3 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_ief_w1g3.txt 4 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_ief_w1g4.txt 5 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_ief_w1g5.txt 6 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_ief_w2g1.txt 7 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_ief_w2g2.txt 8 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_ief_w2g3.txt 9 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_ief_w2g4.txt 10 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_ief_w2g5.txt 11 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_ief_w3g1.txt 12 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_ief_w3g2.txt 13 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_ief_w3g3.txt 14 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_ief_w3g4.txt 15 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_ief_w3g5.txt 16 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_ief_w4g1.txt 17 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_ief_w4g2.txt 18 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R 
config_files/config_pfchoice_ief_w4g3.txt 19 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_ief_w4g4.txt 20 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_ief_w4g5.txt -------------------------------------------------------------------------------- /Joblists/joblist_pfchoice_size.txt: -------------------------------------------------------------------------------- 1 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_size1.txt 2 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_size2.txt 3 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_size3.txt 4 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_size4.txt 5 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_size5.txt -------------------------------------------------------------------------------- /Joblists/joblist_simulations.txt: -------------------------------------------------------------------------------- 1 | module load R/4.3.0-foss-2020b; Rscript simulations/simulations.R config_files/config_sim1.txt 2 | module load R/4.3.0-foss-2020b; Rscript simulations/simulations.R config_files/config_sim2.txt 3 | module load R/4.3.0-foss-2020b; Rscript simulations/simulations.R config_files/config_sim3.txt 4 | module load R/4.3.0-foss-2020b; Rscript simulations/simulations.R config_files/config_sim4.txt 5 | module load R/4.3.0-foss-2020b; Rscript simulations/simulations.R config_files/config_sim5.txt 6 | module load R/4.3.0-foss-2020b; Rscript simulations/simulations.R config_files/config_sim6.txt 7 | module load R/4.3.0-foss-2020b; Rscript simulations/simulations.R config_files/config_sim7.txt 8 | module load R/4.3.0-foss-2020b; Rscript simulations/simulations.R config_files/config_sim8.txt 9 | module load R/4.3.0-foss-2020b; Rscript simulations/simulations.R config_files/config_sim9.txt 10 | module load R/4.3.0-foss-2020b; Rscript simulations/simulations.R config_files/config_sim10.txt 11 | module load R/4.3.0-foss-2020b; Rscript simulations/simulations.R config_files/config_sim11.txt 12 | module load R/4.3.0-foss-2020b; Rscript simulations/simulations.R config_files/config_sim12.txt 13 | module load R/4.3.0-foss-2020b; Rscript simulations/simulations.R config_files/config_sim13.txt 14 | module load R/4.3.0-foss-2020b; Rscript simulations/simulations.R config_files/config_sim14.txt 15 | module load R/4.3.0-foss-2020b; Rscript simulations/simulations.R config_files/config_sim15.txt -------------------------------------------------------------------------------- /Main.R: -------------------------------------------------------------------------------- 1 | # Libraries --------------------------------------- 2 | library(tictoc) 3 | library(lfe) 4 | library(cowplot) 5 | library(MASS) 6 | library(xtable) 7 | library(roll) 8 | library(expm) 9 | library(glmnet) 10 | library(lubridate) 11 | library(tidyverse) 12 | library(data.table) 13 | source("0 - General functions.R") 14 | source("0 - Return prediction functions.R") 15 | source("0 - Portfolio choice functions.R") 16 | Rcpp::sourceCpp("ewma.cpp") 17 | Rcpp::sourceCpp("sqrtm_cpp.cpp") 18 | options(dplyr.summarise.inform = FALSE) 19 | 20 | output <- list() 21 | 22 | # Layout Settings --------------- 23 | theme_set(theme_classic()) 24 | colours_theme <- c("#0C6291", "#A63446", 
RColorBrewer::brewer.pal(8, "Dark2"), 25 | "darkslategrey", "blue3", "red3", "purple2", "yellow2", "aquamarine", 26 | "grey", "salmon", "antiquewhite", "chartreuse") 27 | scale_colour_discrete <- function(...) { 28 | scale_colour_manual(..., values = colours_theme) 29 | } 30 | scale_fill_discrete <- function(...) { 31 | scale_fill_manual(..., values = colours_theme) 32 | } 33 | 34 | pf_order <- c("Portfolio-ML", 35 | "Multiperiod-ML", "Multiperiod-ML*", 36 | "Static-ML", "Static-ML*", 37 | "Market", "1/N", "Minimum Variance", "Factor-ML", "Rank-ML", "Markowitz-ML") 38 | pf_order_new <- c("Portfolio-ML", "Multiperiod-ML*", "Static-ML*", "Multiperiod-ML", "Static-ML", "Markowitz-ML", "Rank-ML", 39 | "Factor-ML", "Minimum Variance", "1/N", "Market") 40 | main_types <- c("Portfolio-ML", "Multiperiod-ML*", "Static-ML*", "Factor-ML", "Markowitz-ML") 41 | cluster_order <- c("Accruals", "Debt Issuance", "Investment", "Short-Term Reversal", "Value", 42 | "Low Risk", "Quality", "Momentum", "Profitability", "Profit Growth", 43 | "Seasonality", "Size", "Low Leverage") 44 | 45 | txt_size <- 10 46 | 47 | output_path_fig <- "Figures" 48 | format <- "pdf" 49 | fig_h <- 3.2 50 | fig_w <- 6.5 # 6.5 is roughly the page width 51 | 52 | # Setup ------------------------- 53 | # Data settings 54 | settings <- list( 55 | parallel = T, 56 | seed_no = 1, 57 | months = FALSE, 58 | split = list( 59 | train_end = as.Date("1970-12-31"), # Change to 1994-12-31? 60 | test_end = as.Date("2020-12-31"), 61 | val_years = 10, 62 | model_update_freq = "yearly", # In c("once", "yearly", "decade") 63 | train_lookback = 1000, # Set to a high number (e.g. 1000) if you want an expanding window 64 | retrain_lookback = 1000 # Set to a high number (e.g. 1000) if you want an expanding window 65 | ), 66 | feat_prank = T, # Percentile rank features? 67 | ret_impute = "zero", # Impute missing returns with zero 68 | feat_impute = T, 69 | addition_n = 12, # Needs to be valid for the past n months to be included in the investment universe 70 | deletion_n = 12, # Exclude from universe after N periods where the stock is non-valid 71 | screens = list( 72 | start = as.Date("1952-12-31"), 73 | end = as.Date("2020-12-31"), 74 | feat_pct = 0.5, 75 | nyse_stocks = T 76 | ), 77 | pi = 0.1, # What is the price impact as a function of the daily trading volume? For example, if pi=0.1, then trading 1% of the daily dollar volume has an average impact of 0.1% 78 | rff = list( 79 | p_vec = 2^c(1:9), 80 | g_vec = exp(-3:-2), 81 | l_vec = c(0, exp(seq(-10, 10, length.out = 100))) # Before we also had exp(-100) but that's effectively zero 82 | ), 83 | pf = list( 84 | dates = list( 85 | start_year = 1971, 86 | end_yr = 2020, 87 | split_years = 10 88 | ), 89 | hps = list( 90 | cov_type = "cov_add", 91 | m1 = list( 92 | k = c(1, 2, 3), 93 | u = c(0.25, 0.5, 1), 94 | g = c(0, 1, 2), 95 | K = 12 96 | ), 97 | # m2 = list(l = c(0, exp(-3:3))), # c(0, exp(-10:3)) 98 | static = list( 99 | k = c(1/1, 1/3, 1/5), 100 | u = c(0.25, 0.5, 1), 101 | g = c(0, 1, 2) 102 | ) 103 | ) 104 | ), 105 | pf_ml = list( 106 | g_vec = exp(-3:-2), 107 | p_vec = c(2^(6:9)), 108 | l_vec = c(0, exp(seq(-10, 10, length.out = 100))), # Vastly expanded the search set for lambda 109 | orig_feat = F, # Should original features be added to the list of random features? 110 | scale = T 111 | ), 112 | ef = list( 113 | wealth = c(1, 1e9, 1e10, 1e11), 114 | gamma_rel = c(1, 5, 10, 20, 100) 115 | ), 116 | cov_set = list( 117 | industries = T, # Should we include industry dummies in the covariance matrix?
Not feasible when we look across size groups (because not all industries have observations) 118 | obs = 252*10, # Check tibble(t = 1:(settings$cov_set$obs), w = (0.5^(1/(settings$cov_set$hl_cor)))^t) %>% ggplot(aes(t, w)) + geom_point() + geom_hline(yintercept = 0) 119 | hl_cor = 252*3/2, # Barra uses 48 months as half-life, seems high. https://www.alacra.com/alacra/help/barra_handbook_GEM.pdf 120 | hl_var = 252/2, 121 | hl_stock_var = 252/2, 122 | min_stock_obs = 252, 123 | initial_var_obs = 21*3 124 | ), 125 | factor_ml = list( 126 | n_pfs = 10 127 | ) 128 | ) 129 | 130 | set.seed(settings$seed_no) 131 | 132 | # Portfolio settings ----------- 133 | pf_set <- list( 134 | wealth = 1e10, 135 | gamma_rel = 10, 136 | mu = 0.007, # How much do we expect the portfolio to grow each month? market[year(eom_ret)>=1980, mean(mkt_vw_exc, na.rm=T)] 137 | # aim_hor = max_t_hor-1, 138 | lb_hor = 11 # Change?? 139 | ) 140 | 141 | # Features --------------------- 142 | features <- c( 143 | "age", "aliq_at", "aliq_mat", "ami_126d", 144 | "at_be", "at_gr1", "at_me", "at_turnover", 145 | "be_gr1a", "be_me", "beta_60m", "beta_dimson_21d", 146 | "betabab_1260d", "betadown_252d", "bev_mev", "bidaskhl_21d", 147 | "capex_abn", "capx_gr1", "capx_gr2", "capx_gr3", 148 | "cash_at", "chcsho_12m", "coa_gr1a", "col_gr1a", 149 | "cop_at", "cop_atl1", "corr_1260d", "coskew_21d", 150 | "cowc_gr1a", "dbnetis_at", "debt_gr3", "debt_me", 151 | "dgp_dsale", "div12m_me", "dolvol_126d", "dolvol_var_126d", 152 | "dsale_dinv", "dsale_drec", "dsale_dsga", "earnings_variability", 153 | "ebit_bev", "ebit_sale", "ebitda_mev", "emp_gr1", 154 | "eq_dur", "eqnetis_at", "eqnpo_12m", "eqnpo_me", 155 | "eqpo_me", "f_score", "fcf_me", "fnl_gr1a", 156 | "gp_at", "gp_atl1", "ival_me", "inv_gr1", 157 | "inv_gr1a", "iskew_capm_21d", "iskew_ff3_21d", "iskew_hxz4_21d", 158 | "ivol_capm_21d", "ivol_capm_252d", "ivol_ff3_21d", "ivol_hxz4_21d", 159 | "kz_index", "lnoa_gr1a", "lti_gr1a", "market_equity", 160 | "mispricing_mgmt", "mispricing_perf", "ncoa_gr1a", "ncol_gr1a", 161 | "netdebt_me", "netis_at", "nfna_gr1a", "ni_ar1", 162 | "ni_be", "ni_inc8q", "ni_ivol", "ni_me", 163 | "niq_at", "niq_at_chg1", "niq_be", "niq_be_chg1", 164 | "niq_su", "nncoa_gr1a", "noa_at", "noa_gr1a", 165 | "o_score", "oaccruals_at", "oaccruals_ni", "ocf_at", 166 | "ocf_at_chg1", "ocf_me", "ocfq_saleq_std", "op_at", 167 | "op_atl1", "ope_be", "ope_bel1", "opex_at", 168 | "pi_nix", "ppeinv_gr1a", "prc", "prc_highprc_252d", 169 | "qmj", "qmj_growth", "qmj_prof", "qmj_safety", 170 | "rd_me", "rd_sale", "rd5_at", "resff3_12_1", 171 | "resff3_6_1", "ret_1_0", "ret_12_1", "ret_12_7", 172 | "ret_3_1", "ret_6_1", "ret_60_12", "ret_9_1", 173 | "rmax1_21d", "rmax5_21d", "rmax5_rvol_21d", "rskew_21d", 174 | "rvol_21d", "sale_bev", "sale_emp_gr1", "sale_gr1", 175 | "sale_gr3", "sale_me", "saleq_gr1", "saleq_su", 176 | "seas_1_1an", "seas_1_1na", "seas_11_15an", "seas_11_15na", 177 | "seas_16_20an", "seas_16_20na", "seas_2_5an", "seas_2_5na", 178 | "seas_6_10an", "seas_6_10na", "sti_gr1a", "taccruals_at", 179 | "taccruals_ni", "tangibility", "tax_gr1a", "turnover_126d", 180 | "turnover_var_126d", "z_score", "zero_trades_126d", "zero_trades_21d", 181 | "zero_trades_252d", 182 | "rvol_252d" 183 | ) 184 | # Exclude features without sufficient coverage 185 | feat_excl <- c("capex_abn", "capx_gr2", "capx_gr3", "debt_gr3", "dgp_dsale", 186 | "dsale_dinv", "dsale_drec", "dsale_dsga", "earnings_variability", "eqnetis_at", 187 | "eqnpo_me", "eqpo_me", "f_score", "iskew_hxz4_21d",
"ivol_hxz4_21d", 188 | "netis_at", "ni_ar1", "ni_inc8q", "ni_ivol", "niq_at", "niq_at_chg1", "niq_be", 189 | "niq_be_chg1", "niq_su", "ocfq_saleq_std", "qmj", "qmj_growth", "rd_me", 190 | "rd_sale", "rd5_at", "resff3_12_1", "resff3_6_1", "sale_gr3", "saleq_gr1", 191 | "saleq_su", "seas_16_20an", "seas_16_20na", "sti_gr1a", "z_score") 192 | features <- features[!(features %in% feat_excl)] 193 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | This repository contains the code used for the paper [Machine Learning and the Implementable Efficient Frontier](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4187217) by Jensen, Kelly, Malamud, and Pedersen (2024). Please cite this paper if you are using the code: 3 | ``` 4 | @article{JensenKellyMalamudPedersen2024, 5 | author = {Jensen, Theis Ingerslev and Kelly, Bryan and Malamud, Semyon and Pedersen, Lasse Heje}, 6 | title = {Machine Learning and the Implementable Efficient Frontier}, 7 | year = {2024} 8 | } 9 | ``` 10 | 11 | Please send questions about the code to Theis I. Jensen at [theis.jensen@yale.edu](mailto:theis.jensen@yale.edu). 12 | 13 | 14 | # How to run the code 15 | To run the code, clone this repo to your local computing environment, and follow the steps explained below. We note that replicating our analysis requires substantial computational resources, and the code is set up to be executed on a high performance computing cluster with a SLURM scheduler. 16 | 17 | ## Data 18 | You need eight data sets to run the code. 19 | - `usa.csv` 20 | - Firm characteristics at a monthly frequency from the paper [Is There a Replication Crisis in Finance?](https://onlinelibrary.wiley.com/doi/10.1111/jofi.13249) by Jensen, Kelly, and Pedersen (2023) 21 | - Download from [WRDS](https://wrds-www.wharton.upenn.edu/pages/get-data/contributed-data-forms/global-factor-data/). 
To get US data, require that the column `excntry` is equal to "USA" 22 | - `usa_dsf.csv` 23 | - Stock returns at a daily frequency 24 | - The data can be generated by following the instructions from the [GitHub repository](https://github.com/bkelly-lab/ReplicationCrisis/tree/master/GlobalFactors) from "Is There a Replication Crisis in Finance?" Alternatively, you can request the data from us by sending an email to [theis.jensen@yale.edu](mailto:theis.jensen@yale.edu) 25 | - `world_ret_monthly.csv` 26 | - Stock returns at a monthly frequency 27 | - The data can be generated by following the instructions from the [GitHub repository](https://github.com/bkelly-lab/ReplicationCrisis/tree/master/GlobalFactors) from "Is There a Replication Crisis in Finance?" Alternatively, you can request the data from us by sending an email to [theis.jensen@yale.edu](mailto:theis.jensen@yale.edu) 28 | - `Factor Details.xlsx` 29 | - Information about factor characteristics from "Is There a Replication Crisis in Finance" 30 | - Download from [GitHub/bkelly-lab/ReplicationCrisis/GlobalFactors/Factor Details.xlsx](https://github.com/bkelly-lab/ReplicationCrisis/blob/master/GlobalFactors/Factor%20Details.xlsx) 31 | - `Cluster Labels.csv` 32 | - Information about factor characteristics from "Is There a Replication Crisis in Finance" 33 | - Download from [GitHub/bkelly-lab/ReplicationCrisis/GlobalFactors/Cluster Labels.csv](https://github.com/bkelly-lab/ReplicationCrisis/blob/master/GlobalFactors/Cluster%20Labels.csv) 34 | - `market_returns.csv` 35 | - Market returns from "Is There a Replication Crisis in Finance" 36 | - Download from [Dropbox](https://www.dropbox.com/sh/xq278bryrj0qf9s/AABUTvTGok91kakyL07LKyQoa?dl=0) 37 | - `ff3_m.csv` 38 | - The Fama-French 3-factor model data (used to get the risk-free rate) 39 | - Download from [Kenneth French's data library](https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/F-F_Research_Data_Factors_CSV.zip) 40 | - `short_fees` 41 | - Short-selling fees based on the Markit Securities Finance Analytics - American Equities database. You can run the vast majority of the code without this data set (the exception being `6 - Short selling fees.R`) 42 | - Download from [WRDS](https://wrds-www.wharton.upenn.edu/pages/get-data/markit/markit-securities-finance-analytics-equities/american-equities/) 43 | 44 | These data sets should be saved in the `Data` folder with the exact names used above (the short R sketch further below can be used to verify that they are all in place). 45 | 46 | ## Data generation 47 | In this section, we'll go through the steps needed to implement the portfolio choice methods we use in the paper. This step is by far the most computationally intensive. We used the [dSQ module](https://docs.ycrc.yale.edu/clusters-at-yale/job-scheduling/dsq/) to submit multiple jobs at the same time to a Slurm scheduler. Below, we include our dSQ calls to give you a sense of the computational resources required to run each step. 48 | 49 | ### Return prediction models 50 | - What: Estimate the 12 models used to predict realized returns at time t+1, t+2, ..., t+12 51 | - dSQ call: 52 | ``dsq --job-file Joblists/joblist_models.txt --cpus-per-task=32 --mem=100G --partition=day -t 06:00:00 --mail-type ALL --output slurm_output/dsq-joblist_models-%A_%1a-%N.out``.
53 | This call will start 12 independent jobs, which for us took a maximum of 5 hours and required approximately 75GB RAM for each job 54 | - Main R script: `slurm_fit_models.R` 55 | - Output folder: `Data/Generated/Models` 56 | 57 | ### Portfolios: base case 58 | - What: Implement portfolio choice methods with the base case parameters used for tables 2-4 and figures 2-4 and D.4 59 | - dSQ call: `dsq --job-file Joblists/joblist_pfchoice_base.txt --cpus-per-task=48 --mem=60G --partition=week -t 1-10:00:00 --mail-type ALL 60 | --output slurm_output/dsq-joblist_pfchoice-base-%A_%1a-%N.out`. 61 | This call will start 1 job, which for us took a maximum of 6 hours and required approximately 40GB RAM 62 | - Main R script: ``slurm_build_portfolios.R`` 63 | - Output folder: `Data/Generated/Portfolios` 64 | 65 | ### Portfolios: all 66 | - What: Implement the portfolio choice methods for all stocks, used for the top-left panel in Figure 8 67 | - dSQ call: `dsq --job-file Joblists/joblist_pfchoice_all.txt --cpus-per-task=32 --mem=100G --partition=week -t 5-00:00:00 --mail-type ALL 68 | --output slurm_output/dsq-joblist_pfchoice-all-%A_%1a-%N.out`. This call will start 1 job, which for us took a maximum of 2 days and 16 hours and required approximately 70GB RAM 69 | - Main R script: ``slurm_build_portfolios.R`` 70 | - Output folder: `Data/Generated/Portfolios` 71 | 72 | ### Portfolios: size groups 73 | - What: Implement the portfolio choice methods for stocks in different size groups, used for the remaining panels in Figure 8 74 | - dSQ call: `dsq --job-file Joblists/joblist_pfchoice_size.txt --cpus-per-task=16 --mem=50G --partition=day -t 8:00:00 --mail-type ALL 75 | --output slurm_output/dsq-joblist_pfchoice-size-%A_%1a-%N.out`. This call will start 5 jobs, which for us took a maximum of 5 hours and required approximately 30GB RAM for each job 76 | - Main R script: ``slurm_build_portfolios.R`` 77 | - Output folder: `Data/Generated/Portfolios` 78 | 79 | ### Implementable efficient frontier 80 | - What: Implement portfolio choice methods for different combinations of wealth and risk aversion to generate the implementable efficient frontier from Figure 1 81 | - dSQ call: `dsq --job-file Joblists/joblist_pfchoice_ief.txt --cpus-per-task=16 --mem=50G --partition=day -t 10:00:00 --mail-type ALL 82 | --output slurm_output/dsq-joblist_pfchoice-ief-%A_%1a-%N.out`. This call will start 20 independent jobs, which for us took a maximum of 7 hours and required approximately 40GB RAM for each job 83 | - Main R script: ``slurm_build_portfolios.R`` 84 | - Output folder: `Data/Generated/Portfolios` 85 | 86 | ### Economic feature importance 87 | - What: Implement the permutation-based feature importance analysis used for figures 5, 6, and D.3 88 | - dSQ call: `dsq --job-file Joblists/joblist_pfchoice_fi.txt --cpus-per-task=48 --mem=70G --partition=day -t 23:00:00 --mail-type ALL 89 | --output slurm_output/dsq-joblist_pfchoice-fi-%A_%1a-%N.out` 90 | This call will start 3 independent jobs, which for us took a maximum of 3 hours and required approximately 35GB RAM for each job 91 | - Main R script: `slurm_build_portfolios.R` 92 | - Output folder: `Data/Generated/Portfolios` 93 | 94 | ### Simulations 95 | - What: Implement simulations from Appendix Section E 96 | - dSQ call: `dsq --job-file Joblists/joblist_simulations.txt --cpus-per-task=32 --mem=50G --partition=day -t 10:00:00 --mail-type ALL 97 | --output slurm_output/dsq-joblist_simulations-%A_%1a-%N.out`.
98 | This call will start 15 independent jobs, which for us took a maximum of 9 hours and required approximately 25GB RAM for each job 99 | - Main R script: `simulations/simulations.R` 100 | - Output folder: `simulations/results` 101 | 102 | ## Data analysis 103 | After generating the data from the previous section, you can analyze it on your local PC. Specifically, you can generate all figures and tables from the paper by running the scripts below. Importantly, you need to go through each script and ensure it points to the correct files (the names of the files from the previous sections depend on when the code was submitted). 104 | 105 | You can generate the tables and figures from the paper by running the `separate_analysis_run.R` script. This script will generate the main data set, import the results from the previous section, and generate the figures and tables from the following scripts: 106 | 107 | - `6 - Implementable efficient frontier.R` 108 | - `6 - Base analysis.R` 109 | - `6 - Performance across size distribution.R` 110 | - `6 - Feature Importance.R` 111 | - `6 - Economic intuition.R` 112 | - `6 - Short selling fees.R` 113 | - `6 - RF Example.R` 114 | 115 | Finally, run the scripts below to save the figures and tables, as well as generate various numbers mentioned in the paper: 116 | 117 | - `7 - Figures.R` 118 | - `7 - Tables.R` 119 | - `7 - Numbers.R` 120 | 121 | After running these scripts, you should have the figures from the paper in the `Figures` folder and be able to copy-paste the tables in LaTeX format and the numbers from the console. 122 | -------------------------------------------------------------------------------- /config_files/config_models1.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | horizon = 1 -------------------------------------------------------------------------------- /config_files/config_models10.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | horizon = 10 -------------------------------------------------------------------------------- /config_files/config_models11.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | horizon = 11 -------------------------------------------------------------------------------- /config_files/config_models12.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | horizon = 12 -------------------------------------------------------------------------------- /config_files/config_models2.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | horizon = 2 -------------------------------------------------------------------------------- /config_files/config_models3.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | horizon = 3 -------------------------------------------------------------------------------- /config_files/config_models4.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | horizon = 4 -------------------------------------------------------------------------------- /config_files/config_models5.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | horizon = 5
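As a quick sanity check before submitting any of the jobs described in the README above, the minimal R sketch below verifies that the eight input data sets listed in the README's Data section are present in the `Data` folder. This is an editorial sketch, not part of the original scripts: it uses only base R, the file names are taken directly from the Data section, the `short_fees` entry is listed there without a file extension and should be adjusted to match the actual download, and the variable names are purely illustrative.

```r
# Minimal sketch (not part of the original repository scripts): check that the
# input data sets listed in the README's Data section are present in Data/.
# "short_fees" is listed without an extension in the README, so adjust that
# entry to match the actual file name of your Markit download.
required_files <- c(
  "usa.csv", "usa_dsf.csv", "world_ret_monthly.csv",
  "Factor Details.xlsx", "Cluster Labels.csv",
  "market_returns.csv", "ff3_m.csv", "short_fees"
)
found <- file.exists(file.path("Data", required_files))
if (all(found)) {
  message("All required input files were found in Data/.")
} else {
  warning("Missing input files: ", paste(required_files[!found], collapse = ", "))
}
```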
-------------------------------------------------------------------------------- /config_files/config_models6.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | horizon = 6 -------------------------------------------------------------------------------- /config_files/config_models7.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | horizon = 7 -------------------------------------------------------------------------------- /config_files/config_models8.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | horizon = 8 -------------------------------------------------------------------------------- /config_files/config_models9.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | horizon = 9 -------------------------------------------------------------------------------- /config_files/config_pfchoice_all.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "all" 3 | industry_cov = TRUE 4 | wealth = 1e10 5 | gamma_rel = 10 6 | update_mp = TRUE 7 | update_base = TRUE 8 | update_fi_base = FALSE 9 | update_fi_ief = FALSE 10 | update_fi_ret = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_base.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1e10 5 | gamma_rel = 10 6 | update_mp = TRUE 7 | update_base = TRUE 8 | update_fi_base = FALSE 9 | update_fi_ief = FALSE 10 | update_fi_ret = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_fi_base.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1e10 5 | gamma_rel = 10 6 | update_mp = FALSE 7 | update_base = FALSE 8 | update_fi_base = TRUE 9 | update_fi_ief = FALSE 10 | update_fi_ret = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_fi_ief.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1e10 5 | gamma_rel = 10 6 | update_mp = FALSE 7 | update_base = FALSE 8 | update_fi_base = FALSE 9 | update_fi_ief = TRUE 10 | update_fi_ret = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_fi_ret.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1e10 5 | gamma_rel = 10 6 | update_mp = FALSE 7 | update_base = FALSE 8 | update_fi_base = FALSE 9 | update_fi_ief = FALSE 10 | update_fi_ret = TRUE 11 | -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w1g1.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = 
TRUE 4 | wealth = 1 5 | gamma_rel = 1 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w1g2.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1 5 | gamma_rel = 5 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w1g3.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1 5 | gamma_rel = 10 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w1g4.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1 5 | gamma_rel = 20 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w1g5.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1 5 | gamma_rel = 100 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w2g1.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1e9 5 | gamma_rel = 1 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w2g2.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1e9 5 | gamma_rel = 5 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w2g3.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1e9 5 | gamma_rel = 10 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w2g4.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1e9 5 | gamma_rel = 20 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w2g5.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1e9 5 | gamma_rel = 100 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w3g1.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = 
"perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1e10 5 | gamma_rel = 1 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w3g2.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1e10 5 | gamma_rel = 5 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w3g3.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1e10 5 | gamma_rel = 10 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w3g4.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1e10 5 | gamma_rel = 20 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w3g5.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1e10 5 | gamma_rel = 100 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w4g1.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1e11 5 | gamma_rel = 1 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w4g2.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1e11 5 | gamma_rel = 5 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w4g3.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1e11 5 | gamma_rel = 10 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w4g4.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1e11 5 | gamma_rel = 20 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w4g5.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1e11 5 | gamma_rel = 100 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_size1.txt: 
-------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low00_high20_min50" 3 | wealth = 1e10 4 | industry_cov = FALSE 5 | gamma_rel = 10 6 | update_mp = TRUE -------------------------------------------------------------------------------- /config_files/config_pfchoice_size2.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low20_high40_min50" 3 | wealth = 1e10 4 | industry_cov = FALSE 5 | gamma_rel = 10 6 | update_mp = TRUE -------------------------------------------------------------------------------- /config_files/config_pfchoice_size3.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low40_high60_min50" 3 | wealth = 1e10 4 | industry_cov = FALSE 5 | gamma_rel = 10 6 | update_mp = TRUE -------------------------------------------------------------------------------- /config_files/config_pfchoice_size4.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low60_high80_min50" 3 | wealth = 1e10 4 | industry_cov = FALSE 5 | gamma_rel = 10 6 | update_mp = TRUE -------------------------------------------------------------------------------- /config_files/config_pfchoice_size5.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low80_high100_min50" 3 | industry_cov = FALSE 4 | wealth = 1e10 5 | gamma_rel = 10 6 | update_mp = TRUE -------------------------------------------------------------------------------- /config_files/config_sim1.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | seed = 1 -------------------------------------------------------------------------------- /config_files/config_sim10.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | seed = 10 -------------------------------------------------------------------------------- /config_files/config_sim11.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | seed = 11 -------------------------------------------------------------------------------- /config_files/config_sim12.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | seed = 12 -------------------------------------------------------------------------------- /config_files/config_sim13.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | seed = 13 -------------------------------------------------------------------------------- /config_files/config_sim14.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | seed = 14 -------------------------------------------------------------------------------- /config_files/config_sim15.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | seed = 15 -------------------------------------------------------------------------------- /config_files/config_sim2.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | seed = 2 
-------------------------------------------------------------------------------- /config_files/config_sim3.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | seed = 3 -------------------------------------------------------------------------------- /config_files/config_sim4.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | seed = 4 -------------------------------------------------------------------------------- /config_files/config_sim5.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | seed = 5 -------------------------------------------------------------------------------- /config_files/config_sim6.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | seed = 6 -------------------------------------------------------------------------------- /config_files/config_sim7.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | seed = 7 -------------------------------------------------------------------------------- /config_files/config_sim8.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | seed = 8 -------------------------------------------------------------------------------- /config_files/config_sim9.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | seed = 9 -------------------------------------------------------------------------------- /ewma.cpp: -------------------------------------------------------------------------------- 1 | #include <Rcpp.h> 2 | using namespace Rcpp; 3 | 4 | // [[Rcpp::export]] 5 | NumericVector ewma_c(NumericVector x, double lambda, int start) { 6 | int n = x.size(); 7 | int na = 0; 8 | double initial_value = 0; 9 | NumericVector var_vec(n, NA_REAL); 10 | 11 | if(n <= start) { 12 | return var_vec; 13 | } 14 | 15 | // Calculate simple variance to get started (assuming mean = 0) 16 | for(int i = 0; i < start; ++i) { 17 | if(NumericVector::is_na(x(i))) { 18 | na += 1; 19 | } else { 20 | initial_value += pow(x(i), 2); 21 | } 22 | } 23 | initial_value = initial_value / (start - 1 - na); 24 | 25 | // Iteratively Updating EWMA Vol 26 | var_vec(start) = initial_value; 27 | for(int j = start + 1; j < n; ++j) { 28 | if(NumericVector::is_na(x(j - 1))) { 29 | var_vec(j) = var_vec(j - 1); 30 | } else { 31 | var_vec(j) = lambda * var_vec(j - 1) + (1 - lambda) * pow(x(j - 1), 2); 32 | } 33 | } 34 | 35 | return sqrt(var_vec); 36 | } 37 | 38 | 39 | -------------------------------------------------------------------------------- /ml-and-the-implementable-efficient-frontier.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /separate_analysis_run.R: -------------------------------------------------------------------------------- 1 | # Load packages and settings --------------------------------------------------- 2 | source("Main.R") 3 | 4 | get_from_path_model <-
"Data/Generated/Models/20240424-all" # From which directory should the data be loaded? (if update$x==F) 5 | run_sub <- F # T when testing code, F otherwise 6 | 7 | # Screens for main sample ------------------------------------------------------ 8 | settings$screens$size_screen <- "perc_low50_high100_min50" 9 | pf_set$wealth <- 1e10 10 | pf_set$gamma_rel <- 10 11 | 12 | # Generate data for analysis --------------------------------------------------- 13 | source("1 - Prepare Data.R", echo = T) 14 | source("3 - Estimate Covariance Matrix.R", echo = T) 15 | source("4 - Prepare Portfolio Data.R", echo = T) 16 | 17 | # Create benchmark portfolio objects ------------------------------------------- 18 | # This repeats some of the data in 5 - Base case.R, without re-doing the computationally heavy parts 19 | # Markowitz-ML 20 | tpf <- chars |> tpf_implement(cov_list = barra_cov, wealth = wealth, dates = dates_oos, gam = pf_set$gamma_rel) 21 | 22 | # Factor-ML 23 | factor_ml <- chars %>% factor_ml_implement(dates = dates_oos, n_pfs = settings$factor_ml$n_pfs, wealth = wealth, gam = pf_set$gamma_rel) 24 | 25 | # Market 26 | mkt <- chars %>% mkt_implement(dates = dates_oos, wealth = wealth) 27 | 28 | # 1/n 29 | ew <- chars %>% ew_implement(dates = dates_oos, wealth = wealth) 30 | 31 | # Fama-MacBeth / Rank weighted portfolios 32 | rw <- chars %>% rw_implement(dates = dates_oos, wealth = wealth) 33 | 34 | # Minimum variance 35 | mv <- chars %>% mv_implement(cov_list = barra_cov, dates = dates_oos, wealth = wealth) 36 | 37 | # Generate results from analysis ----------------------------------------------- 38 | source("6 - Implementable efficient frontier.R", echo = T) 39 | source("6 - Base analysis.R", echo = T) 40 | source("6 - Performance across size distribution.R", echo = T) 41 | source("6 - Feature Importance.R", echo = T) 42 | source("6 - Economic intuition.R", echo = T) 43 | source("6 - Short selling fees.R", echo = T) 44 | source("6 - RF Example.R", echo = T) 45 | -------------------------------------------------------------------------------- /simulations/sim_functions.R: -------------------------------------------------------------------------------- 1 | # Simulate data -------- 2 | sim_prep_data <- function(t, n, disp_x, tv_sds, tv_thetas, tv_rps, dolvol, dates_full, rf, seed, add_noise_feat=T, feat_standardize=T) { 3 | set.seed(seed) 4 | # Economy ------------------------------- 5 | dt <- 1 # Time increment 6 | mu_x <- 1 # Long-term mean of x1 7 | rpx <- 0.05 # Annual risk premium of X 8 | n_tv <- length(tv_sds) 9 | features <- c("x", paste0("tv", 1:n_tv)) 10 | 11 | # One constant factor that determines covariances 12 | X <- rep(rnorm(n = n, mean = mu_x, sd = disp_x), t) |> matrix(nrow = n, ncol = t) |> t() 13 | 14 | # Multiple time-varying return predicting factors, unrelated to covariances 15 | tv_list <- 1:n_tv |> map(function(k) { 16 | tv_start <- rnorm(n = n, mean = 0, sd = tv_sds[k]) # Initial value 17 | tv <- matrix(NA, nrow = t, ncol = n) 18 | tv[1,] <- tv_start 19 | for (j in 2:t) { 20 | tv[j,] <- tv[j-1,] + (1-tv_thetas[k]) * (0 - tv[j-1,]) * dt + tv_sds[k] * rnorm(n = n, mean = 0, sd = sqrt(dt)) 21 | } 22 | return(tv) 23 | }) 24 | 25 | # Simulate return -------------------- 26 | sig_m <- 0.2 27 | sig_e <- 0.4 28 | ivol <- diag(sig_e^2/12, nrow=n) 29 | mkt_ret <- rnorm(n=t, mean=0, sd = sig_m/sqrt(12)) # Demeaned market return 30 | e_ret <- matrix(rnorm(n*t, sd = sig_e/sqrt(12)), ncol = n, nrow = t) 31 | data_tc <- 1:t |> map(function(j) { 32 | x <- X[j,] 33 | # Only x is related to 
sigma 34 | sigma <- x %*% as.matrix(sig_m^2/12) %*% t(x) + ivol 35 | # Both x and TV factors are related to the expected return 36 | mu <- rpx/12*x 37 | for (k in 1:n_tv) { 38 | mu <- mu + tv_rps[k]/12*tv_list[[k]][j,] 39 | } 40 | # Realized returns 41 | if (F) { 42 | r <- mvrnorm(n = 1, mu = mu, Sigma = sigma) 43 | } else { 44 | r <- mu + x*mkt_ret[j] + e_ret[j, ] 45 | } 46 | w_tpf <- drop(solve(sigma) %*% mu) 47 | data <- data.table(eom=dates_full[j], id=1:n, x=x, ret_ld1 = r, er_true = mu, sr = mu/diag(sigma)^0.5, w_tpf=w_tpf) 48 | for (k in 1:n_tv) { 49 | data[, paste0("tv", k) := tv_list[[k]][j,]] 50 | } 51 | return(data) 52 | }) |> rbindlist() 53 | if (add_noise_feat) { 54 | noise_n <- 5 55 | for (i in 1:noise_n) { 56 | data_tc[, paste0("feat", i) := rnorm(t*n)] 57 | } 58 | features <- c(features, paste0("feat", 1:noise_n)) 59 | } 60 | data_tc[, me := 1] 61 | 62 | # Lambda (consider it fixed for now) -- 63 | data_tc[, lambda := 0.02/dolvol] # Assumes that all stocks have same daily trading volume 64 | # data_tc[, ret_ld1 := shift(r, 1, type="lead"), by=id] 65 | data_tc[, tr_ld1 := ret_ld1+rf] 66 | data_tc[, tr_ld0 := lag(tr_ld1, 1), by = id] 67 | data_tc[eom==min(eom), tr_ld0 := 0] # hack 68 | # data_tc[, pred_ld1 := 0.05/12] 69 | # data_tc[, pred_ld1 := er_true] 70 | for (h in 1:12) { 71 | # data_tc[, paste0("pred_ld", h) := rp1/12*(theta^(h-1)*x1+(1-theta^(h-1))*mu_x1) + rp2/12*(theta^(h-1)*x2+(1-theta^(h-1))*mu_x2)] # Risk premium times expected x: 72 | # data_tc[, paste0("pred_ld", h) := rp1/12*x1 + rp2/12*(theta^(h-1)*x2+(1-theta^(h-1))*mu_x2)] # Risk premium times expected x: 73 | data_tc[, paste0("pred_ld", h) := rpx/12*x] 74 | for (k in 1:n_tv) { 75 | data_tc[, paste0("pred_ld", h) := get(paste0("pred_ld", h)) + tv_rps[k]/12*(tv_thetas[k]^(h-1)*get(paste0("tv", k))+(1-tv_rps[k]^(h-1))*0)] 76 | } 77 | } 78 | print(data_tc[, c("id", "eom", paste0("pred_ld", 1:12)), with=F][id<=5 & eom==max(eom)] |> pivot_longer(-c(id, eom)) |> mutate(ld = str_remove(name, "pred_ld") |> as.integer()) |> ggplot(aes(ld, value, colour=factor(id))) + geom_point() + geom_line() + 79 | labs(title=paste0("Expected return for ID 1:5 on last date, tv thetas: ", paste0(tv_thetas, collapse="-")))) 80 | # Prepare for multiperiod-ML 81 | data_tc[, pred_ld2_6 := rowMeans(.SD), .SDcols = paste0("pred_ld", 2:6)] 82 | data_tc[, pred_ld7_12 := rowMeans(.SD), .SDcols = paste0("pred_ld", 7:12)] 83 | if (FALSE) { 84 | data_tc[id<=2, c("id", "eom", paste0("pred_ld", 1:4))][year(eom) %in% 2020:2021] |> pivot_longer(-c(id,eom)) |> ggplot(aes(eom, value, colour=name)) + geom_line() + facet_wrap(~id) + geom_hline(yintercept = rpx/12) 85 | data_tc[id<=2, c("id", "eom", "pred_ld1", "pred_ld2_6", "pred_ld7_12")][year(eom) %in% 2020:2021] |> pivot_longer(-c(id,eom)) |> ggplot(aes(eom, value, colour=name)) + geom_line() + facet_wrap(~id) + geom_hline(yintercept = rpx/12) 86 | } 87 | # Standardize features 88 | if (feat_standardize) { 89 | for(f in features) { 90 | data_tc[, (f) := ecdf(get(f))(get(f)), by = eom] 91 | } 92 | } 93 | # For existing functions 94 | data_tc[, valid := T] 95 | data_tc[, mu_ld0 := 0] #pf_set$mu 96 | data_tc[, eom_ret := eom+1+months(1)-1] 97 | # Create list of sigmas 98 | barra_cov <- 1:t |> map(function(i) { 99 | fct_load <- as.matrix(X[i,]) 100 | rownames(fct_load) <- as.character(1:n) 101 | ivol_vec <- diag(ivol) 102 | names(ivol_vec) <- as.character(1:n) 103 | fct_cov <- as.matrix(sig_m^2/12) 104 | list("fct_load"=fct_load, "fct_cov"=fct_cov, "ivol_vec"=ivol_vec) 105 | }) 106 | names(barra_cov) 
<- dates_full 107 | # Output 108 | list("barra_cov"=barra_cov, "data"=data_tc, "features"=features) 109 | } 110 | -------------------------------------------------------------------------------- /simulations/sim_results.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(data.table) 3 | 4 | # Layout settings ---------------------------------------------- 5 | theme_set(theme_classic(base_size = 13)) 6 | colours_theme <- c("#0C6291", "#A63446", RColorBrewer::brewer.pal(8, "Dark2"), 7 | "darkslategrey", "blue3", "red3", "purple2", "yellow2", "aquamarine", 8 | "grey", "salmon", "antiquewhite", "chartreuse") 9 | scale_colour_discrete <- function(...) { 10 | scale_colour_manual(..., values = colours_theme) 11 | } 12 | scale_fill_discrete <- function(...) { 13 | scale_fill_manual(..., values = colours_theme) 14 | } 15 | 16 | # Aggregate results ------------------------------------------- 17 | files <- list.files("simulations/results") 18 | files <- files[str_detect(files, "stocks500") & str_detect(files, "-05-18")] # | str_detect(files, "-05-19") 19 | 20 | results <- files |> map(function(f) { 21 | fread(paste0("simulations/results/", f)) 22 | }) |> rbindlist() 23 | if (FALSE) { 24 | # Forgot to add theta in run on april 17th 25 | results <- 1:length(files) |> map(function(i) { 26 | fread(paste0("simulations/results/", files[i])) |> mutate(theta1=search_grid$theta1[i], theta2=search_grid$theta2[i]) 27 | }) |> rbindlist() 28 | } 29 | 30 | # Results ----------------------------------------------------- 31 | results[, e_var_adj := (r-mean(r))^2, by=.(run, seed, type, w, theta1, theta2, n)] 32 | results[, utility_t := r-tc-0.5*e_var_adj*10] 33 | 34 | results_overall <- results[, .( 35 | n_obs = .N, 36 | inv = mean(inv), 37 | shorting = mean(shorting), 38 | turnover_notional = mean(turnover), 39 | r = mean(r)*12, 40 | sd = sd(r)*sqrt(12), 41 | sr_gross = mean(r)/sd(r)*sqrt(12), 42 | tc = mean(tc)*12, 43 | r_tc = mean((r-tc))*12, 44 | sr = mean(r-tc)/sd(r)*sqrt(12), 45 | obj = (mean(r)-0.5*var(r)*10-mean(tc))*12#, 46 | # obj_exp = (mean(e_r)-0.5*mean(e_var)*10-mean(tc))*12 47 | ), by = .(run, seed, type, w, theta1, theta2, n)] 48 | results_overall <- results_overall[type != "Static-ML"] 49 | results_overall[type=="Multiperiod-ML*", type := "Multiperiod-ML"] # No tuning, so no need for "*" 50 | 51 | # Utility ---------------- 52 | (output$simulations <- results_overall |> 53 | filter(!(theta1==0 & theta2==0.95)) |> 54 | # filter(w != 1e8) |> # Effectively zero cost at this level (need to re-check with new data) 55 | group_by(run, w, type, theta1, theta2, n) |> 56 | summarise( 57 | n = n(), 58 | obj = median(obj), 59 | high = max(obj), 60 | low = min(obj) 61 | ) |> 62 | mutate( 63 | theta = paste0("theta1=", theta1, ", theta2=", theta2) 64 | ) |> 65 | # filter(type!="Static-ML*") |> 66 | # filter(type!="Static-ML") |> 67 | ggplot(aes(x=factor(w/1e9), y=obj, colour=type, group=type)) + 68 | geom_point() + 69 | geom_line() + 70 | geom_hline(yintercept = 0, linetype="dashed", colour="black") + 71 | coord_cartesian(ylim = c(-0.01, NA )) + 72 | facet_wrap(~theta, nrow = 1, scales="free_y") + 73 | labs(x = "Wealth by 2020 ($Billion)", y = "Realized utility") + 74 | theme( 75 | legend.position = "top", 76 | legend.title = element_blank(), 77 | )) 78 | 79 | if (FALSE) { 80 | # Old scribbles --------------- 81 | # Relevant range: One fast, one slow 82 | results_overall |> 83 | filter(theta1==0 & theta2==0.95) |> 84 | group_by(seed, w, type, 
theta1, theta2, n) |> 85 | group_by(run, w, type, theta1, theta2, n) |> 86 | summarise( 87 | n = n(), 88 | obj = median(obj), 89 | high = max(obj), 90 | low = min(obj) 91 | ) |> 92 | mutate( 93 | theta = paste0("Theta1=", theta1, ", theta2=", theta2) 94 | ) |> 95 | # filter(type!="Static-ML*") |> 96 | # filter(type!="Static-ML") |> 97 | ggplot(aes(x=factor(w), y=obj, colour=type, group=type)) + 98 | geom_point() + 99 | geom_line() + 100 | coord_cartesian(ylim = c(-0.01, NA )) + 101 | facet_wrap(~theta, ncol = 2, scales="free_y") + 102 | labs(x = "Wealth", y = "Realized utility OOS") + 103 | theme( 104 | legend.position = "top", 105 | legend.title = element_blank(), 106 | ) 107 | 108 | results_overall |> 109 | # group_by(w, type, theta, n) |> 110 | # summarise( 111 | # obj = median(obj), 112 | # high = max(obj), 113 | # low = min(obj) 114 | # ) |> 115 | # filter(type!="Static-ML*") |> 116 | # filter(type!="Static-ML") |> 117 | ggplot(aes(x=factor(w), y=obj, colour=type, group=type)) + 118 | geom_point(alpha=0.5, size=1) + 119 | geom_line() + 120 | coord_cartesian(ylim = c(-0.2, NA )) + 121 | facet_wrap(~theta, ncol = 3, scales="free_y") 122 | 123 | # Turnover --------------- 124 | results_overall |> 125 | ggplot(aes(x=factor(w), y=turnover_notional, colour=type, group=type)) + 126 | geom_point() + 127 | geom_line() + 128 | facet_wrap(~theta) + 129 | labs(x = "Wealth", y = "Realized turnover") + 130 | theme( 131 | legend.position = "top", 132 | legend.title = element_blank(), 133 | ) 134 | 135 | 136 | # Gross Sharpe ratio ------------ 137 | results_overall |> 138 | ggplot(aes(x=factor(w), y=sr_gross, colour=type, group=type)) + 139 | geom_point() + 140 | geom_line() + 141 | facet_wrap(~theta) + 142 | labs(x = "Wealth", y = "Realized gross Sharpe ratio") + 143 | theme( 144 | legend.position = "top", 145 | legend.title = element_blank(), 146 | ) 147 | 148 | results[n==500 & theta==0.05 & w==1e11][, cum_u := cumsum(utility_t)] |> 149 | ggplot(aes(eom_ret, cum_u, colour=type)) + 150 | geom_point() + 151 | geom_line() 152 | 153 | results[n==100 & theta==0.95 & w==1e11][, cum_u := cumsum(r)] |> 154 | ggplot(aes(eom_ret, cum_u, colour=type)) + 155 | geom_point() + 156 | geom_line() 157 | 158 | 159 | # Test: Expected utility from trade ---------------------------- 160 | # For Static and TPF, compute expected change in utility from trade 161 | start_end_u_fun <- function(ws) { 162 | ds <- unique(ws$eom) 163 | ds |> map(function(d) { 164 | w <- ws[eom==d] 165 | er <- data_tc_list[[as.character(d)]]$er 166 | sigma_gam <- data_tc_list[[as.character(d)]]$sigma_gam 167 | lambda <- data_tc_list[[as.character(d)]]$lambda 168 | wealth_t <- wealth[eom == d]$wealth 169 | start <- w$w_start 170 | end <- w$w 171 | u_start <- start %*% er - 1/2*(t(start) %*% sigma_gam %*% start) 172 | u_end <- end %*% er - 1/2*(t(end) %*% sigma_gam %*% end) 173 | tc <- wealth_t/2 * (end-start) %*% lambda %*% (end-start) 174 | data.table(eom=d, u_start=drop(u_start), 175 | u_end=drop(u_end), 176 | tc=drop(tc)) 177 | }) |> rbindlist() 178 | } 179 | 180 | static$w |> start_end_u_fun() |> mutate(trade_imp = u_end-u_start-tc) 181 | tpf$w |> start_end_u_fun() |> mutate(trade_imp = u_end-u_start-tc) 182 | 183 | # Analysis ----------------------------------------------------- 184 | # Mean-variance efficient portfolio OOS 185 | data_tc[eom %in% dates_oos, .(rr = sum(ret_ld1*w_tpf), er = sum(mu*w_tpf)), by=eom][, .( 186 | n = .N, 187 | mean_er = mean(er)*12, 188 | mean_rr = mean(rr)*12, 189 | sd_rr = sd(rr)*sqrt(12) 190 | )][, sr := 
mean_rr/sd_rr][] 191 | 192 | # Mean-variance efficient portfolio FULL 193 | data_tc[, .(rr = sum(ret_ld1*w_tpf), er = sum(mu*w_tpf)), by=eom][, .( 194 | n = .N, 195 | mean_er = mean(er)*12, 196 | mean_rr = mean(rr)*12, 197 | sd_rr = sd(rr)*sqrt(12) 198 | )][, sr := mean_rr/sd_rr][] 199 | 200 | data_tc[, .(rr = sum(ret_ld1*w_tpf), er = sum(mu*w_tpf)), by=eom][, .( 201 | n = .N, 202 | mean_er = mean(er)*12, 203 | mean_rr = mean(rr)*12, 204 | sd_rr = sd(rr)*sqrt(12) 205 | )][, sr := mean_rr/sd_rr][] 206 | 207 | 208 | data_tc[, .(rr = sum(r*w_tpf), er = sum(mu*w_tpf)), by=t][, cumret := cumsum(rr)] |> 209 | ggplot(aes(t, cumret)) + 210 | geom_point() 211 | # SR by stock (SR increase in x because of constant ivol) 212 | data_tc[, .( 213 | sr = mean(sr), 214 | x = mean(x) 215 | ), by = i][order(x)] 216 | 217 | 218 | data_tc[id <=2 & year(eom) %in% 2019:2020] |> ggplot(aes(eom, mu*12, colour=factor(id))) + geom_line() + geom_point() 219 | 220 | 221 | # Expected returns ------------- 222 | ers <- c(0, 0.95) |> map(function(theta) { 223 | rf <- 0 224 | risk_free <- data.table(eom=dates_full, rf=rf) 225 | # Inputs to prepare_data_list ---- 226 | wealth <- data.table( 227 | eom=dates_full, 228 | wealth=1e10, 229 | mu_ld1 = pf_set$mu) 230 | # Prepare data 231 | data_list <- sim_prep_data( 232 | t = t, 233 | n = 10, 234 | theta = theta, 235 | dates_full = dates_full, 236 | rf = rf, 237 | seed = set$seed_no 238 | ) 239 | data_list$data |> mutate(theta=theta) 240 | }, .progress = T) |> rbindlist() 241 | 242 | ers[id==1] |> 243 | ggplot(aes(eom, pred_ld1, colour=factor(id))) + 244 | geom_line() + 245 | facet_wrap(~theta) 246 | 247 | ers[, .(ar1=cor(pred_ld1, lag(pred_ld1, 1), use="complete.obs")), by = .(id, theta)] |> 248 | group_by(theta) |> 249 | summarise(ar1=mean(ar1)) |> 250 | ggplot(aes(theta, ar1)) + 251 | geom_point() + 252 | geom_abline(intercept=0, slope=1, linetype="dashed") 253 | 254 | # Trade aggresiveness of TPF ------------------------------- 255 | tpf_ws <- c(0, 0.95) |> map(function(theta) { 256 | rf <- 0 257 | risk_free <- data.table(eom=dates_full, rf=rf) 258 | # Inputs to prepare_data_list ---- 259 | wealth <- data.table( 260 | eom=dates_full, 261 | wealth=1e10, 262 | mu_ld1 = pf_set$mu) 263 | # Prepare data 264 | data_list <- sim_prep_data( 265 | t = t, 266 | n = 100, 267 | theta = theta, 268 | dates_full = dates_full, 269 | rf = rf, 270 | seed = set$seed_no 271 | ) 272 | # TPF 273 | data_tc_list <- data_list$data[valid==T] %>% prepare_data_list(dates=dates_oos, wealth=wealth, risk_free=risk_free, 274 | barra_cov = data_list$barra_cov, gamma_rel = pf_set$gamma_rel) 275 | tpf <- data_list$data %>% tpf_implement(data_list = data_tc_list, dates = dates_oos, wealth = wealth) 276 | # Add expected returns 277 | data_list$data[, .(id, eom, x1, x2, pred_ld1)][tpf$w, on = .(id, eom)] |> mutate(theta=theta) 278 | }, .progress = T) |> rbindlist() 279 | 280 | tpf_ws[id<=3] |> 281 | pivot_longer(c(w, x1, pred_ld1)) |> 282 | ggplot(aes(eom, value, colour=factor(id))) + 283 | geom_line() + 284 | facet_wrap(name~theta, scales="free_y", ncol = 2) 285 | 286 | tpf_ws[, .( 287 | turnover = sum(abs(w-w_start)) 288 | ), by = .(theta, eom)][, .(turnover = mean(turnover)), by = theta] 289 | } 290 | 291 | -------------------------------------------------------------------------------- /simulations/simulations.R: -------------------------------------------------------------------------------- 1 | # This file relative to simulation.R: Run for 1, rather than multiple seeds 2 | # TO DO 3 | # - Prepare 
portfolio-ML for real data 4 | library(MASS) 5 | library(expm) 6 | library(tidyverse) 7 | library(data.table) 8 | source("0 - General functions.R") 9 | source("0 - Return prediction functions.R") 10 | source("0 - Portfolio choice functions.R") 11 | Rcpp::sourceCpp("sqrtm_cpp.cpp") 12 | # Goal: Understand the performance of the different methods with a fixed DGP 13 | # Setup: One-factor that drives expected returns and covariances 14 | # Result: Evaluation of how well Static/Portfolio/multiperiod performs 15 | 16 | add_noise_feat <- T 17 | 18 | source("simulations/sim_functions.R") 19 | # source("simulations/m2_fun_new_2.R") 20 | # source("simulations/new_static_fun.R") 21 | # source("simulations/new_multi_fun.R") 22 | # source("simulations/memory_fun_new.R") 23 | 24 | # Submit a config file with slurm job? ----------- 25 | if (TRUE) { 26 | # Prepare for slurm to submit configuration files flexibly 27 | args <- commandArgs(trailingOnly = TRUE) 28 | config_file <- args[1] # Get the first argument, which is the config file name 29 | 30 | config_params <- read_config(config_file) 31 | } else { 32 | config_params <- list( 33 | "seed" = 14 34 | ) 35 | } 36 | 37 | # Settings ------- 38 | set <- list( 39 | seed_no = 1, 40 | split = list( 41 | train_end = as.Date("1970-12-31"), # Change to 1994-12-31? 42 | test_end = as.Date("2020-12-31"), 43 | val_years = 10 44 | ), 45 | pf = list( 46 | dates = list( 47 | start_year = 1971, 48 | end_yr = 2020, 49 | split_years = 10 50 | ), 51 | hps = list( 52 | cov_type = "cov_add", 53 | static = list( 54 | k = c(1/1, 1/6, 1/12), 55 | u = c(1), 56 | g = c(0) 57 | ), 58 | m1 = list( 59 | k = c(1), 60 | u = c(1), 61 | g = c(0) 62 | ) 63 | ) 64 | ), 65 | pf_ml = list( 66 | g_vec = exp(-3:-2), 67 | p_vec = c(2^(6:9)), 68 | l_vec = c(0, exp(seq(-10, 10, length.out = 100))), 69 | orig_feat = F, # Should original features be added to the list of random features? 70 | scale = T 71 | ) 72 | ) 73 | 74 | t <- 12*70 75 | 76 | # Important dates ---------------------- 77 | dates_full <- rev(set$split$test_end+1 - months(1:t))-1 78 | 79 | hp_years <- seq(from=set$pf$dates$start_year, to=set$pf$dates$end_yr) 80 | start_oos <- set$pf$dates$start_year+set$pf$dates$split_years 81 | 82 | dates_oos <- seq.Date(from = as.Date(paste0(start_oos, "-", "01-01")), to = set$split$test_end+1-months(1), by = "1 month")-1 83 | dates_hp <- seq.Date(from = as.Date(paste0(min(hp_years), "-", "01-01")), to = set$split$test_end+1-months(1), by = "1 month")-1 84 | 85 | # Investor settings ------------------ 86 | pf_set <- list( 87 | # wealth = 1e11, 88 | gamma_rel = 10, 89 | mu = 0, # How much do we expect the portfolio to grow each month? market[year(eom_ret)>=1980, mean(mkt_vw_exc, na.rm=T)] 90 | # aim_hor = max_t_hor-1, 91 | lb_hor = 11 # Change?? 
92 | ) 93 | 94 | # Simulation ------------------------------------- 95 | set$n_stocks <- 500 96 | search_grid <- expand.grid( 97 | w = 10^(8:12), 98 | theta1 = c(0, 0.95), 99 | theta2 = c(0, 0.95), 100 | n = set$n_stocks 101 | ) |> filter(!(theta1==0 & theta2==0.95)) # Effectively the same as theta1=0.95, theta2=0, so no need to run it 102 | 103 | do_static <- T 104 | do_tpf <- T 105 | do_mp <- T 106 | do_pf <- T 107 | 108 | # Takes 2-6 hours with 500 stocks, memory utilized is 7.54 GB 109 | # With 3 repetitions and 500 stocks, I suggest requesting 15GB for 23 hours 110 | # - It took between 8 and 14 hours with 32 CPUs 111 | search_params <- search_grid[config_params$seed, ] 112 | tictoc::tic(paste0("Run with parameters: wealth=", search_params$w, ", theta1=", search_params$theta1, ", theta2=", search_params$theta2, ", n=", search_params$n)) 113 | results <- 1:3 |> map(function(seed) { 114 | rf <- 0 115 | risk_free <- data.table(eom=dates_full, rf=rf) 116 | # Inputs to prepare_data_list ---- 117 | wealth <- data.table( 118 | eom=dates_full, 119 | wealth=search_params$w, 120 | mu_ld1 = pf_set$mu) 121 | # Prepare data 122 | system.time(data_list <- sim_prep_data( 123 | t = t, 124 | n = search_params$n, 125 | dates_full = dates_full, 126 | dolvol = 84*1e6, # Median dolvol for valid stocks by 2020 127 | rf = rf, 128 | tv_sds = rep(0.25, 2), 129 | tv_thetas = c(search_params$theta1, search_params$theta2), 130 | tv_rps = rep(0.05, 2), 131 | disp_x = 0.5, 132 | seed = seed, 133 | add_noise_feat = add_noise_feat, 134 | feat_standardize = T 135 | )) 136 | lambda_dates <- unique(data_list$data$eom) 137 | lambda_list <- lambda_dates |> map(function(d) { 138 | x <- data_list$data[eom==d, .(id, lambda)][order(id)] 139 | ids <- x$id 140 | x <- x$lambda 141 | names(x) <- ids 142 | return(x) 143 | }) |> setNames(as.character(lambda_dates)) 144 | pfs <- data.table() 145 | if (do_static) { 146 | # Static portfolio 147 | static <- static_implement( 148 | data_tc = data_list$data, 149 | cov_list = data_list$barra_cov, 150 | lambda_list = lambda_list, 151 | rf = risk_free, # Data 152 | wealth = wealth, 153 | mu = pf_set$mu, 154 | gamma_rel = pf_set$gamma_rel, # Investor 155 | dates_full = dates_full, 156 | dates_oos = dates_oos, 157 | dates_hp = dates_hp, 158 | hp_years = hp_years, # Dates 159 | k_vec = set$pf$hps$static$k, 160 | u_vec = set$pf$hps$static$u, 161 | g_vec = set$pf$hps$static$g, 162 | cov_type = set$pf$hps$cov_type, 163 | validation = NULL 164 | ) 165 | static_raw <- static$hps[eom_ret %in% static$pf$eom_ret & k==1 & g==0 & u==1, .(eom_ret=as.Date(eom_ret), inv, shorting, turnover, r, tc)][, type := "Static-ML"] 166 | pfs <- pfs |> rbind(static$pf, static_raw) 167 | } 168 | if (do_tpf) { 169 | # TPF 170 | tpf <- data_list$data |> tpf_implement(cov_list = data_list$barra_cov, wealth = wealth, dates = dates_oos, gam = pf_set$gamma_rel) 171 | pfs <- pfs |> rbind(tpf$pf) 172 | } 173 | if (do_mp) { 174 | # Multiperiod-ML 175 | m1 <- mp_implement( 176 | data_tc = data_list$data, 177 | cov_list = data_list$barra_cov, 178 | lambda_list = lambda_list, 179 | rf = risk_free, # Data 180 | wealth = wealth, 181 | mu = pf_set$mu, 182 | gamma_rel = pf_set$gamma_rel, # Investor 183 | dates_full = dates_full, 184 | dates_oos = dates_oos, 185 | dates_hp = dates_hp, 186 | hp_years = hp_years, # Dates 187 | k_vec = set$pf$hps$m1$k, 188 | u_vec = set$pf$hps$m1$u, 189 | g_vec = set$pf$hps$m1$g, 190 | cov_type = set$pf$hps$cov_type, 191 | validation = NULL, 192 | iter = 10, 193 | K = 12 194 | ) 195 | pfs <- pfs |> 
rbind(m1$pf) 196 | } 197 | if (do_pf) { 198 | m2 <- pfml_implement( 199 | data_tc = data_list$data, 200 | cov_list = data_list$barra_cov, 201 | lambda_list = lambda_list, 202 | features = data_list$features, 203 | risk_free = risk_free, # Data 204 | wealth = wealth, 205 | mu = pf_set$mu, 206 | gamma_rel = pf_set$gamma_rel, # Investor 207 | dates_full = dates_full[13:length(dates_full)], 208 | dates_oos = dates_oos, 209 | lb = pf_set$lb_hor, 210 | hp_years = hp_years, # Dates 211 | rff_feat = T, 212 | g_vec = set$pf_ml$g_vec, 213 | p_vec = set$pf_ml$p_vec, 214 | l_vec = set$pf_ml$l_vec, 215 | scale = set$pf_ml$scale, 216 | orig_feat = set$pf_ml$orig_feat, # Hyperparameters 217 | iter = 10, 218 | hps = NULL, 219 | balanced = T, # With 10 stocks, setting balanced=T increases time from 1.5 to 3mins. But with 100 stocks, it made almost no difference? Wierd 220 | seed = set$seed_no # Other 221 | ) 222 | pfs <- pfs |> rbind(m2$pf) 223 | } 224 | # Output 225 | pfs |> mutate(seed=seed, w=search_params$w, n=search_params$n, theta1=search_params$theta1, theta2=search_params$theta2) 226 | }, .progress = paste0("Progress run ", config_params$seed)) |> rbindlist() |> mutate(run=config_params$seed) 227 | tictoc::toc() 228 | if (T) { 229 | results |> fwrite(paste0("simulations/results/results", Sys.Date(), 230 | "_run", config_params$seed, 231 | "_nstocks", set$n_stocks, ".csv")) 232 | } -------------------------------------------------------------------------------- /slurm_build_portfolios.R: -------------------------------------------------------------------------------- 1 | # Run this code to fit models in parallel on a slurm HPC 2 | source("Main.R") 3 | 4 | # Submit a config file with slurm job? ----------- 5 | if (TRUE) { 6 | # Prepare for slurm to submit configuration files flexibly 7 | args <- commandArgs(trailingOnly = TRUE) 8 | config_file <- args[1] # Get the first argument, which is the config file name 9 | 10 | config_params <- read_config(config_file) 11 | # Time notes ---- 12 | # Running all three methods on quantiles splits of the stocks took roughly 20 hours 13 | # Runing just Static on all stocks took 3.5 hours, PFML took 65 hours, and Multiperiod-ML likely takes longer.... 14 | 15 | } else { 16 | config_params <- list( 17 | "size_screen"="perc_low50_high100_min50", #"top100" 18 | "wealth" = 1e10, 19 | "gamma_rel" = 10, 20 | "industry_cov" = T, 21 | "update_mp" = F, 22 | "update_base" = F, 23 | "update_fi_base" = F, 24 | "update_fi_ief" = F, 25 | "update_fi_ret" = F 26 | ) 27 | } 28 | 29 | settings$screens$size_screen <- config_params$size_screen 30 | settings$cov_set$industries <- config_params$industry_cov 31 | pf_set$wealth <- config_params$wealth 32 | pf_set$gamma_rel <- config_params$gamma_rel 33 | 34 | 35 | # Run settings -------------------- 36 | get_from_path_model <- "Data/Generated/Models/20240424-all" # From which directory should the data be loaded? 
(if update$x==F) 37 | run_sub <- F # T when testing code, F otherwise 38 | 39 | # Create folder to save output -------------------- 40 | folder_naming <- function(config_params) { 41 | paste0( 42 | "Data/Generated/Portfolios/", format(Sys.time(), "%Y%m%d-%H%M"), # Run timestamp: date plus hour-minute 43 | "_WEALTH", config_params$wealth, 44 | "_GAMMA", config_params$gamma_rel, 45 | "_SIZE", config_params$size_screen, 46 | "_IND", config_params$industry_cov 47 | ) 48 | } 49 | output_path <- config_params |> folder_naming() 50 | if (!dir.exists(output_path)) { 51 | # Create relevant directories 52 | dir.create(output_path) # Overall 53 | # Save Main.R and settings file to new directory 54 | file.copy("Main.R", paste0(output_path, "/Main.R")) 55 | file.copy("slurm_build_portfolios.R", paste0(output_path, "/slurm_build_portfolios.R")) 56 | settings |> saveRDS(paste0(output_path, "/settings.RDS")) 57 | pf_set |> saveRDS(paste0(output_path, "/pf_set.RDS")) 58 | } 59 | 60 | # Run code -------------------- 61 | tic("Total run time") 62 | 63 | source("1 - Prepare Data.R", echo = T) 64 | # source("2 - Fit Models.R", echo = T) # Loaded in prepare portfolio data 65 | source("3 - Estimate Covariance Matrix.R", echo = T) 66 | source("4 - Prepare Portfolio Data.R", echo = T) 67 | # Base case 68 | if (config_params$update_base) { 69 | tic("Base case:") 70 | source("5 - Base case.R", echo = T) 71 | toc() 72 | } 73 | # Feature importance - base case 74 | if (config_params$update_fi_base) { 75 | tic("Feature importance - base:") 76 | source("5 - Feature importance base.R", echo = T) 77 | toc() 78 | } 79 | # Feature importance - IEF 80 | if (config_params$update_fi_ief) { 81 | tic("Feature importance - IEF:") 82 | source("5 - Feature importance IEF.R", echo = T) 83 | toc() 84 | } 85 | # Feature importance - Expected return models 86 | if (config_params$update_fi_ret) { 87 | tic("Feature importance - returns:") 88 | source("5 - Feature importance ret.R", echo = T) 89 | toc() 90 | } 91 | 92 | toc() 93 | -------------------------------------------------------------------------------- /slurm_fit_models.R: -------------------------------------------------------------------------------- 1 | # Run this code to fit models in parallel on a slurm HPC 2 | # Note: when distributing 12 models across 4 instances, request 20 hours and 150GB memory on each instance 3 | source("Main.R") 4 | 5 | # Run settings -------------------- 6 | settings$screens$size_screen <- "all" 7 | settings$screens$nyse_stocks <- F # Also include non-NYSE stocks when fitting expected return models 8 | run_sub <- F # T when testing code, F otherwise 9 | 10 | # Range of Prediction Horizons ----- 11 | search_grid <- tibble( 12 | name = paste0("m", 1:12), 13 | horizon = as.list(1:12) 14 | ) 15 | # Read configuration 16 | if (TRUE) { 17 | # Prepare for slurm to submit configuration files flexibly 18 | args <- commandArgs(trailingOnly = TRUE) 19 | config_file <- args[1] # Get the first argument, which is the config file name 20 | 21 | config_params <- read_config(config_file) 22 | } else { 23 | config_params <- list( 24 | "horizon" = 1 25 | ) 26 | } 27 | search_grid <- search_grid |> filter(name %in% paste0("m", config_params$horizon)) 28 | # Create output folder if missing 29 | output_path <- paste0("Data/Generated/Models/", format(Sys.time(), "%Y%m%d"), "-", settings$screens$size_screen) 30 | if (!dir.exists(output_path)) { 31 | # Create relevant directories 32 | dir.create(output_path) # Overall 33 | # Save Main.R and settings file to new directory 34 | file.copy("Main.R",
paste0(output_path, "/Main.R")) 35 | file.copy("slurm_fit_models.R", paste0(output_path, "/slurm_fit_models.R")) 36 | settings |> saveRDS(paste0(output_path, "/settings.RDS")) 37 | } 38 | tictoc::tic("Total run time") 39 | source("1 - Prepare Data.R", echo = T) 40 | source("2 - Fit Models.R", echo = T) 41 | -------------------------------------------------------------------------------- /sqrtm_cpp.cpp: -------------------------------------------------------------------------------- 1 | #include <RcppArmadillo.h> 2 | // [[Rcpp::depends(RcppArmadillo)]] 3 | 4 | //' Calculate square root of a matrix from Armadillo 5 | //' @export 6 | // [[Rcpp::export]] 7 | arma::cx_mat sqrtm_cpp (arma::mat x) { 8 | arma::cx_mat x_sqrt; 9 | x_sqrt = arma::sqrtmat(x); 10 | return(x_sqrt); 11 | } --------------------------------------------------------------------------------
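Both C++ files are compiled from R with Rcpp::sourceCpp() (simulations.R does exactly that for sqrtm_cpp.cpp). The sketch below is illustrative rather than part of the repository: the inputs are made up, and it assumes the Rcpp and RcppArmadillo packages are installed.

# Illustrative usage of the two C++ helpers from R (made-up inputs)
Rcpp::sourceCpp("ewma.cpp")        # exports ewma_c()
Rcpp::sourceCpp("sqrtm_cpp.cpp")   # exports sqrtm_cpp(); requires RcppArmadillo
x <- rnorm(240, sd = 0.05)                       # hypothetical monthly return series
vol <- ewma_c(x, lambda = 0.94, start = 12)      # EWMA volatility; the first 'start' entries are NA (burn-in)
S <- matrix(c(4, 1, 1, 9), nrow = 2)             # hypothetical covariance matrix
S_half <- Re(sqrtm_cpp(S))                       # sqrtm_cpp() returns a complex matrix; the real part is the square root for a PSD input
max(abs(S_half %*% S_half - S))                  # close to zero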