├── .gitignore
├── 0 - General functions.R
├── 0 - Portfolio choice functions.R
├── 0 - Return prediction functions.R
├── 1 - Prepare Data.R
├── 2 - Fit Models.R
├── 3 - Estimate Covariance Matrix.R
├── 4 - Prepare Portfolio Data.R
├── 5 - Base case.R
├── 5 - Feature importance IEF.R
├── 5 - Feature importance base.R
├── 5 - Feature importance ret.R
├── 6 - Base analysis.R
├── 6 - Economic intuition.R
├── 6 - Feature importance.R
├── 6 - Implementable efficient frontier.R
├── 6 - Performance across size distribution.R
├── 6 - RF example.R
├── 6 - Short selling fees.R
├── 7 - Figures.R
├── 7 - Numbers.R
├── 7 - Tables.R
├── Joblists
│   ├── joblist_models.txt
│   ├── joblist_pfchoice_all.txt
│   ├── joblist_pfchoice_base.txt
│   ├── joblist_pfchoice_fi.txt
│   ├── joblist_pfchoice_ief.txt
│   ├── joblist_pfchoice_size.txt
│   └── joblist_simulations.txt
├── Main.R
├── README.md
├── config_files
│   ├── config_models1.txt
│   ├── config_models10.txt
│   ├── config_models11.txt
│   ├── config_models12.txt
│   ├── config_models2.txt
│   ├── config_models3.txt
│   ├── config_models4.txt
│   ├── config_models5.txt
│   ├── config_models6.txt
│   ├── config_models7.txt
│   ├── config_models8.txt
│   ├── config_models9.txt
│   ├── config_pfchoice_all.txt
│   ├── config_pfchoice_base.txt
│   ├── config_pfchoice_fi_base.txt
│   ├── config_pfchoice_fi_ief.txt
│   ├── config_pfchoice_fi_ret.txt
│   ├── config_pfchoice_ief_w1g1.txt
│   ├── config_pfchoice_ief_w1g2.txt
│   ├── config_pfchoice_ief_w1g3.txt
│   ├── config_pfchoice_ief_w1g4.txt
│   ├── config_pfchoice_ief_w1g5.txt
│   ├── config_pfchoice_ief_w2g1.txt
│   ├── config_pfchoice_ief_w2g2.txt
│   ├── config_pfchoice_ief_w2g3.txt
│   ├── config_pfchoice_ief_w2g4.txt
│   ├── config_pfchoice_ief_w2g5.txt
│   ├── config_pfchoice_ief_w3g1.txt
│   ├── config_pfchoice_ief_w3g2.txt
│   ├── config_pfchoice_ief_w3g3.txt
│   ├── config_pfchoice_ief_w3g4.txt
│   ├── config_pfchoice_ief_w3g5.txt
│   ├── config_pfchoice_ief_w4g1.txt
│   ├── config_pfchoice_ief_w4g2.txt
│   ├── config_pfchoice_ief_w4g3.txt
│   ├── config_pfchoice_ief_w4g4.txt
│   ├── config_pfchoice_ief_w4g5.txt
│   ├── config_pfchoice_size1.txt
│   ├── config_pfchoice_size2.txt
│   ├── config_pfchoice_size3.txt
│   ├── config_pfchoice_size4.txt
│   ├── config_pfchoice_size5.txt
│   ├── config_sim1.txt
│   ├── config_sim10.txt
│   ├── config_sim11.txt
│   ├── config_sim12.txt
│   ├── config_sim13.txt
│   ├── config_sim14.txt
│   ├── config_sim15.txt
│   ├── config_sim2.txt
│   ├── config_sim3.txt
│   ├── config_sim4.txt
│   ├── config_sim5.txt
│   ├── config_sim6.txt
│   ├── config_sim7.txt
│   ├── config_sim8.txt
│   └── config_sim9.txt
├── ewma.cpp
├── ml-and-the-implementable-efficient-frontier.Rproj
├── separate_analysis_run.R
├── simulations
│   ├── sim_functions.R
│   ├── sim_results.R
│   └── simulations.R
├── slurm_build_portfolios.R
├── slurm_fit_models.R
└── sqrtm_cpp.cpp

/.gitignore:
--------------------------------------------------------------------------------
 1 | .Rproj.user
 2 | .Rhistory
 3 | .RData
 4 | .Ruserdata
 5 | ief_for_slides.R
 6 | Data/
 7 | Figures/
 8 | slurm_output/
 9 | simulations/results/
10 | simulations/Depreceated code/
11 | scribbles/

--------------------------------------------------------------------------------
/0 - General functions.R:
--------------------------------------------------------------------------------
 1 | # Read configuration file ---------------
 2 | read_config <- function(file) {
 3 |   lines <- readLines(file)
 4 |   lines <- lines[!grepl("^#", lines)] # Remove lines that are comments
 5 |   config <- list()
 6 |   for (line in lines) {
 7 |     key_value <- strsplit(line, "=")[[1]]
 8 |     key <- trimws(key_value[1])
 9 |     value <- trimws(key_value[2])
10 |     config[[key]] <- eval(parse(text = value))
11 |   }
12 |   return(config)
13
| } 14 | 15 | # Create cov -------------- 16 | create_cov <- function(x, ids=NULL) { 17 | # Extract the relevant loadings and ivol 18 | if (is.null(ids)) { 19 | load <- x$fct_load 20 | ivol <- x$ivol_vec 21 | } else { 22 | load <- x$fct_load[as.character(ids),] 23 | ivol <- x$ivol_vec[as.character(ids)] 24 | } 25 | # Create the covariance matrix 26 | (load %*% x$fct_cov %*% t(load) + diag(ivol)) 27 | } 28 | 29 | # Create lambda --------------- 30 | create_lambda <- function(x, ids) { 31 | x[ids] |> diag() 32 | } 33 | 34 | # Compute expected risk --------------- 35 | expected_risk_fun <- function(ws, dates, cov_list) { 36 | # Make sure that ids are sorted identically for each method 37 | ws |> setorder(type, eom, id) 38 | types <- unique(ws$type) 39 | w_list <- ws |> split(by = "eom") 40 | # Compute variance for each porfolios 41 | dates |> map(.progress = T, function(d) { 42 | w_sub <- w_list[[as.character(d)]] 43 | ids <- w_sub$id |> unique() 44 | sigma <- cov_list[[as.character(d)]] |> create_cov(ids=as.character(ids)) 45 | types |> map(function(x) { 46 | w <- w_sub[type==x]$w 47 | pf_var <- t(w) %*% sigma %*% w 48 | data.table(type=x, pf_var=drop(pf_var)) 49 | }) |> rbindlist() |> mutate(eom=d) 50 | }) |> rbindlist() 51 | } 52 | 53 | # Long horizon returns ------------------ 54 | long_horizon_ret <- function(data, h, impute) { # Impute in c("zero", "mean", "median") 55 | dates <- data[!is.na(ret_exc), .(eom, "merge_date"=eom)] %>% unique() 56 | ids <- data[!is.na(ret_exc), .( 57 | start = min(eom), 58 | end = max(eom) 59 | ), by = id] 60 | full_ret <- ids[dates, on = .(start<=merge_date, end>=merge_date), allow.cartesian=T][, c("start", "end") := NULL] 61 | full_ret <- data[, .(id, eom, ret_exc)][full_ret, on = .(id, eom)] 62 | full_ret %>% setorder(id, eom) 63 | cols <- paste0("ret_ld", 1:h) 64 | for(l in 1:h) { 65 | full_ret[, paste0("ret_ld", l) := dplyr::lead(ret_exc, l), by = id] 66 | } 67 | full_ret[, ret_exc := NULL] 68 | # Remove rows with all observations missing 69 | all_missing <- (is.na(full_ret[, cols, with=F]) %>% rowSums() == h) 70 | print(paste0("All missing excludes ", round(mean(all_missing)*100, 2), "% of the observations")) 71 | full_ret <- full_ret[all_missing==F] # Is equivalent to assuming that investors new the security would be delisted. Think it's reasonable since we include deliting returns. 
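  # The loop above creates the leads ret_ld1..ret_ldh with dplyr::lead() within
  # each id, so the last h months of every stock get NA leads; the imputation
  # block below fills those. A minimal toy sketch of that mechanic, assuming
  # data.table and dplyr are attached (the ids, dates, and returns are made up):
  if (FALSE) {
    toy <- data.table(id = 1L,
                      eom = as.Date(c("2020-01-31", "2020-02-29", "2020-03-31")),
                      ret_exc = c(0.01, -0.02, 0.03))
    for (l in 1:2) {
      toy[, paste0("ret_ld", l) := dplyr::lead(ret_exc, l), by = id]
    }
    toy # ret_ld1/ret_ld2 are NA at the sample end; "zero"/"mean"/"median" fills them below
  }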
72 | # Impute missing return 73 | if (impute == "zero") { 74 | full_ret[, (cols) := lapply(.SD, function(x) if_else(is.na(x), 0, x)), .SDcols=cols] 75 | } 76 | if (impute == "mean") { 77 | full_ret[, (cols) := lapply(.SD, function(x) if_else(is.na(x), mean(x, na.rm=T), x)), .SDcols=cols, by = eom] 78 | } 79 | if (impute == "median") { 80 | full_ret[, (cols) := lapply(.SD, function(x) if_else(is.na(x), median(x, na.rm=T), x)), .SDcols=cols, by = eom] 81 | } 82 | return(full_ret) 83 | } 84 | 85 | # Portfolio Function ------------------ 86 | sigma_gam_adj <- function(sigma_gam, g, cov_type) { 87 | if (cov_type=="cov_mult") { 88 | return(sigma_gam*g) 89 | } 90 | if (cov_type == "cov_add") { 91 | return(sigma_gam+diag(diag(sigma_gam)*g)) 92 | } 93 | if (cov_type == "cor_shrink") { 94 | stopifnot(abs(g) <= 1) 95 | sd_vec <- diag(sqrt(diag(sigma_gam))) 96 | sd_vec_inv <- solve(sd_vec) 97 | cor_mat <- sd_vec_inv %*% sigma_gam %*% sd_vec_inv 98 | cor_mat_adj <- cor_mat * (1-g) + diag(nrow(cor_mat))*g 99 | cov_adj <- sd_vec %*% cor_mat_adj %*% sd_vec 100 | rownames(cov_adj) <- rownames(sigma_gam) 101 | colnames(cov_adj) <- colnames(sigma_gam) 102 | return(cov_adj) 103 | } 104 | } 105 | 106 | initial_weights_new <- function(data, w_type, udf_weights=NULL) { 107 | if (w_type == "vw") { 108 | pf_w <- data[, .(id, w_start = me/sum(me)), by = eom] 109 | } 110 | if (w_type == "ew") { 111 | pf_w <- data[, .(id, w_start = 1/.N), by = eom] 112 | } 113 | if (w_type == "rand_pos") { 114 | pf_w <- data[, .(id, eom)][, w_start := runif(.N)][, w_start := w_start / sum(w_start), by = eom] 115 | } 116 | if (w_type == "udf") { 117 | pf_w <- udf_weights[data, on = .(id, eom)] 118 | } 119 | pf_w[eom != min(eom), w_start := NA_real_] 120 | pf_w[, w := NA_real_] 121 | } 122 | 123 | # Portfolio function -------------- 124 | pf_ts_fun <- function(weights, data, wealth, gam) { 125 | comb <- data[, .(id, eom, ret_ld1, pred_ld1, lambda)][weights, on = .(id, eom)] 126 | comb <- wealth[, .(eom, wealth)][comb, on = "eom"] 127 | comb[, .( 128 | inv = sum(abs(w)), 129 | shorting = sum(abs(w[w<0])), 130 | turnover = sum(abs(w-w_start)), 131 | r = sum(w*ret_ld1), 132 | tc = unique(wealth)/2*sum(lambda*t(w-w_start)^2) 133 | ), by = eom][, eom_ret := eom+1+months(1)-1][, eom := NULL] 134 | } 135 | 136 | # Size-based screen ---------------- 137 | # Important: Function modifies in place 138 | size_screen_fun <- function(chars, type) { 139 | count <- 0 # Count ensures that at least one screen is applied 140 | # All 141 | if (type=="all") { 142 | print("No size screen") 143 | chars[valid_data==T, valid_size := T] 144 | count <- count + 1 145 | } 146 | # Top 1000 147 | if (str_detect(type, "top")) { 148 | top_n <- type |> str_remove("top") |> as.integer() 149 | chars[valid_data==T, me_rank := frank(-me), by = eom] 150 | chars[, valid_size := (me_rank <= top_n & !is.na(me_rank))] 151 | chars[, me_rank := NULL] 152 | count <- count + 1 153 | } 154 | # Bottom N 155 | if (str_detect(type, "bottom")) { 156 | bot_n <- type |> str_remove("bottom") |> as.integer() 157 | chars[valid_data==T, me_rank := frank(me), by = eom] 158 | chars[, valid_size := (me_rank <= bot_n & !is.na(me_rank))] 159 | chars[, me_rank := NULL] 160 | count <- count + 1 161 | } 162 | # Size group 163 | if (str_detect(type, "size_grp_")) { 164 | size_grp_screen <- type |> str_remove("size_grp_") 165 | chars[, valid_size := (size_grp==size_grp_screen & valid_data==T)] 166 | count <- count + 1 167 | } 168 | # Percentile range - min N! 
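  # The percentile screen below parses its bounds and minimum stock count out of
  # the type string itself; a string of the form "perc_low50_high100_min50"
  # (inferred from the IEF output folder names used later in the pipeline) keeps
  # stocks between the 50th and 100th size percentile with at least 50 names per
  # month. Minimal parsing sketch, assuming stringr is attached:
  if (FALSE) {
    type_example <- "perc_low50_high100_min50"
    as.integer(str_extract(type_example, "(?<=low)\\d+"))  # 50
    as.integer(str_extract(type_example, "(?<=high)\\d+")) # 100
    as.integer(str_extract(type_example, "(?<=min)\\d+"))  # 50
  }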
169 | if (str_detect(type, "perc")) { 170 | low_p <- str_extract(type, "(?<=low)\\d+") |> as.integer() 171 | high_p <- str_extract(type, "(?<=high)\\d+") |> as.integer() 172 | min_n <- str_extract(type, "(?<=min)\\d+") |> as.integer() 173 | print(paste0("Percentile-based screening: Range ", low_p, "% - ", high_p, "%, min_n: ", min_n, " stocks")) 174 | # Extract relevant range 175 | chars[valid_data==T, me_perc := ecdf(me)(me), by = eom] 176 | chars[, valid_size := (me_perc > low_p/100 & me_perc <= high_p/100 & !is.na(me_perc))] # ecdf never returns 0, which is why lower bound is a strict inequality 177 | chars[, n_tot := sum(valid_data, na.rm=T), by = eom] 178 | chars[, n_size := sum(valid_size, na.rm=T), by = eom] 179 | chars[, n_less := sum(valid_data==T & me_perc<=low_p/100, na.rm=T), by = eom] 180 | chars[, n_more := sum(valid_data==T & me_perc>high_p/100, na.rm=T), by = eom] 181 | # How many additional stocks from each side? 182 | chars[, n_miss := pmax(min_n-n_size, 0)] 183 | chars[, n_below := ceiling(pmin(n_miss/2, n_less))] 184 | chars[, n_above := ceiling(pmin(n_miss/2, n_more))] 185 | chars[n_below+n_above < n_miss & n_above>n_below, n_above := n_above + n_miss-n_above-n_below] 186 | chars[n_below+n_above < n_miss & n_above<n_below, n_below := n_below + n_miss-n_above-n_below] 187 | chars[, valid_size := (me_perc > (low_p/100-n_below/n_tot) & me_perc <= high_p/100+n_above/n_tot & !is.na(me_perc))] 188 | if (FALSE) { 189 | # See that the method works 190 | chars[, n_size2 := sum(valid_size, na.rm=T), by = eom] 191 | chars[eom==as.Date("1979-12-31"), .(id, eom, me, me_perc, n_less, n_more, n_tot, n_size, n_size2, n_miss, n_below, n_above, valid_size)][order(me)] 192 | 193 | unique(chars[, .(eom, n_size, n_size2, n_tot)]) |> pivot_longer(-eom) |> ggplot(aes(eom, value, colour=name)) + geom_point() + geom_hline(yintercept = min_n, linetype="dotted") 194 | chars[valid_size==T, median(me_perc), by = eom] |> ggplot(aes(eom, V1)) + geom_point() + ylim(c(0,1)) 195 | } 196 | chars[, c("me_perc", "n_tot", "n_size", "n_less", "n_more", "n_miss", "n_below", "n_above") := NULL] 197 | count <- count + 1 198 | } 199 | if (count != 1) { 200 | stop("Invalid size screen applied!!!!") 201 | } 202 | } 203 | 204 | # Addition/deletion rule ---------------- 205 | # Helper 206 | investment_universe <- function(add, delete) { 207 | n <- length(add) 208 | included <- logical(n) 209 | state <- F 210 | for (i in 2:n) { 211 | # Include if stock has been valid for 12 months 212 | if (state==F & add[i]==T & add[i-1]==F) { 213 | state <- T 214 | } 215 | # Exclude if stock has not been valid in past 12 months 216 | if (state==T & delete[i]==T) { 217 | state <- F 218 | } 219 | included[i] <- state 220 | } 221 | return(included) 222 | } 223 | # Function 224 | addition_deletion_fun <- function(chars, addition_n, deletion_n) { 225 | chars[, valid_temp := (valid_data==T & valid_size==T)] # Valid without the addition/deletion rule 226 | chars %>% setorder(id, eom) 227 | chars[, addition_count := roll::roll_sum(valid_temp, addition_n), by = id] 228 | chars[, deletion_count := roll::roll_sum(valid_temp, deletion_n), by = id] 229 | chars[, add := (addition_count==addition_n)] 230 | chars[is.na(add), add := F] 231 | chars[, delete := (deletion_count==0)] 232 | chars[is.na(delete), delete := F] 233 | chars[, n := .N, by = id] 234 | chars[n>1, valid := investment_universe(add = add, delete = delete), by = id] 235 | chars[n==1, valid := F] 236 | # Ensure that data is valid 237 | chars[valid_data==F, valid := F] 238 | # Check Turnover 239 | chars[, chg_raw := (valid_temp != lag(valid_temp)), by = id] 240 | chars[,
chg_adj := (valid != lag(valid)), by = id] 241 | to <- chars[, .( 242 | raw_n = sum(valid_temp, na.rm=T), 243 | adj_n = sum(valid, na.rm=T), 244 | raw = sum(chg_raw, na.rm=T)/sum(valid_temp, na.rm=T), 245 | adj = sum(chg_adj, na.rm=T)/sum(valid, na.rm=T) 246 | ), by = eom][!is.nan(raw) & !is.nan(adj) & adj != 0][, .( 247 | n_months = .N, 248 | n_raw = mean(raw_n), 249 | n_adj = mean(adj_n), 250 | turnover_raw = mean(raw), 251 | turnover_adjusted = mean(adj) 252 | )] 253 | cat(paste0("Turnover wo addition/deletion rule: ", round(to$turnover_raw*100, 2), "%", 254 | "\n", 255 | "Turnover w addition/deletion rule: ", round(to$turnover_adjusted*100, 2), "%", "\n")) 256 | chars[, c("n", "addition_count", "deletion_count", "add", "delete", "valid_temp", "valid_data", "valid_size", "chg_raw", "chg_adj") := NULL] 257 | } 258 | -------------------------------------------------------------------------------- /0 - Return prediction functions.R: -------------------------------------------------------------------------------- 1 | # Split data 2 | data_split <- function(data, type, val_end, val_years, train_start, train_lookback, retrain_lookback, test_inc, test_end) { 3 | train_end <- val_end-years(val_years) 4 | train_start <- max(train_start, train_end - years(train_lookback)) 5 | op <- list() 6 | op$val <- data[eom >= train_end & eom_pred_last <= val_end] 7 | op$train <- data[eom >= train_start & eom_pred_last <= train_end] 8 | op$train_full <- data[eom >= val_end-years(retrain_lookback) & eom_pred_last <= val_end] 9 | op$test <- data[eom >= val_end & eom < min(val_end+years(test_inc), test_end)] 10 | return(op) 11 | } 12 | 13 | # OLS -------------- 14 | ols_fit <- function(data, feat) { 15 | fit <- lm(paste0("ret_pred~",paste0(feat, collapse = "+")), data=data$train_full) 16 | pred <- data$test[, .(id, eom, eom_pred_last, pred=predict(fit, newdata = data$test))] 17 | list("fit"=fit, "pred"=pred) 18 | } 19 | 20 | # Random Fourier Features -------------------- 21 | rff <- function(X, p=NULL, g=NULL, W=NULL) { # P=number of features [MUST BE DIVISIBLE WITH 2!] 
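  # Each column of W is drawn from N(0, g*I), and every input row x is mapped to
  # [cos(xW), sin(xW)]; together with the p^(-1/2) scaling applied by the callers
  # below, this approximates a Gaussian kernel whose bandwidth is governed by g
  # (Rahimi & Recht, 2007). Passing an existing W reuses the same random basis,
  # which is how the validation/test data are transformed consistently with the
  # training data. Minimal sketch, assuming MASS and magrittr are attached (the
  # toy matrix is made up):
  if (FALSE) {
    X_toy <- matrix(rnorm(20), nrow = 10, ncol = 2)
    rf_train <- rff(X_toy, p = 4, g = 1)  # draws a new 2 x 2 W
    rf_test <- rff(X_toy, W = rf_train$W) # reuses the same W on other data
    dim(rf_train$X_cos)                   # 10 x 2 (p/2 cosine features)
  }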
22 | # Draw random weights 23 | if (is.null(W)) { 24 | k <- ncol(X) 25 | W <- MASS::mvrnorm(n=p/2, mu=rep(0, k), Sigma = g*diag(k)) %>% t() 26 | } 27 | X_new <- as.matrix(X) %*% W 28 | # Output 29 | list(W=W, X_cos=cos(X_new), X_sin=sin(X_new)) 30 | } 31 | 32 | rff_hp_search <- function(data, feat, p_vec, g_vec, l_vec, seed) { 33 | # Search over g 34 | val_errors <- g_vec %>% lapply(function(g) { 35 | set.seed(seed) # Ensures that draw is always the same irrespective of g 36 | print(paste0("g: ", formatC(g, digits=2, format="f"), " (", match(g, g_vec), " out of ", length(g_vec), ")")) 37 | # Create random features in training sets 38 | rff_train <- data$train[, feat, with=F] %>% rff(p = max(p_vec), g = g) 39 | rff_val <- data$val[, feat, with=F] %>% rff(W=rff_train$W) 40 | # Search over p 41 | err <- p_vec %>% lapply(function(p) { 42 | print(paste0("--> p: ", p, " (", match(p, p_vec), " out of ", length(p_vec), ")")) 43 | X_train <- p^(-0.5)*cbind(rff_train$X_cos[, 1:(p/2)], rff_train$X_sin[, 1:(p/2)]) 44 | X_val <- p^(-0.5)*cbind(rff_val$X_cos[, 1:(p/2)], rff_val$X_sin[, 1:(p/2)]) 45 | # Ridge fit 46 | fit <- glmnet(x = X_train, y = data$train$ret_pred, 47 | family = "gaussian", alpha = 0, lambda = l_vec, standardize = F) 48 | pred <- fit %>% predict(newx=X_val, type = "response", s = l_vec) 49 | 1:length(l_vec) %>% lapply(function(i) { 50 | data.table(lambda=l_vec[i], mse=mean((pred[, i]-data$val$ret_pred)^2)) 51 | }) %>% 52 | rbindlist() %>% 53 | mutate(p = p) 54 | }) %>% rbindlist() %>% mutate(g=g) 55 | list("err"=err, W=rff_train$W) 56 | }) 57 | names(val_errors) <- g_vec 58 | # Optimal hps 59 | val_mse <- val_errors %>% map("err") %>% rbindlist() 60 | opt_hps <- val_mse[mse==min(mse)] 61 | opt_w <- val_errors[[as.character(opt_hps$g)]]$W[, 1:(opt_hps$p/2)] 62 | print(val_mse %>% 63 | mutate( 64 | lambda=if_else(log10(lambda)< -5, 0, lambda), 65 | g = paste0("g=", formatC(g, digits = 2, format = "f")), 66 | p = factor(p) 67 | ) %>% 68 | ggplot(aes(log10(lambda), mse, colour=p)) + 69 | geom_point(alpha=0.25) + 70 | geom_line() + 71 | # annotate(geom="label", x=Inf, y=Inf, label=paste0("Optimal: p=", opt_hps$p, ", g=", opt_hps$g, ", log10(l)=", round(log10(opt_hps$lambda), 2)), hjust=1, vjust=1) + 72 | facet_wrap(~g) + 73 | labs(y="Mean squared error")) 74 | # Re-fit on train-val data 75 | X_train_full <- data$train_full[, feat, with=F] %>% rff(W=opt_w) 76 | X_train_full <- opt_hps$p^(-0.5)*cbind(X_train_full$X_cos, X_train_full$X_sin) 77 | final_fit <- glmnet(x = X_train_full, y = data$train_full$ret_pred, 78 | family = "gaussian", alpha = 0, lambda = opt_hps$lambda, standardize = F) 79 | # Predict on test data 80 | X_test <- data$test[, feat, with=F] %>% rff(W=opt_w) 81 | X_test <- opt_hps$p^(-0.5)*cbind(X_test$X_cos, X_test$X_sin) 82 | pred_op <- data$test[, .(id, eom, eom_pred_last, pred=drop(predict(final_fit, newx = X_test, s=opt_hps$lambda)))] 83 | # Output 84 | list("fit"=final_fit, "pred"=pred_op, "hp_search"=val_mse, "W"=opt_w, "opt_hps"=opt_hps) 85 | } 86 | 87 | # Ridge HP search 88 | ridge_hp_search <- function(data, feat, vol_scale, lambdas) { 89 | fit <- glmnet(x = data$train[, feat, with=F] %>% as.matrix(), y = data$train$ret_pred, 90 | family = "gaussian", alpha = 0, lambda = lambdas, standardize = T) 91 | pred <- fit %>% predict(newx=data$val[, feat, with=F] %>% as.matrix(), type = "response", s = lambdas) 92 | lambda_search <- 1:length(lambdas) %>% lapply(function(i) { 93 | data.table(lambda=lambdas[i], mse=mean((pred[, i]-data$val$ret_pred)^2)) 94 | }) %>% 95 | 
rbindlist() 96 | print(lambda_search[log(lambda)!=-100] %>% ggplot(aes(log(lambda), mse)) + geom_point()) 97 | lambda_opt <- lambda_search[mse==min(mse)]$lambda 98 | pred_val_op <- data$val[, .(id, eom, eom_pred_last, pred=pred[, match(lambda_opt, lambdas)])] 99 | # Re-fit to all training data 100 | final_fit <- glmnet(x = data$train_full[, feat, with=F] %>% as.matrix(), y = data$train_full$ret_pred, 101 | family = "gaussian", alpha = 0, lambda = lambda_opt, standardize = T) # Maybe a bad idea to re-standardize 102 | pred <- final_fit %>% predict(newx=data$test[, feat, with=F] %>% as.matrix(), s=lambda_opt) # final_fit %>% broom::tidy() %>% filter(term %in% c(weekly_feat, "rvol_21d")) %>% arrange(-abs(estimate)) %>% ggplot(aes(reorder(term, abs(estimate)), abs(estimate))) + geom_col() + coord_flip() 103 | feat_imp <- final_fit %>% broom::tidy() %>% filter(term!="(Intercept)") %>% mutate(imp = frank(-abs(estimate))) 104 | pred_op <- data$test[, .(id, eom, eom_pred_last, pred=drop(pred))] 105 | if (vol_scale) { 106 | pred_val_op <- pred_val_op %>% mutate(pred_vol=pred, pred=pred_vol*data$val$rvol_m) 107 | pred_op <- pred_op %>% mutate(pred_vol=pred, pred=pred_vol*data$test$rvol_m) 108 | } 109 | # Output 110 | return(list("fit"=final_fit, "hp_search"=lambda_search, "l_opt"=lambda_opt, "pred"=pred_op, "pred_val"=pred_val_op, "feat_imp"=feat_imp)) 111 | } 112 | 113 | # XGB ------------------ 114 | fit_xgb <- function(train, val, params, iter, es, cores, seed) { # train and val should be xgb.Dmatrix objects 115 | set.seed(seed) 116 | 117 | params_all <- list( 118 | objective = "reg:squarederror", 119 | base_score = 0, 120 | eval_metric = "rmse", 121 | booster = "gbtree", 122 | max_depth = params$tree_depth, 123 | eta = params$learn_rate, 124 | gamma = params$loss_reduction, 125 | subsample = params$sample_size, # Row subsampling 126 | colsample_bytree = params$mtry, # Column Subsampling 127 | min_child_weight = params$min_n, 128 | lambda = params$penalty 129 | ) 130 | 131 | # Fit Model 132 | model <- xgb.train( 133 | data = train, 134 | params = params_all, 135 | watchlist = list(train=train, val=val), 136 | nrounds = iter, 137 | early_stopping_rounds = es, 138 | verbose = 0, 139 | maximize = F, 140 | nthread = cores 141 | ) 142 | return(model) 143 | } 144 | 145 | 146 | # XGB HP search 147 | xgb_hp_search <- function(data, feat, vol_scale, hp_grid, iter, es, cores, seed) { 148 | # Find hyperparameters 149 | train <- xgb.DMatrix(data=as.matrix(data$train[, feat, with=F]), label=data$train$ret_pred) 150 | val <- xgb.DMatrix(data=as.matrix(data$val[, feat, with=F]), label=data$val$ret_pred) 151 | xgb_search <- 1:nrow(hp_grid) %>% lapply(function(j) { 152 | print(paste0("HP: ", j)) 153 | xgb_fit <- fit_xgb(train = train, val = val, params = hp_grid[j, ], iter = iter, 154 | es = es, cores = cores, seed = seed) 155 | data.table(hp_no=j, val_rmse=xgb_fit$best_score, best_iter=xgb_fit$best_iteration) 156 | }) %>% rbindlist() 157 | print(xgb_search %>% ggplot(aes(hp_no, val_rmse)) + geom_point()) 158 | best_hp_no <- xgb_search[val_rmse==min(val_rmse)][1]$hp_no # Had one case with identical rmse(!), arbitrarily just choose the first 159 | best_hp <- hp_grid[best_hp_no, ] 160 | best_iter <- xgb_search[hp_no==best_hp_no]$best_iter 161 | print("Best HP:") 162 | print(best_hp) 163 | print(paste("With", best_iter, "iterations")) 164 | # Re-fit to all training data 165 | train_all <- xgb.DMatrix(data=as.matrix(data$train_full[, feat, with=F]), label=data$train_full$ret_pred) 166 | final_fit <- fit_xgb(train = 
train_all, val = train_all, params = best_hp, iter = best_iter, 167 | es = NULL, cores = cores, seed = seed) 168 | # Feature importance 169 | set.seed(seed) 170 | shap_contrib <- predict(object=final_fit, newdata=as.matrix(sample_n(data$train_full[, feat, with=F], 10000)),predcontrib=T) 171 | global_imp <- abs(shap_contrib) %>% colMeans() %>% as_tibble(rownames = "char") %>% 172 | filter(char != "BIAS") %>% 173 | mutate( 174 | rank = frank(-abs(value)) 175 | ) 176 | print(global_imp %>% 177 | filter(rank <= 20) %>% 178 | ggplot(aes(reorder(char, abs(value)), abs(value))) + coord_flip() + geom_col() + labs(y = "Global feature Importance") + theme(axis.title.y = element_blank())) 179 | 180 | # Predictions 181 | pred <- final_fit %>% predict(newdata=data$test[, feat, with=F] %>% as.matrix()) 182 | if (vol_scale) { 183 | pred_op <- data$test[, .(id, eom, pred_vol=drop(pred), pred=drop(pred)*rvol_m)] 184 | } else { 185 | pred_op <- data$test[, .(id, eom, pred=drop(pred))] 186 | } 187 | # Output 188 | list("fit"=final_fit, "best_hp"=best_hp, "best_iter"=best_iter, "hp_search"=xgb_search, "pred"=pred_op, "feat_imp"=global_imp) 189 | } -------------------------------------------------------------------------------- /1 - Prepare Data.R: -------------------------------------------------------------------------------- 1 | # Risk-free rate -------------------- 2 | risk_free <- fread("Data/ff3_m.csv", select = c("yyyymm", "RF")) %>% mutate(rf=RF/100, eom = paste0(yyyymm, "01") %>% as.Date("%Y%m%d") %>% ceiling_date(unit="month")-1) %>% select(eom, rf) 3 | 4 | # Market ----------------------- 5 | market <- fread("Data/market_returns.csv", colClasses = c("eom"="character")) 6 | market <- market[excntry == "USA", .(eom_ret = as.Date(eom, format="%Y%m%d"), mkt_vw_exc)] 7 | 8 | # Wealth: Assumed portfolio growth -------------------- 9 | wealth_func <- function(wealth_end, end, market, risk_free) { 10 | wealth <- risk_free[, .("eom_ret"=eom, rf)][market, on = .(eom_ret)][, tret := mkt_vw_exc+rf] 11 | wealth <- wealth[eom_ret <= end] 12 | wealth <- wealth[order(-eom_ret)][, wealth := cumprod(1-tret)*wealth_end] 13 | wealth[, .(eom = floor_date(eom_ret, unit = "month")-1, wealth, mu_ld1 = tret)] %>% 14 | rbind(data.table(eom=end, wealth = wealth_end, mu_ld1 = NA_real_)) %>% 15 | arrange(eom) 16 | } 17 | wealth <- wealth_func(wealth_end = pf_set$wealth, end = settings$split$test_end, market = market, risk_free = risk_free) 18 | 19 | if (FALSE) { 20 | wealth %>% ggplot(aes(eom, wealth)) + geom_point() + scale_y_log10() 21 | chars[!is.na(dolvol), .(id, eom, dolvol, me)][, .(vw = sum(dolvol*me)/sum(me), ew = mean(dolvol), median = median(dolvol)), by = eom] %>% ggplot(aes(eom, vw)) + geom_point() + scale_y_log10() 22 | } 23 | 24 | # Cluster Labels (for covariance estimation) ------------- 25 | cluster_labels <- fread("Data/Cluster Labels.csv") 26 | cluster_labels[, cluster := cluster %>% str_to_lower() %>% str_replace_all("\\s|-", "_")] 27 | factor_signs <- readxl::read_xlsx("Data/Factor Details.xlsx") %>% 28 | select("characteristic"=abr_jkp, direction) %>% 29 | filter(!is.na(characteristic)) %>% 30 | mutate(direction=direction %>% as.numeric) %>% 31 | setDT() 32 | cluster_labels <- factor_signs[cluster_labels, on = "characteristic"] 33 | cluster_labels <- cluster_labels %>% rbind(data.table(characteristic="rvol_252d", cluster="low_risk", direction=-1)) # Assign rvol_252d to Low Risk 34 | 35 | # Prepare matrix of future monthly returns --------------- 36 | monthly <- fread("Data/world_ret_monthly.csv", select = 
c("excntry", "id", "eom", "ret_exc"), colClasses = c("eom"="character")) 37 | monthly <- monthly[excntry == "USA" & id<=99999] # Only CRSP observations 38 | monthly[, eom := eom %>% fast_strptime(format = "%Y%m%d") %>% as.Date()] 39 | data_ret <- monthly %>% long_horizon_ret(h = settings$pf$hps$m1$K, impute = "zero") 40 | data_ret_ld1 <- data_ret[, .(id, eom, eom_ret = eom+1+months(1)-1, ret_ld1)] 41 | data_ret_ld1 <- risk_free[data_ret_ld1, on = "eom"] 42 | data_ret_ld1[, tr_ld1 := ret_ld1 + rf] # Total return 43 | data_ret_ld1[, rf := NULL] 44 | # Add total return at t-1 45 | data_ret_ld1 <- data_ret_ld1[, .(id, eom=eom+1+months(1)-1, "tr_ld0" = tr_ld1)][data_ret_ld1, on = .(id, eom)] 46 | rm(monthly) 47 | 48 | # Prepare data ----------------------------------- 49 | chars <- fread(paste0("Data/usa.csv"), 50 | select = unique(c("id", "eom", "sic", "ff49", "size_grp", "me", "crsp_exchcd", "rvol_252d", "dolvol_126d", features)), 51 | colClasses = c("eom" = "character", "sic"="character")) 52 | chars <- chars[id <= 99999] # Only CRSP observations 53 | chars[, eom := eom %>% lubridate::fast_strptime(format = "%Y%m%d") %>% as.Date()] 54 | # Add useful columns ------ 55 | chars[, dolvol := dolvol_126d] 56 | chars[, lambda := 2/dolvol*settings$pi] 57 | chars[, rvol_m := rvol_252d * sqrt(21)] 58 | # Add total return data 59 | chars <- data_ret_ld1[chars, on=.(id, eom)] 60 | # Add wealth change 61 | chars <- wealth[, .(eom=eom+1+months(1)-1, "mu_ld0" = mu_ld1)][chars, on = .(eom)] 62 | # Screens ---- 63 | # Exchange code screen 64 | if (settings$screens$nyse_stocks) { 65 | print(paste0(" NYSE stock screen excludes ", round(mean(chars$crsp_exchcd != 1) * 100, 2), "% of the observations")) 66 | chars <- chars[crsp_exchcd==1] 67 | } 68 | # Date screen 69 | print(paste0(" Date screen excludes ", round(mean(chars$eom < settings$screens$start | chars$eom > settings$screens$end) * 100, 2), "% of the observations")) 70 | chars <- chars[eom >= settings$screens$start & eom <= settings$screens$end] 71 | # Monitor screen impact 72 | n_start <- nrow(chars) 73 | me_start <- sum(chars$me, na.rm = T) 74 | # Require me 75 | print(paste0(" Non-missing me excludes ", round(mean(is.na(chars$me)) * 100, 2), "% of the observations")) 76 | chars <- chars[!is.na(me)] 77 | # Require non-missing return t and t+1 78 | print(paste0(" Valid return req excludes ", round(chars[, mean(is.na(tr_ld1) | is.na(tr_ld0))] * 100, 2), "% of the observations")) 79 | chars <- chars[!is.na(tr_ld0) & !is.na(tr_ld1)] 80 | # Require dolvol 81 | print(paste0(" Non-missing/non-zero dolvol excludes ", round(mean(is.na(chars$dolvol) | chars$dolvol==0) * 100, 2), "% of the observations")) 82 | chars <- chars[!is.na(dolvol) & dolvol > 0] 83 | # Require stock to have SIC code (for covariance estimation with industry) 84 | print(paste0(" Valid SIC code excludes ", round(mean(chars$sic=="") * 100, 2), "% of the observations")) 85 | chars <- chars[!is.na(sic)] 86 | 87 | # Feature screens 88 | feat_available <- chars %>% select(all_of(features)) %>% apply(1, function(x) sum(!is.na(x))) 89 | min_feat <- floor(length(features)*settings$screens$feat_pct) 90 | print(paste0(" At least ", settings$screens$feat_pct*100, "% of feature excludes ", round(mean(feat_available < min_feat)*100, 2), "% of the observations")) 91 | chars <- chars[feat_available >= min_feat] 92 | # Summary 93 | print(paste0(" In total, the final dataset has ", round( (nrow(chars) / n_start)*100, 2), "% of the observations and ", round((sum(chars$me) / me_start)*100, 2), "% of the market 
cap in the post ", settings$screens$start, " data")) 94 | # Screen out if running subset -------------------------------- 95 | if (run_sub) { 96 | set.seed(settings$seed) 97 | chars <- chars[id %in% sample(unique(chars$id), 2500, replace = F)] 98 | } 99 | # Feature standardization ----------------------------- 100 | if (settings$feat_prank) { 101 | chars[, (features) := lapply(.SD, as.double), .SDcols = features] # Convert feature columns to double to avoid loosing precision 102 | for(f in features) { 103 | if (match(f, features) %% 10 == 0) print(paste0("Feature ", match(f, features), " out of ", length(features))) 104 | chars[, zero := (get(f) == 0)] 105 | chars[!is.na(get(f)), (f) := ecdf(get(f))(get(f)), by = eom] # Didn't have by statement before!! 106 | chars[zero == T, (f) := 0][, zero := NULL] # Set exact zeros to 0 (ecdf always returns >0) 107 | } 108 | } 109 | # Feature Imputation ---------------------------------- 110 | if (settings$feat_impute) { 111 | if (settings$feat_prank) { 112 | chars[, (features) := lapply(.SD, function(x) if_else(is.na(x), 0.5, x)), .SDcols=features] 113 | } else { 114 | chars[, (features) := lapply(.SD, function(x) if_else(is.na(x), median(x, na.rm=T), x)), .SDcols=features, by=eom] 115 | } 116 | } 117 | # Industry classification ----------------------------- 118 | chars[, ff12 := case_when( 119 | sic %in% c(100:999, 2000:2399, 2700:2749, 2770:2799, 3100:3199, 3940:3989) ~ "NoDur", 120 | sic %in% c(2500:2519, 3630:3659, 3710:3711, 3714, 3716, 3750:3751, 3792, 3900:3939, 3990:3999) ~ "Durbl", 121 | sic %in% c(2520:2589, 2600:2699, 2750:2769, 3000:3099, 3200:3569, 3580:3629, 122 | 3700:3709, 3712:3713, 3715, 3717:3749, 3752:3791, 3793:3799, 3830:3839, 3860:3899) ~ "Manuf", 123 | sic %in% c(1200:1399, 2900:2999) ~ "Enrgy", 124 | sic %in% c(2800:2829, 2840:2899) ~ "Chems", 125 | sic %in% c(3570:3579, 3660:3692, 3694:3699, 3810:3829, 7370:7379) ~ "BusEq", 126 | sic %in% c(4800:4899) ~ "Telcm", 127 | sic %in% c(4900:4949) ~ "Utils", 128 | sic %in% c(5000:5999, 7200:7299, 7600:7699) ~ "Shops", 129 | sic %in% c(2830:2839, 3693, 3840:3859, 8000:8099) ~ "Hlth", 130 | sic %in% c(6000:6999) ~ "Money", 131 | TRUE ~ "Other" 132 | )] 133 | # Check which observations are valid ----------------- 134 | chars[, valid_data := T] 135 | # Check that stock is eligible for portfolio-ml 136 | chars %>% setorder(id, eom) 137 | lb <- pf_set$lb_hor+1 # Plus 1 to get the last signal of last periods portfolio for portfolio-ML 138 | chars[, eom_lag := shift(eom, n = lb, type="lag"), by = id] 139 | chars[, month_diff := interval(eom_lag, eom) %/% months(1)] 140 | print(paste0(" Valid lookback observation screen excludes ", round(chars[valid_data==T, mean(month_diff != lb | is.na(month_diff))] * 100, 2), "% of the observations")) 141 | chars[, valid_data := (valid_data==T & month_diff==lb & !is.na(month_diff))] 142 | chars[, c("eom_lag", "month_diff") := NULL] 143 | rm(lb) 144 | 145 | # Size based screen -------------------------------------- 146 | chars |> size_screen_fun(type=settings$screens$size_screen) 147 | 148 | # Addition/deletion rule ---------------------------------- 149 | chars |> addition_deletion_fun(addition_n=settings$addition_n, deletion_n=settings$deletion_n) 150 | 151 | # Show investable universe -------------------------------- 152 | chars[valid==T, .N, by = eom] %>% ggplot(aes(eom, N)) + geom_point() + labs(y = "Valid stocks") + geom_hline(yintercept = 0) 153 | 154 | # Valid summary 155 | print(paste0(" The valid_data subset has ", round(chars[, 
mean(valid==T)]*100, 2), "% of the observations and ", round((sum(chars[valid==T]$me) / sum(chars$me))*100, 2), "% of the market cap")) 156 | 157 | # Daily returns ---------------------------------- 158 | daily <- fread("Data/usa_dsf.csv", colClasses = c("date"="character"), select = c("id", "date", "ret_exc")) 159 | daily <- daily[!is.na(ret_exc) & id<=99999 & id %in% unique(chars[valid==T]$id)]; gc() # Only CRSP observations 160 | daily[, date := date %>% fast_strptime(format="%Y%m%d") %>% as.Date()]; gc() 161 | daily[, eom := date %>% ceiling_date(unit = "month")-1] -------------------------------------------------------------------------------- /2 - Fit Models.R: -------------------------------------------------------------------------------- 1 | start_time <- proc.time() 2 | 3 | models <- 1:nrow(search_grid) %>% lapply(function(i) { 4 | # Prepare y variable 5 | h <- search_grid[i, ]$horizon %>% unlist() 6 | pred_y <- data_ret[, paste0("ret_ld", h), with=F] %>% rowMeans() 7 | pred_y <- data_ret[, .(id, eom, eom_pred_last=eom+1+months(max(h))-1, ret_pred=pred_y)] 8 | data_pred <- pred_y[chars[valid==T], on = .(id, eom)] 9 | 10 | print(paste0("horizons: ", list(h))) 11 | 12 | # Fit model 13 | if (settings$split$model_update_freq == "once") { 14 | val_ends <- settings$split$train_end 15 | test_inc <- 1000 16 | } 17 | if (settings$split$model_update_freq == "yearly") { 18 | val_ends <- seq.Date(from = settings$split$train_end, to = settings$split$test_end, by = "1 year") 19 | test_inc <- 1 20 | } 21 | if (settings$split$model_update_freq == "decade") { 22 | val_ends <- seq.Date(from = settings$split$train_end, to = settings$split$test_end, by = "10 years") 23 | test_inc <- 10 24 | } 25 | system.time({op <- val_ends %>% sapply(simplify = F, USE.NAMES = T, function(val_end) { 26 | print(val_end) 27 | train_test_val <- data_pred %>% data_split(type = settings$split$model_update_freq, val_end = val_end, val_years = settings$split$val_years, train_start = settings$screens$start, 28 | train_lookback = settings$split$train_lookback, retrain_lookback = settings$split$retrain_lookback, 29 | test_inc = test_inc, test_end = settings$split$test_end) 30 | # Fit models 31 | print(system.time(model_op <- train_test_val %>% rff_hp_search(feat = features, p_vec=settings$rff$p_vec, g_vec=settings$rff$g_vec, l_vec=settings$rff$l, seed=settings$seed_no))) 32 | 33 | return(model_op) 34 | })}) 35 | op %>% saveRDS(paste0(output_path, "/model_", h,".RDS")) 36 | # op %>% saveRDS(paste0(output_path, "/models/model_", settings$split$model_update_freq, "_", min(h), "_", max(h), ".RDS")) 37 | return(op) 38 | }) 39 | # Save output ------ 40 | # names(models) <- search_grid$name 41 | 42 | # models %>% saveRDS(paste0(output_path, "/models/model_full.RDS")) 43 | 44 | proc.time()-start_time # 15 hours with all stocks, xgb, and yearly updates, 45 | 46 | -------------------------------------------------------------------------------- /3 - Estimate Covariance Matrix.R: -------------------------------------------------------------------------------- 1 | # Prepare cluster characteristics --------------- 2 | cluster_data_m <- chars[valid==T][, c("id", "eom", "size_grp", "ff12", features), with=F] 3 | clusters <- unique(cluster_labels$cluster) 4 | cluster_ranks <- clusters %>% lapply(function(cl) { 5 | chars_sub <- cluster_labels[cluster==cl & characteristic %in% features] 6 | # print(paste0(cl, ", n: ", nrow(chars_sub))) 7 | data_sub <- cluster_data_m[, chars_sub$characteristic, with=F] 8 | for (c in chars_sub$characteristic) { 
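      # Characteristics with direction == -1 (signs come from Factor Details.xlsx)
      # are flipped to 1 - rank below, so that all characteristics within a cluster
      # point the same way; the cluster score is then the row mean of these [0, 1]
      # ranks (the features were ecdf-ranked in "1 - Prepare Data.R").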
9 | dir <- chars_sub[characteristic == c, direction] 10 | if (dir == -1) { 11 | data_sub[, (c) := 1-get(c)] 12 | } 13 | } 14 | data.table(x=data_sub %>% rowMeans()) %>% setnames(old = "x", new = cl) 15 | }) %>% bind_cols() 16 | cluster_data_m <- cluster_data_m[, .(id, eom, "eom_ret"=ceiling_date(eom+1, unit = "month")-1, size_grp, ff12)] %>% cbind(cluster_ranks) 17 | # Add Industry/market dummies 18 | if (settings$cov_set$industries) { 19 | # Create industry dummies 20 | industries <- sort(unique(cluster_data_m$ff12)) 21 | for (ind in industries) { 22 | cluster_data_m[, (ind) := as.integer(ff12==ind)] 23 | } 24 | ind_factors <- industries 25 | } else { 26 | # Add market factor 27 | cluster_data_m[, mkt := 1] 28 | ind_factors <- "mkt" 29 | } 30 | # Standardize factors 31 | cluster_data_m[, (clusters) := lapply(.SD, function(x) (x-mean(x))/sd(x)), by=eom, .SDcols=clusters] 32 | # Add daily return data 33 | cluster_data_d <- cluster_data_m[daily[date>= min(cluster_data_m$eom)][, .(id, date, ret_exc, "eom_ret"=eom)], on = .(id, eom_ret)] 34 | # Omit observations without data 35 | cluster_data_d <- cluster_data_d %>% na.omit() 36 | 37 | # Create factor returns -------------------- 38 | fct_ret_est <- cluster_data_d %>% 39 | group_by(date) %>% 40 | nest() %>% 41 | mutate( 42 | fit = data %>% map(~lm(paste0("ret_exc~-1+", paste0(c(ind_factors, clusters), collapse = "+")), data=.x)), 43 | # fit = data %>% map(~lm(paste0("ret_exc~-1+mkt+", paste0(clusters, collapse = "+")), data=.x)), 44 | res = fit %>% map(residuals), 45 | tidied = fit %>% map(broom::tidy) 46 | ) 47 | fct_ret <- fct_ret_est %>% 48 | unnest(tidied) %>% 49 | select(date, term, estimate) %>% 50 | pivot_wider(names_from = term, values_from = estimate) %>% 51 | arrange(date) %>% 52 | setDT() 53 | 54 | # Factor Risk ----------------------------- 55 | w_cor <- (0.5^(1/settings$cov_set$hl_cor))^(settings$cov_set$obs:1) 56 | w_var <- (0.5^(1/settings$cov_set$hl_var))^(settings$cov_set$obs:1) 57 | 58 | fct_dates <- sort(fct_ret$date) 59 | calc_dates <- sort(unique(cluster_data_m[eom>=floor_date(fct_dates[settings$cov_set$obs], unit="m")-1]$eom)) # to ensure sufficient data for cov calculation 60 | factor_cov_est <- as.character(calc_dates) %>% map(.progress = T, function(d){ 61 | first_obs <- min(tail(fct_dates[fct_dates <= as.Date(d)], settings$cov_set$obs)) 62 | cov_data <- fct_ret[date >= first_obs & date <= as.Date(d)] 63 | t <- nrow(cov_data) 64 | if (t < settings$cov_set$obs-30) warning("INSUFFICIENT NUMBER OF OBSERVATIONS!!") # Only an issue with the first calc_date 65 | cov_est <- cov_data %>% select(-date) %>% cov.wt(wt = tail(w_cor, t), cor=T, center=T, method = "unbiased") 66 | cor_est <- cov_est$cor 67 | var_est <- cov_data %>% select(-date) %>% cov.wt(wt = tail(w_var, t), cor=F, center=T, method = "unbiased") # inefficient solution but super fast with few factors 68 | sd_diag <- diag(sqrt(diag(var_est$cov))) 69 | # Prepare cov 70 | cov_est <- sd_diag %*% cor_est %*% sd_diag 71 | rownames(cov_est) <- colnames(cov_est) <- colnames(cor_est) 72 | # Output 73 | return(cov_est) 74 | }) 75 | names(factor_cov_est) <- as.character(calc_dates) 76 | 77 | # Specific Risk --------------------------- 78 | spec_risk <- fct_ret_est %>% 79 | mutate(id = data %>% map(~.x$id)) %>% 80 | select(id, date, res) %>% 81 | unnest(c(id, res)) %>% 82 | arrange(id, date) %>% 83 | setDT() 84 | # EWMA variance 85 | spec_risk[, res_vol := ewma_c(res, lambda = 0.5^(1/settings$cov_set$hl_stock_var), start = settings$cov_set$initial_var_obs), by = id] # 
Lambda=exp(log(0.5)/half_life) 86 | # Require that the observation at least 200 non-missing observations out of the last 252 trading days 87 | td_range <- data.table(date=fct_dates, td_252d = lag(fct_dates, 252)) 88 | spec_risk <- td_range[spec_risk, on = "date"] 89 | spec_risk[, date_200d := lag(date, 200), by = id] 90 | spec_risk <- spec_risk[date_200d>=td_252d & !is.na(res_vol)][, .(id, date, res_vol)] 91 | # Extract specific risk by month end 92 | spec_risk[, eom_ret := date %>% ceiling_date(unit="month")-1] 93 | spec_risk[, max_date := max(date), by = .(id, eom_ret)] 94 | spec_risk_m <- spec_risk[date==max_date, .(id, "eom"=eom_ret, res_vol)] 95 | 96 | # Stock covariance matrix -------------------------- 97 | barra_cov <- as.character(calc_dates) %>% map(.progress = T, function(d){ 98 | char_data <- cluster_data_m[eom == as.Date(d)] 99 | # Add specific risk 100 | char_data <- spec_risk_m[char_data, on = .(id, eom)] 101 | # char_data <- char_data[!is.na(res_vol)] # Consider if we should do some imputation e.g. median res_vol? 102 | char_data[, med_res_vol := median(res_vol, na.rm=T), by = .(size_grp, eom)] 103 | if (any(is.na(char_data$med_res_vol))) { 104 | char_data[, med_res_vol_all := median(res_vol, na.rm=T), by = .(eom)] # Crazy edge case: in April 1996, there's only 1 micro stock and it has na for resvol.. 105 | char_data[is.na(med_res_vol), med_res_vol := med_res_vol_all] 106 | } 107 | char_data[is.na(res_vol), res_vol := med_res_vol] 108 | fct_cov <- factor_cov_est[[d]]*21 # Annualize 109 | char_data %>% setorder(id) # Important to align ids across different datasets 110 | X <- char_data[, colnames(fct_cov), with=F] %>% as.matrix() # Using colnames(fct_cov) is important so char and factor is aligned across x and fct_cov 111 | rownames(X) <- char_data$id %>% as.character() 112 | # Stock covariance matrix 113 | # cov_est <- X %*% fct_cov %*% t(X) + diag(char_data$res_vol^2) 114 | # rownames(cov_est) <- colnames(cov_est) <- as.character(char_data$id) 115 | # return(cov_est) 116 | ivol_vec <- char_data$res_vol^2*21 # Annualize. 
Also, would more accurately be called "ivar_vec" 117 | names(ivol_vec) <- char_data$id %>% as.character() 118 | list("fct_load"=X, "fct_cov"=fct_cov, "ivol_vec"=ivol_vec) 119 | }) 120 | names(barra_cov) <- as.character(calc_dates) 121 | 122 | # Sanity check 123 | if (FALSE) { 124 | # Check implied volatility of market factor over time 125 | check <- names(barra_cov) |> map(function(d) { 126 | data.table( 127 | eom = as.Date(d), 128 | # mkt_vol = barra_cov[[d]]$fct_cov["Hlth", "Hlth"]^0.5, 129 | mkt_vol = barra_cov[[d]]$fct_cov["mkt", "mkt"]^0.5, 130 | ivol = mean(barra_cov[[d]]$ivol_vec^0.5) 131 | ) 132 | }) |> rbindlist() 133 | 134 | check |> 135 | pivot_longer(-eom) |> 136 | ggplot(aes(eom, value*sqrt(12), color=name)) + 137 | geom_line() 138 | 139 | # Why we impute 140 | test <- calc_dates %>% sapply(simplify = F, USE.NAMES = T, function(d){ 141 | char_data <- cluster_data_m[eom == d] 142 | # Add specific risk 143 | char_data <- spec_risk_m[char_data, on = .(id, eom)] 144 | n_miss <- char_data[is.na(res_vol), .N] 145 | data.table(d=d, n_miss=n_miss) 146 | }) %>% rbindlist() 147 | test %>% ggplot(aes(d, n_miss)) + geom_point() 148 | # Sanity check 149 | pred_sd_avg <- names(barra_cov) %>% lapply(function(d) { 150 | pred_var <- diag(barra_cov[[d]]) 151 | data.table(eom = as.Date(d), n = length(pred_var), sd_avg = sqrt(mean(pred_var)*252)) 152 | }) %>% rbindlist() 153 | pred_sd_avg %>% ggplot(aes(eom, n)) + geom_point(size=1) + theme(axis.title.x = element_blank()) 154 | pred_sd_avg %>% ggplot(aes(eom, sd_avg)) + geom_point(size=1) + labs(y = "Average Predicted Volatility (Annualized)", title = "Covariance Sanity Check") + theme(axis.title.x = element_blank()) 155 | 156 | 157 | (output$cov_check <- pred_sd_avg %>% 158 | ggplot(aes(eom, sd_avg)) + 159 | geom_point(size=1) + 160 | labs(y = "Average Predicted Volatility (Annualized)") + 161 | theme(axis.title.x = element_blank())) 162 | 163 | me <- chars[!is.na(me), .(id, eom, me)] 164 | me %>% setorder(eom) 165 | me <- me %>% split(by="eom") 166 | mkt_vol <- names(barra_cov) %>% lapply(function(d) { 167 | sigma <- barra_cov[[d]] 168 | me_sub <- me[[d]] 169 | me_sub <- me_sub[id %in% colnames(sigma)] 170 | me_sub[, w := me/sum(me)] 171 | sigma <- sigma[as.character(me_sub$id), as.character(me_sub$id)] 172 | data.table(eom=as.Date(d), mkt_vol = sqrt(drop(t(me_sub$w) %*% sigma %*% me_sub$w))) 173 | }) %>% rbindlist() 174 | 175 | (output$mkt_vol <- mkt_vol %>% ggplot(aes(eom, mkt_vol*sqrt(252))) + geom_point() + geom_hline(yintercept = 0)) 176 | 177 | # Check: All valid stocks have covariance estimate 178 | valid_cov_est <- names(barra_cov) %>% lapply(function(d) { 179 | data.table(eom=as.Date(d), id = barra_cov[[d]] %>% colnames() %>% as.integer(), valid_cov = T) 180 | }) %>% rbindlist() 181 | test <- valid_cov_est[chars[, .(id, eom, valid)], on = .(id, eom)] 182 | test[valid==T & eom>=min(valid_cov_est$eom), .N, by = valid_cov] 183 | } -------------------------------------------------------------------------------- /4 - Prepare Portfolio Data.R: -------------------------------------------------------------------------------- 1 | # Add return predictions --------------------------------- 2 | for (h in 1:settings$pf$hps$m1$K) { 3 | pred_data <- readRDS(paste0(get_from_path_model, "/model_", h, ".RDS")) |> 4 | lapply(function(x) x$pred) |> 5 | rbindlist() |> 6 | select(id, eom, pred) |> 7 | rename_with(~ paste0("pred_ld", h), .cols = "pred") 8 | chars <- pred_data[chars, on = .(id, eom)] 9 | } 10 | 11 | # Create lambda list 
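# lambda is the per-stock price-impact parameter set in "1 - Prepare Data.R" as
# 2/dolvol * settings$pi, and it enters the quadratic trading-cost term (see
# pf_ts_fun in "0 - General functions.R"). Below it is stored as one named vector
# per month end, keyed by stock id; create_lambda() turns such a vector into a
# diagonal matrix. A minimal sketch with two ids from the data and made-up values:
if (FALSE) {
  lam_toy <- c("10107" = 2e-8, "14593" = 5e-8)      # named by id
  create_lambda(lam_toy, ids = c("14593", "10107")) # 2 x 2 diagonal matrix, in the requested id order
}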
------------------------------------- 12 | lambda_dates <- unique(chars$eom) 13 | lambda_list <- lambda_dates |> map(function(d) { 14 | x <- chars[eom==d, .(id, lambda)][order(id)] 15 | ids <- x$id 16 | x <- x$lambda 17 | names(x) <- ids 18 | return(x) 19 | }) |> setNames(as.character(lambda_dates)) 20 | rm(lambda_dates) 21 | 22 | # Important dates ---------------------- 23 | first_cov_date <- names(barra_cov) %>% as.Date() %>% min() 24 | hp_years <- seq(from=settings$pf$dates$start_year, to=settings$pf$dates$end_yr) 25 | start_oos <- settings$pf$dates$start_year+settings$pf$dates$split_years 26 | 27 | dates_m1 <- seq.Date(from = settings$split$train_end+1, to = settings$split$test_end+1-months(1), by = "1 month")-1 28 | dates_m2 <- seq.Date(from = first_cov_date+1+months(pf_set$lb_hor+1), to = settings$split$test_end+1-months(1), by = "1 month")-1 29 | dates_oos <- seq.Date(from = as.Date(paste0(start_oos, "-", "01-01")), to = settings$split$test_end+1-months(1), by = "1 month")-1 30 | dates_hp <- seq.Date(from = as.Date(paste0(min(hp_years), "-", "01-01")), to = settings$split$test_end+1-months(1), by = "1 month")-1 31 | -------------------------------------------------------------------------------- /5 - Base case.R: -------------------------------------------------------------------------------- 1 | # Benchmark portfolios ------------------------------------------ 2 | # Markowitz-ML 3 | tpf <- chars |> tpf_implement(cov_list = barra_cov, wealth = wealth, dates = dates_oos, gam = pf_set$gamma_rel) 4 | 5 | # Factor-ML 6 | factor_ml <- chars %>% factor_ml_implement(dates = dates_oos, n_pfs = settings$factor_ml$n_pfs, wealth = wealth, gam = pf_set$gamma_rel) 7 | 8 | # Market 9 | mkt <- chars %>% mkt_implement(dates = dates_oos, wealth = wealth) 10 | 11 | # 1/n 12 | ew <- chars %>% ew_implement(dates = dates_oos, wealth = wealth) 13 | 14 | # Fama-MacBeth / Rank weighted portfolios 15 | rw <- chars %>% rw_implement(dates = dates_oos, wealth = wealth) 16 | 17 | # Minimum variance 18 | mv <- chars %>% mv_implement(cov_list = barra_cov, dates = dates_oos, wealth = wealth) 19 | 20 | # Output 21 | bm_pfs <- rbind(tpf$pf, factor_ml$pf, ew$pf, mkt$pf, rw$pf, mv$pf) 22 | bm_pfs %>% fwrite(paste0(output_path, "/bms.csv")) 23 | 24 | # Static-ML ------------------------------------------------------ 25 | # Implement 26 | static <- static_implement( 27 | data_tc = chars, 28 | cov_list = barra_cov, 29 | lambda_list = lambda_list, 30 | rf = risk_free, # Data 31 | wealth = wealth, 32 | mu = pf_set$mu, 33 | gamma_rel = pf_set$gamma_rel, # Investor 34 | dates_full = dates_m1, 35 | dates_oos = dates_oos, 36 | dates_hp = dates_hp, 37 | hp_years = hp_years, # Dates 38 | k_vec = settings$pf$hps$static$k, 39 | u_vec = settings$pf$hps$static$u, 40 | g_vec = settings$pf$hps$static$g, 41 | cov_type = settings$pf$hps$cov_type, 42 | validation = NULL 43 | ) 44 | # Output 45 | static |> saveRDS(paste0(output_path, "/static-ml.RDS")) 46 | 47 | # Portfolio-ML --------------------------------------------------- 48 | # Implement 49 | pfml <- pfml_implement( 50 | data_tc = chars, 51 | cov_list = barra_cov, 52 | lambda_list = lambda_list, 53 | features = features, 54 | risk_free = risk_free, # Data 55 | wealth = wealth, 56 | mu = pf_set$mu, 57 | gamma_rel = pf_set$gamma_rel, # Investor 58 | dates_full = dates_m2, 59 | dates_oos = dates_oos, 60 | lb = pf_set$lb_hor, 61 | hp_years = hp_years, # Dates 62 | rff_feat = T, 63 | g_vec = settings$pf_ml$g_vec, 64 | p_vec = settings$pf_ml$p_vec, 65 | l_vec = settings$pf_ml$l_vec, 66 | 
scale = settings$pf_ml$scale, 67 | orig_feat = settings$pf_ml$orig_feat, # Hyperparameters 68 | iter = 10, 69 | hps = NULL, 70 | balanced = F, 71 | seed = settings$seed_no # Other 72 | ) 73 | # Output 74 | pfml |> saveRDS(paste0(output_path, "/portfolio-ml.RDS")) 75 | 76 | # Multiperiod-ML ------------------------------------------------ 77 | if (config_params$update_mp) { 78 | # Implement 79 | mp <- mp_implement( 80 | data_tc = chars, 81 | cov_list = barra_cov, 82 | lambda_list = lambda_list, 83 | rf = risk_free, # Data 84 | wealth = wealth, 85 | mu = pf_set$mu, 86 | gamma_rel = pf_set$gamma_rel, # Investor 87 | dates_full = dates_m1, 88 | dates_oos = dates_oos, 89 | dates_hp = dates_hp, 90 | hp_years = hp_years, # Dates 91 | k_vec = settings$pf$hps$m1$k, 92 | u_vec = settings$pf$hps$m1$u, 93 | g_vec = settings$pf$hps$m1$g, 94 | cov_type = settings$pf$hps$cov_type, 95 | validation = NULL, 96 | iter = 10, 97 | K = settings$pf$hps$m1$K 98 | ) 99 | # Save output 100 | mp |> saveRDS(paste0(output_path, "/multiperiod-ml.RDS")) 101 | } -------------------------------------------------------------------------------- /5 - Feature importance IEF.R: -------------------------------------------------------------------------------- 1 | # Portfolio-ML - IEF -------------------- 2 | # Only some clusters are implemented for full IEF 3 | ief_path <- "Data/Generated/Portfolios/IEF/" 4 | ief_cf_clusters <- c("quality", "value", "momentum", "short_term_reversal") 5 | pfml_cf_ief <- settings$ef$gamma_rel |> map(.progress = T, function(gamma_rel) { 6 | # Find hyper-parameters for specific gamma 7 | x <- paste0("20240514-20_WEALTH1e+10_GAMMA", gamma_rel, "_SIZEperc_low50_high100_min50_INDTRUE") 8 | pfml <- readRDS(paste0(ief_path, x, "/portfolio-ml.RDS")) 9 | # Implement for each cluster 10 | pfml_cf_base <- ief_cf_clusters |> map(.progress = T, function(cf_cluster) { 11 | chars[valid==T] |> pfml_cf_fun( 12 | cf_cluster = cf_cluster, 13 | pfml_base = pfml, 14 | dates = dates_oos, 15 | cov_list = barra_cov, 16 | scale = settings$pf_ml$scale, 17 | orig_feat = settings$pf_ml$orig_feat, 18 | gamma_rel = gamma_rel, 19 | wealth = wealth, 20 | risk_free = risk_free, 21 | mu = pf_set$mu, 22 | iter = 10, 23 | seed = settings$seed_no 24 | ) 25 | }) |> rbindlist() |> mutate(gamma_rel = gamma_rel) 26 | }) |> rbindlist() 27 | # Save 28 | pfml_cf_ief |> fwrite(paste0(output_path, "/pfml_cf_ief.csv")) 29 | -------------------------------------------------------------------------------- /5 - Feature importance base.R: -------------------------------------------------------------------------------- 1 | # Markowitz-ML - Base case --------------------- 2 | cf_clusters <- c("bm", clusters) 3 | tpf_cf_wealth <- wealth_func(wealth_end = 0, end = settings$split$test_end, market = market, risk_free = risk_free) 4 | er_models <- readRDS(paste0(get_from_path_model, "/model_1.RDS")) 5 | tpf_cf_base <- cf_clusters |> map(.progress = T, function(cf_cluster) { 6 | chars[valid==T] |> tpf_cf_fun( 7 | cf_cluster = cf_cluster, 8 | er_models = er_models, 9 | dates = dates_oos, 10 | cluster_labels = cluster_labels, 11 | cov_list = barra_cov, 12 | gamma_rel = 100, # Use higher gamma to make TPF less extreme (and increase realized utility) 13 | wealth = tpf_cf_wealth, 14 | seed = settings$seed_no 15 | ) 16 | }) |> rbindlist() 17 | # Save 18 | tpf_cf_base |> fwrite(paste0(output_path, "/tpf_cf_base.csv")) 19 | 20 | # tpf_cf_base[, .(mean = mean(r)*12, sd = sd(r)*sqrt(12), u = (mean(r)-pf_set$gamma_rel/2*var(r))*12), by = cluster][, sr := 
mean/sd][order(sr)] 21 | 22 | # Portfolio-ML - Base case -------------------- 23 | # Load PFML base case model, for its HPs 24 | base_path <- "Data/Generated/Portfolios/Base/" 25 | pfml <- readRDS(paste0(base_path, list.files(base_path), "/portfolio-ml.RDS")) 26 | # All clusters are implemented for the base case 27 | cf_clusters <- c("bm", clusters) 28 | pfml_cf_base <- cf_clusters |> map(.progress = T, function(cf_cluster) { 29 | chars[valid==T] |> pfml_cf_fun( 30 | cf_cluster = cf_cluster, 31 | pfml_base = pfml, 32 | dates = dates_oos, 33 | cov_list = barra_cov, 34 | scale = settings$pf_ml$scale, 35 | orig_feat = settings$pf_ml$orig_feat, 36 | gamma_rel = pf_set$gamma_rel, 37 | wealth = wealth, 38 | risk_free = risk_free, 39 | mu = pf_set$mu, 40 | iter = 10, 41 | seed = settings$seed_no 42 | ) 43 | }) |> rbindlist() 44 | # Save 45 | pfml_cf_base |> fwrite(paste0(output_path, "/pfml_cf_base.csv")) -------------------------------------------------------------------------------- /5 - Feature importance ret.R: -------------------------------------------------------------------------------- 1 | # Estimate counterfactual predictions 2 | ret_cf <- 1:12 %>% map(.progress = "model iteration", function(h) { 3 | model <- readRDS(paste0(get_from_path_model, "/model_", h, ".RDS")) 4 | # Prepare data 5 | pred_y <- data_ret[, .(id, eom, ret_pred=get(paste0("ret_ld", h)))] 6 | ret_cf_data <- pred_y[chars, on = .(id, eom)] 7 | preds <- model %>% map("pred") %>% rbindlist() %>% select(id, eom, pred) 8 | ret_cf_data <- ret_cf_data[preds, on = .(id, eom)][!is.na(ret_pred)] 9 | # Implement counterfactual 10 | c("bm", sort(clusters)) %>% map(progress=T, function(cls) { 11 | set.seed(settings$seed) # Set seed for reproducibility 12 | print(paste0(" ", cls)) 13 | if (cls=="bm") { 14 | cf <- copy(ret_cf_data) 15 | } else { 16 | cf <- copy(ret_cf_data) %>% select(-c(pred)) # Counterfactual 17 | cf[, id_shuffle := sample(id, .N, replace=F), by = eom] 18 | cls_chars <- cluster_labels[cluster==cls & characteristic %in% features, characteristic] 19 | chars_data <- cf %>% select(all_of(c("id", "eom", cls_chars))) %>% rename("id_shuffle"=id) 20 | cf <- chars_data[cf[, (cls_chars) := NULL], on = .(id_shuffle, eom)] 21 | # Expected Returns 22 | for (m_sub in model) { 23 | if (!is.null(m_sub$pred)) { 24 | sub_dates <- unique(m_sub$pred$eom) 25 | cf_x <- cf[eom %in% sub_dates, features, with=F] %>% as.matrix() 26 | cf_new_x <- cf_x %>% rff(W=m_sub$W) 27 | cf_new_x <- m_sub$opt_hps$p^(-0.5)*cbind(cf_new_x$X_cos, cf_new_x$X_sin) 28 | cf[eom %in% sub_dates, pred := drop(m_sub$fit %>% predict(newx = cf_new_x, s = m_sub$opt_hps$lambda))] 29 | } 30 | } 31 | } 32 | cf[, .(cluster = cls, n = .N, mse = mean((pred-ret_pred)^2)), by = eom] 33 | }) %>% rbindlist() %>% mutate(h = h) 34 | }) %>% rbindlist() 35 | # Save 36 | ret_cf |> fwrite(paste0(output_path, "/ret_cf.csv")) -------------------------------------------------------------------------------- /6 - Base analysis.R: -------------------------------------------------------------------------------- 1 | # Need to run everything from main to right before bm_pfs in 5 - base case 2 | # Load base case portfolios ----------- 3 | base_path <- "Data/Generated/Portfolios/Base/" 4 | base_folder <- list.files(base_path) 5 | mp <- readRDS(paste0(base_path, base_folder, "/multiperiod-ml.RDS")) 6 | pfml <- readRDS(paste0(base_path, base_folder, "/portfolio-ml.RDS")) 7 | static <- readRDS(paste0(base_path, base_folder, "/static-ml.RDS")) 8 | bm_pfs <- fread(paste0(base_path, base_folder, 
"/bms.csv")) |> 9 | mutate( 10 | eom_ret = as.Date(eom_ret), 11 | type = if_else(type == "Rank-Weighted", "Rank-ML", type) # Old naming convention 12 | ) 13 | 14 | # Final Portfolios --------------------- 15 | pfs <- rbind( 16 | mp$pf, 17 | pfml$pf, 18 | static$pf, 19 | bm_pfs, 20 | mp$hps[eom_ret %in% mp$pf$eom_ret & k==1 & g==0 & u==1, .(eom_ret=as.Date(eom_ret), inv, shorting, turnover, r, tc)][, type := "Multiperiod-ML"], 21 | static$hps[eom_ret %in% static$pf$eom_ret & k==1 & g==0 & u==1, .(eom_ret=as.Date(eom_ret), inv, shorting, turnover, r, tc)][, type := "Static-ML"] 22 | ) 23 | pfs[, type := type %>% factor(levels = pf_order)] 24 | pfs %>% setorder(type, eom_ret) 25 | pfs[, e_var_adj := (r-mean(r))^2, by=type] 26 | pfs[, utility_t := r-tc-0.5*e_var_adj*pf_set$gamma_rel] 27 | 28 | # Portfolio summary stats -------------- 29 | pf_summary <- pfs[, .( 30 | n = .N, 31 | inv = mean(inv), 32 | shorting = mean(shorting), 33 | turnover_notional = mean(turnover), 34 | r = mean(r)*12, 35 | sd = sd(r)*sqrt(12), 36 | sr_gross = mean(r)/sd(r)*sqrt(12), 37 | tc = mean(tc)*12, 38 | r_tc = mean((r-tc))*12, 39 | sr = mean(r-tc)/sd(r)*sqrt(12), 40 | obj = (mean(r)-0.5*var(r)*pf_set$gamma_rel-mean(tc))*12#, 41 | # obj_exp = (mean(e_r)-0.5*mean(e_var)*pf_set$gamma_rel-mean(tc))*12 42 | ), by = .(type)][order(type)] 43 | # Remove not essential types 44 | pfs <- pfs[type %in% main_types][, type := type %>% factor(levels = main_types)] 45 | 46 | # Performance Time-Series ----------- 47 | types <- unique(pfs$type) 48 | pfs[, cumret := cumsum(r), by=.(type)] 49 | pfs[, cumret_tc := cumsum(r-tc), by=.(type)] 50 | pfs[, cumret_tc_risk := cumsum(utility_t), by=.(type)] 51 | ts_data <- pfs[type %in% main_types, .(type, eom_ret, cumret, cumret_tc, cumret_tc_risk)] %>% 52 | pivot_longer(-c(type, eom_ret)) %>% 53 | bind_rows(tibble( 54 | eom_ret = ceiling_date(min(pfs$eom_ret), unit = "month")-1, 55 | type = rep(types, 3), 56 | value = rep(0, 3*length(types)), 57 | name = c(rep("cumret", length(types)), rep("cumret_tc", length(types)), rep("cumret_tc_risk", length(types))) 58 | )) %>% 59 | mutate( 60 | name_pretty = case_when( 61 | name == "cumret" ~ "Gross return", 62 | name == "cumret_tc" ~ "Return net of TC", 63 | name == "cumret_tc_risk" ~ "Return net of TC and Risk" 64 | ) %>% factor(levels = c("Gross return", "Return net of TC", "Return net of TC and Risk")) 65 | ) %>% 66 | setDT() 67 | 68 | ts_plots <- unique(ts_data$name) %>% lapply(function(x) { 69 | p <- ts_data[name == x] %>% 70 | ggplot(aes(eom_ret, value, colour=type, linetype=type)) + 71 | geom_line() + 72 | coord_cartesian(ylim=c(0, NA)) + 73 | facet_wrap(~name_pretty, scales = "free") + 74 | labs(y = "Cummulative performance", colour = "Method:", linetype="Method:") + 75 | scale_x_date(date_labels = "%Y", breaks = seq.Date(as.Date("1980-12-31"), as.Date("2020-12-31"), by = "20 years")) + 76 | theme( 77 | axis.title.x = element_blank(), 78 | strip.background = element_rect(fill = "white", color = "black"), 79 | text = element_text(size=11) 80 | ) 81 | if (x == "cumret") { 82 | y_range <- ts_data[name == x & type != "Markowitz-ML" & name=="cumret", .(max = max(value), min = min(value))] 83 | p <- p + coord_cartesian(ylim=c(y_range$min, y_range$max)) 84 | } 85 | p 86 | }) 87 | # ts_legend <- cowplot::get_legend( 88 | # ts_plots[[1]] + guides(color = guide_legend(nrow = 1)) + theme(legend.position = "top", legend.title = element_blank()) 89 | # ) 90 | ts_legend <- cowplot::get_plot_component( 91 | plot = ts_plots[[1]] + guides(color = 
guide_legend(nrow = 1)) + theme(legend.position = "top", legend.title = element_blank(), legend.justification = "center"), 92 | 'guide-box-top', return_all = TRUE) 93 | ts_plot <- cowplot::plot_grid( 94 | ts_plots[[1]] + theme(legend.position = "none"), 95 | ts_plots[[2]] + theme(axis.title.y = element_blank(), legend.position = "none"), 96 | ts_plots[[3]] + theme(axis.title.y = element_blank(), legend.position = "none"), 97 | nrow = 1, rel_widths = c(1.07, 1, 1) 98 | ) 99 | (output$ts <- cowplot::plot_grid(ts_legend, ts_plot, ncol=1, rel_heights = c(.1, 1))) 100 | 101 | # Test probability of outperformance --------------------------------------------------- 102 | pfs_wide <- pfs[, .(type, eom_ret, utility_t)] %>% dcast(eom_ret~type, value.var = "utility_t") 103 | 104 | prob_outperformance <- main_types %>% lapply(function(t) { 105 | x <- pfs_wide %>% 106 | melt(id.vars = c("eom_ret", t), variable.name = "alt") 107 | x[, diff := get(t)-value, by = alt] 108 | x[, .( 109 | prob_main_op = 1-pnorm(q = 0, mean=mean(diff), sd = sd(diff)/sqrt(length(diff))) 110 | ), by = alt] %>% mutate(main = t) 111 | }) %>% rbindlist() 112 | 113 | # Portfolio statistics over time -------------------- 114 | # Compute expected portfolio risk 115 | ws <- rbind( 116 | pfml$w |> mutate(type = "Portfolio-ML"), 117 | tpf$w |> mutate(type = "Markowitz-ML"), 118 | mp$w |> mutate(type = "Multiperiod-ML*"), 119 | static$w |> mutate(type = "Static-ML*") |> select(-pred_ld1), 120 | factor_ml$w |> mutate(type = "Factor-ML") 121 | ) |> mutate(type = type %>% factor(levels = pf_order)) 122 | pf_vars <- ws |> expected_risk_fun(dates=dates_oos, cov_list=barra_cov) 123 | # Compute portfolios statistics 124 | (output$comp_stats <- pf_vars[pfs[, .(type, "eom"=eom_ret+1-months(1)-1, inv, turnover)], on = .(type, eom)] %>% # , e_sd 125 | filter(type %in% main_types) |> 126 | mutate(e_sd = sqrt(pf_var*252)) |> 127 | select(-pf_var) |> 128 | pivot_longer(-c(type, eom)) %>% 129 | mutate( 130 | name = case_when( 131 | name == "e_sd" ~ "Ex-ante Volatility", 132 | name == "turnover" ~ "Turnover", 133 | name == "inv" ~ "Leverage" 134 | ) 135 | ) %>% 136 | ggplot(aes(eom, value, colour=type, linetype=type)) + 137 | geom_line() + 138 | scale_y_log10() + 139 | labs(y = "Value") + 140 | facet_wrap(~name, scales="free_y", ncol=1) + 141 | theme( 142 | legend.position = "top", 143 | legend.title = element_blank(), 144 | axis.title.x = element_blank(), 145 | axis.title.y = element_blank() 146 | )) 147 | 148 | # Correlation ---------------------------------- 149 | pf_cors <- pfs[, .(eom_ret, r, type)] %>% 150 | pivot_wider(names_from = type, values_from = r) %>% 151 | select(-eom_ret) %>% 152 | cor(method = "pearson") 153 | 154 | pf_cors %>% 155 | corrplot::corrplot(method = "number", type = "lower") 156 | 157 | # Apple vs. 
Xerox ---------------------------------------------------- 158 | liquid <- 22111 # 14593 159 | # liquid <- c(93436, 14593, 91103, 88352, 19561, 10107) 160 | illiquid <- 27983 # Xerox (illiquid stock with all observations: data_tc[valid==T & year(eom)>=2015][, n := .N, by = id][n==max(n)][, .(id, eom, n, me, dolvol)][eom==max(eom)][order(dolvol)]) 161 | # illiquid <- 26825 # Kellog (smallest stock with all observations: data_tc[valid==T & size_grp=="mega"][, n := .N, by = id][n == max(n)][, .(id, eom, n, me)][eom==max(eom)][order(me)]) 162 | position <- rbind( 163 | mp$w[id %in% c(liquid, illiquid)][, type := "Multiperiod-ML*"], 164 | pfml$w[id %in% c(liquid, illiquid)][, type := "Portfolio-ML"], 165 | static$w[id %in% c(liquid, illiquid)][, type := "Static-ML*"], 166 | tpf$w[id %in% c(liquid, illiquid)][, type := "Markowitz-ML"], 167 | factor_ml$w[id %in% c(liquid, illiquid)][, type := "Factor-ML"], 168 | mkt$w[id %in% c(liquid, illiquid)][, type := "Market"], 169 | fill=T 170 | ) %>% mutate( 171 | stock_type = case_when( 172 | id == 14593 ~ "Apple (liquid)", 173 | id == 27983 ~ "Xerox (illiquid)", 174 | id == 93436 ~ "Tesla", 175 | id == 91103 ~ "Visa", 176 | id == 19561 ~ "Boing", 177 | id == 10107 ~ "Microsoft", 178 | id == 22111 ~ "Johnson and Johnson (liquid)", 179 | id == 55976 ~ "Walmart (liquid)", 180 | TRUE ~ as.character(id) 181 | ) 182 | ) 183 | position <- position[year(eom)>=2015] 184 | position <- pfs[, .(type, "eom"=eom_ret %>% ceiling_date(unit="m")-1, inv)][position, on = .(type, eom)] 185 | position[, w_z := (w-mean(w))/sd(w), by = .(type, id)] 186 | 187 | (output$example <- position %>% 188 | mutate(type = type %>% factor(levels = pf_order)) %>% 189 | ggplot(aes(eom, w, colour=stock_type, linetype = stock_type)) + 190 | # geom_point(alpha=0.2) + 191 | geom_line() + 192 | geom_hline(yintercept = 0) + 193 | facet_wrap(~type, scales = "free_y") + 194 | labs(y = "Portfolio weight") + 195 | theme( 196 | legend.position = "top", 197 | legend.title = element_blank(), 198 | legend.justification = "center", 199 | strip.background = element_rect(fill = "white", color = "black"), 200 | axis.title.x = element_blank() 201 | )) 202 | 203 | if (FALSE) { 204 | position %>% 205 | mutate(tc_type = (type %in% c("Markowitz-ML", "Portfolio Sort"))) %>% 206 | filter(type != "Portfolio Sort") %>% 207 | ggplot(aes(eom, w_z, colour=type)) + # , alpha = tc_type 208 | # geom_point(alpha=0.2) + 209 | geom_line(alpha=0.6) + 210 | # scale_alpha_discrete(range = c(0.6, 0.35)) + 211 | geom_hline(yintercept = 0) + 212 | facet_wrap(~stock_type) 213 | } 214 | 215 | # Optimal Hyper-parameters ---------------------------- 216 | model_1 <- readRDS(paste0(get_from_path_model, "/model_1.RDS")) 217 | model_6 <- readRDS(paste0(get_from_path_model, "/model_6.RDS")) 218 | model_12 <- readRDS(paste0(get_from_path_model, "/model_12.RDS")) 219 | (output$er_tuning <- rbind( 220 | model_1 %>% lapply(function(x) { 221 | x$pred[, .(eom_ret = eom+1+months(1)-1)] %>% cbind(x$opt_hps[, .(lambda, p, g)]) %>% unique() 222 | }) %>% rbindlist() %>% filter(!is.na(eom_ret)) %>% mutate(horizon = "Return t+1"), 223 | model_6 %>% lapply(function(x) { 224 | x$pred[, .(eom_ret = eom+1+months(1)-1)] %>% cbind(x$opt_hps[, .(lambda, p, g)]) %>% unique() 225 | }) %>% rbindlist() %>% filter(!is.na(eom_ret)) %>% mutate(horizon = "Return t+6"), 226 | model_12 %>% lapply(function(x) { 227 | x$pred[, .(eom_ret = eom+1+months(1)-1)] %>% cbind(x$opt_hps[, .(lambda, p, g)]) %>% unique() 228 | }) %>% rbindlist() %>% filter(!is.na(eom_ret)) %>% 
mutate(horizon = "Return t+12") 229 | ) %>% 230 | mutate(lambda = log(lambda)) |> 231 | rename("eta"=g, "log(lambda)"=lambda) %>% 232 | pivot_longer(-c(horizon, eom_ret)) %>% 233 | mutate( 234 | comb_name = paste0("'", horizon, ":'~", name) %>% factor(levels = c( 235 | paste0("'Return t+1:'~", c("log(lambda)", "p", "eta")), 236 | paste0("'Return t+6:'~", c("log(lambda)", "p", "eta")), 237 | paste0("'Return t+12:'~", c("log(lambda)", "p", "eta"))) 238 | ) 239 | ) %>% 240 | ggplot(aes(eom_ret, value)) + 241 | geom_point(alpha=0.75, colour = colours_theme[10]) + 242 | facet_wrap(~comb_name, scales = "free_y", labeller = label_parsed) + 243 | labs(y = "Optimal hyper-parameter") + 244 | theme( 245 | legend.position = "none", 246 | axis.title.x = element_blank(), 247 | text = element_text(size=9), 248 | strip.background = element_rect(fill = "white", color = "black"), 249 | axis.text = element_text(size=8) 250 | )) 251 | 252 | (output$portfolio_tuning <- rbind( 253 | mp$hps[rank == 1 & year(eom_ret)>=1981 & month(eom_ret)==12, .(eom_ret, k, g, u)][, type := "Multiperiod-ML*"], 254 | static$hps[rank == 1 & year(eom_ret) >= 1981 & month(eom_ret)==12, .(eom_ret, k, g, u)][, type := "Static-ML*"] 255 | ) %>% 256 | rename("v"=g) %>% 257 | pivot_longer(-c(type, eom_ret)) %>% 258 | rbind(pfml$best_hps[eom_ret>=1981, .(type = "Portfolio-ML", eom_ret, "log(lambda)"=log(l), p, "eta"=g)] %>% pivot_longer(-c(type, eom_ret))) %>% 259 | mutate( 260 | comb_name = paste0("'", type, ":'~", name) %>% factor(levels = c(paste0("'Portfolio-ML:'~", c("log(lambda)", "p", "eta")), 261 | paste0("'Multiperiod-ML*:'~", c("k", "u", "v")), 262 | paste0("'Static-ML*:'~", c("k", "u", "v")))) 263 | ) %>% 264 | ggplot(aes(eom_ret, value, colour = type)) + 265 | geom_point(alpha=0.75) + 266 | facet_wrap(~comb_name, scales = "free_y", labeller = label_parsed) + 267 | labs(y = "Optimal hyper-parameter") + 268 | theme( 269 | legend.position = "none", 270 | axis.title.x = element_blank(), 271 | text = element_text(size=9), 272 | strip.background = element_rect(fill = "white", color = "black"), 273 | axis.text = element_text(size=8) 274 | )) 275 | 276 | # AR1 plot ------------------------------------------- 277 | ar1 <- chars[order(id, eom)] 278 | ar1[, lag_ok := (eom - lag(eom)) %in% c(28:31), by = id] 279 | 280 | ar1_ss <- features %>% map(.progress = T, function(x) { 281 | ar1[, var := get(x)] 282 | ar1[, var_l1 := lag(var), by = id] 283 | sub <- ar1[!is.na(var) & !is.na(var_l1) & lag_ok == T & var != 0.5 & var_l1 != 0.5 & valid==T][, n := .N, by = id][n >= 12*5] # 0.5 indicates missing 284 | sub[, .( 285 | n = .N, 286 | ar1 = cor(var, var_l1) 287 | ), by = id][is.na(ar1), ar1 := 1][, .( 288 | char = x, 289 | ar1 = mean(ar1) 290 | )] 291 | }) %>% 292 | rbindlist() 293 | temp_order <- cluster_labels[, .("char"=characteristic, cluster)][ar1_ss, on = "char"][, mean(ar1), by = cluster][order(V1)]$cluster %>% 294 | str_replace_all("_", " ") %>% str_replace("short term", "short-term") %>% str_to_title() 295 | (output$ar1 <- cluster_labels[, .("char"=characteristic, cluster)][ar1_ss, on = "char"] %>% 296 | group_by(cluster) %>% 297 | mutate(sort_var = mean(ar1)+ar1/100000) %>% 298 | mutate( 299 | pretty_name = cluster %>% str_replace_all("_", " ") %>% str_replace("short term", "short-term") %>% str_to_title() %>% 300 | factor(levels = temp_order) 301 | ) %>% 302 | ggplot(aes(reorder(char, sort_var), ar1, fill=pretty_name)) + 303 | geom_col() + 304 | coord_flip() + 305 | labs(y = "Average Monthly Autocorrelation", fill = "Theme") + 306 | 
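# Note on the AR(1) measure plotted here: for each characteristic, ar1_ss computes the
# first-order autocorrelation stock by stock, using only consecutive month-ends (lag_ok),
# dropping the 0.5 missing-value code, and requiring at least 5 years (12*5 months) of valid
# observations per stock; stocks with a constant characteristic (cor = NA) are assigned an
# AR(1) of 1, and the figure shows the cross-sectional average by characteristic, grouped
# into themes. Persistence presumably matters here because slow-moving characteristics imply
# slower-moving positions and hence lower turnover and trading costs.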
guides(fill = guide_legend(reverse = TRUE)) + 307 | theme( 308 | axis.title.y = element_blank() 309 | )) 310 | 311 | # Features with sufficient coverage ------------------ 312 | if (FALSE) { 313 | features_all <- c(features_m, feat_excl) 314 | 315 | ids <- c("source_crsp", "common", "obs_main", "primary_sec", "exch_main", "id", "eom", "sic", "ff49", "size_grp", "me", "rvol_21d", "dolvol_126d") 316 | data <- fread(paste0("../Data/usa.csv"), 317 | select = unique(c(ids, features_all)), #, "excntry" 318 | colClasses = c("eom" = "character", "sic"="character")) 319 | data[, eom := eom %>% lubridate::fast_strptime(format = "%Y%m%d") %>% as.Date()] 320 | data[, dolvol := dolvol_126d] 321 | data[, rvol_m := rvol_252d * sqrt(21)] 322 | data <- data[source_crsp == 1 & common==1 & obs_main==1 & primary_sec==1 & exch_main==1][, c("source_crsp", "common", "obs_main", "primary_sec", "exch_main") := NULL] 323 | # Screen Data ----------------- 324 | # Start Date Screens 325 | print(paste0(" Start date screen excludes ", round(mean(data$eom < settings$screens$start) * 100, 2), "% of the observations")) 326 | data <- data[eom >= settings$screens$start] 327 | # Monitor screen impact 328 | n_start <- nrow(data) 329 | me_start <- sum(data$me, na.rm = T) 330 | # Require me, dolvol and rvol 331 | print(paste0(" Non-missing me excludes ", round(mean(is.na(data$me)) * 100, 2), "% of the observations")) 332 | data <- data[!is.na(me)] 333 | if (settings$screens$require_rvol) { 334 | print(paste0(" Non-missing rvol_252d excludes ", round(mean(is.na(data$rvol_m)) * 100, 2), "% of the observations")) 335 | data <- data[!is.na(rvol_m)] 336 | } 337 | if (settings$screens$require_dolvol) { 338 | print(paste0(" Non-missing dolvol_126d excludes ", round(mean(is.na(data$dolvol)) * 100, 2), "% of the observations")) 339 | data <- data[!is.na(dolvol)] 340 | } 341 | # Size Screen 342 | print(paste0(" Size screen excludes ", round(mean(!(data$size_grp %in% settings$screens$size_grps)) * 100, 2), "% of the observations")) 343 | data <- data[size_grp %in% settings$screens$size_grps] 344 | # Feature Screens 345 | feat_available <- data %>% select(all_of(features_all)) %>% apply(1, function(x) sum(!is.na(x))) 346 | min_feat <- floor(length(features_all)*settings$screens$feat_pct) 347 | print(paste0(" At least ", settings$screens$feat_pct*100, "% of feature excludes ", round(mean(feat_available < min_feat)*100, 2), "% of the observations")) 348 | data <- data[feat_available >= min_feat] 349 | # Summary 350 | print(paste0(" In total, the final dataset has ", round( (nrow(data) / n_start)*100, 2), "% of the observations and ", round((sum(data$me) / me_start)*100, 2), "% of the market cap in the post ", settings$screens$start, " data")) 351 | 352 | # Check coverage by eom [Seems like we could include seas_16_20na and seas_16_20an] 353 | coverage <- data[, lapply(.SD, function(x) mean(!is.na(x))), by = eom, .SDcols=features_all] 354 | coverage[eom==min(eom)] %>% 355 | pivot_longer(-eom) %>% 356 | arrange(value) 357 | } 358 | -------------------------------------------------------------------------------- /6 - Economic intuition.R: -------------------------------------------------------------------------------- 1 | # How do weigths differ for PF-ML, Market, and Markowitz 2 | d <- as.Date("2020-11-30") 3 | w_ex <- rbind( 4 | static$w[eom==d, .(id, eom, w, type = "Static-ML*")], 5 | pfml$w[eom==d, .(id, eom, w, type = "Portfolio-ML")], 6 | mkt$w[eom==d, .(id, eom, w, type = "Market")], 7 | tpf$w[eom==d, .(id, eom, w, type = 
"Markowitz-ML")] 8 | ) 9 | w_ex <- chars[, .(id, eom, dolvol)][w_ex, on = .(id, eom)] 10 | 11 | w_ex[, n := .N, by = .(id, eom)] 12 | w_ex <- w_ex[n==max(n)] 13 | 14 | set.seed(settings$seed_no) 15 | sample_ids <- sample(unique(w_ex$id), 100) 16 | 17 | # w_ex %>% mutate(w = abs(w)) %>% pivot_wider(names_from = type, values_from = w) %>% select(-id, -n) %>% cor(method = "spearman") 18 | 19 | w_ex[, cor(dolvol, abs(w), method = "spearman"), by = type] 20 | 21 | w_ex %>% 22 | group_by(id) %>% 23 | # mutate(sort_var = w[type=="Market"]) %>% 24 | filter(id %in% sample_ids) %>% 25 | filter(type %in% c("Markowitz-ML", "Portfolio-ML")) %>% 26 | ggplot(aes(reorder(id, dolvol), abs(w))) + 27 | geom_col() + 28 | # geom_point() + 29 | coord_flip() + 30 | facet_wrap(~type, scales = "free_x") + 31 | labs(x = "Absolute portfolio weight", y = "Dollar volume rank") 32 | 33 | w_ex %>% 34 | group_by(type) %>% 35 | mutate(dolvol_rank = frank(dolvol)) %>% 36 | ggplot(aes(dolvol_rank, abs(w))) + 37 | geom_point() + 38 | geom_smooth(se=F) + 39 | facet_wrap(~type, scales = "free_y") 40 | 41 | 42 | 43 | # Portfolio analysis ---------- 44 | w_ex <- rbind( 45 | static$w[, .(id, eom, w, type = "Static-ML*")], 46 | pfml$w[, .(id, eom, w, type = "Portfolio-ML")], 47 | mkt$w[, .(id, eom, w, type = "Market")], 48 | tpf$w[, .(id, eom, w, type = "Markowitz-ML")] 49 | ) 50 | w_ex <- chars[, .(id, eom, dolvol)][w_ex, on = .(id, eom)] 51 | 52 | w_ex[, n := .N, by = .(id, eom)] 53 | w_ex <- w_ex[n==max(n)] 54 | 55 | w_ex[, dv_pf := ceiling(ecdf(dolvol)(dolvol)*10), by = .(type, eom)] 56 | 57 | w_ex[, .( 58 | w_abs = mean(abs(w)) 59 | ), by = .(type, eom, dv_pf)][, .( 60 | w_abs = mean(w_abs) 61 | ), by = .(type, dv_pf)] %>% 62 | filter(type %in% c("Markowitz-ML", "Portfolio-ML", "Market")) %>% 63 | ggplot(aes(factor(dv_pf), w_abs)) + 64 | geom_col(fill = colours_theme[1]) + 65 | facet_wrap(~type, scales = "free_y", ncol = 3) + 66 | labs(x = "Dollar volume sorted portfolios (1=low)") 67 | 68 | 69 | w_ex[, long := if_else(w>=0, "Long positions", "Short positions")] 70 | (output$w_liq <- w_ex[, .( 71 | w_abs = mean(w) 72 | ), by = .(type, eom, dv_pf, long)][, .( 73 | w_abs = mean(w_abs) 74 | ), by = .(type, dv_pf, long)] %>% 75 | filter(type %in% c("Markowitz-ML", "Portfolio-ML", "Market")) %>% 76 | ggplot(aes(factor(dv_pf), w_abs, fill=long)) + 77 | geom_col() + 78 | facet_wrap(~type, scales = "free_y", ncol = 3) + 79 | # scale_x_continuous(breaks = c(1, seq(5, 20, 5))) + 80 | labs(x = "Dollar volume sorted portfolios (1=low)", y = "Average stock weight") + 81 | theme( 82 | legend.position = "top", 83 | legend.title = element_blank() 84 | )) 85 | -------------------------------------------------------------------------------- /6 - Feature importance.R: -------------------------------------------------------------------------------- 1 | # FI in base case ----------------------- 2 | fi_path <- "Data/Generated/Portfolios/FI/" 3 | fi_folder <- paste0(fi_path, list.files(fi_path)) 4 | 5 | tpf_cf_base <- fread(paste0(fi_folder, "/tpf_cf_base.csv")) 6 | pfml_cf_base <- fread(paste0(fi_folder, "/pfml_cf_base.csv")) 7 | ret_cf <- fread(paste0(fi_folder, "/ret_cf.csv")) 8 | pfml_cf_ief <- fread(paste0(fi_folder, "/pfml_cf_ief.csv")) 9 | 10 | pfml_cf_ss <- pfml_cf_base %>% 11 | group_by(type, cluster) %>% 12 | summarise(cf_obj = (mean(r)-0.5*var(r)*pf_set$gamma_rel-mean(tc))*12) %>% 13 | ungroup() %>% 14 | mutate(fi = cf_obj[cluster=="bm"] - cf_obj) |> 15 | select(type, cluster, fi) 16 | 17 | 18 | tpf_cf_ss <- tpf_cf_base %>% 19 | 
group_by(type, cluster) %>% 20 | summarise(cf_obj = (mean(r)-0.5*var(r)*pf_set$gamma_rel-mean(tc))*12) %>% 21 | ungroup() %>% 22 | mutate(fi = cf_obj[cluster=="bm"]-cf_obj) |> 23 | select(type, cluster, fi) 24 | 25 | (output$feature_importance <- rbind( 26 | pfml_cf_ss %>% mutate(wealth = 1e10), 27 | tpf_cf_ss %>% mutate(wealth = 0)#, 28 | # shap_er1 %>% mutate(type = "Expected 1m Return") %>% select(type, cluster, fi) %>% mutate(wealth = 0) 29 | ) %>% 30 | filter(cluster != "bm") |> 31 | group_by(type) %>% 32 | # mutate(fi = fi / max(fi)) %>% 33 | group_by(cluster) %>% 34 | mutate( 35 | sort_var = sum(fi*(type == "Portfolio-ML")), 36 | # sort_var = sum(fi*(type == "Expected 1m Return")), 37 | cluster = cluster %>% str_replace_all("_", " ") %>% str_replace("short term", "short-term") %>% str_to_title(), 38 | type = type %>% factor(levels = c("Portfolio-ML", "Multiperiod-ML*", "Markowitz-ML", "Expected 1m Return")) 39 | ) %>% 40 | filter(type != "Expected 1m Return") %>% 41 | ggplot(aes(reorder(cluster, sort_var), fi, fill = type)) + 42 | geom_col(position = "dodge") + 43 | scale_fill_manual(values = c(colours_theme[1], colours_theme[2], colours_theme[5])) + 44 | coord_flip() + 45 | facet_wrap(~type, nrow = 1, scales = "free_x") + 46 | theme( 47 | legend.position = "none", 48 | axis.title.y = element_blank(), 49 | strip.background = element_rect(fill = "white", color = "black") 50 | ) + 51 | labs(y = "Drop in realized utility from permuting theme features")) 52 | 53 | 54 | # FI in IEF ------------------------- 55 | # Summary stats 56 | ef_cf_ss <- pfml_cf_ief[, .( 57 | obj = (mean(r)-0.5*var(r)*gamma_rel-mean(tc))*12, 58 | r_tc = mean(r-tc)*12, 59 | sd = sd(r)*sqrt(12) 60 | ), by = .(gamma_rel, cluster)][, sr := r_tc / sd] %>% rename("shuffled"=cluster) 61 | # Add benchmark 62 | ef_cf_ss <- ef_cf_ss %>% rbind(ef_ss[wealth_end==pf_set$wealth, .(gamma_rel, shuffled = "none", obj, r_tc, sd, sr)]) 63 | 64 | # With trading costs 65 | sub <- c("Quality", "Value", "Short-Term Reversal", "None") 66 | (output$cf_ef_tc <- ef_cf_ss %>% 67 | rbind(expand.grid(shuffled = unique(ef_cf_ss$shuffled), sd=0, r_tc=0), fill=T) %>% 68 | mutate( 69 | shuffled = shuffled %>% str_replace_all("_", " ") %>% str_replace_all("short term", "short-term") %>% str_to_title(), 70 | shuffled = shuffled %>% factor(levels = c("None", "Quality", "Value", "Short-Term Reversal", "Momentum")) 71 | ) %>% 72 | filter(shuffled %in% sub) %>% 73 | ggplot(aes(sd, r_tc, colour = factor(shuffled), group = factor(shuffled), shape = factor(gamma_rel), linetype=factor(shuffled))) + 74 | geom_line() + 75 | geom_point(size=2) + 76 | # coord_cartesian(xlim = c(0, 0.6), ylim = c(0, 0.5)) + 77 | coord_cartesian(xlim = c(0, 0.25), ylim = c(0, 0.28), expand=F) + 78 | # scale_x_continuous(breaks = seq(0, 0.3, 0.1)) + 79 | labs(x = "Volatility", y = "Excess returns (net of trading cost)", shape = "Relative Risk Aversion:", 80 | colour = "Theme permuted:", linetype = "Theme permuted:") + 81 | guides( 82 | colour = guide_legend(order = 1, override.aes = list(size=1, shape=NA)), 83 | linetype = guide_legend(order = 1), 84 | shape = "none" #guide_legend(order = 2, label.hjust = 1) 85 | ) + 86 | theme( 87 | legend.position = c(0.5, 0.97), 88 | legend.direction="horizontal", 89 | legend.justification = "center" 90 | )) 91 | 92 | # Counterfactual EF without TC -------------------------- 93 | tpf_cf_ss <- tpf_cf_base[, .( 94 | sr = mean(r)/sd(r)*sqrt(12) 95 | ), by = cluster][cluster=="bm", cluster := "none"] 96 | 97 | x_values <- data.table(sd = seq(0, 
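# The counterfactual frontier without trading costs is linear: with costless leverage,
# scaling the Markowitz-ML weights scales mean and volatility proportionally, so expected
# excess return is simply sd * sr at every volatility level. Crossing the per-cluster gross
# Sharpe ratios in tpf_cf_ss with the volatility grid defined below therefore traces out one
# straight line per permuted theme in the cf_ef_markowitz figure.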
0.35, 0.01)) 98 | 99 | (output$cf_ef_markowitz <- tidyr::crossing(tpf_cf_ss, x_values) %>% 100 | mutate( 101 | ret = sd*sr, 102 | shuffled = cluster %>% str_replace_all("_", " ") %>% str_replace_all("short term", "short-term") %>% str_to_title(), 103 | shuffled = shuffled %>% factor(levels = c("None", "Quality", "Value", "Short-Term Reversal", "Momentum")) 104 | ) %>% 105 | filter(shuffled %in% sub) %>% 106 | ggplot(aes(sd, ret, colour = factor(shuffled), group = factor(shuffled), linetype = factor(shuffled))) + 107 | geom_line() + 108 | # coord_cartesian(xlim = c(0.15, 0.3), ylim = c(0.25, 0.5)) + 109 | coord_cartesian(xlim = c(0, 0.25), ylim = c(0, 0.55), expand=F) + 110 | # coord_cartesian(xlim = c(0, 0.165), ylim = c(0, 0.35), expand=F) + 111 | labs(x = "Volatility", y = "Excess returns", colour = "Theme permuted:", linetype = "Theme permuted:") + 112 | guides( 113 | colour = guide_legend(order = 1, override.aes = list(size=1)), 114 | linetype = guide_legend(order = 1) 115 | ) + 116 | theme( 117 | legend.position = c(0.5, 0.97), 118 | legend.direction="horizontal" 119 | )) 120 | 121 | # Feature importance for return predictions models -------------------------------- 122 | ret_cf_ss <- ret_cf[, .(mse = mean(mse)), by = .(h, cluster)] 123 | bm <- ret_cf_ss[cluster == "bm", .(h, bm=mse)] 124 | ret_cf_ss <- bm[ret_cf_ss[cluster != "bm"], on = "h"] 125 | ret_cf_ss[, fi := mse-bm] 126 | 127 | if (TRUE) { 128 | (output$fi_returns <- ret_cf_ss %>% 129 | mutate( 130 | cluster = cluster %>% str_replace_all("_", " "), 131 | cluster = cluster %>% str_replace_all("short term", "short-term"), 132 | cluster = cluster %>% str_to_title() 133 | ) %>% 134 | group_by(h) %>% 135 | mutate(fi = fi / max(fi)) %>% 136 | group_by(cluster) %>% 137 | mutate(sort_var = fi[h==1]) %>% 138 | ggplot(aes(reorder(cluster, sort_var), fi, fill=factor(h))) + 139 | geom_col(position = "dodge") + 140 | # Scale_y_continuous with percentages 141 | scale_y_continuous(labels = scales::label_percent(), breaks = seq(-0.2, 1, 0.2)) + 142 | geom_hline(yintercept = 1, linetype = "dashed", color = "black") + 143 | labs(y = "Drop in MSE from permuting theme features (% of max)", fill = "Horizon") + # Otherwise difficult to visualize due to small x-axis values 144 | theme( 145 | axis.title.x = element_blank() 146 | )) 147 | } else { 148 | (output$fi_returns <- ret_cf_ss %>% 149 | mutate( 150 | title = paste0("Horizon: ", h, " month") |> factor(levels = paste0("Horizon: ", 1:12, " month")), 151 | cluster = cluster %>% str_replace_all("_", " "), 152 | cluster = cluster %>% str_replace_all("short term", "short-term"), 153 | cluster = cluster %>% str_to_title() 154 | ) %>% 155 | group_by(h) %>% 156 | mutate(fi = fi / max(fi) * 100) %>% 157 | group_by(cluster) %>% 158 | mutate(sort_var = fi[h==1]) %>% 159 | ggplot(aes(reorder(cluster, sort_var), fi)) + 160 | geom_col(fill = colours_theme[1]) + 161 | coord_flip() + 162 | facet_wrap(~title, scales = "free_x") + 163 | labs(y = "Drop in MSE from permuting theme features (% of max)") + # Otherwise difficult to visualize due to small x-axis values 164 | theme( 165 | axis.title.y = element_blank() 166 | )) 167 | } 168 | 169 | # Wierdly, ret_1_0 strongly predicts t+12? But note that it's with the opposite sign of t+1 (I know! 
It's the seasonality effect) 170 | melt(chars[!is.na(pred_ld1), c("id", "eom", paste0("pred_ld", 1:12), "ret_1_0", "ret_12_1", "be_me", "gp_at")], id.vars = c("id", "eom", paste0("pred_ld", 1:12)))[, lapply(.SD, function(x) cor(value, x)), .SDcols = paste0("pred_ld", 1:12), by = .(variable, eom)][, lapply(.SD, mean), by = variable] |> 171 | select(-eom) |> 172 | pivot_longer(-variable, names_to = "h", values_to = "cor") |> 173 | mutate(h = h |> str_remove("pred_ld") |> as.numeric()) |> 174 | ggplot(aes(h, cor, colour = variable)) + 175 | geom_hline(yintercept = 0) + 176 | geom_line() + 177 | geom_point() + 178 | theme( 179 | legend.position = "top" 180 | ) 181 | -------------------------------------------------------------------------------- /6 - Implementable efficient frontier.R: -------------------------------------------------------------------------------- 1 | # Generate HML/Markowitz for multiple volaility levels ----------------------------------- 2 | vol_range <- seq(0, 0.5, 0.01) 3 | 4 | # Factor-ML 5 | factor_base <- chars %>% factor_ml_implement(dates = dates_oos, n_pfs = settings$factor_ml$n_pfs, wealth = wealth, gam = pf_set$gamma_rel) 6 | factor_base_vol <- factor_base$pf[, sd(r)*sqrt(12)] 7 | factor_ef <- vol_range %>% map(.progress = T, function(vol_target) { 8 | scale <- vol_target / factor_base_vol # Target same vol as m2 9 | copy(factor_base$w)[, w := w*scale][, w_start:=w_start*scale] %>% pf_ts_fun(data = chars, wealth = wealth, gam = pf_set$gamma_rel) %>% mutate(vol_target = vol_target) 10 | }) %>% rbindlist() %>% mutate(type = "Factor-ML") 11 | factor_ss <- factor_ef[, .( 12 | n = .N, 13 | inv = mean(inv), 14 | to = mean(turnover), 15 | r = mean(r)*12, 16 | sd = sd(r)*sqrt(12), 17 | sr_gross = mean(r)/sd(r)*sqrt(12), 18 | tc = mean(tc)*12, 19 | r_tc = mean((r-tc))*12, 20 | sr = mean(r-tc)/sd(r)*sqrt(12), 21 | obj = (mean(r)-0.5*var(r)*pf_set$gamma_rel-mean(tc))*12 22 | ), by = .(vol_target)] 23 | 24 | # Markowitz-ML 25 | tpf_base <- chars |> tpf_implement(cov_list = barra_cov, wealth = wealth, dates = dates_oos, gam = pf_set$gamma_rel) 26 | tpf_base_vol <- tpf_base$pf[, sd(r)*sqrt(12)] 27 | tpf_ef <- vol_range %>% lapply(function(vol_target) { 28 | scale <- vol_target / tpf_base_vol # Target same vol as m2 29 | copy(tpf_base$w)[, w := w*scale][, w_start:=w_start*scale] %>% pf_ts_fun(data = chars, wealth = wealth, gam = pf_set$gamma_rel) %>% mutate(vol_target = vol_target) 30 | }) %>% rbindlist() %>% mutate(type = "Markowitz-ML") 31 | tpf_ss <- tpf_ef[, .( 32 | n = .N, 33 | inv = mean(inv), 34 | to = mean(turnover), 35 | r = mean(r)*12, 36 | sd = sd(r)*sqrt(12), 37 | sr_gross = mean(r)/sd(r)*sqrt(12), 38 | tc = mean(tc)*12, 39 | r_tc = mean((r-tc))*12, 40 | sr = mean(r-tc)/sd(r)*sqrt(12), 41 | obj = (mean(r)-0.5*var(r)*pf_set$gamma_rel-mean(tc))*12 42 | ), by = .(vol_target)] 43 | 44 | # Mean-variance efficient frontier of risky assets --------------------------------------------- 45 | u_vec <- c(seq(-0.5, 0.5, 0.05), 0.6, 0.75, 1, 2) / 12 46 | wealth_0 <- wealth_func(wealth_end = 0, end = settings$split$test_end, market = market, risk_free = risk_free) 47 | mv_risky_ef <- chars |> mv_risky_fun(cov_list=barra_cov, wealth=wealth_0, dates=dates_oos, gam=pf_set$gamma_rel, u_vec) 48 | mv_ss <- mv_risky_ef[, .( 49 | n = .N, 50 | inv = mean(inv), 51 | to = mean(turnover), 52 | r = mean(r)*12, 53 | sd = sd(r)*sqrt(12), 54 | sr_gross = mean(r)/sd(r)*sqrt(12), 55 | tc = mean(tc)*12, 56 | r_tc = mean((r-tc))*12, 57 | sr = mean(r-tc)/sd(r)*sqrt(12), 58 | obj = 
(mean(r)-0.5*var(r)*pf_set$gamma_rel-mean(tc))*12 59 | ), by = .(u)] 60 | 61 | # Get IEF portfolios --------------------------------------------------------------------------- 62 | ief_path <- "Data/Generated/Portfolios/IEF/" 63 | ief_pfs <- list.files(path = ief_path) |> map(.progress = T, function(x) { 64 | w <- x |> str_extract("(?<=WEALTH)[^_]+") |> as.numeric() 65 | g <- x |> str_extract("(?<=GAMMA)[^_]+") |> as.numeric() 66 | # Portfolios 67 | rbind( 68 | fread(paste0(ief_path, x, "/bms.csv")) |> mutate(eom_ret=as.Date(eom_ret)), 69 | readRDS(paste0(ief_path, x, "/static-ml.RDS"))$pf, 70 | readRDS(paste0(ief_path, x, "/portfolio-ml.RDS"))$pf, 71 | readRDS(paste0(ief_path, x, "/static-ml.RDS"))$hps[eom_ret %in% readRDS(paste0(ief_path, x, "/static-ml.RDS"))$pf$eom_ret & k==1 & g==0 & u==1, .(eom_ret=as.Date(eom_ret), inv, shorting, turnover, r, tc)][, type := "Static-ML"] 72 | ) |> mutate(wealth_end=w, gamma_rel=g) 73 | }) |> rbindlist() 74 | stopifnot(nrow(ief_pfs[, .N, by = .(wealth_end, gamma_rel, type, eom_ret)][N!=1])==0)# Check for duplicates 75 | 76 | ief_ss <- ief_pfs[, .( 77 | inv = mean(inv), 78 | to = mean(turnover), 79 | r = mean(r)*12, 80 | sd = sd(r)*sqrt(12), 81 | sr_gross = mean(r)/sd(r)*sqrt(12), 82 | tc = mean(tc)*12, 83 | r_tc = mean((r-tc))*12, 84 | sr = mean(r-tc)/sd(r)*sqrt(12), 85 | obj = (mean(r)-0.5*var(r)*gamma_rel-mean(tc))*12 86 | ), by = .(type, wealth_end, gamma_rel)] 87 | 88 | ef_ss <- ief_ss[type=="Portfolio-ML"] 89 | 90 | # Build inputs for IEF ----------------------------------------------------------------------- 91 | # Summary stats 92 | ef_all_ss <- rbind( 93 | ief_ss[type %in% c("Portfolio-ML", "Static-ML*", "Static-ML")], 94 | factor_ss |> mutate(gamma_rel = NA, type = "Factor-ML", wealth_end=pf_set$wealth), 95 | tpf_ss |> mutate(gamma_rel = NA, type = "Markowitz-ML", wealth_end=pf_set$wealth), 96 | fill=T 97 | ) 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | # Indifference curves ------- 106 | points <- ef_all_ss[ 107 | (type %in% c("Portfolio-ML") & wealth_end == pf_set$wealth & gamma_rel==pf_set$gamma_rel) | 108 | (type %in% c("Static-ML*") & wealth_end == pf_set$wealth & gamma_rel==pf_set$gamma_rel) | 109 | (type %in% c("Static-ML") & wealth_end == pf_set$wealth & gamma_rel==pf_set$gamma_rel) #| 110 | # (type %in% c("Factor-ML") & wealth_end == pf_set$wealth & vol_target==0.01) 111 | ] 112 | 113 | indifference_curves <- 1:nrow(points) %>% lapply(function(i) { 114 | u_target <- points[i, obj] 115 | sd_target <- points[i, sd] 116 | # vol_space <- c( 117 | # seq(from = floor(sd_target*100)/100, by = -0.01, length.out = 8), 118 | # sd_target, 119 | # seq(from = ceiling(sd_target*100)/100, by = 0.01, length.out = 8) 120 | # ) %>% sort() 121 | # vol_space <- pmax(vol_space, 0) %>% unique() 122 | seq(0, 0.4, 0.01) %>% lapply(function(v) { 123 | data.table(sd = v, r_tc = u_target + pf_set$gamma_rel/2*v^2) 124 | }) %>% rbindlist() %>% mutate(u = u_target) 125 | }) %>% rbindlist() 126 | 127 | # Figure 1A ---------------- 128 | tpf_slope <- ef_all_ss[type=="Markowitz-ML"]$sr_gross[2] # All the same (except first which is NAN) 129 | static_raw <- ef_all_ss[(type %in% c("Static-ML") & wealth_end == pf_set$wealth & gamma_rel==pf_set$gamma_rel)] 130 | # Plot settings 131 | ef_y_low <- -0.2 132 | tpf_col <- 10 133 | ef_xmax <- 0.35 134 | plot_txt_size <- 2.5 135 | sta_1l_x <- 0.265 136 | sta_1l_y <- static_raw$r_tc 137 | 138 | 139 | 140 | ef_all_ss <- ef_all_ss |> 141 | mutate(type = type %>% factor(levels = c(main_types, "Markowitz-ML (gross)"))) 142 | 143 | 
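# The indifference curves above come from the certainty-equivalent objective used throughout,
# obj = E[r] - E[tc] - (gamma/2)*Var[r] (annualized with *12 in the summary tables): holding
# utility fixed at u_target and solving for net-of-cost returns gives
# r_tc = u_target + (gamma/2)*sd^2, which is what `indifference_curves` tabulates on a
# volatility grid. Minimal sanity check of that identity (illustrative only; it uses only
# objects defined above and is not part of the original pipeline):
if (FALSE) {
  ic_dev <- indifference_curves[, abs(r_tc - (u + pf_set$gamma_rel/2*sd^2))]
  stopifnot(max(ic_dev) < 1e-12)
}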
(output$ef_all_wo_ic <- ef_all_ss %>% 144 | filter(type != "Static-ML") |> 145 | filter(wealth_end == pf_set$wealth) %>% 146 | rbind(expand.grid(type = c("Portfolio-ML", "Static-ML*"), r_tc=0, sd=0), fill=T) %>% 147 | rbind(data.table(type = "Markowitz-ML (gross)"), fill=T) %>% 148 | ggplot() + 149 | annotate("text", x=0.1, y=0.31, label = "Without TC", size=plot_txt_size, hjust=0.5) + 150 | annotate("segment", x=0.1, xend = 0.12, y=0.3, yend=0.2, arrow = arrow(length = unit(.1,"cm")), size=0.1, alpha=0.75) + 151 | annotate("segment", x=0.1, xend = 0.1, y=0.3, yend=0.22, arrow = arrow(length = unit(.1,"cm")), size=0.1, alpha=0.75) + 152 | geom_abline(intercept = 0, slope = tpf_slope, alpha = 1, colour = colours_theme[tpf_col]) + 153 | geom_line(aes(sd, r_tc, colour = type, group = type)) + 154 | coord_cartesian(xlim = c(0, ef_xmax), ylim = c(ef_y_low, 0.54), expand = F) + 155 | geom_point(aes(sd, r_tc, colour = type, shape = factor(gamma_rel), group = type), size=2) + 156 | geom_path(data = mv_ss[r_tc <= 0.7], aes(sd, r_tc), linetype = "dotted", colour = colours_theme[tpf_col]) + # colours_theme[match("Markowitz-ML", main_types)] 157 | scale_colour_manual(values = colours_theme[c(match(sort(unique(ef_all_ss$type)), main_types), tpf_col)]) + 158 | # annotate(geom = "point", y = static_raw$r_tc, x = static_raw$sd, shape = "square", size = 2) + 159 | # annotate("text", x=static_raw$sd-0.01, y=sta_1l_y, label = "Static-ML (one layer)", size=plot_txt_size, hjust=1, vjust=0.5) + 160 | # annotate("segment", x=static_raw$sd-0.008, xend = static_raw$sd-0.005, y=sta_1l_y, yend=sta_1l_y, arrow = arrow(length = unit(.1,"cm")), size=0.1, alpha=0.75) + 161 | annotate(geom = "point", y = static_raw$r_tc, x = static_raw$sd, shape = "square", size = 2) + 162 | annotate("text", x=static_raw$sd, y=0.05, label = "Static-ML (one layer)", size=plot_txt_size, hjust=0.5, vjust=0.5) + 163 | annotate("segment", x=static_raw$sd, xend = static_raw$sd, y=0.07, yend=static_raw$r_tc-0.02, arrow = arrow(length = unit(.1,"cm")), size=0.1, alpha=0.75) + 164 | labs(x = "Volatility", y = "Excess returns (net of trading cost)", shape = "Relative Risk Aversion:") + 165 | guides( 166 | colour = guide_legend(order = 1, override.aes = list(size=1, shape=NA)), 167 | shape = "none" #guide_legend(order = 2, label.hjust = 1) 168 | ) + 169 | theme( 170 | legend.title = element_blank(), 171 | legend.position = c(0.5, 0.97), 172 | legend.direction="horizontal", 173 | legend.text = element_text(size=8) 174 | ) + 175 | scale_x_continuous(expand = c(0, 0.01))) 176 | output$ef_all <- copy(output$ef_all_wo_ic) 177 | for (u_tgt in points$obj) { 178 | output$ef_all <- output$ef_all + geom_path(data = indifference_curves[u==u_tgt], aes(sd, r_tc), linetype = "dashed", alpha=0.40) 179 | } 180 | output$ef_all 181 | 182 | # Figure 1B -------- 183 | comb_data <- ef_ss %>% 184 | rbind(expand.grid(wealth_end=settings$ef$wealth, sd=0, r_tc=0), fill=T) %>% 185 | mutate( 186 | type = "tpf", 187 | wealth_end = if_else(wealth_end==1, 1, wealth_end) 188 | ) %>% 189 | rbind( 190 | mv_ss %>% mutate(wealth_end=1, type = "mv"), # %>% mutate(wealth_end=0, type = "mv", r_tc = r_tc+0.04)If we want risky frontier to be tangent for illustrative purposes 191 | # pf_summary %>% 192 | # filter(type %in% c("Portfolio Sort", "Markowitz")) %>% 193 | # mutate(wealth_end = pf_set$wealth, gamma_rel = pf_set$gamma_rel, r_tc = -Inf, sd = if_else(type == "Markowitz", Inf, sd), type = "bm"), 194 | fill=T 195 | ) %>% 196 | mutate(gamma_rel = gamma_rel %>% factor(levels = 
settings$ef$gamma_rel)) %>% 197 | filter(wealth_end %in% c(1, 1e9, 1e10, 1e11)) 198 | 199 | (output$ef <- comb_data %>% 200 | ggplot(aes(sd, r_tc, colour = factor(wealth_end), group = factor(wealth_end))) + 201 | geom_blank() + 202 | # geom_point(data = comb_data %>% filter(type == "bm"), aes(shape = gamma_rel), size=3) + 203 | geom_line(data = comb_data %>% filter(type == "tpf")) + 204 | geom_point(data = comb_data %>% filter(type == "tpf"), aes(shape = gamma_rel), size=2) + 205 | geom_path(data = comb_data %>% filter(type == "mv"), linetype = "dotted") + 206 | coord_cartesian(xlim = c(0, ef_xmax), ylim = c(0, 0.75), expand=F) + 207 | labs(x = "Volatility", y = "Excess returns (net of trading cost)", shape = "Relative Risk Aversion:", 208 | colour = "Wealth by 2020:") + 209 | guides( 210 | colour = guide_legend(order = 1, override.aes = list(size=1, shape=NA)), 211 | shape = "none" #guide_legend(order = 2, label.hjust = 1) 212 | ) + 213 | theme( 214 | legend.position = c(0.5, 0.97), 215 | legend.direction="horizontal", 216 | legend.title = element_text(size=8), 217 | legend.text = element_text(size=8) 218 | )) 219 | 220 | -------------------------------------------------------------------------------- /6 - Performance across size distribution.R: -------------------------------------------------------------------------------- 1 | # Download files -------------------------- 2 | size_cuts <- c("all", "low80_high100", "low60_high80", "low40_high60", "low20_high40", "low00_high20") 3 | size_path <- "Data/Generated/Portfolios/Size/" 4 | size_folders <- list.files(size_path) 5 | pf_by_size <- size_cuts |> map(function(x) { 6 | folder <- size_folders[str_detect(size_folders, x)] 7 | rbind( 8 | fread(paste0(size_path, folder, "/bms.csv")) |> mutate(eom_ret=as.Date(eom_ret)), 9 | readRDS(paste0(size_path, folder, "/static-ml.RDS"))$pf, 10 | readRDS(paste0(size_path, folder, "/multiperiod-ml.RDS"))$pf, 11 | readRDS(paste0(size_path, folder, "/portfolio-ml.RDS"))$pf 12 | ) |> mutate(size = x) 13 | }) |> 14 | rbindlist() |> 15 | mutate( 16 | size = case_when( 17 | size == "all" ~ "All", 18 | size == "low80_high100" ~ "Largest (80-100)", 19 | size == "low60_high80" ~ "Large (60-80)", 20 | size == "low40_high60" ~ "Mid (40-60)", 21 | size == "low20_high40" ~ "Small (20-40)", 22 | size == "low00_high20" ~ "Smallest (00-20)", 23 | ) |> factor(levels = c("All", "Largest (80-100)", "Large (60-80)", "Mid (40-60)", "Small (20-40)", "Smallest (00-20)")), 24 | type = if_else(type=="Rank-Weighted", "Rank-ML", type) 25 | ) 26 | 27 | # Performance ----------------------------- 28 | pf_by_size[, type := type %>% factor(levels = pf_order)] 29 | pf_by_size %>% setorder(size, type, eom_ret) 30 | pf_by_size[, e_var_adj := (r-mean(r))^2, by=.(size, type)] 31 | pf_by_size[, utility_t := r-tc-0.5*e_var_adj*pf_set$gamma_rel] 32 | pf_summary_size <- pf_by_size[, .( 33 | n = .N, 34 | inv = mean(inv), 35 | shorting = mean(shorting), 36 | turnover_notional = mean(turnover), 37 | r = mean(r)*12, 38 | sd = sd(r)*sqrt(12), 39 | sr_gross = mean(r)/sd(r)*sqrt(12), 40 | tc = mean(tc)*12, 41 | r_tc = mean((r-tc))*12, 42 | sr = mean(r-tc)/sd(r)*sqrt(12), 43 | obj = (mean(r)-0.5*var(r)*pf_set$gamma_rel-mean(tc))*12#, 44 | # obj_exp = (mean(e_r)-0.5*mean(e_var)*pf_set$gamma_rel-mean(tc))*12 45 | ), by = .(size, type)][order(size, type)] 46 | 47 | # Plots ----------------------------------- 48 | (output$by_size <- pf_summary_size |> 49 | mutate(obj = if_else(obj < -1000, -1000, obj)) |> # Helps iron out a bug in geom_col that makes 
Markowitz look weirds 50 | ggplot(aes(type, obj)) + 51 | geom_col(fill=colours_theme[1], position="dodge") + 52 | geom_hline(yintercept = 0) + 53 | facet_wrap(~size, ncol=3) + 54 | coord_cartesian(ylim = c(-0.05, NA)) + 55 | labs(y = "Realized Utility") + 56 | theme( 57 | axis.title.x = element_blank(), 58 | axis.text.x = element_text(angle = 45, hjust = 1), 59 | legend.position = "none" 60 | )) 61 | -------------------------------------------------------------------------------- /6 - RF example.R: -------------------------------------------------------------------------------- 1 | source("main.R") 2 | # Settings --- 3 | f <- function(x) { 4 | 10*x+0.5*x^2-0.01*x^3-1.08^x+100*sin(x) 5 | } 6 | set.seed(1) 7 | # Data 8 | n <- 1000 9 | x <- runif(n, -30, 70) 10 | y_true <- f(x) 11 | y <- y_true + rnorm(n, 0, 150) 12 | 13 | # True function and OLS approximation ---- 14 | tibble( 15 | x=seq(-30, 70, by = 1), 16 | y_true=f(x), 17 | y = y_true+rnorm(101, 0, 150) 18 | ) %>% 19 | ggplot(aes(x=x, y=y)) + 20 | geom_point() + 21 | geom_line(aes(y=y_true)) + 22 | geom_smooth(method = "lm", se=F) 23 | 24 | # Various RFF approximations ------------ 25 | set.seed(1) 26 | ps <- seq(2^1, 2^9, by = 2) 27 | lambdas <- c(exp(c(0:10))) 28 | # Train/test split 29 | train_idx <- sample(1:n, size = n/2, replace=F) 30 | # Predict multiple lambdas and collect in nice tibble 31 | pred_tidy <- function(fit, x_act, x, y, y_true, lambdas) { 32 | preds <- predict(fit, newx = x, s = lambdas) 33 | colnames(preds) <- log(lambdas) 34 | preds |> 35 | as_tibble() |> 36 | mutate(y = y, x = x_act, y_true=y_true) |> 37 | pivot_longer(-c(x,y, y_true), names_to = "log_lambda", values_to = "pred") 38 | } 39 | # Simulate 40 | log_gs <- -3:2 41 | data <- exp(log_gs) |> map(function(g) { 42 | rff_x <- as.matrix(x) %>% rff(p = max(ps), g = g) 43 | data <- ps %>% lapply(function(p) { 44 | rff_x_new <- p^(-0.5)*cbind(rff_x$X_cos[, 1:(p/2)], rff_x$X_sin[, 1:(p/2)]) 45 | # Fit on training data 46 | x_train <- rff_x_new[train_idx, ] 47 | y_train <- y[train_idx] 48 | fit <- glmnet(x=x_train, y=y_train, alpha=0, lambda = lambdas) 49 | # Predict on train data 50 | y_true_train <- y_true[train_idx] 51 | train_op <- fit |> 52 | pred_tidy(x = x_train, x_act = x[train_idx], y = y_train, y_true = y_true_train, lambdas = lambdas) |> 53 | mutate(split = "train") 54 | # Predict on test data (for R2) 55 | x_test <- rff_x_new[-train_idx, ] 56 | y_test <- y[-train_idx] 57 | y_true_test <- y_true[-train_idx] 58 | test_op <- fit |> pred_tidy(x = x_test, x_act = x[-train_idx], y = y_test, y_true = y_true_test, lambdas = lambdas) |> 59 | mutate(split = "test") 60 | # Output 61 | rbind(train_op, test_op) |> 62 | mutate(p = p) 63 | }) %>% bind_rows() |> mutate(g) 64 | }) |> 65 | bind_rows() |> 66 | mutate( 67 | log_lambda = log_lambda |> factor(levels = log(lambdas)), 68 | g_name = paste0("log(g): ", log(g)) |> factor(levels = paste0("log(g): ", log_gs)) 69 | ) 70 | 71 | # Figure: See fit ------ 72 | # By ridgeless 73 | (output$rff_specific0 <- data |> 74 | filter(split=="train") |> 75 | filter(log_lambda %in% c(0) & g %in% c(1), p %in% 2^(1:6)) |> 76 | pivot_longer(c(y_true, pred)) |> 77 | # filter(log_lambda %in% c(2:5) & g %in% c(1)) |> 78 | mutate(p = paste0("P=", p) %>% factor(levels = paste0("P=", ps))) |> 79 | mutate(y = if_else(name == "y_true", y, NA_real_)) |> 80 | mutate(name = case_when( 81 | name == "pred" ~ "RF prediction", 82 | name == "y_true" ~ "True function" 83 | )) |> 84 | ggplot(aes(x, y = value, colour = name)) + 85 | geom_point(aes(y=y), 
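# Background on the construction used in the simulation above (a sketch of the usual random
# Fourier features recipe; the exact sampling scheme lives in the repo's rff() helper,
# presumably in "0 - Return prediction functions.R", and is assumed rather than shown here):
# rff(p, g) draws random frequency vectors whose scale is governed by g and returns cosine
# and sine transforms of the projected inputs. Stacking the two blocks and scaling by
# p^(-0.5), as done above, yields p random features whose inner products approximate a
# smooth (Gaussian-type) kernel, so ridge regression on them (glmnet with alpha = 0)
# approximates kernel ridge regression: p controls the quality of the approximation and
# lambda the amount of shrinkage.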
colour = "grey", alpha=0.5) + 86 | geom_line() + 87 | # geom_line(aes(y=y_true), colour="black") + 88 | # geom_line(aes(y=pred)) + 89 | facet_wrap(~p) + 90 | labs(y="Value") + 91 | theme(legend.position = "top", legend.title = element_blank())) 92 | 93 | # By lambda 94 | (output$rff_specific1 <- data |> 95 | filter(split=="train") |> 96 | filter(log_lambda %in% c(0, 6, 10) & g %in% c(1), p %in% 2^(1:6)) |> 97 | # filter(log_lambda %in% c(2:5) & g %in% c(1)) |> 98 | mutate(p = paste0("P=", p) %>% factor(levels = paste0("P=", ps))) |> 99 | ggplot(aes(x, y = pred, colour = factor(log_lambda))) + 100 | geom_point(aes(y=y), colour = "grey", alpha=0.5) + 101 | geom_line(aes(y=y_true), colour="black") + 102 | geom_line(aes(y=pred)) + 103 | facet_wrap(~p) + 104 | labs(colour="Log(lambda)")) 105 | # By g 106 | (output$rff_specific2 <- data |> 107 | filter(split=="train") |> 108 | filter(log_lambda %in% c(0), p %in% 2^(1:6)) |> 109 | mutate(p = paste0("P=", p) %>% factor(levels = paste0("P=", ps))) |> 110 | ggplot(aes(x, y = pred, colour = factor(log(g)))) + 111 | geom_point(aes(y=y), colour = "grey", alpha=0.5) + 112 | geom_line(aes(y=y_true), colour="black") + 113 | geom_line(aes(y=pred)) + 114 | facet_wrap(~p) + 115 | labs(colour="Log(g)")) 116 | 117 | # Figure: R2 by g, p, and lambda ---- 118 | (output$rff_overview <- data |> 119 | filter(split=="test") |> 120 | group_by(g, p, log_lambda) |> 121 | summarise( 122 | r2 = 1 - sum((y - pred)^2) / sum((y - mean(y))^2) 123 | ) |> 124 | mutate(g = paste0("log(g): ", log(g)) |> factor(levels = paste0("log(g): ", log_gs))) |> 125 | ggplot(aes(x=p, y=r2, colour=factor(log_lambda))) + 126 | geom_point(size=1) + 127 | geom_line() + 128 | facet_wrap(~g) + 129 | labs(x = "Number of random features, p", colour = "Log(lambda)", y = "Out-of-sample R2")) 130 | 131 | 132 | 133 | 134 | # OLD SCRIBBLES ------------------------- 135 | if (FALSE) { 136 | # Simple example ------------------------------------------------- 137 | x <- seq(0, 1, 0.01) 138 | y <- case_when( 139 | x <= 0.25 ~ -0.005, 140 | x <= 0.50 ~ 0, 141 | x <= 0.75 ~ 0.005, 142 | x > 0.75 ~ 0.02, 143 | ) 144 | y <- case_when( 145 | x <= 0.75 ~ 0.005, 146 | x > 0.75 ~ 0.005 + (x-0.75)*0.1 147 | ) 148 | y <- -0.01+x*0.02 149 | y <- -0.01+0.01+x + 1*x^3 150 | y_true <- -0.01+20^x 151 | y_true <- -0.005+0.001*x+0.01*(1 / (1 + exp(-10*(x-0.1)))) 152 | # x <- seq(-30, 70, by = 1) 153 | # y_true <- 10*x+0.5*x^2-0.01*x^3-1.08^x#+100*sin(x) 154 | # y <- y_true + rnorm(length(y_true), 0, 150) 155 | 156 | # True function and OLS approximation 157 | y_true <- -10-100*x+300*x^2-1500*(x-0.5)^3+50^x 158 | y <- y_true + rnorm(length(y_true), 0, 10) 159 | tibble( 160 | x=x, 161 | y=y 162 | ) %>% 163 | ggplot(aes(x=x, y=y)) + 164 | geom_point() + 165 | geom_smooth(method = "lm", se=F) 166 | 167 | # RFF approximation 168 | set.seed(settings$seed_no) 169 | ps <- 1:6 170 | 171 | data <- c(settings$rff$g_vec[1], 1) |> map(function(g) { 172 | rff_x <- as.matrix(x) %>% rff(p = 2^max(ps), g = g) 173 | data <- ps %>% lapply(function(i) { 174 | p <- 2^i 175 | rff_x_new <- p^(-0.5)*cbind(rff_x$X_cos[, 1:(2^i/2)], rff_x$X_sin[, 1:(2^i/2)]) 176 | rff_reg <- lm(y~-1+rff_x_new) 177 | tibble( 178 | p = p, 179 | y_true = y_true, 180 | y = y, 181 | x = x, 182 | preds = drop(predict(rff_reg)), 183 | ols_pred = drop(predict(lm(y~x))) 184 | ) 185 | }) %>% bind_rows() |> mutate(g) 186 | }) |> bind_rows() 187 | data %>% 188 | pivot_longer(-c(g, p, x)) %>% 189 | mutate( 190 | p = paste0("P=", p) %>% factor(levels = paste0("P=", 2^ps)), 191 | 
name = case_when( 192 | name == "preds" ~ "RFF prediction", 193 | name == "y_true" ~ "Truth", 194 | name == "y" ~ "y", 195 | name == "ols_pred" ~ "OLS prediction" 196 | ) 197 | ) %>% 198 | filter(name != "OLS prediction") |> 199 | ggplot(aes(x, value, colour=name, linetype=factor(g))) + 200 | # geom_point(alpha=0.5) + 201 | geom_line() + 202 | facet_wrap(~p, scales = "free_y") + 203 | theme( 204 | legend.position = "top", 205 | legend.title = element_blank() 206 | ) 207 | 208 | data |> 209 | mutate(p = paste0("P=", p) %>% factor(levels = paste0("P=", 2^ps))) |> 210 | ggplot(aes(x, y = preds, colour = factor(round(g,2)))) + 211 | geom_point(aes(y=y), colour = "grey") + 212 | geom_line(aes(y=y_true), colour="black") + 213 | geom_line(aes(y=preds)) + 214 | facet_wrap(~p) + 215 | labs(colour="RFF with g") 216 | 217 | 218 | 219 | 220 | # Sequential plots 221 | rff_x_2 <- 2^(-0.5)*cbind(rff_x$X_cos[, 1], rff_x$X_sin[, 1]) 222 | rff_reg2 <- lm(y~-1+rff_x_2) 223 | rff_x_4 <- 4^(-0.5)*cbind(rff_x$X_cos[, 1:2], rff_x$X_sin[, 1:2]) 224 | rff_reg4 <- lm(y~-1+rff_x_4) 225 | 226 | 227 | 228 | rbind( 229 | tibble(x = x, y = rff_reg2$coefficients[1]*rff_x_2[, 1], name = "cos1", p = 2), 230 | tibble(x = x, y = rff_reg2$coefficients[2]*rff_x_2[, 2], name = "sin1", p = 2), 231 | tibble(x = x, y = rff_reg4$coefficients[1]*rff_x_4[, 1], name = "cos1", p = 4), 232 | tibble(x = x, y = rff_reg4$coefficients[2]*rff_x_4[, 2], name = "cos2", p = 4), 233 | tibble(x = x, y = rff_reg4$coefficients[3]*rff_x_4[, 3], name = "sin1", p = 4), 234 | tibble(x = x, y = rff_reg4$coefficients[4]*rff_x_4[, 4], name = "sin2", p = 4) 235 | ) %>% 236 | ggplot(aes(x, y, colour=name)) + 237 | geom_line() + 238 | facet_wrap(~p, scales = "free_y") 239 | 240 | rbind( 241 | tibble(x = x, y = rff_reg2$coefficients[1]*rff_x_2[, 1], name = "cos1", p = 2), 242 | tibble(x = x, y = rff_reg2$coefficients[2]*rff_x_2[, 2], name = "sin1", p = 2), 243 | tibble(x = x, y = rff_reg4$coefficients[1]*rff_x_4[, 1], name = "cos1", p = 4), 244 | tibble(x = x, y = rff_reg4$coefficients[2]*rff_x_4[, 2], name = "cos2", p = 4), 245 | tibble(x = x, y = rff_reg4$coefficients[3]*rff_x_4[, 3], name = "sin1", p = 4), 246 | tibble(x = x, y = rff_reg4$coefficients[4]*rff_x_4[, 4], name = "sin2", p = 4) 247 | ) %>% 248 | group_by(p, x) %>% 249 | summarise(pred_y = sum(y)) %>% 250 | ggplot(aes(x, pred_y, colour=factor(p))) + 251 | geom_line() 252 | 253 | test <- cbind(rff_x$X_cos, rff_x$X_sin) 254 | colnames(test) <- c(paste0("cos", 1:ncol(rff_x$X_cos)), paste0("sin", 1:ncol(rff_x$X_sin))) 255 | test <- cbind(x=x, test) 256 | 257 | as_tibble(test) %>% 258 | pivot_longer(-x) %>% 259 | mutate( 260 | type = substr(name, 1, 3), 261 | no = as.integer(substr(name, 4, 4)) 262 | ) %>% 263 | ggplot(aes(x, value, colour=factor(no))) + 264 | geom_line() + 265 | facet_wrap(~type, scales="free_y") 266 | 267 | tibble(x = x, cos_x = cos(-x), sin_x = sin(-x)) %>% 268 | pivot_longer(-x) %>% 269 | ggplot(aes(x, value,colour=name)) + 270 | geom_line() 271 | 272 | 273 | # One independent variable --------------------------------------- 274 | # True model ---------------------- 275 | x <- seq(-30, 70, by = 1) 276 | y <- 10*x+0.5*x^2-0.01*x^3-1.08^x#+100*sin(x) 277 | 278 | tibble(x=x, y=y) %>% 279 | ggplot(aes(x=x, y=y)) + 280 | geom_point() + 281 | geom_line() + 282 | geom_smooth(method = "lm", se=F) + 283 | labs(title = "y=10x+0.5x^2-0.01x^3-1.08^x+100*sin(x)") 284 | 285 | # RFF approximation -------------- 286 | set.seed(settings$seed_no) 287 | ps <- 1:6 288 | g_vec = exp(-4) 289 | data <- 
ps %>% lapply(function(p) { 290 | rff_x <- as.matrix(x) %>% rff(p = 2^p, g = 1) 291 | rff_x <- cbind(rff_x$X_cos, rff_x$X_sin) 292 | rff_reg <- lm(y~rff_x) 293 | tibble( 294 | p = p, 295 | y = y, 296 | x = x, 297 | preds = drop(predict(rff_reg)), 298 | ols_pred = drop(predict(lm(y~x))) 299 | ) 300 | }) %>% bind_rows() 301 | 302 | data %>% 303 | pivot_longer(-c(p, x)) %>% 304 | mutate( 305 | p = paste0("P=", 2^p) %>% factor(levels = paste0("P=", 2^ps)), 306 | name = case_when( 307 | name == "preds" ~ "RFF prediction", 308 | name == "y" ~ "Truth", 309 | name == "ols_pred" ~ "OLS prediction" 310 | ) 311 | ) %>% 312 | ggplot(aes(x, value, colour=name)) + 313 | # geom_point(alpha=0.5) + 314 | geom_line() + 315 | facet_wrap(~p, scales = "free_y") + 316 | theme( 317 | legend.position = "top", 318 | legend.title = element_blank() 319 | ) + 320 | labs(title = "y=10x+0.5x^2-0.01x^3-1.08^x+100*sin(x)") 321 | 322 | # Two variables ----------------------------- 323 | # True model ---------------------- 324 | x <- seq(-30, 70, by = 1) 325 | z <- c(rep(-1, length(x)), rep(1, length(x))) 326 | x <- rep(x, 2) 327 | y <- z*(10*x+0.5*x^2-0.01*x^3-1.08^x)#+100*sin(x)) 328 | X <- cbind(z, x) 329 | 330 | tibble(x=x, y=y, z=z) %>% 331 | ggplot(aes(x=x, y=y, colour = factor(z))) + 332 | geom_point() + 333 | geom_line() + 334 | geom_smooth(method = "lm", se=F) + 335 | labs(title = "y=10x+0.5x^2-0.01x^3-1.08^x+100*sin(x)") 336 | 337 | # RFF approximation -------------- 338 | set.seed(settings$seed_no) 339 | ps <- 1:7 340 | rff_x <- as.matrix(X) %>% rff(p = 2^max(ps), g = 1) 341 | data <- ps %>% lapply(function(i) { 342 | rff_x_new <- cbind(rff_x$X_cos[, 1:(2^i/2)], rff_x$X_sin[, 1:(2^i/2)]) 343 | tibble( 344 | p = 2^i, 345 | y = y, 346 | z = z, 347 | x = x, 348 | preds = drop(predict(lm(y~rff_x_new))), 349 | ols_pred = drop(predict(lm(y~X))) 350 | ) 351 | }) %>% bind_rows() 352 | 353 | (output$rff <- data %>% 354 | filter(p %in% 2^c(1, 5, 6)) %>% 355 | pivot_longer(-c(p, x, z)) %>% 356 | mutate( 357 | p = paste0("P=", p) %>% factor(levels = paste0("P=", 2^ps)), 358 | z = paste0("Z=", z), 359 | name = case_when( 360 | name == "preds" ~ "RF prediction", 361 | name == "y" ~ "Truth", 362 | name == "ols_pred" ~ "OLS prediction" 363 | ) 364 | ) %>% 365 | ggplot(aes(x, value, colour=name)) + 366 | # geom_point(alpha=0.5) + 367 | geom_line() + 368 | facet_wrap(factor(z)~p, ncol=3) + 369 | theme( 370 | legend.position = "top", 371 | legend.title = element_blank() 372 | ) + 373 | labs(x = "X", y = "Y"))#+ 374 | # labs(title = "y=z[10x+0.5x^2-0.01x^3-1.08^x+100*sin(x)]") 375 | 376 | # data %>% 377 | # filter(p %in% 2^(4:6)) %>% 378 | # pivot_wider(names_from = p, values_from = preds, names_prefix = "RFF") %>% 379 | # pivot_longer(-c(z, x)) %>% 380 | # ggplot(aes(x, value, colour=name)) + 381 | # geom_line() + 382 | # facet_wrap(~z) 383 | 384 | # Check RFF variation -------- 385 | x <- seq(0, 1, 0.01) 386 | rff_x <- as.matrix(x) %>% rff(p = 2^4, g = exp(4)) 387 | cbind(rff_x$X_cos, rff_x$X_sin) |> 388 | as_tibble() |> 389 | mutate(x = x) |> 390 | pivot_longer(-x) |> 391 | ggplot(aes(x, value, colour=name)) + 392 | geom_point() + 393 | facet_wrap(~name, scales = "free_y") 394 | 395 | x1 <- runif(n=115) 396 | sin(x1 %*% rnorm(n=115, mean=0, sd=exp(-4))) 397 | 398 | 399 | 400 | } 401 | -------------------------------------------------------------------------------- /6 - Short selling fees.R: -------------------------------------------------------------------------------- 1 | # Load shorting data 2 | short_fees <- 
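# The Markit securities-lending file loaded below reports, per stock-date, an indicative
# borrow fee and a dcbs bucket (Markit's Daily Cost of Borrow Score, 1 = cheapest to borrow
# through 10 = hardest); missing fees are later filled with the average fee of the stock's
# dcbs bucket.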
fread("Data/short_fees.csv") 3 | short_fees <- short_fees[!is.na(dcbs) & record_type==1 & market_area=="USA Equity" & !is.na(permno)] 4 | 5 | # Expand shorting data 6 | if (FALSE) { 7 | # Check that shorting fees are somewhat stable over time and DCBS group 8 | short_fees[!is.na(indicative_fee), .(mean=mean(indicative_fee), median=median(indicative_fee)), by = .(date, dcbs)] %>% 9 | pivot_longer(-c(date, dcbs)) %>% 10 | ggplot(aes(date, value, colour=name)) + 11 | geom_point(size=0.5, alpha = 0.5) + 12 | # geom_line() + 13 | facet_wrap(~factor(dcbs), scales = "free_y") # Declining trend is natural as Markit adds more stocks 14 | } 15 | # For stocks in the data -------- 16 | short_fees[, sfee := if_else(!is.na(indicative_fee), indicative_fee, mean(indicative_fee, na.rm=T)), by = dcbs] 17 | 18 | # Take the last observation in a month for each stock 19 | short_fees[, eom := ceiling_date(date, unit="m")-1] 20 | short_fees[, max_date := max(date), by = .(permno, eom)] 21 | short_fees <- short_fees[date==max_date, .(permno, eom, sfee, dcbs)] 22 | 23 | # Remove duplicates # Very rare: short_fees[, .N, by = .(permno, eom)][, table(N)] 24 | short_fees[, n := .N, by = .(permno, eom)] 25 | short_fees <- short_fees[n == 1][, n := NULL] 26 | 27 | # Compute shorting fees for stocks not in the sample --------------------- 28 | # Analysis 29 | if (FALSE) { 30 | short_fees2 <- short_fees[, .(id=permno, eom, sfee, dcbs)][chars[, .(id, eom, market_equity, rvol_252d, dolvol_126d)], on = .(id, eom)] 31 | # How many are non-missing? Very few 32 | short_fees2[, .(non_miss=mean(!is.na(sfee))), by = eom][non_miss != 0] %>% ggplot(aes(eom, non_miss)) + geom_point() 33 | # What is the typical dcbs group? 96% of the sample are in the "easiest to borrow" group 34 | short_fees2[!is.na(dcbs), table(dcbs)/.N] 35 | # Surprisingly difficult to predict 36 | felm(sfee~market_equity | eom | 0 | eom+id, data = short_fees2) %>% summary() 37 | felm(sfee~market_equity+rvol_252d | eom | 0 | eom+id, data = short_fees2) %>% summary() 38 | felm(sfee~market_equity+rvol_252d+dolvol_126d | eom | 0 | eom+id, data = short_fees2) %>% summary() 39 | # Maybe it's caused by outliers? It helps, but not a lot 40 | short_fees2[!is.na(sfee) & eom==max(eom)] %>% ggplot(aes(market_equity, sfee)) + geom_point() 41 | felm(sfee~market_equity | eom | 0 | eom+id, data = short_fees2[sfee <= quantile(short_fees2$sfee, probs=0.99, na.rm=T)]) %>% summary() 42 | felm(sfee~market_equity+rvol_252d | eom | 0 | eom+id, data = short_fees2[sfee <= quantile(short_fees2$sfee, probs=0.99, na.rm=T)]) %>% summary() 43 | felm(sfee~market_equity+rvol_252d+dolvol_126d | eom | 0 | eom+id, data = short_fees2[sfee <= quantile(short_fees2$sfee, probs=0.99, na.rm=T)]) %>% summary() 44 | # Maybe impute by median borrow fee for stocks in sample? 
45 | short_fees2[!is.na(sfee), .(n = .N, mean=mean(sfee), median=median(sfee))] 46 | short_fees2[!is.na(sfee), .(n = .N, mean=mean(sfee), median=median(sfee)), by = dcbs][, n := round(n / sum(n), 2)][order(dcbs)] 47 | } 48 | # Impute outside of sample using median 49 | short_fees <- short_fees[, .(id=permno, eom, sfee)][chars[, .(id, eom)], on = .(id, eom)] 50 | short_fees[, sfee := if_else(!is.na(sfee), sfee, median(sfee, na.rm=T))] 51 | 52 | # Incorporate shorting cost of each method ------------------------------------- 53 | list_short <- list( 54 | "Multiperiod-ML*" = mp$w, 55 | "Portfolio-ML" = pfml$w, 56 | "Static-ML*" = static$w, 57 | "Markowitz-ML" = tpf$w, 58 | "Factor-ML" = factor_ml$w, 59 | "1/N" = ew$w, 60 | "Market" = mkt$w, 61 | "Rank-ML" = rw$w, 62 | "Minimum Variance" = mv$w 63 | ) 64 | 65 | shorting_costs_ts <- names(list_short) %>% lapply(function(nm) { 66 | w <- list_short[[nm]] 67 | # Pay 100% of fee if short get 70% of fee if long [from p.10 in "anomalies and their short-sale cost"] 68 | short_cost <- short_fees[w, on = .(id, eom)][w<0, .(short_cost = sum(sfee*(-w))), by = eom] #[order(shorting_cost)][, mean(shorting_cost)] 69 | long_rev <- short_fees[w, on = .(id, eom)][w>0, .(long_rev = sum(sfee*w)), by = eom] #[order(shorting_cost)][, mean(shorting_cost)] 70 | sub <- w[, .(long = sum(w[w>0]), short = sum(w[w<0]), net_lev = sum(w)), by = eom][, type := nm] 71 | sub <- short_cost[sub, on = "eom"] 72 | long_rev[sub, on = .(eom)] 73 | }) %>% rbindlist() 74 | 75 | # Shorting costs 76 | shorting_costs <- shorting_costs_ts[, lapply(.SD, mean), .SDcols = c("long_rev", "short_cost", "long", "short", "net_lev"), by = type] 77 | shorting_costs[short==0, short_cost := 0] 78 | shorting_costs[, net_short_cost := short_cost-long_rev] 79 | 80 | # Output: Shorting costs are almost completely unimportant for our strategies [but this is ofcourse conditional on our ability to borroe] 81 | (test_tbl <- pf_summary[shorting_costs, on = .(type)][, .(type, long, short, short_cost, long_rev, net_short_cost, 82 | r_tc, 83 | r_tc_short = r_tc-short_cost, 84 | r_tc_short_net = r_tc-net_short_cost, 85 | sr, 86 | sr_short = (r_tc-short_cost)/sd, 87 | sr_short_net = (r_tc-net_short_cost)/sd)]) 88 | test_tbl[, .(type, sr, sr_short, sr_short_net)] 89 | 90 | # Alternative 1 91 | test_tbl %>% 92 | mutate(type = type %>% factor(levels = pf_order_new)) %>% 93 | arrange(type) %>% 94 | select(-c("r_tc", "r_tc_short", "r_tc_short_net")) %>% 95 | xtable(digits = c(0, 0, rep(2, 2), rep(4, 3), rep(2, 3))) %>% 96 | print(include.rownames=FALSE) 97 | 98 | # Alternative 2 99 | test_tbl %>% 100 | mutate(type = type %>% factor(levels = pf_order_new)) %>% 101 | arrange(type) %>% 102 | select(-c("sr", "sr_short", "sr_short_net")) %>% 103 | xtable(digits = c(0, 0, rep(2, 2), rep(4, 3), rep(2, 3))) %>% 104 | print(include.rownames=FALSE) 105 | 106 | # Figure ------------- 107 | (output$shorting <- pf_summary[shorting_costs, on = .(type)] |> 108 | filter(type %in% c("Portfolio-ML", "Multiperiod-ML*", "Static-ML*")) |> 109 | # select(type, shorting, r_tc, sd, obj, short_cost, long_rev) |> 110 | mutate( 111 | type = type %>% factor(levels = pf_order), 112 | obj_gross_short = obj - short_cost, 113 | obj_net_short = obj - short_cost + long_rev 114 | ) |> 115 | select(type, obj, obj_gross_short, obj_net_short) |> 116 | pivot_longer(-type) |> 117 | mutate( 118 | name = case_when( 119 | name == "obj" ~ "Utility", 120 | name == "obj_gross_short" ~ "Utility - short costs", 121 | name == "obj_net_short" ~ "Utility - short costs 
+ long revenue" 122 | ) 123 | ) |> 124 | ggplot(aes(type, value, fill=name)) + 125 | geom_col(position = position_dodge(width = 0.93)) + 126 | geom_text(aes(label = formatC(value, format="f", digits=3)), position = position_dodge(width = 0.93), vjust = -0.2) + 127 | coord_cartesian(ylim = c(0, 0.1)) + 128 | labs(y = "Utility") + 129 | theme( 130 | legend.position = "top", 131 | legend.justification = "center", 132 | legend.title = element_blank(), 133 | axis.title.x = element_blank() 134 | )) 135 | 136 | # Plot: Utility: without shorting costs, with only gross shorting costs, with net shorting costs 137 | 138 | 139 | -------------------------------------------------------------------------------- /7 - Figures.R: -------------------------------------------------------------------------------- 1 | # Figures ------------------------- 2 | output_fig <- function(path, name, format, width, height) { 3 | path <- paste0(path, "/", name) # <- change path as desired 4 | if (format == "tex") { 5 | tikz(str_c(path, ".tex"), width = width, height = height) 6 | } 7 | if (format == "pdf") { 8 | pdf(str_c(path, ".pdf"), width = width, height = height) 9 | } 10 | if (format == "jpg") { 11 | w_pix <- width / (2.54 / 96 / 2.54) 12 | h_pix <- height / (2.54 / 96 / 2.54) 13 | jpeg(str_c(path, ".jpg"), width = w_pix, height = h_pix) 14 | } 15 | if (format == "eps") { 16 | cairo_ps(filename = str_c(path, ".eps"), 17 | width = width, height = height) 18 | } 19 | if (format == "tiff") { 20 | tiff(filename = str_c(path, ".tiff"), units="in", width=width, height=height, res=500) 21 | } 22 | } 23 | # For Paper ----------------------------------------- 24 | # Efficient frontier 25 | output$ef + theme(text = element_text(size=txt_size)) 26 | ggsave("Figures/ief_by_wealth.pdf", width = fig_w, height = fig_h, units = "in") 27 | 28 | # Efficient frontier all methods 29 | output$ef_all + theme(text = element_text(size=txt_size)) 30 | ggsave("Figures/ief_by_method.pdf", width = fig_w, height = fig_h, units = "in") 31 | 32 | # Portfolios: Cumulative Performance 33 | output$ts 34 | ggsave("Figures/cumret_pf.pdf", width = fig_w, height = fig_h, units = "in") 35 | 36 | # Portfolios: Stats over time 37 | output$comp_stats + theme(text = element_text(size=txt_size)) 38 | ggsave("Figures/stats_ts.pdf", width = fig_w, height = fig_h*1.5, units = "in") 39 | 40 | # Example: Apple vs.
Xerox 41 | output$example + 42 | scale_y_continuous(labels = scales::number_format(accuracy = 0.01)) + 43 | theme( 44 | axis.text.y = element_text(size = 8) 45 | ) 46 | ggsave("Figures/example_weights.pdf", width = fig_w, height = fig_h*1.5, units = "in") 47 | 48 | # Feature Importance 49 | output$feature_importance + theme(text = element_text(size=txt_size)) 50 | ggsave("Figures/feat_imp.pdf", width = fig_w, height = fig_h, units = "in") 51 | 52 | # Optimal Hyper-parameters [Expected return] 53 | output$er_tuning 54 | ggsave("Figures/optimal_hps_er.pdf", width = fig_w, height = fig_h, units = "in") 55 | 56 | # Optimal Hyper-parameters 57 | output$portfolio_tuning 58 | ggsave("Figures/optimal_hps.pdf", width = fig_w, height = fig_h, units = "in") 59 | 60 | # Feature importance - Returns 61 | output$fi_returns + theme( 62 | axis.text.x = element_text(angle = 360-20, hjust = 0, vjust = 1), 63 | text = element_text(size = 8), 64 | axis.title = element_text(size = 7), 65 | legend.text = element_text(size = 5), # Adjust text size 66 | legend.title = element_text(size = 8), 67 | legend.key.size = unit(0.5, "cm") 68 | ) 69 | ggsave("Figures/fi_returns.pdf", width = fig_w, height = fig_h, units = "in") 70 | 71 | # Feature auto-correlation 72 | output$ar1 + theme(axis.text.y = element_text(size=4)) 73 | ggsave("Figures/feature_ar1.pdf", width = fig_w, height = fig_h*2, units = "in") 74 | 75 | # Efficient frontier with trading cost 76 | output$cf_ef_tc 77 | ggsave("Figures/ef_cf_tc.pdf", width = fig_w, height = fig_h, units = "in") 78 | 79 | # Efficient frontier without trading cost 80 | output$cf_ef_markowitz 81 | ggsave("Figures/ef_cf_no_tc.pdf", width = fig_w, height = fig_h, units = "in") 82 | 83 | # Performance across size distribution 84 | output$by_size 85 | ggsave("Figures/by_size.pdf", width = fig_w, height = fig_h*1.5, units = "in") 86 | 87 | # Shorting costs 88 | output$shorting 89 | ggsave("Figures/shorting.pdf", width = fig_w, height = fig_h, units = "in") 90 | 91 | # RFF example 92 | output_fig(path=output_path_fig, name = "rff_ex", format = format, width = fig_w, height = fig_h*1.5) 93 | output$rff_specific0 + theme(strip.text.x = element_text(size = 8)) 94 | dev.off() 95 | 96 | # Relation between liquidity and portfolio weight 97 | output$w_liq 98 | ggsave("Figures/w_liq.pdf", width = fig_w, height = fig_h, units = "in") 99 | 100 | # Simulations 101 | output$simulations + theme(text = element_text(size=txt_size)) 102 | ggsave("Figures/simulations.pdf", width = fig_w, height = fig_h, units = "in") -------------------------------------------------------------------------------- /7 - Numbers.R: -------------------------------------------------------------------------------- 1 | # Numbers mentioned in the text ----------------- 2 | # Number of stocks by the end of 2020: 3 | data[eom==as.Date("2020-12-31"), .N] 4 | # Dollar volume of Walmart and Xerox 5 | chars[id %in% c(22111, 27983),.(id, eom, dolvol=dolvol/1e6)][eom==as.Date("2020-12-31")] 6 | # Auto correlation of ret_1_0 and be_me 7 | ar1_ss[char %in% c("be_me", "ret_1_0")] 8 | cluster_labels[ar1_ss, on = .(characteristic=char)][cluster=="quality", median(ar1)] 9 | cluster_labels[ar1_ss, on = .(characteristic=char)][cluster=="momentum"] 10 | 11 | data_tc[valid==T & !is.na(be_me) & !is.na(ret_12_1) & be_me != 0.5 & ret_12_1 != 0.5, cor(be_me, ret_12_1), by = eom][, summary(V1)] 12 | data_tc[valid==T & !is.na(be_me) & !is.na(ret_12_1) & be_me != 0.5 & ret_12_1 != 0.5, cor(be_me, ret_12_1)] 13 | 14 | # Realized utility without second 
layer of portfolio tuning 15 | validation_m1[k==1 & g==0 & u==1, .(obj = (mean(r)-0.5*var(r)*pf_set$gamma_rel-mean(tc))*12, sr = mean(r-tc)/sd(r)*sqrt(12))] 16 | validation_static[k==1 & g==0 & u==1, .(obj = (mean(r)-0.5*var(r)*pf_set$gamma_rel-mean(tc))*12, sr = mean(r-tc)/sd(r)*sqrt(12))] 17 | validation_static[k==0.2 & g==0 & u==1, .(obj = (mean(r)-0.5*var(r)*pf_set$gamma_rel-mean(tc))*12, sr = mean(r-tc)/sd(r)*sqrt(12))] 18 | 19 | # Move investors from 10b to 1b at a relative risk aversion of 10 20 | large <- ef_ss[wealth_end==1e10 & gamma_rel == 10] 21 | # Large investors for a vol of 14% can get 22 | large[, r_tc] 23 | # Small investors for a vol of 14% can get 24 | (small_er <- ef_ss[wealth_end==1e9, r_tc[gamma_rel==100]+(large$sd-sd[gamma_rel==100])*(r_tc[gamma_rel==20]-r_tc[gamma_rel==100])/(sd[gamma_rel==20]-sd[gamma_rel==100])]) 25 | output$ef + annotate("point", x = large$sd, y = small_er) 26 | 27 | # Median dollar volume by end of sample (for simulations) 28 | chars[valid==T, median(dolvol/1e6), by = eom][eom==max(eom)] 29 | 30 | 31 | # Shorting fees ------------------------ 32 | short_fees[date==max(date)][,.(n=.N,fee=mean(indicative_fee)),by=dcbs][order(dcbs)][, pct := n/sum(n)][] 33 | short_sub <- short_fees[date==as.Date("2020-12-31")][chars[valid==T & eom==max(eom), .(id, eom, size_grp)], on = .(permno=id, date=eom)] 34 | short_sub[, mean(!is.na(indicative_fee))] 35 | short_sub$indicative_fee |> quantile(na.rm=T, probs = seq(0, 1, 0.01)) 36 | short_sub[,.(n=.N,fee=mean(indicative_fee)),by=dcbs][order(dcbs)][, pct := n/sum(n)][] 37 | -------------------------------------------------------------------------------- /7 - Tables.R: -------------------------------------------------------------------------------- 1 | # Tables ------------------- 2 | # Portfolio summary [Add "\\-1.2em" after Markowitz-ML and remove TC, R-TC, SR_new and Utility from Markowitz-ML] 3 | if (FALSE) { 4 | pf_summary %>% 5 | filter(type %in% c("Portfolio-ML", "Multiperiod-ML", "Static-ML", "Factor-ML", "Markowitz-ML", 6 | "Multiperiod-ML*", "Static-ML*")) %>% 7 | rbind( 8 | data.table(type="One tuning layer"), 9 | data.table(type="Two tuning layers"), fill=T 10 | ) %>% 11 | mutate( 12 | # tc = if_else(type == "Markowitz-ML", NA_real_, tc), 13 | # r_tc = if_else(type == "Markowitz-ML", NA_real_, r_tc), 14 | # sr = if_else(type == "Markowitz-ML", NA_real_, sr), 15 | # obj = if_else(type == "Markowitz-ML", NA_real_, obj), 16 | order = type %>% factor(levels = c("One tuning layer", "Portfolio-ML", "Multiperiod-ML", "Static-ML", "Factor-ML", "Markowitz-ML", 17 | "Two tuning layers", "Multiperiod-ML*", "Static-ML*")), 18 | Method = if_else(type %in% c("One tuning layer", "Two tuning layers"), paste0("\\textbf{", as.character(type), "}"), paste0("\\hspace{0.5em}", as.character(type))) 19 | ) %>% 20 | arrange(order) %>% 21 | select(Method, "R"=r, "Vol."=sd, "SR$_\\text{gross}$"=sr_gross, "TC"=tc, "R-TC"=r_tc, "SR$_\\text{net}$"=sr, "Utility"=obj, "Turnover"=turnover_notional, "Lev."=inv) %>% 22 | xtable(digits = c(0, 2, 2, 2, 2, 3, 2, 2, 3, 2, 2)) %>% 23 | print.xtable(sanitize.text.function = identity, include.rownames = F, caption.placement = "top") 24 | } else { 25 | # With all alternatives 26 | pf_summary %>% 27 | filter(type %in% c("Portfolio-ML", "Multiperiod-ML", "Static-ML", "Markowitz-ML", "Rank-ML", 28 | "Factor-ML", "Minimum Variance", "1/N", "Market", 29 | "Multiperiod-ML*", "Static-ML*")) %>% 30 | rbind( 31 | data.table(type="One tuning layer"), 32 | data.table(type="Two tuning layers"), fill=T 33 |
) %>% 34 | mutate( 35 | # tc = if_else(type == "Markowitz-ML", NA_real_, tc), 36 | # r_tc = if_else(type == "Markowitz-ML", NA_real_, r_tc), 37 | # sr = if_else(type == "Markowitz-ML", NA_real_, sr), 38 | # obj = if_else(type == "Markowitz-ML", NA_real_, obj), 39 | order = type %>% factor(levels = c("One tuning layer", "Portfolio-ML", "Multiperiod-ML", "Static-ML", "Markowitz-ML", "Factor-ML", 40 | "Rank-ML", "Minimum Variance", "1/N", "Market", 41 | "Two tuning layers", "Multiperiod-ML*", "Static-ML*")), 42 | Method = if_else(type %in% c("One tuning layer", "Two tuning layers"), paste0("\\textbf{", as.character(type), "}"), paste0("\\hspace{0.5em}", as.character(type))) 43 | ) %>% 44 | arrange(order) %>% 45 | select(Method, "R"=r, "Vol."=sd, "SR$_\\text{gross}$"=sr_gross, "TC"=tc, "R-TC"=r_tc, "SR$_\\text{net}$"=sr, "Utility"=obj, "Turnover"=turnover_notional, "Lev."=inv) %>% 46 | xtable(digits = c(0, 2, 2, 2, 2, 3, 2, 2, 3, 2, 2)) %>% 47 | print.xtable(sanitize.text.function = identity, include.rownames = F, caption.placement = "top") 48 | } 49 | 50 | 51 | 52 | 53 | # Probability of outperformance 54 | prob_outperformance %>% 55 | mutate( 56 | prob_main_op = paste0(formatC(prob_main_op*100, digits = 1, format = "f"), "\\%") 57 | ) %>% 58 | pivot_wider(names_from = alt, values_from = prob_main_op) %>% 59 | select(all_of(c("main", main_types))) %>% 60 | rename(" "=main) %>% 61 | xtable(align = "llrrrrr") %>% 62 | print.xtable(sanitize.text.function = identity, include.rownames = F, caption.placement = "top") 63 | 64 | # Correlation table 65 | pf_cors_op <- formatC(pf_cors, digits = 2, format="f") 66 | pf_cors_op[upper.tri(pf_cors_op, diag = F)]<-"" 67 | # pf_cors_op[-1, -ncol(pf_cors_op)] %>% 68 | pf_cors_op %>% 69 | xtable(align="lccccc") %>% 70 | print.xtable(include.rownames = T, caption.placement = "top") 71 | 72 | # Features information ----------------------------------------------- 73 | char_tbl <- cluster_labels[characteristic %in% features][order(cluster)][, .(n=1:.N,"Characteristic"=characteristic, "Cluster"=cluster)] 74 | start <- ceiling(nrow(char_tbl)/3) 75 | cbind(char_tbl[1:start], char_tbl[(start+1):(start*2)], rbind(char_tbl[(start*2+1):nrow(char_tbl)], data.table(n="", Characteristic="", Cluster = ""), data.table(n="", Characteristic="", Cluster = ""))) %>% 76 | xtable(align="llllllllll", caption = "tbl:chars") %>% 77 | print.xtable(include.rownames = F, caption.placement = "top") 78 | -------------------------------------------------------------------------------- /Joblists/joblist_models.txt: -------------------------------------------------------------------------------- 1 | module load R/4.3.0-foss-2020b; Rscript slurm_fit_models.R config_files/config_models1.txt 2 | module load R/4.3.0-foss-2020b; Rscript slurm_fit_models.R config_files/config_models2.txt 3 | module load R/4.3.0-foss-2020b; Rscript slurm_fit_models.R config_files/config_models3.txt 4 | module load R/4.3.0-foss-2020b; Rscript slurm_fit_models.R config_files/config_models4.txt 5 | module load R/4.3.0-foss-2020b; Rscript slurm_fit_models.R config_files/config_models5.txt 6 | module load R/4.3.0-foss-2020b; Rscript slurm_fit_models.R config_files/config_models6.txt 7 | module load R/4.3.0-foss-2020b; Rscript slurm_fit_models.R config_files/config_models7.txt 8 | module load R/4.3.0-foss-2020b; Rscript slurm_fit_models.R config_files/config_models8.txt 9 | module load R/4.3.0-foss-2020b; Rscript slurm_fit_models.R config_files/config_models9.txt 10 | module load R/4.3.0-foss-2020b; Rscript 
slurm_fit_models.R config_files/config_models10.txt 11 | module load R/4.3.0-foss-2020b; Rscript slurm_fit_models.R config_files/config_models11.txt 12 | module load R/4.3.0-foss-2020b; Rscript slurm_fit_models.R config_files/config_models12.txt -------------------------------------------------------------------------------- /Joblists/joblist_pfchoice_all.txt: -------------------------------------------------------------------------------- 1 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_all.txt -------------------------------------------------------------------------------- /Joblists/joblist_pfchoice_base.txt: -------------------------------------------------------------------------------- 1 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_base.txt -------------------------------------------------------------------------------- /Joblists/joblist_pfchoice_fi.txt: -------------------------------------------------------------------------------- 1 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_fi_base.txt 2 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_fi_ief.txt 3 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_fi_ret.txt -------------------------------------------------------------------------------- /Joblists/joblist_pfchoice_ief.txt: -------------------------------------------------------------------------------- 1 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_ief_w1g1.txt 2 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_ief_w1g2.txt 3 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_ief_w1g3.txt 4 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_ief_w1g4.txt 5 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_ief_w1g5.txt 6 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_ief_w2g1.txt 7 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_ief_w2g2.txt 8 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_ief_w2g3.txt 9 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_ief_w2g4.txt 10 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_ief_w2g5.txt 11 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_ief_w3g1.txt 12 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_ief_w3g2.txt 13 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_ief_w3g3.txt 14 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_ief_w3g4.txt 15 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_ief_w3g5.txt 16 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_ief_w4g1.txt 17 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_ief_w4g2.txt 18 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R 
config_files/config_pfchoice_ief_w4g3.txt 19 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_ief_w4g4.txt 20 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_ief_w4g5.txt -------------------------------------------------------------------------------- /Joblists/joblist_pfchoice_size.txt: -------------------------------------------------------------------------------- 1 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_size1.txt 2 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_size2.txt 3 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_size3.txt 4 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_size4.txt 5 | module load R/4.3.0-foss-2020b; Rscript slurm_build_portfolios.R config_files/config_pfchoice_size5.txt -------------------------------------------------------------------------------- /Joblists/joblist_simulations.txt: -------------------------------------------------------------------------------- 1 | module load R/4.3.0-foss-2020b; Rscript simulations/simulations.R config_files/config_sim1.txt 2 | module load R/4.3.0-foss-2020b; Rscript simulations/simulations.R config_files/config_sim2.txt 3 | module load R/4.3.0-foss-2020b; Rscript simulations/simulations.R config_files/config_sim3.txt 4 | module load R/4.3.0-foss-2020b; Rscript simulations/simulations.R config_files/config_sim4.txt 5 | module load R/4.3.0-foss-2020b; Rscript simulations/simulations.R config_files/config_sim5.txt 6 | module load R/4.3.0-foss-2020b; Rscript simulations/simulations.R config_files/config_sim6.txt 7 | module load R/4.3.0-foss-2020b; Rscript simulations/simulations.R config_files/config_sim7.txt 8 | module load R/4.3.0-foss-2020b; Rscript simulations/simulations.R config_files/config_sim8.txt 9 | module load R/4.3.0-foss-2020b; Rscript simulations/simulations.R config_files/config_sim9.txt 10 | module load R/4.3.0-foss-2020b; Rscript simulations/simulations.R config_files/config_sim10.txt 11 | module load R/4.3.0-foss-2020b; Rscript simulations/simulations.R config_files/config_sim11.txt 12 | module load R/4.3.0-foss-2020b; Rscript simulations/simulations.R config_files/config_sim12.txt 13 | module load R/4.3.0-foss-2020b; Rscript simulations/simulations.R config_files/config_sim13.txt 14 | module load R/4.3.0-foss-2020b; Rscript simulations/simulations.R config_files/config_sim14.txt 15 | module load R/4.3.0-foss-2020b; Rscript simulations/simulations.R config_files/config_sim15.txt -------------------------------------------------------------------------------- /Main.R: -------------------------------------------------------------------------------- 1 | # Libraries --------------------------------------- 2 | library(tictoc) 3 | library(lfe) 4 | library(cowplot) 5 | library(MASS) 6 | library(xtable) 7 | library(roll) 8 | library(expm) 9 | library(glmnet) 10 | library(lubridate) 11 | library(tidyverse) 12 | library(data.table) 13 | source("0 - General functions.R") 14 | source("0 - Return prediction functions.R") 15 | source("0 - Portfolio choice functions.R") 16 | Rcpp::sourceCpp("ewma.cpp") 17 | Rcpp::sourceCpp("sqrtm_cpp.cpp") 18 | options(dplyr.summarise.inform = FALSE) 19 | 20 | output <- list() 21 | 22 | # Layout Settings --------------- 23 | theme_set(theme_classic()) 24 | colours_theme <- c("#0C6291", "#A63446", 
RColorBrewer::brewer.pal(8, "Dark2"), 25 | "darkslategrey", "blue3", "red3", "purple2", "yellow2", "aquamarine", 26 | "grey", "salmon", "antiquewhite", "chartreuse") 27 | scale_colour_discrete <- function(...) { 28 | scale_colour_manual(..., values = colours_theme) 29 | } 30 | scale_fill_discrete <- function(...) { 31 | scale_fill_manual(..., values = colours_theme) 32 | } 33 | 34 | pf_order <- c("Portfolio-ML", 35 | "Multiperiod-ML", "Multiperiod-ML*", 36 | "Static-ML", "Static-ML*", 37 | "Market", "1/N", "Minimum Variance", "Factor-ML", "Rank-ML", "Markowitz-ML") 38 | pf_order_new <- c("Portfolio-ML", "Multiperiod-ML*", "Static-ML*", "Multiperiod-ML", "Static-ML", "Markowitz-ML", "Rank-ML", 39 | "Factor-ML", "Minimum Variance", "1/N", "Market") 40 | main_types <- c("Portfolio-ML", "Multiperiod-ML*", "Static-ML*", "Factor-ML", "Markowitz-ML") 41 | cluster_order <- c("Accruals", "Debt Issuance", "Investment", "Short-Term Reversal", "Value", 42 | "Low Risk", "Quality", "Momentum", "Profitability", "Profit Growth", 43 | "Seasonality", "Size", "Low Leverage") 44 | 45 | txt_size <- 10 46 | 47 | output_path_fig <- "Figures" 48 | format <- "pdf" 49 | fig_h <- 3.2 50 | fig_w <- 6.5 # 6.5 is roughly the page width 51 | 52 | # Setup ------------------------- 53 | # Data settings 54 | settings <- list( 55 | parallel = T, 56 | seed_no = 1, 57 | months = FALSE, 58 | split = list( 59 | train_end = as.Date("1970-12-31"), # Change to 1994-12-31? 60 | test_end = as.Date("2020-12-31"), 61 | val_years = 10, 62 | model_update_freq = "yearly", # In c("once", "yearly", "decade") 63 | train_lookback = 1000, # Set to a high number (e.g. 1000) if you want an expanding window 64 | retrain_lookback = 1000 # Set to a high number (e.g. 1000) if you want an expanding window 65 | ), 66 | feat_prank = T, # Percentile rank features? 67 | ret_impute = "zero", # Impute missing returns with zero 68 | feat_impute = T, 69 | addition_n = 12, # Needs to be valid for the past n months to be included in the investment universe 70 | deletion_n = 12, # Exclude from universe after N periods where the stock is non-valid 71 | screens = list( 72 | start = as.Date("1952-12-31"), 73 | end = as.Date("2020-12-31"), 74 | feat_pct = 0.5, 75 | nyse_stocks = T 76 | ), 77 | pi = 0.1, # What is the price impact as a function of the daily trading volume? For example, if pi=0.1, then trading 1% of the daily dollar volume has an average impact of 0.1% 78 | rff = list( 79 | p_vec = 2^c(1:9), 80 | g_vec = exp(-3:-2), 81 | l_vec = c(0, exp(seq(-10, 10, length.out = 100))) # Before we also had exp(-100) but that's effectively zero 82 | ), 83 | pf = list( 84 | dates = list( 85 | start_year = 1971, 86 | end_yr = 2020, 87 | split_years = 10 88 | ), 89 | hps = list( 90 | cov_type = "cov_add", 91 | m1 = list( 92 | k = c(1, 2, 3), 93 | u = c(0.25, 0.5, 1), 94 | g = c(0, 1, 2), 95 | K = 12 96 | ), 97 | # m2 = list(l = c(0, exp(-3:3))), # c(0, exp(-10:3)) 98 | static = list( 99 | k = c(1/1, 1/3, 1/5), 100 | u = c(0.25, 0.5, 1), 101 | g = c(0, 1, 2) 102 | ) 103 | ) 104 | ), 105 | pf_ml = list( 106 | g_vec = exp(-3:-2), 107 | p_vec = c(2^(6:9)), 108 | l_vec = c(0, exp(seq(-10, 10, length.out = 100))), # Vastly expanded the search set for lambda 109 | orig_feat = F, # Should original features be added to the list of random features? 110 | scale = T 111 | ), 112 | ef = list( 113 | wealth = c(1, 1e9, 1e10, 1e11), 114 | gamma_rel = c(1, 5, 10, 20, 100) 115 | ), 116 | cov_set = list( 117 | industries = T, # Should we include industry dummies in the covariance matrix?
Not feasible when we look across size groups (because not all industries have observations) 118 | obs = 252*10, # Check tibble(t = 1:(settings$cov_set$obs), w = (0.5^(1/(settings$cov_set$hl_cor)))^t) %>% ggplot(aes(t, w)) + geom_point() + geom_hline(yintercept = 0) 119 | hl_cor = 252*3/2, # Barra uses 48 months as half-life, seems high. https://www.alacra.com/alacra/help/barra_handbook_GEM.pdf 120 | hl_var = 252/2, 121 | hl_stock_var = 252/2, 122 | min_stock_obs = 252, 123 | initial_var_obs = 21*3 124 | ), 125 | factor_ml = list( 126 | n_pfs = 10 127 | ) 128 | ) 129 | 130 | set.seed(settings$seed_no) 131 | 132 | # Portfolio settings ----------- 133 | pf_set <- list( 134 | wealth = 1e10, 135 | gamma_rel = 10, 136 | mu = 0.007, # How much do we expect the portfolio to grow each month? market[year(eom_ret)>=1980, mean(mkt_vw_exc, na.rm=T)] 137 | # aim_hor = max_t_hor-1, 138 | lb_hor = 11 # Change?? 139 | ) 140 | 141 | # Features --------------------- 142 | features <- c( 143 | "age", "aliq_at", "aliq_mat", "ami_126d", 144 | "at_be", "at_gr1", "at_me", "at_turnover", 145 | "be_gr1a", "be_me", "beta_60m", "beta_dimson_21d", 146 | "betabab_1260d", "betadown_252d", "bev_mev", "bidaskhl_21d", 147 | "capex_abn", "capx_gr1", "capx_gr2", "capx_gr3", 148 | "cash_at", "chcsho_12m", "coa_gr1a", "col_gr1a", 149 | "cop_at", "cop_atl1", "corr_1260d", "coskew_21d", 150 | "cowc_gr1a", "dbnetis_at", "debt_gr3", "debt_me", 151 | "dgp_dsale", "div12m_me", "dolvol_126d", "dolvol_var_126d", 152 | "dsale_dinv", "dsale_drec", "dsale_dsga", "earnings_variability", 153 | "ebit_bev", "ebit_sale", "ebitda_mev", "emp_gr1", 154 | "eq_dur", "eqnetis_at", "eqnpo_12m", "eqnpo_me", 155 | "eqpo_me", "f_score", "fcf_me", "fnl_gr1a", 156 | "gp_at", "gp_atl1", "ival_me", "inv_gr1", 157 | "inv_gr1a", "iskew_capm_21d", "iskew_ff3_21d", "iskew_hxz4_21d", 158 | "ivol_capm_21d", "ivol_capm_252d", "ivol_ff3_21d", "ivol_hxz4_21d", 159 | "kz_index", "lnoa_gr1a", "lti_gr1a", "market_equity", 160 | "mispricing_mgmt", "mispricing_perf", "ncoa_gr1a", "ncol_gr1a", 161 | "netdebt_me", "netis_at", "nfna_gr1a", "ni_ar1", 162 | "ni_be", "ni_inc8q", "ni_ivol", "ni_me", 163 | "niq_at", "niq_at_chg1", "niq_be", "niq_be_chg1", 164 | "niq_su", "nncoa_gr1a", "noa_at", "noa_gr1a", 165 | "o_score", "oaccruals_at", "oaccruals_ni", "ocf_at", 166 | "ocf_at_chg1", "ocf_me", "ocfq_saleq_std", "op_at", 167 | "op_atl1", "ope_be", "ope_bel1", "opex_at", 168 | "pi_nix", "ppeinv_gr1a", "prc", "prc_highprc_252d", 169 | "qmj", "qmj_growth", "qmj_prof", "qmj_safety", 170 | "rd_me", "rd_sale", "rd5_at", "resff3_12_1", 171 | "resff3_6_1", "ret_1_0", "ret_12_1", "ret_12_7", 172 | "ret_3_1", "ret_6_1", "ret_60_12", "ret_9_1", 173 | "rmax1_21d", "rmax5_21d", "rmax5_rvol_21d", "rskew_21d", 174 | "rvol_21d", "sale_bev", "sale_emp_gr1", "sale_gr1", 175 | "sale_gr3", "sale_me", "saleq_gr1", "saleq_su", 176 | "seas_1_1an", "seas_1_1na", "seas_11_15an", "seas_11_15na", 177 | "seas_16_20an", "seas_16_20na", "seas_2_5an", "seas_2_5na", 178 | "seas_6_10an", "seas_6_10na", "sti_gr1a", "taccruals_at", 179 | "taccruals_ni", "tangibility", "tax_gr1a", "turnover_126d", 180 | "turnover_var_126d", "z_score", "zero_trades_126d", "zero_trades_21d", 181 | "zero_trades_252d", 182 | "rvol_252d" 183 | ) 184 | # Exclude features without sufficient coverage 185 | feat_excl <- c("capex_abn", "capx_gr2", "capx_gr3", "debt_gr3", "dgp_dsale", 186 | "dsale_dinv", "dsale_drec", "dsale_dsga", "earnings_variability", "eqnetis_at", 187 | "eqnpo_me", "eqpo_me", "f_score", "iskew_hxz4_21d",
"ivol_hxz4_21d", 188 | "netis_at", "ni_ar1", "ni_inc8q", "ni_ivol", "niq_at", "niq_at_chg1", "niq_be", 189 | "niq_be_chg1", "niq_su", "ocfq_saleq_std", "qmj", "qmj_growth", "rd_me", 190 | "rd_sale", "rd5_at", "resff3_12_1", "resff3_6_1", "sale_gr3", "saleq_gr1", 191 | "saleq_su", "seas_16_20an", "seas_16_20na", "sti_gr1a", "z_score") 192 | features <- features[!(features %in% feat_excl)] 193 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | This repository contains the code used for the paper [Machine Learning and the Implementable Efficient Frontier](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4187217) by Jensen, Kelly, Malamud, and Pedersen (2024). Please cite this paper if you are using the code: 3 | ``` 4 | @article{JensenKellyMalamudPedersen2024, 5 | author = {Jensen, Theis Ingerslev and Kelly, Bryan and Malamud, Semyon and Pedersen, Lasse Heje}, 6 | title = {Machine Learning and the Implementable Efficient Frontier}, 7 | year = {2024} 8 | } 9 | ``` 10 | 11 | Please send questions about the code to Theis I. Jensen at [theis.jensen@yale.edu](mailto:theis.jensen@yale.edu). 12 | 13 | 14 | # How to run the code 15 | To run the code, clone this repo to your local computing environment, and follow the steps explained below. We note that replicating our analysis requires substantial computational resources, and the code is set up to be executed on a high performance computing cluster with a SLURM scheduler. 16 | 17 | ## Data 18 | You need eight data sets to run the code. 19 | - `usa.csv` 20 | - Firm characteristics at a monthly frequency from the paper [Is There a Replication Crisis in Finance?](https://onlinelibrary.wiley.com/doi/10.1111/jofi.13249) by Jensen, Kelly, and Pedersen (2023) 21 | - Download from [WRDS](https://wrds-www.wharton.upenn.edu/pages/get-data/contributed-data-forms/global-factor-data/). 
To get US data, require that the column `excntry` is equal to "USA" 22 | - `usa_dsf.csv` 23 | - Stock returns at a daily frequency 24 | - The data can be generated by following the instructions from the [GitHub repository](https://github.com/bkelly-lab/ReplicationCrisis/tree/master/GlobalFactors) from "Is There a Replication Crisis in Finance?" Alternatively, you can request the data from us by sending an email to [theis.jensen@yale.edu](mailto:theis.jensen@yale.edu) 25 | - `world_ret_monthly.csv` 26 | - Stock returns at a monthly frequency 27 | - The data can be generated by following the instructions from the [GitHub repository](https://github.com/bkelly-lab/ReplicationCrisis/tree/master/GlobalFactors) from "Is There a Replication Crisis in Finance?" Alternatively, you can request the data from us by sending an email to [theis.jensen@yale.edu](mailto:theis.jensen@yale.edu) 28 | - `Factor Details.xlsx` 29 | - Information about factor characteristics from "Is There a Replication Crisis in Finance" 30 | - Download from [GitHub/bkelly-lab/ReplicationCrisis/GlobalFactors/Factor Details.xlsx](https://github.com/bkelly-lab/ReplicationCrisis/blob/master/GlobalFactors/Factor%20Details.xlsx) 31 | - `Cluster Labels.csv` 32 | - Information about factor characteristics from "Is There a Replication Crisis in Finance" 33 | - Download from [GitHub/bkelly-lab/ReplicationCrisis/GlobalFactors/Cluster Labels.csv](https://github.com/bkelly-lab/ReplicationCrisis/blob/master/GlobalFactors/Cluster%20Labels.csv) 34 | - `market_returns.csv` 35 | - Market returns from "Is There a Replication Crisis in Finance" 36 | - Download from [Dropbox](https://www.dropbox.com/sh/xq278bryrj0qf9s/AABUTvTGok91kakyL07LKyQoa?dl=0) 37 | - `ff3_m.csv` 38 | - The Fama-French 3-factor model data (used to get the risk-free rate) 39 | - Download from [Kenneth French's data library](https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/F-F_Research_Data_Factors_CSV.zip) 40 | - `short_fees` 41 | - Short-selling fees based on the Markit Securities Finance Analytics - American Equities database. You can run the vast majority of the code without this data set (the exception being `6 - Short selling fees.R`) 42 | - Download from [WRDS](https://wrds-www.wharton.upenn.edu/pages/get-data/markit/markit-securities-finance-analytics-equities/american-equities/) 43 | 44 | These data sets should be saved in the `Data` folder with the exact names used above (the short R sketch further below can be used to verify that they are all in place). 45 | 46 | ## Data generation 47 | In this section, we'll go through the steps needed to implement the portfolio choice methods we use in the paper. This step is by far the most computationally intensive. We used the [dSQ module](https://docs.ycrc.yale.edu/clusters-at-yale/job-scheduling/dsq/) to submit multiple jobs at the same time to a Slurm scheduler. Below, we include our dSQ calls to give you a sense of the computational resources required to run each step. 48 | 49 | ### Return prediction models 50 | - What: Estimate the 12 models used to predict realized returns at time t+1, t+2, ..., t+12 51 | - dSQ call: 52 | ``dsq --job-file Joblists/joblist_models.txt --cpus-per-task=32 --mem=100G --partition=day -t 06:00:00 --mail-type ALL --output slurm_output/dsq-joblist_models-%A_%1a-%N.out``.
53 | This call will start 12 independent jobs, which for us took a maximum of 5 hours and required approximately 75GB RAM for each job 54 | - Main R script: `slurm_fit_models.R` 55 | - Output folder: `Data/Generated/Models` 56 | 57 | ### Portfolios: base case 58 | - What: Implement portfolio choice methods with the base case parameters used for tables 2-4 and figures 2-4 and D.4 59 | - dSQ call: `dsq --job-file Joblists/joblist_pfchoice_base.txt --cpus-per-task=48 --mem=60G --partition=week -t 1-10:00:00 --mail-type ALL 60 | --output slurm_output/dsq-joblist_pfchoice-base-%A_%1a-%N.out`. 61 | This call will start 1 job, which for us took a maximum of 6 hours and required approximately 40GB RAM 62 | - Main R script: ``slurm_build_portfolios.R`` 63 | - Output folder: `Data/Generated/Portfolios` 64 | 65 | ### Portfolios: all 66 | - What: Implement the portfolio choice methods for all stocks, used for the top-left panel in Figure 8 67 | - dSQ call: `dsq --job-file Joblists/joblist_pfchoice_all.txt --cpus-per-task=32 --mem=100G --partition=week -t 5-00:00:00 --mail-type ALL 68 | --output slurm_output/dsq-joblist_pfchoice-all-%A_%1a-%N.out`. This call will start 1 job, which for us took a maximum of 2 days and 16 hours and required approximately 70GB RAM 69 | - Main R script: ``slurm_build_portfolios.R`` 70 | - Output folder: `Data/Generated/Portfolios` 71 | 72 | ### Portfolios: size groups 73 | - What: Implement the portfolio choice methods for stocks in different size groups, used for the remaining panels in Figure 8 74 | - dSQ call: `dsq --job-file Joblists/joblist_pfchoice_size.txt --cpus-per-task=16 --mem=50G --partition=day -t 8:00:00 --mail-type ALL 75 | --output slurm_output/dsq-joblist_pfchoice-size-%A_%1a-%N.out`. This call will start 5 jobs, which for us took a maximum of 5 hours and required approximately 30GB RAM for each job 76 | - Main R script: ``slurm_build_portfolios.R`` 77 | - Output folder: `Data/Generated/Portfolios` 78 | 79 | ### Implementable efficient frontier 80 | - What: Implement portfolio choice methods for different combinations of wealth and risk aversion to generate the implementable efficient frontier from Figure 1 81 | - dSQ call: `dsq --job-file Joblists/joblist_pfchoice_ief.txt --cpus-per-task=16 --mem=50G --partition=day -t 10:00:00 --mail-type ALL 82 | --output slurm_output/dsq-joblist_pfchoice-ief-%A_%1a-%N.out`. This call will start 20 independent jobs, which for us took a maximum of 7 hours and required approximately 40GB RAM for each job 83 | - Main R script: ``slurm_build_portfolios.R`` 84 | - Output folder: `Data/Generated/Portfolios` 85 | 86 | ### Economic feature importance 87 | - What: Implement the permutation-based feature importance analysis used for figures 5, 6, and D.3 88 | - dSQ call: `dsq --job-file Joblists/joblist_pfchoice_fi.txt --cpus-per-task=48 --mem=70G --partition=day -t 23:00:00 --mail-type ALL 89 | --output slurm_output/dsq-joblist_pfchoice-fi-%A_%1a-%N.out` 90 | This call will start 3 independent jobs, which for us took a maximum of 3 hours and required approximately 35GB RAM for each job 91 | - Main R script: `slurm_build_portfolios.R` 92 | - Output folder: `Data/Generated/Portfolios` 93 | 94 | ### Simulations 95 | - What: Implement simulations from Appendix Section E 96 | - dSQ call: `dsq --job-file Joblists/joblist_simulations.txt --cpus-per-task=32 --mem=50G --partition=day -t 10:00:00 --mail-type ALL 97 | --output slurm_output/dsq-joblist_simulations-%A_%1a-%N.out`.
98 | This call will start 15 independent jobs, which for us took a maximum of 9 hours and required approximately 25GB RAM for each job 99 | - Main R script: `simulations/simulations.R` 100 | - Output folder: `simulations/results` 101 | 102 | ## Data analysis 103 | After generating the data from the previous section, you can analyze it on your local PC. Specifically, you can generate all figures and tables from the paper by running the scripts below. Importantly, you need to go through each script and ensure it points to the correct files (the names of the files from the previous sections depend on when the code was submitted). 104 | 105 | You can generate the tables and figures from the paper by running the `separate_analysis_run.R` script. This script will generate the main data set, import the results from the previous section, and generate the figures and tables from the following scripts: 106 | 107 | - `6 - Implementable efficient frontier.R` 108 | - `6 - Base analysis.R` 109 | - `6 - Performance across size distribution.R` 110 | - `6 - Feature Importance.R` 111 | - `6 - Economic intuition.R` 112 | - `6 - Short selling fees.R` 113 | - `6 - RF Example.R` 114 | 115 | Finally, run the scripts below to save the figures and tables, as well as generate various numbers mentioned in the paper: 116 | 117 | - `7 - Figures.R` 118 | - `7 - Tables.R` 119 | - `7 - Numbers.R` 120 | 121 | After running these scripts, you should have the figures from the paper in the `Figures` folder and be able to copy-paste the tables in LaTeX format and the numbers from the console. 122 | -------------------------------------------------------------------------------- /config_files/config_models1.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | horizon = 1 -------------------------------------------------------------------------------- /config_files/config_models10.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | horizon = 10 -------------------------------------------------------------------------------- /config_files/config_models11.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | horizon = 11 -------------------------------------------------------------------------------- /config_files/config_models12.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | horizon = 12 -------------------------------------------------------------------------------- /config_files/config_models2.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | horizon = 2 -------------------------------------------------------------------------------- /config_files/config_models3.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | horizon = 3 -------------------------------------------------------------------------------- /config_files/config_models4.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | horizon = 4 -------------------------------------------------------------------------------- /config_files/config_models5.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | horizon = 5
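As a quick sanity check before submitting any of the jobs described in the README above, the minimal R sketch below verifies that the eight input data sets listed in the README's Data section are present in the `Data` folder. This is an editorial sketch, not part of the original scripts: it uses only base R, the file names are taken directly from the Data section, the `short_fees` entry is listed there without a file extension and should be adjusted to match the actual download, and the variable names are purely illustrative.

```r
# Minimal sketch (not part of the original repository scripts): check that the
# input data sets listed in the README's Data section are present in Data/.
# "short_fees" is listed without an extension in the README, so adjust that
# entry to match the actual file name of your Markit download.
required_files <- c(
  "usa.csv", "usa_dsf.csv", "world_ret_monthly.csv",
  "Factor Details.xlsx", "Cluster Labels.csv",
  "market_returns.csv", "ff3_m.csv", "short_fees"
)
found <- file.exists(file.path("Data", required_files))
if (all(found)) {
  message("All required input files were found in Data/.")
} else {
  warning("Missing input files: ", paste(required_files[!found], collapse = ", "))
}
```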
-------------------------------------------------------------------------------- /config_files/config_models6.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | horizon = 6 -------------------------------------------------------------------------------- /config_files/config_models7.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | horizon = 7 -------------------------------------------------------------------------------- /config_files/config_models8.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | horizon = 8 -------------------------------------------------------------------------------- /config_files/config_models9.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | horizon = 9 -------------------------------------------------------------------------------- /config_files/config_pfchoice_all.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "all" 3 | industry_cov = TRUE 4 | wealth = 1e10 5 | gamma_rel = 10 6 | update_mp = TRUE 7 | update_base = TRUE 8 | update_fi_base = FALSE 9 | update_fi_ief = FALSE 10 | update_fi_ret = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_base.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1e10 5 | gamma_rel = 10 6 | update_mp = TRUE 7 | update_base = TRUE 8 | update_fi_base = FALSE 9 | update_fi_ief = FALSE 10 | update_fi_ret = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_fi_base.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1e10 5 | gamma_rel = 10 6 | update_mp = FALSE 7 | update_base = FALSE 8 | update_fi_base = TRUE 9 | update_fi_ief = FALSE 10 | update_fi_ret = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_fi_ief.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1e10 5 | gamma_rel = 10 6 | update_mp = FALSE 7 | update_base = FALSE 8 | update_fi_base = FALSE 9 | update_fi_ief = TRUE 10 | update_fi_ret = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_fi_ret.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1e10 5 | gamma_rel = 10 6 | update_mp = FALSE 7 | update_base = FALSE 8 | update_fi_base = FALSE 9 | update_fi_ief = FALSE 10 | update_fi_ret = TRUE 11 | -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w1g1.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = 
TRUE 4 | wealth = 1 5 | gamma_rel = 1 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w1g2.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1 5 | gamma_rel = 5 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w1g3.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1 5 | gamma_rel = 10 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w1g4.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1 5 | gamma_rel = 20 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w1g5.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1 5 | gamma_rel = 100 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w2g1.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1e9 5 | gamma_rel = 1 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w2g2.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1e9 5 | gamma_rel = 5 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w2g3.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1e9 5 | gamma_rel = 10 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w2g4.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1e9 5 | gamma_rel = 20 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w2g5.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1e9 5 | gamma_rel = 100 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w3g1.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = 
"perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1e10 5 | gamma_rel = 1 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w3g2.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1e10 5 | gamma_rel = 5 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w3g3.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1e10 5 | gamma_rel = 10 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w3g4.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1e10 5 | gamma_rel = 20 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w3g5.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1e10 5 | gamma_rel = 100 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w4g1.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1e11 5 | gamma_rel = 1 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w4g2.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1e11 5 | gamma_rel = 5 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w4g3.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1e11 5 | gamma_rel = 10 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w4g4.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1e11 5 | gamma_rel = 20 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_ief_w4g5.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low50_high100_min50" 3 | industry_cov = TRUE 4 | wealth = 1e11 5 | gamma_rel = 100 6 | update_mp = FALSE -------------------------------------------------------------------------------- /config_files/config_pfchoice_size1.txt: 
-------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low00_high20_min50" 3 | wealth = 1e10 4 | industry_cov = FALSE 5 | gamma_rel = 10 6 | update_mp = TRUE -------------------------------------------------------------------------------- /config_files/config_pfchoice_size2.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low20_high40_min50" 3 | wealth = 1e10 4 | industry_cov = FALSE 5 | gamma_rel = 10 6 | update_mp = TRUE -------------------------------------------------------------------------------- /config_files/config_pfchoice_size3.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low40_high60_min50" 3 | wealth = 1e10 4 | industry_cov = FALSE 5 | gamma_rel = 10 6 | update_mp = TRUE -------------------------------------------------------------------------------- /config_files/config_pfchoice_size4.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low60_high80_min50" 3 | wealth = 1e10 4 | industry_cov = FALSE 5 | gamma_rel = 10 6 | update_mp = TRUE -------------------------------------------------------------------------------- /config_files/config_pfchoice_size5.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | size_screen = "perc_low80_high100_min50" 3 | industry_cov = FALSE 4 | wealth = 1e10 5 | gamma_rel = 10 6 | update_mp = TRUE -------------------------------------------------------------------------------- /config_files/config_sim1.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | seed = 1 -------------------------------------------------------------------------------- /config_files/config_sim10.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | seed = 10 -------------------------------------------------------------------------------- /config_files/config_sim11.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | seed = 11 -------------------------------------------------------------------------------- /config_files/config_sim12.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | seed = 12 -------------------------------------------------------------------------------- /config_files/config_sim13.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | seed = 13 -------------------------------------------------------------------------------- /config_files/config_sim14.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | seed = 14 -------------------------------------------------------------------------------- /config_files/config_sim15.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | seed = 15 -------------------------------------------------------------------------------- /config_files/config_sim2.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | seed = 2 
-------------------------------------------------------------------------------- /config_files/config_sim3.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | seed = 3 -------------------------------------------------------------------------------- /config_files/config_sim4.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | seed = 4 -------------------------------------------------------------------------------- /config_files/config_sim5.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | seed = 5 -------------------------------------------------------------------------------- /config_files/config_sim6.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | seed = 6 -------------------------------------------------------------------------------- /config_files/config_sim7.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | seed = 7 -------------------------------------------------------------------------------- /config_files/config_sim8.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | seed = 8 -------------------------------------------------------------------------------- /config_files/config_sim9.txt: -------------------------------------------------------------------------------- 1 | # Parameters for script 2 | seed = 9 -------------------------------------------------------------------------------- /ewma.cpp: -------------------------------------------------------------------------------- 1 | #include <Rcpp.h> 2 | using namespace Rcpp; 3 | 4 | // [[Rcpp::export]] 5 | NumericVector ewma_c(NumericVector x, double lambda, int start) { 6 | int n = x.size(); 7 | int na = 0; 8 | double initial_value = 0; 9 | NumericVector var_vec(n, NA_REAL); 10 | 11 | if(n <= start) { 12 | return var_vec; 13 | } 14 | 15 | // Calculate simple variance to get started (assuming mean = 0) 16 | for(int i = 0; i < start; ++i) { 17 | if(NumericVector::is_na(x(i))) { 18 | na += 1; 19 | } else { 20 | initial_value += pow(x(i), 2); 21 | } 22 | } 23 | initial_value = initial_value / (start - 1 - na); 24 | 25 | // Iteratively Updating EWMA Vol 26 | var_vec(start) = initial_value; 27 | for(int j = start + 1; j < n; ++j) { 28 | if(NumericVector::is_na(x(j - 1))) { 29 | var_vec(j) = var_vec(j - 1); 30 | } else { 31 | var_vec(j) = lambda * var_vec(j - 1) + (1 - lambda) * pow(x(j - 1), 2); 32 | } 33 | } 34 | 35 | return sqrt(var_vec); 36 | } 37 | 38 | 39 | -------------------------------------------------------------------------------- /ml-and-the-implementable-efficient-frontier.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /separate_analysis_run.R: -------------------------------------------------------------------------------- 1 | # Load packages and settings --------------------------------------------------- 2 | source("Main.R") 3 | 4 | get_from_path_model <-
"Data/Generated/Models/20240424-all" # From which directory should the data be loaded? (if update$x==F) 5 | run_sub <- F # T when testing code, F otherwise 6 | 7 | # Screens for main sample ------------------------------------------------------ 8 | settings$screens$size_screen <- "perc_low50_high100_min50" 9 | pf_set$wealth <- 1e10 10 | pf_set$gamma_rel <- 10 11 | 12 | # Generate data for analysis --------------------------------------------------- 13 | source("1 - Prepare Data.R", echo = T) 14 | source("3 - Estimate Covariance Matrix.R", echo = T) 15 | source("4 - Prepare Portfolio Data.R", echo = T) 16 | 17 | # Create benchmark portfolio objects ------------------------------------------- 18 | # This repeats some of the data in 5 - Base case.R, without re-doing the computationally heavy parts 19 | # Markowitz-ML 20 | tpf <- chars |> tpf_implement(cov_list = barra_cov, wealth = wealth, dates = dates_oos, gam = pf_set$gamma_rel) 21 | 22 | # Factor-ML 23 | factor_ml <- chars %>% factor_ml_implement(dates = dates_oos, n_pfs = settings$factor_ml$n_pfs, wealth = wealth, gam = pf_set$gamma_rel) 24 | 25 | # Market 26 | mkt <- chars %>% mkt_implement(dates = dates_oos, wealth = wealth) 27 | 28 | # 1/n 29 | ew <- chars %>% ew_implement(dates = dates_oos, wealth = wealth) 30 | 31 | # Fama-MacBeth / Rank weighted portfolios 32 | rw <- chars %>% rw_implement(dates = dates_oos, wealth = wealth) 33 | 34 | # Minimum variance 35 | mv <- chars %>% mv_implement(cov_list = barra_cov, dates = dates_oos, wealth = wealth) 36 | 37 | # Generate results from analysis ----------------------------------------------- 38 | source("6 - Implementable efficient frontier.R", echo = T) 39 | source("6 - Base analysis.R", echo = T) 40 | source("6 - Performance across size distribution.R", echo = T) 41 | source("6 - Feature Importance.R", echo = T) 42 | source("6 - Economic intuition.R", echo = T) 43 | source("6 - Short selling fees.R", echo = T) 44 | source("6 - RF Example.R", echo = T) 45 | -------------------------------------------------------------------------------- /simulations/sim_functions.R: -------------------------------------------------------------------------------- 1 | # Simulate data -------- 2 | sim_prep_data <- function(t, n, disp_x, tv_sds, tv_thetas, tv_rps, dolvol, dates_full, rf, seed, add_noise_feat=T, feat_standardize=T) { 3 | set.seed(seed) 4 | # Economy ------------------------------- 5 | dt <- 1 # Time increment 6 | mu_x <- 1 # Long-term mean of x1 7 | rpx <- 0.05 # Annual risk premium of X 8 | n_tv <- length(tv_sds) 9 | features <- c("x", paste0("tv", 1:n_tv)) 10 | 11 | # One constant factor that determines covariances 12 | X <- rep(rnorm(n = n, mean = mu_x, sd = disp_x), t) |> matrix(nrow = n, ncol = t) |> t() 13 | 14 | # Multiple time-varying return predicting factors, unrelated to covariances 15 | tv_list <- 1:n_tv |> map(function(k) { 16 | tv_start <- rnorm(n = n, mean = 0, sd = tv_sds[k]) # Initial value 17 | tv <- matrix(NA, nrow = t, ncol = n) 18 | tv[1,] <- tv_start 19 | for (j in 2:t) { 20 | tv[j,] <- tv[j-1,] + (1-tv_thetas[k]) * (0 - tv[j-1,]) * dt + tv_sds[k] * rnorm(n = n, mean = 0, sd = sqrt(dt)) 21 | } 22 | return(tv) 23 | }) 24 | 25 | # Simulate return -------------------- 26 | sig_m <- 0.2 27 | sig_e <- 0.4 28 | ivol <- diag(sig_e^2/12, nrow=n) 29 | mkt_ret <- rnorm(n=t, mean=0, sd = sig_m/sqrt(12)) # Demeaned market return 30 | e_ret <- matrix(rnorm(n*t, sd = sig_e/sqrt(12)), ncol = n, nrow = t) 31 | data_tc <- 1:t |> map(function(j) { 32 | x <- X[j,] 33 | # Only x is related to 
sigma 34 | sigma <- x %*% as.matrix(sig_m^2/12) %*% t(x) + ivol 35 | # Both x and TV factors are related to the expected return 36 | mu <- rpx/12*x 37 | for (k in 1:n_tv) { 38 | mu <- mu + tv_rps[k]/12*tv_list[[k]][j,] 39 | } 40 | # Realized returns 41 | if (F) { 42 | r <- mvrnorm(n = 1, mu = mu, Sigma = sigma) 43 | } else { 44 | r <- mu + x*mkt_ret[j] + e_ret[j, ] 45 | } 46 | w_tpf <- drop(solve(sigma) %*% mu) 47 | data <- data.table(eom=dates_full[j], id=1:n, x=x, ret_ld1 = r, er_true = mu, sr = mu/diag(sigma)^0.5, w_tpf=w_tpf) 48 | for (k in 1:n_tv) { 49 | data[, paste0("tv", k) := tv_list[[k]][j,]] 50 | } 51 | return(data) 52 | }) |> rbindlist() 53 | if (add_noise_feat) { 54 | noise_n <- 5 55 | for (i in 1:noise_n) { 56 | data_tc[, paste0("feat", i) := rnorm(t*n)] 57 | } 58 | features <- c(features, paste0("feat", 1:noise_n)) 59 | } 60 | data_tc[, me := 1] 61 | 62 | # Lambda (consider it fixed for now) -- 63 | data_tc[, lambda := 0.02/dolvol] # Assumes that all stocks have same daily trading volume 64 | # data_tc[, ret_ld1 := shift(r, 1, type="lead"), by=id] 65 | data_tc[, tr_ld1 := ret_ld1+rf] 66 | data_tc[, tr_ld0 := lag(tr_ld1, 1), by = id] 67 | data_tc[eom==min(eom), tr_ld0 := 0] # hack 68 | # data_tc[, pred_ld1 := 0.05/12] 69 | # data_tc[, pred_ld1 := er_true] 70 | for (h in 1:12) { 71 | # data_tc[, paste0("pred_ld", h) := rp1/12*(theta^(h-1)*x1+(1-theta^(h-1))*mu_x1) + rp2/12*(theta^(h-1)*x2+(1-theta^(h-1))*mu_x2)] # Risk premium times expected x: 72 | # data_tc[, paste0("pred_ld", h) := rp1/12*x1 + rp2/12*(theta^(h-1)*x2+(1-theta^(h-1))*mu_x2)] # Risk premium times expected x: 73 | data_tc[, paste0("pred_ld", h) := rpx/12*x] 74 | for (k in 1:n_tv) { 75 | data_tc[, paste0("pred_ld", h) := get(paste0("pred_ld", h)) + tv_rps[k]/12*(tv_thetas[k]^(h-1)*get(paste0("tv", k))+(1-tv_rps[k]^(h-1))*0)] 76 | } 77 | } 78 | print(data_tc[, c("id", "eom", paste0("pred_ld", 1:12)), with=F][id<=5 & eom==max(eom)] |> pivot_longer(-c(id, eom)) |> mutate(ld = str_remove(name, "pred_ld") |> as.integer()) |> ggplot(aes(ld, value, colour=factor(id))) + geom_point() + geom_line() + 79 | labs(title=paste0("Expected return for ID 1:5 on last date, tv thetas: ", paste0(tv_thetas, collapse="-")))) 80 | # Prepare for multiperiod-ML 81 | data_tc[, pred_ld2_6 := rowMeans(.SD), .SDcols = paste0("pred_ld", 2:6)] 82 | data_tc[, pred_ld7_12 := rowMeans(.SD), .SDcols = paste0("pred_ld", 7:12)] 83 | if (FALSE) { 84 | data_tc[id<=2, c("id", "eom", paste0("pred_ld", 1:4))][year(eom) %in% 2020:2021] |> pivot_longer(-c(id,eom)) |> ggplot(aes(eom, value, colour=name)) + geom_line() + facet_wrap(~id) + geom_hline(yintercept = rpx/12) 85 | data_tc[id<=2, c("id", "eom", "pred_ld1", "pred_ld2_6", "pred_ld7_12")][year(eom) %in% 2020:2021] |> pivot_longer(-c(id,eom)) |> ggplot(aes(eom, value, colour=name)) + geom_line() + facet_wrap(~id) + geom_hline(yintercept = rpx/12) 86 | } 87 | # Standardize features 88 | if (feat_standardize) { 89 | for(f in features) { 90 | data_tc[, (f) := ecdf(get(f))(get(f)), by = eom] 91 | } 92 | } 93 | # For existing functions 94 | data_tc[, valid := T] 95 | data_tc[, mu_ld0 := 0] #pf_set$mu 96 | data_tc[, eom_ret := eom+1+months(1)-1] 97 | # Create list of sigmas 98 | barra_cov <- 1:t |> map(function(i) { 99 | fct_load <- as.matrix(X[i,]) 100 | rownames(fct_load) <- as.character(1:n) 101 | ivol_vec <- diag(ivol) 102 | names(ivol_vec) <- as.character(1:n) 103 | fct_cov <- as.matrix(sig_m^2/12) 104 | list("fct_load"=fct_load, "fct_cov"=fct_cov, "ivol_vec"=ivol_vec) 105 | }) 106 | names(barra_cov) 
<- dates_full 107 | # Output 108 | list("barra_cov"=barra_cov, "data"=data_tc, "features"=features) 109 | } 110 | -------------------------------------------------------------------------------- /simulations/sim_results.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(data.table) 3 | 4 | # Layout settings ---------------------------------------------- 5 | theme_set(theme_classic(base_size = 13)) 6 | colours_theme <- c("#0C6291", "#A63446", RColorBrewer::brewer.pal(8, "Dark2"), 7 | "darkslategrey", "blue3", "red3", "purple2", "yellow2", "aquamarine", 8 | "grey", "salmon", "antiquewhite", "chartreuse") 9 | scale_colour_discrete <- function(...) { 10 | scale_colour_manual(..., values = colours_theme) 11 | } 12 | scale_fill_discrete <- function(...) { 13 | scale_fill_manual(..., values = colours_theme) 14 | } 15 | 16 | # Aggregate results ------------------------------------------- 17 | files <- list.files("simulations/results") 18 | files <- files[str_detect(files, "stocks500") & str_detect(files, "-05-18")] # | str_detect(files, "-05-19") 19 | 20 | results <- files |> map(function(f) { 21 | fread(paste0("simulations/results/", f)) 22 | }) |> rbindlist() 23 | if (FALSE) { 24 | # Forgot to add theta in run on april 17th 25 | results <- 1:length(files) |> map(function(i) { 26 | fread(paste0("simulations/results/", files[i])) |> mutate(theta1=search_grid$theta1[i], theta2=search_grid$theta2[i]) 27 | }) |> rbindlist() 28 | } 29 | 30 | # Results ----------------------------------------------------- 31 | results[, e_var_adj := (r-mean(r))^2, by=.(run, seed, type, w, theta1, theta2, n)] 32 | results[, utility_t := r-tc-0.5*e_var_adj*10] 33 | 34 | results_overall <- results[, .( 35 | n_obs = .N, 36 | inv = mean(inv), 37 | shorting = mean(shorting), 38 | turnover_notional = mean(turnover), 39 | r = mean(r)*12, 40 | sd = sd(r)*sqrt(12), 41 | sr_gross = mean(r)/sd(r)*sqrt(12), 42 | tc = mean(tc)*12, 43 | r_tc = mean((r-tc))*12, 44 | sr = mean(r-tc)/sd(r)*sqrt(12), 45 | obj = (mean(r)-0.5*var(r)*10-mean(tc))*12#, 46 | # obj_exp = (mean(e_r)-0.5*mean(e_var)*10-mean(tc))*12 47 | ), by = .(run, seed, type, w, theta1, theta2, n)] 48 | results_overall <- results_overall[type != "Static-ML"] 49 | results_overall[type=="Multiperiod-ML*", type := "Multiperiod-ML"] # No tuning, so no need for "*" 50 | 51 | # Utility ---------------- 52 | (output$simulations <- results_overall |> 53 | filter(!(theta1==0 & theta2==0.95)) |> 54 | # filter(w != 1e8) |> # Effectively zero cost at this level (need to re-check with new data) 55 | group_by(run, w, type, theta1, theta2, n) |> 56 | summarise( 57 | n = n(), 58 | obj = median(obj), 59 | high = max(obj), 60 | low = min(obj) 61 | ) |> 62 | mutate( 63 | theta = paste0("theta1=", theta1, ", theta2=", theta2) 64 | ) |> 65 | # filter(type!="Static-ML*") |> 66 | # filter(type!="Static-ML") |> 67 | ggplot(aes(x=factor(w/1e9), y=obj, colour=type, group=type)) + 68 | geom_point() + 69 | geom_line() + 70 | geom_hline(yintercept = 0, linetype="dashed", colour="black") + 71 | coord_cartesian(ylim = c(-0.01, NA )) + 72 | facet_wrap(~theta, nrow = 1, scales="free_y") + 73 | labs(x = "Wealth by 2020 ($Billion)", y = "Realized utility") + 74 | theme( 75 | legend.position = "top", 76 | legend.title = element_blank(), 77 | )) 78 | 79 | if (FALSE) { 80 | # Old scribbles --------------- 81 | # Relevant range: One fast, one slow 82 | results_overall |> 83 | filter(theta1==0 & theta2==0.95) |> 84 | group_by(seed, w, type, 
theta1, theta2, n) |> 85 | group_by(run, w, type, theta1, theta2, n) |> 86 | summarise( 87 | n = n(), 88 | obj = median(obj), 89 | high = max(obj), 90 | low = min(obj) 91 | ) |> 92 | mutate( 93 | theta = paste0("Theta1=", theta1, ", theta2=", theta2) 94 | ) |> 95 | # filter(type!="Static-ML*") |> 96 | # filter(type!="Static-ML") |> 97 | ggplot(aes(x=factor(w), y=obj, colour=type, group=type)) + 98 | geom_point() + 99 | geom_line() + 100 | coord_cartesian(ylim = c(-0.01, NA )) + 101 | facet_wrap(~theta, ncol = 2, scales="free_y") + 102 | labs(x = "Wealth", y = "Realized utility OOS") + 103 | theme( 104 | legend.position = "top", 105 | legend.title = element_blank(), 106 | ) 107 | 108 | results_overall |> 109 | # group_by(w, type, theta, n) |> 110 | # summarise( 111 | # obj = median(obj), 112 | # high = max(obj), 113 | # low = min(obj) 114 | # ) |> 115 | # filter(type!="Static-ML*") |> 116 | # filter(type!="Static-ML") |> 117 | ggplot(aes(x=factor(w), y=obj, colour=type, group=type)) + 118 | geom_point(alpha=0.5, size=1) + 119 | geom_line() + 120 | coord_cartesian(ylim = c(-0.2, NA )) + 121 | facet_wrap(~theta, ncol = 3, scales="free_y") 122 | 123 | # Turnover --------------- 124 | results_overall |> 125 | ggplot(aes(x=factor(w), y=turnover_notional, colour=type, group=type)) + 126 | geom_point() + 127 | geom_line() + 128 | facet_wrap(~theta) + 129 | labs(x = "Wealth", y = "Realized turnover") + 130 | theme( 131 | legend.position = "top", 132 | legend.title = element_blank(), 133 | ) 134 | 135 | 136 | # Gross Sharpe ratio ------------ 137 | results_overall |> 138 | ggplot(aes(x=factor(w), y=sr_gross, colour=type, group=type)) + 139 | geom_point() + 140 | geom_line() + 141 | facet_wrap(~theta) + 142 | labs(x = "Wealth", y = "Realized gross Sharpe ratio") + 143 | theme( 144 | legend.position = "top", 145 | legend.title = element_blank(), 146 | ) 147 | 148 | results[n==500 & theta==0.05 & w==1e11][, cum_u := cumsum(utility_t)] |> 149 | ggplot(aes(eom_ret, cum_u, colour=type)) + 150 | geom_point() + 151 | geom_line() 152 | 153 | results[n==100 & theta==0.95 & w==1e11][, cum_u := cumsum(r)] |> 154 | ggplot(aes(eom_ret, cum_u, colour=type)) + 155 | geom_point() + 156 | geom_line() 157 | 158 | 159 | # Test: Expected utility from trade ---------------------------- 160 | # For Static and TPF, compute expected change in utility from trade 161 | start_end_u_fun <- function(ws) { 162 | ds <- unique(ws$eom) 163 | ds |> map(function(d) { 164 | w <- ws[eom==d] 165 | er <- data_tc_list[[as.character(d)]]$er 166 | sigma_gam <- data_tc_list[[as.character(d)]]$sigma_gam 167 | lambda <- data_tc_list[[as.character(d)]]$lambda 168 | wealth_t <- wealth[eom == d]$wealth 169 | start <- w$w_start 170 | end <- w$w 171 | u_start <- start %*% er - 1/2*(t(start) %*% sigma_gam %*% start) 172 | u_end <- end %*% er - 1/2*(t(end) %*% sigma_gam %*% end) 173 | tc <- wealth_t/2 * (end-start) %*% lambda %*% (end-start) 174 | data.table(eom=d, u_start=drop(u_start), 175 | u_end=drop(u_end), 176 | tc=drop(tc)) 177 | }) |> rbindlist() 178 | } 179 | 180 | static$w |> start_end_u_fun() |> mutate(trade_imp = u_end-u_start-tc) 181 | tpf$w |> start_end_u_fun() |> mutate(trade_imp = u_end-u_start-tc) 182 | 183 | # Analysis ----------------------------------------------------- 184 | # Mean-variance efficient portfolio OOS 185 | data_tc[eom %in% dates_oos, .(rr = sum(ret_ld1*w_tpf), er = sum(mu*w_tpf)), by=eom][, .( 186 | n = .N, 187 | mean_er = mean(er)*12, 188 | mean_rr = mean(rr)*12, 189 | sd_rr = sd(rr)*sqrt(12) 190 | )][, sr := 
mean_rr/sd_rr][] 191 | 192 | # Mean-variance efficient portfolio FULL 193 | data_tc[, .(rr = sum(ret_ld1*w_tpf), er = sum(mu*w_tpf)), by=eom][, .( 194 | n = .N, 195 | mean_er = mean(er)*12, 196 | mean_rr = mean(rr)*12, 197 | sd_rr = sd(rr)*sqrt(12) 198 | )][, sr := mean_rr/sd_rr][] 199 | 200 | data_tc[, .(rr = sum(ret_ld1*w_tpf), er = sum(mu*w_tpf)), by=eom][, .( 201 | n = .N, 202 | mean_er = mean(er)*12, 203 | mean_rr = mean(rr)*12, 204 | sd_rr = sd(rr)*sqrt(12) 205 | )][, sr := mean_rr/sd_rr][] 206 | 207 | 208 | data_tc[, .(rr = sum(r*w_tpf), er = sum(mu*w_tpf)), by=t][, cumret := cumsum(rr)] |> 209 | ggplot(aes(t, cumret)) + 210 | geom_point() 211 | # SR by stock (SR increase in x because of constant ivol) 212 | data_tc[, .( 213 | sr = mean(sr), 214 | x = mean(x) 215 | ), by = i][order(x)] 216 | 217 | 218 | data_tc[id <=2 & year(eom) %in% 2019:2020] |> ggplot(aes(eom, mu*12, colour=factor(id))) + geom_line() + geom_point() 219 | 220 | 221 | # Expected returns ------------- 222 | ers <- c(0, 0.95) |> map(function(theta) { 223 | rf <- 0 224 | risk_free <- data.table(eom=dates_full, rf=rf) 225 | # Inputs to prepare_data_list ---- 226 | wealth <- data.table( 227 | eom=dates_full, 228 | wealth=1e10, 229 | mu_ld1 = pf_set$mu) 230 | # Prepare data 231 | data_list <- sim_prep_data( 232 | t = t, 233 | n = 10, 234 | theta = theta, 235 | dates_full = dates_full, 236 | rf = rf, 237 | seed = set$seed_no 238 | ) 239 | data_list$data |> mutate(theta=theta) 240 | }, .progress = T) |> rbindlist() 241 | 242 | ers[id==1] |> 243 | ggplot(aes(eom, pred_ld1, colour=factor(id))) + 244 | geom_line() + 245 | facet_wrap(~theta) 246 | 247 | ers[, .(ar1=cor(pred_ld1, lag(pred_ld1, 1), use="complete.obs")), by = .(id, theta)] |> 248 | group_by(theta) |> 249 | summarise(ar1=mean(ar1)) |> 250 | ggplot(aes(theta, ar1)) + 251 | geom_point() + 252 | geom_abline(intercept=0, slope=1, linetype="dashed") 253 | 254 | # Trade aggresiveness of TPF ------------------------------- 255 | tpf_ws <- c(0, 0.95) |> map(function(theta) { 256 | rf <- 0 257 | risk_free <- data.table(eom=dates_full, rf=rf) 258 | # Inputs to prepare_data_list ---- 259 | wealth <- data.table( 260 | eom=dates_full, 261 | wealth=1e10, 262 | mu_ld1 = pf_set$mu) 263 | # Prepare data 264 | data_list <- sim_prep_data( 265 | t = t, 266 | n = 100, 267 | theta = theta, 268 | dates_full = dates_full, 269 | rf = rf, 270 | seed = set$seed_no 271 | ) 272 | # TPF 273 | data_tc_list <- data_list$data[valid==T] %>% prepare_data_list(dates=dates_oos, wealth=wealth, risk_free=risk_free, 274 | barra_cov = data_list$barra_cov, gamma_rel = pf_set$gamma_rel) 275 | tpf <- data_list$data %>% tpf_implement(data_list = data_tc_list, dates = dates_oos, wealth = wealth) 276 | # Add expected returns 277 | data_list$data[, .(id, eom, x1, x2, pred_ld1)][tpf$w, on = .(id, eom)] |> mutate(theta=theta) 278 | }, .progress = T) |> rbindlist() 279 | 280 | tpf_ws[id<=3] |> 281 | pivot_longer(c(w, x1, pred_ld1)) |> 282 | ggplot(aes(eom, value, colour=factor(id))) + 283 | geom_line() + 284 | facet_wrap(name~theta, scales="free_y", ncol = 2) 285 | 286 | tpf_ws[, .( 287 | turnover = sum(abs(w-w_start)) 288 | ), by = .(theta, eom)][, .(turnover = mean(turnover)), by = theta] 289 | } 290 | 291 | -------------------------------------------------------------------------------- /simulations/simulations.R: -------------------------------------------------------------------------------- 1 | # This file relative to simulation.R: Run for 1, rather than multiple seeds 2 | # TO DO 3 | # - Prepare 
portfolio-ML for real data 4 | library(MASS) 5 | library(expm) 6 | library(tidyverse) 7 | library(data.table) 8 | source("0 - General functions.R") 9 | source("0 - Return prediction functions.R") 10 | source("0 - Portfolio choice functions.R") 11 | Rcpp::sourceCpp("sqrtm_cpp.cpp") 12 | # Goal: Understand the performance of the different methods with a fixed DGP 13 | # Setup: One-factor that drives expected returns and covariances 14 | # Result: Evaluation of how well Static/Portfolio/multiperiod performs 15 | 16 | add_noise_feat <- T 17 | 18 | source("simulations/sim_functions.R") 19 | # source("simulations/m2_fun_new_2.R") 20 | # source("simulations/new_static_fun.R") 21 | # source("simulations/new_multi_fun.R") 22 | # source("simulations/memory_fun_new.R") 23 | 24 | # Submit a config file with slurm job? ----------- 25 | if (TRUE) { 26 | # Prepare for slurm to submit configuration files flexibly 27 | args <- commandArgs(trailingOnly = TRUE) 28 | config_file <- args[1] # Get the first argument, which is the config file name 29 | 30 | config_params <- read_config(config_file) 31 | } else { 32 | config_params <- list( 33 | "seed" = 14 34 | ) 35 | } 36 | 37 | # Settings ------- 38 | set <- list( 39 | seed_no = 1, 40 | split = list( 41 | train_end = as.Date("1970-12-31"), # Change to 1994-12-31? 42 | test_end = as.Date("2020-12-31"), 43 | val_years = 10 44 | ), 45 | pf = list( 46 | dates = list( 47 | start_year = 1971, 48 | end_yr = 2020, 49 | split_years = 10 50 | ), 51 | hps = list( 52 | cov_type = "cov_add", 53 | static = list( 54 | k = c(1/1, 1/6, 1/12), 55 | u = c(1), 56 | g = c(0) 57 | ), 58 | m1 = list( 59 | k = c(1), 60 | u = c(1), 61 | g = c(0) 62 | ) 63 | ) 64 | ), 65 | pf_ml = list( 66 | g_vec = exp(-3:-2), 67 | p_vec = c(2^(6:9)), 68 | l_vec = c(0, exp(seq(-10, 10, length.out = 100))), 69 | orig_feat = F, # Should original features be added to the list of random features? 70 | scale = T 71 | ) 72 | ) 73 | 74 | t <- 12*70 75 | 76 | # Important dates ---------------------- 77 | dates_full <- rev(set$split$test_end+1 - months(1:t))-1 78 | 79 | hp_years <- seq(from=set$pf$dates$start_year, to=set$pf$dates$end_yr) 80 | start_oos <- set$pf$dates$start_year+set$pf$dates$split_years 81 | 82 | dates_oos <- seq.Date(from = as.Date(paste0(start_oos, "-", "01-01")), to = set$split$test_end+1-months(1), by = "1 month")-1 83 | dates_hp <- seq.Date(from = as.Date(paste0(min(hp_years), "-", "01-01")), to = set$split$test_end+1-months(1), by = "1 month")-1 84 | 85 | # Investor settings ------------------ 86 | pf_set <- list( 87 | # wealth = 1e11, 88 | gamma_rel = 10, 89 | mu = 0, # How much do we expect the portfolio to grow each month? market[year(eom_ret)>=1980, mean(mkt_vw_exc, na.rm=T)] 90 | # aim_hor = max_t_hor-1, 91 | lb_hor = 11 # Change?? 
92 | ) 93 | 94 | # Simulation ------------------------------------- 95 | set$n_stocks <- 500 96 | search_grid <- expand.grid( 97 | w = 10^(8:12), 98 | theta1 = c(0, 0.95), 99 | theta2 = c(0, 0.95), 100 | n = set$n_stocks 101 | ) |> filter(!(theta1==0 & theta2==0.95)) # Effectively the same as theta1=0.95, theta2=0, so no need to run it 102 | 103 | do_static <- T 104 | do_tpf <- T 105 | do_mp <- T 106 | do_pf <- T 107 | 108 | # Takes 2-6 hours with 500 stocks, memory utilized is 7.54 GB 109 | # With 3 repetitions and 500 stocks, I suggest requesting 15GB for 23 hours 110 | # - It took between 8 and 14 hours with 32 CPUs 111 | search_params <- search_grid[config_params$seed, ] 112 | tictoc::tic(paste0("Run with parameters: wealth=", search_params$w, ", theta1=", search_params$theta1, ", theta2=", search_params$theta2, ", n=", search_params$n)) 113 | results <- 1:3 |> map(function(seed) { 114 | rf <- 0 115 | risk_free <- data.table(eom=dates_full, rf=rf) 116 | # Inputs to prepare_data_list ---- 117 | wealth <- data.table( 118 | eom=dates_full, 119 | wealth=search_params$w, 120 | mu_ld1 = pf_set$mu) 121 | # Prepare data 122 | system.time(data_list <- sim_prep_data( 123 | t = t, 124 | n = search_params$n, 125 | dates_full = dates_full, 126 | dolvol = 84*1e6, # Median dolvol for valid stocks by 2020 127 | rf = rf, 128 | tv_sds = rep(0.25, 2), 129 | tv_thetas = c(search_params$theta1, search_params$theta2), 130 | tv_rps = rep(0.05, 2), 131 | disp_x = 0.5, 132 | seed = seed, 133 | add_noise_feat = add_noise_feat, 134 | feat_standardize = T 135 | )) 136 | lambda_dates <- unique(data_list$data$eom) 137 | lambda_list <- lambda_dates |> map(function(d) { 138 | x <- data_list$data[eom==d, .(id, lambda)][order(id)] 139 | ids <- x$id 140 | x <- x$lambda 141 | names(x) <- ids 142 | return(x) 143 | }) |> setNames(as.character(lambda_dates)) 144 | pfs <- data.table() 145 | if (do_static) { 146 | # Static portfolio 147 | static <- static_implement( 148 | data_tc = data_list$data, 149 | cov_list = data_list$barra_cov, 150 | lambda_list = lambda_list, 151 | rf = risk_free, # Data 152 | wealth = wealth, 153 | mu = pf_set$mu, 154 | gamma_rel = pf_set$gamma_rel, # Investor 155 | dates_full = dates_full, 156 | dates_oos = dates_oos, 157 | dates_hp = dates_hp, 158 | hp_years = hp_years, # Dates 159 | k_vec = set$pf$hps$static$k, 160 | u_vec = set$pf$hps$static$u, 161 | g_vec = set$pf$hps$static$g, 162 | cov_type = set$pf$hps$cov_type, 163 | validation = NULL 164 | ) 165 | static_raw <- static$hps[eom_ret %in% static$pf$eom_ret & k==1 & g==0 & u==1, .(eom_ret=as.Date(eom_ret), inv, shorting, turnover, r, tc)][, type := "Static-ML"] 166 | pfs <- pfs |> rbind(static$pf, static_raw) 167 | } 168 | if (do_tpf) { 169 | # TPF 170 | tpf <- data_list$data |> tpf_implement(cov_list = data_list$barra_cov, wealth = wealth, dates = dates_oos, gam = pf_set$gamma_rel) 171 | pfs <- pfs |> rbind(tpf$pf) 172 | } 173 | if (do_mp) { 174 | # Multiperiod-ML 175 | m1 <- mp_implement( 176 | data_tc = data_list$data, 177 | cov_list = data_list$barra_cov, 178 | lambda_list = lambda_list, 179 | rf = risk_free, # Data 180 | wealth = wealth, 181 | mu = pf_set$mu, 182 | gamma_rel = pf_set$gamma_rel, # Investor 183 | dates_full = dates_full, 184 | dates_oos = dates_oos, 185 | dates_hp = dates_hp, 186 | hp_years = hp_years, # Dates 187 | k_vec = set$pf$hps$m1$k, 188 | u_vec = set$pf$hps$m1$u, 189 | g_vec = set$pf$hps$m1$g, 190 | cov_type = set$pf$hps$cov_type, 191 | validation = NULL, 192 | iter = 10, 193 | K = 12 194 | ) 195 | pfs <- pfs |> 
rbind(m1$pf) 196 | } 197 | if (do_pf) { 198 | m2 <- pfml_implement( 199 | data_tc = data_list$data, 200 | cov_list = data_list$barra_cov, 201 | lambda_list = lambda_list, 202 | features = data_list$features, 203 | risk_free = risk_free, # Data 204 | wealth = wealth, 205 | mu = pf_set$mu, 206 | gamma_rel = pf_set$gamma_rel, # Investor 207 | dates_full = dates_full[13:length(dates_full)], 208 | dates_oos = dates_oos, 209 | lb = pf_set$lb_hor, 210 | hp_years = hp_years, # Dates 211 | rff_feat = T, 212 | g_vec = set$pf_ml$g_vec, 213 | p_vec = set$pf_ml$p_vec, 214 | l_vec = set$pf_ml$l_vec, 215 | scale = set$pf_ml$scale, 216 | orig_feat = set$pf_ml$orig_feat, # Hyperparameters 217 | iter = 10, 218 | hps = NULL, 219 | balanced = T, # With 10 stocks, setting balanced=T increases time from 1.5 to 3mins. But with 100 stocks, it made almost no difference? Wierd 220 | seed = set$seed_no # Other 221 | ) 222 | pfs <- pfs |> rbind(m2$pf) 223 | } 224 | # Output 225 | pfs |> mutate(seed=seed, w=search_params$w, n=search_params$n, theta1=search_params$theta1, theta2=search_params$theta2) 226 | }, .progress = paste0("Progress run ", config_params$seed)) |> rbindlist() |> mutate(run=config_params$seed) 227 | tictoc::toc() 228 | if (T) { 229 | results |> fwrite(paste0("simulations/results/results", Sys.Date(), 230 | "_run", config_params$seed, 231 | "_nstocks", set$n_stocks, ".csv")) 232 | } -------------------------------------------------------------------------------- /slurm_build_portfolios.R: -------------------------------------------------------------------------------- 1 | # Run this code to fit models in parallel on a slurm HPC 2 | source("Main.R") 3 | 4 | # Submit a config file with slurm job? ----------- 5 | if (TRUE) { 6 | # Prepare for slurm to submit configuration files flexibly 7 | args <- commandArgs(trailingOnly = TRUE) 8 | config_file <- args[1] # Get the first argument, which is the config file name 9 | 10 | config_params <- read_config(config_file) 11 | # Time notes ---- 12 | # Running all three methods on quantiles splits of the stocks took roughly 20 hours 13 | # Runing just Static on all stocks took 3.5 hours, PFML took 65 hours, and Multiperiod-ML likely takes longer.... 14 | 15 | } else { 16 | config_params <- list( 17 | "size_screen"="perc_low50_high100_min50", #"top100" 18 | "wealth" = 1e10, 19 | "gamma_rel" = 10, 20 | "industry_cov" = T, 21 | "update_mp" = F, 22 | "update_base" = F, 23 | "update_fi_base" = F, 24 | "update_fi_ief" = F, 25 | "update_fi_ret" = F 26 | ) 27 | } 28 | 29 | settings$screens$size_screen <- config_params$size_screen 30 | settings$cov_set$industries <- config_params$industry_cov 31 | pf_set$wealth <- config_params$wealth 32 | pf_set$gamma_rel <- config_params$gamma_rel 33 | 34 | 35 | # Run settings -------------------- 36 | get_from_path_model <- "Data/Generated/Models/20240424-all" # From which directory should the data be loaded? 
(if update$x==F) 37 | run_sub <- F # T when testing code, F otherwise 38 | 39 | # Create folder to save output -------------------- 40 | folder_naming <- function(config_params) { 41 | paste0( 42 | "Data/Generated/Portfolios/", format(Sys.time(), "%Y%m%d-%H%M"), # Run timestamp: date plus hour-minute 43 | "_WEALTH", config_params$wealth, 44 | "_GAMMA", config_params$gamma_rel, 45 | "_SIZE", config_params$size_screen, 46 | "_IND", config_params$industry_cov 47 | ) 48 | } 49 | output_path <- config_params |> folder_naming() 50 | if (!dir.exists(output_path)) { 51 | # Create relevant directories 52 | dir.create(output_path) # Overall 53 | # Save Main.R and settings file to new directory 54 | file.copy("Main.R", paste0(output_path, "/Main.R")) 55 | file.copy("slurm_build_portfolios.R", paste0(output_path, "/slurm_build_portfolios.R")) 56 | settings |> saveRDS(paste0(output_path, "/settings.RDS")) 57 | pf_set |> saveRDS(paste0(output_path, "/pf_set.RDS")) 58 | } 59 | 60 | # Run code -------------------- 61 | tic("Total run time") 62 | 63 | source("1 - Prepare Data.R", echo = T) 64 | # source("2 - Fit Models.R", echo = T) # Loaded in prepare portfolio data 65 | source("3 - Estimate Covariance Matrix.R", echo = T) 66 | source("4 - Prepare Portfolio Data.R", echo = T) 67 | # Base case 68 | if (config_params$update_base) { 69 | tic("Base case:") 70 | source("5 - Base case.R", echo = T) 71 | toc() 72 | } 73 | # Feature importance - base case 74 | if (config_params$update_fi_base) { 75 | tic("Feature importance - base:") 76 | source("5 - Feature importance base.R", echo = T) 77 | toc() 78 | } 79 | # Feature importance - IEF 80 | if (config_params$update_fi_ief) { 81 | tic("Feature importance - IEF:") 82 | source("5 - Feature importance IEF.R", echo = T) 83 | toc() 84 | } 85 | # Feature importance - Expected return models 86 | if (config_params$update_fi_ret) { 87 | tic("Feature importance - returns:") 88 | source("5 - Feature importance ret.R", echo = T) 89 | toc() 90 | } 91 | 92 | toc() 93 | -------------------------------------------------------------------------------- /slurm_fit_models.R: -------------------------------------------------------------------------------- 1 | # Run this code to fit models in parallel on a slurm HPC 2 | # Note: when distributing 12 models across 4 instances, request 20 hours and 150GB memory on each instance 3 | source("Main.R") 4 | 5 | # Run settings -------------------- 6 | settings$screens$size_screen <- "all" 7 | settings$screens$nyse_stocks <- F # Also include non-NYSE stocks when fitting expected return models 8 | run_sub <- F # T when testing code, F otherwise 9 | 10 | # Range of Prediction Horizons ----- 11 | search_grid <- tibble( 12 | name = paste0("m", 1:12), 13 | horizon = as.list(1:12) 14 | ) 15 | # Read configuration 16 | if (TRUE) { 17 | # Prepare for slurm to submit configuration files flexibly 18 | args <- commandArgs(trailingOnly = TRUE) 19 | config_file <- args[1] # Get the first argument, which is the config file name 20 | 21 | config_params <- read_config(config_file) 22 | } else { 23 | config_params <- list( 24 | "horizon" = 1 25 | ) 26 | } 27 | search_grid <- search_grid |> filter(name %in% paste0("m", config_params$horizon)) 28 | # Create output folder if missing 29 | output_path <- paste0("Data/Generated/Models/", format(Sys.time(), "%Y%m%d"), "-", settings$screens$size_screen) 30 | if (!dir.exists(output_path)) { 31 | # Create relevant directories 32 | dir.create(output_path) # Overall 33 | # Save Main.R and settings file to new directory 34 | file.copy("Main.R",
paste0(output_path, "/Main.R")) 35 | file.copy("slurm_fit_models.R", paste0(output_path, "/slurm_fit_models.R")) 36 | settings |> saveRDS(paste0(output_path, "/settings.RDS")) 37 | } 38 | tictoc::tic("Total run time") 39 | source("1 - Prepare Data.R", echo = T) 40 | source("2 - Fit Models.R", echo = T) 41 | -------------------------------------------------------------------------------- /sqrtm_cpp.cpp: -------------------------------------------------------------------------------- 1 | #include <RcppArmadillo.h> 2 | // [[Rcpp::depends(RcppArmadillo)]] 3 | 4 | //' Calculate square root of a matrix from Armadillo 5 | //' @export 6 | // [[Rcpp::export]] 7 | arma::cx_mat sqrtm_cpp (arma::mat x) { 8 | arma::cx_mat x_sqrt; 9 | x_sqrt = arma::sqrtmat(x); 10 | return(x_sqrt); 11 | } --------------------------------------------------------------------------------
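Both C++ files are compiled from R with Rcpp::sourceCpp() (simulations.R does exactly that for sqrtm_cpp.cpp). The sketch below is illustrative rather than part of the repository: the inputs are made up, and it assumes the Rcpp and RcppArmadillo packages are installed.

# Illustrative usage of the two C++ helpers from R (made-up inputs)
Rcpp::sourceCpp("ewma.cpp")        # exports ewma_c()
Rcpp::sourceCpp("sqrtm_cpp.cpp")   # exports sqrtm_cpp(); requires RcppArmadillo
x <- rnorm(240, sd = 0.05)                       # hypothetical monthly return series
vol <- ewma_c(x, lambda = 0.94, start = 12)      # EWMA volatility; the first 'start' entries are NA (burn-in)
S <- matrix(c(4, 1, 1, 9), nrow = 2)             # hypothetical covariance matrix
S_half <- Re(sqrtm_cpp(S))                       # sqrtm_cpp() returns a complex matrix; the real part is the square root for a PSD input
max(abs(S_half %*% S_half - S))                  # close to zero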