├── README.md └── Fama-French-Replication.R /README.md: -------------------------------------------------------------------------------- 1 | # Fama-French-Replication 2 | -------------------------------------------------------------------------------- /Fama-French-Replication.R: -------------------------------------------------------------------------------- 1 | ######################################################################################################## 2 | #------------------------------------------------------------------------------------------------------- 3 | # Author : A. John Woodill 4 | # Date : 04/07/2015 5 | # Filename : Fama-French-Replication.R 6 | # Code : Fama-French '92 Replication 7 | # Sections : 8 | # (1) CRSP Data setup 9 | # (2) Table 1 pre-beta, post-beta, post-beta (ln(ME)) 10 | # (3) 11 | # (**) CRSP and Compustat Data Merge) 12 | # (**) Data Wrangling 13 | #------------------------------------------------------------------------------------------------------- 14 | ######################################################################################################## 15 | 16 | rm(list=ls(all=TRUE)) 17 | library(readr) # For reading the data 18 | library(dplyr) # Data wrangling 19 | 20 | #----------------------------------- 21 | # (1) CRSP Data setup 22 | #----------------------------------- 23 | 24 | setwd("/run/media/john/1TB/Projects/Fama-French Replicatoin/") 25 | crsp <- read_csv(("Crsp.csv")) 26 | # crsp2 <- read.csv("Crsp.csv", stringsAsFactors = FALSE) 27 | 28 | # Convert colnames to lower case 29 | 30 | colnames(crsp) <- tolower(colnames(crsp)) 31 | 32 | # Fix missing fyears 33 | crsp$fyear <- substr(crsp$date, 1, 4) 34 | 35 | # Add Month 36 | 37 | crsp$month <- substr(crsp$date, 5, 6) 38 | # Only keep those stocks with returns at the end of June 39 | # crsp <- crsp %>% 40 | # group_by(permco, fyear) %>% 41 | # mutate(month = substr(date, 5, 6), 42 | # has_June = any(month == "06")) 43 | # 44 | # crsp <- filter(crsp, has_June == TRUE) 45 | 46 | # Only keep those stocks with returns at the end of December 47 | # crsp <- crsp %>% 48 | # group_by(permco, fyear) %>% 49 | # mutate(month = substr(date, 5, 6), 50 | # has_Dec = any(month == "12")) 51 | # 52 | # crsp <- filter(crsp, has_Dec == TRUE) 53 | 54 | # Calculate Market Equity (ME) : ME = prc*shrout 55 | crsp$me <- (abs(crsp$prc)*crsp$shrout)/1000 56 | crsp <- filter(crsp, me > 0) # Ensure has me 57 | 58 | # Remove all obs outside July 1962 - Dec 1990 ( Because of lag need to include 1962) 59 | crsp <- filter(crsp, date >= 19620700 & date <= 19901231) 60 | 61 | # Keep only share code 10, 11 62 | crsp <- filter(crsp, shrcd == 10 | shrcd == 11) 63 | 64 | # Remove ret with C as value 65 | crsp <- filter(crsp, ret != "C") 66 | crsp$ret <- as.numeric(crsp$ret) 67 | 68 | # Write to full sample 69 | write_csv(crsp, "crsp_92_data.csv") 70 | 71 | #--------------------------------------------------------- 72 | # (2) Table 1 pre-beta, post-beta, post-beta (ln(ME)) 73 | #--------------------------------------------------------- 74 | 75 | library(dplyr) 76 | library(readr) 77 | 78 | setwd("/run/media/john/1TB/Projects/Fama-French Replicatoin/") 79 | crsp <- read_csv("crsp_92_data.csv") 80 | 81 | # Select only columns needed 82 | crsp <- select(crsp, permco,date, ret, vwretd, ewretd, fyear, month, me) 83 | 84 | # Get 10% decile ME for each June and assign to portfolio 85 | crsp$fyear <- as.integer(crsp$fyear) 86 | crsp$month <- as.integer(crsp$month) 87 | 88 | # Build portfolios based on ME 89 | crsp$portfo=cut(crsp$me, breaks=quantile(crsp$me,probs=seq(0,1,1/10),na.rm=T),labels=F) 90 | 91 | # Lag ewretd 92 | crsp <- crsp %>% 93 | group_by(permco) %>% 94 | arrange(desc(date)) %>% 95 | mutate(lagewretd = lag(ewretd)) 96 | 97 | # Remove initial lagged variables 98 | crsp <- filter(crsp, lagewretd != "NA") 99 | 100 | ## convert fyear to a proper number and then exploit for sorting 101 | crsp <- crsp %>% 102 | mutate(fyear = fyear %>% as.integer) %>% 103 | arrange(fyear, month) 104 | 105 | ## figure out cumulative months available for each year (for each permco) 106 | years <- crsp %>% 107 | group_by(permco, fyear) %>% 108 | summarize(n = n()) %>% 109 | mutate(n_cum = cumsum(n)) 110 | 111 | # function to get coefficients 112 | # (further optimization should probably focus on improving this function) 113 | get_coefs <- function(.permco, .fyear, .n_cum){ 114 | if(.n_cum < 24) { 115 | data_frame(`(Intercept)` = NA_real_, ewretd = NA_real_, lagewretd = NA_real_) 116 | } else { 117 | my_dat <- crsp %>% 118 | filter(permco == .permco, fyear <= .fyear) %>% 119 | mutate(rn = row_number(desc(date))) 120 | lm(ret ~ ewretd + lagewretd, my_dat, subset = rn < 61) %>% 121 | coef %>% 122 | as.list %>% 123 | as_data_frame 124 | } 125 | } 126 | 127 | # dplyr option (Takes ~ 2 hours) 128 | models_dplyr <- years %>% 129 | group_by(fyear, permco) %>% 130 | do(get_coefs(.$permco, .$fyear, .$n_cum)) 131 | 132 | # Remove NA's 133 | models_dplyr <- filter(models_dplyr, ewretd != "NA" | lagewretd != "NA") 134 | models_dplyr$sum <- models_dplyr$ewretd + models_dplyr$lagewretd 135 | 136 | # Write out to save 137 | write.csv(models_dplyr, "prerank_betas.csv") 138 | 139 | # Read prerank 140 | models_dplyr <- read_csv("prerank_betas.csv") 141 | 142 | 143 | # Merge with crsp data set 144 | merge <- select(crsp, permco, portfo, me, ret, fyear, month) 145 | prerank <- inner_join(models_dplyr, merge, by = "permco") 146 | 147 | # Sum ewretd and lagewretd to get pre-beta 148 | prerank_betas <- prerank %>% 149 | group_by(permco) %>% 150 | summarize(pre_beta = mean(sum), ret = mean(ret), me = mean(me), ewr = mean(ewretd)) 151 | 152 | # Rank pre-betas and me 153 | prerank_betas$beta_rank=cut(prerank_betas$pre_beta, breaks=quantile(prerank_betas$pre_beta, probs=seq(0,1,1/10), na.rm=T),labels=F) 154 | prerank_betas$portfo=cut(prerank_betas$me, breaks=quantile(prerank_betas$me,probs=seq(0,1,1/10),na.rm=T),labels=F) 155 | 156 | prerank_betas <- filter(prerank_betas, beta_rank != "NA" & portfo != "NA") 157 | 158 | # Build data frame for pre-ranking betas 159 | 160 | df <- prerank_betas %>% 161 | group_by(portfo, beta_rank) %>% 162 | summarise(mer = mean(ewr)) 163 | 164 | df <- prerank_betas %>% 165 | group_by(beta_rank) %>% 166 | summarise(mer = mean(ewr)) 167 | df 168 | 169 | table1a <- read_csv("/home/john/Dropbox/UHM/Classes/Fin 701 - International Finance Theory/Replication/Table1_A.csv") 170 | table1a 171 | 172 | # Table 1B - Post Ranking Betas 173 | 174 | # Function to get coef 175 | get_postcoefs <- function(portfo){ 176 | my_dat <- prerank_betas %>% 177 | filter(portfo == portfo) %>% 178 | lm(ret ~ ewr, my_dat) %>% 179 | coef %>% 180 | as.list %>% 181 | as_data_frame 182 | } 183 | 184 | 185 | postrank <- prerank_betas %>% 186 | group_by(portfo) %>% 187 | do(get_postcoefs(.$portfo)) 188 | 189 | postrank <- prerank_betas 190 | postbetas <- data.frame(LowB = numeric(), 191 | B2 = numeric(), 192 | B3 = numeric(), 193 | B4 = numeric(), 194 | B5 = numeric(), 195 | B6 = numeric(), 196 | B7 = numeric(), 197 | B8 = numeric(), 198 | B9 = numeric(), 199 | B10 = numeric()) 200 | 201 | size <- data.frame(LowB = numeric(), 202 | B2 = numeric(), 203 | B3 = numeric(), 204 | B4 = numeric(), 205 | B5 = numeric(), 206 | B6 = numeric(), 207 | B7 = numeric(), 208 | B8 = numeric(), 209 | B9 = numeric(), 210 | B10 = numeric()) 211 | 212 | # Get Post Rank Betas 213 | for (i in unique(postrank$portfo)){ 214 | for (j in unique(postrank$beta_rank)) { 215 | frame <- filter(postrank, portfo == i & beta_rank == j) 216 | postbetas[[i,j]] <- as.numeric(lm(ret ~ ewr, data = frame)$coefficients[2]*100) 217 | } 218 | } 219 | 220 | # All post rank betas for portfolio rank 221 | for (i in unique(postrank$portfo)){ 222 | frame <- filter(postrank, portfo == i) 223 | betas[[i]] <- lm(ret ~ ewr, data = frame)$coefficients[2] 224 | } 225 | 226 | 227 | # Get all post rank betas for beta rank 228 | for (i in unique(postrank$beta_rank)){ 229 | frame <- filter(postrank, beta_rank == i) 230 | betas[[i]] <- lm(ret ~ ewr, data = frame)$coefficients[2] 231 | } 232 | 233 | # Get all betas for portfo data frame 234 | for (i in unique(postrank$portfo)){ 235 | frame <- filter(postrank, portfo == i & beta_rank == j) 236 | size[[i,j]] <- mean(log(frame$me)) 237 | } 238 | 239 | # Get all betas for beta data frame 240 | for (i in unique(postrank$beta_rank)){ 241 | frame <- filter(postrank, portfo == i & beta_rank == j) 242 | size[[i,j]] <- mean(log(frame$me)) 243 | } 244 | } 245 | 246 | portsize <- postrank %>% 247 | group_by(portfo) %>% 248 | summarize(sizeme = mean(log(me))) 249 | 250 | portsize <- postrank %>% 251 | group_by(beta_rank) %>% 252 | summarize(sizeme = mean(log(me))) 253 | 254 | ### NOT FINISHED 255 | #---------------------------------------- 256 | # (**) CRSP and Compustat Data Merge 257 | #---------------------------------------- 258 | 259 | 260 | setwd("/run/media/john/1TB/Projects/Fama-French Replicatoin/") 261 | 262 | compustat <- read.csv("Compustat.csv", header = TRUE, stringsAsFactors = FALSE) 263 | crsp <- read.csv("Crsp.csv", header = TRUE, stringsAsFactors = FALSE) 264 | 265 | # Remove last cusip digit from compustat to merge with crsp 266 | 267 | compustat$cusip <- substr(compustat$cusip, 1, nchar(compustat$cusip) - 1) 268 | 269 | # Merge by cusip and date, NA -> 0, subset for years and save 270 | 271 | data <- full_join(compustat, crsp, by=c("cusip" = "CUSIP", "datadate" = "date")) 272 | data[is.na(data)] <- 0 273 | data <- filter(data, datadate >= 19580731 & datadate <= 19890631) 274 | write.csv(data, "compustat_crsp_merged_1958-1989.csv") 275 | 276 | # Read in data from csv 277 | 278 | data <- read.csv("compustat_crsp_merged_1958-1989.csv") 279 | 280 | #---------------------------------------- 281 | # (**) Data Wrangling 282 | #---------------------------------------- 283 | # Fix missing fyears 284 | data$fyear <- substr(data$datadate, 1, 4) 285 | 286 | # Fix data elements 287 | 288 | data$cusip <- as.numeric(data$cusip) 289 | data$fyear <- as.numeric(data$fyear) 290 | 291 | # Only keep those stocks with returns at the end of June 292 | data %>% 293 | group_by(cusip, fyear) %>% 294 | mutate(month = substr(datadate, 5, 6), 295 | has_June = any(month == "06")) -> data 296 | 297 | data <- filter(data, has_June == TRUE) 298 | 299 | # Only keep those stocks with returns at the end of December 300 | data %>% 301 | group_by(cusip, fyear) %>% 302 | mutate(month = substr(datadate, 5, 6), 303 | has_Dec = any(month == "12")) -> data 304 | 305 | data <- filter(data, has_Dec == TRUE) 306 | 307 | # Calculate Book Equity (BE) : BE = CEQ + TXDB 308 | data$be <- data$ceq + data$txdb 309 | 310 | # Calculate Market Equity (ME) : ME = prcc_f*csho 311 | data$me <- data$prcc_f*data$csho 312 | data <- filter(data, me > 0) # Ensure has me 313 | 314 | # Calculate Book-to-Market (BE/ME) : be / me 315 | data$beme <- data$be/data$me 316 | 317 | # Calculate EP : EP = IB + TXDFED + TXDFO + TXDS - DVP/PRCC_F 318 | data$ep <- data$ib + data$txdfed + data$txdfo + data$txds - (data$dvp/data$prcc_f) 319 | 320 | 321 | ## Monthly returns for at least 24 of 60 months preceding July of year t 322 | 323 | # Need to ungroup data to run this 324 | data <- ungroup(data) 325 | 326 | data <- mutate(data, month = as.numeric(substr(datadate, 5, 6))) %>% 327 | mutate(datadate = as.POSIXct(gsub("^(\\d{4})(\\d{2}).*$", "\\1-\\2-01", datadate), 328 | format("%Y-%m-%d"), tz = "GMT")) %>% 329 | arrange(cusip, datadate) %>% 330 | filter(between(datadate, 331 | datadate[tail(which(month == 6, arr.ind = TRUE), n = 1)] - (60*60*24*30*60), 332 | datadate[tail(which(month == 6, arr.ind = TRUE), n = 1)] -(60*60*24*30*24))) %>% 333 | group_by(cusip) %>% 334 | mutate(check = abs(lead(month)-month) == 11|abs(lead(month)-month) == 1|abs(lead(month)-month) == 0) %>% 335 | filter(all(check == TRUE | check %in% NA)) 336 | 337 | # Write out sample dataset after all checks and ready for regressions (obs = 11,721) 338 | write.csv(data, "92_data.csv") 339 | 340 | 341 | 342 | 343 | ########################## 344 | --------------------------------------------------------------------------------