├── Code - No Passwords ├── FAMA_FRENCH_FACTORS.R ├── MAKE CRSP_COMP_LINK FILE.R ├── MAKE CRSP_COMP_MERGED.R ├── MAKE PERMNO_TO_GVKEY.R ├── MAKE_AMIHUD_ILLIQUIDITY_MEASURE.R ├── MAKE_INST_SHARES.R └── SCRAPE_FF_INDS.R └── README.md /Code - No Passwords/FAMA_FRENCH_FACTORS.R: -------------------------------------------------------------------------------- 1 | ############################################################################################################### 2 | ### REPLICATE FAMA-FRENCH-CARHART 4-FACTOR MODEL #### 3 | # Code by Wayne Chang (https://sites.google.com/site/waynelinchang/home) 4 | # Latest update assistance from Yifan Liu 5 | # Last Update: June 22, 2018 6 | # Based on methodology described on Ken French website and relevant Fama-French papers 7 | 8 | ############################################################################################################### 9 | ### SETUP ### 10 | 11 | library(tidyverse); library(zoo) 12 | require(data.table); library(lubridate) 13 | setwd("") 14 | 15 | ### WRDS CONNECTION ### 16 | 17 | library(RPostgres) 18 | wrds <- dbConnect(Postgres(), 19 | host='wrds-pgdata.wharton.upenn.edu', 20 | port=9737, 21 | user='', 22 | password='', 23 | dbname='wrds', 24 | sslmode='require') 25 | 26 | ############################################################################################################### 27 | ### LOAD COMPUSTAT FROM WRDS ### 28 | # Downloads Compustat and Compustat/Crsp merging link from WRDS 29 | # adds the linked permno to the compustat dataset 30 | # no filtering except must have PERMNO link 31 | 32 | # retrieve Compustat annual data (takes 10mins each below) 33 | res <- dbSendQuery(wrds,"select GVKEY, CUSIP, DATADATE, FYR, FYEAR, SICH, NAICSH, 34 | AT, LT, SEQ, CEQ, PSTKL, PSTKRV, PSTK, TXDITC, TXDB, ITCB, 35 | REVT, COGS, XINT, XSGA, IB, TXDI, DVC, ACT, CHE, LCT, 36 | DLC, TXP, DP, PPEGT, INVT 37 | from COMP.FUNDA 38 | where INDFMT='INDL' and DATAFMT='STD' and CONSOL='C' and POPSRC='D'") # STD is unrestatd data 39 | data.comp.funda <- dbFetch(res, n = -1) # n=-1 denotes no max but retrieve all record 40 | save(data.comp.funda, file = "180619 data.comp.funda.RData") 41 | 42 | # retrieve Compustat quarterly data 43 | res <- dbSendQuery(wrds,"select GVKEY, CUSIP, DATADATE, FYR, FYEARQ, 44 | ATQ, LTQ, SEQQ, CEQQ, PSTKRQ, PSTKQ, TXDITCQ, TXDBQ, 45 | REVTQ, COGSQ, XINTQ, XSGAQ, IBQ, TXDIQ, ACTQ, CHEQ, LCTQ, 46 | DLCQ, TXPQ, DPQ, PPEGTQ, INVTQ, EPSPXQ, RDQ 47 | from COMPM.FUNDQ 48 | where INDFMT='INDL' and DATAFMT='STD' and CONSOL='C' and POPSRC='D'") # STD is unrestatd data 49 | data.comp.fundq <- dbFetch(res, n = -1) # n=-1 denotes no max but retrieve all record 50 | save(data.comp.fundq, file = "180619 data.comp.fundq.RData") 51 | 52 | # retrieve Merged Compustat/CRSP link table 53 | res <- dbSendQuery(wrds,"select GVKEY, LPERMNO, LINKDT, LINKENDDT, LINKTYPE, LINKPRIM 54 | from crsp.ccmxpf_lnkhist") 55 | data.ccmlink <- dbFetch(res, n = -1) 56 | save(data.ccmlink, file = "180619 data.ccmlink.RData") 57 | 58 | # Merge the linked Permno onto Compustat dataset 59 | # compared to SAS code based on WRDS FF Research macro, I don't include all Linktypes but add J Linkprim 60 | # including J linkprim is key bc/ allows me to get the post-2010 Berkshire history 61 | # omitting non-primary linktypes led to 1% fewer obs (2,000) but cleaner data (datadate<="2013-12-31" for comparability) 62 | data.ccm <- data.ccmlink %>% 63 | # use only primary links (from WRDS Merged Compustat/CRSP examples) 64 | filter(linktype %in% c("LU", "LC", "LS")) %>% 65 | filter(linkprim %in% c("P", "C", "J")) %>% 66 | merge(data.comp.funda, by="gvkey") %>% # inner join, keep only if permno exists 67 | mutate(datadate = as.Date(datadate), 68 | permno = as.factor(lpermno), 69 | linkdt = as.Date(linkdt), 70 | linkenddt = as.Date(linkenddt), 71 | linktype = factor(linktype, levels=c("LC", "LU", "LS")), 72 | linkprim = factor(linkprim, levels=c("P", "C", "J"))) %>% 73 | # remove compustat fiscal ends that do not fall within linked period; linkenddt=NA (from .E) means ongoing 74 | filter(datadate >= linkdt & (datadate <= linkenddt | is.na(linkenddt))) %>% 75 | # prioritize linktype, linkprim based on order of preference/primary if duplicate 76 | arrange(datadate, permno, linktype, linkprim) %>% 77 | distinct(datadate, permno, .keep_all = TRUE) 78 | save(data.ccm, file = "180619 data.ccm.RData") 79 | rm(data.comp.funda, data.ccmlink) 80 | 81 | ############################################################################################################### 82 | ### COMPUSTAT CLEANING AND VAR CALC ### 83 | 84 | # load("180619 data.ccm.RData") 85 | data.comp <- data.ccm %>% 86 | rename(PERMNO=permno) %>% data.table %>% # ensure col names match crsp's 87 | group_by(PERMNO) %>% 88 | mutate(datadate = as.yearmon(datadate), 89 | comp.count = n()) %>% # number of years in data; future option to cut first year data; works but leads to warnings 90 | # tests based on BE spread show FF no longer impose this condition (even though mentioned in FF'93) 91 | ungroup %>% arrange(datadate, PERMNO) %>% data.frame %>% 92 | distinct(datadate, PERMNO, .keep_all = TRUE) # hasn't been issue but just in case 93 | 94 | data.comp.a <- data.comp %>% 95 | group_by(PERMNO) %>% 96 | mutate(BE = coalesce(seq, ceq + pstk, at - lt) + coalesce(txditc, txdb + itcb, 0) - 97 | coalesce(pstkrv, pstkl, pstk, 0), # consistent w/ French website variable definitions 98 | OpProf = (revt - coalesce(cogs, 0) - coalesce(xint, 0) - coalesce(xsga,0)), 99 | OpProf = as.numeric(ifelse(is.na(cogs) & is.na(xint) & is.na(xsga), NA, OpProf)), # FF condition 100 | GrProf = (revt - cogs), 101 | Cflow = ib + coalesce(txdi, 0) + dp, # operating; consistent w/ French website variable definitions 102 | Inv = (coalesce(ppegt - lag(ppegt), 0) + coalesce(invt - lag(invt), 0)) / lag(at), 103 | AstChg = (at - lag(at)) / lag(at) # note that lags use previously available (may be different from 1 yr) 104 | ) %>% ungroup %>% 105 | arrange(datadate, PERMNO) %>% 106 | select(datadate, PERMNO, comp.count, at, revt, ib, dvc, BE:AstChg) %>% 107 | mutate_if(is.numeric, funs(ifelse(is.infinite(.), NA, .))) %>% # replace Inf w/ NA's 108 | mutate_if(is.numeric, funs(round(., 5))) # round to 5 decimal places (for some reason, 0's not properly coded in some instances) 109 | save(data.comp.a, file="180619 data.comp.a.RData") 110 | rm(data.ccm, data.comp) 111 | 112 | ############################################################################################################### 113 | ### LOAD CRSP FROM WRDS ### 114 | # Downloads CRSP MSE, MSF, and MSEDELIST tables from WRDS 115 | # merges, cleans, and for market cap calc, combines permco's with multiple permnos (eg berkshire) 116 | # no filtering 117 | 118 | # SLOW CODE (30 mins) 119 | res <- dbSendQuery(wrds, "select DATE, PERMNO, PERMCO, CFACPR, CFACSHR, SHROUT, PRC, RET, RETX, VOL 120 | from CRSP.MSF") 121 | # where PRC is not null") 122 | crsp.msf <- dbFetch(res, n = -1) 123 | save(crsp.msf, file = "180619 crsp.msf.RData") 124 | 125 | res <- dbSendQuery(wrds, "select DATE, PERMNO, SHRCD, EXCHCD 126 | from CRSP.MSE") 127 | # where SHRCD is not null") 128 | crsp.mse <- dbFetch(res, n = -1) 129 | save(crsp.mse, file = "180619 crsp.mse.RData") 130 | 131 | res <- dbSendQuery(wrds, "select DLSTDT, PERMNO, dlret 132 | from crspq.msedelist") 133 | # where dlret is not null") 134 | crsp.msedelist <- dbFetch(res, n = -1) 135 | save(crsp.msedelist, file = "180619 crsp.msedelist.RData") 136 | 137 | # clean and marge data 138 | crsp.msf <- crsp.msf %>% 139 | filter(!is.na(prc)) %>% 140 | mutate(Date = as.yearmon(as.Date(date))) %>% 141 | select(-date) 142 | crsp.mse <- crsp.mse %>% 143 | filter(!is.na(shrcd)) %>% 144 | mutate(Date = as.yearmon(as.Date(date))) %>% 145 | select(-date) 146 | crsp.msedelist <- crsp.msedelist %>% 147 | filter(!is.na(dlret)) %>% 148 | mutate(Date = as.yearmon(as.Date(dlstdt))) %>% 149 | select(-dlstdt) 150 | data.crsp.m <- crsp.msf %>% 151 | merge(crsp.mse, by=c("Date", "permno"), all=TRUE, allow.cartesian=TRUE) %>% 152 | merge(crsp.msedelist, by=c("Date", "permno"), all=TRUE, allow.cartesian=TRUE) %>% 153 | rename(PERMNO=permno) %>% 154 | mutate_at(vars(PERMNO, permco, shrcd, exchcd), funs(as.factor)) %>% 155 | mutate(retadj=ifelse(!is.na(ret), ret, ifelse(!is.na(dlret), dlret, NA))) %>% # create retadj by merging ret and dlret 156 | arrange(Date, PERMNO) %>% 157 | group_by(PERMNO) %>% 158 | mutate_at(vars(shrcd, exchcd), funs(na.locf(., na.rm=FALSE))) # fill in NA's with latest available (must sort by Date and group by PERMNO) 159 | 160 | data.crsp.m <- data.crsp.m %>% 161 | mutate(meq = shrout * abs(prc)) %>% # me for each permno 162 | group_by(Date, permco) %>% 163 | mutate(ME = sum(meq)) %>% # to calc market cap, merge permnos with same permnco 164 | arrange(Date, permco, desc(meq)) %>% 165 | group_by(Date, permco) %>% 166 | slice(1) %>% # keep only permno with largest meq 167 | ungroup 168 | 169 | save(data.crsp.m, file = "180619 data.crsp.m.RData") 170 | rm(crsp.mse, crsp.msf, crsp.msedelist) 171 | 172 | ############################################################################################################### 173 | ### CRSP CLEANING ### 174 | # filters EXCHCD (NYSE, NASDAQ, AMEX) and SHRCD (10,11) 175 | 176 | Fill_TS_NAs <- function(main) { 177 | # takes datat frame with Date and PERMNO as columns and fills in NAs where there are gaps 178 | 179 | core <- select(main, Date, PERMNO) 180 | # find first and last dates of each PERMNO 181 | date.bookends <- core %>% 182 | group_by(PERMNO) %>% 183 | summarize(first = first(Date), last = last(Date)) 184 | 185 | # generate all dates for all PERMNOs then trim those outside of each PERMNO's first and last dates 186 | output <- core %>% 187 | mutate(temp = 1) %>% # create 3rd column so spread can be applied 188 | spread(., PERMNO, temp) %>% 189 | gather(., PERMNO, temp, -Date) %>% 190 | merge(date.bookends, by="PERMNO", all.x=TRUE) %>% 191 | group_by(PERMNO) %>% 192 | filter(Date>=first & Date<=last) %>% 193 | select(Date, PERMNO) %>% 194 | merge(., main, by=c("Date", "PERMNO"), all.x=TRUE) 195 | 196 | return(output) 197 | } 198 | 199 | # SLOW CODE (5 mins) 200 | load("180619 data.crsp.m.RData") 201 | data.crsp.cln <- data.crsp.m %>% 202 | select(Date, PERMNO, shrcd, exchcd, cfacpr, cfacshr, shrout, prc, vol, retx, retadj, ME) %>% 203 | mutate(ME = ME/1000) %>% # convert from thousands to millions (consistent with compustat values) 204 | filter((shrcd == 10 | shrcd == 11) & (exchcd == 1 | exchcd == 2 | exchcd == 3)) %>% 205 | Fill_TS_NAs %>% # fill in gap dates within each PERMNO with NAs to uses lead/lag (lead to NAs for SHRCD and EXCHCD); fn in AnoDecomp_Support 206 | mutate(PERMNO = as.factor(PERMNO)) %>% 207 | group_by(PERMNO) %>% 208 | mutate(port.weight = as.numeric(ifelse(!is.na(lag(ME)), lag(ME), ME/(1+retx))), # calc portweight as ME at beginning of period 209 | port.weight = ifelse(is.na(retadj) & is.na(prc), NA, port.weight)) %>% # remove portweights calc for date gaps 210 | ungroup %>% 211 | rename(retadj.1mn = retadj) %>% 212 | arrange(Date, PERMNO) %>% 213 | distinct(Date, PERMNO, .keep_all = TRUE) # hasn't been issue but just in case 214 | 215 | save(data.crsp.cln, file = "180619 data.crsp.cln.RData") 216 | rm(data.crsp.m) 217 | 218 | ############################################################################################################### 219 | ### MERGE COMPUSTAT AND CRSP ### 220 | # Merges CRSP and Compustat data fundamentals by PERMNO and DATE (annual-June-end portfolio formation) 221 | # Also get Davis book equity data (Compustat match begins 1951 but Davis book data available starting 20s) 222 | # Keep all CRSP info (drop Compustat if can't find CRSP) 223 | # Match Compustat and Davis data based on FF methodology (to following year June when data is first known at month end) 224 | 225 | load("180619 data.crsp.cln.RData") 226 | load("180619 data.comp.a.RData") 227 | 228 | data.Davis.bkeq <- read.csv("~/OneDrive/Research/Data/French/Davis Book Equity.csv") 229 | data.Davis.bkeq[data.Davis.bkeq == -999 | data.Davis.bkeq == -99.99] <- NA 230 | data.Davis.bkeq <- data.Davis.bkeq %>% 231 | mutate(PERMNO = factor(PERMNO)) %>% 232 | data.table %>% 233 | select(-FirstYr, -LastYr) %>% 234 | gather(Date, Davis.bkeq, -PERMNO, na.rm=TRUE) %>% 235 | mutate(Date = as.yearmon(ymd(paste0(substr(Date, 2, 5),"-6-01")))) 236 | 237 | na_locf_until = function(x, n) { 238 | # in time series data, fill in na's untill indicated n 239 | l <- cumsum(! is.na(x)) 240 | c(NA, x[! is.na(x)])[replace(l, ave(l, l, FUN=seq_along) > (n+1), 0) + 1] 241 | } 242 | 243 | # SLOW CODE (5 mins) 244 | data.both.m <- data.comp.a %>% 245 | mutate(Date = datadate + (18-month(datadate))/12) %>% # map to next year June period when data is known (must occur in previous year) 246 | merge(data.crsp.cln, ., by=c("Date", "PERMNO"), all.x=TRUE, allow.cartesian=TRUE) %>% # keep all CRSP records (Compustat only goes back to 1950) 247 | merge(data.Davis.bkeq, by=c("Date", "PERMNO"), all.x=TRUE, allow.cartesian=TRUE) %>% 248 | arrange(PERMNO, Date, desc(datadate)) %>% 249 | distinct(PERMNO, Date, .keep_all = TRUE) %>% # drop older datadates (must sort by desc(datadate)) 250 | group_by(PERMNO) %>% 251 | # fill in Compustat and Davis data NA's with latest available for subsequent year (must sort by Date and group by PERMNO) 252 | # filling max of 11 previous months means gaps may appear when fiscal year end changes (very strict) 253 | mutate_at(vars(datadate:Davis.bkeq), funs(na_locf_until(., 11))) %>% 254 | ungroup %>% 255 | mutate(datadate = yearmon(datadate)) %>% 256 | arrange(Date, PERMNO) 257 | 258 | save(data.both.m, file = "180619 data.both.m.RData") 259 | # company info has no Date gaps (filled with NA's) 260 | # all data publicly available by end of Date period (Compustat first data is June-1950 matched to CRSP Jun-51)) 261 | # includes all CRSP (but only Compustat/Davis data that matches CRSP) 262 | # CRSP first month price data Dec-25, return data Jan-26 263 | # CRSP last month data Dec-17 (Compustat 2017 data available but discarded bc/ must be mapped to CRSP 2018 data) 264 | # 180619 3.507 MM obs (Old: 170226 3.463 MM obs) 265 | rm(data.comp.a, data.crsp.cln, data.Davis.bkeq) 266 | 267 | ############################################################################################################### 268 | ### Add FF Variables ### 269 | 270 | # SLOW CODE (10 mins) 271 | load("180619 data.both.m.RData") 272 | data.both.FF.m <- data.both.m %>% 273 | group_by(PERMNO) %>% 274 | mutate(d.shares = (shrout*cfacshr)/(lag(shrout)*lag(cfacshr))-1, # change in monthly share count (adjusted for splits) 275 | ret.12t2 = (lag(retadj.1mn,1)+1)*(lag(retadj.1mn,2)+1)*(lag(retadj.1mn,3)+1)*(lag(retadj.1mn,4)+1)* 276 | (lag(retadj.1mn,5)+1)*(lag(retadj.1mn,6)+1)*(lag(retadj.1mn,7)+1)*(lag(retadj.1mn,8)+1)* 277 | (lag(retadj.1mn,9)+1)*(lag(retadj.1mn,10)+1)*(lag(retadj.1mn,11)+1)-1, # to calc momentum spread 278 | BE = coalesce(BE, Davis.bkeq), # data available by end-of-Jun based on Compustat Date mapping 279 | ME.Dec = as.numeric(ifelse(month(Date)==6 & lag(ME,6)>0, lag(ME,6), NA)), # previous Dec ME 280 | ME.Jun = as.numeric(ifelse(month(Date)==6, ME, NA)), 281 | BM.FF = as.numeric(ifelse(month(Date)==6 & ME.Dec>0, BE/ME.Dec, NA)), 282 | OpIB = as.numeric(ifelse(month(Date)==6 & BE>0, OpProf/BE, NA)), 283 | GrIA = as.numeric(ifelse(month(Date)==6 & at>0, GrProf/at, NA)), 284 | CFP.FF = as.numeric(ifelse(month(Date)==6 & ME.Dec>0, Cflow/ME.Dec, NA)), 285 | BM.m = BE/ME, # monthly updated version for spread calc 286 | CFP.m = Cflow/ME, # monthly updated version for spread calc 287 | lag.ME.Jun = lag(ME.Jun), # monthly data so only lag by 1 mn 288 | lag.BM.FF = lag(BM.FF), 289 | lag.ret.12t2 = lag(ret.12t2), 290 | lag.OpIB = lag(OpIB), 291 | lag.AstChg = lag(AstChg)) 292 | 293 | data.both.FF.m %<>% 294 | mutate_at(vars(d.shares:lag.AstChg), funs(ifelse(!is.infinite(.), ., NA))) %>% # code Inf values as NAs 295 | select(Date, datadate, PERMNO, exchcd, comp.count, prc, vol, retadj.1mn, d.shares, ME, port.weight, 296 | ret.12t2, at:AstChg, ME.Jun:lag.AstChg) %>% 297 | arrange(Date, PERMNO) %>% 298 | group_by(PERMNO) %>% 299 | mutate_at(vars(ME.Jun:CFP.FF, lag.ME.Jun:lag.AstChg), funs(na_locf_until(., 11))) %>% 300 | ungroup %>% 301 | mutate(port.weight = ifelse(is.na(port.weight), 0, port.weight)) # necessary to avoid NAs for weighted ret calc 302 | save(data.both.FF.m, file = "180619 data.both.FF.m.RData") 303 | rm(data.both.m) 304 | 305 | ############################################################################################################### 306 | ### Form FF Factors ### 307 | 308 | Form_CharSizePorts2 <- function(main, size, var, wght, ret) { # streamlined version 309 | # forms 2x3 (size x specificed-characteristc) and forms the 6 portfolios 310 | # variable broken by 30-70 percentiles, size broken up at 50 percentile (breakpoints uses NYSE data only) 311 | # requires Date and exchcd 312 | # outputs portfolio returns for each period, 313 | 314 | main.cln <- main %>% 315 | select(Date, PERMNO, exchcd, !!size, !!var, !!wght, !!ret) 316 | 317 | Bkpts.NYSE <- main.cln %>% # create size and var breakpoints based on NYSE stocks only 318 | filter(exchcd == 1) %>% # NYSE exchange 319 | group_by(Date) %>% 320 | summarize(var.P70 = quantile(!!var, probs=.7, na.rm=TRUE), 321 | var.P30 = quantile(!!var, probs=.3, na.rm=TRUE), 322 | size.Med = quantile(!!size, probs=.5, na.rm=TRUE)) 323 | 324 | # calculate size and var portfolio returns 325 | main.rank <- main.cln %>% 326 | merge(Bkpts.NYSE, by="Date", all.x=TRUE) %>% 327 | mutate(Size = ifelse(!!sizevar.P70, "High", "Neutral")), 329 | Port = paste(Size, Var, sep=".")) 330 | 331 | Ret <- main.rank %>% # name 2 x 3 size-var portfolios 332 | group_by(Date, Port) %>% 333 | summarize(ret.port = weighted.mean(!!ret, !!wght, na.rm=TRUE)) %>% # calc value-weighted returns 334 | spread(Port, ret.port) %>% # transpose portfolios expressed as rows into seperate columns 335 | mutate(Small = (Small.High + Small.Neutral + Small.Low)/3, 336 | Big = (Big.High + Big.Neutral + Big.Low)/3, 337 | SMB = Small - Big, 338 | High = (Small.High + Big.High)/2, 339 | Low = (Small.Low + Big.Low)/2, 340 | HML = High - Low) 341 | 342 | return(Ret) 343 | } 344 | 345 | Form_FF6Ports <- function(df) { 346 | # form FF6 factors from data (SMB, HML, RMW, CMA, UMD) 347 | output <- df %>% 348 | group_by(Date) %>% 349 | summarize(MyMkt = weighted.mean(retadj.1mn, w=port.weight, na.rm=TRUE)) %>% 350 | merge(Form_CharSizePorts2(df, quo(lag.ME.Jun), quo(lag.BM.FF), quo(port.weight), quo(retadj.1mn)), 351 | by="Date", all.x=TRUE) %>% # SMB, HML 352 | select(Date:MyMkt, MySMB=SMB, MySMBS=Small, MySMBB=Big, MyHML=HML, MyHMLH=High, MyHMLL=Low) %>% 353 | merge(Form_CharSizePorts2(df, quo(lag.ME.Jun), quo(lag.OpIB), quo(port.weight), quo(retadj.1mn)), 354 | by="Date", all.x=TRUE) %>% # RMW 355 | select(Date:MyHMLL, MyRMW=HML, MyRMWR=High, MyRMWW=Low) %>% 356 | merge(Form_CharSizePorts2(df, quo(lag.ME.Jun), quo(lag.AstChg), quo(port.weight), quo(retadj.1mn)), 357 | by="Date", all.x=TRUE) %>% # CMA 358 | select(Date:MyRMWW, MyCMA=HML, MyCMAC=Low, MyCMAA=High) %>% 359 | mutate(MyCMA=-MyCMA) %>% 360 | merge(Form_CharSizePorts2(df, quo(port.weight), quo(lag.ret.12t2), quo(port.weight), quo(retadj.1mn)), 361 | by="Date", all.x=TRUE) %>% # UMD 362 | select(Date:MyCMAA, MyUMD=HML, MyUMDU=High, MyUMDD=Low) 363 | return(output) 364 | } 365 | 366 | load("180619 data.both.FF.m.RData") 367 | dt.myFF6.m <- Form_FF6Ports(data.both.FF.m) %>% 368 | filter(year(Date) > 1925 & year(Date) < 2018) 369 | save(dt.myFF6.m, file = "180619 dt.myFF6.m.RData") 370 | 371 | ############################################################################################################### 372 | ### TEST FOR CONSISTENCY WITH POSTED FACTORS ### 373 | 374 | Compare_Two_Vectors2 <- function(v1, v2, sqnc=1) { # omits DATE option 375 | # takes two dataframes where each has a Date and another var column 376 | # compares the two var columns for similarity 377 | # v1.date, v2.date denotes col name of date which must be yearmon 378 | # sqnc affects the plot, e.g. if =3 then plot every third point (so not too crowded) 379 | # will not work if missing observations in between for one dataframe since cor needs exact same length 380 | 381 | # obtain common time segment among two vectors 382 | lo.date <- max(v1[["Date"]][min(which(!is.na(v1[[colnames(v1)[2]]])))], 383 | v2[["Date"]][min(which(!is.na(v2[[colnames(v2)[2]]])))]) 384 | hi.date <- min(v1[["Date"]][max(which(!is.na(v1[[colnames(v1)[2]]])))], 385 | v2[["Date"]][max(which(!is.na(v2[[colnames(v2)[2]]])))]) 386 | v1.trim <- subset(v1, lo.date <= Date & Date <= hi.date) 387 | v2.trim <- subset(v2, lo.date <= Date & Date <= hi.date) 388 | 389 | print("correlation") 390 | print(cor(v1.trim[[colnames(v1.trim)[2]]], v2.trim[[colnames(v2.trim)[2]]], use="complete.obs")) 391 | print("v1.trim mean") 392 | print(mean(v1.trim[[colnames(v1.trim)[2]]], na.rm=TRUE)) 393 | print("v2.trim mean") 394 | print(mean(v2.trim[[colnames(v2.trim)[2]]], na.rm=TRUE)) 395 | print("means: v1.trim-v2.trim") 396 | print(mean(v1.trim[[colnames(v1.trim)[2]]], na.rm=TRUE)-mean(v2.trim[[colnames(v2.trim)[2]]], na.rm=TRUE)) 397 | plot(v1.trim[["Date"]][seq(1,nrow(v1.trim),sqnc)], 398 | v1.trim[[colnames(v1)[2]]][seq(1,nrow(v1.trim),sqnc)], type="l", xlab="Date", ylab="Returns (%)") 399 | lines(v2.trim[["Date"]][seq(1,nrow(v2.trim),sqnc)], 400 | v2.trim[[colnames(v2)[2]]][seq(1,nrow(v2.trim),sqnc)], type="l", col="blue") 401 | legend("topright", c("1", "2"), lty=1, col=c("black","blue")) 402 | } 403 | 404 | # import FF6 and Mkt to check data 405 | library(XLConnect) 406 | wb <- loadWorkbook("~/OneDrive/Research/Data/French/180622 French_FF6.xlsx") 407 | dt.FF6.m <- wb %>% 408 | readWorksheet(sheet = 1) %>% 409 | mutate(Date = as.yearmon(ymd(paste0(Date,28))), 410 | Mkt = MktRF + RF) %>% 411 | mutate_at(vars(-Date), funs(./100)) %>% 412 | arrange(Date) 413 | 414 | # check FF6 factor returns (Jul '26 through Dec '17) 415 | load("180619 dt.myFF6.m.RData") 416 | Compare_Two_Vectors2(select(dt.myFF6.m, Date, MyMkt), select(dt.FF6.m, Date, Mkt), sqnc=12) 417 | # Mkt: cor 99.999%; both means 93.87 bps 418 | Compare_Two_Vectors2(select(dt.myFF6.m, Date, MySMB), select(dt.FF6.m, Date, SMB), sqnc=12) 419 | # SMB: cor 99.6%; means 19.2 bps vs 20.1 bps 420 | Compare_Two_Vectors2(select(dt.myFF6.m, Date, MyHML), select(dt.FF6.m, Date, HML), sqnc=12) 421 | # HML: cor 98.9%; means 37.2 bps vs 38.2 bps 422 | Compare_Two_Vectors2(select(dt.myFF6.m, Date, MyRMW), select(dt.FF6.m, Date, RMW), sqnc=12) 423 | # RMW: cor 98.3%; means 26.2 bps vs 24.8 bps 424 | Compare_Two_Vectors2(select(dt.myFF6.m, Date, MyCMA), select(dt.FF6.m, Date, CMA), sqnc=12) 425 | # CMA: cor 97.9%; means 23.2 bps vs 28.7 bps (SIZEABLE GAP) 426 | # tested using lag(ME) for port weights instead (avoid using RETX) but doesn't make a diff 427 | Compare_Two_Vectors2(select(dt.myFF6.m, Date, MyUMD), select(dt.FF6.m, Date, UMD), sqnc=12) 428 | # UMD: cor 99.8%; means 65.3 bps vs 65.8 bps 429 | 430 | -------------------------------------------------------------------------------- /Code - No Passwords/MAKE CRSP_COMP_LINK FILE.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(lubridate) 3 | library(RPostgres) 4 | library(data.table) 5 | 6 | # NOTE - THIS FILE WILL THE LINKING FILE BETWEEN CRSP AND COMPUSTAT 7 | # Connect to WRDS Server -------------------------------------------------- 8 | wrds <- dbConnect(Postgres(), 9 | host = 'wrds-pgdata.wharton.upenn.edu', 10 | port = 9737, 11 | user = '', 12 | password = '', 13 | dbname = 'wrds', 14 | sslmode = 'require') 15 | 16 | # download the link file from wrds 17 | # get the CRSP - Compustat link file 18 | link <- tbl(wrds, sql("SELECT * FROM crsp.ccmxpf_lnkhist")) %>% 19 | filter(linktype %in% c("LC", "LU", "LS")) %>% 20 | collect() %>% 21 | # if linkeendt is missing set to today 22 | mutate(linkenddt = if_else(is.na(linkenddt), 23 | lubridate::today(), linkenddt)) 24 | 25 | # fix two date errors which generate a very small number of duplicates at the 26 | # permno-date level 27 | link$linkenddt[which(link$gvkey == "177446" & link$lpermno == 86812)] <- ymd(20190728) 28 | link$linkenddt[which(link$gvkey == "021998" & link$lpermno == 15075)] <- ymd(20180117) 29 | # drop one extra duplicate 30 | link <- link %>% filter(!(gvkey == "002759" & linkprim == "J")) 31 | 32 | # save the linked dataset 33 | saveRDS(link, here::here("Cleaned_Data", "link.rds")) 34 | -------------------------------------------------------------------------------- /Code - No Passwords/MAKE CRSP_COMP_MERGED.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(lubridate) 3 | library(RPostgres) 4 | library(data.table) 5 | 6 | # Connect to WRDS Server -------------------------------------------------- 7 | wrds <- dbConnect(Postgres(), 8 | host = 'wrds-pgdata.wharton.upenn.edu', 9 | port = 9737, 10 | user = '', 11 | password = '', 12 | dbname = 'wrds', 13 | sslmode = 'require') 14 | 15 | # download compustat data 16 | comp <- tbl(wrds, sql("SELECT * FROM comp.funda")) %>% 17 | # filter the bullshit as per usual 18 | filter(indfmt == 'INDL' & datafmt == 'STD' & popsrc == 'D' & consol == 'C') %>% 19 | collect() %>% 20 | mutate(merge_date = datadate) %>% 21 | setDT() 22 | 23 | # download the link file from wrds 24 | # get the CRSP - Compustat link file 25 | link <- tbl(wrds, sql("SELECT * FROM crsp.ccmxpf_lnkhist")) %>% 26 | filter(linktype %in% c("LC", "LU", "LS")) %>% 27 | collect() %>% 28 | # if linkeendt is missing set to today 29 | mutate(linkenddt = if_else(is.na(linkenddt), 30 | lubridate::today(), linkenddt)) %>% 31 | setDT() 32 | 33 | # fix two date errors which generate a very small number of duplicates at the 34 | # permno-date level 35 | link$linkenddt[which(link$gvkey == "177446" & link$lpermno == 86812)] <- ymd(20190728) 36 | link$linkenddt[which(link$gvkey == "021998" & link$lpermno == 15075)] <- ymd(20180117) 37 | # drop one extra duplicate 38 | link <- link[!(gvkey == "002759" & linkprim == "J")] 39 | 40 | # set key in data table to make this go faster 41 | setkey(link, gvkey) 42 | 43 | # bring in link variable to compustat and keep just the right hit 44 | comp <- link[comp, 45 | on = .(gvkey, linkdt <= merge_date, linkenddt >= merge_date), 46 | nomatch = NA] %>% 47 | .[, count := .N, by = list(gvkey, datadate)] %>% 48 | .[count == 1 | linkprim == "P" | linkprim == "C"] %>% 49 | .[, count := NULL] %>% 50 | as_tibble() 51 | 52 | # save the compustat crsp merged dataset 53 | saveRDS(comp, here::here("Cleaned_Data", "crsp_compustat_merged.rds")) -------------------------------------------------------------------------------- /Code - No Passwords/MAKE PERMNO_TO_GVKEY.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(lubridate) 3 | library(RPostgres) 4 | library(data.table) 5 | 6 | # NOTE - THIS FILE WILL GIVE YOU A DATASET THAT GOES FROM PERMNO/DATE -> GVKEY. 7 | # BECAUSE A COMPANY MIGHT HAVE MULTIPLE PERMNOS THIS WILL NOT NECESSARILY EQUAL 8 | # GVKEY/DATE -> PERMNO 9 | 10 | # Connect to WRDS Server -------------------------------------------------- 11 | wrds <- dbConnect(Postgres(), 12 | host = 'wrds-pgdata.wharton.upenn.edu', 13 | port = 9737, 14 | user = '', 15 | password = '', 16 | dbname = 'wrds', 17 | sslmode = 'require') 18 | 19 | # download the link file from wrds 20 | # get the CRSP - Compustat link file 21 | link <- tbl(wrds, sql("SELECT * FROM crsp.ccmxpf_lnkhist")) %>% 22 | filter(linktype %in% c("LC", "LU", "LS")) %>% 23 | collect() 24 | 25 | # fix two date errors which generate a very small number of duplicates at the 26 | # permno-date level 27 | link$linkenddt[which(link$gvkey == "177446" & link$lpermno == 86812)] <- '2019-07-28' 28 | link$linkenddt[which(link$gvkey == "021998" & link$lpermno == 15075)] <- '2018-01-17' 29 | 30 | # expand dates from start to end date in linking file 31 | link <- link %>% 32 | # if linkeendt is missing set to today 33 | mutate(linkenddt = if_else(is.na(linkenddt), 34 | lubridate::today(), linkenddt)) %>% 35 | # expand date by row 36 | rowwise() %>% 37 | do(tibble(gvkey = .$gvkey, permno = .$lpermno, permco = .$lpermco, 38 | linkprim = .$linkprim, liid = .$liid, 39 | datadate = seq.Date(.$linkdt, .$linkenddt, by = "day"))) %>% 40 | ungroup() %>% 41 | setDT() 42 | 43 | # set key in data table to make this go faster 44 | setkey(link, permno, datadate) 45 | 46 | # bring in names and ticker 47 | names <- tbl(wrds, sql("SELECT * FROM crsp.dsenames")) %>% 48 | collect() 49 | 50 | # expand dates from start to end date in linking file 51 | names <- names %>% 52 | # expand date by row 53 | rowwise() %>% 54 | do(tibble(permno = .$permno, cusip = .$ncusip, 55 | ticker = .$ticker, comnam = .$comnam, 56 | datadate = seq.Date(.$namedt, .$nameendt, by = "day"))) %>% 57 | ungroup() %>% 58 | setDT() 59 | 60 | setkey(names, permno, datadate) 61 | 62 | # merge in gvkey link info to the name file 63 | link <- merge(names, link, by = c("permno", "datadate"), all.x = TRUE) %>% 64 | # save it as a tibble 65 | as_tibble() 66 | 67 | # save the dataset 68 | saveRDS(link, here::here("Cleaned_Data", "permno_day_to_gvkey.rds")) -------------------------------------------------------------------------------- /Code - No Passwords/MAKE_AMIHUD_ILLIQUIDITY_MEASURE.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(lubridate) 3 | library(RPostgres) 4 | library(data.table) 5 | 6 | # Connect to WRDS Server -------------------------------------------------- 7 | wrds <- dbConnect(Postgres(), 8 | host = 'wrds-pgdata.wharton.upenn.edu', 9 | port = 9737, 10 | user = '', 11 | password = '', 12 | dbname = 'wrds', 13 | sslmode = 'require') 14 | 15 | # pull in the crsp variables we need for data after 1985 16 | crsp <- tbl(wrds, sql("SELECT date, cusip, permno, permco, issuno, hsiccd, prc, vol, ret, shrout \ 17 | FROM crsp.dsf WHERE date >= '1985-01-01'")) %>% 18 | collect() 19 | 20 | # make amihud liquidity scores 21 | crsp_amihud <- setDT(crsp) 22 | crsp_amihud <- crsp_amihud %>% 23 | # drop if volume is zero bc the score breaks down 24 | .[vol != 0] %>% 25 | # make a year variable 26 | .[, year := year(date)] 27 | 28 | # set keys 29 | setkey(crsp_amihud, permno, year) 30 | 31 | # calculate by permno year 32 | amihud <- crsp_amihud %>% 33 | .[, amihud := 1000 * sqrt(abs(ret)/(abs(prc) * vol))] %>% 34 | .[, .(amihud = mean(amihud, na.rm = TRUE)), keyby = .(permno, year)] 35 | 36 | # save 37 | saveRDS(amihud, here::here("Cleaned_Data", "amihud.rds")) -------------------------------------------------------------------------------- /Code - No Passwords/MAKE_INST_SHARES.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(lubridate) 3 | library(RPostgres) 4 | library(data.table) 5 | 6 | # Connect to WRDS Server -------------------------------------------------- 7 | wrds <- dbConnect(Postgres(), 8 | host = 'wrds-pgdata.wharton.upenn.edu', 9 | port = 9737, 10 | user = '', 11 | password = '', 12 | dbname = 'wrds', 13 | sslmode = 'require') 14 | 15 | # download institutional holdings 16 | institutional_data <- tbl(wrds, sql("SELECT rdate, fdate, mgrno, shares, cusip, shrout2 \ 17 | FROM tfn.s34")) %>% 18 | # keep just the first report date per mgr/cusip/reporting date 19 | group_by(rdate, mgrno, cusip) %>% 20 | filter(fdate == min(fdate, na.rm = TRUE)) %>% 21 | ungroup() %>% 22 | # sum shares over managers by reporting date, cusip 23 | group_by(rdate, cusip) %>% 24 | summarize(inst_shares = sum(shares, na.rm = TRUE)) %>% 25 | ungroup() %>% 26 | collect() %>% 27 | drop_na() 28 | 29 | # save 30 | saveRDS(institutional_data, here::here("Cleaned_Data", "institutional_shares.rds")) -------------------------------------------------------------------------------- /Code - No Passwords/SCRAPE_FF_INDS.R: -------------------------------------------------------------------------------- 1 | # This code will scrape Ken French's industry definition file and make a data frame 2 | # with the industry groupings and SIC code range. This uses Siccodes12 but you can use 3 | # the other industry definitions he has up there just swap out in the url link. 4 | 5 | library(tidyverse) 6 | 7 | # url for the industry definitions 8 | url <- "https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/Siccodes12.zip" 9 | 10 | # Download the data and unzip it 11 | f <- tempfile() 12 | download.file(url, f) 13 | inds <- read.delim(unz(f, "Siccodes12.txt"), header = FALSE) 14 | 15 | # make table 16 | inds <- inds %>% 17 | # rename the data column 18 | rename(name = V1) %>% 19 | # detect if it is an ID column 20 | rowwise() %>% 21 | # remove the leading space 22 | mutate(name = str_trim(name, side = "left")) %>% 23 | # detect if there are any letters in the name 24 | mutate(name_id = ifelse(sum(str_detect(name, letters)) >= 1, 1, 0)) %>% 25 | # get the id numbers 26 | mutate(ind_num = ifelse(name_id == 1, as.numeric(str_sub(name, 1, str_locate(name, " ")[1] - 1)), as.numeric(NA))) %>% 27 | # get the short name 28 | mutate(ind_abbrev = ifelse(name_id == 1, 29 | str_sub(name, str_locate_all(name, " ")[[1]][1] + 1, 30 | str_locate_all(name, " ")[[1]][2] - 1), as.character(NA))) %>% 31 | # get the full name 32 | mutate(ind_name = ifelse(name_id == 1, 33 | str_sub(name, str_locate_all(name, " ")[[1]][2] + 1, 34 | nchar(name)), as.character(NA))) %>% 35 | # fill all the values down 36 | ungroup() %>% 37 | fill(ind_num) %>% fill(ind_abbrev) %>% fill(ind_name) %>% 38 | # drop name id columns 39 | group_by(ind_num) %>% 40 | add_tally %>% 41 | filter(n == 1 | name_id == 0) %>% 42 | # get low and high sic by row 43 | rowwise() %>% 44 | mutate(sic_low = ifelse(name_id == 1, as.numeric(NA), as.numeric(str_sub(name, 1, str_locate(name, "-")[1] - 1))), 45 | sic_high = ifelse(name_id == 1, as.numeric(NA), as.numeric(str_sub(name, str_locate(name, "-")[1] + 1), nchar(name)))) %>% 46 | ungroup() %>% 47 | select(ind_num, ind_abbrev, ind_name, sic_low, sic_high) 48 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data-Cleans 2 | This repo has code to do primary data cleaning for Compustat / Crsp from WRDS. 3 | The included codes do the following data cleaning steps: 4 | 5 | 1. MAKE CRSP_COMP_MERGED 6 | 7 | This code takes the universe of annual compustat financial information and matches to CRSP with the CRSP-specific unique security identifier (permno). In addition, it creates a file that has the associated primary permno for each gvkey/date combination. 8 | 9 | 2. MAKE PERMNO_TO_GVKEY 10 | 11 | This creates a file that has the associated compustat identifier (gvkey) for each permno/date combination. Because one firm might have multiple issued securities, this file should be longer than the unique gvkey/date combination file from CRSP_COMP_MERGED. In addition, there may be observations where there is a name and ticker associated with a permno, but where there is no compustat identifier (gvkey). These observations are retained, with gvkey being blank. 12 | 13 | 14 | 3. FAMA_FRENCH_FACTORS 15 | 16 | This code makes the portfolio based Fama French Factors and was contributed by Gertjan Verdickt. 17 | 18 | 4. SCRAPE_FF_INDS 19 | 20 | This code scrapes the Fama-French Industry definitions from Ken French's website. It uses the 12-industry FF definition, but you can swap out for any of the other industry classifications (e.g. 17, 48) in the url line. 21 | 22 | 5. MAKE_AMIHUD_ILLIQIDUITY MEASURE 23 | 24 | This code will make the Amihud (2002) Illiquidity measure by permno/calendar year (you can update it for different date ranges pretty easily if you so desire). This is measured as: 25 | 26 | 27 | 28 | 6. MAKE_INST_SHARES 29 | 30 | This code will calculate the number of shares held by institutional investors at the cusip/quarter level using Thomson Reuters 13F holdings data. 31 | --------------------------------------------------------------------------------