├── Code - No Passwords
    ├── FAMA_FRENCH_FACTORS.R
    ├── MAKE CRSP_COMP_LINK FILE.R
    ├── MAKE CRSP_COMP_MERGED.R
    ├── MAKE PERMNO_TO_GVKEY.R
    ├── MAKE_AMIHUD_ILLIQUIDITY_MEASURE.R
    ├── MAKE_INST_SHARES.R
    └── SCRAPE_FF_INDS.R
└── README.md


/Code - No Passwords/FAMA_FRENCH_FACTORS.R:
--------------------------------------------------------------------------------
  1 | ###############################################################################################################
  2 | ### REPLICATE FAMA-FRENCH-CARHART 4-FACTOR MODEL ####
  3 | # Code by Wayne Chang (https://sites.google.com/site/waynelinchang/home)
  4 | # Latest update assistance from Yifan Liu 
  5 | # Last Update: June 22, 2018
  6 | # Based on methodology described on Ken French website and relevant Fama-French papers
  7 | 
  8 | ###############################################################################################################
  9 | ### SETUP ###
 10 | 
 11 | library(tidyverse); library(zoo)
 12 | require(data.table); library(lubridate)
 13 | setwd("")
 14 | 
 15 | ### WRDS CONNECTION ###
 16 | 
 17 | library(RPostgres)
 18 | wrds <- dbConnect(Postgres(),
 19 |                   host='wrds-pgdata.wharton.upenn.edu',
 20 |                   port=9737,
 21 |                   user='',
 22 |                   password='',
 23 |                   dbname='wrds',
 24 |                   sslmode='require')
 25 | 
 26 | ###############################################################################################################
 27 | ### LOAD COMPUSTAT FROM WRDS ###
 28 |   # Downloads Compustat and Compustat/Crsp merging link from WRDS
 29 |   # adds the linked permno to the compustat dataset
 30 |   # no filtering except must have PERMNO link
 31 | 
 32 | # retrieve Compustat annual data (takes 10mins each below)
 33 | res <- dbSendQuery(wrds,"select GVKEY, CUSIP, DATADATE, FYR, FYEAR, SICH, NAICSH,
 34 |                             AT, LT, SEQ, CEQ, PSTKL, PSTKRV, PSTK, TXDITC, TXDB, ITCB,
 35 |                             REVT, COGS, XINT, XSGA, IB, TXDI, DVC, ACT, CHE, LCT,
 36 |                             DLC, TXP, DP, PPEGT, INVT 
 37 |                     from COMP.FUNDA
 38 |                     where INDFMT='INDL' and DATAFMT='STD' and CONSOL='C' and POPSRC='D'") # STD is unrestatd data
 39 | data.comp.funda <- dbFetch(res, n = -1) # n=-1 denotes no max but retrieve all record
 40 | save(data.comp.funda, file = "180619 data.comp.funda.RData")
 41 |                        
 42 | # retrieve Compustat quarterly data
 43 | res <- dbSendQuery(wrds,"select GVKEY, CUSIP, DATADATE, FYR, FYEARQ,
 44 |                             ATQ, LTQ, SEQQ, CEQQ, PSTKRQ, PSTKQ, TXDITCQ, TXDBQ,
 45 |                             REVTQ, COGSQ, XINTQ, XSGAQ, IBQ, TXDIQ, ACTQ, CHEQ, LCTQ,
 46 |                             DLCQ, TXPQ, DPQ, PPEGTQ, INVTQ, EPSPXQ, RDQ
 47 |                            from COMPM.FUNDQ
 48 |                            where INDFMT='INDL' and DATAFMT='STD' and CONSOL='C' and POPSRC='D'") # STD is unrestatd data
 49 | data.comp.fundq <- dbFetch(res, n = -1) # n=-1 denotes no max but retrieve all record
 50 | save(data.comp.fundq, file = "180619 data.comp.fundq.RData")
 51 | 
 52 | # retrieve Merged Compustat/CRSP link table
 53 | res <- dbSendQuery(wrds,"select GVKEY, LPERMNO, LINKDT, LINKENDDT, LINKTYPE, LINKPRIM
 54 |                     from crsp.ccmxpf_lnkhist")
 55 | data.ccmlink <- dbFetch(res, n = -1) 
 56 | save(data.ccmlink, file = "180619 data.ccmlink.RData")
 57 | 
 58 | # Merge the linked Permno onto Compustat dataset
 59 | # compared to SAS code based on WRDS FF Research macro, I don't include all Linktypes but add J Linkprim
 60 |   # including J linkprim is key bc/ allows me to get the post-2010 Berkshire history
 61 |   # omitting non-primary linktypes led to 1% fewer obs (2,000) but cleaner data (datadate<="2013-12-31" for comparability)
 62 | data.ccm <-  data.ccmlink %>%
 63 |   # use only primary links (from WRDS Merged Compustat/CRSP examples)
 64 |   filter(linktype %in% c("LU", "LC", "LS")) %>%
 65 |   filter(linkprim %in% c("P", "C", "J")) %>%
 66 |   merge(data.comp.funda, by="gvkey") %>% # inner join, keep only if permno exists
 67 |   mutate(datadate = as.Date(datadate), 
 68 |          permno = as.factor(lpermno),
 69 |          linkdt = as.Date(linkdt),
 70 |          linkenddt = as.Date(linkenddt),
 71 |          linktype = factor(linktype, levels=c("LC", "LU", "LS")),
 72 |          linkprim = factor(linkprim, levels=c("P", "C", "J"))) %>%
 73 |   # remove compustat fiscal ends that do not fall within linked period; linkenddt=NA (from .E) means ongoing  
 74 |   filter(datadate >= linkdt & (datadate <= linkenddt | is.na(linkenddt))) %>%
 75 |   # prioritize linktype, linkprim based on order of preference/primary if duplicate
 76 |   arrange(datadate, permno, linktype, linkprim) %>%
 77 |   distinct(datadate, permno, .keep_all = TRUE)
 78 | save(data.ccm, file = "180619 data.ccm.RData")
 79 | rm(data.comp.funda, data.ccmlink)
 80 | 
 81 | ###############################################################################################################
 82 | ### COMPUSTAT CLEANING AND VAR CALC ###
 83 | 
 84 | # load("180619 data.ccm.RData")
 85 | data.comp <- data.ccm %>%
 86 |   rename(PERMNO=permno) %>% data.table %>% # ensure col names match crsp's
 87 |   group_by(PERMNO) %>% 
 88 |   mutate(datadate = as.yearmon(datadate),
 89 |          comp.count = n()) %>% # number of years in data; future option to cut first year data; works but leads to warnings
 90 |     # tests based on BE spread show FF no longer impose this condition (even though mentioned in FF'93)
 91 |   ungroup %>% arrange(datadate, PERMNO) %>% data.frame %>%
 92 |   distinct(datadate, PERMNO, .keep_all = TRUE) # hasn't been issue but just in case
 93 | 
 94 | data.comp.a <- data.comp %>%
 95 |   group_by(PERMNO) %>%
 96 |   mutate(BE = coalesce(seq, ceq + pstk, at - lt) + coalesce(txditc, txdb + itcb, 0) - 
 97 |               coalesce(pstkrv, pstkl, pstk, 0), # consistent w/ French website variable definitions
 98 |          OpProf = (revt - coalesce(cogs, 0) - coalesce(xint, 0) - coalesce(xsga,0)),
 99 |          OpProf = as.numeric(ifelse(is.na(cogs) & is.na(xint) & is.na(xsga), NA, OpProf)), # FF condition
100 |          GrProf = (revt - cogs),
101 |          Cflow = ib + coalesce(txdi, 0) + dp,  # operating; consistent w/ French website variable definitions
102 |          Inv = (coalesce(ppegt - lag(ppegt), 0) + coalesce(invt - lag(invt), 0)) / lag(at),
103 |          AstChg = (at - lag(at)) / lag(at) # note that lags use previously available (may be different from 1 yr)
104 |          ) %>% ungroup %>%
105 |   arrange(datadate, PERMNO) %>%
106 |   select(datadate, PERMNO, comp.count, at, revt, ib, dvc, BE:AstChg) %>%
107 |   mutate_if(is.numeric, funs(ifelse(is.infinite(.), NA, .))) %>% # replace Inf w/ NA's
108 |   mutate_if(is.numeric, funs(round(., 5))) # round to 5 decimal places (for some reason, 0's not properly coded in some instances)
109 | save(data.comp.a, file="180619 data.comp.a.RData")
110 | rm(data.ccm, data.comp)
111 | 
112 | ###############################################################################################################
113 | ### LOAD CRSP FROM WRDS ###
114 |   # Downloads CRSP MSE, MSF, and MSEDELIST tables from WRDS
115 |   # merges, cleans, and for market cap calc, combines permco's with multiple permnos (eg berkshire)
116 |   # no filtering 
117 | 
118 | # SLOW CODE (30 mins)
119 | res <- dbSendQuery(wrds, "select DATE, PERMNO, PERMCO, CFACPR, CFACSHR, SHROUT, PRC, RET, RETX, VOL
120 |                    from CRSP.MSF")
121 | #                   where PRC is not null")
122 | crsp.msf <- dbFetch(res, n = -1) 
123 | save(crsp.msf, file = "180619 crsp.msf.RData")
124 | 
125 | res <- dbSendQuery(wrds, "select DATE, PERMNO, SHRCD, EXCHCD
126 |                    from CRSP.MSE")
127 | #                   where SHRCD is not null")
128 | crsp.mse <- dbFetch(res, n = -1)
129 | save(crsp.mse, file = "180619 crsp.mse.RData")
130 | 
131 | res <- dbSendQuery(wrds, "select DLSTDT, PERMNO, dlret
132 |                    from crspq.msedelist")
133 | #                   where dlret is not null")
134 | crsp.msedelist <- dbFetch(res, n = -1)
135 | save(crsp.msedelist, file = "180619 crsp.msedelist.RData")
136 | 
137 | # clean and marge data
138 | crsp.msf <- crsp.msf %>%
139 |   filter(!is.na(prc)) %>%
140 |   mutate(Date = as.yearmon(as.Date(date))) %>%
141 |   select(-date)
142 | crsp.mse <- crsp.mse %>%
143 |   filter(!is.na(shrcd)) %>%
144 |   mutate(Date = as.yearmon(as.Date(date))) %>%
145 |   select(-date)
146 | crsp.msedelist <- crsp.msedelist %>%
147 |   filter(!is.na(dlret)) %>%
148 |   mutate(Date = as.yearmon(as.Date(dlstdt))) %>%
149 |   select(-dlstdt)
150 | data.crsp.m <- crsp.msf %>%
151 |   merge(crsp.mse, by=c("Date", "permno"), all=TRUE, allow.cartesian=TRUE) %>%
152 |   merge(crsp.msedelist, by=c("Date", "permno"), all=TRUE, allow.cartesian=TRUE) %>%
153 |   rename(PERMNO=permno) %>% 
154 |   mutate_at(vars(PERMNO, permco, shrcd, exchcd), funs(as.factor)) %>%
155 |   mutate(retadj=ifelse(!is.na(ret), ret, ifelse(!is.na(dlret), dlret, NA))) %>% # create retadj by merging ret and dlret
156 |   arrange(Date, PERMNO) %>%
157 |   group_by(PERMNO) %>%    
158 |   mutate_at(vars(shrcd, exchcd), funs(na.locf(., na.rm=FALSE)))  # fill in NA's with latest available (must sort by Date and group by PERMNO)
159 |   
160 | data.crsp.m <- data.crsp.m %>%
161 |   mutate(meq = shrout * abs(prc)) %>% # me for each permno
162 |   group_by(Date, permco) %>%
163 |   mutate(ME = sum(meq)) %>% # to calc market cap, merge permnos with same permnco
164 |   arrange(Date, permco, desc(meq)) %>%
165 |   group_by(Date, permco) %>%
166 |   slice(1) %>% # keep only permno with largest meq
167 |   ungroup
168 | 
169 | save(data.crsp.m, file = "180619 data.crsp.m.RData")
170 | rm(crsp.mse, crsp.msf, crsp.msedelist)
171 | 
172 | ###############################################################################################################
173 | ### CRSP CLEANING ###
174 |   # filters EXCHCD (NYSE, NASDAQ, AMEX) and SHRCD (10,11)
175 | 
176 | Fill_TS_NAs <- function(main) {
177 |   # takes datat frame with Date and PERMNO as columns and fills in NAs where there are gaps
178 |   
179 |   core <- select(main, Date, PERMNO)
180 |   # find first and last dates of each PERMNO
181 |   date.bookends <- core %>%
182 |     group_by(PERMNO) %>%
183 |     summarize(first = first(Date), last = last(Date))
184 |   
185 |   # generate all dates for all PERMNOs then trim those outside of each PERMNO's first and last dates
186 |   output <- core %>%
187 |     mutate(temp = 1) %>% # create 3rd column so spread can be applied
188 |     spread(., PERMNO, temp) %>%
189 |     gather(., PERMNO, temp, -Date) %>%
190 |     merge(date.bookends, by="PERMNO", all.x=TRUE) %>%
191 |     group_by(PERMNO) %>%
192 |     filter(Date>=first & Date<=last) %>%
193 |     select(Date, PERMNO) %>%
194 |     merge(., main, by=c("Date", "PERMNO"), all.x=TRUE)
195 |   
196 |   return(output)
197 | }
198 | 
199 | # SLOW CODE (5 mins)
200 | load("180619 data.crsp.m.RData")
201 | data.crsp.cln <- data.crsp.m %>%
202 |   select(Date, PERMNO, shrcd, exchcd, cfacpr, cfacshr, shrout, prc, vol, retx, retadj, ME) %>%
203 |   mutate(ME = ME/1000) %>%  # convert from thousands to millions (consistent with compustat values)
204 |   filter((shrcd == 10 | shrcd == 11) & (exchcd == 1 | exchcd == 2 | exchcd == 3)) %>%
205 |   Fill_TS_NAs %>% # fill in gap dates within each PERMNO with NAs to uses lead/lag (lead to NAs for SHRCD and EXCHCD); fn in AnoDecomp_Support
206 |   mutate(PERMNO = as.factor(PERMNO)) %>%
207 |   group_by(PERMNO) %>%
208 |   mutate(port.weight = as.numeric(ifelse(!is.na(lag(ME)), lag(ME), ME/(1+retx))), # calc portweight as ME at beginning of period
209 |          port.weight = ifelse(is.na(retadj) & is.na(prc), NA, port.weight)) %>% # remove portweights calc for date gaps
210 |   ungroup %>% 
211 |   rename(retadj.1mn = retadj) %>%
212 |   arrange(Date, PERMNO) %>%
213 |   distinct(Date, PERMNO, .keep_all = TRUE) # hasn't been issue but just in case
214 | 
215 | save(data.crsp.cln, file = "180619 data.crsp.cln.RData")
216 | rm(data.crsp.m)
217 | 
218 | ###############################################################################################################
219 | ### MERGE COMPUSTAT AND CRSP ###
220 |   # Merges CRSP and Compustat data fundamentals by PERMNO and DATE (annual-June-end portfolio formation)
221 |   # Also get Davis book equity data (Compustat match begins 1951 but Davis book data available starting 20s)
222 |   # Keep all CRSP info (drop Compustat if can't find CRSP)
223 |   # Match Compustat and Davis data based on FF methodology (to following year June when data is first known at month end)
224 | 
225 | load("180619 data.crsp.cln.RData")
226 | load("180619 data.comp.a.RData")
227 | 
228 | data.Davis.bkeq <- read.csv("~/OneDrive/Research/Data/French/Davis Book Equity.csv")
229 | data.Davis.bkeq[data.Davis.bkeq == -999 | data.Davis.bkeq == -99.99] <- NA
230 | data.Davis.bkeq <- data.Davis.bkeq %>%
231 |   mutate(PERMNO = factor(PERMNO)) %>%
232 |   data.table %>%
233 |   select(-FirstYr, -LastYr) %>%
234 |   gather(Date, Davis.bkeq, -PERMNO, na.rm=TRUE) %>%
235 |   mutate(Date = as.yearmon(ymd(paste0(substr(Date, 2, 5),"-6-01"))))
236 | 
237 | na_locf_until = function(x, n) {
238 |   # in time series data, fill in na's untill indicated n
239 |   l <- cumsum(! is.na(x))
240 |   c(NA, x[! is.na(x)])[replace(l, ave(l, l, FUN=seq_along) > (n+1), 0) + 1]
241 | }
242 | 
243 | # SLOW CODE (5 mins)
244 | data.both.m <- data.comp.a %>%  
245 |   mutate(Date = datadate + (18-month(datadate))/12) %>% # map to next year June period when data is known (must occur in previous year)
246 |   merge(data.crsp.cln, ., by=c("Date", "PERMNO"), all.x=TRUE, allow.cartesian=TRUE) %>%  # keep all CRSP records (Compustat only goes back to 1950)
247 |   merge(data.Davis.bkeq, by=c("Date", "PERMNO"), all.x=TRUE, allow.cartesian=TRUE) %>%
248 |   arrange(PERMNO, Date, desc(datadate)) %>%
249 |   distinct(PERMNO, Date, .keep_all = TRUE) %>% # drop older datadates (must sort by desc(datadate))
250 |   group_by(PERMNO) %>%
251 |   # fill in Compustat and Davis data NA's with latest available for subsequent year (must sort by Date and group by PERMNO)
252 |   # filling max of 11 previous months means gaps may appear when fiscal year end changes (very strict)
253 |   mutate_at(vars(datadate:Davis.bkeq), funs(na_locf_until(., 11))) %>%
254 |   ungroup %>% 
255 |   mutate(datadate = yearmon(datadate)) %>%
256 |   arrange(Date, PERMNO)
257 | 
258 | save(data.both.m, file = "180619 data.both.m.RData") 
259 |   # company info has no Date gaps (filled with NA's)
260 |   # all data publicly available by end of Date period (Compustat first data is June-1950 matched to CRSP Jun-51))
261 |   # includes all CRSP (but only Compustat/Davis data that matches CRSP)
262 |     # CRSP first month price data Dec-25, return data Jan-26
263 |     # CRSP last month data Dec-17 (Compustat 2017 data available but discarded bc/ must be mapped to CRSP 2018 data)
264 |   # 180619 3.507 MM obs (Old: 170226 3.463 MM obs)
265 | rm(data.comp.a, data.crsp.cln, data.Davis.bkeq)
266 | 
267 | ###############################################################################################################
268 | ### Add FF Variables ###
269 | 
270 | # SLOW CODE (10 mins)
271 | load("180619 data.both.m.RData") 
272 | data.both.FF.m <- data.both.m %>%
273 |   group_by(PERMNO) %>%
274 |   mutate(d.shares = (shrout*cfacshr)/(lag(shrout)*lag(cfacshr))-1, # change in monthly share count (adjusted for splits)
275 |          ret.12t2 = (lag(retadj.1mn,1)+1)*(lag(retadj.1mn,2)+1)*(lag(retadj.1mn,3)+1)*(lag(retadj.1mn,4)+1)*
276 |                     (lag(retadj.1mn,5)+1)*(lag(retadj.1mn,6)+1)*(lag(retadj.1mn,7)+1)*(lag(retadj.1mn,8)+1)*
277 |                     (lag(retadj.1mn,9)+1)*(lag(retadj.1mn,10)+1)*(lag(retadj.1mn,11)+1)-1, # to calc momentum spread
278 |          BE = coalesce(BE, Davis.bkeq), # data available by end-of-Jun based on Compustat Date mapping 
279 |          ME.Dec = as.numeric(ifelse(month(Date)==6 & lag(ME,6)>0, lag(ME,6), NA)), # previous Dec ME 
280 |          ME.Jun = as.numeric(ifelse(month(Date)==6, ME, NA)), 
281 |          BM.FF = as.numeric(ifelse(month(Date)==6 & ME.Dec>0, BE/ME.Dec, NA)), 
282 |          OpIB = as.numeric(ifelse(month(Date)==6 & BE>0, OpProf/BE, NA)), 
283 |          GrIA = as.numeric(ifelse(month(Date)==6 & at>0, GrProf/at, NA)),
284 |          CFP.FF = as.numeric(ifelse(month(Date)==6 & ME.Dec>0, Cflow/ME.Dec, NA)),
285 |          BM.m = BE/ME, # monthly updated version for spread calc
286 |          CFP.m = Cflow/ME, # monthly updated version for spread calc
287 |          lag.ME.Jun = lag(ME.Jun), # monthly data so only lag by 1 mn
288 |          lag.BM.FF = lag(BM.FF),
289 |          lag.ret.12t2 = lag(ret.12t2),
290 |          lag.OpIB = lag(OpIB),
291 |          lag.AstChg = lag(AstChg))
292 | 
293 | data.both.FF.m %<>%
294 |   mutate_at(vars(d.shares:lag.AstChg), funs(ifelse(!is.infinite(.), ., NA))) %>% # code Inf values as NAs
295 |   select(Date, datadate, PERMNO, exchcd, comp.count, prc, vol, retadj.1mn, d.shares, ME, port.weight, 
296 |          ret.12t2, at:AstChg, ME.Jun:lag.AstChg) %>%
297 |   arrange(Date, PERMNO) %>%
298 |   group_by(PERMNO) %>%
299 |   mutate_at(vars(ME.Jun:CFP.FF, lag.ME.Jun:lag.AstChg), funs(na_locf_until(., 11))) %>%
300 |   ungroup %>%
301 |   mutate(port.weight = ifelse(is.na(port.weight), 0, port.weight)) # necessary to avoid NAs for weighted ret calc
302 | save(data.both.FF.m, file = "180619 data.both.FF.m.RData")
303 | rm(data.both.m)
304 | 
305 | ###############################################################################################################
306 | ### Form FF Factors ###
307 | 
308 | Form_CharSizePorts2 <- function(main, size, var, wght, ret) { # streamlined version
309 |   # forms 2x3 (size x specificed-characteristc) and forms the 6 portfolios
310 |   # variable broken by 30-70 percentiles, size broken up at 50 percentile (breakpoints uses NYSE data only)
311 |   # requires Date and exchcd
312 |   # outputs portfolio returns for each period,
313 |   
314 |   main.cln <- main %>%
315 |     select(Date, PERMNO, exchcd, !!size, !!var, !!wght, !!ret)
316 |   
317 |   Bkpts.NYSE <- main.cln %>% # create size and var breakpoints based on NYSE stocks only
318 |     filter(exchcd == 1) %>% # NYSE exchange
319 |     group_by(Date) %>%
320 |     summarize(var.P70 = quantile(!!var, probs=.7, na.rm=TRUE),
321 |               var.P30 = quantile(!!var, probs=.3, na.rm=TRUE),
322 |               size.Med = quantile(!!size, probs=.5, na.rm=TRUE))
323 |   
324 |   # calculate size and var portfolio returns
325 |   main.rank <- main.cln %>%
326 |     merge(Bkpts.NYSE, by="Date", all.x=TRUE) %>%
327 |     mutate(Size = ifelse(!!size<size.Med, "Small", "Big"),
328 |            Var = ifelse(!!var<var.P30, "Low", ifelse(!!var>var.P70, "High", "Neutral")),
329 |            Port = paste(Size, Var, sep="."))
330 |   
331 |   Ret <- main.rank %>% # name 2 x 3 size-var portfolios
332 |     group_by(Date, Port) %>%
333 |     summarize(ret.port = weighted.mean(!!ret, !!wght, na.rm=TRUE)) %>% # calc value-weighted returns
334 |     spread(Port, ret.port) %>% # transpose portfolios expressed as rows into seperate columns
335 |     mutate(Small = (Small.High + Small.Neutral + Small.Low)/3,
336 |            Big = (Big.High + Big.Neutral + Big.Low)/3,
337 |            SMB = Small - Big,
338 |            High = (Small.High + Big.High)/2,
339 |            Low = (Small.Low + Big.Low)/2,
340 |            HML = High - Low)
341 |   
342 |   return(Ret)
343 | }
344 | 
345 | Form_FF6Ports <- function(df) {
346 |   # form FF6 factors from data (SMB, HML, RMW, CMA, UMD)
347 |   output <- df %>%
348 |     group_by(Date) %>%
349 |     summarize(MyMkt = weighted.mean(retadj.1mn, w=port.weight, na.rm=TRUE)) %>%
350 |     merge(Form_CharSizePorts2(df, quo(lag.ME.Jun), quo(lag.BM.FF), quo(port.weight), quo(retadj.1mn)),
351 |           by="Date", all.x=TRUE) %>% # SMB, HML
352 |     select(Date:MyMkt, MySMB=SMB, MySMBS=Small, MySMBB=Big, MyHML=HML, MyHMLH=High, MyHMLL=Low) %>%
353 |     merge(Form_CharSizePorts2(df, quo(lag.ME.Jun), quo(lag.OpIB), quo(port.weight), quo(retadj.1mn)),
354 |           by="Date", all.x=TRUE) %>% # RMW
355 |     select(Date:MyHMLL, MyRMW=HML, MyRMWR=High, MyRMWW=Low) %>%
356 |     merge(Form_CharSizePorts2(df, quo(lag.ME.Jun), quo(lag.AstChg), quo(port.weight), quo(retadj.1mn)),
357 |         by="Date", all.x=TRUE) %>% # CMA
358 |     select(Date:MyRMWW, MyCMA=HML, MyCMAC=Low, MyCMAA=High) %>%
359 |     mutate(MyCMA=-MyCMA) %>%
360 |     merge(Form_CharSizePorts2(df, quo(port.weight), quo(lag.ret.12t2), quo(port.weight), quo(retadj.1mn)), 
361 |           by="Date", all.x=TRUE) %>% # UMD
362 |     select(Date:MyCMAA, MyUMD=HML, MyUMDU=High, MyUMDD=Low)
363 |   return(output)
364 | }
365 | 
366 | load("180619 data.both.FF.m.RData")
367 | dt.myFF6.m <- Form_FF6Ports(data.both.FF.m) %>%
368 |   filter(year(Date) > 1925 & year(Date) < 2018)
369 | save(dt.myFF6.m, file = "180619 dt.myFF6.m.RData")
370 | 
371 | ###############################################################################################################
372 | ### TEST FOR CONSISTENCY WITH POSTED FACTORS ###
373 | 
374 | Compare_Two_Vectors2 <- function(v1, v2, sqnc=1) { # omits DATE option
375 |   # takes two dataframes where each has a Date and another var column
376 |   # compares the two var columns for similarity
377 |   # v1.date, v2.date denotes col name of date which must be yearmon
378 |   # sqnc affects the plot, e.g. if =3 then plot every third point (so not too crowded)
379 |   # will not work if missing observations in between for one dataframe since cor needs exact same length
380 |   
381 |   # obtain common time segment among two vectors
382 |   lo.date <- max(v1[["Date"]][min(which(!is.na(v1[[colnames(v1)[2]]])))],
383 |                  v2[["Date"]][min(which(!is.na(v2[[colnames(v2)[2]]])))])
384 |   hi.date <- min(v1[["Date"]][max(which(!is.na(v1[[colnames(v1)[2]]])))],
385 |                  v2[["Date"]][max(which(!is.na(v2[[colnames(v2)[2]]])))])
386 |   v1.trim <- subset(v1, lo.date <= Date & Date <= hi.date)
387 |   v2.trim <- subset(v2, lo.date <= Date & Date <= hi.date)
388 |   
389 |   print("correlation")
390 |   print(cor(v1.trim[[colnames(v1.trim)[2]]], v2.trim[[colnames(v2.trim)[2]]], use="complete.obs"))
391 |   print("v1.trim mean")
392 |   print(mean(v1.trim[[colnames(v1.trim)[2]]], na.rm=TRUE))
393 |   print("v2.trim mean")
394 |   print(mean(v2.trim[[colnames(v2.trim)[2]]], na.rm=TRUE))
395 |   print("means: v1.trim-v2.trim")
396 |   print(mean(v1.trim[[colnames(v1.trim)[2]]], na.rm=TRUE)-mean(v2.trim[[colnames(v2.trim)[2]]], na.rm=TRUE))
397 |   plot(v1.trim[["Date"]][seq(1,nrow(v1.trim),sqnc)],
398 |        v1.trim[[colnames(v1)[2]]][seq(1,nrow(v1.trim),sqnc)], type="l", xlab="Date", ylab="Returns (%)")
399 |   lines(v2.trim[["Date"]][seq(1,nrow(v2.trim),sqnc)],
400 |         v2.trim[[colnames(v2)[2]]][seq(1,nrow(v2.trim),sqnc)], type="l", col="blue")
401 |   legend("topright", c("1", "2"), lty=1, col=c("black","blue"))
402 | }
403 | 
404 | # import FF6 and Mkt to check data
405 | library(XLConnect)
406 | wb <- loadWorkbook("~/OneDrive/Research/Data/French/180622 French_FF6.xlsx")
407 | dt.FF6.m <- wb %>%
408 |   readWorksheet(sheet = 1) %>%
409 |   mutate(Date = as.yearmon(ymd(paste0(Date,28))),
410 |          Mkt = MktRF + RF) %>%
411 |   mutate_at(vars(-Date), funs(./100)) %>%
412 |   arrange(Date)
413 | 
414 | # check FF6 factor returns (Jul '26 through Dec '17)
415 | load("180619 dt.myFF6.m.RData")
416 | Compare_Two_Vectors2(select(dt.myFF6.m, Date, MyMkt), select(dt.FF6.m, Date, Mkt), sqnc=12)
417 | # Mkt: cor 99.999%; both means 93.87 bps
418 | Compare_Two_Vectors2(select(dt.myFF6.m, Date, MySMB), select(dt.FF6.m, Date, SMB), sqnc=12)
419 | # SMB: cor 99.6%; means 19.2 bps vs 20.1 bps
420 | Compare_Two_Vectors2(select(dt.myFF6.m, Date, MyHML), select(dt.FF6.m, Date, HML), sqnc=12)
421 | # HML: cor 98.9%; means 37.2 bps vs 38.2 bps 
422 | Compare_Two_Vectors2(select(dt.myFF6.m, Date, MyRMW), select(dt.FF6.m, Date, RMW), sqnc=12)
423 | # RMW: cor 98.3%; means 26.2 bps vs 24.8 bps
424 | Compare_Two_Vectors2(select(dt.myFF6.m, Date, MyCMA), select(dt.FF6.m, Date, CMA), sqnc=12)
425 | # CMA: cor 97.9%; means 23.2 bps vs 28.7 bps (SIZEABLE GAP)
426 | # tested using lag(ME) for port weights instead (avoid using RETX) but doesn't make a diff
427 | Compare_Two_Vectors2(select(dt.myFF6.m, Date, MyUMD), select(dt.FF6.m, Date, UMD), sqnc=12)
428 | # UMD: cor 99.8%; means 65.3 bps vs 65.8 bps
429 | 
430 | 


--------------------------------------------------------------------------------
/Code - No Passwords/MAKE CRSP_COMP_LINK FILE.R:
--------------------------------------------------------------------------------
 1 | library(tidyverse)
 2 | library(lubridate)
 3 | library(RPostgres)
 4 | library(data.table)
 5 | 
 6 | # NOTE - THIS FILE WILL THE LINKING FILE BETWEEN CRSP AND COMPUSTAT
 7 | # Connect to WRDS Server --------------------------------------------------
 8 | wrds <- dbConnect(Postgres(),
 9 |                   host = 'wrds-pgdata.wharton.upenn.edu',
10 |                   port = 9737,
11 |                   user = '',
12 |                   password = '',
13 |                   dbname = 'wrds',
14 |                   sslmode = 'require')
15 | 
16 | # download the link file from wrds
17 | # get the CRSP - Compustat link file
18 | link <- tbl(wrds, sql("SELECT * FROM crsp.ccmxpf_lnkhist")) %>%
19 |   filter(linktype %in% c("LC", "LU", "LS")) %>% 
20 |   collect() %>% 
21 |   # if linkeendt is missing set to today
22 |   mutate(linkenddt = if_else(is.na(linkenddt), 
23 |                              lubridate::today(), linkenddt)) 
24 | 
25 | # fix two date errors which generate a very small number of duplicates at the 
26 | # permno-date level
27 | link$linkenddt[which(link$gvkey == "177446" & link$lpermno == 86812)] <- ymd(20190728)
28 | link$linkenddt[which(link$gvkey == "021998" & link$lpermno == 15075)] <- ymd(20180117)
29 | # drop one extra duplicate
30 | link <- link %>% filter(!(gvkey == "002759" & linkprim == "J"))
31 | 
32 | # save the linked dataset
33 | saveRDS(link, here::here("Cleaned_Data", "link.rds"))
34 | 


--------------------------------------------------------------------------------
/Code - No Passwords/MAKE CRSP_COMP_MERGED.R:
--------------------------------------------------------------------------------
 1 | library(tidyverse)
 2 | library(lubridate)
 3 | library(RPostgres)
 4 | library(data.table)
 5 | 
 6 | # Connect to WRDS Server --------------------------------------------------
 7 | wrds <- dbConnect(Postgres(),
 8 |                   host = 'wrds-pgdata.wharton.upenn.edu',
 9 |                   port = 9737,
10 |                   user = '',
11 |                   password = '',
12 |                   dbname = 'wrds',
13 |                   sslmode = 'require')
14 | 
15 | # download compustat data
16 | comp <- tbl(wrds, sql("SELECT * FROM comp.funda")) %>%
17 |   # filter the bullshit as per usual
18 |   filter(indfmt == 'INDL' & datafmt == 'STD' & popsrc == 'D' & consol == 'C') %>% 
19 |   collect() %>%
20 |   mutate(merge_date = datadate) %>% 
21 |   setDT()
22 | 
23 | # download the link file from wrds
24 | # get the CRSP - Compustat link file
25 | link <- tbl(wrds, sql("SELECT * FROM crsp.ccmxpf_lnkhist")) %>%
26 |   filter(linktype %in% c("LC", "LU", "LS")) %>% 
27 |   collect() %>% 
28 |   # if linkeendt is missing set to today
29 |   mutate(linkenddt = if_else(is.na(linkenddt), 
30 |                              lubridate::today(), linkenddt)) %>% 
31 |   setDT()
32 | 
33 | # fix two date errors which generate a very small number of duplicates at the 
34 | # permno-date level
35 | link$linkenddt[which(link$gvkey == "177446" & link$lpermno == 86812)] <- ymd(20190728)
36 | link$linkenddt[which(link$gvkey == "021998" & link$lpermno == 15075)] <- ymd(20180117)
37 | # drop one extra duplicate
38 | link <- link[!(gvkey == "002759" & linkprim == "J")]
39 | 
40 | # set key in data table to make this go faster
41 | setkey(link, gvkey)
42 | 
43 | # bring in link variable to compustat and keep just the right hit
44 | comp <- link[comp, 
45 |              on = .(gvkey, linkdt <= merge_date, linkenddt >= merge_date), 
46 |              nomatch = NA] %>% 
47 |   .[, count := .N, by = list(gvkey, datadate)] %>% 
48 |   .[count == 1 | linkprim == "P" | linkprim == "C"] %>% 
49 |   .[, count := NULL] %>% 
50 |   as_tibble()
51 | 
52 | # save the compustat crsp merged dataset
53 | saveRDS(comp, here::here("Cleaned_Data", "crsp_compustat_merged.rds"))


--------------------------------------------------------------------------------
/Code - No Passwords/MAKE PERMNO_TO_GVKEY.R:
--------------------------------------------------------------------------------
 1 | library(tidyverse)
 2 | library(lubridate)
 3 | library(RPostgres)
 4 | library(data.table)
 5 | 
 6 | # NOTE - THIS FILE WILL GIVE YOU A DATASET THAT GOES FROM PERMNO/DATE -> GVKEY.
 7 | # BECAUSE A COMPANY MIGHT HAVE MULTIPLE PERMNOS THIS WILL NOT NECESSARILY EQUAL 
 8 | # GVKEY/DATE -> PERMNO
 9 | 
10 | # Connect to WRDS Server --------------------------------------------------
11 | wrds <- dbConnect(Postgres(),
12 |                   host = 'wrds-pgdata.wharton.upenn.edu',
13 |                   port = 9737,
14 |                   user = '',
15 |                   password = '',
16 |                   dbname = 'wrds',
17 |                   sslmode = 'require')
18 | 
19 | # download the link file from wrds
20 | # get the CRSP - Compustat link file
21 | link <- tbl(wrds, sql("SELECT * FROM crsp.ccmxpf_lnkhist")) %>%
22 |   filter(linktype %in% c("LC", "LU", "LS")) %>% 
23 |   collect()
24 | 
25 | # fix two date errors which generate a very small number of duplicates at the 
26 | # permno-date level
27 | link$linkenddt[which(link$gvkey == "177446" & link$lpermno == 86812)] <- '2019-07-28'
28 | link$linkenddt[which(link$gvkey == "021998" & link$lpermno == 15075)] <- '2018-01-17'
29 | 
30 | # expand dates from start to end date in linking file
31 | link <- link %>% 
32 |   # if linkeendt is missing set to today
33 |   mutate(linkenddt = if_else(is.na(linkenddt), 
34 |                              lubridate::today(), linkenddt)) %>% 
35 |   # expand date by row
36 |   rowwise() %>% 
37 |   do(tibble(gvkey = .$gvkey, permno = .$lpermno, permco = .$lpermco,
38 |             linkprim = .$linkprim, liid = .$liid,
39 |             datadate = seq.Date(.$linkdt, .$linkenddt, by = "day"))) %>% 
40 |   ungroup() %>% 
41 |   setDT()
42 | 
43 | # set key in data table to make this go faster
44 | setkey(link, permno, datadate)
45 | 
46 | # bring in names and ticker
47 | names <- tbl(wrds, sql("SELECT * FROM crsp.dsenames")) %>% 
48 |   collect()
49 | 
50 | # expand dates from start to end date in linking file
51 | names <- names %>% 
52 |   # expand date by row
53 |   rowwise() %>% 
54 |   do(tibble(permno = .$permno, cusip = .$ncusip, 
55 |             ticker = .$ticker, comnam = .$comnam,
56 |             datadate = seq.Date(.$namedt, .$nameendt, by = "day"))) %>% 
57 |   ungroup() %>% 
58 |   setDT()
59 | 
60 | setkey(names, permno, datadate)
61 | 
62 | # merge in gvkey link info to the name file
63 | link <- merge(names, link, by = c("permno", "datadate"), all.x = TRUE) %>% 
64 |   # save it as a tibble
65 |   as_tibble()
66 | 
67 | # save the dataset
68 | saveRDS(link, here::here("Cleaned_Data", "permno_day_to_gvkey.rds"))


--------------------------------------------------------------------------------
/Code - No Passwords/MAKE_AMIHUD_ILLIQUIDITY_MEASURE.R:
--------------------------------------------------------------------------------
 1 | library(tidyverse)
 2 | library(lubridate)
 3 | library(RPostgres)
 4 | library(data.table)
 5 | 
 6 | # Connect to WRDS Server --------------------------------------------------
 7 | wrds <- dbConnect(Postgres(),
 8 |                   host = 'wrds-pgdata.wharton.upenn.edu',
 9 |                   port = 9737,
10 |                   user = '',
11 |                   password = '',
12 |                   dbname = 'wrds',
13 |                   sslmode = 'require')
14 | 
15 | # pull in the crsp variables we need for data after 1985
16 | crsp <- tbl(wrds, sql("SELECT date, cusip, permno, permco, issuno, hsiccd, prc, vol, ret, shrout \
17 |                   FROM crsp.dsf WHERE date >= '1985-01-01'")) %>% 
18 |   collect()
19 | 
20 | # make amihud liquidity scores
21 | crsp_amihud <- setDT(crsp)
22 | crsp_amihud <- crsp_amihud %>% 
23 |   # drop if volume is zero bc the score breaks down
24 |   .[vol != 0] %>% 
25 |   # make a year variable
26 |   .[, year := year(date)]
27 | 
28 | # set keys
29 | setkey(crsp_amihud, permno, year)
30 | 
31 | # calculate by permno year
32 | amihud <- crsp_amihud %>% 
33 |   .[, amihud := 1000 * sqrt(abs(ret)/(abs(prc) * vol))] %>% 
34 |   .[, .(amihud = mean(amihud, na.rm = TRUE)), keyby = .(permno, year)]
35 | 
36 | # save 
37 | saveRDS(amihud, here::here("Cleaned_Data", "amihud.rds"))


--------------------------------------------------------------------------------
/Code - No Passwords/MAKE_INST_SHARES.R:
--------------------------------------------------------------------------------
 1 | library(tidyverse)
 2 | library(lubridate)
 3 | library(RPostgres)
 4 | library(data.table)
 5 | 
 6 | # Connect to WRDS Server --------------------------------------------------
 7 | wrds <- dbConnect(Postgres(),
 8 |                   host = 'wrds-pgdata.wharton.upenn.edu',
 9 |                   port = 9737,
10 |                   user = '',
11 |                   password = '',
12 |                   dbname = 'wrds',
13 |                   sslmode = 'require')
14 | 
15 | # download institutional holdings 
16 | institutional_data <- tbl(wrds, sql("SELECT rdate, fdate, mgrno, shares, cusip, shrout2 \ 
17 |                                     FROM tfn.s34")) %>% 
18 |   # keep just the first report date per mgr/cusip/reporting date
19 |   group_by(rdate, mgrno, cusip) %>% 
20 |   filter(fdate == min(fdate, na.rm = TRUE)) %>% 
21 |   ungroup() %>% 
22 |   # sum shares over managers by reporting date, cusip
23 |   group_by(rdate, cusip) %>% 
24 |   summarize(inst_shares = sum(shares, na.rm = TRUE)) %>% 
25 |   ungroup() %>% 
26 |   collect() %>% 
27 |   drop_na()
28 | 
29 | # save 
30 | saveRDS(institutional_data, here::here("Cleaned_Data", "institutional_shares.rds"))


--------------------------------------------------------------------------------
/Code - No Passwords/SCRAPE_FF_INDS.R:
--------------------------------------------------------------------------------
 1 | # This code will scrape Ken French's industry definition file and make a data frame 
 2 | # with the industry groupings and SIC code range. This uses Siccodes12 but you can use 
 3 | # the other industry definitions he has up there just swap out in the url link.
 4 | 
 5 | library(tidyverse)
 6 | 
 7 | # url for the industry definitions
 8 | url <- "https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/Siccodes12.zip"
 9 | 
10 | # Download the data and unzip it
11 | f <- tempfile() 
12 | download.file(url, f) 
13 | inds <- read.delim(unz(f, "Siccodes12.txt"), header = FALSE)
14 | 
15 | # make table
16 | inds <- inds %>% 
17 |   # rename the data column
18 |   rename(name = V1) %>% 
19 |   # detect if it is an ID column
20 |   rowwise() %>% 
21 |   # remove the leading space
22 |   mutate(name = str_trim(name, side = "left")) %>% 
23 |   # detect if there are any letters in the name
24 |   mutate(name_id = ifelse(sum(str_detect(name, letters)) >= 1, 1, 0)) %>% 
25 |   # get the id numbers
26 |   mutate(ind_num = ifelse(name_id == 1, as.numeric(str_sub(name, 1, str_locate(name, " ")[1] - 1)), as.numeric(NA))) %>% 
27 |   # get the short name
28 |   mutate(ind_abbrev = ifelse(name_id == 1,
29 |                              str_sub(name, str_locate_all(name, " ")[[1]][1] + 1, 
30 |                                      str_locate_all(name, " ")[[1]][2] - 1), as.character(NA))) %>% 
31 |   # get the full name
32 |   mutate(ind_name = ifelse(name_id == 1,
33 |                            str_sub(name, str_locate_all(name, " ")[[1]][2] + 1,
34 |                                    nchar(name)), as.character(NA))) %>% 
35 |   # fill all the values down
36 |   ungroup() %>%
37 |   fill(ind_num) %>% fill(ind_abbrev) %>% fill(ind_name) %>%
38 |   # drop name id columns
39 |   group_by(ind_num) %>% 
40 |   add_tally %>%
41 |   filter(n == 1 | name_id == 0) %>%
42 |   # get low and high sic by row
43 |   rowwise() %>% 
44 |   mutate(sic_low = ifelse(name_id == 1, as.numeric(NA), as.numeric(str_sub(name, 1, str_locate(name, "-")[1] - 1))),
45 |          sic_high = ifelse(name_id == 1, as.numeric(NA), as.numeric(str_sub(name, str_locate(name, "-")[1] + 1), nchar(name)))) %>%
46 |   ungroup() %>%
47 |   select(ind_num, ind_abbrev, ind_name, sic_low, sic_high)
48 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Data-Cleans
 2 | This repo has code to do primary data cleaning for Compustat / Crsp from WRDS.
 3 | The included codes do the following data cleaning steps:
 4 | 
 5 | 1. MAKE CRSP_COMP_MERGED
 6 | 
 7 | This code takes the universe of annual compustat financial information and matches to CRSP with the CRSP-specific unique security identifier (permno). In addition, it creates a file that has the associated primary permno for each gvkey/date combination. 
 8 | 
 9 | 2. MAKE PERMNO_TO_GVKEY
10 | 
11 | This creates a file that has the associated compustat identifier (gvkey) for each permno/date combination. Because one firm might have multiple issued securities, this file should be longer than the unique gvkey/date combination file from CRSP_COMP_MERGED. In addition, there may be observations where there is a name and ticker associated with a permno, but where there is no compustat identifier (gvkey). These observations are retained, with gvkey being blank. 
12 | 
13 | 
14 | 3. FAMA_FRENCH_FACTORS
15 | 
16 | This code makes the portfolio based Fama French Factors and was contributed by Gertjan Verdickt.
17 | 
18 | 4. SCRAPE_FF_INDS
19 | 
20 | This code scrapes the Fama-French Industry definitions from Ken French's website. It uses the 12-industry FF definition, but you can swap out for any of the other industry classifications (e.g. 17, 48) in the url line.
21 | 
22 | 5. MAKE_AMIHUD_ILLIQIDUITY MEASURE
23 | 
24 | This code will make the Amihud (2002) Illiquidity measure by permno/calendar year (you can update it for different date ranges pretty easily if you so desire). This is measured as:
25 | 
26 | <img src="https://render.githubusercontent.com/render/math?math=AMIHUD_{iy} = \frac{\sum_{i = 1}^N 1000 \cdot \sqrt{\frac{\left|ret_{it}\right|}{\left| prc_{it} \right| \cdot vol_{it}}}}{N}">
27 | 
28 | 6. MAKE_INST_SHARES 
29 | 
30 | This code will calculate the number of shares held by institutional investors at the cusip/quarter level using Thomson Reuters 13F holdings data. 
31 | 


--------------------------------------------------------------------------------