├── .gitignore ├── code ├── 0multi_group_F.py ├── 1dyn_het_sims.py ├── 1multi_F_power.py ├── dgp.py ├── jupytext.toml ├── misc │ └── event_study_anscombe.py ├── plotters.py ├── saturated.py └── syncer.sh ├── figtab ├── hetfx.png ├── homfx.png ├── rejection_rates_F.png ├── rejection_rates_dyn.png ├── respecification_verify.png ├── respecification_verify_het.png ├── respecification_verify_hom.png ├── static_dynamic_effects.png └── true_functions.png ├── paper ├── appendix.typ ├── jmlr.typ ├── main.bib ├── main.pdf └── main.typ └── readme.md /.gitignore: -------------------------------------------------------------------------------- 1 | **/__pycache__/* 2 | *.csv 3 | *.dta 4 | *.xlsx 5 | *.ipynb_checkpoints 6 | .vscode/* 7 | *.log 8 | *.bbl 9 | *.bcf 10 | *.fls 11 | *.bcf 12 | *.run.xml 13 | **/*_cache/* 14 | *.shp 15 | *.shx 16 | *.qpj 17 | *.dbf 18 | *.RData 19 | *.Rds 20 | *.tab 21 | *.swp 22 | *.spq 23 | *.pqt 24 | *.geojson 25 | *.gpkg 26 | *.aux 27 | *.blg 28 | *.out 29 | *.synctex.gz 30 | *.ipynb 31 | *.pkl 32 | input/bischof_wagner/replication.pdf 33 | input/bischof_wagner/replication.Rmd 34 | -------------------------------------------------------------------------------- /code/0multi_group_F.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # text_representation: 5 | # extension: .py 6 | # format_name: percent 7 | # format_version: '1.3' 8 | # jupytext_version: 1.16.4 9 | # kernelspec: 10 | # display_name: py311 11 | # language: python 12 | # name: python3 13 | # --- 14 | 15 | # %% 16 | import matplotlib.pyplot as plt 17 | import numpy as np 18 | 19 | from dgp import panel_dgp_stagg 20 | from plotters import checkplot, diag_plot 21 | from saturated import test_treatment_heterogeneity 22 | 23 | np.random.seed(42) 24 | # %% 25 | num_periods = 30 26 | treatment_start_cohorts = [10, 15, 20] 27 | num_treated_units = [25_00, 50_00, 25_00] 28 | 29 | # effect functions 30 | 
treat_effect_vector_1 = np.log( 31 | 2 * np.arange(1, num_periods - treatment_start_cohorts[1] + 1) 32 | ) 33 | treat_effect_vector_1[8:] = 0 # switch off effects after a week 34 | base_treatment_effects = [ 35 | np.r_[ 36 | np.linspace(2, 0, num_periods - treatment_start_cohorts[0] - 10), 37 | np.repeat(0, 10), 38 | ], 39 | treat_effect_vector_1, 40 | np.sin( 41 | np.arange(1, num_periods - treatment_start_cohorts[2] + 1) 42 | ), # Treatment effect function for cohort 2 43 | ] 44 | 45 | sigma_i, sigma_t = 2, 1 46 | sigma_epsilon = 1 47 | dgp = panel_dgp_stagg( 48 | num_units=20_000, 49 | num_treated=num_treated_units, 50 | num_periods=num_periods, 51 | treatment_start_cohorts=treatment_start_cohorts, 52 | hetfx=False, 53 | base_treatment_effects=base_treatment_effects, 54 | sigma_unit=sigma_i, 55 | sigma_time=sigma_t, 56 | sigma_epsilon=sigma_epsilon, 57 | ) 58 | Y0, Y1, W, df = dgp["Y0"], dgp["Y1"], dgp["W"], dgp["dataframe"] 59 | 60 | # %% 61 | checkplot(df) 62 | plt.savefig("../figtab/respecification_verify_het.png") 63 | # %% 64 | diag_plot(df, treatment_start_cohorts, base_treatment_effects) 65 | pv = test_treatment_heterogeneity(df) 66 | print(pv) 67 | plt.savefig("../figtab/hetfx.png") 68 | 69 | # %% 70 | 71 | # %% [markdown] 72 | # ## homogeneous DGP 73 | 74 | # %% 75 | num_periods = 30 76 | treatment_start_cohorts = [10, 15, 20] 77 | num_treated_units = [25_00, 50_00, 25_00] 78 | 79 | base_treatment_effects = [ 80 | np.log(np.arange(1, num_periods - t + 1)) for t in treatment_start_cohorts 81 | ] 82 | 83 | # %% 84 | 85 | sigma_i, sigma_t = 2, 1 86 | sigma_epsilon = 1 87 | dgp_homog = panel_dgp_stagg( 88 | num_units=20_000, 89 | num_treated=num_treated_units, 90 | num_periods=num_periods, 91 | treatment_start_cohorts=treatment_start_cohorts, 92 | hetfx=False, 93 | base_treatment_effects=base_treatment_effects, 94 | sigma_unit=sigma_i, 95 | sigma_time=sigma_t, 96 | sigma_epsilon=sigma_epsilon, 97 | ) 98 | Y0_h, Y1_h, W_h, df_h = ( 99 | dgp_homog["Y0"], 100 
def diff_in_means(df, T, T0):
    """Static effect from a cross-sectional regression on post-period means.

    Rows with ``time >= T0`` are collapsed to one row per unit (mean of ``Y``,
    max of ``W``), ``Y`` is regressed on ``W`` with pyfixest, and the ``W``
    coefficient is repeated ``T - T0`` times so that it lines up with the
    per-period estimators.
    """
    post_rows = df.query(f"time >= {T0}")
    unit_level = post_rows.groupby("unit").agg({"Y": "mean", "W": "max"})
    coef_table = pf.feols("Y~W", unit_level).tidy()
    return np.repeat(coef_table.loc["W", "Estimate"], T - T0)


def cuped(df, T, T0):
    """Same as ``diff_in_means`` but adjusting for the pre-period mean outcome.

    The pre-period (``time < T0``) mean of ``Y`` per unit is merged in as the
    covariate ``ldv``; the returned array repeats the ``W`` coefficient from
    ``Y ~ W + ldv`` for each of the ``T - T0`` post periods.
    """
    post_rows = df.query(f"time >= {T0}")
    unit_level = post_rows.groupby("unit").agg({"Y": "mean", "W": "max"})
    pre_mean = df.query(f"time < {T0}").groupby("unit").Y.mean().rename("ldv")
    unit_level = unit_level.merge(pre_mean, left_index=True, right_index=True)
    coef_table = pf.feols("Y~W+ldv", unit_level).tidy()
    return np.repeat(coef_table.loc["W", "Estimate"], T - T0)


def twfe(df, T, T0):
    """Static two-way fixed-effects estimate, repeated for every post period.

    Fits ``Y ~ W`` with unit and time fixed effects on the full panel and
    returns the single ``W`` coefficient repeated ``T - T0`` times.
    """
    coef_table = pf.feols("Y~W | unit + time", df).tidy()
    w_coef = coef_table.loc["W", "Estimate"]
    return np.repeat(w_coef, T - T0)
def panel_diff_in_means(df, T, T0):
    """Per-period difference in mean outcomes between treated and control rows.

    Averages ``Y`` within each (``W``, ``time``) cell, subtracts the first
    ``W`` row from the second, and returns only the periods in which both
    groups are observed (i.e. drops periods where the difference is NaN).
    ``T`` and ``T0`` are accepted for signature parity with the other
    estimators and are not used.
    """
    cell_means = df.groupby(["W", "time"])["Y"].mean().unstack()
    gap = cell_means.iloc[1, :] - cell_means.iloc[0, :]
    return gap.dropna().values


def event_study(df, T, T0):
    """Dynamic two-way fixed-effects (event study) estimates for post periods.

    Interacts time dummies with an ever-treated indicator (reference period
    ``T0 - 1``) and returns the coefficients from position ``T0 - 1`` onward,
    which are the post-treatment ones because the reference period is dropped.

    The ever-treated indicator is added to a derived frame, so the caller's
    ``df`` is left unmodified.
    """
    ever_treated = df.groupby("unit")["W"].transform("max")
    panel = df.assign(ever_treated=ever_treated)
    m = pf.feols(f"Y ~ i(time, ever_treated, ref = {T0-1}) | unit + time", panel)
    return m.coef().iloc[(T0 - 1):].values


def sim_panel(
    base_effect,
    N=1_000_000,
    T=35,
    T0=15,
    sigma_list=None,
    hetfx=False,
    num_treated=None,
    rho=0.7,
    seed=42,
    debug=False,
):
    """Simulate a long-format panel with one adoption date ``T0``.

    Parameters
    ----------
    base_effect : array-like with ``T`` entries
        Treatment effect for each calendar period (pre-period entries are
        multiplied by ``W == 0`` and therefore never applied).
    N, T, T0 : int
        Number of units, number of periods, first treated period.
    sigma_list : sequence of 4 numbers, optional
        Standard deviations of (unit effects, time effects, unit-specific
        linear trends, residuals). Defaults to ``[5, 2, 0.01, 2]``.
    hetfx : bool
        If true, each unit's effect is scaled by a Uniform(0.5, 1.5) draw.
    num_treated : int, optional
        Exact number of treated units; if ``None`` each unit is treated
        with probability 0.5.
    rho : float
        AR(1) coefficient of the residuals.
    seed : int
        Seed passed to ``np.random.seed``.
    debug : bool
        If true return ``(Y, W, treatment_effect, df)`` instead of ``df``.

    Returns
    -------
    pandas.DataFrame with columns ``unit``, ``time``, ``Y``, ``W``.

    Raises
    ------
    ValueError
        If ``base_effect`` does not contain exactly ``T`` entries.
    """
    if sigma_list is None:
        sigma_list = [5, 2, 0.01, 2]
    if np.size(base_effect) != T:
        raise ValueError(
            f"base_effect must have T={T} entries, got {np.size(base_effect)}"
        )
    np.random.seed(seed)
    sigma_unit, sigma_time, sigma_tt, sigma_e = sigma_list
    # Long-format identifiers: unit-major ordering
    unit_ids = np.repeat(np.arange(N), T)
    time_ids = np.tile(np.arange(T), N)
    # Unit intercepts, time intercepts and unit-specific linear trends.
    # The order of the random draws is kept fixed so seeds stay reproducible.
    unit_fe = np.random.normal(0, sigma_unit, N)
    time_fe = np.random.normal(0, sigma_time, T)
    unit_tt = np.random.normal(0, sigma_tt, N)
    # Treatment assignment at the unit level, switched on from T0 onward
    if num_treated is None:
        W = np.random.binomial(1, 0.5, N)
    else:
        treated_units = np.random.choice(N, num_treated, replace=False)
        W = np.zeros(N)
        W[treated_units] = 1
    W = np.repeat(W, T)
    W = W * (time_ids >= T0)
    # Unit-level multipliers times the common effect path -> (N, T) matrix
    if hetfx:
        unit_effects = np.random.uniform(0.5, 1.5, N)
    else:
        unit_effects = np.ones(N)
    treatment_effect = np.outer(unit_effects, base_effect)
    # AR(1) residuals; the innovation scale keeps the variance at sigma_e**2
    residuals = np.zeros((N, T))
    residuals[:, 0] = np.random.normal(0, sigma_e, N)
    epsilon = np.random.normal(0, 1, (N, T - 1))
    factor = sigma_e * np.sqrt(1 - rho**2)
    for t in range(1, T):
        residuals[:, t] = rho * residuals[:, t - 1] + factor * epsilon[:, t - 1]
    # Outcome
    Y = (
        np.repeat(unit_fe, T)
        + np.repeat(unit_tt, T) * time_ids
        + treatment_effect.flatten() * W
        + np.tile(time_fe, N)
        + residuals.flatten()
    )

    df = pd.DataFrame({"unit": unit_ids, "time": time_ids, "Y": Y, "W": W})
    if debug:
        return Y, W, treatment_effect, df
    return df


def generate_treatment_effect(effect_type, T, T0, max_effect=1):
    """Return a length-``T`` effect path: ``T0`` zeros, then ``T - T0`` effects.

    ``effect_type`` selects the post-treatment shape: ``"constant"``,
    ``"linear"``, ``"concave"``, ``"positive_then_negative"``,
    ``"exponential"``, ``"sinusoidal"`` or ``"random_walk"`` (the latter draws
    from the global NumPy random state). Any other value raises ``ValueError``.
    """
    n_post = T - T0
    if effect_type == "constant":
        post = np.full(n_post, max_effect)
    elif effect_type == "linear":
        post = np.linspace(0, max_effect, n_post)
    elif effect_type == "concave":
        post = max_effect * 0.5 * np.log(2 * np.arange(1, n_post + 1) / n_post + 1)
    elif effect_type == "positive_then_negative":
        half_point = n_post // 2
        post = np.concatenate(
            [
                np.linspace(0, max_effect, half_point),
                np.linspace(max_effect, -max_effect, n_post - half_point),
            ]
        )
    elif effect_type == "exponential":
        post = max_effect * (1 - np.exp(-np.linspace(0, 5, n_post)))
    elif effect_type == "sinusoidal":
        post = max_effect * np.sin(np.linspace(0, 2 * np.pi, n_post))
    elif effect_type == "random_walk":
        post = max_effect * np.cumsum(np.random.randn(n_post))
    else:
        raise ValueError("Unknown effect type")
    # The pre-treatment zeros are shared by every shape
    return np.concatenate([np.zeros(T0), post])
def simulation_engine(
    effect_type, T, T0, max_effect, N, num_treated, sigma_list, hetfx, rho, seed
):
    """Simulate one panel and run the static and dynamic estimators on it.

    Returns a dict whose ``"true_effect"`` entry holds the post-treatment part
    of the true effect path and whose remaining entries are keyed by estimator
    function name (``diff_in_means``, ``twfe``, ``event_study``).
    """
    # True effect path; drawn before sim_panel reseeds the global RNG
    effect_vector = generate_treatment_effect(effect_type, T, T0, max_effect)
    df = sim_panel(
        effect_vector,
        N=N,
        T=T,
        T0=T0,
        sigma_list=sigma_list,
        hetfx=hetfx,
        num_treated=num_treated,
        rho=rho,
        seed=seed,
    )
    estimates = {"true_effect": effect_vector[T0:]}
    for estimator in (diff_in_means, twfe, event_study):
        estimates[estimator.__name__] = estimator(df, T, T0)
    return estimates


def f_test_stability(df, T0, vcv=None, dgp_type="", return_plot=True):
    """Wald test that every event-study coefficient equals the static estimate.

    Parameters
    ----------
    df : DataFrame with columns ``Y``, ``W``, ``time``, ``unit``.
        Not modified; the relative-time column is added to a derived frame.
    T0 : int
        First treated period.
    vcv : dict, optional
        Passed as ``vcov`` to the event-study fit. Defaults to
        ``{"CRV1": "unit"}``.
    dgp_type : str
        Plot title.
    return_plot : bool
        If false, return only the p-value.

    Returns
    -------
    The p-value when ``return_plot`` is false; otherwise the coefficient plot
    of both fits annotated with the p-value.
    """
    if vcv is None:
        vcv = {"CRV1": "unit"}
    # Relative time is 1, 2, ... on treated rows and 0 (the reference level)
    # on every untreated row.
    rel_time = (df["time"] - (T0) + 1).where(df["W"] == 1, 0)
    panel = df.assign(rel_time=rel_time)

    restricted = pf.feols("Y ~ i(W) | unit + time", panel)
    unrestricted = pf.feols("Y ~ i(rel_time, ref=0) | unit + time", panel, vcov=vcv)
    # Static estimate that each dynamic coefficient is tested against
    restricted_effect = restricted.coef().iloc[0]
    # R = identity: one restriction per event-study coefficient;
    # q repeats the static estimate for each restriction.
    n_evstudy_coefs = unrestricted.coef().shape[0]
    R = np.eye(n_evstudy_coefs)
    q = np.repeat(restricted_effect, n_evstudy_coefs)
    pv = unrestricted.wald_test(R=R, q=q, distribution="chi2")["pvalue"]
    if not return_plot:
        return pv
    plotout = pf.iplot(
        [restricted, unrestricted], coord_flip=False, figsize=(900, 400)
    ) + labs(
        title=f"{dgp_type}",
        subtitle=f"Stability Test p-value ={pv:.3f}",
        x="",
        y="",
    )
    return plotout
def run_single_simulation(
    effect_type, T, T0, max_effect, N, num_treated, sigma_list, hetfx, rho, seed
):
    """Simulate one panel for ``effect_type`` and return the stability p-value."""
    # The effect path is generated first, before sim_panel reseeds the RNG
    true_path = generate_treatment_effect(effect_type, T, T0, max_effect)
    panel_kwargs = dict(
        N=N,
        T=T,
        T0=T0,
        sigma_list=sigma_list,
        hetfx=hetfx,
        num_treated=num_treated,
        rho=rho,
        seed=seed,
    )
    panel = sim_panel(true_path, **panel_kwargs)
    return f_test_stability(panel, T0, return_plot=False)


def compute_power(n_sims=1000, n_jobs=-1):
    """Estimate the 5%-level rejection rate of the stability test per effect shape.

    For every effect type, ``n_sims`` simulations are run through joblib with
    seeds ``0 .. n_sims - 1``. Returns a dict keyed by effect type whose values
    hold ``"power"`` (share of p-values below 0.05) and the raw ``"p_values"``.
    """
    scenario_names = (
        "constant",
        "linear",
        "concave",
        "positive_then_negative",
        "exponential",
        "sinusoidal",
        "random_walk",
    )
    shared_kwargs = dict(
        N=50_000,
        num_treated=25_000,
        sigma_list=[5, 2, 0.01, 2],
        hetfx=False,
        rho=0.7,
        T=35,
        T0=15,
        max_effect=0.1,
    )

    summary = {}
    for name in scenario_names:
        # The iteration number doubles as the seed of each replication
        jobs = (
            delayed(run_single_simulation)(effect_type=name, seed=s, **shared_kwargs)
            for s in range(n_sims)
        )
        p_values = Parallel(n_jobs=n_jobs)(jobs)
        rejection_share = np.mean(np.array(p_values) < 0.05)
        summary[name] = {"power": rejection_share, "p_values": p_values}
    return summary
sharey=True) 403 | 404 | # Plot power 405 | results.plot(kind="bar", x="dgp", y="rejection_rate", ax=ax1) 406 | ax1.xaxis.set_tick_params(rotation=45) 407 | ax1.set_title("Rejection Rate by DGP") 408 | ax1.set_ylabel("rejection rate") 409 | ax1.set_xlabel("") 410 | ax1.axhline(0.05, color="r", linestyle="--", label="α=0.05") 411 | 412 | # Plot p-value distributions 413 | ax2.boxplot([r for r in results["p_values"]], labels=results["dgp"]) 414 | ax2.set_title("P-value Distributions") 415 | ax2.axhline(0.05, color="r", linestyle="--") 416 | ax2.xaxis.set_tick_params(rotation=45) 417 | plt.tight_layout() 418 | plt.savefig("../figtab/rejection_rates_dyn.png") 419 | -------------------------------------------------------------------------------- /code/1multi_F_power.py: -------------------------------------------------------------------------------- 1 | # %% 2 | import contextlib 3 | import io 4 | import warnings 5 | 6 | import matplotlib.pyplot as plt 7 | import numpy as np 8 | import pandas as pd 9 | from joblib import Parallel, delayed 10 | from tqdm import tqdm 11 | 12 | np.random.seed(42) 13 | # %% 14 | from dgp import panel_dgp_stagg 15 | from saturated import test_treatment_heterogeneity 16 | 17 | # %% 18 | 19 | # ▄▄▄▄ ██ 20 | # ██▀▀▀ ▀▀ 21 | # ▄█████▄ ▄████▄ ██▄████▄ ███████ ████ ▄███▄██ ▄▄█████▄ 22 | # ██▀ ▀ ██▀ ▀██ ██▀ ██ ██ ██ ██▀ ▀██ ██▄▄▄▄ ▀ 23 | # ██ ██ ██ ██ ██ ██ ██ ██ ██ ▀▀▀▀██▄ 24 | # ▀██▄▄▄▄█ ▀██▄▄██▀ ██ ██ ██ ▄▄▄██▄▄▄ ▀██▄▄███ █▄▄▄▄▄██ 25 | # ▀▀▀▀▀ ▀▀▀▀ ▀▀ ▀▀ ▀▀ ▀▀▀▀▀▀▀▀ ▄▀▀▀ ██ ▀▀▀▀▀▀ 26 | # ▀████▀▀ 27 | 28 | 29 | num_periods = 30 30 | treatment_start_cohorts = [10, 15, 20] 31 | num_treated_units = [25_00, 50_00, 25_00] 32 | 33 | 34 | configs = [ 35 | { 36 | "name": "homogeneous", # homogeneous effects 37 | "base_treatment_effects": lambda t: [ 38 | np.log(np.arange(1, num_periods - t + 1)) for t in treatment_start_cohorts 39 | ], 40 | }, 41 | { 42 | "name": "log_vs_linear_vs_sin", # original heterogeneous case 43 | "base_treatment_effects": lambda 
def plot_true_functions(
    treatment_start_cohorts,
    base_treatment_effects,
    title,
    ax,
):
    """Draw each cohort's true effect path against relative time on ``ax``.

    Every cohort's effect vector is left-padded with as many zeros as the last
    cohort's start period. Padded positions are labelled ``..., -3, -2`` and
    effect positions ``0, 1, ...`` so that relative time ``-1`` is skipped.
    One line is drawn per cohort, plus a vertical marker at ``-1`` and a
    horizontal line at zero.
    """
    pad_len = treatment_start_cohorts[-1]
    true_fns = {}
    for c, s in enumerate(treatment_start_cohorts):
        effect_vector_padded = np.pad(base_treatment_effects[c], (pad_len, 0))
        # x-axis values that skip -1
        x_values = np.arange(len(effect_vector_padded))
        x_values = np.where(
            x_values >= pad_len,
            x_values - pad_len,
            x_values - pad_len - 1,
        )
        true_fns[f"cohort_{s}"] = pd.Series(effect_vector_padded, index=x_values)

    cmp = plt.get_cmap("viridis", len(true_fns))
    for i, v in enumerate(true_fns.values()):
        ax.plot(v, color=cmp(i), marker=".")
    ax.axvline(-1, color="black", linestyle="--")
    ax.axhline(0, color="black", linestyle=":")
    ax.set_title(title)


@contextlib.contextmanager
def suppress_stdout():
    """Context manager that captures ``stdout`` into an in-memory buffer.

    Yields the ``io.StringIO`` buffer so callers can inspect what was printed.
    """
    buffer = io.StringIO()
    with contextlib.redirect_stdout(buffer):
        yield buffer
def single_simulation(
    config, treatment_start_cohorts, num_periods, num_treated_units, seed=42
):
    """Simulate one staggered panel for ``config`` and return the test result.

    Printed output and warnings raised while simulating and testing are
    silenced. The value returned is whatever ``test_treatment_heterogeneity``
    returns for the simulated dataframe.
    """
    with suppress_stdout(), warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # Build the cohort effect paths from the config's callable
        cohort_effects = config["base_treatment_effects"](treatment_start_cohorts)
        simulated = panel_dgp_stagg(
            num_units=20_000,
            num_treated=num_treated_units,
            num_periods=num_periods,
            treatment_start_cohorts=treatment_start_cohorts,
            hetfx=False,
            base_treatment_effects=cohort_effects,
            sigma_unit=2,
            sigma_time=1,
            sigma_epsilon=1,
            seed=seed,
        )
        return test_treatment_heterogeneity(simulated["dataframe"])


def power_analysis(
    n_sims=1000,
    dgp_configs=configs,
    alpha=0.05,
    n_jobs=-1,
):
    """Rejection rate of the heterogeneity test for every DGP configuration.

    Each configuration is simulated ``n_sims`` times in parallel (seeds
    ``0 .. n_sims - 1``, with a progress bar). Returns a DataFrame with one
    row per configuration: ``dgp``, ``rejection_rate`` (share of p-values
    below ``alpha``) and the raw ``pvalues``.
    """
    rows = []
    for cfg in dgp_configs:
        seeds = tqdm(range(n_sims), desc=f"Running {cfg['name']}")
        pvalues = Parallel(n_jobs=n_jobs)(
            delayed(single_simulation)(
                cfg, treatment_start_cohorts, num_periods, num_treated_units, seed=s
            )
            for s in seeds
        )
        share_rejected = np.mean(np.array(pvalues) < alpha)
        rows.append(
            {
                "dgp": cfg["name"],
                "rejection_rate": share_rejected,
                "pvalues": pvalues,
            }
        )
    return pd.DataFrame(rows)
def panel_dgp_stagg(
    num_units=100,
    num_periods=30,
    num_treated=(50,),
    treatment_start_cohorts=(15,),
    sigma_unit=1,
    sigma_time=0.5,
    sigma_epsilon=0.2,
    hetfx=False,
    base_treatment_effects=(0.1 * np.log(np.arange(1, 30 - 15 + 1)),),
    return_dataframe=True,
    ar_coef=0.8,
    seed=42,
):
    """Simulate a panel with staggered adoption across several cohorts.

    Parameters
    ----------
    num_units, num_periods : int
        Panel dimensions.
    num_treated : sequence of int
        Number of treated units per cohort (cohorts are disjoint).
    treatment_start_cohorts : sequence of int
        First treated period of each cohort.
    sigma_unit, sigma_time, sigma_epsilon : float
        Standard deviations of the unit effects, the time-effect innovations
        and the idiosyncratic AR(1) innovations.
    hetfx : bool
        If true, each unit's effect path is scaled by a Uniform(0.5, 1.5) draw.
    base_treatment_effects : sequence of array-like
        One effect path per cohort, with ``num_periods - start`` entries
        (or broadcastable to that length).
    return_dataframe : bool
        If true, add a long-format ``"dataframe"`` entry to the result.
    ar_coef : float
        AR(1) coefficient of the idiosyncratic noise.
    seed : int
        Seed passed to ``np.random.seed``.

    Returns
    -------
    dict with ``Y1``, ``Y0`` (N x T arrays), ``W`` (boolean N x T),
    ``unit_intercepts``, ``time_intercepts`` and, optionally, ``dataframe``
    with columns ``unit_id``, ``time_id``, ``W_it``, ``Y_it``,
    ``unit_intercept``, ``time_intercept``.

    Raises
    ------
    ValueError
        If an effect path cannot be broadcast to its cohort's post-period
        length, or if more units are requested than remain untreated.
    """
    np.random.seed(seed)
    # unit FEs
    unit_intercepts = np.random.normal(0, sigma_unit, num_units)
    # time FEs: 7-period cycle (last two entries largest) + AR(1) component
    day_effects = np.array([-0.1, 0.1, 0, 0, 0.1, 0.5, 0.5])
    day_pattern = np.tile(day_effects, num_periods // 7 + 1)[:num_periods]
    ar_coef_time = 0.2
    ar_noise_time = np.random.normal(0, sigma_time, num_periods)
    time_intercepts = np.zeros(num_periods)
    time_intercepts[0] = ar_noise_time[0]
    for t in range(1, num_periods):
        time_intercepts[t] = ar_coef_time * time_intercepts[t - 1] + ar_noise_time[t]
    time_intercepts = day_pattern + time_intercepts - np.mean(time_intercepts)
    # unit-level AR(1) noise
    ar_noise = np.random.normal(0, sigma_epsilon, (num_units, num_periods))
    noise = np.zeros((num_units, num_periods))
    noise[:, 0] = ar_noise[:, 0]
    for t in range(1, num_periods):
        noise[:, t] = ar_coef * noise[:, t - 1] + ar_noise[:, t]
    # N x T matrix of potential outcomes under control
    Y0 = unit_intercepts[:, np.newaxis] + time_intercepts[np.newaxis, :] + noise
    # unit-level effect multipliers
    if hetfx:
        heterogeneous_multipliers = np.random.uniform(0.5, 1.5, num_units)
    else:
        heterogeneous_multipliers = np.ones(num_units)

    treated_units = np.array([], dtype=int)
    treatment_status = np.zeros((num_units, num_periods), dtype=bool)
    treatment_effect = np.zeros((num_units, num_periods))
    for cohort_idx, (treatment_start, num_treated_cohort) in enumerate(
        zip(treatment_start_cohorts, num_treated)
    ):
        # Effect path for this cohort, broadcast to its post-period length
        base_treatment_effect = np.broadcast_to(
            np.ravel(base_treatment_effects[cohort_idx]),
            (num_periods - treatment_start,),
        )
        # Random assignment among units not yet claimed by an earlier cohort
        cohort_treated_units = np.random.choice(
            np.setdiff1d(np.arange(num_units), treated_units),
            num_treated_cohort,
            replace=False,
        )
        treated_units = np.concatenate((treated_units, cohort_treated_units))
        treatment_status[cohort_treated_units, treatment_start:] = True
        # Only the treated rows are needed, so build just those
        # (multiplier_i * effect_t) instead of looping over every unit.
        treatment_effect[cohort_treated_units, treatment_start:] += (
            heterogeneous_multipliers[cohort_treated_units, np.newaxis]
            * base_treatment_effect[np.newaxis, :]
        )

    # Potential outcome under treatment
    Y1 = Y0.copy()
    Y1[treatment_status] += treatment_effect[treatment_status]

    result = {
        "Y1": Y1,
        "Y0": Y0,
        "W": treatment_status,
        "unit_intercepts": unit_intercepts,
        "time_intercepts": time_intercepts,
    }

    if return_dataframe:
        unit_ids = np.repeat(np.arange(num_units), num_periods)
        time_ids = np.tile(np.arange(num_periods), num_units)
        W_it = treatment_status.flatten().astype(int)
        # Observed outcome: Y1 where treated, Y0 otherwise
        Y_it = np.where(W_it, Y1.flatten(), Y0.flatten())
        result["dataframe"] = pd.DataFrame(
            {
                "unit_id": unit_ids,
                "time_id": time_ids,
                "W_it": W_it,
                "Y_it": Y_it,
                "unit_intercept": np.repeat(unit_intercepts, num_periods),
                "time_intercept": np.tile(time_intercepts, num_units),
            }
        )
    return result


def generate_treatment_effect(effect_type, T, T0, max_effect=1):
    """Return the ``T - T0`` post-treatment effects for the requested shape.

    Supported ``effect_type`` values: ``"constant"``, ``"linear"``,
    ``"concave"``, ``"positive_then_negative"``, ``"exponential"``,
    ``"sinusoidal"`` and ``"random_walk"`` (draws from the global NumPy random
    state). Any other value raises ``ValueError``.
    """
    n_post = T - T0
    if effect_type == "constant":
        return np.full(n_post, max_effect)
    if effect_type == "linear":
        return np.linspace(0, max_effect, n_post)
    if effect_type == "concave":
        return max_effect * np.log(2 * np.arange(1, n_post + 1) / n_post + 1)
    if effect_type == "positive_then_negative":
        half_point = n_post // 2
        return np.concatenate(
            [
                np.linspace(0, max_effect, half_point),
                np.linspace(max_effect, -max_effect, n_post - half_point),
            ]
        )
    if effect_type == "exponential":
        return max_effect * (1 - np.exp(-np.linspace(0, 5, n_post)))
    if effect_type == "sinusoidal":
        return max_effect * np.sin(np.linspace(0, 2 * np.pi, n_post))
    if effect_type == "random_walk":
        return max_effect * np.cumsum(np.random.randn(n_post))
    raise ValueError("Unknown effect type")
np.exp(-np.linspace(0, 5, T - T0))) 131 | elif effect_type == "sinusoidal": 132 | return max_effect * np.sin(np.linspace(0, 2 * np.pi, T - T0)) 133 | elif effect_type == "random_walk": 134 | return max_effect * np.cumsum(np.random.randn(T - T0)) 135 | else: 136 | raise ValueError("Unknown effect type") 137 | -------------------------------------------------------------------------------- /code/jupytext.toml: -------------------------------------------------------------------------------- 1 | # Pair ipynb notebooks to py:percent text notebooks 2 | formats = "ipynb,py:percent" 3 | -------------------------------------------------------------------------------- /code/misc/event_study_anscombe.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # text_representation: 5 | # extension: .py 6 | # format_name: percent 7 | # format_version: '1.3' 8 | # jupytext_version: 1.16.4 9 | # kernelspec: 10 | # display_name: py311 11 | # language: python 12 | # name: python3 13 | # --- 14 | 15 | # %% [markdown] id="cdis85xCuhbr" 16 | # # Anscombe's quartet for longitudinal experiments 17 | 18 | # %% id="hL511B-3olXp" 19 | import numpy as np 20 | import pandas as pd 21 | import matplotlib.pyplot as plt 22 | import plotnine as p9 23 | import pyfixest as pf 24 | 25 | # %matplotlib inline 26 | 27 | # %% id="I33hyIWRosou" 28 | def generate_panel_data( 29 | N, 30 | T, 31 | K, 32 | unit_fac_lb=-0.2, 33 | unit_fac_ub=0.2, 34 | time_fac_lb=-0.1, 35 | time_fac_ub=0.1, 36 | sigma=0.1, 37 | trend_sigma=0.01, 38 | ): 39 | F, L = ( 40 | np.random.uniform(time_fac_lb, time_fac_ub, (T, K)), 41 | np.random.uniform(unit_fac_lb, unit_fac_ub, (N, K)), 42 | ) 43 | time_trends = np.random.normal(0, trend_sigma, (N, 1)) * np.arange(T).reshape(1, T) 44 | epsilon = np.random.normal(0, sigma, (N, T)) 45 | Y = np.dot(L, F.T) + epsilon + time_trends 46 | return Y, L 47 | 48 | 49 | # %% 50 | def generate_quartet(N=1000, T=20, T0=10, 
T1=15, K=3, **kwargs): 51 | np.random.seed(42) 52 | 53 | # Generate baseline panel data 54 | Y, _ = generate_panel_data(N, T, K, **kwargs) 55 | 56 | # Treatment effects 57 | group_1_effect = 0 58 | group_2_effect = np.random.normal(0, 0.25, N // 2) 59 | group_3_effect = np.linspace(-0.5, 0.5, T - T0) 60 | group_4_effect_T0 = np.linspace(-0.25, 0.25, T - T0) 61 | group_4_effect_T1 = np.linspace(-0.75, 0.75, T - T1) 62 | 63 | # Base data 64 | data = pd.DataFrame( 65 | { 66 | "unit": np.repeat(range(N), T), 67 | "time": np.tile(range(T), N), 68 | "outcome": Y.flatten(), 69 | } 70 | ) 71 | 72 | # Randomly assign half the units to treatment 73 | treated_units = np.random.choice(N, N // 2, replace=False) 74 | 75 | # Scenario 1: Constant treatment effect (now zero) 76 | data1 = data.copy() 77 | data1["treated"] = data1["unit"].isin(treated_units) & (data1["time"] >= T0) 78 | data1.loc[data1["treated"], "outcome"] += group_1_effect 79 | 80 | # Scenario 2: Heterogeneous treatment effects across subgroups (zero mean) 81 | data2 = data.copy() 82 | data2["treated"] = data2["unit"].isin(treated_units) & (data2["time"] >= T0) 83 | for i, unit in enumerate(treated_units): 84 | data2.loc[ 85 | (data2["unit"] == unit) & (data2["time"] >= T0), "outcome" 86 | ] += group_2_effect[i] 87 | 88 | # Scenario 3: Heterogeneous treatment effects over time (zero mean) 89 | data3 = data.copy() 90 | data3["treated"] = data3["unit"].isin(treated_units[: N // 2]) & ( 91 | data3["time"] >= T0 92 | ) 93 | for t in range(T0, T): 94 | data3.loc[ 95 | (data3["treated"]) & (data3["time"] == t), "outcome" 96 | ] += group_3_effect[t - T0] 97 | 98 | # Scenario 4: Heterogeneous treatment effects over time and across cohorts (zero mean) 99 | data4 = data.copy() 100 | data4["treated_T0"] = data4["unit"].isin(treated_units[: N // 4]) & ( 101 | data4["time"] >= T0 102 | ) 103 | data4["treated_T1"] = data4["unit"].isin(treated_units[N // 4 : N // 2]) & ( 104 | data4["time"] >= T1 105 | ) 106 | data4["treated"] = 
def calculate_ate(data, t0=None):
    """Return the post-treatment difference in mean outcomes (treated - control).

    Only rows with ``time >= t0`` are used. Within that window, rows flagged
    ``treated`` form the treated group and all remaining rows form the control
    group.

    Args:
        data: Long-format panel with ``time``, ``treated`` and ``outcome``
            columns. ``treated`` may be boolean or a 0/1 integer indicator.
        t0: First post-treatment period. When omitted, the notebook-level
            global ``T0`` is used so existing ``calculate_ate(data)`` calls
            keep working.

    Returns:
        Mean treated outcome minus mean control outcome (NaN if either group
        is empty).
    """
    if t0 is None:
        # Backward-compatible fallback to the notebook's global adoption period.
        t0 = T0
    post = data["time"] >= t0
    # Coerce to bool: `~` on a 0/1 integer column is a bitwise NOT (-1/-2), which
    # would silently select the wrong control rows.
    is_treated = data["treated"].astype(bool)
    treated_mean = data.loc[post & is_treated, "outcome"].mean()
    control_mean = data.loc[post & ~is_treated, "outcome"].mean()
    return treated_mean - control_mean
plot_data = pd.concat(plot_data_list, ignore_index=True) 163 | # Create the plot 164 | plot = ( 165 | p9.ggplot(plot_data) 166 | + p9.aes(x="time", y="outcome", group="factor(unit)", color="factor(treated)") 167 | + p9.geom_line(alpha=0.7) 168 | + p9.geom_vline(xintercept=T0, linetype="dashed", color="green", alpha=0.7) 169 | + p9.geom_vline( 170 | data=plot_data[plot_data["scenario"] == scenarios[3]], 171 | xintercept=T1, 172 | linetype="dashed", 173 | color="purple", 174 | alpha=0.7, 175 | ) 176 | + p9.geom_hline(yintercept=0, linetype="dashed", color="black", alpha=0.7) 177 | + p9.facet_wrap("~ scenario", scales="free_y", ncol=2) 178 | + p9.theme_matplotlib() 179 | + p9.theme( 180 | legend_position="none", 181 | figure_size=(10, 8), 182 | ) 183 | + p9.labs(x="Time", y="Outcome") 184 | + p9.scale_color_manual(values=["blue", "red"]) 185 | ) 186 | plot = plot + p9.ggtitle( 187 | "Raw outcomes for 30 units from four DGPs\nCross-sectional ATE=0 for all four" 188 | ) 189 | plot 190 | 191 | # %% [markdown] id="8T_jARzZpgR4" 192 | # ## Event Study with `fixest` 193 | 194 | # %% 195 | from saturated import saturated_event_study 196 | 197 | # %% colab={"base_uri": "https://localhost:8080/", "height": 927} id="z-vu9bLro00j" outputId="86f5baac-2b6b-4c19-9594-9b1c5a867480" 198 | N, T, T0, T1 = 1000, 20, 10, 15 199 | data1, data2, data3, data4 = generate_quartet(N, T, T0, T1, trend_sigma=0.1, sigma=0.2) 200 | 201 | # %% 202 | f, ax = plt.subplots(2, 2, figsize=(8, 6), sharex=True, sharey=True) 203 | ax = ax.flatten() 204 | saturated_event_study(data1, "outcome", "treated", "time", "unit", ax=ax[0]) 205 | saturated_event_study(data2, "outcome", "treated", "time", "unit", ax=ax[1]) 206 | saturated_event_study(data3, "outcome", "treated", "time", "unit", ax=ax[2]) 207 | saturated_event_study(data4, "outcome", "treated", "time", "unit", ax=ax[3]) 208 | ax[0].axvline(-1, color="black", linestyle="--") 209 | ax[1].axvline(-1, color="black", linestyle="--") 210 | ax[2].axvline(-1, 
def mini_panelview(data, unit, time, treat):
    """Build a compact unit-by-time matrix of distinct treatment paths.

    The long panel is pivoted to wide form (one row per unit, one column per
    period), duplicate treatment paths are collapsed to their first occurrence,
    and the remaining rows are ordered by total treated periods (ascending).

    Args:
        data: Long-format panel.
        unit: Name of the unit identifier column.
        time: Name of the time identifier column.
        treat: Name of the treatment indicator column.

    Returns:
        Wide DataFrame with one row per distinct treatment path.
    """
    wide = data.pivot(index=unit, columns=time, values=treat).drop_duplicates()
    # Least-exposed paths first, so the matrix plots as a "staircase".
    row_order = wide.sum(axis=1).sort_values().index
    return wide.loc[row_order]
(treatment_start_cohorts[-1], 0), 42 | ) 43 | 44 | # Create x-axis values that skip -1 45 | x_values = np.arange(len(effect_vector_padded)) 46 | x_values = np.where( 47 | x_values >= treatment_start_cohorts[-1], 48 | x_values - treatment_start_cohorts[-1], 49 | x_values - treatment_start_cohorts[-1] - 1, 50 | ) 51 | true_fns[f"cohort_{s}"] = pd.Series( 52 | {x: y for x, y in zip(x_values, effect_vector_padded)} 53 | ) 54 | 55 | true_event_study = pd.concat(true_fns).reset_index() 56 | true_event_study.columns = ["cohort", "rel_time", "true_effect"] 57 | true_event_study = true_event_study.groupby("rel_time")["true_effect"].mean() 58 | f, ax = plt.subplots(4, 1, figsize=figdim) 59 | cmp = plt.get_cmap("Set1") 60 | i = 0 61 | for k, v in true_fns.items(): 62 | ax[0].plot(v, color=cmp(i), marker=".") 63 | i += 1 64 | ax[0].axvline(-1, color="black", linestyle="--") 65 | ax[0].axhline(0, color="black", linestyle=":") 66 | ax[0].set_title("True treatment effect functions") 67 | 68 | event_time = ( 69 | res.index.str.extract(r"\[T\.(-?\d+\.\d+)\]").astype(float).values.flatten() 70 | ) 71 | 72 | ax[1].plot(event_time, res["Estimate"], marker=".", label="2wfe", color=cmp(1)) 73 | ax[1].fill_between( 74 | event_time, 75 | res["2.5%"], 76 | res["97.5%"], 77 | alpha=0.2, 78 | color=cmp(1), 79 | ) 80 | ax[1].plot(true_event_study, color="black", label="true", marker=".") 81 | ax[1].axvline(-1, color="black", linestyle="--") 82 | ax[1].axhline(0, color="black", linestyle=":") 83 | ax[1].set_title("Pooled event study \n 2WFE") 84 | ax[1].legend() 85 | 86 | # saturated 87 | _ = saturated_event_study( 88 | df, 89 | outcome="Y_it", 90 | treatment="W_it", 91 | unit_id="unit_id", 92 | time_id="time_id", 93 | ax=ax[2], 94 | ) 95 | ax[2].set_title("Saturated event study \n cohort X time interactions + 2WFE") 96 | 97 | treat_quilt = mini_panelview( 98 | df, 99 | unit="unit_id", 100 | time="time_id", 101 | treat="W_it", 102 | ) 103 | ax[3].imshow(treat_quilt, aspect="auto", 
def checkplot(df):
    """Plot the saturated event study next to the re-specified heterogeneity model.

    Fits the model returned by ``test_treatment_heterogeneity(df, retmod=True)``
    (pooled event-study terms plus cohort-specific deviation terms) and draws
    three stacked panels sharing the x-axis:

    1. the saturated cohort-by-time event study,
    2. the pooled (baseline) coefficients and every cohort's deviation from them,
    3. the implied cohort-level effects (baseline + deviation).

    Cohorts are read from the fitted coefficient names, so any adoption schedule
    is handled rather than only cohorts starting in periods 15 and 20.

    Args:
        df: Long panel with ``Y_it``, ``W_it``, ``unit_id`` and ``time_id`` columns.

    Returns:
        None. The figure is created with ``plt.subplots``.
    """
    model = test_treatment_heterogeneity(df, retmod=True)
    coefs = model.tidy().reset_index()
    # Interaction terms contain a ":" separating the relative-time part from the
    # cohort dummy; pooled terms have no ":" and get a missing cohort.
    coefs[["time", "cohort"]] = coefs.Coefficient.str.split(":", expand=True)
    coefs["time"] = coefs.time.str.extract(
        r"\[T\.(-?\d+\.\d+)\]", expand=False
    ).astype(float)
    coefs["cohort"] = coefs.cohort.str.extract(r"(\d+)", expand=False)

    def _curve(rows):
        # Estimates indexed by relative time, ready to pass to ``ax.plot``.
        return rows[["Estimate", "time"]].set_index("time").iloc[:, 0]

    baseline = _curve(coefs[coefs.cohort.isna()])
    # One deviation curve per cohort present in the model, in coefficient order.
    deviations = {
        cohort: _curve(coefs[coefs.cohort == cohort])
        for cohort in coefs.cohort.dropna().unique()
    }

    f, ax = plt.subplots(3, 1, figsize=(12, 9), sharex=True)
    # vanilla event study
    saturated_event_study(
        df,
        outcome="Y_it",
        treatment="W_it",
        time_id="time_id",
        unit_id="unit_id",
        ax=ax[0],
    )
    ax[0].set_title("Saturated event study")

    # cohort interactions
    ax[1].set_title("Cohort deviation coefficients relative to first cohort")
    ax[1].plot(baseline, label="Cohort 0", marker=".")
    for k, deviation in enumerate(deviations.values(), start=1):
        ax[1].plot(deviation, label=f"Cohort {k}", marker=".")
    ax[1].axvline(-0.5, color="black", linestyle="--", alpha=0.5)
    ax[1].axhline(0, color="black", linestyle=":", alpha=0.5)

    # combined: pandas aligns on relative time when adding the two series
    ax[2].set_title("Aggregate cohort effects (Baseline + cohort deviations)")
    ax[2].plot(baseline, label="Cohort 0", marker=".")
    for k, deviation in enumerate(deviations.values(), start=1):
        ax[2].plot(deviation + baseline, label=f"Cohort {k}", marker=".")
    ax[2].axvline(-0.5, color="black", linestyle="--", alpha=0.5)
    ax[2].axhline(0, color="black", linestyle=":", alpha=0.5)
    ax[2].legend()
ax.plot(v["time"], v["est"]["Estimate"], marker=".", label=k, color=cmp(i)) 56 | ax.fill_between( 57 | v["time"], v["est"]["2.5%"], v["est"]["97.5%"], alpha=0.2, color=cmp(i) 58 | ) 59 | i += 1 60 | ax.axvline(-1, color="black", linestyle="--") 61 | ax.axhline(0, color="black", linestyle=":") 62 | return m 63 | 64 | 65 | def test_treatment_heterogeneity( 66 | df: pd.DataFrame, 67 | outcome: str = "Y_it", 68 | treatment: str = "W_it", 69 | unit_id: str = "unit_id", 70 | time_id: str = "time_id", 71 | retmod: bool = False, 72 | ): 73 | # Get treatment timing info 74 | df = df.merge( 75 | df.assign(first_treated_period=df[time_id] * df[treatment]) 76 | .groupby(unit_id)["first_treated_period"] 77 | .apply(lambda x: x[x > 0].min()), 78 | on=unit_id, 79 | ) 80 | df["rel_time"] = df[time_id] - df["first_treated_period"] 81 | df["first_treated_period"] = ( 82 | df["first_treated_period"].replace(np.nan, 0).astype("int") 83 | ) 84 | df["rel_time"] = df["rel_time"].replace(np.nan, np.inf) 85 | # Create dummies but drop TWO cohorts - one serves as base for pooled effects 86 | cohort_dummies = pd.get_dummies( 87 | df.first_treated_period, drop_first=True, prefix="cohort_dummy" 88 | ).iloc[ 89 | :, 1: 90 | ] # drop an additional cohort - drops interactions for never treated and baseline 91 | 92 | df_int = pd.concat([df, cohort_dummies], axis=1) 93 | 94 | # Modified formula with base effects + cohort-specific deviations 95 | ff = f""" 96 | {outcome} ~ 97 | i(rel_time, ref=-1.0) + 98 | {'+'.join([f"i(rel_time, {x}, ref = -1.0)" for x in df_int.filter(like = "cohort_dummy", axis = 1).columns])} 99 | | {unit_id} + {time_id} 100 | """ 101 | 102 | model = pf.feols(ff, df_int, vcov={"CRV1": unit_id}) 103 | P = model.coef().shape[0] 104 | 105 | if retmod: 106 | return model 107 | mmres = model.tidy().reset_index() 108 | mmres[["time", "cohort"]] = mmres.Coefficient.str.split(":", expand=True) 109 | mmres["time"] = mmres.time.str.extract(r"\[T\.(-?\d+\.\d+)\]").astype(float) 110 | 
def test_dynamics(
    df,
    outcome="Y",
    treatment="W",
    time_id="time",
    unit_id="unit",
    vcv=None,
):
    """Wald test of the static TWFE restriction against the event-study model.

    Fits a restricted model with a single treatment coefficient and an
    unrestricted model with one coefficient per relative-time period, then
    tests the joint hypothesis that every unrestricted coefficient equals the
    restricted estimate.

    Args:
        df: Long-format panel.
        outcome: Outcome column name.
        treatment: Treatment indicator column name.
        time_id: Time identifier column name.
        unit_id: Unit identifier column name.
        vcv: pyfixest ``vcov`` specification for the unrestricted model.
            Defaults to cluster-robust (CRV1) errors clustered on ``unit_id``.

    Returns:
        The p-value of the chi-squared Wald test.
    """
    if vcv is None:
        # Cluster on whichever unit column the caller named. The old default
        # always clustered on "unit", and as a dict default it was one object
        # shared by every call.
        vcv = {"CRV1": unit_id}

    # First treated period per unit; NaN for units that are never treated.
    # NOTE(review): the `x > 0` filter means a unit first treated in period 0
    # would not be detected -- confirm time indices never start treatment at 0.
    first_treated = (
        df.assign(first_treated_period=df[time_id] * df[treatment])
        .groupby(unit_id)["first_treated_period"]
        .apply(lambda x: x[x > 0].min())
    )
    df = df.merge(first_treated, on=unit_id)
    df["rel_time"] = df[time_id] - df["first_treated_period"]
    # Never-treated units get rel_time = inf so they form their own level.
    df["rel_time"] = df["rel_time"].replace(np.nan, np.inf)

    # Only the point estimate of the restricted model is used, so its vcov
    # is left at the library default.
    restricted = pf.feols(f"{outcome} ~ i({treatment}) | {unit_id} + {time_id}", df)
    unrestricted = pf.feols(
        f"{outcome} ~ i(rel_time, ref=0) | {unit_id} + {time_id}", df, vcov=vcv
    )
    restricted_effect = restricted.coef().iloc[0]

    # R = I tests each event-study coefficient individually against q, the
    # restricted estimate repeated once per coefficient.
    n_evstudy_coefs = unrestricted.coef().shape[0]
    R = np.eye(n_evstudy_coefs)
    q = np.repeat(restricted_effect, n_evstudy_coefs)
    pv = unrestricted.wald_test(R=R, q=q, distribution="chi2")["pvalue"]
    return pv
-------------------------------------------------------------------------------- /figtab/hetfx.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apoorvalal/TestingInEventStudies/67fcce76736e78863a53da9696b405ab31037672/figtab/hetfx.png -------------------------------------------------------------------------------- /figtab/homfx.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apoorvalal/TestingInEventStudies/67fcce76736e78863a53da9696b405ab31037672/figtab/homfx.png -------------------------------------------------------------------------------- /figtab/rejection_rates_F.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apoorvalal/TestingInEventStudies/67fcce76736e78863a53da9696b405ab31037672/figtab/rejection_rates_F.png -------------------------------------------------------------------------------- /figtab/rejection_rates_dyn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apoorvalal/TestingInEventStudies/67fcce76736e78863a53da9696b405ab31037672/figtab/rejection_rates_dyn.png -------------------------------------------------------------------------------- /figtab/respecification_verify.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apoorvalal/TestingInEventStudies/67fcce76736e78863a53da9696b405ab31037672/figtab/respecification_verify.png -------------------------------------------------------------------------------- /figtab/respecification_verify_het.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apoorvalal/TestingInEventStudies/67fcce76736e78863a53da9696b405ab31037672/figtab/respecification_verify_het.png 
-------------------------------------------------------------------------------- /figtab/respecification_verify_hom.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apoorvalal/TestingInEventStudies/67fcce76736e78863a53da9696b405ab31037672/figtab/respecification_verify_hom.png -------------------------------------------------------------------------------- /figtab/static_dynamic_effects.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apoorvalal/TestingInEventStudies/67fcce76736e78863a53da9696b405ab31037672/figtab/static_dynamic_effects.png -------------------------------------------------------------------------------- /figtab/true_functions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apoorvalal/TestingInEventStudies/67fcce76736e78863a53da9696b405ab31037672/figtab/true_functions.png -------------------------------------------------------------------------------- /paper/appendix.typ: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apoorvalal/TestingInEventStudies/67fcce76736e78863a53da9696b405ab31037672/paper/appendix.typ -------------------------------------------------------------------------------- /paper/jmlr.typ: -------------------------------------------------------------------------------- 1 | #let jmlr( 2 | title: [], 3 | authors: (), 4 | abstract: [], 5 | keywords: (), 6 | bibliography: none, 7 | appendix: none, 8 | date: none, 9 | body, 10 | ) = { 11 | // Extract affls if provided in the specific format 12 | let affls = () 13 | if authors.len() == 2 and type(authors) == array { 14 | (authors, affls) = authors 15 | } 16 | 17 | // Basic document setup 18 | set document(title: title) 19 | set page( 20 | paper: "us-letter", 21 | margin: (left: 1.0in, right: 1.0in, top: 1.0in, 
bottom: 1.0in), 22 | numbering: "1", 23 | ) 24 | 25 | // Basic text settings 26 | set text(font: ("P052",), size: 11pt) 27 | set par(leading: 0.55em, first-line-indent: 17pt, justify: true) 28 | set heading(numbering: "1.1 ") 29 | 30 | // Set citation style to dark red 31 | // set cite(style: "chicago-author-date") 32 | show cite: set text(fill: rgb(139, 0, 0)) // Dark red color for citations 33 | 34 | // Make all links dark red with no underline 35 | show link: set text(fill: rgb(139, 0, 0)) 36 | 37 | // Title 38 | align(center)[ 39 | #block(text(size: 14pt, weight: "bold", title)) 40 | #v(1em) 41 | ] 42 | 43 | // Authors 44 | for author in authors { 45 | align(center)[ 46 | #text(weight: "bold", author.name) 47 | #if "affl" in author and author.affl in affls { 48 | let affl = affls.at(author.affl) 49 | if "department" in affl { 50 | linebreak() 51 | emph(affl.department) 52 | } 53 | } 54 | #if "email" in author and author.email != "" { 55 | linebreak() 56 | link("mailto:" + author.email, author.email) 57 | } 58 | ] 59 | v(0.5em) 60 | } 61 | 62 | // Abstract 63 | if abstract != [] { 64 | v(1em) 65 | align(center)[*Abstract*] 66 | block( 67 | width: 100%, 68 | inset: (x: 2em), 69 | abstract 70 | ) 71 | } 72 | 73 | // Keywords 74 | if keywords != () { 75 | v(0.5em) 76 | block( 77 | width: 100%, 78 | inset: (x: 2em), 79 | [*Keywords:* #keywords.join(", ")] 80 | ) 81 | } 82 | 83 | v(2em) 84 | 85 | // Main body 86 | body 87 | 88 | // Appendix 89 | if appendix != none { 90 | pagebreak() 91 | heading(numbering: "A.1", [Appendix]) 92 | counter(heading).update(0) 93 | appendix 94 | } 95 | 96 | // Bibliography 97 | if bibliography != none { 98 | pagebreak() 99 | heading([References]) 100 | bibliography 101 | } 102 | } 103 | 104 | // Simplest possible theorem function 105 | #let theorem(body) = { 106 | block( 107 | fill: rgb(240, 240, 240), 108 | inset: 1em, 109 | radius: 4pt, 110 | [*Theorem.* #body] 111 | ) 112 | } 113 | 114 | // Simplest possible proof function 115 | #let 
proof(body) = { 116 | block( 117 | inset: 1em, 118 | [*Proof.* #body #h(1fr) #sym.square.stroked] 119 | ) 120 | } 121 | -------------------------------------------------------------------------------- /paper/main.bib: -------------------------------------------------------------------------------- 1 | @ARTICLE{Arkhangelsky2023-rf, 2 | title = {Causal Models for Longitudinal and Panel Data: A Survey}, 3 | author = {Arkhangelsky, D and Imbens, G}, 4 | journaltitle = {SSRN Electronic Journal}, 5 | date = {2023-11-26}, 6 | url = {http://www.nber.org/papers/w31942.pdf} 7 | } 8 | 9 | @inproceedings{currie2020technology, 10 | title={Technology and big data are changing economics: Mining text to track methods}, 11 | author={Currie, Janet and Kleven, Henrik and Zwiers, Esm{\'e}e}, 12 | booktitle={AEA Papers and Proceedings}, 13 | volume={110}, 14 | pages={42--48}, 15 | year={2020}, 16 | organization={American Economic Association 2014 Broadway, Suite 305, Nashville, TN 37203} 17 | } 18 | 19 | @book{lehmann2005testing, 20 | address = {New York}, 21 | author = {Lehmann, E. L. 
and Romano, Joseph P.}, 22 | edition = {Third}, 23 | publisher = {Springer}, 24 | series = {Springer Texts in Statistics}, 25 | title = {Testing statistical hypotheses}, 26 | year = 2005 27 | } 28 | 29 | @ARTICLE{Ding2019-nr, 30 | title = {Decomposing Treatment Effect Variation}, 31 | author = {Ding, Peng and Feller, Avi and Miratrix, Luke}, 32 | journaltitle = {Journal of the American Statistical Association}, 33 | publisher = {Taylor \& Francis}, 34 | volume = {114}, 35 | issue = {525}, 36 | pages = {304--317}, 37 | date = {2019-01-02}, 38 | url = {https://doi.org/10.1080/01621459.2017.1407322} 39 | } 40 | 41 | @ARTICLE{Callaway2021-gv, 42 | title = {Difference-in-Differences with multiple time periods}, 43 | author = {Callaway, Brantly and Sant'Anna, Pedro H C}, 44 | journaltitle = {Journal of econometrics}, 45 | publisher = {Elsevier BV}, 46 | volume = {225}, 47 | issue = {2}, 48 | pages = {200--230}, 49 | date = {2021-12-01}, 50 | url = {https://www.sciencedirect.com/science/article/pii/S0304407620303948}, 51 | language = {en} 52 | } 53 | 54 | 55 | @ARTICLE{weiss2024much, 56 | title={How Much Should We Trust Modern Difference-in-Differences Estimates?}, 57 | author={Weiss, Amanda}, 58 | year={2024}, 59 | journal={socarxiv preprint}, 60 | institution={Center for Open Science} 61 | } 62 | 63 | 64 | @article{chiu2023and, 65 | title={What to do (and not to do) with causal panel analysis under parallel trends: Lessons from a large reanalysis study}, 66 | author={Chiu, Albert and Lan, Xingchen and Liu, Ziyi and Xu, Yiqing}, 67 | journal={arXiv preprint arXiv:2309.15983}, 68 | year={2023} 69 | } 70 | 71 | @ARTICLE{Schmidheiny2023-of, 72 | title = {On event studies and distributed‐lags in two‐way fixed effects 73 | models: Identification, equivalence, and generalization}, 74 | author = {Schmidheiny, Kurt and Siegloch, Sebastian}, 75 | journaltitle = {Journal of applied econometrics (Chichester, England)}, 76 | publisher = {Wiley}, 77 | volume = {38}, 78 | issue = {5}, 
79 | pages = {695--713}, 80 | date = {2023-08}, 81 | url = {https://www.schmidheiny.name/research/docs/schmidheiny-siegloch_2020-11.pdf}, 82 | language = {en} 83 | } 84 | 85 | 86 | @ARTICLE{Abraham2020-wu, 87 | title = {Estimating Dynamic Treatment Effects in Event Studies with 88 | Heterogeneous Treatment Effects}, 89 | author = {Abraham, Sarah and Sun, Liyang}, 90 | journaltitle = {Journal of econometrics}, 91 | date = {2020} 92 | } 93 | 94 | @book{angrist2009mostly, 95 | title={Mostly harmless econometrics: An empiricist's companion}, 96 | author={Angrist, Joshua D and Pischke, J{\"o}rn-Steffen}, 97 | year={2009}, 98 | publisher={Princeton university press} 99 | } 100 | 101 | @article{rambachan2023more, 102 | title={A more credible approach to parallel trends}, 103 | author={Rambachan, Ashesh and Roth, Jonathan}, 104 | journal={Review of Economic Studies}, 105 | volume={90}, 106 | number={5}, 107 | pages={2555--2591}, 108 | year={2023}, 109 | publisher={Oxford University Press US} 110 | } 111 | 112 | @article{lechner2011estimation, 113 | title={The estimation of causal effects by difference-in-difference methods}, 114 | author={Lechner, Michael}, 115 | journal={Foundations and Trends in Econometrics}, 116 | volume={4}, 117 | number={3}, 118 | pages={165--224}, 119 | year={2011}, 120 | publisher={Now Publishers, Inc.} 121 | } 122 | 123 | @ARTICLE{Wooldridge2021-op, 124 | title = {Two-Way Fixed Effects, the Two-Way Mundlak Regression, and 125 | Difference-in-Differences Estimators}, 126 | author = {Wooldridge, Jeffrey M}, 127 | journaltitle = {Working paper}, 128 | date = {2021-08-17}, 129 | url = {http://dx.doi.org/}, 130 | urldate = {2021-08-16} 131 | } 132 | 133 | @article{lal2024large, 134 | title={Large Scale Longitudinal Experiments: Estimation and Inference}, 135 | author={Lal, Apoorva and Fischer, Alexander and Wardrop, Matthew}, 136 | journal={arXiv preprint arXiv:2410.09952}, 137 | year={2024} 138 | } 139 | 140 | @ARTICLE{Roth2022-sz, 141 | title = 
{What's trending in difference-in-differences? A synthesis of 142 | the recent econometrics literature}, 143 | author = {Roth, Jonathan and Sant'Anna, Pedro H C and Bilinski, Alyssa 144 | and Poe, John}, 145 | journaltitle = {Journal of Econometrics}, 146 | date = {2023-01-04}, 147 | eprinttype = {arXiv}, 148 | eprintclass = {econ.EM}, 149 | urldate = {2023-01-01} 150 | } 151 | 152 | 153 | @ARTICLE{Goldsmith-Pinkham2024-ef, 154 | title = {Contamination bias in linear regressions}, 155 | author = {Goldsmith-Pinkham, Paul S and Hull, Peter and Kolesár, Michal}, 156 | journaltitle = {American Economic Review}, 157 | date = {2024}, 158 | language = {en} 159 | } 160 | 161 | @ARTICLE{De_Chaisemartin2021-ln, 162 | title = {Two-way fixed effects and differences-in-differences with 163 | heterogeneous treatment effects: A survey}, 164 | author = {de Chaisemartin, Clément and D'Haultfœuille, Xavier}, 165 | date = {2021-12-08}, 166 | url = {https://papers.ssrn.com/abstract=3980758}, 167 | urldate = {2021-12-08} 168 | } 169 | 170 | @ARTICLE{Goodman-Bacon2021-ys, 171 | title = {Difference-in-differences with variation in treatment timing}, 172 | author = {Goodman-Bacon, Andrew}, 173 | journaltitle = {Journal of econometrics}, 174 | date = {2021-06-12}, 175 | url = {https://www.sciencedirect.com/science/article/pii/S0304407621001445} 176 | } 177 | 178 | @ARTICLE{De_Chaisemartin2020-za, 179 | title = {Two-way fixed effects estimators with heterogeneous treatment 180 | effects}, 181 | author = {de Chaisemartin, Clément and D'Haultfœuille, Xavier}, 182 | journaltitle = {The American economic review}, 183 | date = {2020}, 184 | eprinttype = {arXiv}, 185 | eprintclass = {econ.EM}, 186 | url = {http://arxiv.org/abs/1803.08807} 187 | } 188 | 189 | @ARTICLE{Imbens2021-zw, 190 | title = {Double-Robust Identification for Causal Panel Data Models}, 191 | author = {{Imbens} and {Arkhangelsky}}, 192 | journaltitle = {NBER}, 193 | date = {2021}, 194 | url = 
{https://www.nber.org/system/files/working_papers/w28364/w28364.pdf} 195 | } 196 | -------------------------------------------------------------------------------- /paper/main.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apoorvalal/TestingInEventStudies/67fcce76736e78863a53da9696b405ab31037672/paper/main.pdf -------------------------------------------------------------------------------- /paper/main.typ: -------------------------------------------------------------------------------- 1 | #import "jmlr.typ": jmlr, theorem, proof 2 | #let authors = ( 3 | (name: "Apoorva Lal", affl: "one", email: ""), 4 | ) 5 | #let affls = ( 6 | one: ( 7 | department: "Netflix, Los Gatos, CA", 8 | ), 9 | ) 10 | #set math.equation(numbering: "(1)") 11 | #show: jmlr.with( 12 | title: [When can we get away with using the two-way fixed effects regression?], 13 | authors: (authors, affls), 14 | abstract: " 15 | The use of the two-way fixed effects regression in empirical social science was historically motivated by folk wisdom that it uncovers the average treatment effect on the treated (ATT). This has come under scrutiny recently due to recent results in applied econometrics showing that it fails to uncover meaningful averages of heterogeneous treatment effects in the presence of effect heterogeneity over time and across adoption cohorts, and several heterogeneity-robust alternatives have been proposed. However, these estimators often have higher variance and are therefore under-powered for many applications, which poses a bias-variance tradeoff that is challenging for researchers to navigate. In this paper, we propose simple tests of linear restrictions that can be used to test for differences in dynamic treatment effects over cohorts, which allows us to test for when the two-way fixed effects regression is likely to yield biased estimates of the ATT. 
These tests are implemented as methods in the pyfixest python library. 16 | ", 17 | keywords: ("difference in differences", "panel data", "heterogeneous treatment effects"), 18 | bibliography: bibliography("main.bib"), 19 | appendix: none, 20 | date: datetime( 21 | year: 2025, 22 | month: 03, 23 | day: 15, 24 | ) 25 | 26 | ) 27 | 28 | 29 | 30 | 31 | = Introduction 32 | 33 | Difference-in-Differences and event-studies are now the most popular tools for estimating causal effects in observational settings thanks to the growing importance of administrative and other offline data sources (@currie2020technology). Their popularity arises from their broad applicability to settings with selection on unobservable unit-specific factors, and straightforward implementation as a two-way fixed effects linear regression in the two-period case. As with many empirical techniques, however, practice outstrips theory, and the extension of the equivalence between fixed-effects regression and the non-parametric estimator beyond the two-period, two-cohort case turned out to be more complicated than previously realized, which has prompted an explosion of alternative estimators that are robust to the 'contamination bias' introduced into the two-way fixed-effects regression by treatment effect heterogeneity. Since these newer estimators either trim the data or add many parameters to zero out the bias, they tend to have considerably higher variance, which introduces a bias-variance tradeoff that is challenging to navigate for practitioners. In this paper, we propose the application of 34 | classical joint-tests of appropriately parametrized linear regressions as a tool to aid practitioners in navigating the bias-variance tradeoff in difference-in-differences settings. 35 | 36 | = Methodology 37 | 38 | Consider a balanced panel-data setting with $i = 1, ..., N$ individuals observed over $t = 1, ..., T$ time periods.
For each unit $i$, a binary treatment $w_(i t) := 1(t >= g_(i))$ is assigned at some adoption time $g_(i) in cal(G)$ where $cal(G) := [T] union {infinity}$ is the set of treatment adoption times and $g_i = infinity$ indicates a never-treated unit. We observe a scalar outcome $y_(i t) = w_(i t) y^(1)_(i t) + (1-w_(i t)) y^(0)_(i t)$, where $y^(1)_(i t)$ and $y^(0)_(i t)$ are potential outcomes under treatment and control, respectively 39 | #footnote[Defining potential outcomes as $y^(w)_(i t)$ is a strong but common assumption; it requires no carryover - that the outcome for unit $i$ at time $t$ is only influenced by $i$'s current-period treatment and not treatment history. Alternative estimators such as Marginal Structural Models (MSMs) and dynamic panel models permit estimation in the presence of carryover under different strong assumptions but are considerably more computationally challenging, and as such are used infrequently.]. The following two-way fixed effects regression 40 | 41 | $ 42 | y_(i t) = tau w_(i t) + alpha_i + lambda_t + epsilon_(i t) 43 | $ 44 | 45 | is a workhorse regression in applied economics and adjacent fields for the estimation of causal effects in such settings. The estimand that researchers typically seek to estimate in panel data settings is the Average Treatment effect on the Treated (ATT) ($EE[y^(1)_(i t) - y^(0)_(i t) | w_(i t) = 1]$), and researchers often interpret the coefficient on the treatment indicator, $hat(tau)$, as an estimate of the ATT. 46 | The above regression's dynamic ('event study') counterpart is 47 | 48 | $ 49 | y_(i t) = sum_(s != -1)^(T) gamma_s Delta_(i t)^s + alpha_i + lambda_t + epsilon_(i t) 50 | $ 51 | 52 | where $Delta_(i t)^s$ is an indicator for the $s$-th period relative to the adoption time for treated units (which in turn is the first-difference of the treatment indicator, @Schmidheiny2023-of).
The presence of leads and lags of the switching indicator in this regression allows us to interpret the coefficients on lags as estimates of the dynamic ATT (@angrist2009mostly ch 5) and the coefficients on leads as a visual check of the validity of the parallel trends assumption. This practice is widespread in applied research but tends to be distortionary and has low power (@rambachan2023more). 53 | 54 | When $g_i in {T_0, infinity}$ (one-shot adoption), the above regressions yield unbiased estimates of the ATT under the assumption of parallel trends and no anticipation (@lechner2011estimation). However, when $g_i in {T_0, ..., T-1}$ (staggered adoption), the above regressions exhibit the 'negative weighting'/'contamination bias' problem (@Goodman-Bacon2021-ys, @De_Chaisemartin2020-za, @Goldsmith-Pinkham2024-ef). As in the cross-sectional case, the regression coefficient on the treatment indicator, $hat(tau)$, is a weighted average of the treatment effects over time and across treated cohorts, where the weights are functions of the conditional variance in the treatment. Unlike in the cross-sectional case, however, these weights can be negative for some cohorts, which yields the conclusion that the two-way fixed effects regression can fail to uncover meaningful averages of heterogeneous treatment effects over time and across adoption cohorts#footnote[In particular, this constitutes a violation of the 'no-sign reversal property': $hat(tau)$ can be positive even if the treatment effect is strictly negative for each $(g,t)$ (@De_Chaisemartin2021-ln).]. The same is true for the event study coefficient vector $bold(gamma)$ (@Abraham2020-wu). 55 | 56 | This has prompted an explosion of research in applied econometrics on new estimators that aim to uncover the ATT in the presence of heterogeneous treatment effects over time and across adoption cohorts (see @De_Chaisemartin2021-ln, @Roth2022-sz, @Arkhangelsky2023-rf for reviews).
Such heterogeneity-robust estimators typically involve estimating the ATT separately for each cohort using tailored comparisons between each treated cohort and either a never-treated or not-yet-treated group, and then averaging (optionally weighted by inverse-propensity weights, e.g. @Callaway2021-gv) these estimates to obtain an overall estimate of the ATT. While their consistency properties for the ATT are well understood and they avoid the negative weighting problem by construction, they are often computationally expensive and have higher variance than the two-way fixed effects regression. 57 | 58 | This poses a practical bias-variance tradeoff for researchers: while the two-way fixed effects regression is computationally simple and has low variance, it may yield biased estimates of the ATT in the presence of heterogeneous treatment effects over time and across adoption cohorts. In contrast, heterogeneity-robust estimators are computationally expensive and have higher variance, but they are consistent for the ATT in the presence of heterogeneous treatment effects over time and across adoption cohorts. As a practical matter, a large re-analysis of published work in political science by @chiu2023and finds that heterogeneity-robust estimators rarely overturn the conclusions of the two-way fixed effects regression, and typically have considerably larger variance. Similarly, @weiss2024much finds that most new heterogeneity-robust estimators are underpowered for realistic effect sizes in the state-level US setting where difference-in-differences approaches are commonly used. 59 | 60 | This motivates the primary focus of this paper: to develop simple tests that can be used to test for differences in dynamic treatment effects over cohorts, which allows us to test for when the two-way fixed effects regression is likely to yield biased estimates of the ATT.
Heuristically, if the dynamic treatment effects are homogeneous over cohorts, then the two-way fixed effects regression is likely to yield unbiased estimates of the ATT that are considerably more precise than alternative estimators that typically discard more data in order to shut down the negative weighting problem. 61 | 62 | To build intuition for this approach, consider @homfx and @hetfx. In @homfx, there are three adoption cohorts (plus a never-treated cohort - bottom panel), and all cohorts exhibit the same temporal heterogeneity pattern (the effect function is $log(t)$ - top panel), and so the 2WFE event study (blue line in panel 2) is consistent for the true dynamic ATT (black line in panel 2). We can also consistently estimate the cohort-level ATTs with an appropriately saturated regression (@Abraham2020-wu, @Wooldridge2021-op) as shown in the third panel. In @hetfx, in contrast, we have the same three adoption cohorts, but the three cohorts exhibit radically different temporal heterogeneity: the first exhibits a linear decay down to zero, the second exhibits a log increase followed by zero, and the third exhibits sinusoidal effects. In this case, the 2WFE event study (blue line in panel 2) is not consistent for the true dynamic ATT (black line in panel 2); in fact, the estimated event study suggests a violation of the parallel trend assumption despite the treatments being randomized and thus parallel trends being true in the DGP, which is a pernicious side-effect of the negative weights problem. We can still estimate the cohort-level ATTs correctly with a saturated regression. The key insight is that testing for differences between a 'pooled' event study (the blue line in the second panel) and cohort X time interactions (that yield the cohort-level estimates in the third panel) can help us distinguish between the two scenarios. This can be formulated as a joint F-test on the coefficients of the cohort X time interactions in a saturated regression. 
We provide a formal statement of this test in the next section, and show through simulation studies that this approach can detect cohort-level temporal heterogeneity in a variety of DGPs. 63 | 64 | #figure( 65 | image("../figtab/homfx.png", width: 100%), 66 | caption: [ 67 | true and estimated effects from pooled and saturated event study regressions with homogeneous treatment effects across three cohorts. Joint test p-value = 0.11 68 | ], 69 | ) 70 | 71 | 72 | #figure( 73 | image("../figtab/hetfx.png", width: 100%), 74 | caption: [ 75 | true and estimated effects from pooled and saturated event study regressions in a DGP with heterogeneous treatment effects across three cohorts. Joint test p-value = 0.000 76 | ], 77 | ) 78 | 79 | 80 | = Methodology 81 | 82 | Tests considered in the following section take the form of traditional joint tests of multiple linear restrictions, where the null hypothesis is that $bold(R) bold(beta) = bold(q)$ where $bold(R)$ is a $m times k$ matrix of linear restrictions, $bold(beta)$ is a $k times 1$ vector of coefficients, and $q$ is a $m times 1$ vector of constants. The test statistic is then 83 | 84 | $ 85 | F = frac( 86 | (bold(R) hat(beta) - bold(q))' 87 | [bold(R) hat(bb(V)) bold(R)']^(-1) 88 | (bold(R) hat(beta) - bold(q)), 89 | m) 90 | ~ F(m, n-k) " under the null hypothesis" 91 | $ 92 | 93 | where $hat(bb(V))$ is the cluster-robust variance-covariance matrix of the coefficient estimates. 94 | We consider two tests: one for testing for event study dynamics, and one for testing for heterogeneity in event study dynamics. These tests are both classical Wald tests for linear restrictions and are asymptotically equivalent to the Likelihood Ratio test and Lagrange Multiplier test. The test is optimal (most powerful) in the class of invariant tests for local alternatives when errors are normally distributed (@lehmann2005testing lemma 8.5.2). 
95 | #footnote[This can be implemented using either a $chi^2$ or $F$ test; the two differ only by a degrees-of-freedom correction that becomes negligible for realistic sample sizes.] 96 | 97 | == Testing for event study dynamics 98 | 99 | As a warmup, consider a simple comparison between @statictwfe and @eventstudy. The latter decomposes the ATT across time-periods. For the purposes of testing for event study dynamics, we only care about comparing the dynamic treatment effects after the treatment is assigned (${gamma_t}_(t=0)^T$) against the common ATT estimate $hat(tau)$. We can test the following null hypothesis 100 | $ 101 | H_0: gamma_t = hat(tau) " for all " t >= 0 102 | $ 103 | 104 | by specifying $bold(R) = bold(I)_(T_1)$ as a $T_1 times T_1$ identity matrix and $bold(q) = (hat(tau), ..., hat(tau))'$ as a $T_1$-vector of the restricted estimate ($hat(tau)$ from @statictwfe), where $T_1$ is the number of post-treatment coefficients being tested. 105 | #footnote[ 106 | This can equivalently be formulated by testing for the equality of adjacent elements of $bold(gamma)$, e.g. $gamma_1 = gamma_2$, by specifying $bold(R)$ that contains rows like $[1, -1, 0, ..., 0]$ and $bold(q) = [0, ..., 0]$. 107 | ] 108 | 109 | == Testing for across-cohort heterogeneity in dynamic treatment effects 110 | 111 | Next, we extend the approach outlined above to construct a test for across-cohort heterogeneity in dynamic treatment effects.
A conventional method to estimate the cohort-level ATTs is to estimate the dynamic treatment effects separately for each cohort and then average these estimates to obtain an overall estimate of the ATT (@Abraham2020-wu, @Wooldridge2021-op, @lal2024large), which involves specifying the following regression 112 | 113 | $ 114 | y_(i t) = alpha_i + lambda_t + 115 | underbrace( 116 | sum_(c in cal(C) \\ {infinity}) sum_(s != -1)^(T) bb(1)(g_i = c) tau^(s c) Delta_(i t)^s, 117 | "Cohort-Time Interactions") 118 | + epsilon_(i t) 119 | $ 120 | 121 | This is a saturated event study that constructs cohort $times$ time interactions for each adoption cohort (with the never-treated cohort, $g_i = infinity$, omitted) and therefore recovers the cohort-level event studies. These coefficients are reported in the third panel in @homfx and @hetfx, and correctly uncover the true cohort-level ATTs in the presence of arbitrary heterogeneous treatment effects across cohorts (top panel). The downsides of this approach, however, are twofold. First, these regressions can get unwieldy with many cohorts, and the number of parameters grows linearly with the number of cohorts. Second, the cohort-level ATTs are self-contained and therefore constructing a test for equality across multiple cohorts is not straightforward. Instead, one may re-specify the saturated event-study regression @satevent as follows: 122 | 123 | $ 124 | y_(i t) = alpha_i + lambda_t + 125 | underbrace(sum_(s != -1)^(T) gamma_s Delta_(i t)^s, "(a) Common event study coefficients") 126 | + 127 | underbrace(sum_(c in cal(C)) sum_(s != -1)^(T) delta_(c s) Delta_(i t)^(c s), "(b) Cohort-specific deviations") 128 | + epsilon_(i t) 129 | $ 130 | 131 | @jointreg returns estimates of the cohort-level dynamic ATT that are numerically identical to those from @satevent, but it allows us to test for differences in dynamic treatment effects over cohorts more easily.
This is because @jointreg contains a common event study coefficient vector (a), and cohort-level deviations (b). The (b) terms can be jointly tested against the null of zero, which serves as a direct test of cohort-level treatment effect heterogeneity relative to a traditional event study. This approach is similar to omnibus tests of effect heterogeneity in cross-sectional RCTs proposed by @Ding2019-nr, in which testing the joint null of $gamma = 0$ in the interacted regression $y ~ tau W + X beta + W X gamma + epsilon$ serves as a test for explained effect heterogeneity. We illustrate an application of this test in @respec, where the top panel reports the saturated event study @satevent, the middle panel reports the coefficients from the re-specified model @jointreg, and the bottom panel reports the sum of the common event study and cohort-specific deviations, which reproduces the saturated event study estimates exactly. 132 | 133 | #set align(left) 134 | #figure( 135 | grid( 136 | columns: (260pt, 260pt), 137 | [ #image("../figtab/respecification_verify_hom.png", width: 120%) ], 138 | [ #image("../figtab/respecification_verify_het.png", width: 120%) ], 139 | ), 140 | caption: [ 141 | For each DGP (homogeneous - @homfx - on the left and heterogeneous - @hetfx - on the right), the top panel illustrates the traditional event study estimates from eqn @satevent, which are unbiased for the true effects. The middle panel plots the re-specified model, which plots an overall event study (first cohort: blue) and subsequent cohort deviations (second and third cohorts - which are null in the homogeneous DGP). The final panel plots the sum of the blue and cohort-specific coefficients, which reproduces the event study coefficients from the first panel exactly. 142 | ], 143 | ) 144 | 145 | 146 | We show in the next section that this test is consistent for the null hypothesis of homogeneous dynamic treatment effects over cohorts, and that it has power against a variety of alternatives.
As a concrete example, the joint $p$-value for the cohort $times$ time interactions in @homfx is $0.11$, while the joint $p$-value for the cohort $times$ time interactions in @hetfx is $0.000$. Thus, we can reject the null hypothesis of homogeneous dynamic treatment effects in @hetfx but not in @homfx, which is consistent with the underlying DGPs. In the next section, we show through simulation studies that this test has good power to detect across-cohort heterogeneity in dynamic treatment effects in a variety of DGPs. 147 | 148 | = Simulation Studies 149 | 150 | == Testing for event study dynamics 151 | 152 | To begin, we perform simulation studies to examine the properties of the testing procedure described in @test_dyn. We consider the simple setting with a single adoption cohort where the treatment effects follow one of the following seven DGPs visualised in @static_dyn. 153 | 154 | #figure( 155 | image("../figtab/static_dynamic_effects.png", width: 105%), 156 | caption: [ 157 | true treatment effect functions and estimates from difference in means, static, and dynamic two-way fixed effects regressions. The treatment effect is truly stationary in the first DGP and varies over time in the others. 158 | ], 159 | ) 160 | 161 | The first DGP has constant effects over time, while the others have varying degrees of temporal heterogeneity. We simulate 1000 replications of the data for each DGP, and compute the rejection rate of the joint test for dynamic treatment effects outlined in the previous section. We report the rejection rate and p-value distribution in @rejrates_dyn. We find that the rejection rate for the constant DGP (null) is under the nominal level of $alpha = 0.05$, while the rejection rates for the other DGPs are considerably higher. The rejection rate for concave effects is considerably lower, although this is likely due to the fact that the treatment effects do actually tail off in later time periods and the static effect captures this well.
162 | 163 | #figure( 164 | image("../figtab/rejection_rates_dyn.png", width: 105%), 165 | caption: [ 166 | Rejection rates over 1000 replications for the joint test of dynamic treatment effects using an F-test in DGPs from @static_dyn 167 | ], 168 | ) 169 | 170 | 171 | == Testing for across-cohort heterogeneity in dynamic treatment effects 172 | 173 | Next, we perform simulation studies to examine the properties of the testing procedure described in @test_het. Here, we consider seven different DGPs with homogeneous and heterogeneous treatment effect functions across cohorts as illustrated in @truefns. In addition to the two DGPs described in the previous section, we consider DGPs with heterogeneity that applies a scalar multiplier to the concave (log) effect function in @homfx with 'small' and 'large' differences; a DGP with 'selection on gains' where the cohort with the largest treatment effect adopts first; a DGP with 'novelty effects' where the treatment effect is large for the first few periods and then diminishes; and finally a DGP with 'activity bias' where the treatment effect is immediate and large for the earliest adopting cohort and much more gradual for the others. Among all these DGPs, the homogeneous and novelty effects DGPs have homogeneous treatment effects across cohorts, while all others have heterogeneous treatment effects across cohorts. 174 | 175 | For each DGP, we simulate 1000 replications of the data, and compute the rejection rate of the joint test for cohort-level coefficients outlined in the previous section. We report the rejection rate and p-value distribution in @rejrates. We find that the rejection rate for the homogeneous DGP (null) is under the nominal level of $alpha = 0.05$, while the rejection rates for heterogeneous DGPs are close to 1. This suggests that the test has good power to detect across-cohort heterogeneity in dynamic treatment effects.
176 | 177 | #figure( 178 | image("../figtab/true_functions.png", width: 80%), 179 | caption: [ 180 | true cohort level effect functions for homogeneous and heterogeneous treatment effects across three cohorts. Earliest-treated cohort is in purple, middle cohort in green, and latest cohort in yellow. 181 | 'Homogenous' and 'novelty effects' DGPs have homogeneous treatment effects across cohorts, while all others have heterogeneous treatment effects across cohorts. 182 | ], 183 | ) 184 | 185 | 186 | 187 | #figure( 188 | image("../figtab/rejection_rates_F.png", width: 100%), 189 | caption: [ 190 | Rejection rates over 1000 replications for the joint test of cohort-level coefficients using an F-test in DGPs from @truefns 191 | ], 192 | ) 193 | 194 | 195 | = Conclusion 196 | 197 | The two-way fixed effects regression remains a workhorse tool in applied economics despite recent critiques highlighting its potential shortcomings under treatment effect heterogeneity. This paper provides simple diagnostic tests that help researchers determine when TWFE is likely to yield reliable estimates versus when more complex estimators are needed. Our simulation evidence shows these tests have good power to detect problematic patterns of effect heterogeneity while maintaining correct size under the null of homogeneous effects. 198 | 199 | The tests we propose are computationally simple and implemented in the pyfixest library and readily implementable in standard statistical software. Since heterogeneity-robust estimators often come with higher variance and computational complexity, the ability to test when they are truly needed helps researchers make principled choices about their estimation strategy. While these tests cannot guarantee TWFE will recover meaningful treatment effects, they provide a practical tool for detecting scenarios where the recent critiques of TWFE are most relevant. 
200 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Joint-tests for treatment-effect heterogeneity in panel data 2 | 3 | Or, do you need to run 5 different heterogeneity-robust estimators in your event study. 4 | 5 | Paper draft + replication code 6 | - first paper written entirely in typst. Excellent experience; strongly recommend. 7 | 8 | --------------------------------------------------------------------------------