├── .gitignore
├── code
    ├── 0multi_group_F.py
    ├── 1dyn_het_sims.py
    ├── 1multi_F_power.py
    ├── dgp.py
    ├── jupytext.toml
    ├── misc
    │   └── event_study_anscombe.py
    ├── plotters.py
    ├── saturated.py
    └── syncer.sh
├── figtab
    ├── hetfx.png
    ├── homfx.png
    ├── rejection_rates_F.png
    ├── rejection_rates_dyn.png
    ├── respecification_verify.png
    ├── respecification_verify_het.png
    ├── respecification_verify_hom.png
    ├── static_dynamic_effects.png
    └── true_functions.png
├── paper
    ├── appendix.typ
    ├── jmlr.typ
    ├── main.bib
    ├── main.pdf
    └── main.typ
└── readme.md


/.gitignore:
--------------------------------------------------------------------------------
 1 | **/__pycache__/*
 2 | *.csv
 3 | *.dta
 4 | *.xlsx
 5 | *.ipynb_checkpoints
 6 | .vscode/*
 7 | *.log
 8 | *.bbl
 9 | *.bcf
10 | *.fls
11 | *.bcf
12 | *.run.xml
13 | **/*_cache/*
14 | *.shp
15 | *.shx
16 | *.qpj
17 | *.dbf
18 | *.RData
19 | *.Rds
20 | *.tab
21 | *.swp
22 | *.spq
23 | *.pqt
24 | *.geojson
25 | *.gpkg
26 | *.aux
27 | *.blg
28 | *.out
29 | *.synctex.gz
30 | *.ipynb
31 | *.pkl
32 | input/bischof_wagner/replication.pdf
33 | input/bischof_wagner/replication.Rmd
34 | 


--------------------------------------------------------------------------------
/code/0multi_group_F.py:
--------------------------------------------------------------------------------
  1 | # ---
  2 | # jupyter:
  3 | #   jupytext:
  4 | #     text_representation:
  5 | #       extension: .py
  6 | #       format_name: percent
  7 | #       format_version: '1.3'
  8 | #       jupytext_version: 1.16.4
  9 | #   kernelspec:
 10 | #     display_name: py311
 11 | #     language: python
 12 | #     name: python3
 13 | # ---
 14 | 
 15 | # %%
 16 | import matplotlib.pyplot as plt
 17 | import numpy as np
 18 | 
 19 | from dgp import panel_dgp_stagg
 20 | from plotters import checkplot, diag_plot
 21 | from saturated import test_treatment_heterogeneity
 22 | 
 23 | np.random.seed(42)
 24 | # %%
# Staggered-adoption design: 30 periods, three cohorts whose treatment starts
# at periods 10, 15 and 20.
num_periods = 30
treatment_start_cohorts = [10, 15, 20]
# Units per cohort; the underscores are only digit separators (2500, 5000, 2500).
num_treated_units = [25_00, 50_00, 25_00]

# effect functions
# Cohort starting at period 15: log(2k) for k = 1..15 post-treatment periods.
treat_effect_vector_1 = np.log(
    2 * np.arange(1, num_periods - treatment_start_cohorts[1] + 1)
)
# Keep the first 8 post-treatment periods and zero out everything after them.
treat_effect_vector_1[8:] = 0
# One effect path per cohort, each as long as that cohort's post-treatment window.
base_treatment_effects = [
    # Cohort starting at period 10: linear decline from 2 to 0 over 10 periods,
    # followed by 10 periods of zero effect.
    np.r_[
        np.linspace(2, 0, num_periods - treatment_start_cohorts[0] - 10),
        np.repeat(0, 10),
    ],
    treat_effect_vector_1,
    np.sin(
        np.arange(1, num_periods - treatment_start_cohorts[2] + 1)
    ),  # Treatment effect function for cohort 2 (0-indexed; starts at period 20)
]

# Noise scales handed to the DGP: unit effects, time effects, idiosyncratic shocks.
sigma_i, sigma_t = 2, 1
sigma_epsilon = 1
# hetfx=False keeps the unit-level multipliers at 1, so the only heterogeneity
# in this panel is across cohorts (via base_treatment_effects).
dgp = panel_dgp_stagg(
    num_units=20_000,
    num_treated=num_treated_units,
    num_periods=num_periods,
    treatment_start_cohorts=treatment_start_cohorts,
    hetfx=False,
    base_treatment_effects=base_treatment_effects,
    sigma_unit=sigma_i,
    sigma_time=sigma_t,
    sigma_epsilon=sigma_epsilon,
)
# Only df is used in the cells below; Y0, Y1 and W are kept for inspection.
Y0, Y1, W, df = dgp["Y0"], dgp["Y1"], dgp["W"], dgp["dataframe"]

# %%
# checkplot comes from plotters.py (not shown here); savefig writes whatever
# figure is current after the call, so the order of these two lines matters.
checkplot(df)
plt.savefig("../figtab/respecification_verify_het.png")
# %%
# diag_plot and test_treatment_heterogeneity are defined in plotters.py and
# saturated.py; the printed value is presumably the test's p-value -- confirm.
diag_plot(df, treatment_start_cohorts, base_treatment_effects)
pv = test_treatment_heterogeneity(df)
print(pv)
plt.savefig("../figtab/hetfx.png")
 68 | 
 69 | # %%
 70 | 
 71 | # %% [markdown]
 72 | # ## homogeneous DGP
 73 | 
 74 | # %%
# Same design as the heterogeneous case above: 30 periods, cohorts starting at
# periods 10, 15 and 20 with 2500, 5000 and 2500 treated units.
num_periods = 30
treatment_start_cohorts = [10, 15, 20]
num_treated_units = [25_00, 50_00, 25_00]

# Every cohort follows the same log(k) path in periods since treatment
# (k = 1, 2, ...); only the length of the post-treatment window differs.
base_treatment_effects = [
    np.log(np.arange(1, num_periods - t + 1)) for t in treatment_start_cohorts
]

# %%

# Noise scales handed to the DGP: unit effects, time effects, idiosyncratic shocks.
sigma_i, sigma_t = 2, 1
sigma_epsilon = 1
dgp_homog = panel_dgp_stagg(
    num_units=20_000,
    num_treated=num_treated_units,
    num_periods=num_periods,
    treatment_start_cohorts=treatment_start_cohorts,
    hetfx=False,
    base_treatment_effects=base_treatment_effects,
    sigma_unit=sigma_i,
    sigma_time=sigma_t,
    sigma_epsilon=sigma_epsilon,
)
# Only df_h is used in the cells below; the other arrays are kept for inspection.
Y0_h, Y1_h, W_h, df_h = (
    dgp_homog["Y0"],
    dgp_homog["Y1"],
    dgp_homog["W"],
    dgp_homog["dataframe"],
)

# %%
# savefig writes the current pyplot figure, so it must directly follow the
# plotting call it belongs to.
diag_plot(df_h, treatment_start_cohorts, base_treatment_effects)
print(test_treatment_heterogeneity(df_h))
plt.savefig("../figtab/homfx.png")
# %%
checkplot(df_h)
plt.savefig("../figtab/respecification_verify_hom.png")
112 | 
113 | # %%
114 | 


--------------------------------------------------------------------------------
/code/1dyn_het_sims.py:
--------------------------------------------------------------------------------
  1 | # ---
  2 | # jupyter:
  3 | #   jupytext:
  4 | #     text_representation:
  5 | #       extension: .py
  6 | #       format_name: percent
  7 | #       format_version: '1.3'
  8 | #       jupytext_version: 1.16.4
  9 | #   kernelspec:
 10 | #     display_name: py311
 11 | #     language: python
 12 | #     name: python3
 13 | # ---
 14 | 
 15 | # %%
 16 | import pickle
 17 | 
 18 | import numpy as np
 19 | import pandas as pd
 20 | import pyfixest as pf
 21 | 
 22 | import matplotlib.pyplot as plt
 23 | from lets_plot import *
 24 | LetsPlot.setup_html()
 25 | 
 26 | 
 27 | # %% [markdown]
 28 | # ## regression estimators for effect dynamics
 29 | 
 30 | # %%
 31 | def diff_in_means(df, T, T0):
 32 |     cross_sec_df = (
 33 |         df.query(f"time >= {T0}").groupby("unit").agg({"Y": "mean", "W": "max"})
 34 |     )
 35 |     estimate = pf.feols("Y~W", cross_sec_df).tidy().loc["W", "Estimate"]
 36 |     return np.repeat(estimate, T - T0)
 37 | 
 38 | 
 39 | def cuped(df, T, T0):
 40 |     cross_sec_df = (
 41 |         df.query(f"time >= {T0}").groupby("unit").agg({"Y": "mean", "W": "max"})
 42 |     )
 43 |     cross_sec_df = cross_sec_df.merge(
 44 |         df.query(f"time < {T0}").groupby("unit").Y.mean().rename("ldv"),
 45 |         left_index=True,
 46 |         right_index=True,
 47 |     )
 48 |     estimate = pf.feols("Y~W+ldv", cross_sec_df).tidy().loc["W", "Estimate"]
 49 |     return np.repeat(estimate, T - T0)
 50 | 
 51 | 
 52 | def twfe(df, T, T0):
 53 |     m = pf.feols("Y~W | unit + time", df).tidy()
 54 |     estimate = m.loc["W", "Estimate"]
 55 |     return np.repeat(estimate, T - T0)
 56 | 
 57 | 
 58 | def panel_diff_in_means(df, T, T0):
 59 |     mean_outcomes = df.groupby(["W", "time"])["Y"].mean().unstack()
 60 |     diff_means = mean_outcomes.iloc[1, :] - mean_outcomes.iloc[0, :]
 61 |     return diff_means[diff_means.notna()].values
 62 | 
 63 | 
 64 | def event_study(df, T, T0):
 65 |     df["ever_treated"] = df.groupby("unit")["W"].transform("max")
 66 |     m = pf.feols(f"Y ~ i(time, ever_treated, ref = {T0-1}) | unit + time", df)
 67 |     return m.coef()[(T0 - 1) :].values
 68 | 
 69 | 
 70 | # %%
 71 | def sim_panel(
 72 |     base_effect,
 73 |     N=1_000_000,
 74 |     T=35,
 75 |     T0=15,
 76 |     sigma_list=[5, 2, 0.01, 2],
 77 |     hetfx=False,
 78 |     num_treated=None,
 79 |     rho=0.7,
 80 |     seed=42,
 81 |     debug=False,
 82 | ):
 83 |     np.random.seed(seed)
 84 |     sigma_unit, sigma_time, sigma_tt, sigma_e = sigma_list
 85 |     # Generate data
 86 |     unit_ids = np.repeat(np.arange(N), T)
 87 |     time_ids = np.tile(np.arange(T), N)
 88 |     # Generate unit-specific intercepts and time trends
 89 |     unit_fe = np.random.normal(0, sigma_unit, N)
 90 |     time_fe = np.random.normal(0, sigma_time, T)
 91 |     unit_tt = np.random.normal(0, sigma_tt, N)
 92 |     # Generate treatment indicator
 93 |     if num_treated is None:
 94 |         W = np.random.binomial(1, 0.5, N)
 95 |     else:
 96 |         treated_units = np.random.choice(N, num_treated, replace=False)
 97 |         W = np.zeros(N)
 98 |         W[treated_units] = 1
 99 |     W = np.repeat(W, T)
100 |     W = W * (time_ids >= T0)
101 |     # Generate treatment effect
102 |     if hetfx:
103 |         unit_effects = np.random.uniform(0.5, 1.5, N)
104 |     else:
105 |         unit_effects = np.ones(N)
106 |     treatment_effect = np.outer(unit_effects, base_effect)
107 |     # Generate serially correlated residuals
108 |     residuals = np.zeros((N, T))
109 |     residuals[:, 0] = np.random.normal(0, sigma_e, N)
110 |     epsilon = np.random.normal(0, 1, (N, T - 1))
111 |     factor = sigma_e * np.sqrt(1 - rho**2)
112 |     for t in range(1, T):
113 |         residuals[:, t] = rho * residuals[:, t - 1] + factor * epsilon[:, t - 1]
114 |     # Generate outcome
115 |     Y = (
116 |         np.repeat(unit_fe, T)
117 |         + np.repeat(unit_tt, T) * time_ids
118 |         + treatment_effect.flatten() * W
119 |         + np.tile(time_fe, N)
120 |         + residuals.flatten()
121 |     )
122 | 
123 |     # Create DataFrame
124 |     df = pd.DataFrame({"unit": unit_ids, "time": time_ids, "Y": Y, "W": W})
125 |     if debug:
126 |         return Y, W, treatment_effect, df
127 |     return df
128 | 
129 | 
130 | # %%
def generate_treatment_effect(effect_type, T, T0, max_effect=1, rng=None):
    """Build a length-``T`` treatment-effect path that is zero before ``T0``.

    Parameters
    ----------
    effect_type : str
        One of "constant", "linear", "concave", "positive_then_negative",
        "exponential", "sinusoidal" or "random_walk".
    T : int
        Total number of periods.
    T0 : int
        First treated period; entries ``[0, T0)`` of the result are zero.
    max_effect : float, default 1
        Scale applied to the post-treatment path.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness for "random_walk". When omitted, NumPy's global
        RNG is used, so the path then depends on the global RNG state at the
        time of the call.

    Returns
    -------
    numpy.ndarray of shape ``(T,)``

    Raises
    ------
    ValueError
        If ``effect_type`` is not one of the names listed above.
    """
    n_post = T - T0

    if effect_type == "constant":
        post = np.full(n_post, max_effect)
    elif effect_type == "linear":
        post = np.linspace(0, max_effect, n_post)
    elif effect_type == "concave":
        post = max_effect * 0.5 * np.log(2 * np.arange(1, n_post + 1) / n_post + 1)
    elif effect_type == "positive_then_negative":
        # Ramp up to max_effect over the first half, then down to -max_effect
        half_point = n_post // 2
        post = np.concatenate(
            [
                np.linspace(0, max_effect, half_point),
                np.linspace(max_effect, -max_effect, n_post - half_point),
            ]
        )
    elif effect_type == "exponential":
        post = max_effect * (1 - np.exp(-np.linspace(0, 5, n_post)))
    elif effect_type == "sinusoidal":
        # One full sine period across the post-treatment window
        post = max_effect * np.sin(np.linspace(0, 2 * np.pi, n_post))
    elif effect_type == "random_walk":
        if rng is None:
            steps = np.random.randn(n_post)
        else:
            steps = rng.standard_normal(n_post)
        post = max_effect * np.cumsum(steps)
    else:
        raise ValueError(f"Unknown effect type: {effect_type!r}")

    return np.concatenate([np.zeros(T0), post])
175 | 
176 | 
177 | # %%
178 | # Define the simulation engine
def simulation_engine(
    effect_type, T, T0, max_effect, N, num_treated, sigma_list, hetfx, rho, seed
):
    """Simulate one panel and run the static and dynamic estimators on it.

    Returns a dict with the true post-treatment effect path under
    ``"true_effect"`` and one entry per estimator, keyed by the estimator's
    function name (``diff_in_means``, ``twfe``, ``event_study``).
    """
    effect_vector = generate_treatment_effect(effect_type, T, T0, max_effect)
    panel = sim_panel(
        effect_vector,
        N=N,
        T=T,
        T0=T0,
        sigma_list=sigma_list,
        hetfx=hetfx,
        num_treated=num_treated,
        rho=rho,
        seed=seed,
    )
    # Only the post-treatment part of the true path is comparable to estimates
    estimates = {"true_effect": effect_vector[T0:]}
    for estimator in (diff_in_means, twfe, event_study):
        estimates[estimator.__name__] = estimator(panel, T, T0)
    return estimates
204 | 
205 | 
206 | # %%
effect_types = [
    "constant",
    "linear",
    "concave",
    "positive_then_negative",
    "exponential",
    "sinusoidal",
    "random_walk",
]
T, T0 = 35, 15
max_effect = 1
# One table of true vs. estimated effect paths per effect shape
ests = {}
for effect_type in effect_types:
    est = simulation_engine(
        effect_type=effect_type,
        T=T,
        T0=T0,
        max_effect=max_effect,
        N=50_000,
        num_treated=25_000,
        sigma_list=[5, 2, 0.01, 2],
        hetfx=False,
        rho=0.7,
        seed=42,
    )
    ests[effect_type] = pd.DataFrame.from_dict(est)
# 2 x 4 grid: seven effect shapes, the eighth panel is blanked out below
f, ax = plt.subplots(2, 4, figsize=(16, 8), sharey=False, sharex=True)
for i, (k, v) in enumerate(ests.items()):
    panel = ax[i // 4, i % 4]
    v[["true_effect"]].plot(ax=panel, marker=".", ms=6, alpha=1, legend=False)
    v.drop(["true_effect"], axis=1).plot(
        ax=panel, marker=".", alpha=0.7, legend=False
    )
    panel.set_title(k)
    panel.axhline(0, color="black", linestyle="--")
    # red dashed line: average of the true effect over the post period
    panel.axhline(v.true_effect.mean(), color="red", linestyle="--")
ax[0, 0].legend()
# destroy empty axis
ax[1, 3].axis("off")
f.tight_layout()
# f.suptitle("Static and Dynamic Effects")
f.savefig("../figtab/static_dynamic_effects.png")
241 | 
242 | 
243 | # %% [markdown]
244 | # ## F test
245 | 
246 | # %%
def f_test_stability(df, T0, vcv={"CRV1": "unit"}, dgp_type="", return_plot=True):
    """Test whether the treatment effect is constant over post-treatment time.

    Two models with unit and time fixed effects are fitted: a restricted one
    with a single treatment coefficient, and an unrestricted one with one
    coefficient per period relative to treatment. A Wald (chi-squared) test
    then checks that every relative-time coefficient equals the restricted
    estimate, which enters the test as a fixed constant.

    Parameters
    ----------
    df : pandas.DataFrame
        Panel with columns ``unit``, ``time``, ``Y`` and ``W``. It is not
        modified; the ``rel_time`` column is added to a copy.
    T0 : int
        First treated period.
    vcv : dict
        Variance estimator passed to the unrestricted model as ``vcov``. The
        default dict is never modified here.
    dgp_type : str
        Plot title.
    return_plot : bool
        If False, return only the p-value.

    Returns
    -------
    The Wald-test p-value, or a coefficient plot of both models with the
    p-value in the subtitle.
    """
    # Periods since treatment (1, 2, ...) on treated rows, 0 everywhere else
    rel_time = (df["time"] - (T0) + 1).where(df["W"] == 1, 0)
    panel = df.assign(rel_time=rel_time)

    restricted = pf.feols("Y ~ i(W) | unit + time", panel)
    unrestricted = pf.feols("Y ~ i(rel_time, ref=0) | unit + time", panel, vcov=vcv)
    # Get the restricted estimate
    restricted_effect = restricted.coef().iloc[0]
    # R is the identity: each row tests one event-study coefficient against
    # the restricted estimate, which is repeated in q
    n_evstudy_coefs = unrestricted.coef().shape[0]
    R = np.eye(n_evstudy_coefs)
    q = np.repeat(restricted_effect, n_evstudy_coefs)
    # Conduct Wald test
    pv = unrestricted.wald_test(R=R, q=q, distribution="chi2")["pvalue"]
    if not return_plot:
        return pv
    plotout = pf.iplot(
        [restricted, unrestricted], coord_flip=False, figsize=(900, 400)
    ) + labs(
        title=f"{dgp_type}",
        subtitle=f"Stability Test p-value ={pv:.3f}",
        x="",
        y="",
    )
    return plotout
275 | 
276 | 
277 | # %%
effect_types = [
    "constant",
    "linear",
    "concave",
    "positive_then_negative",
    "exponential",
    "sinusoidal",
    "random_walk",
]
# Simulation settings shared by every effect shape in this cell
N, num_treated = 50_000, 25_000
sigma_list = [5, 2, 0.01, 2]
hetfx, rho, seed = False, 0.7, 42
T, T0 = 35, 15
max_effect = 0.1

# One simulated panel and one stability-test plot per effect shape
for effect_type in effect_types:
    effect_vector = generate_treatment_effect(effect_type, T, T0, max_effect)
    df = sim_panel(
        effect_vector,
        N=N,
        T=T,
        T0=T0,
        sigma_list=sigma_list,
        hetfx=hetfx,
        num_treated=num_treated,
        rho=rho,
        seed=seed,
    )
    stability_plot = f_test_stability(df, T0, dgp_type=effect_type)
    stability_plot.show()
313 | 
314 | # %% [markdown]
315 | # ## simulations
316 | 
317 | # %%
318 | from joblib import Parallel, delayed
319 | 
320 | 
def run_single_simulation(
    effect_type, T, T0, max_effect, N, num_treated, sigma_list, hetfx, rho, seed
):
    """Simulate one panel and return the stability-test p-value.

    NOTE(review): the effect path is generated before ``sim_panel`` seeds the
    RNG, so for ``effect_type="random_walk"`` the path is not controlled by
    ``seed``.
    """
    true_path = generate_treatment_effect(effect_type, T, T0, max_effect)
    panel = sim_panel(
        true_path,
        N=N,
        T=T,
        T0=T0,
        sigma_list=sigma_list,
        hetfx=hetfx,
        num_treated=num_treated,
        rho=rho,
        seed=seed,
    )
    # p-value only; no plot is built inside the simulation loop
    return f_test_stability(panel, T0, return_plot=False)
342 | 
343 | 
344 | # %%
def compute_power(n_sims=1000, n_jobs=-1):
    """Monte Carlo rejection rates of the stability test, by effect shape.

    For each effect shape, ``n_sims`` panels are simulated in parallel
    (simulation ``i`` uses seed ``i``) and the share of p-values below 0.05 is
    recorded.

    Returns
    -------
    dict
        ``{effect_type: {"power": rejection rate, "p_values": list}}``.
    """
    effect_types = [
        "constant",
        "linear",
        "concave",
        "positive_then_negative",
        "exponential",
        "sinusoidal",
        "random_walk",
    ]
    base_params = dict(
        N=50_000,
        num_treated=25_000,
        sigma_list=[5, 2, 0.01, 2],
        hetfx=False,
        rho=0.7,
        T=35,
        T0=15,
        max_effect=0.1,
    )

    results = {}
    for effect_type in effect_types:
        tasks = (
            delayed(run_single_simulation)(
                effect_type=effect_type, seed=i, **base_params
            )
            for i in range(n_sims)
        )
        p_values = Parallel(n_jobs=n_jobs)(tasks)
        # Rejection rate at the 5% level
        rejected = np.array(p_values) < 0.05
        results[effect_type] = {"power": np.mean(rejected), "p_values": p_values}
    return results
382 | 
383 | 
384 | # %%
# %%time
# Run simulations
results = compute_power(n_sims=1000, n_jobs=8)

# %%
import os

# ../tmp holds only git-ignored artifacts, so it does not exist in a fresh
# clone; create it before writing so the long simulation run is not lost.
os.makedirs("../tmp", exist_ok=True)
with open("../tmp/results_dyn.pkl", "wb") as f:
    pickle.dump(results, f)

# %%
# Reload the results written by the cell above (a locally produced file) and
# reshape them into one row per DGP.
with open("../tmp/results_dyn.pkl", "rb") as f:
    results = pickle.load(f)
results = pd.DataFrame(results).T.reset_index()
results.rename(columns={"power": "rejection_rate", "index": "dgp"}, inplace=True)
results
399 | 
400 | # %%
# Two panels sharing the y-axis: rejection rates (left), p-value boxplots (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5), sharey=True)

# Left: rejection rate per DGP, with the nominal 5% level as a reference line
results.plot(kind="bar", x="dgp", y="rejection_rate", ax=ax1)
ax1.xaxis.set_tick_params(rotation=45)
ax1.set(title="Rejection Rate by DGP", ylabel="rejection rate", xlabel="")
ax1.axhline(0.05, color="r", linestyle="--", label="α=0.05")

# Right: distribution of p-values per DGP
pvalue_samples = list(results["p_values"])
ax2.boxplot(pvalue_samples, labels=results["dgp"])
ax2.set_title("P-value Distributions")
ax2.axhline(0.05, color="r", linestyle="--")
ax2.xaxis.set_tick_params(rotation=45)
plt.tight_layout()
plt.savefig("../figtab/rejection_rates_dyn.png")
419 | 


--------------------------------------------------------------------------------
/code/1multi_F_power.py:
--------------------------------------------------------------------------------
  1 | # %%
  2 | import contextlib
  3 | import io
  4 | import warnings
  5 | 
  6 | import matplotlib.pyplot as plt
  7 | import numpy as np
  8 | import pandas as pd
  9 | from joblib import Parallel, delayed
 10 | from tqdm import tqdm
 11 | 
 12 | np.random.seed(42)
 13 | # %%
 14 | from dgp import panel_dgp_stagg
 15 | from saturated import test_treatment_heterogeneity
 16 | 
 17 | # %%
 18 | 
 19 | #                                   ▄▄▄▄      ██
 20 | #                                  ██▀▀▀      ▀▀
 21 | #   ▄█████▄   ▄████▄   ██▄████▄  ███████    ████      ▄███▄██  ▄▄█████▄
 22 | #  ██▀    ▀  ██▀  ▀██  ██▀   ██    ██         ██     ██▀  ▀██  ██▄▄▄▄ ▀
 23 | #  ██        ██    ██  ██    ██    ██         ██     ██    ██   ▀▀▀▀██▄
 24 | #  ▀██▄▄▄▄█  ▀██▄▄██▀  ██    ██    ██      ▄▄▄██▄▄▄  ▀██▄▄███  █▄▄▄▄▄██
 25 | #    ▀▀▀▀▀     ▀▀▀▀    ▀▀    ▀▀    ▀▀      ▀▀▀▀▀▀▀▀   ▄▀▀▀ ██   ▀▀▀▀▀▀
 26 | #                                                     ▀████▀▀
 27 | 
 28 | 
# Staggered-adoption design shared by every configuration below: 30 periods,
# cohorts starting treatment at periods 10, 15 and 20 with 2500, 5000 and 2500
# treated units.
num_periods = 30
treatment_start_cohorts = [10, 15, 20]
num_treated_units = [25_00, 50_00, 25_00]


# Each config has a display name and a "base_treatment_effects" callable that
# returns one effect path per cohort. The callables are invoked with the list
# of cohort start periods, but the argument is not used: the comprehension
# variable shadows it and the paths are built from the module-level
# num_periods and treatment_start_cohorts, read at call time.
configs = [
    {
        "name": "homogeneous",  # same log path for every cohort
        "base_treatment_effects": lambda t: [
            np.log(np.arange(1, num_periods - t + 1)) for t in treatment_start_cohorts
        ],
    },
    {
        "name": "log_vs_linear_vs_sin",  # original heterogeneous case
        # Paths are listed in cohort order: linear decline (then zeros) for the
        # first cohort, log for the second, sine for the third.
        "base_treatment_effects": lambda _: [
            np.r_[
                np.linspace(2, 0, num_periods - treatment_start_cohorts[0] - 10),
                np.repeat(0, 10),
            ],
            np.log(2 * np.arange(1, num_periods - treatment_start_cohorts[1] + 1)),
            np.sin(np.arange(1, num_periods - treatment_start_cohorts[2] + 1)),
        ],
    },
    {
        "name": "small_differences",  # log path scaled by 1.0, 1.1, 1.2 across cohorts
        "base_treatment_effects": lambda t: [
            np.log(np.arange(1, num_periods - t + 1)) * (1 + i * 0.1)
            for i, t in enumerate(treatment_start_cohorts)
        ],
    },
    {
        "name": "large_differences",  # log path scaled by start period / 10 (1.0, 1.5, 2.0)
        "base_treatment_effects": lambda t: [
            np.log(np.arange(1, num_periods - t + 1)) * (t / 10)
            for t in treatment_start_cohorts
        ],
    },
    {
        "name": "selection_on_gains",  # later cohorts gain less: scale 1.0, 0.9, 0.8
        "base_treatment_effects": lambda t: [
            np.log(np.arange(1, num_periods - t + 1)) * (1 - i * 0.1)
            for i, t in enumerate(treatment_start_cohorts)
        ],
    },
]

additional_configs = [
    {
        "name": "novelty_effects",  # same decaying path for every cohort
        "base_treatment_effects": lambda t: [
            2 * np.exp(-0.3 * np.arange(num_periods - t)) + 0.5  # Sharp decay to 0.5
            for t in treatment_start_cohorts
        ],
    },
    {
        "name": "activity_bias",  # First cohort different from rest
        "base_treatment_effects": lambda t: [
            # First cohort has strong persistent effects
            (
                2.5 * np.ones(num_periods - treatment_start_cohorts[0])
                if i == 0
                # Other cohorts have standard log pattern
                else np.log(np.arange(1, num_periods - t + 1))
            )
            for i, t in enumerate(treatment_start_cohorts)
        ],
    },
]

# The power analysis and the true-function figure run over all seven configs.
configs.extend(additional_configs)
 99 | 
100 | 
101 | # %%
def plot_true_functions(
    treatment_start_cohorts,
    base_treatment_effects,
    title,
    ax,
):
    """Draw each cohort's true effect path against time relative to treatment.

    Parameters
    ----------
    treatment_start_cohorts : sequence of int
        Treatment start period of each cohort.
    base_treatment_effects : sequence of array-like
        Post-treatment effect path of each cohort, in the same order.
    title : str
        Axes title.
    ax : matplotlib.axes.Axes
        Axes to draw on.

    Every path is left-padded with as many zeros as the last cohort's start
    period, so all cohorts show the same number of pre-treatment points. The
    x-axis has no ``-1`` tick value: padding sits at ``..., -3, -2`` and the
    effects start at ``0``.
    """
    pad_len = treatment_start_cohorts[-1]
    true_fns = {}
    for c, s in enumerate(treatment_start_cohorts):
        effect_vector_padded = np.pad(base_treatment_effects[c], (pad_len, 0))

        # Create x-axis values that skip -1
        x_values = np.arange(len(effect_vector_padded))
        x_values = np.where(
            x_values >= pad_len,
            x_values - pad_len,
            x_values - pad_len - 1,
        )
        true_fns[f"cohort_{s}"] = pd.Series(dict(zip(x_values, effect_vector_padded)))

    # One viridis colour per cohort, in cohort order
    cmp = plt.get_cmap("viridis", len(true_fns))
    for i, path in enumerate(true_fns.values()):
        ax.plot(path, color=cmp(i), marker=".")
    ax.axvline(-1, color="black", linestyle="--")
    ax.axhline(0, color="black", linestyle=":")
    ax.set_title(title)
137 | 
138 | 
139 | # %%
# Grid with three rows and as many columns as needed for all configs
f, ax = plt.subplots(
    3, int(np.ceil(len(configs) / 3)), figsize=(10, 7), sharex=True, sharey=True
)
ax = ax.flatten()
for i, config in enumerate(configs):
    plot_true_functions(
        treatment_start_cohorts,
        config["base_treatment_effects"](treatment_start_cohorts),
        config["name"],
        ax[i],
    )
# Remove exactly the panels that were not used, whatever the number of configs
for unused_ax in ax[len(configs) :]:
    f.delaxes(unused_ax)
f.tight_layout()
f.savefig("../figtab/true_functions.png")
156 | # %%
157 | 
158 | #  ██▄███▄    ▄████▄  ██      ██  ▄████▄    ██▄████
159 | #  ██▀  ▀██  ██▀  ▀██ ▀█  ██  █▀ ██▄▄▄▄██   ██▀
160 | #  ██    ██  ██    ██  ██▄██▄██  ██▀▀▀▀▀▀   ██
161 | #  ███▄▄██▀  ▀██▄▄██▀  ▀██  ██▀  ▀██▄▄▄▄█   ██
162 | #  ██ ▀▀▀      ▀▀▀▀     ▀▀  ▀▀     ▀▀▀▀▀    ▀▀
163 | #  ██
164 | 
165 | 
166 | # %%
@contextlib.contextmanager
def suppress_stdout():
    """Redirect ``sys.stdout`` into an in-memory buffer inside the block.

    Yields the ``io.StringIO`` buffer, so whatever was printed can still be
    read back with ``getvalue()``.
    """
    buffer = io.StringIO()
    redirect = contextlib.redirect_stdout(buffer)
    with redirect:
        yield buffer
172 | 
173 | 
def single_simulation(
    config, treatment_start_cohorts, num_periods, num_treated_units, seed=42
):
    """Simulate one staggered panel for ``config`` and run the heterogeneity test.

    Printed output and warnings raised while simulating and testing are
    silenced. Returns whatever ``test_treatment_heterogeneity`` returns for
    the simulated data (the callers treat it as a p-value).
    """
    with suppress_stdout(), warnings.catch_warnings():
        warnings.simplefilter("ignore")
        cohort_effects = config["base_treatment_effects"](treatment_start_cohorts)
        simulated = panel_dgp_stagg(
            num_units=20_000,
            num_treated=num_treated_units,
            num_periods=num_periods,
            treatment_start_cohorts=treatment_start_cohorts,
            hetfx=False,
            base_treatment_effects=cohort_effects,
            sigma_unit=2,
            sigma_time=1,
            sigma_epsilon=1,
            seed=seed,
        )
        panel = simulated["dataframe"]
        return test_treatment_heterogeneity(panel)
196 | 
197 | 
198 | # %%
199 | single_simulation(configs[0], treatment_start_cohorts, num_periods, num_treated_units)
200 | 
201 | # %%
202 | 
203 | 
def power_analysis(
    n_sims=1000,
    dgp_configs=configs,
    alpha=0.05,
    n_jobs=-1,
):
    """Monte Carlo rejection rates of the heterogeneity test, one row per DGP.

    For every config, ``n_sims`` panels are simulated in parallel (simulation
    ``i`` uses seed ``i``) with a progress bar over the submitted tasks. The
    design (cohort starts, periods, treated counts) is read from module level.

    Returns
    -------
    pandas.DataFrame
        Columns ``dgp``, ``rejection_rate`` (share of p-values below
        ``alpha``) and ``pvalues`` (the raw list).
    """
    rows = []
    for config in dgp_configs:
        progress = tqdm(range(n_sims), desc=f"Running {config['name']}")
        tasks = (
            delayed(single_simulation)(
                config, treatment_start_cohorts, num_periods, num_treated_units, seed=i
            )
            for i in progress
        )
        pvalues = Parallel(n_jobs=n_jobs)(tasks)
        rejected = np.array(pvalues) < alpha
        rows.append(
            {
                "dgp": config["name"],
                "rejection_rate": np.mean(rejected),
                "pvalues": pvalues,
            }
        )
    return pd.DataFrame(rows)
230 | 
231 | 
# %% # Run power analysis
results = power_analysis(n_sims=1000, dgp_configs=configs, n_jobs=-1)
results
# %%
import os

# ../tmp holds only git-ignored artifacts, so it does not exist in a fresh
# clone; create it before writing so the long simulation run is not lost.
os.makedirs("../tmp", exist_ok=True)
results.to_pickle("../tmp/rejection_rates_F.pkl")
# %%
# Reload the locally produced results; pickles saved under the old config name
# "timing_dependent" are relabelled to the current "large_differences".
results = pd.read_pickle("../tmp/rejection_rates_F.pkl")
results.loc[results.dgp == "timing_dependent", "dgp"] = "large_differences"
240 | # %%
# Two panels sharing the y-axis: rejection rates (left), p-value boxplots (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5), sharey=True)

# Left: rejection rate per DGP, with the nominal 5% level as a reference line
results.plot(kind="bar", x="dgp", y="rejection_rate", ax=ax1)
ax1.set(title="Rejection Rate by DGP", ylabel="rejection rate")
plt.setp(ax1.get_xticklabels(), rotation=45, ha="right")
ax1.get_legend().remove()
ax1.axhline(0.05, color="r", linestyle="--", label="α=0.05")

# Right: distribution of p-values per DGP
pvalue_samples = list(results["pvalues"])
ax2.boxplot(pvalue_samples, labels=results["dgp"])
ax2.set_title("P-value Distributions")
plt.setp(ax2.get_xticklabels(), rotation=45, ha="right")
# ax2.set_yscale("log")
ax2.axhline(0.05, color="r", linestyle="--")

plt.tight_layout()
plt.savefig("../figtab/rejection_rates_F.png")
261 | # %%
262 | 


--------------------------------------------------------------------------------
/code/dgp.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | 
  4 | 
  5 | def panel_dgp_stagg(
  6 |     num_units=100,
  7 |     num_periods=30,
  8 |     num_treated=[50],
  9 |     treatment_start_cohorts=[15],
 10 |     sigma_unit=1,
 11 |     sigma_time=0.5,
 12 |     sigma_epsilon=0.2,
 13 |     hetfx=False,
 14 |     base_treatment_effects=[0.1 * np.log(np.arange(1, 30 - 15 + 1))],
 15 |     return_dataframe=True,
 16 |     ar_coef=0.8,
 17 |     seed = 42,
 18 | ):
 19 |     np.random.seed(seed)
 20 |     # unit FEs
 21 |     unit_intercepts = np.random.normal(0, sigma_unit, num_units)
 22 |     ####################################################################
 23 |     # time FEs: Generate day-of-the-week pattern
 24 |     day_effects = np.array(
 25 |         [-0.1, 0.1, 0, 0, 0.1, 0.5, 0.5]
 26 |     )  # Stronger effects on weekends
 27 |     day_pattern = np.tile(day_effects, num_periods // 7 + 1)[:num_periods]
 28 |     # autoregressive structure in time FEs
 29 |     ar_coef_time = 0.2
 30 |     ar_noise_time = np.random.normal(0, sigma_time, num_periods)
 31 |     time_intercepts = np.zeros(num_periods)
 32 |     time_intercepts[0] = ar_noise_time[0]
 33 |     for t in range(1, num_periods):
 34 |         time_intercepts[t] = ar_coef_time * time_intercepts[t - 1] + ar_noise_time[t]
 35 |     # Combine day-of-the-week pattern and autoregressive structure
 36 |     time_intercepts = day_pattern + time_intercepts - np.mean(time_intercepts)
 37 |     ####################################################################
 38 |     # Generate autoregressive noise for each unit
 39 |     ar_noise = np.random.normal(0, sigma_epsilon, (num_units, num_periods))
 40 |     noise = np.zeros((num_units, num_periods))
 41 |     noise[:, 0] = ar_noise[:, 0]
 42 |     for t in range(1, num_periods):
 43 |         noise[:, t] = ar_coef * noise[:, t - 1] + ar_noise[:, t]
 44 |     # N X T matrix of potential outcomes under control
 45 |     Y0 = unit_intercepts[:, np.newaxis] + time_intercepts[np.newaxis, :] + noise
 46 |     ####################################################################
 47 |     # Generate heterogeneous multipliers for each unit
 48 |     if hetfx:
 49 |         heterogeneous_multipliers = np.random.uniform(0.5, 1.5, num_units)
 50 |     else:
 51 |         heterogeneous_multipliers = np.ones(num_units)
 52 |     # random assignment
 53 |     treated_units = np.array([], dtype=int)
 54 |     treatment_status = np.zeros((num_units, num_periods), dtype=bool)
 55 |     ####################################################################
 56 |     # Create a 2D array to store the heterogeneous treatment effects
 57 |     treatment_effect = np.zeros((num_units, num_periods))
 58 |     # iterate over treatment cohorts
 59 |     for cohort_idx, (treatment_start, num_treated_cohort) in enumerate(
 60 |         zip(treatment_start_cohorts, num_treated)
 61 |     ):
 62 |         base_treatment_effect = base_treatment_effects[cohort_idx]
 63 |         cohort_treatment_effect = np.zeros((num_units, num_periods - treatment_start))
 64 | 
 65 |         for i in range(num_units):
 66 |             cohort_treatment_effect[i, :] = (
 67 |                 heterogeneous_multipliers[i] * base_treatment_effect
 68 |             )
 69 |         cohort_treated_units = np.random.choice(
 70 |             np.setdiff1d(np.arange(num_units), treated_units),
 71 |             num_treated_cohort,
 72 |             replace=False,
 73 |         )
 74 |         treated_units = np.concatenate((treated_units, cohort_treated_units))
 75 |         treatment_status[cohort_treated_units, treatment_start:] = True
 76 |         treatment_effect[
 77 |             cohort_treated_units, treatment_start:
 78 |         ] += cohort_treatment_effect[cohort_treated_units, :]
 79 | 
 80 |     # Apply the heterogeneous treatment effect to the treated units
 81 |     Y1 = Y0.copy()
 82 |     Y1[treatment_status] += treatment_effect[treatment_status]
 83 |     ####################################################################
 84 |     result = {
 85 |         "Y1": Y1,
 86 |         "Y0": Y0,
 87 |         "W": treatment_status,
 88 |         "unit_intercepts": unit_intercepts,
 89 |         "time_intercepts": time_intercepts,
 90 |     }
 91 | 
 92 |     if return_dataframe:
 93 |         # Create a DataFrame
 94 |         unit_ids = np.repeat(np.arange(num_units), num_periods)
 95 |         time_ids = np.tile(np.arange(num_periods), num_units)
 96 |         W_it = treatment_status.flatten().astype(int)
 97 |         Y_it = np.where(W_it, Y1.flatten(), Y0.flatten())
 98 |         unit_intercepts_flat = np.repeat(unit_intercepts, num_periods)
 99 |         time_intercepts_flat = np.tile(time_intercepts, num_units)
100 |         df = pd.DataFrame(
101 |             {
102 |                 "unit_id": unit_ids,
103 |                 "time_id": time_ids,
104 |                 "W_it": W_it,
105 |                 "Y_it": Y_it,
106 |                 "unit_intercept": unit_intercepts_flat,
107 |                 "time_intercept": time_intercepts_flat,
108 |             }
109 |         )
110 |         result["dataframe"] = df
111 |     return result
112 | 
113 | 
def generate_treatment_effect(effect_type, T, T0, max_effect=1):
    """Build the post-treatment effect path for a single adoption cohort.

    Parameters
    ----------
    effect_type : str
        Shape of the path. One of ``"constant"``, ``"linear"``,
        ``"concave"``, ``"positive_then_negative"``, ``"exponential"``,
        ``"sinusoidal"`` or ``"random_walk"``.
    T : int
        Total number of periods.
    T0 : int
        First treated period; the path has ``T - T0`` entries.
    max_effect : float, default 1
        Scale applied to the path.

    Returns
    -------
    numpy.ndarray
        One effect per post-treatment period (length ``T - T0``).

    Raises
    ------
    ValueError
        If ``effect_type`` is not one of the names listed above.
    """
    n_post = T - T0

    if effect_type == "constant":
        return np.full(n_post, max_effect)

    if effect_type == "linear":
        return np.linspace(0, max_effect, n_post)

    if effect_type == "concave":
        # log(2 * k / n + 1) for k = 1..n: grows quickly, then flattens.
        event_periods = np.arange(1, n_post + 1)
        return max_effect * np.log(2 * event_periods / n_post + 1)

    if effect_type == "positive_then_negative":
        # Ramp up over the first half, then slide down to -max_effect.
        n_rising = n_post // 2
        rising = np.linspace(0, max_effect, n_rising)
        falling = np.linspace(max_effect, -max_effect, n_post - n_rising)
        return np.concatenate([rising, falling])

    if effect_type == "exponential":
        grid = np.linspace(0, 5, n_post)
        return max_effect * (1 - np.exp(-grid))

    if effect_type == "sinusoidal":
        angles = np.linspace(0, 2 * np.pi, n_post)
        return max_effect * np.sin(angles)

    if effect_type == "random_walk":
        # Draws from the global NumPy RNG, so results depend on the seed.
        steps = np.random.randn(n_post)
        return max_effect * np.cumsum(steps)

    raise ValueError("Unknown effect type")
137 | 


--------------------------------------------------------------------------------
/code/jupytext.toml:
--------------------------------------------------------------------------------
1 | # Pair ipynb notebooks to py:percent text notebooks
2 | formats = "ipynb,py:percent"
3 | 


--------------------------------------------------------------------------------
/code/misc/event_study_anscombe.py:
--------------------------------------------------------------------------------
  1 | # ---
  2 | # jupyter:
  3 | #   jupytext:
  4 | #     text_representation:
  5 | #       extension: .py
  6 | #       format_name: percent
  7 | #       format_version: '1.3'
  8 | #       jupytext_version: 1.16.4
  9 | #   kernelspec:
 10 | #     display_name: py311
 11 | #     language: python
 12 | #     name: python3
 13 | # ---
 14 | 
 15 | # %% [markdown] id="cdis85xCuhbr"
 16 | # # Anscombe's quartet for longitudinal experiments
 17 | 
 18 | # %% id="hL511B-3olXp"
 19 | import numpy as np
 20 | import pandas as pd
 21 | import matplotlib.pyplot as plt
 22 | import plotnine as p9
 23 | import pyfixest as pf
 24 | 
 25 | # %matplotlib inline
 26 | 
 27 | # %% id="I33hyIWRosou"
 28 | def generate_panel_data(
 29 |     N,
 30 |     T,
 31 |     K,
 32 |     unit_fac_lb=-0.2,
 33 |     unit_fac_ub=0.2,
 34 |     time_fac_lb=-0.1,
 35 |     time_fac_ub=0.1,
 36 |     sigma=0.1,
 37 |     trend_sigma=0.01,
 38 | ):
 39 |     F, L = (
 40 |         np.random.uniform(time_fac_lb, time_fac_ub, (T, K)),
 41 |         np.random.uniform(unit_fac_lb, unit_fac_ub, (N, K)),
 42 |     )
 43 |     time_trends = np.random.normal(0, trend_sigma, (N, 1)) * np.arange(T).reshape(1, T)
 44 |     epsilon = np.random.normal(0, sigma, (N, T))
 45 |     Y = np.dot(L, F.T) + epsilon + time_trends
 46 |     return Y, L
 47 | 
 48 | 
 49 | # %%
 50 | def generate_quartet(N=1000, T=20, T0=10, T1=15, K=3, **kwargs):
 51 |     np.random.seed(42)
 52 | 
 53 |     # Generate baseline panel data
 54 |     Y, _ = generate_panel_data(N, T, K, **kwargs)
 55 | 
 56 |     # Treatment effects
 57 |     group_1_effect = 0
 58 |     group_2_effect = np.random.normal(0, 0.25, N // 2)
 59 |     group_3_effect = np.linspace(-0.5, 0.5, T - T0)
 60 |     group_4_effect_T0 = np.linspace(-0.25, 0.25, T - T0)
 61 |     group_4_effect_T1 = np.linspace(-0.75, 0.75, T - T1)
 62 | 
 63 |     # Base data
 64 |     data = pd.DataFrame(
 65 |         {
 66 |             "unit": np.repeat(range(N), T),
 67 |             "time": np.tile(range(T), N),
 68 |             "outcome": Y.flatten(),
 69 |         }
 70 |     )
 71 | 
 72 |     # Randomly assign half the units to treatment
 73 |     treated_units = np.random.choice(N, N // 2, replace=False)
 74 | 
 75 |     # Scenario 1: Constant treatment effect (now zero)
 76 |     data1 = data.copy()
 77 |     data1["treated"] = data1["unit"].isin(treated_units) & (data1["time"] >= T0)
 78 |     data1.loc[data1["treated"], "outcome"] += group_1_effect
 79 | 
 80 |     # Scenario 2: Heterogeneous treatment effects across subgroups (zero mean)
 81 |     data2 = data.copy()
 82 |     data2["treated"] = data2["unit"].isin(treated_units) & (data2["time"] >= T0)
 83 |     for i, unit in enumerate(treated_units):
 84 |         data2.loc[
 85 |             (data2["unit"] == unit) & (data2["time"] >= T0), "outcome"
 86 |         ] += group_2_effect[i]
 87 | 
 88 |     # Scenario 3: Heterogeneous treatment effects over time (zero mean)
 89 |     data3 = data.copy()
 90 |     data3["treated"] = data3["unit"].isin(treated_units[: N // 2]) & (
 91 |         data3["time"] >= T0
 92 |     )
 93 |     for t in range(T0, T):
 94 |         data3.loc[
 95 |             (data3["treated"]) & (data3["time"] == t), "outcome"
 96 |         ] += group_3_effect[t - T0]
 97 | 
 98 |     # Scenario 4: Heterogeneous treatment effects over time and across cohorts (zero mean)
 99 |     data4 = data.copy()
100 |     data4["treated_T0"] = data4["unit"].isin(treated_units[: N // 4]) & (
101 |         data4["time"] >= T0
102 |     )
103 |     data4["treated_T1"] = data4["unit"].isin(treated_units[N // 4 : N // 2]) & (
104 |         data4["time"] >= T1
105 |     )
106 |     data4["treated"] = data4["treated_T0"] | data4["treated_T1"]
107 |     for t in range(T0, T):
108 |         data4.loc[
109 |             (data4["treated_T0"]) & (data4["time"] == t), "outcome"
110 |         ] += group_4_effect_T0[t - T0]
111 |     for t in range(T1, T):
112 |         data4.loc[
113 |             (data4["treated_T1"]) & (data4["time"] == t), "outcome"
114 |         ] += group_4_effect_T1[t - T1]
115 |     for d in [data1, data2, data3, data4]:
116 |         d["ever_treated"] = d.groupby("unit")["treated"].transform("max")
117 |     return data1, data2, data3, data4
118 | 
119 | 
120 | # %% [markdown]
121 | # ## Post-treatment difference in means
122 | 
123 | # %%
def calculate_ate(data, t0=None):
    """Return the post-treatment difference in mean outcomes.

    Compares the mean ``outcome`` of treated rows with that of untreated
    rows, using only rows whose ``time`` is at or after the cutoff.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a boolean ``treated`` column plus ``time`` and
        ``outcome`` columns.
    t0 : optional
        First period counted as post-treatment. When omitted, the
        module-level ``T0`` is used, which must already be defined at
        call time (this is the original behaviour).

    Returns
    -------
    float
        Treated mean minus untreated mean.
    """
    cutoff = T0 if t0 is None else t0
    post = data["time"] >= cutoff
    treated_mean = data.loc[data["treated"] & post, "outcome"].mean()
    control_mean = data.loc[~data["treated"] & post, "outcome"].mean()
    return treated_mean - control_mean
128 | 
129 | 
# %% # Generate data
# Panel dimensions: N units, T periods, adoption at T0 (and at T1 for the
# second cohort of scenario 4).
N = 1000
T = 20
T0 = 10
T1 = 15
data1, data2, data3, data4 = generate_quartet(N, T, T0, T1, trend_sigma=0.01)
# %% # Calculate ATEs from post-treatment data
for i, data in enumerate((data1, data2, data3, data4), start=1):
    ate = calculate_ate(data)
    print(f"Scenario {i}:")
    print(f"ATE: {ate:.4f}")
138 | 
139 | 
140 | # %% [markdown]
141 | # ## Plot DGPs
142 | 
# %% # Plot DGPs
# Number of units drawn (without replacement) from each scenario for plotting.
n_samples = 30
scenarios = [
    "1: Constant Effect, 1 adoption cohort",
    "2: Heterogeneous Across Units, 1 adoption cohort",
    "3: Heterogeneous Over Time, 1 adoption cohort",
    "4: Heterogeneous Across Units and Time, 2 adoption cohorts",
]

plot_data_list = []
for i, data in enumerate([data1, data2, data3, data4]):
    # Sample units
    all_units = data["unit"].unique()
    sampled_units = np.random.choice(all_units, n_samples, replace=False)
    # Filter data for sampled units
    scenario_data = data[data["unit"].isin(sampled_units)].copy()
    scenario_data["scenario"] = scenarios[i]
    plot_data_list.append(scenario_data)
# Combine all data
plot_data = pd.concat(plot_data_list, ignore_index=True)
# Create the plot
plot = (
    p9.ggplot(plot_data)
    + p9.aes(x="time", y="outcome", group="factor(unit)", color="factor(treated)")
    + p9.geom_line(alpha=0.7)
    # First adoption date, drawn in every facet.
    + p9.geom_vline(xintercept=T0, linetype="dashed", color="green", alpha=0.7)
    # Second adoption date, drawn only in the two-cohort facet.
    + p9.geom_vline(
        data=plot_data[plot_data["scenario"] == scenarios[3]],
        xintercept=T1,
        linetype="dashed",
        color="purple",
        alpha=0.7,
    )
    + p9.geom_hline(yintercept=0, linetype="dashed", color="black", alpha=0.7)
    + p9.facet_wrap("~ scenario", scales="free_y", ncol=2)
    + p9.theme_matplotlib()
    + p9.theme(
        legend_position="none",
        figure_size=(10, 8),
    )
    + p9.labs(x="Time", y="Outcome")
    + p9.scale_color_manual(values=["blue", "red"])
)
# Take the unit count from n_samples so the title cannot drift from the
# sample actually drawn above.
plot = plot + p9.ggtitle(
    f"Raw outcomes for {n_samples} units from four DGPs\n"
    "Cross-sectional ATE=0 for all four"
)
plot
190 | 
191 | # %% [markdown] id="8T_jARzZpgR4"
192 | # ## Event Study with `fixest`
193 | 
194 | # %%
195 | from saturated import saturated_event_study
196 | 
# %% colab={"base_uri": "https://localhost:8080/", "height": 927} id="z-vu9bLro00j" outputId="86f5baac-2b6b-4c19-9594-9b1c5a867480"
# Re-simulate the four panels with larger trend and noise scales.
N = 1000
T = 20
T0 = 10
T1 = 15
data1, data2, data3, data4 = generate_quartet(N, T, T0, T1, trend_sigma=0.1, sigma=0.2)

# %%
f, ax = plt.subplots(2, 2, figsize=(8, 6), sharex=True, sharey=True)
ax = ax.flatten()
# One saturated event study per scenario, each on its own panel.
for panel, axis in zip((data1, data2, data3, data4), ax):
    saturated_event_study(panel, "outcome", "treated", "time", "unit", ax=axis)
# Reference lines: the omitted event time (-1) and a zero effect.
for axis in ax:
    axis.axvline(-1, color="black", linestyle="--")
for axis in ax:
    axis.axhline(0, color="black", linestyle="--")
f.suptitle("Event Study figures for the four scenarios")
f.tight_layout()
f.show()
# f.savefig("../output/event_study.png")
220 | 


--------------------------------------------------------------------------------
/code/plotters.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | import matplotlib.pyplot as plt
  4 | import pyfixest as pf
  5 | from saturated import test_treatment_heterogeneity, saturated_event_study
  6 | 
  7 | 
  8 | def mini_panelview(data, unit, time, treat):
  9 |     treatment_quilt = data.pivot(index=unit, columns=time, values=treat)
 10 |     treatment_quilt = treatment_quilt.drop_duplicates()
 11 |     treatment_quilt = treatment_quilt.loc[
 12 |         treatment_quilt.sum(axis=1).sort_values().index
 13 |     ]
 14 |     return treatment_quilt
 15 | 
 16 | 
def diag_plot(df, treatment_start_cohorts, base_treatment_effects, figdim = (9, 10)):
    """Draw a four-panel diagnostic figure for a simulated staggered panel.

    Panels, top to bottom: the true effect function of each cohort; a
    pooled two-way fixed effects event study against the cohort-averaged
    truth; the saturated (cohort x event time) event study; and a heatmap
    of the distinct treatment paths.

    Parameters
    ----------
    df : pandas.DataFrame
        Long panel with columns ``unit_id``, ``time_id``, ``W_it`` and
        ``Y_it``.
    treatment_start_cohorts : sequence
        Treatment start period of each cohort.
    base_treatment_effects : sequence
        Per-cohort effect vectors, indexed in the same order as
        ``treatment_start_cohorts``.
    figdim : tuple
        Figure size passed to ``plt.subplots``.

    Returns
    -------
    None
        The figure is created through pyplot and not returned.
    """
    # Attach each unit's first treated period: the smallest positive
    # time_id at which W_it is non-zero (NaN when there is none).
    # NOTE(review): a unit treated from time_id == 0 is not picked up at
    # period 0 by the `x > 0` filter - confirm this cannot occur.
    df2 = df.merge(
        df.assign(first_treated_period=df.time_id * df.W_it)
        .groupby("unit_id")["first_treated_period"]
        .apply(lambda x: x[x > 0].min()),
        on="unit_id",
    )
    df2["rel_time"] = df2.time_id - df2["first_treated_period"]
    # Units with no first treated period get cohort 0 and event time +inf.
    df2["first_treated_period"] = (
        df2["first_treated_period"].replace(np.nan, 0).astype("int")
    )
    df2["rel_time"] = df2["rel_time"].replace(np.nan, np.inf)

    # Pooled event study: event-time dummies (reference -1) with unit and
    # time fixed effects, standard errors clustered by unit.
    fit_evstud = pf.feols(
        "Y_it ~ i(rel_time, ref=-1.0) | unit_id + time_id",
        df2,
        vcov={"CRV1": "unit_id"},
    )
    res = fit_evstud.tidy()
    # truth
    true_fns = {}
    for c, s in enumerate(treatment_start_cohorts):
        # Left-pad with zeros for the pre-treatment periods.
        # NOTE(review): every cohort is padded by the LAST cohort's start
        # period rather than its own - confirm this is intended.
        effect_vector_padded = np.pad(
            base_treatment_effects[c],
            (treatment_start_cohorts[-1], 0),
        )

        # Create x-axis values that skip -1
        x_values = np.arange(len(effect_vector_padded))
        x_values = np.where(
            x_values >= treatment_start_cohorts[-1],
            x_values - treatment_start_cohorts[-1],
            x_values - treatment_start_cohorts[-1] - 1,
        )
        true_fns[f"cohort_{s}"] = pd.Series(
            {x: y for x, y in zip(x_values, effect_vector_padded)}
        )

    # Average the cohort-specific true effects at each event time.
    true_event_study = pd.concat(true_fns).reset_index()
    true_event_study.columns = ["cohort", "rel_time", "true_effect"]
    true_event_study = true_event_study.groupby("rel_time")["true_effect"].mean()
    f, ax = plt.subplots(4, 1, figsize=figdim)
    cmp = plt.get_cmap("Set1")
    # Panel 1: one line per cohort, coloured in insertion order.
    i = 0
    for k, v in true_fns.items():
        ax[0].plot(v, color=cmp(i), marker=".")
        i += 1
    ax[0].axvline(-1, color="black", linestyle="--")
    ax[0].axhline(0, color="black", linestyle=":")
    ax[0].set_title("True treatment effect functions")

    # Recover numeric event times from coefficient names of the form
    # "...[T.<float>]"; names that do not match the pattern become NaN.
    event_time = (
        res.index.str.extract(r"\[T\.(-?\d+\.\d+)\]").astype(float).values.flatten()
    )

    # Panel 2: pooled estimates with their 95% band against the truth.
    ax[1].plot(event_time, res["Estimate"], marker=".", label="2wfe", color=cmp(1))
    ax[1].fill_between(
        event_time,
        res["2.5%"],
        res["97.5%"],
        alpha=0.2,
        color=cmp(1),
    )
    ax[1].plot(true_event_study, color="black", label="true", marker=".")
    ax[1].axvline(-1, color="black", linestyle="--")
    ax[1].axhline(0, color="black", linestyle=":")
    ax[1].set_title("Pooled event study \n 2WFE")
    ax[1].legend()

    # saturated
    _ = saturated_event_study(
        df,
        outcome="Y_it",
        treatment="W_it",
        unit_id="unit_id",
        time_id="time_id",
        ax=ax[2],
    )
    ax[2].set_title("Saturated event study \n cohort X time interactions + 2WFE")

    # Panel 4: distinct treatment paths as a unit-by-time image.
    treat_quilt = mini_panelview(
        df,
        unit="unit_id",
        time="time_id",
        treat="W_it",
    )
    ax[3].imshow(treat_quilt, aspect="auto", cmap="viridis")

    f.tight_layout()
106 | 
107 | 
108 | ######################################################################
def checkplot(df):
    """Plot the saturated event study next to its baseline-plus-deviation form.

    Fits the model from ``test_treatment_heterogeneity`` (common event-time
    coefficients plus cohort-specific deviations) and draws three stacked
    panels: the saturated event study, the raw baseline and deviation
    coefficients, and the baseline with each cohort's deviations added
    back.

    Parameters
    ----------
    df : pandas.DataFrame
        Long panel with columns ``unit_id``, ``time_id``, ``W_it`` and
        ``Y_it``.

    Returns
    -------
    None
        The figure is created through pyplot and not returned.
    """
    mm = test_treatment_heterogeneity(df, retmod=True)
    mmres = mm.tidy().reset_index()
    # Coefficient names have the form "<event-time term>:<cohort dummy>";
    # baseline terms have no cohort part and end up with a missing cohort.
    mmres[["time", "cohort"]] = mmres.Coefficient.str.split(":", expand=True)
    mmres["time"] = mmres.time.str.extract(r"\[T\.(-?\d+\.\d+)\]").astype(float)
    mmres["cohort"] = mmres.cohort.str.extract(r"(\d+)")

    # Event-time coefficient series: "0" is the baseline, every other key
    # is a cohort whose coefficients are deviations from that baseline.
    evstudy_coefs = {}
    evstudy_coefs["0"] = (
        mmres[mmres.cohort.isna()][["Estimate", "time"]].set_index("time").iloc[:, 0]
    )
    # dropna() rather than unique()[1:], so no cohort is lost when the
    # missing value is not the first entry.
    for cohort in mmres.cohort.dropna().unique():
        evstudy_coefs[cohort] = (
            mmres.loc[mmres.cohort == cohort][["Estimate", "time"]]
            .set_index("time")
            .iloc[:, 0]
        )
    # Plot whichever deviation cohorts the model produced instead of
    # assuming the keys "15" and "20".
    deviation_cohorts = [c for c in evstudy_coefs if c != "0"]

    f, ax = plt.subplots(3, 1, figsize=(12, 9), sharex=True)
    # vanilla event study
    saturated_event_study(
        df,
        outcome="Y_it",
        treatment="W_it",
        time_id="time_id",
        unit_id="unit_id",
        ax=ax[0],
    )
    ax[0].set_title("Saturated event study")
    # cohort interactions
    ax[1].set_title("Cohort deviation coefficients relative to first cohort")
    ax[1].plot(evstudy_coefs["0"], label="Cohort 0", marker=".")
    for j, cohort in enumerate(deviation_cohorts, start=1):
        ax[1].plot(evstudy_coefs[cohort], label=f"Cohort {j}", marker=".")
    ax[1].axvline(-0.5, color="black", linestyle="--", alpha=0.5)
    ax[1].axhline(0, color="black", linestyle=":", alpha=0.5)
    # combined
    ax[2].set_title("Aggregate cohort effects (Baseline + cohort deviations)")
    ax[2].plot(evstudy_coefs["0"], label="Cohort 0", marker=".")
    for j, cohort in enumerate(deviation_cohorts, start=1):
        ax[2].plot(
            evstudy_coefs[cohort] + evstudy_coefs["0"],
            label=f"Cohort {j}",
            marker=".",
        )
    ax[2].axvline(-0.5, color="black", linestyle="--", alpha=0.5)
    ax[2].axhline(0, color="black", linestyle=":", alpha=0.5)
    ax[2].legend()
154 | 
155 | 


--------------------------------------------------------------------------------
/code/saturated.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | import matplotlib.pyplot as plt
  4 | import pyfixest as pf
  5 | 
  6 | 
  7 | def saturated_event_study(
  8 |     df: pd.DataFrame,
  9 |     outcome: str = "outcome",
 10 |     treatment: str = "treated",
 11 |     time_id: str = "time",
 12 |     unit_id: str = "unit",
 13 |     ax: plt.Axes = None,
 14 | ):
 15 |     # create interactions
 16 |     df = df.merge(
 17 |         df.assign(first_treated_period=df[time_id] * df[treatment])
 18 |         .groupby(unit_id)["first_treated_period"]
 19 |         .apply(lambda x: x[x > 0].min()),
 20 |         on=unit_id,
 21 |     )
 22 |     df["rel_time"] = df[time_id] - df["first_treated_period"]
 23 |     df["first_treated_period"] = (
 24 |         df["first_treated_period"].replace(np.nan, 0).astype("int")
 25 |     )
 26 |     df["rel_time"] = df["rel_time"].replace(np.nan, np.inf)
 27 |     cohort_dummies = pd.get_dummies(
 28 |         df.first_treated_period, drop_first=True, prefix="cohort_dummy"
 29 |     )
 30 |     df_int = pd.concat([df, cohort_dummies], axis=1)
 31 |     # formula
 32 |     ff = f"""
 33 |                 {outcome} ~
 34 |                 {'+'.join([f"i(rel_time, {x}, ref = -1.0)" for x in df_int.filter(like = "cohort_dummy", axis = 1).columns])}
 35 |                 | {unit_id} + {time_id}
 36 |                 """
 37 |     m = pf.feols(ff, df_int, vcov={"CRV1": unit_id})
 38 |     if ax:
 39 |         # plot
 40 |         res = m.tidy()
 41 |         # create a dict with cohort specific effect curves
 42 |         res_dict = {}
 43 |         for c in cohort_dummies.columns:
 44 |             res_cohort = res.filter(like=c, axis=0)
 45 |             event_time = (
 46 |                 res_cohort.index.str.extract(r"\[T\.(-?\d+\.\d+)\]")
 47 |                 .astype(float)
 48 |                 .values.flatten()
 49 |             )
 50 |             res_dict[c] = {"est": res_cohort, "time": event_time}
 51 | 
 52 |         i = 0
 53 |         cmp = plt.get_cmap("Set1")
 54 |         for k, v in res_dict.items():
 55 |             ax.plot(v["time"], v["est"]["Estimate"], marker=".", label=k, color=cmp(i))
 56 |             ax.fill_between(
 57 |                 v["time"], v["est"]["2.5%"], v["est"]["97.5%"], alpha=0.2, color=cmp(i)
 58 |             )
 59 |             i += 1
 60 |         ax.axvline(-1, color="black", linestyle="--")
 61 |         ax.axhline(0, color="black", linestyle=":")
 62 |     return m
 63 | 
 64 | 
 65 | def test_treatment_heterogeneity(
 66 |     df: pd.DataFrame,
 67 |     outcome: str = "Y_it",
 68 |     treatment: str = "W_it",
 69 |     unit_id: str = "unit_id",
 70 |     time_id: str = "time_id",
 71 |     retmod: bool = False,
 72 | ):
 73 |     # Get treatment timing info
 74 |     df = df.merge(
 75 |         df.assign(first_treated_period=df[time_id] * df[treatment])
 76 |         .groupby(unit_id)["first_treated_period"]
 77 |         .apply(lambda x: x[x > 0].min()),
 78 |         on=unit_id,
 79 |     )
 80 |     df["rel_time"] = df[time_id] - df["first_treated_period"]
 81 |     df["first_treated_period"] = (
 82 |         df["first_treated_period"].replace(np.nan, 0).astype("int")
 83 |     )
 84 |     df["rel_time"] = df["rel_time"].replace(np.nan, np.inf)
 85 |     # Create dummies but drop TWO cohorts - one serves as base for pooled effects
 86 |     cohort_dummies = pd.get_dummies(
 87 |         df.first_treated_period, drop_first=True, prefix="cohort_dummy"
 88 |     ).iloc[
 89 |         :, 1:
 90 |     ]  # drop an additional cohort - drops interactions for never treated and baseline
 91 | 
 92 |     df_int = pd.concat([df, cohort_dummies], axis=1)
 93 | 
 94 |     # Modified formula with base effects + cohort-specific deviations
 95 |     ff = f"""
 96 |     {outcome} ~
 97 |     i(rel_time, ref=-1.0) +
 98 |     {'+'.join([f"i(rel_time, {x}, ref = -1.0)" for x in df_int.filter(like = "cohort_dummy", axis = 1).columns])}
 99 |     | {unit_id} + {time_id}
100 |     """
101 | 
102 |     model = pf.feols(ff, df_int, vcov={"CRV1": unit_id})
103 |     P = model.coef().shape[0]
104 | 
105 |     if retmod:
106 |         return model
107 |     mmres = model.tidy().reset_index()
108 |     mmres[["time", "cohort"]] = mmres.Coefficient.str.split(":", expand=True)
109 |     mmres["time"] = mmres.time.str.extract(r"\[T\.(-?\d+\.\d+)\]").astype(float)
110 |     mmres["cohort"] = mmres.cohort.str.extract(r"(\d+)")
111 |     # indices of coefficients that are deviations from common event study coefs
112 |     event_study_coefs = mmres.loc[~(mmres.cohort.isna()) & (mmres.time > 0)].index
113 |     # Method 2 (K x P) - more efficient
114 |     K = len(event_study_coefs)
115 |     R2 = np.zeros((K, P))
116 |     for i, idx in enumerate(event_study_coefs):
117 |         R2[i, idx] = 1
118 | 
119 |     test_result = model.wald_test(R=R2, distribution="chi2")
120 |     return test_result["pvalue"]
121 | 
122 | 
def test_dynamics(
    df,
    outcome="Y",
    treatment="W",
    time_id="time",
    unit_id="unit",
    vcv=None,
):
    """Wald test of a single static effect against a dynamic event study.

    A restricted model with one treatment coefficient and an unrestricted
    event-study model are both fit with unit and time fixed effects. The
    test's null is that every event-study coefficient equals the
    restricted estimate.

    Parameters
    ----------
    df : pandas.DataFrame
        Long panel with one row per (unit, time) pair.
    outcome, treatment, time_id, unit_id : str
        Column names of the outcome, the treatment indicator, the period
        and the unit identifier.
    vcv : dict, optional
        Variance-covariance specification for the unrestricted model.
        Defaults to clustering by ``unit_id``, which equals the previous
        ``{"CRV1": "unit"}`` default whenever ``unit_id`` is ``"unit"``.

    Returns
    -------
    The p-value of the chi-squared Wald test.
    """
    if vcv is None:
        # Built per call: avoids a shared mutable default and keeps the
        # cluster variable in step with unit_id.
        vcv = {"CRV1": unit_id}
    # Fit models
    df = df.merge(
        df.assign(first_treated_period=df[time_id] * df[treatment])
        .groupby(unit_id)["first_treated_period"]
        .apply(lambda x: x[x > 0].min()),
        on=unit_id,
    )
    df["rel_time"] = df[time_id] - df["first_treated_period"]
    # Units with no first treated period get event time +inf.
    df["rel_time"] = df["rel_time"].replace(np.nan, np.inf)
    # Only the point estimate of the restricted model is used below.
    restricted = pf.feols(f"{outcome} ~ i({treatment}) | {unit_id} + {time_id}", df)
    # NOTE(review): the reference event time here is 0, whereas the other
    # event-study helpers in this module use -1 - confirm intended.
    unrestricted = pf.feols(
        f"{outcome} ~ i(rel_time, ref=0) | {unit_id} + {time_id}", df, vcov=vcv
    )
    # Get the restricted estimate
    restricted_effect = restricted.coef().iloc[0]
    # Create R matrix - each row tests one event study coefficient
    # against restricted estimate
    n_evstudy_coefs = unrestricted.coef().shape[0]
    R = np.eye(n_evstudy_coefs)
    # q vector is the restricted estimate repeated
    q = np.repeat(restricted_effect, n_evstudy_coefs)
    # Conduct Wald test
    pv = unrestricted.wald_test(R=R, q=q, distribution="chi2")["pvalue"]
    return pv
155 | 


--------------------------------------------------------------------------------
/code/syncer.sh:
--------------------------------------------------------------------------------
# Sync every notebook in the current directory with its paired text file.
for f in *.ipynb; do
    # With no notebooks present the glob stays a literal "*.ipynb"; skip it.
    [ -e "$f" ] || continue
    # Quote the name so paths containing spaces stay a single argument.
    jupytext --sync "$f"
done
2 | 


--------------------------------------------------------------------------------
/figtab/hetfx.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/apoorvalal/TestingInEventStudies/67fcce76736e78863a53da9696b405ab31037672/figtab/hetfx.png


--------------------------------------------------------------------------------
/figtab/homfx.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/apoorvalal/TestingInEventStudies/67fcce76736e78863a53da9696b405ab31037672/figtab/homfx.png


--------------------------------------------------------------------------------
/figtab/rejection_rates_F.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/apoorvalal/TestingInEventStudies/67fcce76736e78863a53da9696b405ab31037672/figtab/rejection_rates_F.png


--------------------------------------------------------------------------------
/figtab/rejection_rates_dyn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/apoorvalal/TestingInEventStudies/67fcce76736e78863a53da9696b405ab31037672/figtab/rejection_rates_dyn.png


--------------------------------------------------------------------------------
/figtab/respecification_verify.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/apoorvalal/TestingInEventStudies/67fcce76736e78863a53da9696b405ab31037672/figtab/respecification_verify.png


--------------------------------------------------------------------------------
/figtab/respecification_verify_het.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/apoorvalal/TestingInEventStudies/67fcce76736e78863a53da9696b405ab31037672/figtab/respecification_verify_het.png


--------------------------------------------------------------------------------
/figtab/respecification_verify_hom.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/apoorvalal/TestingInEventStudies/67fcce76736e78863a53da9696b405ab31037672/figtab/respecification_verify_hom.png


--------------------------------------------------------------------------------
/figtab/static_dynamic_effects.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/apoorvalal/TestingInEventStudies/67fcce76736e78863a53da9696b405ab31037672/figtab/static_dynamic_effects.png


--------------------------------------------------------------------------------
/figtab/true_functions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/apoorvalal/TestingInEventStudies/67fcce76736e78863a53da9696b405ab31037672/figtab/true_functions.png


--------------------------------------------------------------------------------
/paper/appendix.typ:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/apoorvalal/TestingInEventStudies/67fcce76736e78863a53da9696b405ab31037672/paper/appendix.typ


--------------------------------------------------------------------------------
/paper/jmlr.typ:
--------------------------------------------------------------------------------
  1 | #let jmlr(
  2 |   title: [],
  3 |   authors: (),
  4 |   abstract: [],
  5 |   keywords: (),
  6 |   bibliography: none,
  7 |   appendix: none,
  8 |   date: none,
  9 |   body,
 10 | ) = {
 11 |   // Extract affls if provided in the specific format
 12 |   let affls = ()
 13 |   if authors.len() == 2 and type(authors) == array {
 14 |     (authors, affls) = authors
 15 |   }
 16 | 
 17 |   // Basic document setup
 18 |   set document(title: title)
 19 |   set page(
 20 |     paper: "us-letter",
 21 |     margin: (left: 1.0in, right: 1.0in, top: 1.0in, bottom: 1.0in),
 22 |     numbering: "1",
 23 |   )
 24 | 
 25 |   // Basic text settings
 26 |   set text(font: ("P052",), size: 11pt)
 27 |   set par(leading: 0.55em, first-line-indent: 17pt, justify: true)
 28 |   set heading(numbering: "1.1  ")
 29 | 
 30 |   // Set citation style to dark red
 31 |   // set cite(style: "chicago-author-date")
 32 |   show cite: set text(fill: rgb(139, 0, 0))  // Dark red color for citations
 33 | 
 34 |   // Make all links dark red with no underline
 35 |   show link: set text(fill: rgb(139, 0, 0))
 36 | 
 37 |   // Title
 38 |   align(center)[
 39 |     #block(text(size: 14pt, weight: "bold", title))
 40 |     #v(1em)
 41 |   ]
 42 | 
 43 |   // Authors
 44 |   for author in authors {
 45 |     align(center)[
 46 |       #text(weight: "bold", author.name)
 47 |       #if "affl" in author and author.affl in affls {
 48 |         let affl = affls.at(author.affl)
 49 |         if "department" in affl {
 50 |           linebreak()
 51 |           emph(affl.department)
 52 |         }
 53 |       }
 54 |       #if "email" in author and author.email != "" {
 55 |         linebreak()
 56 |         link("mailto:" + author.email, author.email)
 57 |       }
 58 |     ]
 59 |     v(0.5em)
 60 |   }
 61 | 
 62 |   // Abstract
 63 |   if abstract != [] {
 64 |     v(1em)
 65 |     align(center)[*Abstract*]
 66 |     block(
 67 |       width: 100%,
 68 |       inset: (x: 2em),
 69 |       abstract
 70 |     )
 71 |   }
 72 | 
 73 |   // Keywords
 74 |   if keywords != () {
 75 |     v(0.5em)
 76 |     block(
 77 |       width: 100%,
 78 |       inset: (x: 2em),
 79 |       [*Keywords:* #keywords.join(", ")]
 80 |     )
 81 |   }
 82 | 
 83 |   v(2em)
 84 | 
 85 |   // Main body
 86 |   body
 87 | 
 88 |   // Appendix
 89 |   if appendix != none {
 90 |     pagebreak()
 91 |     heading(numbering: "A.1", [Appendix])
 92 |     counter(heading).update(0)
 93 |     appendix
 94 |   }
 95 | 
 96 |   // Bibliography
 97 |   if bibliography != none {
 98 |     pagebreak()
 99 |     heading([References])
100 |     bibliography
101 |   }
102 | }
103 | 
// Minimal theorem environment: a light-grey rounded box whose content
// starts with a bold "Theorem." label followed by `body`.
#let theorem(body) = {
  block(
    radius: 4pt,
    inset: 1em,
    fill: rgb(240, 240, 240),
  )[*Theorem.* #body]
}
113 | 
// Minimal proof environment: a bold "Proof." label, then `body`, with a
// hollow square pushed to the right edge as the end-of-proof mark.
#let proof(body) = {
  block(inset: 1em)[*Proof.* #body #h(1fr) #sym.square.stroked]
}
121 | 


--------------------------------------------------------------------------------
/paper/main.bib:
--------------------------------------------------------------------------------
  1 | @ARTICLE{Arkhangelsky2023-rf,
  2 |   title        = {Causal Models for Longitudinal and Panel Data: A Survey},
  3 |   author       = {Arkhangelsky, D and Imbens, G},
  4 |   journaltitle = {SSRN Electronic Journal},
  5 |   date         = {2023-11-26},
  6 |   url          = {http://www.nber.org/papers/w31942.pdf}
  7 | }
  8 | 
  9 | @inproceedings{currie2020technology,
 10 |   title={Technology and big data are changing economics: Mining text to track methods},
 11 |   author={Currie, Janet and Kleven, Henrik and Zwiers, Esm{\'e}e},
 12 |   booktitle={AEA Papers and Proceedings},
 13 |   volume={110},
 14 |   pages={42--48},
 15 |   year={2020},
 16 |   organization={American Economic Association 2014 Broadway, Suite 305, Nashville, TN 37203}
 17 | }
 18 | 
 19 | @book{lehmann2005testing,
 20 |   address = {New York},
 21 |   author = {Lehmann, E. L. and Romano, Joseph P.},
 22 |   edition = {Third},
 23 |   publisher = {Springer},
 24 |   series = {Springer Texts in Statistics},
 25 |   title = {Testing statistical hypotheses},
 26 |   year = 2005
 27 | }
 28 | 
 29 | @ARTICLE{Ding2019-nr,
 30 |   title        = {Decomposing Treatment Effect Variation},
 31 |   author       = {Ding, Peng and Feller, Avi and Miratrix, Luke},
 32 |   journaltitle = {Journal of the American Statistical Association},
 33 |   publisher    = {Taylor \& Francis},
 34 |   volume       = {114},
 35 |   issue        = {525},
 36 |   pages        = {304--317},
 37 |   date         = {2019-01-02},
 38 |   url          = {https://doi.org/10.1080/01621459.2017.1407322}
 39 | }
 40 | 
 41 | @ARTICLE{Callaway2021-gv,
 42 |   title        = {Difference-in-Differences with multiple time periods},
 43 |   author       = {Callaway, Brantly and Sant'Anna, Pedro H C},
 44 |   journaltitle = {Journal of econometrics},
 45 |   publisher    = {Elsevier BV},
 46 |   volume       = {225},
 47 |   issue        = {2},
 48 |   pages        = {200--230},
 49 |   date         = {2021-12-01},
 50 |   url          = {https://www.sciencedirect.com/science/article/pii/S0304407620303948},
 51 |   language     = {en}
 52 | }
 53 | 
 54 | 
 55 | @ARTICLE{weiss2024much,
 56 |   title={How Much Should We Trust Modern Difference-in-Differences Estimates?},
 57 |   author={Weiss, Amanda},
 58 |   year={2024},
 59 |   journal={socarxiv preprint},
 60 |   institution={Center for Open Science}
 61 | }
 62 | 
 63 | 
 64 | @article{chiu2023and,
 65 |   title={What to do (and not to do) with causal panel analysis under parallel trends: Lessons from a large reanalysis study},
 66 |   author={Chiu, Albert and Lan, Xingchen and Liu, Ziyi and Xu, Yiqing},
 67 |   journal={arXiv preprint arXiv:2309.15983},
 68 |   year={2023}
 69 | }
 70 | 
 71 | @ARTICLE{Schmidheiny2023-of,
 72 |   title        = {On event studies and distributed‐lags in two‐way fixed effects
 73 |                   models: Identification, equivalence, and generalization},
 74 |   author       = {Schmidheiny, Kurt and Siegloch, Sebastian},
 75 |   journaltitle = {Journal of applied econometrics (Chichester, England)},
 76 |   publisher    = {Wiley},
 77 |   volume       = {38},
 78 |   issue        = {5},
 79 |   pages        = {695--713},
 80 |   date         = {2023-08},
 81 |   url          = {https://www.schmidheiny.name/research/docs/schmidheiny-siegloch_2020-11.pdf},
 82 |   language     = {en}
 83 | }
 84 | 
 85 | 
 86 | @ARTICLE{Abraham2020-wu,
 87 |   title        = {Estimating Dynamic Treatment Effects in Event Studies with
 88 |                   Heterogeneous Treatment Effects},
 89 |   author       = {Abraham, Sarah and Sun, Liyang},
 90 |   journaltitle = {Journal of econometrics},
 91 |   date         = {2020}
 92 | }
 93 | 
 94 | @book{angrist2009mostly,
 95 |   title={Mostly harmless econometrics: An empiricist's companion},
 96 |   author={Angrist, Joshua D and Pischke, J{\"o}rn-Steffen},
 97 |   year={2009},
 98 |   publisher={Princeton university press}
 99 | }
100 | 
101 | @article{rambachan2023more,
102 |   title={A more credible approach to parallel trends},
103 |   author={Rambachan, Ashesh and Roth, Jonathan},
104 |   journal={Review of Economic Studies},
105 |   volume={90},
106 |   number={5},
107 |   pages={2555--2591},
108 |   year={2023},
109 |   publisher={Oxford University Press US}
110 | }
111 | 
112 | @article{lechner2011estimation,
113 |   title={The estimation of causal effects by difference-in-difference methods},
114 |   author={Lechner, Michael},
115 |   journal={Foundations and Trends in Econometrics},
116 |   volume={4},
117 |   number={3},
118 |   pages={165--224},
119 |   year={2011},
120 |   publisher={Now Publishers, Inc.}
121 | }
122 | 
123 | @ARTICLE{Wooldridge2021-op,
124 |   title        = {Two-Way Fixed Effects, the Two-Way Mundlak Regression, and
125 |                   Difference-in-Differences Estimators},
126 |   author       = {Wooldridge, Jeffrey M},
127 |   journaltitle = {Working paper},
128 |   date         = {2021-08-17},
129 |   url          = {http://dx.doi.org/},
130 |   urldate      = {2021-08-16}
131 | }
132 | 
133 | @article{lal2024large,
134 |   title={Large Scale Longitudinal Experiments: Estimation and Inference},
135 |   author={Lal, Apoorva and Fischer, Alexander and Wardrop, Matthew},
136 |   journal={arXiv preprint arXiv:2410.09952},
137 |   year={2024}
138 | }
139 | 
140 | @ARTICLE{Roth2022-sz,
141 |   title        = {What's trending in difference-in-differences? A synthesis of
142 |                   the recent econometrics literature},
143 |   author       = {Roth, Jonathan and Sant'Anna, Pedro H C and Bilinski, Alyssa
144 |                   and Poe, John},
145 |   journaltitle = {Journal of Econometrics},
146 |   date         = {2023-01-04},
147 |   eprinttype   = {arXiv},
148 |   eprintclass  = {econ.EM},
149 |   urldate      = {2023-01-01}
150 | }
151 | 
152 | 
153 | @ARTICLE{Goldsmith-Pinkham2024-ef,
154 |   title        = {Contamination bias in linear regressions},
155 |   author       = {Goldsmith-Pinkham, Paul S and Hull, Peter and Kolesár, Michal},
156 |   journaltitle = {American Economic Review},
157 |   date         = {2024},
158 |   language     = {en}
159 | }
160 | 
161 | @ARTICLE{De_Chaisemartin2021-ln,
162 |   title   = {Two-way fixed effects and differences-in-differences with
163 |              heterogeneous treatment effects: A survey},
164 |   author  = {de Chaisemartin, Clément and D'Haultfœuille, Xavier},
165 |   date    = {2021-12-08},
166 |   url     = {https://papers.ssrn.com/abstract=3980758},
167 |   urldate = {2021-12-08}
168 | }
169 | 
170 | @ARTICLE{Goodman-Bacon2021-ys,
171 |   title        = {Difference-in-differences with variation in treatment timing},
172 |   author       = {Goodman-Bacon, Andrew},
173 |   journaltitle = {Journal of econometrics},
174 |   date         = {2021-06-12},
175 |   url          = {https://www.sciencedirect.com/science/article/pii/S0304407621001445}
176 | }
177 | 
178 | @ARTICLE{De_Chaisemartin2020-za,
179 |   title        = {Two-way fixed effects estimators with heterogeneous treatment
180 |                   effects},
181 |   author       = {de Chaisemartin, Clément and D'Haultfœuille, Xavier},
182 |   journaltitle = {The American economic review},
183 |   date         = {2020},
184 |   eprinttype   = {arXiv},
185 |   eprintclass  = {econ.EM},
186 |   url          = {http://arxiv.org/abs/1803.08807}
187 | }
188 | 
189 | @ARTICLE{Imbens2021-zw,
190 |   title        = {Double-Robust Identification for Causal Panel Data Models},
191 |   author       = {{Imbens} and {Arkhangelsky}},
192 |   journaltitle = {NBER},
193 |   date         = {2021},
194 |   url          = {https://www.nber.org/system/files/working_papers/w28364/w28364.pdf}
195 | }
196 | 


--------------------------------------------------------------------------------
/paper/main.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/apoorvalal/TestingInEventStudies/67fcce76736e78863a53da9696b405ab31037672/paper/main.pdf


--------------------------------------------------------------------------------
/paper/main.typ:
--------------------------------------------------------------------------------
  1 | #import "jmlr.typ": jmlr, theorem, proof
  2 | #let authors = (
  3 |   (name: "Apoorva Lal", affl: "one", email: ""),
  4 | )
  5 | #let affls = (
  6 |   one: (
  7 |     department: "Netflix, Los Gatos, CA",
  8 |   ),
  9 | )
 10 | #set math.equation(numbering: "(1)")
 11 | #show: jmlr.with(
 12 |   title: [When can we get away with using the two-way fixed effects regression?],
 13 |   authors: (authors, affls),
 14 |   abstract: "
 15 |   The use of the two-way fixed effects regression in empirical social science was historically motivated by folk wisdom that it uncovers the average treatment effect on the treated (ATT). This practice has come under scrutiny due to recent results in applied econometrics showing that it fails to uncover meaningful averages of heterogeneous treatment effects in the presence of effect heterogeneity over time and across adoption cohorts, and several heterogeneity-robust alternatives have been proposed. However, these estimators often have higher variance and are therefore under-powered for many applications, which poses a bias-variance tradeoff that is challenging for researchers to navigate. In this paper, we propose simple tests of linear restrictions that can be used to test for differences in dynamic treatment effects over cohorts, which allows us to assess when the two-way fixed effects regression is likely to yield biased estimates of the ATT. These tests are implemented as methods in the pyfixest Python library.
 16 |   ",
 17 |   keywords: ("difference in differences", "panel data", "heterogeneous treatment effects"),
 18 |   bibliography: bibliography("main.bib"),
 19 |   appendix: none,
 20 |   date: datetime(
 21 |   year: 2025,
 22 |   month: 03,
 23 |   day: 15,
 24 | )
 25 | 
 26 | )
 27 | 
 28 | 
 29 | 
 30 | 
 31 | = Introduction
 32 | 
 33 | Difference-in-Differences and event-studies are now the most popular methods for estimating causal effects in observational settings thanks to the growing importance of administrative and other offline data sources (@currie2020technology). Their popularity arises from their broad applicability to settings with selection on unobservable unit-specific factors, and straightforward implementation as a two-way fixed effects linear regression in the two-period case. As with many empirical techniques, however, practice outstrips theory, and the extension of the equivalence between fixed-effects regression and the non-parametric estimator in the two-period, two-cohort case turned out to be more complicated than previously realized, which has prompted an explosion of alternative estimators that are robust to the 'contamination bias' introduced into the two-way fixed-effects regression by treatment effect heterogeneity. Since these newer estimators either trim the data or add many parameters to zero out the bias, they tend to have considerably higher variance, which introduces a bias-variance tradeoff that is challenging to navigate for practitioners. In this paper, we propose the application of
 34 | classical joint-tests of appropriately parametrized linear regressions as a tool to aid practitioners in navigating the bias-variance tradeoff in difference-in-differences settings.
 35 | 
 36 | = Methodology
 37 | 
 38 | Consider a balanced panel-data setting with $i = 1, ..., N$ individuals observed over $t = 1, ..., T$ time periods. For each unit $i$, a binary treatment $w_(i t) := 1(t >= g_(i))$ is assigned at some adoption time $g_(i) in cal(G)$ where $cal(G) := [T] union {infinity}$ is the set of treatment adoption times and $g_i = infinity$ indicates a never-treated unit. We observe a scalar outcome $y_(i t) = w_(i t) y^(1)_(i t) + (1-w_(i t)) y^(0)_(i t)$, where $y^(1)_(i t)$ and $y^(0)_(i t)$ are potential outcomes under treatment and control, respectively
 39 | #footnote[Defining potential outcomes as $y^(w)_(i t)$ is a strong but common assumption; it requires no carryover - that the outcome for unit $i$ at time $t$ is only influenced by $i$'s current-period treatment and not treatment history. Alternative estimators such as Marginal Structural Models (MSMs) and dynamic panel models permit estimation in the presence of carryover under different strong assumptions but are considerably more computationally challenging, and as such are used infrequently.]. The following two-way fixed effects regression
 40 | 
 41 | $
 42 |   y_(i t) = tau w_(i t) + alpha_i + lambda_t + epsilon_(i t)
 43 | $ <statictwfe>
 44 | 
 45 | is a workhorse regression in applied economics and adjacent fields for the estimation of causal effects in such settings. The estimand that researchers typically seek to estimate in panel data settings is the Average Treatment effect on the Treated (ATT) ($EE[y^(1)_(i t) - y^(0)_(i t) | w_(i t) = 1]$), and researchers often interpret the coefficient on the treatment indicator, $hat(tau)$, as an estimate of the ATT.
 46 | The above regression's dynamic ('event study') counterpart is
 47 | 
 48 | $
 49 | y_(i t) = sum_(s != -1)^(T) gamma_s Delta_(i t)^s + alpha_i + lambda_t + epsilon_(i t)
 50 | $ <eventstudy>
 51 | 
 52 | where $Delta_(i t)^s$ is an indicator for the $s$-th period relative to the adoption time for treated units (which in turn is the first-difference of the treatment indicator, @Schmidheiny2023-of). The presence of leads and lags of the switching indicator in this regression allows us to interpret the coefficients on lags as estimates of the dynamic ATT (@angrist2009mostly ch 5) and the coefficients on leads as a visual check of the validity of the parallel trends assumption. This practice is widespread in applied research but tends to be distortionary and has low power (@rambachan2023more).
 53 | 
 54 | When $g_i in {T_0, infinity}$ (one-shot adoption), the above regressions yield unbiased estimates of the ATT under the assumption of parallel trends and no anticipation (@lechner2011estimation). However, when $g_i in {T_0, ..., T-1}$ (staggered adoption), the above regressions exhibit the 'negative weighting'/'contamination bias' problem (@Goodman-Bacon2021-ys, @De_Chaisemartin2020-za, @Goldsmith-Pinkham2024-ef). As in the cross-sectional case, the regression coefficient on the treatment indicator, $hat(tau)$, is a weighted average of the treatment effects over time and across treated cohorts, where the weights are functions of the conditional variance in the treatment. Unlike in the cross-sectional case, however, these weights can be negative for some cohorts, which yields the conclusion that the two-way fixed effects regression can fail to uncover meaningful averages of heterogeneous treatment effects over time and across adoption cohorts#footnote[In particular, this constitutes a violation of the 'no-sign reversal property', where $hat(tau)$ can be positive even if the treatment effect is strictly negative for each $(g,t)$ (@De_Chaisemartin2021-ln).]. The same is true for the event study coefficient vector $bold(gamma)$ (@Abraham2020-wu).
 55 | 
 56 | This has prompted an explosion of research in applied econometrics on new estimators that aim to uncover the ATT in the presence of heterogeneous treatment effects over time and across adoption cohorts (see @De_Chaisemartin2021-ln, @Roth2022-sz, @Arkhangelsky2023-rf for reviews). Such heterogeneity-robust estimators typically involve estimating the ATT separately for each cohort using tailored comparisons between each treated cohort and either a never-treated or not-yet-treated group, and then averaging (optionally weighted by inverse-propensity weights, e.g. @Callaway2021-gv) these estimates to obtain an overall estimate of the ATT. While their consistency properties for the ATT are well understood and they avoid the negative weighting problem by construction, they are often computationally expensive and have higher variance than the two-way fixed effects regression.
 57 | 
 58 | This poses a practical bias-variance tradeoff for researchers: while the two-way fixed effects regression is computationally simple and has low variance, it may yield biased estimates of the ATT in the presence of heterogeneous treatment effects over time and across adoption cohorts. In contrast, heterogeneity-robust estimators are computationally expensive and have higher variance, but they are consistent for the ATT in the presence of heterogeneous treatment effects over time and across adoption cohorts. As a practical matter, a large re-analysis of published work in political science by @chiu2023and finds that heterogeneity-robust estimators rarely overturn the conclusions of the two-way fixed effects regression, and typically have considerably larger variance. Similarly, @weiss2024much finds that most new heterogeneity-robust estimators are underpowered for realistic effect sizes in the state-level US setting where difference-in-differences approaches are commonly used.
 59 | 
 60 | This motivates the primary focus of this paper: to develop simple tests that can be used to test for differences in dynamic treatment effects over cohorts, which allows us to test for when the two-way fixed effects regression is likely to yield biased estimates of the ATT. Heuristically, if the dynamic treatment effects are homogeneous over cohorts, then the two-way fixed effects regression is likely to yield unbiased estimates of the ATT that are considerably more precise than alternative estimators that typically discard more data in order to shut down the negative weighting problem.
 61 | 
 62 | To build intuition for this approach, consider @homfx and @hetfx. In @homfx, there are three adoption cohorts (plus a never-treated cohort - bottom panel), and all cohorts exhibit the same temporal heterogeneity pattern (the effect function is $log(t)$ - top panel), and so the 2WFE event study (blue line in panel 2) is consistent for the true dynamic ATT (black line in panel 2). We can also consistently estimate the cohort-level ATTs with an appropriately saturated regression  (@Abraham2020-wu, @Wooldridge2021-op) as shown in the third panel. In @hetfx, in contrast, we have the same three adoption cohorts, but the three cohorts exhibit radically different temporal heterogeneity: the first exhibits a linear decay down to zero, the second exhibits a log increase followed by zero, and the third exhibits sinusoidal effects. In this case, the 2WFE event study (blue line in panel 2) is not consistent for the true dynamic ATT (black line in panel 2); in fact, the estimated event study suggests a violation of the parallel trend assumption despite the treatments being randomized and thus parallel trends being true in the DGP, which is a pernicious side-effect of the negative weights problem. We can still estimate the cohort-level ATTs correctly with a saturated regression. The key insight is that testing for differences between a 'pooled' event study (the blue line in the second panel) and cohort X time interactions (that yield the cohort-level estimates in the third panel) can help us distinguish between the two scenarios. This can be formulated as a joint F-test on the coefficients of the cohort X time interactions in a saturated regression. We provide a formal statement of this test in the next section, and show through simulation studies that this approach can detect cohort-level temporal heterogeneity in a variety of DGPs.
 63 | 
 64 | #figure(
 65 |   image("../figtab/homfx.png", width: 100%),
 66 |   caption: [
 67 |     true and estimated effects from pooled and saturated event study regressions with homogeneous treatment effects across three cohorts. Joint test p-value = 0.11
 68 |   ],
 69 | ) <homfx>
 70 | 
 71 | 
 72 | #figure(
 73 |   image("../figtab/hetfx.png", width: 100%),
 74 |   caption: [
 75 |     true and estimated effects from pooled and saturated event study regressions in a DGP with heterogeneous treatment effects across three cohorts. Joint test p-value = 0.000
 76 |   ],
 77 | ) <hetfx>
 78 | 
 79 | 
 80 | = Testing Procedures
 81 | 
 82 | The tests considered in this section take the form of traditional joint tests of multiple linear restrictions, where the null hypothesis is $bold(R) bold(beta) = bold(q)$, $bold(R)$ is an $m times k$ matrix of linear restrictions, $bold(beta)$ is a $k times 1$ vector of coefficients, and $bold(q)$ is an $m times 1$ vector of constants. The test statistic is then
 83 | 
 84 | $
 85 | F = frac(
 86 | (bold(R) hat(bold(beta)) - bold(q))'
 87 | [bold(R) hat(bb(V)) bold(R)']^(-1)
 88 | (bold(R) hat(bold(beta)) - bold(q)),
 89 | m)
 90 | ~ F(m, n-k) " under the null hypothesis"
 91 | $
 92 | 
 93 | where $hat(bb(V))$ is the cluster-robust variance-covariance matrix of the coefficient estimates.
 94 | We consider two tests: one for testing for event study dynamics, and one for testing for heterogeneity in event study dynamics. These tests are both classical Wald tests for linear restrictions and are asymptotically equivalent to the Likelihood Ratio test and Lagrange Multiplier test. The test is optimal (most powerful) in the class of invariant tests for local alternatives when errors are normally distributed (@lehmann2005testing lemma 8.5.2).
 95 | #footnote[This can be implemented using either a $chi^2$ or $F$ test; the distinction between the two is due to different degrees of freedom that disappear for realistic sample sizes.]
 96 | 
 97 | == Testing for event study dynamics <test_dyn>
 98 | 
 99 | As a warmup, consider a simple comparison between @statictwfe and @eventstudy. The latter decomposes the ATT across time-periods. For the purposes of testing for event study dynamics, we only care about comparing the equality of the dynamic treatment effects after the treatment is assigned (${gamma_t}_(t=0)^T$) against the common ATT estimate $tau$. We can test the following null hypothesis
100 | $
101 | H_0: gamma_t = hat(tau) " for all " t in {0, ..., T}
102 | $
103 | 
104 | by specifying $bold(R) = bold(I)_(T_1)$ as the $T_1 times T_1$ identity matrix, where $T_1$ is the number of post-treatment event study coefficients, and $bold(q) = (hat(tau),  ..., hat(tau))'$ as a $T_1$-vector of the restricted estimate ($hat(tau)$ from @statictwfe).
105 | #footnote[
106 |   This can equivalently be formulated by testing for the equality of adjacent elements of $bold(gamma)$, e.g. $gamma_1 = gamma_2$, by specifying an $bold(R)$ that contains rows like $[1, -1, 0, ..., 0]$ and $bold(q) = [0, ..., 0]$.
107 | ]
108 | 
109 | == Testing for across-cohort heterogeneity in dynamic treatment effects <test_het>
110 | 
111 | Next, we extend the approach outlined above to construct a test for across-cohort heterogeneity in dynamic treatment effects. A conventional method to estimate the cohort-level ATTs is to estimate the dynamic treatment effects separately for each cohort and then average these estimates to obtain an overall estimate of the ATT (@Abraham2020-wu, @Wooldridge2021-op, @lal2024large), which involves specifying the following regression
112 | 
113 | $
114 | y_(i t) = alpha_i + lambda_t +
115 |   underbrace(
116 |     sum_(c in cal(C) \\ {infinity}) sum_(s != -1)^(T) bb(1)(g_i = c) tau^(s c) Delta_(i t)^s,
117 |     "Cohort-Time Interactions")
118 |   + epsilon_(i t)
119 | $ <satevent>
120 | 
121 | This is a saturated event study that constructs cohort $times$ time interactions for each adoption cohort (with the never-treated cohort, $g_i = infinity$, omitted) and therefore recovers the cohort-level event studies. These coefficients are reported in the third panel in @homfx and @hetfx, and correctly uncover the true cohort-level ATTs in the presence of arbitrary heterogeneous treatment effects across cohorts (top panel). The downsides of this approach, however, are twofold. First, these regressions can get unwieldy with many cohorts, and the number of parameters grows linearly with the number of cohorts. Second, the cohort-level ATTs are self-contained and therefore constructing a test for equality across multiple cohorts is not straightforward. Instead, one may re-specify the saturated event-study regression @satevent as follows:
122 | 
123 | $
124 |   y_(i t) = alpha_i + lambda_t +
125 |     underbrace(sum_(s != -1)^(T) gamma_s Delta_(i t)^s, "(a) Common event study coefficients")
126 |     +
127 |     underbrace(sum_(c in cal(C)) sum_(s != -1)^(T) delta_s^c Delta_(i t)^(c s), "(b) Cohort-specific deviations")
128 |     + epsilon_(i t)
129 | $ <jointreg>
130 | 
131 | @jointreg returns estimates of the cohort-level dynamic ATT that are numerically identical to those from @satevent, but it allows us to test for differences in dynamic treatment effects over cohorts more easily. This is because @jointreg contains a common event study coefficient vector (a), and cohort-level deviations (b). The (b) terms can be jointly tested against the null of zero, which serves as a direct test of cohort-level treatment effect heterogeneity relative to a traditional event study. This approach is similar to omnibus tests of effect heterogeneity in cross-sectional RCTs proposed by @Ding2019-nr, where testing the joint null of $gamma = 0$ in the interacted regression $y ~ tau W + X beta + W X gamma + epsilon$ serves as a test for explained effect heterogeneity. We illustrate an application of this test in @respec, where the top panel reports the saturated event study @satevent, the middle panel reports the coefficients from the re-specified model @jointreg, and the bottom panel reports the sum of the common event study and cohort-specific deviations, which reproduces the saturated event study estimates exactly.
132 | 
133 | #set align(left)
134 | #figure(
135 |     grid(
136 |         columns: (260pt, 260pt),
137 |         [ #image("../figtab/respecification_verify_hom.png", width: 120%) ],
138 |         [ #image("../figtab/respecification_verify_het.png", width: 120%) ],
139 |     ),
140 |     caption: [
141 |     For each DGP (homogeneous - @homfx - on the left and heterogeneous - @hetfx - on the right), the top panel illustrates the traditional event study estimates from eqn @satevent, which are unbiased for the true effects. The middle panel plots the re-specified model, which plots an overall event study (first cohort : blue) and subsequent cohort deviations (second and third cohorts - which are null in this DGP). The final panel plots the sum of the blue and cohort-specific coefficients, which reproduces the event study coefficient from the first panel exactly.
142 |   ],
143 | ) <respec>
144 | 
145 | 
146 | We show in the next section that this test controls size under the null hypothesis of homogeneous dynamic treatment effects over cohorts, and that it has power against a variety of alternatives. As a concrete example, the joint $p$-value for the cohort $times$ time interactions in @homfx is $0.11$, while the joint $p$-value for the cohort $times$ time interactions in @hetfx is $0.000$. Thus, we can reject the null hypothesis of homogeneous dynamic treatment effects in @hetfx but not in @homfx, which is consistent with the underlying DGP. In the next section, we show through simulation studies that this test has good power to detect across-cohort heterogeneity in dynamic treatment effects in a variety of DGPs.
147 | 
148 | = Simulation Studies
149 | 
150 | == Testing for event study dynamics
151 | 
152 | To begin, we perform simulation studies to examine the properties of the testing procedure described in @test_dyn. We consider the simple setting with a single adoption cohort where the treatment effects follow one of the following seven DGPs visualised in @static_dyn.
153 | 
154 | #figure(
155 |   image("../figtab/static_dynamic_effects.png", width: 105%),
156 |   caption: [
157 |     true treatment effect functions and estimates from difference in means, static, and dynamic two-way fixed effects regressions. The treatment effect is truly stationary in the first DGP and varies over time in the others.
158 |   ],
159 | ) <static_dyn>
160 | 
161 | The first DGP has constant effects over time, while the others have varying degrees of temporal heterogeneity. We simulate 1000 replications of the data for each DGP, and compute the rejection rate of the joint test for dynamic treatment effects outlined in the previous section. We report the rejection rate and p-value distribution in @rejrates_dyn. We find that the rejection rate for the constant DGP (null) is under the nominal level of $alpha = 0.05$, while the rejection rates for the other DGPs are considerably higher. The rejection rate for concave effects is considerably lower, although this is likely due to the fact that the treatment effects do actually tail off in later time periods and the static effect captures this well.
162 | 
163 | #figure(
164 |   image("../figtab/rejection_rates_dyn.png", width: 105%),
165 |   caption: [
166 |     Rejection rates over 1000 replications for the joint test of dynamic treatment effects using an F-test in DGPs from @static_dyn
167 |   ],
168 | ) <rejrates_dyn>
169 | 
170 | 
171 | == Testing for across-cohort heterogeneity in dynamic treatment effects
172 | 
173 | Next, we perform simulation studies to examine the properties of the testing procedure described in @test_het. Here, we consider seven different DGPs with homogeneous and heterogeneous treatment effect functions across cohorts as illustrated in @truefns. In addition to the two DGPs described in the previous section, we consider DGPs with heterogeneity that applies a scalar multiplier to the concave (log) effect function in @homfx with 'small' and 'large' differences; a DGP with 'selection on gains' where the cohort with the largest treatment effect adopts first; a DGP with 'novelty effects' where the treatment effect is large for the first few periods and then diminishes; and finally a DGP with 'activity bias' where the treatment effect is immediate and large for the earliest adopting cohort and much more gradual for the others. Among all these DGPs, the homogeneous and novelty effects DGPs have homogeneous treatment effects across cohorts, while all others have heterogeneous treatment effects across cohorts.
174 | 
175 | For each DGP, we simulate 1000 replications of the data, and compute the rejection rate of the joint test for cohort-level coefficients outlined in the previous section. We report the rejection rate and p-value distribution in @rejrates. We find that the rejection rate for the homogeneous DGP (null) is under the nominal level of $alpha = 0.05$, while the rejection rates for heterogeneous DGPs are close to 1. This suggests that the test has good power to detect across-cohort heterogeneity in dynamic treatment effects.
176 | 
177 | #figure(
178 |   image("../figtab/true_functions.png", width: 80%),
179 |   caption: [
180 |     true cohort level effect functions for homogeneous and heterogeneous treatment effects across three cohorts. Earliest-treated cohort is in purple, middle cohort in green, and latest cohort in yellow.
181 |     'Homogenous' and 'novelty effects' DGPs have homogeneous treatment effects across cohorts, while all others have heterogeneous treatment effects across cohorts.
182 |   ],
183 | ) <truefns>
184 | 
185 | 
186 | 
187 | #figure(
188 |   image("../figtab/rejection_rates_F.png", width: 100%),
189 |   caption: [
190 |     Rejection rates over 1000 replications for the joint test of cohort-level coefficients using an F-test in DGPs from @truefns
191 |   ],
192 | ) <rejrates>
193 | 
194 | 
195 | =  Conclusion
196 | 
197 | The two-way fixed effects regression remains a workhorse tool in applied economics despite recent critiques highlighting its potential shortcomings under treatment effect heterogeneity. This paper provides simple diagnostic tests that help researchers determine when TWFE is likely to yield reliable estimates versus when more complex estimators are needed. Our simulation evidence shows these tests have good power to detect problematic patterns of effect heterogeneity while maintaining correct size under the null of homogeneous effects.
198 | 
199 | The tests we propose are computationally simple, implemented in the pyfixest library, and readily implementable in standard statistical software. Since heterogeneity-robust estimators often come with higher variance and computational complexity, the ability to test when they are truly needed helps researchers make principled choices about their estimation strategy. While these tests cannot guarantee TWFE will recover meaningful treatment effects, they provide a practical tool for detecting scenarios where the recent critiques of TWFE are most relevant.
200 | 


--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # Joint-tests for treatment-effect heterogeneity in panel data
2 | 
3 | Or, do you need to run 5 different heterogeneity-robust estimators in your event study?
4 | 
5 | Paper draft + replication code 
6 | - first paper written entirely in typst. Excellent experience; strongly recommend. 
7 | 
8 | 


--------------------------------------------------------------------------------