├── FDIDP99Update.png
├── FDID_HK.png
├── FDID_HK_CIsBar.png
├── FDID_SJ_Rep.do
├── HubeiPlot.png
├── HubeiPlot2.png
├── HubeiWidth.png
├── HubeivsAverage.png
├── README.md
├── StataVignette.md
├── Vignette.md
├── barcelona.dta
├── basque.dta
├── fdid.ado
├── fdid.pkg
├── fdid.sthlp
├── fdid_test.do
├── fdidevent.do
├── fitCali.png
├── fithongkong.png
├── hcw-data.txt
├── hcw.dta
├── kh-data.txt
├── smoking.dta
├── stata.toc
└── turnout.dta
/FDIDP99Update.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jgreathouse9/FDIDTutorial/a398e2d7be812bca54bf3163329e10072834b7a7/FDIDP99Update.png
--------------------------------------------------------------------------------
/FDID_HK.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jgreathouse9/FDIDTutorial/a398e2d7be812bca54bf3163329e10072834b7a7/FDID_HK.png
--------------------------------------------------------------------------------
/FDID_HK_CIsBar.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jgreathouse9/FDIDTutorial/a398e2d7be812bca54bf3163329e10072834b7a7/FDID_HK_CIsBar.png
--------------------------------------------------------------------------------
/FDID_SJ_Rep.do:
--------------------------------------------------------------------------------
1 | cap log close
2 | log using fdidlog.log, replace
3 |
4 | clear *
5 |
6 | * Users need sdid_event installed: https://github.com/Daniel-Pailanir/sdid/tree/main/sdid_event
7 |
8 | net from "https://raw.githubusercontent.com/jgreathouse9/FDIDTutorial/main"
9 |
10 | net install fdid, replace
11 |
12 | net get fdid, replace
13 |
14 | u smoking, clear
15 |
16 | fdid cigsale, treated(treat) unitnames(state)
17 |
18 | mkf newframe
19 |
20 | cwf newframe
21 |
22 | svmat e(series), names(col)
23 |
24 | cls
25 | twoway (connected cigsale3 year, mcolor(black) msize(small) msymbol(smcircle) lcolor(black) lwidth(medthick)) ///
26 | (connected cfdd3 year, mcolor(gs11) msize(small) msymbol(smsquare) lcolor(gs11) lpattern(solid) lwidth(thin)) ///
27 | (connected ymeandid3 year, mcolor(gs11) msize(small) msymbol(smtriangle) lcolor(gs11) lwidth(thin)), ///
28 | ylabel(#10, grid glwidth(vthin) glcolor(gs8%20) glpattern(dash)) ///
29 | xline(1989, lwidth(medium) lpattern(solid) lcolor(black)) ///
30 | xlabel(#10, grid glwidth(vthin) glcolor(gs8%20) glpattern(dash)) ///
31 | legend(cols(1) ///
32 | position(9) ///
33 | order(1 "California" 2 "DID Prediction" 3 "DID y{subscript:N{subscript:co}} Mean") ///
34 | region(fcolor(none) lcolor(none)) ring(0)) ///
35 | scheme(sj) ///
36 | graphregion(fcolor(white) lcolor(white) ifcolor(white) ilcolor(white)) ///
37 | plotregion(fcolor(white) lcolor(white) ifcolor(white) ilcolor(white)) ///
38 | name(did, replace) yti(Cigarette Sales) ti("All Controls")
39 |
40 | twoway (connected cigsale3 year, mcolor(black) msize(small) msymbol(smcircle) lcolor(black) lwidth(medthick)) ///
41 | (connected cf3 year, mcolor(gs11) msize(small) msymbol(smsquare) lcolor(gs11) lpattern(solid) lwidth(thin)) ///
42 | (connected ymeanfdid year, mcolor(gs11) msize(small) msymbol(smtriangle) lcolor(gs11) lwidth(thin)), ///
43 | ylabel(#10, grid glwidth(vthin) glcolor(gs8%20) glpattern(dash)) ///
44 | xline(1989, lwidth(medium) lpattern(solid) lcolor(black)) ///
45 | xlabel(#10, grid glwidth(vthin) glcolor(gs8%20) glpattern(dash)) ///
46 | legend(cols(1) ///
47 | position(9) ///
48 | order(1 "California" 2 "FDID Prediction" 3 "FDID y{subscript:N{subscript:co}} Mean") ///
49 | region(fcolor(none) lcolor(none)) ring(0)) ///
50 | scheme(sj) ///
51 | graphregion(fcolor(white) lcolor(white) ifcolor(white) ilcolor(white)) ///
52 | plotregion(fcolor(white) lcolor(white) ifcolor(white) ilcolor(white)) name(fdid, replace) ti("FDID Controls")
53 |
54 | graph combine did fdid, ///
55 | xsize(9) ///
56 | ysize(4.5)
57 |
58 | graph export "FDIDP99.png", as(png) name("Graph") replace
59 |
60 | qui log close
61 |
62 |
--------------------------------------------------------------------------------
/HubeiPlot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jgreathouse9/FDIDTutorial/a398e2d7be812bca54bf3163329e10072834b7a7/HubeiPlot.png
--------------------------------------------------------------------------------
/HubeiPlot2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jgreathouse9/FDIDTutorial/a398e2d7be812bca54bf3163329e10072834b7a7/HubeiPlot2.png
--------------------------------------------------------------------------------
/HubeiWidth.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jgreathouse9/FDIDTutorial/a398e2d7be812bca54bf3163329e10072834b7a7/HubeiWidth.png
--------------------------------------------------------------------------------
/HubeivsAverage.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jgreathouse9/FDIDTutorial/a398e2d7be812bca54bf3163329e10072834b7a7/HubeivsAverage.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Forward Difference-in-Differences
2 |
3 | This repository contains the Python and Stata code for the Forward Difference-in-Differences estimator. The Python installation is under construction, but Stata users may install the package with:
4 | ```
5 | net install fdid, from("https://raw.githubusercontent.com/jgreathouse9/FDIDTutorial/main") replace
6 | ```
7 | Ancillary files with the Basque and HCW data may be fetched with ```net get fdid```. The vignette for Stata is [here](https://github.com/jgreathouse9/FDIDTutorial/blob/main/StataVignette.md).
8 |
9 | - As of 8/11/2024, the Stata version reports cohort ATTs; however, this feature is still under construction. So, while users may do
10 |
11 | ```stata
12 | net from "https://raw.githubusercontent.com/jgreathouse9/FDIDTutorial/main"
13 | net install fdid, replace
14 | net get fdid, replace
15 | // Reinstall fdid so you have the most recent version
16 | clear *
17 | qui u basque
18 | // Import basque
19 | replace treat = 1 if id==12 & year >=1976
20 | // Pretend Extremadura (a unit that'll never be a donor) was treated
21 | cls
22 | // Estimate FDID
23 | fdid gdpcap, tr(treat)
24 | mat l e(results)
25 | ```
26 |
27 | to obtain the basic results, I am still developing event-study estimates for this setting. Discussion of staggered adoption is therefore omitted from the Stata vignette (I will do the same for the Python version after I send the Stata version to the Stata Journal).
28 |
29 | ## Troubleshooting Forward DID
30 |
31 | For anyone who wishes to open an issue about trouble with the Stata code, please provide a minimal worked example of what you did. For example,
32 |
33 | ```stata
34 | clear *
35 | cls
36 | u "hcw.dta", clear
37 |
38 |
39 | fdid gdp, tr(treatt) unitnames(state) gr2opts(scheme(sj) name(hcwte))
40 | ```
41 | is a minimal worked example because it shows me, from start to finish, how the data were imported, and it lets me see what the error is (here, that the treatment variable was misspelled).
42 |
43 | Screenshots of your Stata terminal, or anything else that does not let me reproduce the problem, are not helpful for debugging; including an example that reproduces the problem is the best way to raise an issue.
44 |
--------------------------------------------------------------------------------
/StataVignette.md:
--------------------------------------------------------------------------------
1 | # Intro to ```fdid```
2 |
3 | Here, I cover the forward-selection difference-in-differences method for Stata. I already do the equivalent in [the Python vignette](https://github.com/jgreathouse9/FDIDTutorial/blob/main/Vignette.md), so I only briefly restate the algorithm and the basic ideas; for the more technical treatment, see [my paper](https://jgreathouse9.github.io/publications/FDIDSJ.pdf). This vignette demonstrates how to use FDID in Stata 16 and up. Users need ```sdid_event``` to be [installed](https://github.com/DiegoCiccia/sdid/tree/main/sdid_event#github).
4 |
5 | ```stata
6 | net install sdid_event, from("https://raw.githubusercontent.com/DiegoCiccia/sdid/main/sdid_event") replace
7 | ```
8 |
9 | We install ```fdid``` and its help file into Stata like
10 |
11 | ```stata
12 | net inst fdid, from("https://raw.githubusercontent.com/jgreathouse9/FDIDTutorial/main") replace
13 | ```
14 | We can get the datasets I include like
15 | ```stata
16 | net get fdid, all
17 | ```
18 |
19 | # The Model
20 |
21 | ## Setup
22 |
23 | We observe $\mathcal{N} = \{1, 2, \ldots, N\}$ units, where $\mathcal N$ has cardinality $N = |\mathcal{N}|$. Unit $j=1$ is treated and the controls are $\mathcal{N}\_0 = \mathcal{N} \setminus \{1\}$. Time is indexed by $t$. Denote the pre- and post-policy periods as $\mathcal{T}\_1 = \{1, 2, \ldots, T\_0\}$ and $\mathcal{T}\_2 = \{T\_0+1, \ldots, T\}$, where $\mathcal{T}= \mathcal{T}\_1 \cup \mathcal{T}\_2$, with $T\_1 = |\mathcal{T}\_1|$ and $T\_2 = |\mathcal{T}\_2|$. The subset of controls we wish to select is $\widehat{U} \subset \mathcal{N}\_0$. DiD is estimated like $y\_{1t}=\hat\alpha\_{\mathcal{N}\_0}+ \bar{y}\_{\mathcal{N}\_0t}, \: t \in \mathcal{T}\_1$, where $\bar{y}\_{\mathcal{N}\_0t}\coloneqq \frac{1}{N\_0} \sum\_{j \in \mathcal{N}\_0} y\_{jt}$. The estimated least-squares intercept is computed like $\hat\alpha\_{\mathcal{N}\_0} \coloneqq T\_{1}^{-1}\sum\_{t \in \mathcal{T}\_{1}}\left(y\_{1t}-\bar{y}\_{\mathcal{N}\_0t}\right)$.
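For concreteness, here is a minimal Stata sketch of that intercept computation on a long panel. The variable names (```y```, ```t```, ```id```, with the treated unit as ```id == 1```) and the scalar ```T0``` are illustrative assumptions, not part of ```fdid``` itself.

```stata
* A minimal sketch of the DiD intercept, under assumed variable names:
* outcome y, time t, unit id (treated unit is id == 1), and scalar T0
* marking the last pre-treatment period (44 in the HCW example below).
scalar T0 = 44
bysort t: egen double ybar = mean(cond(id != 1, y, .)) // control mean at each t
gen double gap = y - ybar if id == 1                   // treated-minus-mean gap
quietly summarize gap if t <= T0
scalar alpha_hat = r(mean)                             // the DiD intercept
```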
24 |
25 | ## The Algorithm
26 |
27 | Basically, ```fdid``` uses a forward-selection algorithm to choose the optimal control group for a single treated unit. At each iteration, every not-yet-selected control unit is added, one at a time, to the already-selected set, and a bivariate DiD model is fit; we keep whichever addition yields the highest pre-treatment R-squared and carry the enlarged set into the next iteration. Below I paraphrase from my paper (a minimal Stata sketch of the first selection step follows the example):
28 |
29 | > Let $\mathcal{N}\_0 = \{i\_1 \text{ (Chicago)}, i\_2 \text{ (Miami)}, i\_3 \text{ (Phoenix)}\}$ be the controls for a generic treated unit. For $k=1$, we estimate DiD for each control unit in $\mathcal{N}\_0$ individually, yielding pre-treatment $R^2\_{ik}$ values: $R^2\_{1,1} = 0.60$, $R^2\_{2,1} = 0.50$, and $R^2\_{3,1} = 0.23$. Since $R^2\_{1,1} = 0.60$ is the highest, we update the control set to $\widehat{U}\_1 = \{i\_1\}$ and $R\_k^{2}=0.60$. This is the first candidate model. For $k=2$, we estimate two DiD models using $i\_1$ with the remaining controls from $\{i\_2, i\_3\}$, yielding $R^2\_{2,2} = 0.88$ and $R^2\_{3,2} = 0.68$. We select $i\_2$ (Miami) and update the control set to $\widehat{U}\_2 = \{i\_1, i\_2\}$ since $R^2\_{2,2} = 0.88$ is the highest. This is the second candidate model. For $k=3$, using all controls, we get $R^2\_{3,3} = 0.55$. This is the third candidate model. The final control set is $\widehat{U}\_2 = \{i\_1, i\_2\}$, since $\text{max}\_k R^2\_k = 0.88$ makes it the candidate model with the highest R-squared.
30 |
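As promised, here is a minimal Stata sketch of the first ($k=1$) selection step. It assumes a wide dataset over the pre-treatment periods with the treated outcome in ```y1``` and candidate controls in ```y2-y25``` (all names illustrative); later steps would average the already-selected columns with each remaining candidate before computing the fit.

```stata
* Sketch of the k = 1 sweep: for each candidate control, compute the
* R-squared of the slope-one DiD fit and keep the best one.
local best_r2 = -1e10
quietly summarize y1
local sst = r(Var) * (r(N) - 1)             // total sum of squares
foreach v of varlist y2-y25 {
    quietly gen double gap = y1 - `v'
    quietly summarize gap
    quietly gen double fit = r(mean) + `v'  // intercept + control, slope 1
    quietly gen double sq = (y1 - fit)^2
    quietly summarize sq
    local r2 = 1 - r(sum)/`sst'
    if `r2' > `best_r2' {
        local best_r2 = `r2'
        local best_v "`v'"
    }
    drop gap fit sq
}
display "First selected control: `best_v' (R2 = `best_r2')"
```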
31 |
32 | We estimate FDID like
33 |
34 | ```math
35 | \begin{equation}
36 | y_{1t}=\hat\alpha_{\widehat{U}}+ \bar{y}_{\widehat{U}t} \quad t \in \mathcal{T}_1
37 | \end{equation}
38 | ```
39 |
40 | where we now exchange $\mathcal{N}\_0$ for $\widehat{U}$. Denote the FDID predictions as $\hat{y}\_{1t}^{0}=\hat\alpha\_{\widehat{U}}+ \bar{y}\_{\widehat{U}t}$, where the pre-treatment periods correspond to the in-sample fit and the post-treatment periods to the out-of-sample counterfactual. Our causal estimand is the average treatment effect on the treated, $\widehat{ATT}\_{\widehat{U}} = \frac{1}{T\_2} \sum\_{t \in \mathcal{T}\_2} \left(y\_{1t} - \hat{y}\_{1t}^0\right)$. A sketch of recovering this quantity by hand follows.
41 |
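After estimation, the ATT can be recomputed by hand from the returned series. A sketch, using the ```e(series)``` column names from the Proposition 99 example later in this vignette (observed ```cigsale3```, counterfactual ```cf3```, treatment beginning in 1989):

```stata
* Sketch: recover the ATT from e(series) after running fdid on the
* Proposition 99 data shown below; column names follow that example.
svmat e(series), names(col)
gen te = cigsale3 - cf3        // per-period treatment effect
summarize te if year >= 1989   // the post-period mean is the ATT
```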
42 | # HCW
43 |
44 | We can use [the HCW dataset](https://doi.org/10.1002/jae.1230) to demonstrate ```fdid```. We begin by importing the data
45 |
46 | ```stata
47 | u "hcw.dta", clear
48 | ```
49 |
50 | Here, we study the impact of Hong Kong's [economic integration](https://www.henleyglobal.com/residence-investment/hong-kong/cepa-hong-kong-china) with Mainland China. First, we can do ```xtdescribe```, which produces the output
51 |
52 | ```stata
53 |
54 | id: 1, 2, ..., 25 n = 25
55 | time: 1, 2, ..., 61 T = 61
56 | Delta(time) = 1 unit
57 | Span(time) = 61 periods
58 | (id*time uniquely identifies each observation)
59 |
60 | Distribution of T_i: min 5% 25% 50% 75% 95% max
61 | 61 61 61 61 61 61 61
62 |
63 | Freq. Percent Cum. | Pattern
64 | ---------------------------+---------------------------------------------------------------
65 | 25 100.00 100.00 | 1111111111111111111111111111111111111111111111111111111111111
66 | ---------------------------+---------------------------------------------------------------
67 | 25 100.00 | XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
68 | ```
69 |
70 | We see that $T=61$. When we do ```list if treat==1```, we get the output
71 |
72 | ```stata
73 | time state gdp id treat polint
74 | 45 hongkong .077 9 1 1
75 | 46 hongkong .12 9 1 1
76 | 47 hongkong .066 9 1 1
77 | 48 hongkong .079 9 1 1
78 | 49 hongkong .062 9 1 1
79 | 50 hongkong .071 9 1 1
80 | 51 hongkong .081 9 1 1
81 | 52 hongkong .069 9 1 1
82 | 53 hongkong .09 9 1 1
83 | 54 hongkong .062 9 1 1
84 | 55 hongkong .064 9 1 1
85 | 56 hongkong .066 9 1 1
86 | 57 hongkong .055 9 1 1
87 | 58 hongkong .062 9 1 1
88 | 59 hongkong .068 9 1 1
89 | 60 hongkong .069 9 1 1
90 | 61 hongkong .073 9 1 1
91 | ```
92 |
93 | We have 44 pretreatment periods and 17 post-treatment periods. Our goal is to estimate the impact for those final 17 periods. From the ```xtdescribe``` output, we see that we have 24 control units. To estimate ```fdid```, we simply do
94 | ```stata
95 | fdid gdp, tr(treat) unitnames(state) gr1opts(title(FDID Results))
96 |
97 | ```
98 | We specify the outcome of interest as ```gdp``` and the treatment indicator as ```treat```. We use the ```state``` variable, since its values are strings, to supply the names of our units. This syntax produces the table
99 | ```stata
100 | Forward Difference-in-Differences | T0 R2: 0.843 T0 RMSE: 0.016
101 | -----------------------------------------------------------------------------
102 | gdp | ATT Std. Err. t P>|t| [95% Conf. Interval]
103 | -------------+---------------------------------------------------------------
104 | treat | 0.02540 0.00462 5.49 0.000 0.01634 0.03447
105 | -----------------------------------------------------------------------------
106 | Treated Unit: hongkong
107 | FDID selects philippines, singapore, thailand, norway, mexico, korea, indonesia, newzealand, malaysia, as the optimal donors.
108 | See Li (2024) for technical details.
109 | ```
110 |
111 | Pleasingly, these are the exact same results Kathleen Li gets in her MATLAB code. Here is the plot:
112 |
113 | ![FDID results for Hong Kong](FDID_HK.png)
114 |
115 |
116 | If we wish to see the returned results, we can do
117 | ```stata
118 |
119 | ereturn list
120 |
121 | macros:
122 | e(U) : "philippines, singapore, thailand, norway, mexico, korea, indonesia, newzealand, malaysia,"
123 | e(properties) : "b V"
124 | e(depvar) : "gdp"
125 |
126 | matrices:
127 | e(b) : 1 x 1
128 | e(V) : 1 x 1
129 | e(series) : 61 x 9
130 | e(setting) : 1 x 6
131 | e(results) : 2 x 7
132 | e(dyneff) : 61 x 6
133 |
134 | ```
135 | The ```e(series)``` matrix contains the observed and counterfactual values, event time, and individual treatment effects. Naturally, the other returned results pertain to the total number of controls, the number of controls selected, and inferential statistics.
136 |
137 | ```stata
138 |
139 | mat l e(results)
140 |
141 | e(results)[2,7]
142 | ATT PATT SE t LB UB R2
143 | FDID .02540494 53.843074 .00462405 5.4940862 .01634196 .03446791 .8427835
144 | DID .03172116 77.6203 .00298081 10.641796 .02556907 .03787324 .5046
145 | ```
146 | Here, DID uses the robust standard error as estimated by ```xtdidregress```. We can clearly see that the pre-intervention $R^2$ for FDID's selected control group is much higher than that of DID, suggesting that the parallel trends assumption is far more plausible for Forward DID than for DID, which uses all controls.
147 |
148 | # Proposition 99
149 |
150 | Next, I'd like to replicate one of the more classic papers in synthetic control methods, the case of Proposition 99 for California. Prop 99 was an anti-tobacco campaign that sought to reduce the rate of smoking in the population via education, awareness, and taxation. To run this, we do
151 |
152 | ```stata
153 |
154 | clear *
155 | cls
156 | u "smoking.dta", clear
157 | ```
158 | When we do ```xtdescribe```, we get
159 |
160 | ```stata
161 | id: 1, 2, ..., 39 n = 39
162 | year: 1970, 1971, ..., 2000 T = 31
163 | Delta(year) = 1 year
164 | Span(year) = 31 periods
165 | (id*year uniquely identifies each observation)
166 |
167 | Distribution of T_i: min 5% 25% 50% 75% 95% max
168 | 31 31 31 31 31 31 31
169 |
170 | Freq. Percent Cum. | Pattern
171 | ---------------------------+---------------------------------
172 | 39 100.00 100.00 | 1111111111111111111111111111111
173 | ---------------------------+---------------------------------
174 | 39 100.00 | XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
175 |
176 | ```
177 |
178 | We can see we have 38 controls. ```list if treat==1``` returns the output
179 |
180 | ```stata
181 | state year cigsale lnincome beer age15~24 retprice treated id
182 | California 1989 82.4 10.14231 23.7 .1535246 126.4 1 3
183 | California 1990 77.8 10.14162 23.8 .149523 163.8 1 3
184 | California 1991 68.7 10.11071 22.3 . 186.8 1 3
185 | California 1992 67.5 10.11494 21.3 . 201.9 1 3
186 | California 1993 63.4 10.0985 20.8 . 205.1 1 3
187 | California 1994 58.6 10.09951 20.1 . 190.3 1 3
188 | California 1995 56.4 10.15592 19.7 . 195.1 1 3
189 | California 1996 54.5 10.17864 19.1 . 197.9 1 3
190 | California 1997 53.8 10.17519 19.5 . 200.3 1 3
191 | California 1998 52.3 . . . 207.8 1 3
192 | California 1999 47.2 . . . 224.9 1 3
193 | California 2000 41.6 . . . 351.2 1 3
194 |
195 | ```
196 | We can see that treatment begins in 1989, continuing until the end of the study period. We estimate the effect like
197 |
198 | ```stata
199 | fdid cigsale, tr(treated) unitnames(state)
200 | ```
201 | which returns the table
202 | ```stata
203 | Forward Difference-in-Differences T0 R2: 0.988 T0 RMSE: 1.282
204 |
205 | -----------------------------------------------------------------------------------------
206 | cigsale | ATT Std. Err. t P>|t| [95% Conf. Interval]
207 | -------------+---------------------------------------------------------------------------
208 | treated | -13.64671 0.46016 29.66 0.000 -14.54861 -12.74481
209 | -----------------------------------------------------------------------------------------
210 | Treated Unit: California
211 | FDID selects Montana, Colorado, Nevada, Connecticut, as the optimal donors.
212 | See Li (2024) for technical details.
213 | ```
214 |
215 | With these results, we may produce the plot
216 |
217 | ```stata
218 |
219 | svmat e(series), names(col)
220 |
221 | tsset year
222 |
223 | lab var cigsale3 "California"
224 |
225 | lab var cf3 "FDID"
226 | lab var cfdd3 "DID"
227 |
228 | lab var ymeandid "DID Control Mean"
229 | lab var ymeanfdid "FDID Control Mean"
230 | lab var year "Year"
231 | twoway (tsline cigsale3) ///
232 | (tsline cfdd3, lcolor(black) lwidth(thick) lpattern(dash)) ///
233 | (tsline ymeandid, lcolor(black) lwidth(thick) lpattern(solid)), ///
234 | scheme(sj) name(did, replace) ///
235 | yti(Cigarette Consumption per Capita) tli(1989) legend(ring(0) pos(7) col(1) size(large)) ///
236 | ti(Uses all controls)
237 |
238 | twoway (tsline cigsale3) ///
239 | (tsline cf3,lcolor(gs6) lwidth(thick) lpattern(longdash)) ///
240 | (tsline ymeanfdid, lcolor(gs6) lwidth(thick) lpattern(solid)), ///
241 | scheme(sj) name(fdid, replace) tli(1989) legend(ring(0) pos(7) col(1) size(large)) ///
242 | ti(Uses 4 controls)
243 |
244 |
245 | graph combine did fdid, xsize(8)
246 |
247 | ```
248 |
249 | ![Proposition 99: DID (all controls) versus FDID (4 controls)](FDIDP99Update.png)
250 |
251 |
252 |
253 | The R-squared of DID here is 0.604, versus FDID's R-squared of 0.988. This naturally has real implications for the analysis's findings. Because the fit for DID in the pre-intervention period is so poor (a result of trends that are not parallel across all control units), the DID method badly overestimates the magnitude of the causal effect, returning an ATT of -27.349.
254 |
255 | FDID's control group is much more parallel to California's pre-treatment trend. Its pre-intervention R-squared is higher than DID's, meaning that the parallel trends assumption is much more likely to hold for FDID than for DID in this instance. The effect sizes also differ, with FDID returning an ATT of -13.647.
256 |
257 | To put this another way, the ATT of FDID is roughly half the DID estimate, owing to the parallel trends bias of the original DID method; this is a colossal reduction in the effect. Also of interest is that FDID selects 4 control states, which happen to be the exact same states the original synthetic control method selected. On top of this, FDID gets these results without needing the retail price of cigarettes, age, income, taxation, or outcome lags to attain what is essentially the same result as other synthetic control methods ([which tend to vary](https://rpubs.com/dwrich27/941298) between -13 and -19, depending on which [flavor](https://doi.org/10.48550/arXiv.2203.11576) of SCM we use).
258 |
259 | Of course, FDID assumes that a uniformly weighted average is the ideal way to model the counterfactual, but the point here is that a relatively simple estimator recovers results very similar to the original model's. An added benefit of DID is that inference is more straightforward than for synthetic controls. In the staggered adoption case, we simply estimate one ATT per treated unit (using the never-treated units as controls) and average the effect sizes together. Okay, so that's it for the vignette. No doubt people will have questions, suggestions, ideas, concerns, or errors to report, so you may contact me as usual.
260 |
261 | # Contact
262 | - Jared Greathouse: (see [my website](https://jgreathouse9.github.io/))
263 |
--------------------------------------------------------------------------------
/Vignette.md:
--------------------------------------------------------------------------------
1 | A Tutorial on Forward and Augmented Difference-in-Differences
2 | ==============
3 |
4 | ***Revisiting Hong Kong's Economic Integration and Hubei's Lockdown***
5 |
6 | **Author:** *Jared Greathouse*
7 |
8 | > **Note**
9 | >
10 | > This is an ongoing project; any feedback or comments are most welcome!
11 |
12 | # Introduction
13 | This tutorial uses publicly available data to demonstrate the utility of the [Forward](https://doi.org/10.1287/mksc.2022.0212) and [Augmented](https://doi.org/10.1287/mksc.2022.1406) Difference-in-Differences estimators. It is based on MATLAB code very kindly provided by [Kathleen Li](https://sites.utexas.edu/kathleenli/).
14 |
15 | We estimate the counterfactual for two empirical examples. First is GDP growth for Hong Kong had its economy never integrated with Mainland China, [revisiting](https://doi.org/10.1002/jae.1230) the classic panel-data-approach study. Then we revisit [a more recent study](https://doi.org/10.1002/jae.2871), estimating quarterly GDP for Hubei had its economy never locked down in 2020 to prevent the spread of COVID-19. This tutorial consists of two parts: first I go over the anatomy of the class itself, detailing its helper functions and explaining what each sub-method does; then I discuss how to estimate the model. First, however, some preliminaries:
16 | ## Prerequisite Libraries
17 | ```python
18 | import numpy as np
19 | import pandas as pd
20 | from scipy.stats import norm
21 | import cvxpy as cp
22 | import matplotlib.pyplot as plt
23 | import matplotlib
24 | ```
25 | Strictly speaking, you don't need to import ```matplotlib```; I only do so because I am customizing my own graphics.
26 | ## Model Primitives
27 | Here, we have $\mathcal{N} \coloneqq \lbrace 0, \ldots, N \rbrace$ units across $t \in \left[1, T\right] \cap \mathbb{N}$ time periods, where $j=0$ is our sole treated unit. This leaves us with $\mathcal{N}\_{0} \coloneqq \lbrace 1, \ldots, N \rbrace$ control units. We have two sets of time periods, $\mathcal{T}\coloneqq \mathcal{T}\_{0} \cup \mathcal{T}\_{1}$, where $\mathcal{T}\_{0}\coloneqq \lbrace 1, \ldots, T_0 \rbrace$ is the pre-intervention period and $\mathcal{T}\_{1}\coloneqq \lbrace T_0+1, \ldots, T \rbrace$ denotes the post-intervention period. We observe
28 | ```math
29 | \begin{equation*}
30 | y_{jt} =
31 | \begin{cases}
32 | y^{0}_{jt} & \forall \: j\in \mathcal{N}_0\\
33 | y^{0}_{0t} & \text{if } j = 0 \text{ and } t \in \mathcal{T}_0 \\
34 | y^{1}_{0t} & \text{if } j = 0 \text{ and } t \in \mathcal{T}_1
35 | \end{cases}
36 |
37 | \end{equation*}
38 | ```
39 | where $y_{jt}^1$ and $y_{jt}^0$ are, respectively, the outcomes we observe under treatment and under control. The braces above say that we observe all of our control units untreated at all points in time, and that we observe the treated unit's outcome as treated or untreated depending on the period. The basic problem of causal inference is that we cannot see how Hong Kong's or Hubei's GDP would have evolved in the post-intervention period absent the respective interventions. The reason, naturally, is that we only ever observe $y_{jt} = d_{jt} y_{jt}^1 + (1 - d_{jt}) y_{jt}^0$, where $d_{jt} \in \lbrace 0,1 \rbrace$ is a dummy variable indicating treatment or control status. Thus, the counterfactual outcome is something that we must estimate.
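As a toy illustration of that switching equation, with made-up numbers for a single unit over four periods:

```python
# Toy illustration of the switching equation (made-up numbers).
import numpy as np

y0 = np.array([1.0, 1.1, 1.2, 1.3])  # potential outcomes under control
y1 = y0 + 0.5                        # potential outcomes under treatment
d = np.array([0, 0, 1, 1])           # treatment switches on in period 3
y = d * y1 + (1 - d) * y0            # all we ever observe
print(y)                             # [1.  1.1 1.7 1.8]
```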
40 | ## Parallel Trends
41 | Two (**very wrong**) ways we could do this are
42 | - Subtracting the average of the pre-intervention GDPs for the treated unit from their own post-intervention GDPs, or
43 | - Subtracting the average of the treated units from the post-intervention average of the control units.
44 |
45 | The first approach assumes that nothing other than the intervention affected GDP between the pre- and post-periods. The latter presumes that there are no differences between the treated unit and the control units (or at least none large enough to matter). DID, however, posits a different data generating process, $y\_{jt}=b\_{j} + (1+ \lambda\_{j})f\_t + \epsilon\_{jt}$: our observed outcomes are generated by a one-dimensional factor model with time-varying factors $f\_t$, a unit constant $b\_j$, and an error term. This is a slightly more realistic data generating process. The validity of any DID design depends on the validity of the parallel trends assumption (PTA), $\mathbb{E}\[y_{jt} | d = 1, t \in \mathcal{T}\_{1}\] - \mathbb{E}\[y_{jt} | d = 0, t \in \mathcal{T}\_{1}\] = \delta_0 + \delta_1$. This is simply a restatement of the standard two-way fixed effects model, $y^{\text{DID}}\_{0t}-\bar{y}\_{\mathcal{N}\_0}=\alpha\_{j} + v\_{t}$, where $v\_{t}$ is a zero-mean, finite-variance stationary process. In other words, the PTA posits that, absent the intervention, the post-intervention trend of our treated unit would be the average of the control group plus some intercept. Practically speaking, this has an important implication: all of our control units must be similar enough to our treated unit in the pre-intervention period. Because it is an assumption, we cannot test it perfectly. However, we can test part of it, namely by inspecting how the average of controls compares to the treated unit over the pre-intervention period. So, let's do it for Hubei.
46 |
47 | ![Hubei versus the average of controls](HubeivsAverage.png)
48 |
49 | Let's begin with the graph on the left. Hubei and the average of controls appear pretty similar for the first few years (measured at the quarterly level here). Hubei's GDP is consistently higher than the average, but the two otherwise move in a fairly similar manner. This is fine: so long as the difference is parallel as defined above, the intercept of DID approximates the pre-intervention values of the treated unit. Between 2016 and 2017 especially, however, Hubei's GDP appears to grow at a faster rate than the average of controls. This suggests that other important factors are affecting Hubei's trend relative to the simple average of controls; in other words, for the final few pre-intervention periods, the difference is not mean zero. On its face, then, we should be quite skeptical of the classic parallel trends assumption. "Why," we might ask, "would all of our control units jointly form a suitable counterfactual for the treated unit?" Another way of thinking about this is as a weighting estimator: much as in the synthetic control literature, where units are assigned weights that lie between 0 and 1 and must sum to 1, DID does the same thing, except it assigns every control unit the identical weight. We should ask whether this makes sense. After all, we know that substantially large values will affect an average. We would not compare the homicide trends of Newnan, Georgia to those of New Orleans, Louisiana, because Newnan is a completely different area with different demographics and other important factors that might affect homicide or crime trends. But suppose we add in Miami or Atlanta: these two units, on their face at least, are likely much more comparable to New Orleans than Newnan is. Similar in urbanicity and likely other background factors, they would serve as better untreated comparisons for New Orleans. Under DID, all three units (Newnan, Miami, Atlanta) would receive the same weight.
50 |
51 | Now let's look at the graph on the right. If we agree that using all units as the counterfactual may not satisfy the parallel trends assumption, then perhaps some average of _certain_ units will. This suggests that we need principled methods for choosing control units. But how? Students of econometrics may answer that the natural regression-based response is to control for confounding variables related to the intervention and the outcome; however, obtaining a satisfactory list of predictors is sometimes impossible or financially expensive. If we only have data on the outcome, we must make do with that. Even so, which of the control units in the right panel are most similar in trend to Hubei? What kinds of qualitative questions might we ask? Do we, for example, include only cities above a certain GDP threshold? And what would it even mean, empirically, for these cities to be similar in GDP? What about geography: do we limit our comparison cities to those within 100 miles of Hubei? The question is only amplified in high dimensions; if we had 200 control units, what standard would we use? And even if we could agree on a standard, how many units should this optimal control group contain? These questions quickly become too complex for humans to answer, even after discarding units that were themselves exposed to a treatment or intervention. Thus, the main contribution of ```FDID``` is that it uses a machine-learning algorithm called _forward selection_ to select the optimal pool of control units. With the preliminaries out of the way, we can finally go into the class.
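Before going into the class, here is a minimal sketch of the pre-trend eyeball test described above, assuming a long DataFrame like the one printed in the next section (columns ```Country```, ```Time```, ```GDP```); the treated-unit name is an illustrative assumption.

```python
# Hedged sketch of the pre-trend eyeball test; column and unit names
# are assumptions mirroring the example frame shown below.
import pandas as pd
import matplotlib.pyplot as plt

def plot_pretrends(df: pd.DataFrame, treated: str = "Hubei") -> None:
    wide = df.pivot_table(values="GDP", index="Time", columns="Country", sort=False)
    ctrl_mean = wide.drop(columns=treated).mean(axis=1)  # uniform average of controls
    plt.plot(wide.index, wide[treated], color="black", label=treated)
    plt.plot(wide.index, ctrl_mean, color="gray", label="Average of controls")
    plt.legend()
    plt.show()
```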
52 |
53 | # Forward Selection Differences in Differences
54 | ## The Class Inputs
55 | Here are the inputs the user may specify.
56 | ```python
57 | class FDID:
58 | def __init__(self, df, unitid, time, outcome, treat,
59 | figsize=(12, 6),
60 | graph_style="default",
61 | grid=True,
62 | counterfactual_color="red",
63 | treated_color="black",
64 | filetype="png",
65 | display_graphs=True,
66 | placebo=None
67 | ):
68 | self.df = df
69 | self.unitid = unitid
70 | self.time = time
71 | self.outcome = outcome
72 | self.treated = treat
73 | self.figsize = figsize
74 | self.graph_style = graph_style
75 | self.grid = grid
76 | self.counterfactual_color = counterfactual_color
77 | self.treated_color = treated_color
78 |
79 | self.filetype = filetype
80 | self.display_graphs = display_graphs
81 |
82 | # Check for the "placebo" option
83 | if placebo is not None:
84 | self.validate_placebo_option(placebo)
85 | self.placebo_option = placebo
86 |
87 | def validate_placebo_option(self, placebo):
88 | # Check if the provided placebo option is a dictionary
89 | if not isinstance(placebo, dict):
90 | raise ValueError("The 'placebo' option must be a dictionary.")
91 |
92 | # Check for the first key in the dictionary
93 | first_key = next(iter(placebo), None)
94 | if first_key not in ["Time", "Space"]:
95 | raise ValueError(
96 | "The first key in the 'placebo' option must be either 'Time' or 'Space'.")
97 |
98 | # If the first key is "Time", check if the associated value is a list of positive integers
99 | if first_key == "Time":
100 | values = placebo[first_key]
101 | if not (isinstance(values, list) and all(isinstance(num, int) and num > 0 for num in values)):
102 | raise ValueError(
103 | "If the first key in the 'placebo' option is 'Time', the associated value must be a list of positive integers.")
104 |
105 | ```
106 | The user specifies the dataframe they wish to use, as well as the four columns just mentioned (unit id, time, outcome, and treatment). Should they choose to display graphs, the user may also customize the colors of the trend lines for the observed series and the FDID predictions. I also allow for in-time placebo estimates (to be expanded on at a later date).
107 |
108 | The FDID class makes a few assumptions about one's data structure. First, it presumes a long panel dataset with 4 columns: one for the outcome, one for time, one for unit names, and one for the treatment indicator, which is 0 for all untreated unit-periods and 1 for the treated unit during the treatment period. The code does not yet validate these assumptions, but future iterations will.
109 | ```python
110 | Country GDP Time Integration
111 | 0 Hong Kong 0.0620 0 0
112 | 1 Hong Kong 0.0590 1 0
113 | 2 Hong Kong 0.0580 2 0
114 | 3 Hong Kong 0.0620 3 0
115 | 4 Hong Kong 0.0790 4 0
116 | .. ... ... ... ...
117 | 56 China 0.1110 56 0
118 | 57 China 0.1167 57 0
119 | 58 China 0.1002 58 0
120 | 59 China 0.1017 59 0
121 | 60 China 0.1238 60 0
122 |
123 | [1525 rows x 4 columns]
124 | ```
125 | ## Vanilla DID
126 | Here is the DID method.
127 |
129 |
130 | ```python
131 | def DID(self, y, datax, t1):
132 | t = len(y)
133 |
134 | x1, x2 = np.mean(datax[:t1], axis=1).reshape(-1,
135 | 1), np.mean(datax[t1:t], axis=1).reshape(-1, 1)
136 |
137 | b_DID = np.mean(y[:t1] - x1, axis=0) # DID intercept estimator
138 | y1_DID = b_DID + x1 # DID in-sample-fit
139 | y2_DID = b_DID + x2 # DID out-of-sample prediction
140 | y_DID = np.vstack((y1_DID, y2_DID)) # Stack y1_DID and y2_DID vertically
141 |
142 | y1_DID, y2_DID = y_DID[:t1], y_DID[t1:t]
143 |
144 | # DID ATT estimate and percentage
145 |
146 | ATT_DID = np.mean(y[t1:t] - y_DID[t1:t])
147 | ATT_DID_percentage = 100 * ATT_DID / np.mean(y_DID[t1:t])
148 |
149 | # DID R-square
150 |
151 | R2_DID = 1 - (np.mean((y[:t1] - y_DID[:t1]) ** 2)) / (
152 | np.mean((y[:t1] - np.mean(y[:t1])) ** 2)
153 | )
154 |
155 | # Estimated DID residual
156 |
157 | u1_DID = y[:t1] - y_DID[:t1]
158 |
159 | # \hat \Sigma_{1,DID} and \hat \Sigma_{2,DID}
160 | t2 = t - t1
161 |
162 | Omega_1_hat_DID = (t2 / t1) * np.mean(u1_DID**2)
163 | Omega_2_hat_DID = np.mean(u1_DID**2)
164 |
165 | # \hat Sigma_{DID}
166 |
167 | std_Omega_hat_DID = np.sqrt(Omega_1_hat_DID + Omega_2_hat_DID)
168 |
169 | # Standardized ATT_DID
170 |
171 | ATT_std_DID = np.sqrt(t2) * ATT_DID / std_Omega_hat_DID
172 |
173 | # P-value for H0: ATT=0
174 |
175 | p_value_DID = 2 * (1 - norm.cdf(np.abs(ATT_std_DID)))
176 |
177 | # P-value for 1-sided test
178 |
179 | p_value_one_sided = 1 - norm.cdf(ATT_std_DID)
180 |
181 | # 95% Confidence Interval for DID ATT estimate
182 |
183 | z_critical = norm.ppf(0.975) # 1.96 for a two-tailed test
184 | CI_95_DID_left = ATT_DID - z_critical * std_Omega_hat_DID / np.sqrt(t2)
185 | CI_95_DID_right = ATT_DID + z_critical * std_Omega_hat_DID / np.sqrt(t2)
186 | CI_95_DID_width = [
187 | CI_95_DID_left,
188 | CI_95_DID_right,
189 | CI_95_DID_right - CI_95_DID_left,
190 | ]
191 |
192 | # Metrics of fit subdictionary
193 | Fit_dict = {
194 | "T0 RMSE": round(np.std(y[:t1] - y1_DID), 3),
195 | "R-Squared": round(R2_DID, 3)
196 | }
197 |
198 | # ATTS subdictionary
199 | ATTS = {
200 | "ATT": round(ATT_DID, 3),
201 | "Percent ATT": round(ATT_DID_percentage, 3),
202 | "SATT": round(ATT_std_DID, 3),
203 | }
204 |
205 | # Inference subdictionary
206 | Inference = {
207 | "P-Value": round(p_value_DID, 3),
208 | "95 LB": round(CI_95_DID_left, 3),
209 | "95 UB": round(CI_95_DID_right, 3),
210 | }
211 |
212 | # Vectors subdictionary
213 | Vectors = {
214 | "Observed Unit": np.round(y, 3),
215 | "DID Unit": np.round(y_DID, 3),
216 | "Gap": np.round(y - y_DID, 3)
217 | }
218 |
219 | # Main dictionary
220 | DID_dict = {
221 | "Effects": ATTS,
222 | "Vectors": Vectors,
223 | "Fit": Fit_dict,
224 | "Inference": Inference
225 | }
226 |
227 | return DID_dict
228 |
229 | ```
230 |
231 |
232 | DID is naturally the main workhorse for both the selection algorithm and the ```FDID``` estimator itself. It calculates the DID predictions for a given input matrix of control units and a vector of treated-unit outcomes. When we use OLS to estimate DID, we can think of a least squares estimator where our counterfactual $y^{\text{DID}}\_{jt}$ is estimated via $y^{\text{DID}}\_{jt} = \delta_0 + \delta_1 \bar{y}\_{j \in \mathcal{N}\_{0}}, \: \forall t \in \mathcal{T}\_{0}$, where $\delta_0$ is the intercept, $\delta_1$ is a coefficient constrained to equal 1, and $\bar{y}$ is the average of our control units.
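To make the slope-one reading concrete, here is a tiny self-contained check on made-up numbers (not the repo's data): the DiD intercept is simply the mean pre-period gap.

```python
# Tiny numerical check of the slope-one reading of DiD (synthetic data).
import numpy as np

rng = np.random.default_rng(0)
ybar = rng.normal(size=10)                       # control-group mean, 10 pre-periods
y = 2.0 + ybar + rng.normal(scale=0.1, size=10)  # treated unit: gap of about 2
alpha = np.mean(y - ybar)                        # DiD intercept = mean pre-period gap
y_did = alpha + ybar                             # in-sample fit, slope fixed at 1
print(round(alpha, 2))                           # roughly 2.0
```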
233 | ## Augmented DID
234 | Augmented DID allows for heterogeneous treatment effects. In AUGDID, our regression model is $y^{\text{ADID}}\_{jt} = \delta_0 + \delta_1 \bar{y}\_{j \in \mathcal{N}\_{0}}, \: \forall t \in \mathcal{T}\_{0}$, where $\delta_1$ may take any value. For AUGDID, the PTA is that the counterfactual would be parallel to the pure average of controls up to a slope adjustment and a constant. Notice how both ADID and DID return model fit statistics and ATTs in dictionaries.
235 |
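For intuition, here is a minimal free-slope fit on synthetic data (names and numbers are illustrative); the class itself solves the same least-squares problem with ```cvxpy```:

```python
# Minimal sketch of the ADID regression with an unconstrained slope.
import numpy as np

rng = np.random.default_rng(1)
ybar_pre = rng.normal(size=20)                     # control mean, pre-period
y_pre = 1.5 + 0.8 * ybar_pre + rng.normal(scale=0.05, size=20)
X = np.column_stack([np.ones(20), ybar_pre])       # intercept plus control mean
delta, *_ = np.linalg.lstsq(X, y_pre, rcond=None)  # [delta_0, delta_1]
print(np.round(delta, 2))                          # near [1.5, 0.8]; slope != 1
```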
237 |
238 | ```python
239 | def AUGDID(self, datax, t, t1, t2, y, y1, y2):
240 | const = np.ones(t) # t by 1 vector of ones (for intercept)
241 | # add an intercept to the control-unit data matrix, making it t by N+1
242 | x = np.column_stack([const, datax])
243 | x1 = x[:t1, :] # pre-treatment data matrix with intercept, t1 by N+1
244 | x2 = x[t1:, :] # post-treatment data matrix with intercept, t2 by N+1
245 |
246 | # ATT estimation by ADID method
247 | x10 = datax[:t1, :]
248 | x20 = datax[t1:, :]
249 | x1_ADID = np.column_stack([np.ones(x10.shape[0]), np.mean(x10, axis=1)])
250 | x2_ADID = np.column_stack([np.ones(x20.shape[0]), np.mean(x20, axis=1)])
251 | # Define variables
252 | b_ADID_cvx = cp.Variable(x1_ADID.shape[1])
253 |
254 | # Define the problem
255 | objective = cp.Minimize(cp.sum_squares(x1_ADID @ b_ADID_cvx - y1))
256 | problem = cp.Problem(objective)
257 |
258 | # Solve the problem
259 | problem.solve()
260 |
261 | # Extract the solution
262 | b_ADID_optimized = b_ADID_cvx.value
263 |
264 | # Compute in-sample fit
265 | y1_ADID = x1_ADID @ b_ADID_optimized
266 |
267 | # Compute prediction
268 | y2_ADID = x2_ADID @ b_ADID_optimized
269 |
270 | # Concatenate in-sample fit and prediction
271 | y_ADID = np.concatenate([y1_ADID, y2_ADID])
272 |
273 | ATT = np.mean(y2 - y2_ADID) # ATT by ADID
274 | ATT_per = 100 * ATT / np.mean(y2_ADID) # ATT in percentage by ADID
275 |
276 | e1_ADID = (
277 | y1 - y1_ADID
278 | ) # t1 by 1 vector of treatment unit's (pre-treatment) residuals
279 | sigma2_ADID = np.mean(e1_ADID**2) # \hat sigma^2_e
280 |
281 | eta_ADID = np.mean(x2, axis=0).reshape(-1, 1)
282 | psi_ADID = x1.T @ x1 / t1
283 |
284 | Omega_1_ADID = (sigma2_ADID * eta_ADID.T) @ np.linalg.inv(psi_ADID) @ eta_ADID
285 | Omega_2_ADID = sigma2_ADID
286 |
287 | Omega_ADID = (t2 / t1) * Omega_1_ADID + Omega_2_ADID # Variance
288 |
289 | ATT_std = np.sqrt(t2) * ATT / np.sqrt(Omega_ADID)
290 | alpha = 0.05
291 | quantile = norm.ppf(1 - alpha / 2) # 1.96 for a two-sided 95% CI
292 |
293 | CI_95_DID_left = ATT - quantile * np.sqrt(sigma2_ADID) / np.sqrt(t2)
294 | CI_95_DID_right = ATT + quantile * np.sqrt(sigma2_ADID) / np.sqrt(t2)
295 |
296 | RMSE = np.sqrt(np.mean((y1 - y1_ADID) ** 2))
297 | RMSEPost = np.sqrt(np.mean((y2 - y2_ADID) ** 2))
298 |
299 | R2_ADID = 1 - (np.mean((y1 - y1_ADID) ** 2)) / np.mean((y1 - np.mean(y1)) ** 2)
300 |
301 | # P-value for H0: ATT=0
302 |
303 | p_value_aDID = 2 * (1 - norm.cdf(np.abs(ATT_std)))
304 |
305 | CI_95_DID_width = [
306 | CI_95_DID_left,
307 | CI_95_DID_right,
308 | CI_95_DID_right - CI_95_DID_left,
309 | ]
310 |
311 | # Metrics of fit subdictionary
312 | Fit_dict = {
313 | "T0 RMSE": round(np.std(y[:t1] - y1_ADID), 3),
314 | "R-Squared": round(R2_ADID, 3)
315 | }
316 |
317 | # ATTS subdictionary
318 | ATTS = {
319 | "ATT": round(ATT, 3),
320 | "Percent ATT": round(ATT_per, 3),
321 | "SATT": round(ATT_std.item(), 3),
322 | }
323 |
324 | # Inference subdictionary
325 | Inference = {
326 | "P-Value": round(p_value_aDID.item(), 3),
327 | "95 LB": round(CI_95_DID_left.item(), 3),
328 | "95 UB": round(CI_95_DID_right.item(), 3)
329 | }
330 |
331 | # Vectors subdictionary
332 | Vectors = {
333 | "Observed Unit": np.round(y, 3),
334 | "DID Unit": np.round(y_ADID, 3),
335 | "Gap": np.round(y - y_ADID, 3)
336 | }
337 |
338 | # Main dictionary
339 | ADID_dict = {
340 | "Effects": ATTS,
341 | "Vectors": Vectors,
342 | "Fit": Fit_dict,
343 | "Inference": Inference
344 | }
345 |
346 | return ADID_dict, y_ADID
347 | ```
348 |
349 |
350 | ## Forward Selection
351 | In Forward Difference-in-Differences (```FDID```), we use the forward selection algorithm to select our control group.
352 |
354 |
355 | ```python
356 | def selector(self, no_control, t1, t, y, y1, y2, datax, control_ID, df):
357 | R2 = np.zeros(no_control) # Creates an empty vector equal in length to the number of controls
358 | R2final = np.zeros(no_control)
359 | control_ID_adjusted = np.array(control_ID) - 1 # Adjusts the indexing from MATLAB
360 |
361 | select_c = np.zeros(no_control, dtype=int)
362 |
363 | for j in range(no_control):
364 | # Over the jth control unit, estimate DID and save its R-squared stat
365 |
366 | ResultDict = self.DID(y.reshape(-1, 1), datax[:t, j].reshape(-1, 1), t1)
367 |
368 | R2[j] = ResultDict["Fit"]["R-Squared"]
369 |
370 | R2final[0] = np.max(R2) # The model with the highest R-Squared
371 | first_c = np.argmax(R2)
372 | select_c[0] = control_ID_adjusted[first_c] # The selected unit
373 |
374 | for k in range(2, no_control + 1):
375 | left = np.setdiff1d(control_ID_adjusted, select_c[: k - 1])
376 | control_left = datax[:, left]
377 | R2 = np.zeros(len(left))
378 |
379 | for jj in range(len(left)):
380 | combined_control = np.concatenate(
381 | (
382 | datax[:t1, np.concatenate((select_c[: k - 1], [left[jj]]))],
383 | datax[t1:t, np.concatenate((select_c[: k - 1], [left[jj]]))]
384 | ),
385 | axis=0
386 | )
387 | ResultDict = self.DID(y.reshape(-1, 1), combined_control, t1)
388 | R2[jj] = ResultDict["Fit"]["R-Squared"]
389 |
390 | R2final[k - 1] = np.max(R2)
391 | select = left[np.argmax(R2)]
392 | select_c[k - 1] = select
393 | selected_unit_names = [df.columns[i] for i in select_c]
394 |
395 | return select_c, R2final
396 | ```
397 |
398 |
399 | The method takes as inputs the number of control units, the number of pre-intervention and total time periods, the treated unit's outcome vectors, and the donor matrix. It also prints the names of the optimal control units that are selected. Here is what is going on in more detail.
400 |
401 | We take the full pool of control units and iteratively use each control unit to estimate $N$ one-control DID models. We then store the DID model which has the highest $R^2$ statistic. For example, say our treated unit is Washington DC and we have Los Angeles, New York City, Chicago, and Miami as controls. If Los Angeles' DID model has the highest $R^2$, Los Angeles becomes the first selected unit. We then keep looping through the as-yet-unselected control units. If the DID model that averages NYC with Los Angeles has the highest $R^2$ of the second sweep, we add NYC to the selected pool. If Miami has the next highest, we add Miami and estimate DID again, and so on, until we have estimated $N + (N-1) + \cdots + 1 = N(N+1)/2$ DID models in total. After we've done so, we keep whichever candidate DID model has the highest $R^2$ statistic. Intuitively, this is an improvement over the standard average of controls: if the average of all controls does not provide a satisfactory approximation for our treated unit, then some subset of the controls must fit _at least_ as well in-sample. The final pool of control units selected by forward selection, termed $\hat{U}$ in the FDID paper, is the final control group that we use. A self-contained toy run of this loop follows.
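Here is the promised toy run of that greedy loop on synthetic data; it re-implements the logic above in miniature and is not the class method itself.

```python
# Toy forward selection: greedily add controls by the R-squared of a
# slope-one DiD fit, then keep the best candidate model (synthetic data).
import numpy as np

rng = np.random.default_rng(2)
T0, N = 30, 5
controls = rng.normal(size=(T0, N))
y = 1.0 + controls[:, [1, 3]].mean(axis=1) + rng.normal(scale=0.05, size=T0)

def did_r2(y, X):
    ybar = X.mean(axis=1)           # uniform average of the chosen controls
    fit = np.mean(y - ybar) + ybar  # DiD fit: intercept plus control mean
    return 1.0 - np.mean((y - fit) ** 2) / np.var(y)

selected, remaining = [], list(range(N))
best_r2, best_set = -np.inf, []
while remaining:                    # one greedy pass over k = 1, ..., N
    r2s = [did_r2(y, controls[:, selected + [j]]) for j in remaining]
    pick = remaining[int(np.argmax(r2s))]
    selected.append(pick)
    remaining.remove(pick)
    if max(r2s) > best_r2:          # track the best candidate model so far
        best_r2, best_set = max(r2s), selected.copy()
print(sorted(best_set))             # should recover columns [1, 3]
```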
402 | ## Forward DID
403 | Now for the actual estimation.
404 | ```python
405 | def est(self, control, t, t1, t2, y, y1, y2, datax):
406 |
407 | FDID_dict = self.DID(y.reshape(-1, 1), control, t1)
408 |
409 | y_FDID = FDID_dict['Vectors']['DID Unit']
410 |
411 | DID_dict = self.DID(y.reshape(-1, 1), datax, t1)
412 |
413 | AUGDID_dict, y_ADID = self.AUGDID(datax, t, t1, t2, y, y1, y2)
414 | time_points = np.arange(1, len(y) + 1)
415 |
416 | return FDID_dict, DID_dict, AUGDID_dict, y_FDID
417 | ```
418 | The regression model is $y^{\text{FDID}}\_{jt} = \delta_0 + \delta_1 \bar{y}\_{j \in \hat{U}}, \: \forall t \in \mathcal{T}\_{0}$. Here, $\delta_1$ must still equal 1, but parallel trends becomes a more plausible assumption by virtue of the improved control group. In the code, the ```control``` matrix (the matrix of control units selected via the index set $\hat{U}$) serves as the new control group, in contrast to ```datax```, the original control group. The method returns the relevant dictionaries of fit and effect-size estimates across all designs (```AUGDID```, ```DID```, and ```FDID```). With the helper functions described, we can now turn to the main method the user actually works with: ```.fit```.
419 | ## Fitting FDID
420 |
422 |
423 | ```python
424 | def fit(self):
425 | Ywide = self.df.pivot_table(values=self.outcome, index=self.time, columns=self.unitid, sort=False)
426 | treated_unit_name = self.df.loc[self.df[self.treated] == 1, self.unitid].values[0]
427 | y = Ywide[treated_unit_name].values.reshape(-1, 1)
428 | donor_df = self.df[self.df[self.unitid] != treated_unit_name]
429 | donor_names = donor_df[self.unitid].unique()
430 | datax = Ywide[donor_names].values
431 | no_control = datax.shape[1]
432 | control_ID = np.arange(1, no_control + 1)
433 |
434 | t = np.shape(y)[0]
435 | assert t > 5, "You have less than 5 total periods."
436 |
437 | t1 = len(
438 | self.df[
439 | (self.df[self.unitid] == treated_unit_name)
440 | & (self.df[self.treated] == 0)
441 | ]
442 | )
443 | t2 = t - t1
444 | y1 = np.ravel(y[:t1])
445 | y2 = np.ravel(y[-t2:])
446 |
447 | control_order, R2_final = self.selector(
448 | no_control, t1, t, y, y1, y2, datax, control_ID, Ywide
449 | )
450 | selected_control_indices = control_order[:R2_final.argmax() + 1]
451 | selected_control_names = [Ywide.columns[i] for i in selected_control_indices]
452 | control = datax[:, control_order[: R2_final.argmax() + 1]]
453 |
454 | print(selected_control_names)
455 | FDID_dict, DID_dict, AUGDID_dict, y_FDID = self.est(
456 | control, t, t1, t - t1, y, y1, y2, datax
457 | )
458 |
459 | estimators_results = []
460 | estimators_results.append({"FDID": FDID_dict})
461 | estimators_results.append({"DID": DID_dict})
462 | estimators_results.append({"AUGDID": AUGDID_dict})
463 |
464 | def round_dict_values(input_dict, decimal_places=3):
465 | rounded_dict = {}
466 | for key, value in input_dict.items():
467 | if isinstance(value, dict):
468 | rounded_dict[key] = round_dict_values(value, decimal_places)
469 | elif isinstance(value, (int, float, np.float64)):
470 | rounded_dict[key] = round(value, decimal_places)
471 | else:
472 | rounded_dict[key] = value
473 | return rounded_dict
474 |
475 | estimators_results = [
476 | round_dict_values(result) for result in estimators_results
477 | ]
478 |
479 | if self.display_graphs:
480 | time_axis = self.df[self.df[self.unitid] == treated_unit_name][self.time].values
481 | intervention_point = self.df.loc[self.df[self.treated] == 1, self.time].min()
482 | n = np.arange(1, t+1)
483 | fig = plt.figure(figsize=self.figsize)
484 |
485 | plt.plot(
486 | n,
487 | y,
488 | color=self.treated_color,
489 | label="Observed " + treated_unit_name,
490 | linewidth=3
491 | )
492 | plt.plot(
493 | n[: t1],
494 | y_FDID[: t1],
495 | color=self.counterfactual_color,
496 | linewidth=3,
497 | label="FDID " + treated_unit_name,
498 | )
499 | plt.plot(
500 | n[t1-1:],
501 | y_FDID[t1-1:],
502 | color=self.counterfactual_color,
503 | linestyle="--",
504 | linewidth=2.5,
505 | )
506 | plt.axvline(
507 | x=intervention_point,
508 | color="#ed2939",
509 | linestyle="--", linewidth=2.5,
510 | label=self.treated + ", " + str(intervention_point),
511 | )
512 | upb = max(max(y), max(y_FDID))
513 | lpb = min(0.5 * min(min(y), min(y_FDID)), 1 * min(min(y), min(y_FDID)))
514 |
515 | plt.ylim(lpb, upb)
516 | plt.xlabel(self.time)
517 | plt.ylabel(self.outcome)
518 | plt.title("FDID Analysis: " + treated_unit_name +
519 | " versus Synthetic " + treated_unit_name)
520 | plt.grid(self.grid)
521 | plt.legend()
522 | plt.show()
523 |
524 | return estimators_results
525 |
526 | ```
527 |
528 |
529 | The first step is to reshape our data to wide, such that the columns of ```Ywide``` are the outcome vectors of each unit in our dataframe and each row is a time period $t$. Since the column names of ```Ywide``` are the names of our units, we extract the name of the treated unit: the only unit (in this case) whose treatment variable ever equals 1. We then take that unit's column from the wide dataset as a $T \times 1$ vector. We then construct the dataframe of control units (all columns not named after the treated unit) and convert it to a matrix. The number of columns of this control matrix, ```datax```, is the number of control units $N$, and we assign these a $1 \ldots N$ index. Then we get the number of time periods from the number of rows of the ```y``` vector, and the number of pre-intervention periods from the number of periods where the treatment variable is 0 for the treated unit. From there, we simply use the already-discussed functions to select the control units and estimate Forward and Augmented DID.
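Putting the pieces together, here is a hypothetical usage sketch; the file name and column labels mirror the Hong Kong frame printed earlier and are assumptions about your local data.

```python
# Hypothetical usage sketch; "hcw.csv" and its column names are assumptions.
import pandas as pd

df = pd.read_csv("hcw.csv")  # long panel: Country, Time, GDP, Integration
model = FDID(df, unitid="Country", time="Time",
             outcome="GDP", treat="Integration")
results = model.fit()        # [{"FDID": ...}, {"DID": ...}, {"AUGDID": ...}]
print(results[0]["FDID"]["Effects"])
```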
530 |
531 | With all this in mind, here is the complete class.
532 |
534 |
535 | ```python
536 | class FDID:
537 | def __init__(self, df, unitid, time, outcome, treat,
538 | figsize=(12, 6),
539 | graph_style="default",
540 | grid=True,
541 | counterfactual_color="red",
542 | treated_color="black",
543 | filetype="png",
544 | display_graphs=True,
545 | placebo=None
546 | ):
547 | self.df = df
548 | self.unitid = unitid
549 | self.time = time
550 | self.outcome = outcome
551 | self.treated = treat
552 | self.figsize = figsize
553 | self.graph_style = graph_style
554 | self.grid = grid
555 | self.counterfactual_color = counterfactual_color
556 | self.treated_color = treated_color
557 |
558 | self.filetype = filetype
559 | self.display_graphs = display_graphs
560 |
561 | # Check for the "placebo" option
562 | if placebo is not None:
563 | self.validate_placebo_option(placebo)
564 | self.placebo_option = placebo
565 |
566 | def validate_placebo_option(self, placebo):
567 | # Check if the provided placebo option is a dictionary
568 | if not isinstance(placebo, dict):
569 | raise ValueError("The 'placebo' option must be a dictionary.")
570 |
571 | # Check for the first key in the dictionary
572 | first_key = next(iter(placebo), None)
573 | if first_key not in ["Time", "Space"]:
574 | raise ValueError(
575 | "The first key in the 'placebo' option must be either 'Time' or 'Space'.")
576 |
577 | # If the first key is "Time", check if the associated value is a list of positive integers
578 | if first_key == "Time":
579 | values = placebo[first_key]
580 | if not (isinstance(values, list) and all(isinstance(num, int) and num > 0 for num in values)):
581 | raise ValueError(
582 | "If the first key in the 'placebo' option is 'Time', the associated value must be a list of positive integers.")
583 |
584 | def DID(self, y, datax, t1):
585 | t = len(y)
586 |
587 | x1, x2 = np.mean(datax[:t1], axis=1).reshape(-1,
588 | 1), np.mean(datax[t1:t], axis=1).reshape(-1, 1)
589 | b_DID = np.mean(y[:t1] - x1, axis=0) # DID intercept estimator
590 |
591 | y1_DID = b_DID + x1 # DID in-sample-fit
592 | y2_DID = b_DID + x2 # DID out-of-sample prediction
593 | y_DID = np.vstack((y1_DID, y2_DID)) # Stack y1_DID and y2_DID vertically
594 |
595 | y1_DID, y2_DID = y_DID[:t1], y_DID[t1:t]
596 |
597 | # DID ATT estimate and percentage
598 | if hasattr(self, 'placebo_option'):
599 | t1 = self.realt1
600 |
601 | ATT_DID = np.mean(y[t1:t] - y_DID[t1:t])
602 | ATT_DID_percentage = 100 * ATT_DID / np.mean(y_DID[t1:t])
603 |
604 | # DID R-square
605 |
606 | R2_DID = 1 - (np.mean((y[:t1] - y_DID[:t1]) ** 2)) / (
607 | np.mean((y[:t1] - np.mean(y[:t1])) ** 2)
608 | )
609 |
610 | # Estimated DID residual
611 |
612 | u1_DID = y[:t1] - y_DID[:t1]
613 |
614 | # \hat \Sigma_{1,DID} and \hat \Sigma_{2,DID}
615 | t2 = t - t1
616 |
617 | Omega_1_hat_DID = (t2 / t1) * np.mean(u1_DID**2)
618 | Omega_2_hat_DID = np.mean(u1_DID**2)
619 |
620 | # \hat Sigma_{DID}
621 |
622 | std_Omega_hat_DID = np.sqrt(Omega_1_hat_DID + Omega_2_hat_DID)
623 |
624 | # Standardized ATT_DID
625 |
626 | ATT_std_DID = np.sqrt(t2) * ATT_DID / std_Omega_hat_DID
627 |
628 | # P-value for H0: ATT=0
629 |
630 | p_value_DID = 2 * (1 - norm.cdf(np.abs(ATT_std_DID)))
631 |
632 | # P-value for 1-sided test
633 |
634 | p_value_one_sided = 1 - norm.cdf(ATT_std_DID)
635 |
636 | # 95% Confidence Interval for DID ATT estimate
637 |
638 | z_critical = norm.ppf(0.975) # 1.96 for a two-tailed test
639 | CI_95_DID_left = ATT_DID - z_critical * std_Omega_hat_DID / np.sqrt(t2)
640 | CI_95_DID_right = ATT_DID + z_critical * std_Omega_hat_DID / np.sqrt(t2)
641 | CI_95_DID_width = [
642 | CI_95_DID_left,
643 | CI_95_DID_right,
644 | CI_95_DID_right - CI_95_DID_left,
645 | ]
646 |
647 | # Metrics of fit subdictionary
648 | Fit_dict = {
649 | "T0 RMSE": round(np.std(y[:t1] - y_DID[:t1]), 3),
650 | "R-Squared": round(R2_DID, 3)
651 | }
652 |
653 | # ATTS subdictionary
654 | ATTS = {
655 | "ATT": round(ATT_DID, 3),
656 | "Percent ATT": round(ATT_DID_percentage, 3),
657 | "SATT": round(ATT_std_DID, 3),
658 | }
659 |
660 | # Inference subdictionary
661 | Inference = {
662 | "P-Value": round(p_value_DID, 3),
663 | "95 LB": round(CI_95_DID_left, 3),
664 | "95 UB": round(CI_95_DID_right, 3),
665 | "Width": CI_95_DID_right - CI_95_DID_left
666 | }
667 |
668 | gap = y - y_DID
669 |
670 | second_column = np.arange(gap.shape[0]) - t1+1
671 |
672 | gap_matrix = np.column_stack((gap, second_column))
673 |
674 | # Vectors subdictionary
675 | Vectors = {
676 | "Observed Unit": np.round(y, 3),
677 | "Counterfactual": np.round(y_DID, 3),
678 | "Gap": np.round(gap_matrix, 3)
679 | }
680 |
681 | # Main dictionary
682 | DID_dict = {
683 | "Effects": ATTS,
684 | "Vectors": Vectors,
685 | "Fit": Fit_dict,
686 | "Inference": Inference
687 | }
688 |
689 | return DID_dict
690 |
691 | def AUGDID(self, datax, t, t1, t2, y, y1, y2):
692 | const = np.ones(t) # t by 1 vector of ones (for intercept)
693 | # add an intercept to the control-unit data matrix, making it t by N+1
694 | x = np.column_stack([const, datax])
695 | x1 = x[:t1, :] # pre-treatment data matrix with intercept, t1 by N+1
696 | x2 = x[t1:, :] # post-treatment data matrix with intercept, t2 by N+1
697 |
698 | # ATT estimation by ADID method
699 | x10 = datax[:t1, :]
700 | x20 = datax[t1:, :]
701 | x1_ADID = np.column_stack([np.ones(x10.shape[0]), np.mean(x10, axis=1)])
702 | x2_ADID = np.column_stack([np.ones(x20.shape[0]), np.mean(x20, axis=1)])
703 |
704 | b_ADID = np.linalg.inv(x1_ADID.T @ x1_ADID) @ (x1_ADID.T @ y1) # ADID estimate of delta
705 |
706 | y1_ADID = x1_ADID @ b_ADID # t1 by 1 vector of ADID in-sample fit
707 | y2_ADID = x2_ADID @ b_ADID # t2 by 1 vector of ADID prediction
708 |
709 | # t by 1 vector of ADID fit/prediction
710 | y_ADID = np.concatenate([y1_ADID, y2_ADID]).reshape(-1, 1)
711 | if hasattr(self, 'placebo_option'):
712 | t1 = self.realt1
713 | ATT = np.mean(y2 - y2_ADID) # ATT by ADID
714 | ATT_per = 100 * ATT / np.mean(y2_ADID) # ATT in percentage by ADID
715 |
716 | e1_ADID = (
717 | y1 - y1_ADID
718 | ) # t1 by 1 vector of treatment unit's (pre-treatment) residuals
719 | sigma2_ADID = np.mean(e1_ADID**2) # \hat sigma^2_e
720 |
721 | eta_ADID = np.mean(x2, axis=0).reshape(-1, 1)
722 | psi_ADID = x1.T @ x1 / t1
723 |
724 | Omega_1_ADID = (sigma2_ADID * eta_ADID.T) @ np.linalg.pinv(psi_ADID) @ eta_ADID
725 | Omega_2_ADID = sigma2_ADID
726 |
727 | Omega_ADID = (t2 / t1) * Omega_1_ADID + Omega_2_ADID # Variance
728 |
729 | ATT_std = np.sqrt(t2) * ATT / np.sqrt(Omega_ADID)
730 |
731 | quantile = norm.ppf(0.975)
732 |
733 | CI_95_DID_left = ATT - quantile * np.sqrt(sigma2_ADID) / np.sqrt(t2)
734 | CI_95_DID_right = ATT + quantile * np.sqrt(sigma2_ADID) / np.sqrt(t2)
735 |
736 | RMSE = np.sqrt(np.mean((y1 - y1_ADID) ** 2))
737 | RMSEPost = np.sqrt(np.mean((y2 - y2_ADID) ** 2))
738 |
739 | R2_ADID = 1 - (np.mean((y1 - y1_ADID) ** 2)) / np.mean((y1 - np.mean(y1)) ** 2)
740 |
741 | # P-value for H0: ATT=0
742 |
743 | p_value_aDID = 2 * (1 - norm.cdf(np.abs(ATT_std)))
744 |
745 | CI_95_DID_width = [
746 | CI_95_DID_left,
747 | CI_95_DID_right,
748 | CI_95_DID_right - CI_95_DID_left,
749 | ]
750 |
751 | # Metrics of fit subdictionary
752 | Fit_dict = {
753 | "T0 RMSE": round(np.std(y[:t1] - y_ADID[:t1]), 3),
754 | "R-Squared": round(R2_ADID, 3),
755 | "T0": len(y[:t1])
756 | }
757 |
758 | # ATTS subdictionary
759 | ATTS = {
760 | "ATT": round(ATT, 3),
761 | "Percent ATT": round(ATT_per, 3),
762 | "SATT": round(ATT_std.item(), 3),
763 | }
764 |
765 | # Inference subdictionary
766 | Inference = {
767 | "P-Value": round(p_value_aDID.item(), 3),
768 | "95 LB": round(CI_95_DID_left.item(), 3),
769 | "95 UB": round(CI_95_DID_right.item(), 3),
770 | "Width": CI_95_DID_right - CI_95_DID_left
771 | }
772 | gap = y - y_ADID
773 |
774 | second_column = np.arange(gap.shape[0]) - t1+1
775 |
776 | gap_matrix = np.column_stack((gap, second_column))
777 |
778 | # Vectors subdictionary
779 | Vectors = {
780 | "Observed Unit": np.round(y, 3),
781 | "Counterfactual": np.round(y_ADID, 3),
782 | "Gap": np.round(gap_matrix, 3)
783 | }
784 |
785 | # Main dictionary
786 | ADID_dict = {
787 | "Effects": ATTS,
788 | "Vectors": Vectors,
789 | "Fit": Fit_dict,
790 | "Inference": Inference
791 | }
792 |
793 | return ADID_dict, y_ADID
794 |
795 | def est(self, control, t, t1, t2, y, y1, y2, datax):
796 |
797 | FDID_dict = self.DID(y.reshape(-1, 1), control, t1)
798 |
799 | y_FDID = FDID_dict['Vectors']['Counterfactual']
800 |
801 | DID_dict = self.DID(y.reshape(-1, 1), datax, t1)
802 |
803 | AUGDID_dict, y_ADID = self.AUGDID(datax, t, t1, t2, y, y1, y2)
804 | time_points = np.arange(1, len(y) + 1)
805 |
806 | # Calculate the ratio of widths for DID and AUGDID compared to FDID
807 | ratio_DID = DID_dict["Inference"]["Width"] / FDID_dict["Inference"]["Width"]
808 | ratio_AUGDID = AUGDID_dict["Inference"]["Width"] / FDID_dict["Inference"]["Width"]
809 |
810 | # Add the new elements to the Inference dictionaries
811 | DID_dict["Inference"]["WidthRFDID"] = ratio_DID
812 | AUGDID_dict["Inference"]["WidthRFDID"] = ratio_AUGDID
813 |
814 | return FDID_dict, DID_dict, AUGDID_dict, y_FDID
815 |
816 | def selector(self, no_control, t1, t, y, y1, y2, datax, control_ID, df):
817 | R2 = np.zeros(no_control)
818 | R2final = np.zeros(no_control)
819 | control_ID_adjusted = np.array(control_ID) - 1
820 | select_c = np.zeros(no_control, dtype=int)
821 |
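822 |         # Forward selection: the first loop below fits DID on each control
823 |         # alone and keeps the best; every later step k adds whichever
824 |         # remaining control maximizes the pre-period R-squared of the
825 |         # running control-group average.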
822 | for j in range(no_control):
823 | ResultDict = self.DID(y.reshape(-1, 1), datax[:t, j].reshape(-1, 1), t1)
824 | R2[j] = ResultDict["Fit"]["R-Squared"]
825 | R2final[0] = np.max(R2)
826 | first_c = np.argmax(R2)
827 | select_c[0] = control_ID_adjusted[first_c]
828 |
829 | for k in range(2, no_control + 1):
830 | left = np.setdiff1d(control_ID_adjusted, select_c[: k - 1])
831 | control_left = datax[:, left]
832 | R2 = np.zeros(len(left))
833 |
834 | for jj in range(len(left)):
835 | combined_control = np.concatenate(
836 | (
837 | datax[:t1, np.concatenate((select_c[: k - 1], [left[jj]]))],
838 | datax[t1:t, np.concatenate((select_c[: k - 1], [left[jj]]))]
839 | ),
840 | axis=0
841 | )
842 | ResultDict = self.DID(y.reshape(-1, 1), combined_control, t1)
843 | R2[jj] = ResultDict["Fit"]["R-Squared"]
844 |
845 | R2final[k - 1] = np.max(R2)
846 | select = left[np.argmax(R2)]
847 | select_c[k - 1] = select
848 | return select_c, R2final
849 |
850 | def fit(self):
851 | Ywide = self.df.pivot_table(
852 | values=self.outcome, index=self.time, columns=self.unitid, sort=False
853 | )
854 |
855 | treated_unit_name = self.df.loc[self.df[self.treated] == 1, self.unitid].values[0]
856 | Ywide = Ywide[[treated_unit_name] +
857 | [col for col in Ywide.columns if col != treated_unit_name]]
858 | y = Ywide[treated_unit_name].values.reshape(-1, 1)
859 | donor_df = self.df[self.df[self.unitid] != treated_unit_name]
860 | donor_names = donor_df[self.unitid].unique()
861 | datax = Ywide[donor_names].values
862 | no_control = datax.shape[1]
863 | control_ID = np.arange(1, no_control + 1) # 1 row vector from 1 to no_control
864 |
865 | t = np.shape(y)[0]
866 |         assert t > 5, "You need more than 5 total periods."
867 |
868 | results = [] # List to store results
869 |
870 | self.realt1 = len(
871 | self.df[
872 | (self.df[self.unitid] == treated_unit_name)
873 | & (self.df[self.treated] == 0)
874 | ]
875 | )
876 |
877 | if hasattr(self, 'placebo_option'):
878 | print("Placebo Option in fit method:", self.placebo_option)
879 |             placebo_list = [0] + self.placebo_option.get("Time")
881 | for i, itp in enumerate(placebo_list):
882 | t1 = len(
883 | self.df[
884 | (self.df[self.unitid] == treated_unit_name)
885 | & (self.df[self.treated] == 0)
886 | ]
887 | ) - itp
888 |
889 | t2 = t - t1
890 | y1 = np.ravel(y[:t1])
891 | y2 = np.ravel(y[-t2:])
892 |
893 | control_order, R2_final = self.selector(
894 | no_control, t1, t, y, y1, y2, datax, control_ID, Ywide
895 | )
896 | selected_control_indices = control_order[:R2_final.argmax() + 1]
897 | copy_wide_copy = Ywide.iloc[:, 1:].copy()
898 | selected_controls = [copy_wide_copy.columns[i] for i in selected_control_indices]
899 |
900 | control = datax[:, control_order[: R2_final.argmax() + 1]]
901 |
902 | FDID_dict, DID_dict, AUGDID_dict, y_FDID = self.est(
903 | control, t, t1, t - t1, y, y1, y2, datax
904 | )
905 |
906 | placebo_results = [] # Initialize an empty list for each placebo iteration
907 | FDID_dict['Selected Units'] = selected_controls
908 | placebo_results.append({"FDID": FDID_dict})
909 |
910 | # Append the normal DID dictionary to the list
911 | placebo_results.append({"DID": DID_dict})
912 | placebo_results.append({"AUGDID": AUGDID_dict})
913 |
914 | def round_dict_values(input_dict, decimal_places=3):
915 | rounded_dict = {}
916 | for key, value in input_dict.items():
917 | if isinstance(value, dict):
918 | # Recursively round nested dictionaries
919 | rounded_dict[key] = round_dict_values(value, decimal_places)
920 | elif isinstance(value, (int, float, np.float64)):
921 | # Round numeric values
922 | rounded_dict[key] = round(value, decimal_places)
923 | else:
924 | rounded_dict[key] = value
925 | return rounded_dict
926 |
927 | # Round all values in the placebo_results list of dictionaries
928 | placebo_results = [
929 | round_dict_values(result) for result in placebo_results
930 | ]
931 |
932 | # Add the placebo results to the overall results list with a labeled key
933 | results.append({"Placebo" + str(i): placebo_results})
934 |
935 | else:
936 | # Your logic for when placebo_option is not specified
937 | t1 = len(
938 | self.df[
939 | (self.df[self.unitid] == treated_unit_name)
940 | & (self.df[self.treated] == 0)
941 | ]
942 | )
943 |
944 | t2 = t - t1
945 | y1 = np.ravel(y[:t1])
946 | y2 = np.ravel(y[-t2:])
947 |
948 | control_order, R2_final = self.selector(
949 | no_control, t1, t, y, y1, y2, datax, control_ID, Ywide
950 | )
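951 |             # R2_final holds the best pre-period R-squared at each model size,
952 |             # so its argmax (+1) gives how many controls enter the selected pool.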
951 | selected_control_indices = control_order[:R2_final.argmax() + 1]
952 | copy_wide_copy = Ywide.iloc[:, 1:].copy()
953 | selected_controls = [copy_wide_copy.columns[i] for i in selected_control_indices]
954 |
955 | control = datax[:, control_order[: R2_final.argmax() + 1]]
956 |
957 | FDID_dict, DID_dict, AUGDID_dict, y_FDID = self.est(
958 | control, t, t1, t - t1, y, y1, y2, datax
959 | )
960 |
961 | estimators_results = []
962 | FDID_dict['Selected Units'] = selected_controls
963 | estimators_results.append({"FDID": FDID_dict})
964 |
965 | # Append the normal DID dictionary to the list
966 | estimators_results.append({"DID": DID_dict})
967 | estimators_results.append({"AUGDID": AUGDID_dict})
968 |
969 | def round_dict_values(input_dict, decimal_places=3):
970 | rounded_dict = {}
971 | for key, value in input_dict.items():
972 | if isinstance(value, dict):
973 | # Recursively round nested dictionaries
974 | rounded_dict[key] = round_dict_values(value, decimal_places)
975 | elif isinstance(value, (int, float, np.float64)):
976 | # Round numeric values
977 | rounded_dict[key] = round(value, decimal_places)
978 | else:
979 | rounded_dict[key] = value
980 | return rounded_dict
981 |
982 | # Round all values in the estimators_results list of dictionaries
983 | estimators_results = [
984 | round_dict_values(result) for result in estimators_results
985 | ]
986 |
987 | # Add the estimators results to the overall results list
988 | # results.append({"Estimators": estimators_results})
989 |
990 | if self.display_graphs:
991 | time_axis = self.df[self.df[self.unitid] == treated_unit_name][self.time].values
992 | intervention_point = self.df.loc[self.df[self.treated] == 1, self.time].min()
993 | n = np.arange(1, t+1)
994 | fig = plt.figure(figsize=self.figsize)
995 |
996 | plt.plot(
997 | n,
998 | y,
999 | color=self.treated_color,
1000 | label="Observed " + treated_unit_name,
1001 | linewidth=3
1002 | )
1003 | plt.plot(
1004 | n[: t1],
1005 | y_FDID[: t1],
1006 | color=self.counterfactual_color,
1007 | linewidth=3,
1008 | label="FDID " + treated_unit_name,
1009 | )
1010 | plt.plot(
1011 | n[t1-1:],
1012 | y_FDID[t1-1:],
1013 | color=self.counterfactual_color,
1014 | linestyle="--",
1015 | linewidth=2.5,
1016 | )
1017 | plt.axvline(
1018 | x=t1,
1019 | color="#ed2939",
1020 | linestyle="--", linewidth=2.5,
1021 | label=self.treated + ", " + str(intervention_point),
1022 | )
1023 | upb = max(max(y), max(y_FDID))
1024 | lpb = min(0.5 * min(min(y), min(y_FDID)), 1 * min(min(y), min(y_FDID)))
1025 |
1026 | # Set y-axis limits
1027 | # plt.ylim(lpb, upb)
1028 | plt.xlabel(self.time)
1029 | plt.ylabel(self.outcome)
1030 | plt.title("Observed " + treated_unit_name +
1031 | " versus FDID " + treated_unit_name)
1032 | plt.grid(self.grid)
1033 | plt.legend()
1034 | plt.show()
1035 |
1036 | if hasattr(self, 'placebo_option'):
1037 | return results
1038 | else:
1039 | return estimators_results
1040 |
1041 | ```
1042 |
1043 |
1044 | # Replicating Hsiao, Ching, Wan 2012
1045 | Here is an example of using the full class. I begin by modifying my Matplotlib settings to my liking.
1046 | ```python
1047 | # Matplotlib theme
1048 | Jared_theme = {'axes.grid': True,
1049 | 'grid.linestyle': '-',
1050 | 'legend.framealpha': 1,
1051 | 'legend.facecolor': 'white',
1052 | 'legend.shadow': True,
1053 | 'legend.fontsize': 16,
1054 | 'legend.title_fontsize': 18,
1055 | 'xtick.labelsize': 18,
1056 | 'ytick.labelsize': 18,
1057 | 'axes.labelsize': 18,
1058 | 'axes.titlesize': 20,
1059 | 'axes.facecolor': 'grey',
1060 | 'figure.dpi': 300,
1061 | 'grid.color': '#d0d0d0'}
1062 |
1063 | matplotlib.rcParams.update(Jared_theme)
1064 | ```
1065 | Now I import the dataframe (which you may also access in this repo) and do the relevant cleaning to set up the analysis.
1066 |
1068 |
1069 | ```python
1070 | # Define column names
1071 |
1072 | column_names = [
1073 | "Hong Kong",
1074 | "Australia",
1075 | "Austria",
1076 | "Canada",
1077 | "Denmark",
1078 | "Finland",
1079 | "France",
1080 | "Germany",
1081 | "Italy",
1082 | "Japan",
1083 | "Korea",
1084 | "Mexico",
1085 | "Netherlands",
1086 | "New Zealand",
1087 | "Norway",
1088 | "Switzerland",
1089 | "United Kingdom",
1090 | "United States",
1091 | "Singapore",
1092 | "Philippines",
1093 | "Indonesia",
1094 | "Malaysia",
1095 | "Thailand",
1096 | "Taiwan",
1097 | "China",
1098 | ]
1099 |
1100 | df = pd.read_csv(
1101 | "https://raw.githubusercontent.com/leoyyang/rhcw/master/other/hcw-data.txt",
1102 | header=None,
1103 | delim_whitespace=True,
1104 | )
1105 | # Or, reading the local copy: pd.read_csv("hcw-data.txt", header=None, delim_whitespace=True)
1106 |
1107 | # Naming the columns after the countries
1108 | df.columns = column_names
1109 |
1110 | df = pd.melt(df, var_name="Country", value_name="GDP", ignore_index=False)
1111 |
1112 | # Add a 'Time' column ranging from 0 to 60
1113 |
1114 | df["Time"] = df.index
1115 |
1116 | df["Integration"] = (df["Country"].str.contains("Hong") & (df["Time"] >= 44)).astype(
1117 | int
1118 | )
1119 | # The names had some spaces in them, so I lazily
1120 | # selected the treated unit based on it having "Hong" in its name.
1121 |
1122 | # Here we define the columns to be used by the class
1123 |
1124 | treat = "Integration"
1125 | outcome = "GDP"
1126 | unitid = "Country"
1127 | time = "Time"
1128 | ```
1129 |
1130 |
1131 | And now for the estimation.
1132 | ```python
1133 | model = FDID(df=df,
1134 | unitid=unitid,
1135 | time=time,
1136 | outcome=outcome,
1137 | treat=treat,
1138 | display_graphs=False, figsize=(13, 8),
1139 | counterfactual_color='#7DF9FF')
1140 |
1141 | HKRes = model.fit()
1142 | ```
1143 |
1144 | The model prints the plot by itself, but we can also extract these results as follows
1145 |
1146 | ```python
1147 | observed_unit = HKRes[0]["FDID"]["Vectors"]["Observed Unit"]
1148 | FDID_unit = HKRes[0]["FDID"]["Vectors"]["Counterfactual"]
1149 |
1150 | DID_unit = HKRes[1]["DID"]["Vectors"]["Counterfactual"]
1151 | treatdate = HKRes[2]["AUGDID"]["Fit"]["T0"]
1152 |
1153 |
1154 | # Plotting
1155 | plt.figure(figsize=(10, 6))
1156 | plt.plot(observed_unit,
1157 | label='Observed Hong Kong',
1158 | linestyle='-', color='black', linewidth=2)
1159 | plt.plot(DID_unit, label='DID Hong Kong',
1160 | linestyle='-', color='red')
1161 |
1162 | plt.plot(FDID_unit, label='FDID Hong Kong',
1163 | linestyle='-', color='blue')
1164 |
1165 | plt.xlabel(time)
1166 | plt.ylabel(outcome)
1167 | plt.title('Reality versus Predictions')
1168 | plt.ylim(bottom=-.1, top=.25)
1169 |
1170 | plt.axvline(x=treatdate, color='black', linestyle='--',
1171 | linewidth=3, label=f'{treat}, {treatdate}')
1172 |
1173 |
1174 | plt.legend()
1175 |
1176 | plt.show()
1177 | ```
1178 |
1179 |
1180 |
1181 |
1182 | ```FDID``` selects Philippines, Singapore, Thailand, Norway, Mexico, Korea, Indonesia, New Zealand, and Malaysia as the optimal control group.
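1183 | 
1184 | The selected donors are also attached to the results themselves: the ```fit``` method stores them in the ```FDID``` dictionary under the ```Selected Units``` key, so we can print them directly:
1185 | ```python
1186 | # Donor units chosen by the forward selection step
1187 | print(HKRes[0]["FDID"]["Selected Units"])
1188 | ```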
1183 |
1184 | - FDID ATT: 0.025, Percent ATT: 53.843
1185 | - DID ATT: 0.032, Percent ATT: 77.62
1186 | - AUGDID ATT: 0.021, Percent ATT: 41.635
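1187 | 
1188 | These ATTs come straight from each estimator's ```Effects``` subdictionary; here is a quick sketch of pulling them out programmatically:
1189 | ```python
1190 | # Each element of HKRes is a one-key dictionary, e.g. {"FDID": {...}}
1191 | for res in HKRes:
1192 |     for method, stats in res.items():
1193 |         print(method, stats["Effects"])
1194 | ```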
1187 |
1188 | With these effect sizes, we can see how the method we use affects the $ATT$ we estimate. DID estimates a giant effect of economic integration, whereas FDID estimates a much smaller one using the optimal control group. Some, however, may still be unconvinced. "These effect sizes appear similar at first blush," one might say, "so what is the added value of FDID for business or policy analysis?" This is where visualizing the $ATT$s and their uncertainties comes in handy.
1189 |
1190 |
1191 |
1192 |
1193 |
1194 | Here, we can plot the ATTs and error bars for each method, giving us a better sense of how the Forward Selection algorithm affects the uncertainty analysis. Both estimators agree that the lower bound of the $ATT$ is a positive impact of 0.016. However, the upper bounds for ```FDID``` and DID differ by 0.014. The reason is the violation of the parallel trends assumption for naive DID: the full donor pool is too dissimilar for the pre-intervention period to be well approximated by an intercept-adjusted average. The effect size is biased as a result, and the confidence intervals are too wide. The fact that FDID has narrower confidence intervals than DID (narrower even than ADID, which I do not plot) shows that a simple control-group selection mechanism can change the range of effects we impute. This matters because it is not enough for policy analysts or business scientists to impute the ATT or the counterfactual. The precision of our results matters too, because quantified uncertainty lets us plan for best (and worst) case scenarios. A tool that produces wider confidence intervals can translate into money lost for a business, or worse social outcomes in the case of policy studies. FDID is a clear improvement over normal DID because its Forward Selection algorithm enables less biased and more precise effect-size estimates.
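1195 | 
1196 | Here is a minimal sketch of such an error-bar plot, built only from the ```Effects``` and ```Inference``` keys returned above (I leave ADID out, per the discussion):
1197 | ```python
1198 | # Point estimates and 95% bounds for each estimator
1199 | methods = ["FDID", "DID"]
1200 | atts = [HKRes[0]["FDID"]["Effects"]["ATT"], HKRes[1]["DID"]["Effects"]["ATT"]]
1201 | lbs = [HKRes[0]["FDID"]["Inference"]["95 LB"], HKRes[1]["DID"]["Inference"]["95 LB"]]
1202 | ubs = [HKRes[0]["FDID"]["Inference"]["95 UB"], HKRes[1]["DID"]["Inference"]["95 UB"]]
1203 | 
1204 | # errorbar expects distances from the point estimate, not raw bounds
1205 | yerr = [[a - l for a, l in zip(atts, lbs)], [u - a for u, a in zip(ubs, atts)]]
1206 | 
1207 | plt.figure(figsize=(8, 5))
1208 | plt.errorbar(methods, atts, yerr=yerr, fmt="o", color="black", capsize=8)
1209 | plt.ylabel("ATT")
1210 | plt.title("ATTs and 95% CIs by Method")
1211 | plt.show()
1212 | ```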
1195 |
1196 | # Replicating Ke and Hsiao 2020
1197 | Let's do the next example. This example replicates the findings of [Ke and Hsiao 2020](https://doi.org/10.1002/jae.2871), who estimated the causal impact of Hubei's COVID-19 lockdown on the economy, measured by quarterly GDP growth. Note that here I import the ```FDID``` class from my ```mlsynth``` library, which houses ```FDID``` and other counterfactual estimators. ```mlsynth``` is still under development, which is why it is not posted here; the syntax for FDID, however, remains the same.
1198 | ```python
1199 | import requests
1200 | from zipfile import ZipFile
1201 | import pandas as pd
1202 | from io import BytesIO
1203 | from mlsynth import FDID
1204 | import matplotlib.pyplot as plt
1205 | import matplotlib
1206 | # Matplotlib theme
1207 | Jared_theme = {'axes.grid': True,
1208 | 'grid.linestyle': '-',
1209 | 'legend.framealpha': 1,
1210 | 'legend.facecolor': 'white',
1211 | 'legend.shadow': True,
1212 | 'legend.fontsize': 16,
1213 | 'legend.title_fontsize': 18,
1214 | 'xtick.labelsize': 18,
1215 | 'ytick.labelsize': 18,
1216 | 'axes.labelsize': 18,
1217 | 'axes.titlesize': 20,
1218 | 'axes.facecolor': 'white',
1219 | 'figure.dpi': 300,
1220 | 'grid.color': 'black'}
1221 |
1222 | matplotlib.rcParams.update(Jared_theme)
1223 |
1224 | zip_url = "http://qed.econ.queensu.ca/jae/datasets/ke001/kh-data.zip"
1225 |
1226 | response = requests.get(zip_url)
1227 |
1228 | if response.status_code == 200:
1229 | # Extract the contents of the ZIP file
1230 | with ZipFile(BytesIO(response.content)) as zip_file:
1231 | file_name = zip_file.namelist()[0]
1232 |
1233 | df = pd.read_csv(zip_file.open(file_name), delimiter="\t", header=None)
1234 | # or, pd.read_csv('kh-data.txt', delimiter="\t", header=None)
1235 | else:
1236 | print("The zip file does not exist.")
1237 |
1238 | # Here are the province-level names (the panel column is simply called "City" below).
1239 | cities = [
1240 | "Quarter", "Beijing", "Tianjin", "Hebei", "Shanxi", "Inner Mongolia",
1241 | "Liaoning", "Jilin", "Heilongjiang", "Shanghai", "Jiangsu",
1242 | "Zhejiang", "Anhui", "Fujian", "Jiangxi", "Shandong",
1243 | "Henan", "Hubei", "Hunan", "Guangdong", "Guangxi",
1244 | "Hainan", "Chongqing", "Sichuan", "Guizhou", "Yunnan",
1245 | "Tibet", "Shaanxi", "Gansu", "Qinghai", "Ningxia",
1246 | "Xinjiang"
1247 | ]
1248 |
1249 | columns_mapping = dict(zip(df.columns, cities))
1250 | df.rename(columns=columns_mapping, inplace=True)
1251 |
1252 | df = pd.melt(df, id_vars=["Quarter"], var_name="City", value_name="GDP Growth")
1253 |
1254 | df['Quarter'] = pd.to_datetime(df['Quarter'])
1255 |
1256 | df["Lockdown"] = ((df["Quarter"] > pd.to_datetime("2019-10-01")) & (df["City"].str.contains("Hubei"))).astype(int)
1257 |
1258 | df['Time'] = df.groupby(['City']).cumcount() + 1
1259 |
1260 | # Now for our variables which we feed to the class.
1261 |
1262 | treat = "Lockdown"
1263 | outcome = "GDP Growth"
1264 | unitid = "City"
1265 | time = "Time"
1266 |
1267 | model = FDID(df=df,
1268 | unitid=unitid,
1269 | time=time,
1270 | outcome=outcome,
1271 | treat=treat,
1272 | display_graphs=False, figsize=(12, 8),
1273 | counterfactual_color='#7DF9FF')
1274 |
1275 | Hubei = model.fit()
1276 |
1277 | observed_unit = Hubei[0]["FDID"]["Vectors"]["Observed Unit"]
1278 | FDID_unit = Hubei[0]["FDID"]["Vectors"]["Counterfactual"]
1279 |
1280 | DID_unit = Hubei[1]["DID"]["Vectors"]["Counterfactual"]
1281 | treatdate = Hubei[2]["AUGDID"]["Fit"]["T0"]
1282 |
1283 |
1284 | # Plotting
1285 | plt.figure(figsize=(10, 6))
1286 | plt.plot(observed_unit,
1287 | label='Observed Hubei',
1288 | linestyle='-', color='black', linewidth=3)
1289 | plt.plot(DID_unit, label='DID Hubei',
1290 | linestyle='-', color='red', linewidth=1.5)
1291 |
1292 | plt.plot(FDID_unit, label='FDID Hubei',
1293 | linestyle='-', color='blue', linewidth=1.5)
1294 |
1295 | plt.xlabel(time)
1296 | plt.ylabel(outcome)
1297 | plt.title('Reality versus Predictions')
1298 |
1299 | plt.axvline(x=treatdate, color='black', linestyle='--',
1300 | linewidth=1, label=f'{treat}, {treatdate}')
1301 |
1302 |
1303 | plt.legend()
1304 |
1305 | plt.show()
1306 |
1307 |
1308 | ```
1309 |
1310 |
1311 |
1312 |
1313 | - FDID ATT: -691.096, Percent ATT: -7.815
1314 | - DID ATT: 447.525, Percent ATT: 5.808
1315 | - AUGDID ATT: -314.138, Percent ATT: -3.71
1316 |
1317 | ```FDID``` selects Anhui, Zhejiang, Beijing, Fujian, Henan, Hunan, Jiangsu, and Yunnan as the optimal controls. Already, we can see how ```FDID``` vastly improves upon the pre-intervention fit of standard DID: the pre-intervention RMSEs for FDID, DID, and AUGDID are respectively 102, 736, and 329. Using a control group that is much more similar to Hubei in the pre-2020 period allows the trends to match far better than a naive average plus an intercept would. We can also see that the standard PTA invoked by the DID design may not always hold and can lead to nonsensical results: here, vanilla DID suggests Hubei's lockdown _improved_ the economy, which is not a sensible position to take at all. It is also interesting to note that ```FDID```'s results are in line with some of the more flexible estimators from the ```mlsynth``` library. For example, [Robust PCA SCM](https://arxiv.org/abs/2108.12542) estimates an ATT of -708, an 8 percent negative impact on the economy. The Factor Model Approach [by Li and Sonnier](https://doi.org/10.1177/00222437221137533) finds a -664 ATT and a percentage decrease of 7.5. AUGDID, as we see above, also predicts a negative effect on Hubei's GDP.
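1318 | 
1319 | These RMSEs live in each estimator's ```Fit``` subdictionary; a quick sketch of retrieving them from the ```Hubei``` results:
1320 | ```python
1321 | # Pre-intervention RMSE for each estimator, in the order they are returned
1322 | for res, key in zip(Hubei, ["FDID", "DID", "AUGDID"]):
1323 |     print(key, res[key]["Fit"]["T0 RMSE"])
1324 | ```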
1318 |
1319 | Equally interesting (perhaps more so) is the uncertainty analysis. FDID's confidence interval is [-797.369, -584.822], DID's is [-318.476, 1213.525], and AUGDID's is [-637.527, 9.252]. I want to go further than simple null hypothesis testing; let's think about what these confidence intervals are actually saying in plain English. FDID says that the lockdown hurt the economy a lot (mostly in the short term). DID says that the lockdown on average helped the economy (again, which does not make sense!), but it is uncertain about how much, ranging from a relatively small decrease of 318 to a gigantic improvement of 1213. AUGDID's uncertainty is still mostly supportive of a negative impact as well. To visualize this, we can take the ratio of the widths of the confidence intervals for each method. We can make a plot like the one below, where we divide the widths of the DID and AUGDID confidence intervals by FDID's.
1320 | ```python
1321 | methods = ['FDID', 'DID', 'AUGDID']
1322 |
1323 | # Width Ratios
1324 | widths = [1, Hubei[1]["DID"]["Inference"]["WidthRFDID"],
1325 | Hubei[2]["AUGDID"]["Inference"]["WidthRFDID"]]
1326 |
1327 | # Plotting
1328 | fig, ax = plt.subplots()
1329 |
1330 | # Plotting bar chart
1331 | bars = ax.bar(methods, widths, capsize=5, color=['blue', 'grey', 'red'])
1332 |
1333 | # Labeling
1334 | ax.set_ylabel('Width Ratio')
1335 | ax.set_title('Uncertainty Comparison')
1336 |
1337 | # Label the heights on top of each bar
1338 | for bar in bars:
1339 | yval = bar.get_height()
1340 | plt.text(bar.get_x() + bar.get_width()/2, yval + 0.01, round(yval, 3), ha='center', va='bottom')
1341 |
1342 | plt.grid(False)
1343 |
1344 | # Show the plot
1345 | plt.show()
1346 | ```
1347 |
1348 |
1349 |
1350 |
1351 | The plot above gives the ratios, where of course FDID's is always 1. DID's confidence interval is 7.2 times wider than FDID's! AUGDID's is 3.04 times wider. In other words, classic DID is about 7 times less certain about the empirical effect than FDID. This again illustrates the value of the machine-learning extension that Forward Selection adds to DID: it does not simply make the effect size less biased, it makes the uncertainty around the effect size more precise. Had we relied on DID alone, not only would we have drawn the wrong conclusion about the effect, we would also be very uncertain about its direction and scale.
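1352 | 
1353 | As a quick check, the same ratios can be recomputed by hand from the ```Width``` entries of the ```Inference``` subdictionaries:
1354 | ```python
1355 | # Ratio of each method's CI width to FDID's
1356 | fdid_width = Hubei[0]["FDID"]["Inference"]["Width"]
1357 | for res, key in zip(Hubei[1:], ["DID", "AUGDID"]):
1358 |     print(key, res[key]["Inference"]["Width"] / fdid_width)  # ~7.2 and ~3.04
1359 | ```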
1352 |
1353 | Note, however, that FDID has use restrictions of its own. To directly quote the FDID paper,
1354 | > If an intercept adjusted treatment unit is outside the range of the control units (e.g., the treatment’s outcome has an upward trend that is steeper than that of all the control units’ outcomes), then this assumption will be violated because no subset of control units can trace the steeper upward trend of the treatment unit. In such a case, the Forward DID method should not be applied, and researchers should consider alternative methods, such as factor model based methods, modified synthetic control methods, or the augmented difference-in-differences method.
1355 |
1356 | Hubei does not have the highest or lowest GDP at any point in the time series, nor does it have the steepest trend, so we would expect FDID to be feasible here (a quick check follows below). This aside, I should emphasize that the parallel trends assumption itself does not change: we are still presuming that a pure average of certain controls, plus an intercept, predicts the counterfactual well. The key difference is that the plausibility of this assumption **depends** on the donor pool we use. In many cases there are good reasons to imagine that an interactive fixed effects model is more plausible or realistic than the two-way fixed effects model underlying (F)DID, a data generating process often used in the theoretical justification of the synthetic control method.
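1357 | 
1358 | Here is that check: a minimal sketch, reusing the long dataframe from the cleaning step above and pivoting it wide the same way the class does internally:
1359 | ```python
1360 | # Does Hubei ever leave the envelope spanned by the donor provinces?
1361 | wide = df.pivot_table(values="GDP Growth", index="Time", columns="City", sort=False)
1362 | donors = wide.drop(columns="Hubei")
1363 | outside = (wide["Hubei"] > donors.max(axis=1)) | (wide["Hubei"] < donors.min(axis=1))
1364 | print(outside.any())  # per the discussion above, this should print False
1365 | ```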
1357 |
1358 | Students of econometrics may complain that this comparison is unfair to DID, saying "Well, this is why we typically incorporate covariates into our analysis. We usually include variables that may predict the intervention and outcome, thereby allowing for a _conditional_ PTA instead of the outcome-only variant." And I agree: this is more plausible than the standard PTA. However, consider the fact that FDID attains superior fit and tighter confidence intervals in these cases without needing any predictors at all. Indeed, we can imagine cases where differential privacy must be preserved, or where obtaining covariates simply is not possible financially or practically. If all we have is outcome data, a robust counterfactual estimator that does not need predictors to work well is valuable to researchers in policy or business. This certainly does not mean that we cannot (or should not) incorporate predictors into FDID. It does mean, however, that the predictors frequently needed by DID, the standard synthetic control method, and other estimators are rendered _supplementary_ rather than necessary for proper estimation.
1359 |
1360 | # Conclusion
1361 |
1362 | It is my hope that, by making my code public, we can put more of these advanced econometric methods to use for causal analysis. The benefit is not simply that they are advanced, but that they improve the accuracy of our estimates with a few simple adjustments to standard econometric techniques.
1363 |
1364 | # Contact
1365 | - Jared Greathouse:
1366 | - Kathy Li:
1367 |
--------------------------------------------------------------------------------
/barcelona.dta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jgreathouse9/FDIDTutorial/a398e2d7be812bca54bf3163329e10072834b7a7/barcelona.dta
--------------------------------------------------------------------------------
/basque.dta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jgreathouse9/FDIDTutorial/a398e2d7be812bca54bf3163329e10072834b7a7/basque.dta
--------------------------------------------------------------------------------
/fdid.ado:
--------------------------------------------------------------------------------
1 | *!version 1.0.2 27Sep2024
2 | // Optimizes the forward selection algorithm,
3 | // thanks to Andrew Musau and Daniel Klein
4 | *! requires sdid and sdid_event
5 | *! Jared Greathouse
6 |
7 | *****************************************************
8 |
9 | * Programmer: Jared A. Greathouse
10 |
11 | * Institution: Georgia State University
12 |
13 | * Contact: j.greathouse200@gmail.com
14 |
15 | * Created on : Jul 12, 2024
16 |
17 | * Contents: 1. Purpose
18 |
19 | * 2. Program Versions
20 |
21 | *****************************************************
22 |
23 | * 1. Purpose
24 |
25 | ** Programs Forward DD from
26 | ** Li : https://doi.org/10.1287/mksc.2022.0212
27 |
28 | * 2. Program
29 |
30 | *****************************************************
31 | set more off
32 |
33 | prog define fdid, eclass
34 | qui frame
35 | loc originalframe: di r(currentframe)
36 |
37 | /**********************************************************
38 |
39 |
40 |
41 | * Preliminaries*
42 |
43 |
44 | If the data aren't a balanced panel, something about the
45 | user's dataset ain't right.
46 | **********************************************************/
47 |
48 |
49 | cap qui xtset
50 | if _rc {
51 |
52 | disp as err "The data are not xtset"
53 | exit 498
54 | }
55 |
56 | cap qui xtset
57 |
58 | cap as "`r(balanced)'"=="strongly balanced"
59 |
60 | if _rc {
61 | disp as err "The data must be strongly balanced"
62 | exit 498
63 |
64 | }
65 |
66 | cap as r(gaps)==0
67 |
68 |
69 | if _rc {
70 | disp as err "The data cannot have gaps"
71 | exit 498
72 |
73 | }
74 |
75 |
76 | loc time: disp "`r(timevar)'"
77 |
78 | loc panel: disp "`r(panelvar)'"
79 |
80 | cap which sdid_event
81 |
82 | if _rc ==111 {
83 |
84 | 	disp as err "Install sdid_event before using fdid."
85 | exit 498
86 | }
87 |
88 |
89 |
90 | marksample touse
91 |
92 | _xtstrbal `panel' `time' `touse'
93 |
94 | syntax anything [if], ///
95 | TReated(varname) /// We need a treatment variable as 0 1
96 | [gr1opts(string asis)] [gr2opts(string asis)] ///
97 | [unitnames(varlist max=1 string)] [placebo]
98 |
99 |
100 | /* if unitname specified, grab the label here */
101 | if "`unitnames'" != "" {
102 | qui frame
103 | local curframe = c(frame)
104 |
105 | tempname __dfcopy
106 | cap frame drop __dfcopy
107 |
108 | frame copy `curframe' `__dfcopy'
109 |
110 | cwf `__dfcopy'
111 |
112 |
113 | qui levelsof `panel',local(levp)
114 |
115 |
116 |
117 |
118 | /* check if var exists */
119 | capture confirm string var `unitnames'
120 | if _rc {
121 | di as err "`unitnames' does not exist as a (string) variable in dataset"
122 | exit 198
123 | }
124 | /* check if it has a value for all units */
125 | tempvar pcheck
126 | qui egen `pcheck' = sd(`panel') , by(`unitnames')
127 | qui sum `pcheck'
128 | if r(sd) != 0 {
129 | di as err "`unitnames' varies within units of `panel' - revise unitnames variable "
130 | exit 198
131 | }
132 | local clab "`panel'"
133 | tempvar index
134 | gen `index' = _n
135 | 	/* now label the pvar accordingly */
136 | foreach i in `levp' {
137 | qui su `index' if `panel' == `i', meanonly
138 | local label = `unitnames'[`r(max)']
139 | local value = `panel'[`r(max)']
140 | qui label define `clab' `value' `"`label'"', modify
141 | }
142 | label value `panel' `clab'
143 | }
144 |
145 |
146 |
147 | if "`: value label `panel''" == "" & "`unitnames'" == "" {
148 |
149 | di as err "Your panel variable NEEDS to have a value label attached to it."
150 |
151 | di as err "Either specify -unitnames- or pre-assign your panel id with string value labels."
152 |
153 | exit 198
154 | }
155 |
156 |
157 | tempvar touse
158 | mark `touse' `if' `in'
159 |
160 | if (length("`if'")+length("`in'")>0) {
161 |
162 | qui keep if `touse'
163 | }
164 |
165 |
166 | gettoken depvar anything: anything
167 |
168 | unab depvar: `depvar'
169 |
170 | local y_lab: variable lab `depvar'
171 |
172 | loc outlab "`y_lab'" // Grabs the label of our outcome variable
173 |
174 | local tr_lab: variable lab `treated'
175 |
176 | qui levelsof `panel' if `treated'==1, loc(trunit)
177 |
178 | local nwords : word count `trunit'
179 |
180 | if `nwords' > 1 {
181 | matrix empty_matrix = J(1, 7, .) // Initialize an empty matrix with 0 rows and 7 columns
182 |
183 | matrix combined_matrix = empty_matrix // Initialize combined_matrix as empty
184 | }
185 |
186 |
187 | foreach x of loc trunit {
188 |
189 | local curframe = c(frame)
190 |
191 | if "`curframe'" != "`__dfcopy'" {
192 |
193 | tempname __dfcopy
194 | frame copy `curframe' `__dfcopy'
195 | }
196 | frame `__dfcopy' {
197 |
198 |
199 | loc trunitstr: display "`: label (`panel') `x''"
200 |
201 | if "`cfframename'" == "" {
202 |
203 | cap frame drop fdid_cfframe`x'
204 |
205 | loc defname fdid_cfframe`x'
206 | }
207 |
208 | else if `nwords' > 1 {
209 |
210 | loc defname `cfframename'`x'
211 | }
212 |
213 | else {
214 |
215 | loc defname `cfframename'
216 | }
217 |
218 | numcheck, unit(`panel') ///
219 | time(`time') ///
220 | transform(`transform') ///
221 | depvar(`depvar') /// Routine 1
222 | treated(`treated') cfframe(`defname') trnum(`x')
223 |
224 |
225 | // Routine 2
226 |
227 | treatprocess, time(`time') ///
228 | unit(`panel') ///
229 | treated(`treated') trnum(`x')
230 |
231 | loc trdate = e(interdate)
232 |
233 |
234 | /**********************************************************
235 |
236 |
237 |
238 | * Estimation*
239 |
240 |
241 | This is where we do estimation if the dataset above passes.
242 | **********************************************************/
243 |
244 | est_dd, time(`time') ///
245 | interdate(`trdate') ///
246 | intname(`tr_lab') ///
247 | outlab(`outlab') ///
248 | gr1opts(`gr1opts') ///
249 | gr2opts(`gr2opts') ///
250 | treatst(`trunitstr') ///
251 | panel(`panel') ///
252 | outcome(`depvar') ///
253 | trnum(`x') treatment(`treated') ///
254 | cfframe(`defname') ntr(`nwords') copyname(`__dfcopy') placebo(`placebo')
255 |
256 | if `nwords' > 1 {
257 | matrix resmat = e(results)
258 | matrix combined_matrix = combined_matrix \ resmat
259 | }
260 |
261 | }
262 |
263 | if `nwords'==1 {
264 |
265 |
266 | mat series = e(series)
267 | ereturn mat series= series
268 | }
269 | }
270 |
271 | if `nwords' > 1 {
272 |
273 | * Create a new macro to store the filtered elements
274 | local new_color
275 |
276 | qui frame dir
277 |
278 | loc color `r(frames)'
279 |
280 | * Loop through each element in the `color` macro and check if it contains "blue"
281 | foreach col of local color {
282 | if strpos("`col'", "fdid_cfframe") {
283 | local new_color `new_color' `col'
284 | }
285 | }
286 |
287 | loc nmulti: word count `new_color'
288 |
289 | loc firstframe: word 1 of `new_color'
290 |
291 | cwf `firstframe'
292 |
293 | forv i = 2/`nmulti' {
294 |
295 | loc change: word `i' of `new_color'
296 | qui frlink 1:1 `time', frame(`change')
297 |
298 | qui frget *, from(`change')
299 | frame drop `change'
300 | }
301 |
302 | frame put *, into(wideframe)
303 |
304 | frame put *, into(multiframe)
305 |
306 | frame multiframe {
307 | qui reshape long `depvar' cf te eventtime cfdd ymeandid ymeanfdid ddte, i(`time') j(`panel')
308 | sort `panel' `time'
309 |
310 | tempname cohort
311 |
312 | qbys `panel': egen `cohort' = min(`time') if eventtime==0
313 |
314 | qbys `panel': egen cohort = max(`cohort')
315 | qui drop fdid*
316 |
317 | qui xtset
318 |
319 | qui replace eventtime = eventtime / r(tdelta)
320 |
321 | qui g residsq = te^2 if eventtime < 0
322 |
323 | qbys cohort: egen totpost = max(eventtime+1) // Total Post by Cohort T_post^a
324 |
325 | qbys cohort: egen totpre = max(abs(eventtime)) if eventtime < 0 // T_pre_a
326 |
327 | egen max_totpost = max(totpost)
328 |
329 |
330 | qbys cohort: egen ATT = mean(te) if eventtime >=0 // Cohort ATT, tau_a (totpost/max_totpost)*
331 |
332 | tempvar tag
333 |
334 | egen `tag' = tag(id)
335 |
336 | egen NCohort = total(`tag'), by(cohort) // Number of Units per Cohort
337 |
338 | // By Cohort, Mean of Residuals Squared
339 |
340 | qbys cohort: egen CohortMSE = mean(residsq) if eventtime < 0 // Cohort Pre-Intervention MSE
341 |
342 | // Puts all of these into a frame
343 |
344 | frame put cohort ATT NCohort totpost totpre CohortMSE max_totpost, into(EffectFrame)
345 |
346 | cwf EffectFrame
347 |
348 | collapse (firstnm) ATT NCohort totpost totpre CohortMSE max_totpost, by(cohort)
349 |
350 |
351 | // Standard Error of the Cohort ATT
352 |
353 | qbys cohort: g SECohort = sqrt((totpost/totpre)*CohortMSE+CohortMSE)/sqrt(totpost)
354 |
355 | qbys cohort: g tstat = abs(ATT/SECohort)
356 |
357 | qbys cohort: g p_value = 2 * (1 - normal(tstat))
358 |
359 | qbys cohort: g LB = ATT - (invnormal(0.975) * SECohort)
360 |
361 | qbys cohort: g UB = ATT + (invnormal(0.975) * SECohort)
362 | ereturn clear
363 | tempname SA
364 |
365 | qui mkmat cohort ATT NCohort, mat(`SA') rowpre("Cohort ") // tstat p_value LB UB
366 |
367 |
368 | ereturn mat results= `SA'
369 |
370 |
371 | }
372 | cwf multiframe
373 | drop residsq-CohortMSE
374 | frame drop `firstframe'
375 | frame drop EffectFrame
376 | frame drop wideframe
377 | }
378 | cwf `originalframe'
379 |
380 | cap frame drop `defname'
381 | cap frame drop multiframe
382 | return clear
383 | end
384 |
385 | /**********************************************************
386 |
387 | *Section 1: Data Setup
388 |
389 | **********************************************************/
390 |
391 | prog numcheck, eclass
392 | // Original Data checking
393 | syntax, ///
394 | unit(varname) ///
395 | time(varname) ///
396 | depvar(varname) ///
397 | [transform(string)] ///
398 | treated(varname) cfframe(string) trnum(numlist min=1 max=1 >=1 int)
399 |
400 |
401 | /*#########################################################
402 |
403 | * Section 1.1: Extract panel vars
404 |
405 | Before DD can be done, we need panel data.
406 |
407 | a) Numeric
408 | b) Non-missing and
409 | c) Non-Constant
410 |
411 | *########################################################*/
412 |
413 | /*
414 | di as txt "{hline}"
415 | di as txt "Forward Difference in Differences"
416 | di as txt "{hline}"
417 | */
418 |
419 |
420 | /*The panel should be balanced, but in case it isn't somehow, we drop any variable
421 | without the maximum number of observations (unbalanced) */
422 |
423 |
424 | foreach v of var `unit' `time' `depvar' {
425 | cap {
426 | conf numeric v `v', ex // Numeric?
427 |
428 | as !mi(`v') // Not missing?
429 |
430 | qui: su `v'
431 |
432 | as r(sd) ~= 0 // Does the unit ID change?
433 | }
434 | }
435 |
436 | if _rc {
437 |
438 |
439 |
440 | disp as err "All variables `unit' (ID), `time' (Time) and `depvar' must be numeric, not missing and non-constant."
441 | exit 498
442 | }
443 |
444 |
445 |
446 | frame put `time' if `unit' == `trnum', into(`cfframe')
447 |
448 | end
449 |
450 |
451 | prog treatprocess, eclass
452 |
453 | syntax, time(varname) unit(varname) treated(varname) trnum(numlist min=1 max=1 >=1 int)
454 |
455 | /*#########################################################
456 |
457 | * Section 1.2: Check Treatment Variable
458 |
459 | Before DD can be done, we need a treatment variable.
460 |
461 |
462 | The treatment enters at a given time and never leaves.
463 | *########################################################*/
464 |
465 |
466 | qui xtset
467 | loc time_format: di r(tsfmt)
468 |
469 | qui su `time' if `treated' ==1 & `unit'==`trnum'
470 |
471 | loc last_date = r(max)
472 | loc interdate = r(min)
473 |
474 | qui su `unit' if `treated'==1
475 |
476 | loc treated_unit = r(mean)
477 |
478 | qui insp `time' if `treated' ~= 1 & `unit'==`treated_unit'
479 |
480 | loc npp = r(N)
481 |
482 |
483 | if !_rc {
484 |
485 | su `unit' if `treated' ==1, mean
486 |
487 | loc clab: label (`unit') `treated_unit'
488 | loc adidtreat_lab: disp "`clab'"
489 |
490 |
491 | qui: levelsof `unit' if `treated' == 0 & `time' > `interdate', l(labs)
492 |
493 | local lab : value label `unit'
494 |
495 | foreach l of local labs {
496 | local all `all' `: label `lab' `l'',
497 | }
498 |
499 | loc controls: display "`all'"
500 |
501 | //display "Treatment is measured from " `time_format' `interdate' " to " `time_format' `last_date' " (`npp' pre-periods)"
502 |
503 |
504 | qui su `unit' if `treated' == 0
505 |
506 | loc dp_num = r(N) - 1
507 |
508 | cap as `dp_num' >= 2
509 | if _rc {
510 |
511 | di in red "You need at least 2 donors for every treated unit"
512 | exit 489
513 | }
514 | //di as res "{hline}"
515 |
516 | }
517 |
518 | ereturn loc interdate = `interdate'
519 |
520 |
521 | end
522 |
523 |
524 |
525 | prog est_dd, eclass
526 |
527 | syntax, ///
528 | time(varname) ///
529 | interdate(numlist min=1 max=1 >=1 int) ///
530 | [intname(string)] ///
531 | [outlab(string)] ///
532 | [gr1opts(string asis)] ///
533 | [gr2opts(string asis)] ///
534 | treatst(string asis) ///
535 | panel(string asis) ///
536 | outcome(string asis) ///
537 | trnum(numlist min=1 max=1 >=1 int) ///
538 | treatment(string) [outlab(string asis)] ///
539 | cfframe(string) ntr(numlist min=1 max=1 >=1 int) copyname(string asis) [placebo(string)]
540 |
541 |
542 | local curframe = c(frame)
543 |
544 | tempname __reshape
545 |
546 | frame copy `curframe' `__reshape'
547 |
548 | cwf `__reshape'
549 |
550 |
551 | qbys `panel': egen et = max(`treatment')
552 |
553 | qui keep if et ==0 | `panel'==`trnum'
554 |
555 |
556 | qui keep `panel' `time' `outcome' `treatment'
557 |
558 | if "`placebo'" == "" {
559 |
560 |
561 | qui xtdidregress (`outcome') (`treatment'), group(`panel') time(`time')
562 |
563 | loc DDLB = r(table)[5,1]
564 | loc DDUB = r(table)[6,1]
565 | loc DDATT = r(table)[1,1]
566 | loc DDT= r(table)[3,1]
567 | loc ddse = r(table)[1,1]/`DDT'
568 | drop `treatment'
569 |
570 | }
571 |
572 | else {
573 |
574 | qui sdid_event `outcome' `panel' `time' `treatment', method(did) brep(500) placebo(all)
575 |
576 | loc ddse = e(H)[1,2]
577 | loc DDATT = e(H)[1,1]
578 |
579 | loc DDLB = e(H)[1,3]
580 |
581 |
582 | loc DDUB = e(H)[1,4]
583 | loc DDT= abs(`DDATT'/`ddse')
584 |
585 | drop `treatment'
586 | }
587 |
588 |
589 | qui reshape wide `outcome', j(`panel') i(`time')
590 |
591 | qui: tsset `time'
592 | loc time_format: di r(tsfmt)
593 |
594 |
595 | format `time' `time_format'
596 |
597 | order `outcome'`trnum', a(`time')
598 |
599 | qui ds
600 |
601 | loc temp: word 1 of `r(varlist)'
602 |
603 | loc time: disp "`temp'"
604 |
605 | loc t: word 2 of `r(varlist)'
606 |
607 | loc treated_unit: disp "`t'"
608 |
609 | local strings `r(varlist)'
610 |
611 |
612 | local trtime `treated_unit' `time'
613 |
614 | local predictors: list strings- trtime
615 |
616 | loc N0: word count `predictors'
617 |
618 | // Set U is a now empty set. It denotes the order of all units whose values,
619 | // when added to DID maximize the pre-period r-squared.
620 |
621 | local U
622 |
623 | // We use the mean of the y control units as our predictor in DID regression.
624 | // cfp= counterfactual predictions, used to calculate the R2/fit metrics.
625 |
626 | tempvar ym cfp ymeandid
627 |
628 |
629 | // Here is the placeholder for max r2.
630 | tempname max_r2
631 |
632 |
633 | // Forward Selection Algorithm ...
634 |
635 | qui summarize `treated_unit' if `time' < `interdate'
636 | local mean_observed = r(mean)
637 |
638 | * Calculate the Total Sum of Squares (TSS)
639 | qui generate double tss = (`treated_unit' - `mean_observed')^2 if `time' < `interdate'
640 | qui summarize tss if `time' < `interdate'
641 | local TSS = r(sum)
642 |
643 |
644 | egen `ymeandid'= rowmean(`predictors')
645 |
646 | constraint 1 `ymeandid' = 1
647 |
648 |
649 | qui cnsreg `treated_unit' `ymeandid' if `time' < `interdate' , constraint(1)
650 |
651 | // We predict our counterfactual
652 |
653 | qui predict cfdd`trnum'
654 |
655 | // Now we calculate the pre-intervention R2 statistic.
656 |
657 |
658 | * Calculate the Residual Sum of Squares (RSS)
659 | qui generate double rss = (`treated_unit' - cfdd`trnum')^2 if `time' < `interdate'
660 | qui summarize rss if `time' < `interdate'
661 | local RSS = r(sum)
662 |
663 | clonevar ymeandid`trnum' = `ymeandid'
664 |
665 | scalar DDr2 = 1 - (`RSS' / `TSS')
666 |
667 | // Forward Selection Algorithm ...
668 |
669 | local r2max 0
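670 | // Each pass of the while loop adds to U the one remaining donor whose
671 | // inclusion in the control-group average maximizes the pre-period R2.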
670 | while ("`predictors'" != "") {
671 |
672 | scalar `max_r2' = -99999999999
673 |
674 |
675 | foreach var of local predictors {
676 |
677 | // Drops these, as we need them for each R2 calculation
678 |
679 | cap drop rss
680 |
681 | {
682 |
683 | // We take the mean of each element of set U and each new predictor.
684 |
685 |
686 | egen `ym' = rowmean(`U' `var')
687 |
688 | // The coefficient for the control average has to be 1.
689 |
690 | constraint 1 `ym' = 1
691 |
692 |
693 | qui cnsreg `treated_unit' `ym' if `time' < `interdate' , constraint(1)
694 |
695 | // We predict our counterfactual
696 |
697 | qui predict `cfp' if e(sample)
698 |
699 | // Now we calculate the pre-intervention R2 statistic.
700 |
701 |
702 | * Calculate the Residual Sum of Squares (RSS)
703 | qui generate double rss = (`treated_unit' - `cfp')^2 if e(sample)
704 | qui summarize rss
705 | local RSS = r(sum)
706 |
707 |
708 | loc r2 = 1 - (`RSS' / `TSS')
709 |
710 | if `r2' > scalar(`max_r2') {
711 |
712 |
713 | scalar `max_r2' = `r2'
714 | local new_U `var'
715 |
716 |
717 | }
718 |
719 |
720 |
721 | // Here we determine which unit's values maximize the r2.
722 |
723 | drop `ym'
724 | drop `cfp'
725 |
726 | // We get rid of these now as they've served their purpose.
727 |
728 | }
729 |
730 | }
731 |
732 |
733 | local U `U' `new_U' // We add the newly selected unit to U.
734 |
735 | if scalar(`max_r2')>`r2max'{
736 | local r2max= scalar(`max_r2')
737 | local best_model = "`U'"
738 | }
739 |
740 | // and get rid of it from the predictors list.
741 |
742 | local predictors : list predictors - new_U
743 |
744 | }
745 |
746 |
747 | cwf `cfframe'
748 |
749 | //di "`best_model'"
750 |
751 |
752 | qui frlink 1:1 `time', frame(`__reshape')
753 |
754 | qui frget `treated_unit' `best_model' cfdd`trnum' ymeandid, from(`__reshape') //
755 | tempname longframeone
756 | frame put `time' `treated_unit' `best_model', into(`longframeone')
757 |
758 | frame `longframeone' {
759 | qui reshape long `outcome', i(`time') j(id)
760 | qui g treat = cond(id==`trnum'& `time'>=`interdate',1,0)
761 |
762 | qui su treat if treat==1
763 |
764 | loc post = r(N)
765 |
766 | qui xtset id `time'
767 |
768 | qui sdid_event `outcome' id `time' treat, method(did) brep(500) placebo(all)
769 | if "`placebo'" == "placebo" {
770 | loc FDDLB = e(H)[1,3]
771 | loc FDDUB = e(H)[1,4]
772 | }
773 | loc plase= e(H)[1,2]
774 | local row `= rowsof(e(H))'
775 |
776 |
777 | mat res = e(H)[2..`row',1..4]
778 |
779 | tempname newframe2
780 | mkf `newframe2'
781 | cwf `newframe2'
782 |
783 | qui svmat res
784 |
785 | qui rename (res1 res2 res3 res4) (eff se lb ub)
786 |
787 | g t = eff/se, a(se)
788 |
789 |
790 | gen eventtime = _n - 1 if !missing(eff)
791 | qui replace eventtime = `post' - _n if _n > `post' & !missing(eff)
792 | sort eventtime
793 |
794 |
795 | qui mkmat *, mat(longframe)
796 |
797 | }
798 |
799 | //di as txt "{hline}"
800 |
801 | egen ymeanfdid = rowmean(`best_model')
802 |
803 |
804 | // And estimate DID again.
805 | * Define the constraint
806 | constraint define 1 ymeanfdid = 1
807 |
808 | * Run the constrained regression
809 | qui cnsreg `treated_unit' ymeanfdid if `time' < `interdate', constraints(1)
810 | loc RMSE = e(rmse)
811 |
812 | qui predict cf`trnum'
813 |
814 | * Generate residuals
815 | qui gen residual = `treated_unit' - cf`trnum' if `time' < `interdate'
816 |
817 | * Calculate SS_res
818 | qui gen ss_res = residual^2 if `time' < `interdate'
819 | qui sum ss_res
820 | scalar SS_res = r(sum)
821 |
822 | * Calculate SS_tot
823 | qui sum `treated_unit' if `time' < `interdate'
824 | scalar meantr = r(mean)
825 | qui gen ss_tot = (`treated_unit' - meantr)^2 if `time' < `interdate'
826 | qui sum ss_tot if `time' < `interdate'
827 | scalar SS_tot = r(sum)
828 |
829 | * Calculate R2 manually
830 | scalar r2 = 1 - SS_res / SS_tot
831 |
832 | // Now we calculate our ATT
833 |
834 | qui su `treated_unit' if `time' >= `interdate'
835 |
836 | loc yobs = r(mean)
837 |
838 | * Here is the plot
839 |
840 | set tr off
841 |
842 | if ("`gr1opts'" ~= "") {
843 |
844 | if "`outlab'"=="" {
845 |
846 | loc outlab `outcome'
847 |
848 | }
849 |
850 |
851 | local fitname = "fit" + "`treatst'"
852 |
853 | local fitname_cleaned = subinstr("`fitname'", " ", "", .)
854 |
855 | // Define the string
856 | local myString `"`gr1opts'"'
857 |
858 | // Check if the word "name" is in the string
859 | local contains_name = strpos(`"`myString'"', "name")
860 |
861 | // Return 1 if the string contains the word "name", otherwise return 0
862 | local namecont = cond(`contains_name' > 0, 1, 0)
863 | di `namecont'
864 | cap as `namecont'==1
865 |
866 | if _rc != 0 {
867 |
868 |
869 | local fitname = "fit" + "`treatst'"
870 |
871 | local fitname_cleaned = subinstr("`fitname'", " ", "", .)
872 |
873 | loc fitname_cleaned: di ustrregexra("`fitname_cleaned'", "[?@#!{}%()]", "")
874 |
875 |
876 | loc grname name(`fitname_cleaned', replace)
877 |
878 | }
879 |
880 | twoway (line `treated_unit' `time', ///
881 | lcolor(black) lwidth(medthick)) ///
882 | (connected cf`trnum' `time', ///
883 | mcolor(gs11) msize(small) msymbol(smsquare) lcolor(gs11) lpattern(solid) lwidth(thin)) ///
884 | (connected cfdd`trnum' `time', ///
885 | mcolor(black) msize(small) msymbol(smcircle) lcolor(black) lwidth(thin)), ///
886 | ylabel(#5, grid glwidth(vthin) glcolor(gs4%20) glpattern(dash)) ///
887 | xline(`interdate', lwidth(medium) lpattern(solid) lcolor(black)) ///
888 | xlabel(#10, grid glwidth(vthin) glcolor(gs4%20) glpattern(dash)) ///
889 | legend(region(lcolor(none)) order(1 "Observed" 2 "FDID" 3 "DD") cols(1) position(3)) ///
890 | xsize(7.5) ///
891 | ysize(4.5) ///
892 | graphregion(fcolor(white) lcolor(white) ifcolor(white) ilcolor(white)) ///
893 | plotregion(fcolor(white) lcolor(white) ifcolor(white) ilcolor(white)) ///
894 | yti("`treatst' `outlab'") `grname' `gr1opts'
895 |
896 | }
897 |
898 |
899 |
900 | lab var cf`trnum' "FDID `treatst'"
901 | lab var `treated_unit'"Observed `treatst'"
902 |
903 | frame `copyname' {
904 |
905 | local n = ustrregexra("`best_model'","\D"," ")
906 |
907 | loc selected ""
908 |
909 | local nwords : word count `n'
910 |
911 |
912 | // We see which units were selected
913 |
914 | * Loop through each word in the macro `n`
915 | forv i = 1/`nwords' {
916 |
917 | local current_word : word `i' of `n'
918 |
919 | * Extract the ith word from the macro `n`
920 | local units: display "`: label (`panel') `current_word''"
921 |
922 | local selected `selected' `: label (`panel') `current_word'',
923 |
924 | loc controls: display "`selected'"
925 |
926 |
927 | }
928 |
929 | frame `cfframe': qui note: The selected units are "`controls'"
930 |
931 | }
932 |
933 |
934 | // see prop 2.1 of Li
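935 | // SE = sqrt((T2/T1)*MSE_pre + MSE_pre)/sqrt(T2): o1hat and o2hat below
936 | // are the two variance pieces built from the squared pre-period
937 | // residuals, and ohat is their square root.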
935 |
936 |
937 | qui su `time' if `time' >=`interdate', d
938 | tempname t2
939 | scalar `t2' = r(N)
940 |
941 | qui su `time' if `time' <`interdate', d
942 | tempname t1
943 | scalar `t1' = r(N)
944 |
945 |
946 | g te`trnum' = `treated_unit' - cf`trnum'
947 |
948 | g ddte`trnum' =`treated_unit'-cfdd`trnum'
949 |
950 | lab var te`trnum' "Pointwise Treatment Effect"
951 | lab var ddte`trnum' "Pointwise DD Treatment Effect"
952 |
953 | qui g eventtime`trnum' = `time'-`interdate'
954 | tempvar residsq
955 |
956 | qui g `residsq' = te`trnum'^2 if eventtime`trnum' <0
957 |
958 | qui su if eventtime`trnum' <0
959 | scalar t1 = r(N)
960 |
961 | qui su eventtime`trnum' if eventtime`trnum' >=0
962 | scalar t2 = r(N)
963 |
964 | qui su `residsq', mean
965 | scalar o1hat=(scalar(t2) / scalar(t1))*(r(mean))
966 |
967 |
968 | qui su `residsq', mean
969 | scalar o2hat = (r(mean))
970 |
971 | scalar ohat = sqrt(scalar(o1hat) + scalar(o2hat))
972 |
973 | qui su te`trnum' if `time' >= `interdate'
974 |
975 |
976 | scalar ATT = r(mean)
977 |
978 | qui su cf`trnum' if eventtime`trnum' >=0
979 |
980 | scalar pATT = 100*scalar(ATT)/r(mean)
981 |
982 | if "`placebo'"== "placebo" {
983 | scalar SE = `plase'
984 | scalar CILB = `DDLB'
985 | scalar CIUB = `DDUB'
986 |
987 | }
988 |
989 | else {
990 |
991 | scalar SE = scalar(ohat)/sqrt(scalar(t2))
992 |
993 |
994 |
995 | loc FDDLB = scalar(ATT) - (((invnormal(0.975) * scalar(SE))))
996 |
997 | loc FDDUB = scalar(ATT) + (((invnormal(0.975) * scalar(SE))))
998 | }
999 | qui su cfdd`trnum' if eventtime`trnum' >=0
1000 |
1001 | scalar pATTDD = 100*`DDATT'/r(mean)
1002 |
1003 | qui rename (ymeandid ymeanfdid) (ymeandid`trnum' ymeanfdid`trnum')
1004 |
1005 | loc rmseround: di %9.5f `RMSE'
1006 | tempvar time2 coef se
1007 |
1008 | if ("`gr2opts'" ~= "") {
1009 |
1010 |
1011 | // Define the string
1012 | local myString "`gr2opts'"
1013 |
1014 | // Check if the word "name" is in the string
1015 | local contains_name = strpos("`myString'", "name")
1016 |
1017 | // Return 1 if the string contains the word "name", otherwise return 0
1018 | local namecont = cond(`contains_name' > 0, 1, 0)
1019 |
1020 |
1021 | cap as `namecont'==1
1022 |
1023 | if _rc != 0 {
1024 |
1025 |
1026 | local fitname = "te_" + "`treatst'"
1027 |
1028 | local fitname_cleaned = subinstr("`fitname'", " ", "", .)
1029 |
1030 | loc fitname_cleaned: di ustrregexra("`fitname_cleaned'", "[?@#!{}%()]", "")
1031 |
1032 |
1033 | loc grname name(`fitname_cleaned', replace)
1034 | }
1035 |
1036 | frame `newframe2' {
1037 |
1038 | twoway (rcap lb ub eventtime, fcolor(gs7%50) lcolor(gs7%50)) ///
1039 | (scatter eff eventtime, mc(black) ms(d) msize(*.5)), ///
1040 | ylabel(#5, grid glwidth(vthin) glcolor(gs4%20) glpattern(dash)) ///
1041 | xline(0, lwidth(medium) lpattern(solid) lcolor(black)) ///
1042 | yline(0, lwidth(medium) lpattern(solid) lcolor(black)) ///
1043 | xlabel(#10, grid glwidth(vthin) glcolor(gs4%20) glpattern(dash)) ///
1044 | legend(region(lcolor(none)) order(1 "95% CI" 2 "Treatment Effect") cols(1) position(3)) ///
1045 | xsize(7.5) ///
1046 | ysize(4.5) ///
1047 | graphregion(fcolor(white) lcolor(white) ifcolor(white) ilcolor(white)) ///
1048 | plotregion(fcolor(white) lcolor(white) ifcolor(white) ilcolor(white)) ///
1049 | yti("{&tau} (treatment effect)") xti(t-`=ustrunescape("\u2113")' until Treatment) `grname' `gr2opts'
1050 | }
1051 | }
1052 | qui keep eventtime`trnum' `time' `treated_unit' cf`trnum' cfdd`trnum' te`trnum' ddte`trnum' ymeanfdid`trnum' ymeandid`trnum' //ci_top`trnum' ci_bottom`trnum'
1053 |
1054 | qui mkmat *, mat(series)
1055 |
1056 | scalar tstat = abs(scalar(ATT)/(scalar(SE)))
1057 | qui su ddte`trnum' if eventtime`trnum' < 0, mean
1058 |
1059 | loc DDRMSE = sqrt(r(mean))
1060 |
1061 | tempname my_matrix my_matrix2
1062 | matrix `my_matrix' = (scalar(ATT), scalar(pATT), scalar(SE), scalar(tstat), `FDDLB', `FDDUB', scalar(r2))
1063 | matrix `my_matrix2' = (`DDATT', round(scalar(pATTDD), 0.0001), `ddse', abs(`DDT'), `DDLB', `DDUB', round(scalar(DDr2), 0.0001))
1064 | matrix colnames `my_matrix' = ATT PATT SE t LB UB R2
1065 |
1066 | matrix colnames `my_matrix2' = ATT PATT SE t LB UB R2
1067 |
1068 | ereturn clear
1069 |
1070 | matrix b=scalar(ATT)
1071 | matrix V=scalar(SE)^2
1072 | matrix colnames b=`treatment'
1073 | matrix rownames b=`outcome'
1074 | matrix colnames V=`treatment'
1075 | matrix rownames V=`treatment'
1076 |
1077 | ereturn post b V, depname(`outcome')
1078 |
1079 |
1080 |
1081 | matrix rownames `my_matrix' = FDID
1082 | matrix rownames `my_matrix2' = DID
1083 | ereturn mat dyneff = longframe
1084 | ereturn loc U "`controls'"
1085 |
1086 | tempname resmat
1087 | mat `resmat' = `my_matrix' \ `my_matrix2'
1088 | ereturn mat results = `resmat'
1089 |
1090 | ereturn mat series = series
1091 |
1092 | * Store in N0U the number of controls selected by FDID (words in best_model)
1093 | local N0U: word count `best_model'
1094 |
1095 | tempname setting
1096 |
1097 | mat `setting' = (`N0',`N0U',`t1',`interdate',`t2',`t1'+`t2')
1098 |
1099 | mat rownames `setting' = "Setting"
1100 | mat colnames `setting' = "Total Controls" "Selected Controls" "Pre Periods" "Treatment Point" "Post Periods" "Total Time Periods"
1101 |
1102 | ereturn mat setting = `setting'
1103 |
1104 |
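* Two-sided p-value for the ATT under the standard-normal approximation.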
1105 | scalar p_value = 2 * (1 - normal(scalar(tstat)))
1106 |
1107 | local tabletitle "Forward Difference-in-Differences "
1108 |
1109 | if `ntr' == 1 {
1110 |
1111 | di ""
1112 | di ""
1113 |
1114 | di as res "`tabletitle'{c |} " " " "T0 R2: " %5.3f scalar(r2) " T0 RMSE: " `rmseround'
1115 | di as text "{hline 13}{c TT}{hline 63}"
1116 | di as text %12s abbrev("`outcome'",12) " {c |} ATT Std. Err. t P>|t| [95% Conf. Interval]"
1117 | di as text "{hline 13}{c +}{hline 63}"
1118 | di as text %12s abbrev("`treatment'",12) " {c |} " as result scalar(ATT) " " scalar(SE) %9.2f scalar(tstat) %9.3f scalar(p_value) " " `FDDLB' " " `FDDUB'
1119 | di as text "{hline 13}{c BT}{hline 63}"
1120 | * Display the footer information
1121 | di as text "Treated Unit: `treatst'"
1122 | di as res "FDID selects `controls' as the optimal donors."
1123 | di as text "See Li (2024) for technical details."
1124 |
1125 |
1126 | }
1127 | end
1128 |
--------------------------------------------------------------------------------
/fdid.pkg:
--------------------------------------------------------------------------------
1 | v 3
2 | d fdid: Stata module to estimate forward difference in differences for causal impact estimation.
3 |
4 | d {bf:Jared Greathouse, Andrew Young School of Policy Studies, Georgia State University.}
5 | d
6 | d {cmd:fdid} estimates treatment effects using the method documented in "Li, Kathleen T. (2024).
7 | d Frontiers: A simple forward difference-in-differences method. Marketing Science, 43(2), 267-279.
8 | d {browse "https://doi.org/10.1287/mksc.2022.0212"}".
9 | d
10 | d KW: difference in differences
11 | d KW: causal inference
12 | d KW: machine learning
13 | d
14 | d Requires: Stata version 16
15 | d
16 | d Distribution-Date: 20240714
17 | d
18 | d Author: Jared Greathouse
19 | d Support: email jgreathouse3@student.gsu.edu
20 | d
21 | f fdid.ado
22 | f fdid.sthlp
23 | f hcw.dta
24 | f basque.dta
25 | f barcelona.dta
26 | f smoking.dta
27 | f turnout.dta
28 | f fdid_test.do
29 |
--------------------------------------------------------------------------------
/fdid.sthlp:
--------------------------------------------------------------------------------
1 | {smcl}
2 | {* *! version 1.0.0 22jul2024}{...}
3 | {viewerjumpto "Syntax" "fdid##syntax"}{...}
4 | {viewerjumpto "Description" "examplehelpfile##description"}{...}
5 | {viewerjumpto "Options" "fdid##options"}{...}
6 | {viewerjumpto "Examples" "fdid##examples"}{...}
7 | {title:Forward Difference in Differences}
8 |
9 | {phang}
10 | {bf:fdid} {hline 2} Estimates Forward Difference-in-Differences.
11 |
12 |
13 | {marker linkspdf}{...}
14 | {title:Links to Online Documentation}
15 |
16 | For a more extended walkthrough, see the {browse "https://github.com/jgreathouse9/FDIDTutorial/blob/main/StataVignette.md":vignette}.
17 |
18 | {pstd}
19 |
20 |
21 | {marker syntax}{...}
22 | {title:Syntax}
23 |
24 | {p 8 17 2}
25 | {cmdab:fdid}
26 | [{depvar}]
27 | {ifin},
28 | {opt tr:eated}({varname}) [{opt gr1opts}({it:string}) {opt gr2opts}({it:string}) {cmd: unitnames}({it:{varname}}) {opt placebo}]
29 |
30 |
31 | {synoptset 20 tabbed}{...}
32 |
33 | {dlgtab:Requirements}
34 |
35 | {p2colreset}{...}
36 |
37 | {p 4 4 2}{helpb xtset} {it:panelvar} {it:timevar} must be used to declare a strongly balanced panel dataset without gaps. {cmd:sdid_event} is also {browse "https://github.com/DiegoCiccia/sdid/tree/main/sdid_event#github":required}.
38 |
39 | {p 4 4 2}
40 | {depvar} The numeric outcome of interest.{p_end}
41 |
42 | {p 4 4 2}
43 | {cmd: treated} Our treatment variable. It must be a 0/1 dummy equal to 1 from the treatment date onward. Factor notation may NOT be used. {cmd:fdid} supports settings
44 | with multiple treated units that have different treatment dates.
45 | Note that we assume the treatment is fully absorbed (once treated, always treated). {p_end}
46 | {synoptline}
47 | {p2colreset}{...}
48 | {p 4 6 2}
49 |
50 | {marker description}{...}
51 | {title:Description}
52 |
53 | {p 4 4 2}
54 | {cmd:fdid} estimates the average treatment effect on the treated unit
55 | as proposed by {browse "https://doi.org/10.1287/mksc.2022.0212": Li (2024)}. {cmd:fdid} selects the optimal pool
56 | of control units via forward selection. Below, we describe the selection algorithm for {cmd:fdid}. {p_end}
57 |
58 | {p 4 4 2}
59 | We begin with no selected control units. First, we regress the treated unit's pre-intervention outcome
60 | vector on each control unit in a single-predictor OLS model and select the control with the highest
61 | R-squared statistic. Next, we average the first selected control with each of the remaining N0-1 controls
62 | and, of these N0-1 two-unit models, select the one with the highest R-squared statistic. We continue
63 | adding the next-best control in this manner until we have as many DID models as we have control units.
64 | At each step, the newly selected unit is placed at the front of the ordered list of selected controls,
65 | so the control group is always sorted by each unit's R-squared contribution. {p_end}
66 |
67 | {p 4 4 2}
68 | Next, we select the optimal DID model by iteratively adding the selected controls in this order.
69 | Since the algorithm has already sorted the controls by their R-squared contributions, we track the highest R-squared statistic seen so far.
70 | For example, suppose the model using only the first selected unit has an R-squared of 0.50, and adding the next selected unit raises it to 0.60. We prefer the two-unit model
71 | because its R-squared statistic is higher than that of the one-unit model. Whenever adding a unit increases the R-squared statistic, we add it to the optimal control group. We continue like so across all N0 candidate DID models.
72 | The DID model with the highest R-squared statistic becomes the optimal DID model, since it has selected the optimal control group. {p_end}
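{p 4 4 2}
As a concrete illustration, the loop below sketches one pass of this selection step. It is not the internal code of {cmd:fdid}: the variable names (y1 for the treated unit's pre-period outcome, y2-y4 for the candidate controls) are hypothetical, and the data are assumed to be in wide format and restricted to the pre-intervention periods. {p_end}

    local selected                 // controls chosen so far
    local pool "y2 y3 y4"          // remaining candidates (hypothetical names)
    local bestr2 = -1
    foreach j of local pool {
        tempvar avg
        qui egen `avg' = rowmean(`selected' `j')  // average of pool plus candidate
        qui regress y1 `avg'                      // single-predictor OLS
        if e(r2) > `bestr2' {
            local bestr2 = e(r2)                  // new best fit
            local best `j'
        }
        drop `avg'
    }
    local selected `selected' `best'              // best unit joins the pool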
73 |
74 | {p 4 4 2}
75 | After selecting the optimal control group, {cmd:fdid} calculates the treatment effect
76 | along with confidence intervals using the inference procedure as described in {browse "https://doi.org/10.1287/mksc.2022.0212": Li (2024)}. When there are many treated units,
77 | we take the expectation of the event-time ATTs across cohorts. {p_end}
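{p 4 4 2}
For a single treated unit without the {opt placebo} option, the interval has a simple closed form. The lines below are an illustrative sketch mirroring the non-placebo branch of fdid.ado, where scalar ohat holds the estimated residual standard deviation, t2 the number of post-treatment periods, and ATT the estimated effect: {p_end}

    scalar SE = scalar(ohat)/sqrt(scalar(t2))
    scalar LB = scalar(ATT) - invnormal(0.975)*scalar(SE)
    scalar UB = scalar(ATT) + invnormal(0.975)*scalar(SE)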
78 |
79 |
80 | {marker options}{...}
81 | {title:Options}
82 |
83 | {dlgtab:Main}
84 |
85 | {phang}
86 | {opt gr1opts}: If specified, edits the display options of the observed versus predicted plot. It accepts the string literally as is. For example,
87 | {cmd: fdid gdp, tr(treat) unitnames(state) gr1opts(scheme(sj) name(hcw, replace))} returns a plot formatted in the most recent version of the Stata Journal's scheme, with the plot being named hcw. If not specified, no plot is created.
88 |
89 | {phang}
90 | {opt gr2opts}: If specified, edits the display of the treatment effect plot. It accepts the string literally as is. For example,
91 | {cmd: fdid gdp, tr(treat) unitnames(state) gr2opts(scheme(sj) name(hcwte, replace))} returns a plot formatted in the most recent version of the Stata Journal's scheme, with the plot being named hcwte. If not specified, no plot is created.
92 |
93 | {phang}
94 | {opt unitnames}: {cmd: fdid} requires the panel variable to have value {help label}s, where each number is mapped to a string (e.g., 1="australia"). If the panel variable already has value labels, nothing more is needed.
95 | However, if the panel does not come with value labels,
96 | the user must specify the string variable to be used as the panel variable's value labels.
97 | Note that each string-value pair must be uniquely identified.
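{p 4 4 2}
If the dataset has only a string identifier, the mapping that {opt unitnames} builds can also be constructed by hand; as a hypothetical example (statename and id are illustrative names): {p_end}

    encode statename, gen(id)
    xtset id year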
98 |
99 | {phang}
100 | {opt placebo}: If specified, reports the placebo standard error for the ATT from {browse "https://github.com/DiegoCiccia/sdid/tree/main/sdid_event#github":sdid_event}.
101 |
102 | {synoptline}
103 |
104 | {marker results}{...}
105 | {title:Stored Results}
106 |
107 | {pstd}
108 | {cmd:fdid} stores the following in e():
109 |
110 | {synoptset 20 tabbed}{...}
111 | {p2col 5 20 24 2: Scalars}{p_end}
112 | {synopt:{cmd:e(T1)}}number of pre-intervention periods.{p_end}
113 | {synopt:{cmd:e(T0)}}treatment point.{p_end}
114 | {synopt:{cmd:e(T2)}}number of post-intervention periods.{p_end}
115 | {synopt:{cmd:e(T)}}number of time periods.{p_end}
116 | {synopt:{cmd:e(N0)}}Number of controls.{p_end}
117 | {synopt:{cmd:e(N0U)}}Number of controls selected by FDID.{p_end}
118 |
119 | {synoptset 20 tabbed}{...}
120 | {p2col 5 20 24 2: Macros}{p_end}
121 | {synopt:{cmd:e(U)}}list of controls selected by the FDID method (single treated unit only).{p_end}
122 | {synopt:{cmd:e(depvar)}}dependent variable.{p_end}
123 | {synopt:{cmd:e(properties)}}list of properties.{p_end}
124 |
125 | {synoptset 20 tabbed}{...}
126 | {p2col 5 20 24 2: Matrices}{p_end}
127 | {synopt:{cmd:e(series)}}A matrix containing the time variable, observed values, counterfactual values, pointwise treatment effects, event time, and the control-group means for both the all-controls and FDID-selected donor pools.{p_end}
128 | {synopt:{cmd:e(results)}}Table of results {p_end}
129 | {synopt:{cmd:e(b)}}Coefficients.{p_end}
130 | {synopt:{cmd:e(V)}}Covariance matrix.{p_end}
131 | {synopt:{cmd:e(dyneff)}}Dynamic Treatment Effects.{p_end}
132 |
133 | {marker examples}{...}
134 | {title:Examples}
135 |
136 | {phang}
137 |
138 | Users may install {cmd:fdid} with {stata "net install fdid, from(https://raw.githubusercontent.com/jgreathouse9/FDIDTutorial/main) replace"}.
139 |
140 | To obtain the example datasets, run {stata "net get fdid, from(https://raw.githubusercontent.com/jgreathouse9/FDIDTutorial/main) replace"}.
141 |
142 |
143 | Replicating Hsiao, Ching, and Wan (HCW, 2012)
144 |
145 | {stata "u hcw, clear"}
146 |
147 | {stata "fdid gdp, tr(treat) unitnames(state)"}
148 |
149 |
150 | {phang}
151 |
152 |
153 | {hline}
154 |
155 | {title:References}
156 | {p 4 8 2}
157 |
158 | Li, K. T. (2024). Frontiers: A simple forward difference-in-differences method. Marketing Science, 43(2), 267-279. {browse "https://doi.org/10.1287/mksc.2022.0212"}
159 |
160 | Abadie, A., & Gardeazabal, J. (2003). The economic costs of conflict: A case study of the Basque Country. American Economic Review, 93(1), 113-132. {browse "https://doi.org/10.1257/000282803321455188"}
161 |
162 | Hsiao, C., Steve Ching, H., & Ki Wan, S. (2012). A panel data approach for program evaluation: Measuring the benefits of political and economic integration of Hong Kong with mainland China.
163 | Journal of Applied Econometrics, 27(5), 705-740. {browse "https://doi.org/10.1002/jae.1230"}
164 |
165 | {title:Contact}
166 |
167 | Jared Greathouse, Georgia State University -- {browse "https://jgreathouse9.github.io/":personal website}
168 | Emails--
169 | Student: {browse "jgreathouse3@student.gsu.edu"}
170 | Personal: {browse "j.greathouse200@gmail.com"}
171 |
172 | Email me with questions, comments, suggestions or bug reports.
173 |
174 |
175 | {hline}
176 |
177 |
178 |
--------------------------------------------------------------------------------
/fdid_test.do:
--------------------------------------------------------------------------------
1 | qui {
2 | clear *
3 | set varabbrev off
4 | cls
5 | u "hcw.dta", clear
6 | }
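// hcw.dta carries two treatment indicators: treat (economic integration)
// and polint (political integration); the second call below restricts the
// sample so the political-integration effect is estimated on its own window.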
7 | fdid gdp, tr(treat) unitnames(state) gr1opts(scheme(sj) name(econint, replace))
8 | fdid gdp if inrange(time,1,45), tr(polint) unitnames(state) gr1opts(name(polint, replace) scheme(sj))
9 |
10 | qui u smoking, clear
11 |
12 | fdid cigsale, tr(treated) unitnames(state) gr1opts(scheme(sj) name(p99, replace))
13 |
14 | qui u basque, clear
15 |
16 | fdid gdpcap, tr(treat) gr1opts(scheme(sj) name(eta, replace))
17 |
18 | qui u barcelona, clear
19 |
20 | fdid indexed_price, tr(treat) unitnames(fullname) gr1opts(name(barcelona, replace) scheme(sj))
21 |
22 |
23 |
24 |
--------------------------------------------------------------------------------
/fdidevent.do:
--------------------------------------------------------------------------------
1 | // requires sdid_event, fdid
2 |
3 | clear * // We can leave comments in our code like this
4 | cls
5 |
6 | use /// or we can leave comments like
7 | "https://github.com/jgreathouse9/FDIDTutorial/raw/main/smoking.dta" // this
8 |
9 | qui fdid cigsale, tr(treat) unitnames(state)
10 |
11 |
12 | ereturn list
13 | loc post = e(T2)
14 |
15 | mkf newframe
16 | cwf newframe
17 | svmat e(didframe), names(col)
18 | xtset id year
19 | qui sdid_event cigsale id year treat, method(did) brep(1000) placebo(all)
20 |
21 |
22 | local row `= rowsof(e(H))'
23 |
24 |
25 |
26 | mat res = e(H)[2..`row',1..4]
27 |
28 | mkf newframe2
29 | cwf newframe2
30 |
31 | svmat res
32 |
33 | rename (res1 res2 res3 res4) (eff se lb ub)
34 |
35 |
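* sdid_event orders the rows of res with the post-treatment effects first
* (mapped to event times 0, 1, ...) followed by the pre-treatment placebos
* (mapped to negative event times).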
36 | gen eventtime = _n - 1 if !missing(eff)
37 | replace eventtime = `post' - _n if _n > `post' & !missing(eff)
38 | sort eventtime
39 |
40 | twoway (rcap lb ub eventtime, lcolor(black)) ///
41 | (scatter eff eventtime, mc(blue) ms(d)), ///
42 | legend(off) ///
43 | title(sdid_event) xtitle(Time to Event) ///
44 | ytitle(Pointwise Effect) ///
45 | yline(0,lc(red) lp(-)) xline(0, lc(black) lp(solid))
46 |
--------------------------------------------------------------------------------
/fitCali.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jgreathouse9/FDIDTutorial/a398e2d7be812bca54bf3163329e10072834b7a7/fitCali.png
--------------------------------------------------------------------------------
/fithongkong.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jgreathouse9/FDIDTutorial/a398e2d7be812bca54bf3163329e10072834b7a7/fithongkong.png
--------------------------------------------------------------------------------
/hcw-data.txt:
--------------------------------------------------------------------------------
1 | 0.062 0.040489125 -0.01308351 0.01006395 -0.012291821 -0.028357059 -0.015176804 -0.019679945 -0.023382736 0.012683435 0.058586402 0.043746305 -0.008439514 0.047243912 0.004946546 -0.032864896 0.015124314 0.022959217 0.087144743 -0.004380854 0.064024444 0.085938382 0.08 0.064902223 0.143
2 | 0.059 0.037856919 -0.007580798 0.021263869 -0.003092842 -0.023396894 -0.014549011 -0.015441155 -0.018115838 -0.005570701 0.069521038 0.012290427 0.006628538 0.038758688 0.020743349 -0.019817572 0.014794644 0.018935921 0.11807464 0.016635614 0.0660682 0.131188613 0.08 0.065123477 0.141
3 | 0.058 0.022509481 0.000542671 0.018919427 -0.007764421 -0.006017587 -0.016703892 -0.012700839 -0.016874733 -0.017558422 0.081646312 0.004462324 0.008677404 0.089917527 0.038871277 -0.004587253 0.029149047 0.017989993 0.111129529 0.031504281 0.05795939 0.10966639 0.08 0.067379182 0.135
4 | 0.062 0.02874655 0.001180751 0.025316832 -0.004048589 -0.004773978 -0.007475638 -0.011666753 -0.004963401 -0.010101253 0.08553336 0.015492945 0.00431236 0.06975085 0.053402924 0.013651317 0.036581146 0.020683461 0.125323696 0.034007481 0.062364893 0.075800585 0.08 0.069164437 0.135
5 | 0.079 0.033990391 0.025510849 0.043567149 0.031094401 0.0128856 0.003748037 0.022950407 -0.002249426 -0.022502562 0.08592229 0.034447585 0.016938685 0.06019911 0.031001539 0.026643555 0.030078254 0.029918331 0.130708998 0.049344055 0.049743123 0.049147143 0.112508784 0.0694515 0.125
6 | 0.068 0.037919372 0.019941313 0.050225385 0.064280003 0.035090496 0.016164869 0.02107002 0.011634742 -0.005157348 0.088414562 0.063911468 0.01898763 0.062555176 0.051312977 0.010767571 0.040359236 0.037840316 0.100986946 0.059129677 0.071988438 0.061173256 0.092613207 0.070135385 0.12
7 | 0.046 0.052289413 0.017087875 0.065121831 0.045955455 0.03524699 0.023914958 0.02066158 0.026411771 0.01408692 0.09279576 0.064321549 0.019583005 0.042924768 0.024323747 0.013645424 0.038045371 0.035534446 0.115983792 0.062869611 0.069357222 0.101110162 0.050375024 0.06929842 0.12
8 | 0.052 0.031070896 0.023035197 0.067330679 0.055166411 0.057250918 0.029710713 0.028744144 0.034282614 0.005427073 0.117461165 0.066942185 0.032141251 0.047608971 0.027523562 0.013112375 0.04503644 0.035410696 0.092047012 0.062090843 0.07034696 0.134977749 0.093951561 0.076048787 0.126
9 | 0.037 0.008696091 0.025292696 0.050921204 0.048057177 0.068382369 0.027445901 0.016826182 0.025393855 0.003918574 0.115387516 0.043418145 0.027248813 0.022149924 0.065271948 -0.00491017 0.031924391 0.026520307 0.066012658 0.052228445 0.103778145 0.130363062 0.10677946 0.071103882 0.115
10 | 0.029 0.006773674 0.021849955 0.031525062 0.011953605 0.079265123 0.021707601 0.028714841 0.022611853 0.01534905 0.121888917 -0.059942821 0.028448495 0.023302692 0.039222341 -0.000607307 0.03579039 0.012289612 0.079019644 0.041677561 0.083251981 0.119140255 0.112030624 0.069214347 0.106
11 | 0.012 0.00302829 0.018319173 0.018179956 0.02080968 0.085121775 0.014151847 0.026246209 0.028763934 0.015939281 0.132273511 -0.079015292 0.037761187 0.030454873 0.044369247 -0.009705493 0.03183939 0.017521932 0.087518521 0.032848697 0.070873384 0.065084514 0.083972276 0.063416637 0.101
12 | 0.015 0.010981606 0.013456931 0.015165864 0.008303516 0.085290234 0.007562377 0.012738997 0.027091569 0.026233075 0.092350922 -0.067619265 0.034739149 0.03069211 0.035111291 -0.012893527 0.017692653 0.013191209 0.079929979 0.036557487 0.076905152 0.074478603 0.045678924 0.048751686 0.105
13 | 0.025 0.038182054 0.015387368 0.007820651 0.010101924 0.049421759 0.008411554 0.000539058 0.034349544 0.026878431 0.094771994 -0.055921859 0.021425367 0.040892883 0.064231019 -0.001841571 0.033937339 0.016644276 0.111705937 0.032691728 0.038864438 0.10384238 0.005896824 0.054663768 0.102
14 | 0.036 0.034520058 0.017335817 0.011510062 0.030858829 0.029333903 0.003447306 -0.001448213 0.01694126 0.02247066 0.072357561 0.022376847 0.031889828 0.035847488 0.06247909 0.006181178 0.036520069 0.03002029 0.087512571 0.046899963 0.066379801 0.092732028 0.049586283 0.059415465 0.097
15 | 0.047 0.036673187 0.013595447 0.021660071 0.040113213 0.017545131 0.008331483 -0.000206099 0.014924653 0.012732653 0.05540767 0.054118563 0.025962524 0.037665111 0.086662181 -0.001991582 0.040161587 0.027754079 0.043669451 0.074278551 0.08834065 0.09690417 0.060579891 0.059393111 0.094
16 | 0.059 0.038987448 0.004195063 0.024703528 0.025693668 0.020857018 0.00662265 0.004315525 0.004135717 0.021495101 0.057294044 0.05555995 0.026266427 0.025593405 0.093091836 -0.001819066 0.037069481 0.030190223 0.060471179 0.028699798 0.129307728 0.095856097 0.043259392 0.07015109 0.096
17 | 0.058 0.025748034 -0.001534697 0.035775364 0.029461837 0.054266962 0.004803411 0.006648675 0.005500302 0.025330317 0.03170797 0.028028429 0.045406458 0.005592703 0.053712455 -0.004651872 0.03831716 0.033215288 0.055836322 0.052474672 0.130338535 0.068247624 -0.006867101 0.063613158 0.094
18 | 0.072 0.051862568 -0.002021203 0.038687725 0.0398559 0.055035837 0.018440822 0.006054242 0.029754225 0.006163891 0.05336402 0.056594745 0.041410674 0.034571752 0.073912796 0.005536613 0.039479035 0.035988372 0.089873248 0.051604752 0.100465648 0.06503466 -0.024811621 0.060896649 0.095
19 | 0.061 0.05928891 0.016148099 0.036729108 0.017642381 0.072253713 0.022483595 -0.002864786 0.025613428 -0.00346344 0.047719189 0.03932801 0.045173264 0.033532163 0.049587339 0.024280012 0.04032491 0.041709459 0.104316594 0.03768199 0.113961304 0.07327511 -0.03598934 0.064956724 0.089
20 | 0.014 0.063420685 0.017984419 0.0380428 0.03139775 0.073819245 0.033565788 0.000127711 0.036127953 -0.011225041 0.05488339 0.041836754 0.054115731 0.018211482 0.047910697 0.030494367 0.048353637 0.038636537 0.069337893 0.05894723 0.071993689 0.105052074 -0.045058163 0.065702835 0.088
21 | -0.032 0.062430177 0.030473691 0.033733727 0.033021095 0.07494442 0.043256898 0.022189243 0.026713652 -0.037047577 -0.028291378 0.071942732 0.048870128 0.018747105 0.007999274 0.040487268 0.047223408 0.042131057 0.026082442 0.018309548 0.129104251 0.043080731 -0.041647548 0.057225139 0.072
22 | -0.061 0.039491858 0.032317629 0.028866186 -0.010453694 0.085745939 0.039632969 0.010151787 0.019890042 -0.030667042 -0.094909072 0.043571362 0.037919661 -0.017352612 0.003761194 0.035309884 0.041360687 0.032315919 -0.016480034 0.010905336 -0.003199045 -0.0204842 -0.141182742 0.049895882 0.07
23 | -0.081 0.043181464 0.027196712 0.018947297 0.02242801 0.073973636 0.036534643 0.015577458 0.013835544 -0.018140731 -0.108108096 0.049928923 0.037527635 -0.019380177 -0.007978701 0.025482864 0.045809149 0.03165071 -0.042880406 0.008935778 -0.085406035 -0.05341627 -0.139601597 0.039404163 0.072
24 | -0.065 0.035846256 0.022831631 0.022483978 0.01477213 0.057491169 0.031087347 0.009057211 0.01712591 -0.026572912 -0.114551782 0.012183241 0.027419936 0.026928593 -0.01652179 0.015011846 0.039859048 0.040041524 -0.020253879 -0.009510532 -0.160642884 -0.145889916 -0.081585851 0.032540552 0.078
25 | -0.029 0.040899996 0.022420661 0.03785257 0.003396192 0.043853777 0.024459097 0.012833396 0.012470107 -0.013856981 0.007458419 -0.003817665 0.032389268 0.038890565 0.003396671 0.00612709 0.035512681 0.03784356 0.031149297 0.01564014 -0.195945703 -0.085261257 -0.068189315 0.046425014 0.083
26 | 0.005 0.038574985 0.028941363 0.048189992 0.029012985 0.024488765 0.023045797 0.012360672 0.011877243 -0.003746159 0.072648258 0.024232743 0.037503536 0.055414725 0.022323632 -0.005025436 0.03944164 0.037037852 0.068211974 0.041098479 -0.071362048 0.022742623 -0.00336661 0.065049369 0.076
27 | 0.039 0.026920092 0.038399996 0.063958385 0.010058184 0.031812036 0.026949211 0.017133279 0.017065058 -0.01438158 0.117399195 0.024181662 0.042164352 0.066648484 0.084775816 0.011326927 0.039239099 0.03471831 0.089711484 0.053023458 -0.014795235 0.058390438 0.04479968 0.047783895 0.074
28 | 0.083 0.03297373 0.038839519 0.064808811 0.026324545 0.039744576 0.031031525 0.023336932 0.019880951 -0.007076916 0.125923473 0.053241346 0.053575053 0.04101127 0.123958781 0.031112976 0.040825844 0.035377295 0.087188237 0.06136366 0.052130653 0.126366103 0.024073414 0.063896011 0.071
29 | 0.107 0.039596529 0.036271238 0.067251277 0.038574069 0.040860688 0.040201527 0.014567788 0.030316349 0.017714082 0.105816145 0.103715138 0.060761232 0.060542022 0.185684481 0.033385332 0.049522544 0.028335607 0.095852369 0.117985585 0.184236976 0.171468016 0.050799214 0.07698605 0.081
30 | 0.075 0.046876584 0.030074897 0.072955543 0.030016805 0.049907722 0.04084387 0.024026305 0.034473216 0.012408664 0.082337322 0.096315439 0.05964525 0.034375283 0.15893399 0.043154046 0.046534181 0.035570893 0.086742247 0.109919193 0.202799392 0.129321347 0.054830676 0.050742927 0.082
31 | 0.076 0.024391215 0.016178379 0.065640524 0.037173891 0.046138983 0.032336309 0.012354971 0.033131119 0.017866402 0.051854125 0.091217974 0.055389255 0.020759299 0.134925298 0.032836917 0.038264634 0.022754161 0.103831521 0.107863644 0.206566161 0.105891518 0.029734848 0.062556592 0.082
32 | 0.063 0.005838784 0.011539886 0.053217788 0.036737546 0.039393413 0.035590315 0.000387031 0.029593682 0.022577387 0.028516137 0.060445722 0.046784511 0.03254855 0.114690613 0.015631351 0.035624467 0.010894737 0.09664023 0.092376501 0.206786724 0.067812947 0.04196519 0.035074707 0.08
33 | 0.027 0.000732203 0.005441154 0.040560791 0.014247549 0.048968059 0.032272223 0.009609395 0.029250063 0.011066441 0.03457192 0.022440578 0.041545519 0.015890202 0.047596845 0.023107373 0.04070591 0.006540262 0.044885106 0.027668039 0.115513357 -0.006077476 0.028546476 0.00604108 0.081
34 | 0.015 -0.002025588 -0.005665725 0.007522337 0.007846387 0.031123158 0.018368607 -0.00304484 0.021218815 0.004616983 0.033580325 -0.006739162 0.030092004 0.054969338 0.041789475 0.009204748 0.030017895 -0.002529215 -0.007359081 0.026144858 0.126113137 -0.039040019 0.030258519 -0.031680916 0.079
35 | -0.001 0.028212607 -0.004335946 -0.017121906 0.01250594 0.019187103 0.021664386 0.001214802 0.014313981 -0.008811536 0.029444975 -0.018054251 0.02333076 0.036681066 -0.006837184 -0.000601392 0.025048116 0.000703428 -0.068045134 0.030249639 0.064460886 -0.048593715 0.030496837 -0.047382226 0.076
36 | -0.017 0.039982471 -0.000616657 -0.014710952 -0.00049339 0.013611011 0.011647254 0.016786478 0.01171309 -0.020385142 0.032666684 -0.022018848 0.017814843 0.049871839 -0.04528562 0.006773104 0.036088575 0.008620807 -0.064322097 0.037251147 0.031538988 -0.064756439 0.014515819 -0.014868587 0.073
37 | -0.01 0.03867529 0.003416942 -0.011813972 -0.00188065 -0.002925243 0.008660264 -0.005115074 0.006507139 -0.017498575 0.06034927 -0.028404343 -5.79803E-05 0.050448679 -0.04280226 0.000546396 0.028365747 0.01819515 -0.005363067 0.039232891 -0.014847404 -0.010393871 0.048033017 0.01328333 0.076
38 | 0.005 0.040834838 0.010456725 0.013136681 0.014913679 0.012758558 0.019017914 -0.001795724 0.011429295 -0.011544514 0.067838217 0.038315957 0.003731768 0.011498967 -0.02080591 0.00296127 0.041971524 0.016095939 0.047382885 0.055258571 -0.037004602 0.044844131 0.050304522 0.047137118 0.078
39 | 0.028 0.032215477 0.012784331 0.029683299 0.002654192 0.016903446 0.018552572 0.008420323 0.016709274 0.004707936 0.063928644 0.036068989 0.011432329 0.048281147 -0.014044139 0.009928013 0.047137467 0.022163675 0.063217605 0.019080331 0.003776353 0.082524712 0.05393544 0.064971001 0.079
40 | 0.048 0.032584458 0.010514015 0.037841487 -0.001912172 0.02579329 0.015559641 -0.000354394 0.014300158 0.009126096 0.082951074 0.05542101 0.008563752 0.015707554 0.006212187 -0.002820451 0.036432913 0.013289041 0.057410509 0.074342932 0.002348844 0.121767751 0.061942656 0.055360433 0.08
41 | 0.041 0.026943378 0.007017912 0.033014231 0.005125017 -0.003767132 0.007973122 -0.002533963 0.005957261 -0.009650656 0.024600971 0.086989755 0.009876838 0.020537651 0.014059161 -0.007644297 0.037573027 0.007130461 0.030574852 0.064691866 0.029827132 0.120497403 0.062935112 0.035519292 0.099
42 | -0.009 0.024647568 0.008199689 0.015919374 -0.018683318 0.006923411 0.005856956 0.000982494 0.001035277 0.001181611 0.016111843 0.027076707 0.00712277 0.040060397 -0.012477424 -0.003263337 0.040656159 0.017033929 -0.021944609 0.043078439 0.021107552 0.066025038 0.055036454 -0.002182844 0.082
43 | 0.038 0.039930774 0.005746135 0.025793217 -0.005911494 0.012721646 0.008426176 -0.000558271 0.009501416 0.000797645 0.021752313 0.039055252 -0.005198084 0.042209258 0.025473914 0.004631279 0.045143671 0.029984821 0.036523587 0.078100605 0.025726396 0.050059686 0.062450896 0.040149389 0.085
44 | 0.047 0.055290718 0.008809593 0.020631492 0.017165432 0.008593916 0.012428166 -0.001942541 0.002661124 0.010026317 0.025109938 0.052431428 0.004250249 0.054317111 0.030852725 0.012171244 0.052039382 0.038865454 0.074871045 0.047295464 0.023739413 0.078014661 0.075941212 0.061271432 0.091
45 | 0.077 0.05989317 0.013646583 0.027105462 0.028844029 0.049486168 0.015486534 0.009573193 0.019825695 0.027593339 0.043057355 0.05931818 0.010589546 0.066612795 0.069413626 0.031116663 0.044399322 0.045635719 0.084665027 0.06439665 0.025902978 0.09539261 0.054096856 0.076941123 0.098
46 | 0.12 0.057484912 0.017229607 0.048960033 0.037261725 0.042160267 0.017976832 0.00596597 0.023740403 0.017716038 0.048117549 0.090758314 0.015173029 0.068185557 0.089487998 0.030978618 0.048708737 0.041134306 0.117387085 0.079298418 0.050611964 0.133195678 0.070456498 0.08825136 0.097
47 | 0.066 0.046556414 0.024444017 0.050433716 0.036128162 0.039902229 0.014280823 -0.003891356 0.010862674 0.016593594 0.039261542 0.091318806 0.022815091 0.051118983 0.082078197 0.015899003 0.042342528 0.033754849 0.071035652 0.058730442 0.061428037 0.135315568 0.064354459 0.052321501 0.095
48 | 0.079 0.030096613 0.02429197 0.047816326 0.034328842 0.042251593 0.01874128 -0.010917142 0.016718293 0.002459338 0.020118634 0.078278488 0.019824127 0.040934983 0.101526182 0.01282584 0.040777813 0.030266981 0.06638691 0.044613861 0.096388238 0.096131245 0.070699456 0.024534666 0.095
49 | 0.062 0.031602373 0.023706961 0.039852612 0.020981267 0.025808121 0.023141644 -0.006108276 0.001654055 -0.001380556 0.001002505 0.046352756 0.021226259 0.015010459 0.082568784 0.000988211 0.033554581 0.033709494 0.036235487 0.03236994 0.094080409 0.078915734 0.050575774 0.021370159 0.105
50 | 0.071 0.045882628 0.025730498 0.03162305 0.052095245 0.017298066 0.018824099 -0.000536835 0.00673971 0.010182251 0.002856359 0.023975228 0.027282602 0.023873511 0.088191839 0.00565571 0.021442844 0.028465195 0.059124981 0.037329656 0.103475874 0.054562505 0.038451436 0.030643115 0.105
51 | 0.081 0.04553443 0.026474627 0.035747343 0.043741017 0.025926123 0.021581425 0.001897272 0.007175016 0.010886161 0.015429093 0.031832932 0.029114911 0.011341385 0.106851232 0.021343898 0.011351572 0.024817635 0.078946552 0.037029422 0.10469954 0.068517735 0.049963263 0.042729485 0.104
52 | 0.069 0.054982631 0.032616323 0.050333502 0.028752441 0.027938833 0.023968035 0.007968837 0.015251196 0.01894079 0.028249622 0.038414462 0.032864259 0.008914476 0.1002058 0.027786249 0.017170783 0.023995376 0.080030993 0.045503845 0.072814392 0.063078867 0.035448155 0.066583468 0.104
53 | 0.09 0.048066783 0.038320436 0.049475872 0.049316093 0.043948781 0.027027624 0.008847777 0.01768208 0.015202593 0.033213115 0.070146833 0.040600054 0.018647296 0.131180975 0.039038363 0.029646587 0.028241113 0.096518063 0.046005567 0.053086332 0.112993111 0.066224117 0.049955966 0.104
54 | 0.062 0.026981789 0.035103741 0.04119911 0.038801273 0.050649118 0.033462635 0.016911064 0.019596257 0.010000041 0.025253872 0.100337815 0.036385518 -0.009260945 0.096713895 0.035385211 0.025652908 0.026496487 0.076720708 0.036119644 0.043391357 0.116340287 0.054813474 0.049499434 0.11
55 | 0.064 0.032730877 0.03722008 0.031677015 0.041836013 0.044581552 0.030261134 0.023511433 0.011813292 0.005606123 0.019649646 0.083649017 0.034624141 0.011500795 0.071056006 0.04219592 0.040618501 0.022375661 0.067746589 0.043090604 0.054189414 0.110227803 0.048648424 0.052017616 0.108
56 | 0.066 0.038575452 0.038982379 0.020005097 0.029809159 0.046806281 0.034914679 0.033108151 0.012561183 0.015460435 0.014679614 0.058820876 0.042949323 0.036755714 0.03430307 0.042473193 0.032137244 0.032515148 0.063880195 0.048390839 0.080885353 0.100116283 0.042141693 0.039880348 0.111
57 | 0.055 0.058012895 0.036197655 0.030712058 0.033133515 0.047340215 0.032754948 0.035225081 0.025620895 0.023959317 0.03006762 0.038222216 0.033721417 0.039462476 0.00802016 0.046517008 0.0279462 0.017708359 0.063092249 0.065573174 0.095296906 0.039883922 0.049846026 0.041019256 0.111
58 | 0.062 0.059518707 0.032570252 0.039827091 -0.007168933 0.04680819 0.030354812 0.023897199 0.017251249 0.014192762 0.037897963 0.022226058 0.027530665 0.058293257 0.033556233 0.046338731 0.039619139 0.018755927 0.101376837 0.081891339 0.1109 0.080276298 0.051196739 0.05107325 0.1167
59 | 0.068 0.056648589 0.031558452 0.034741579 0.013516975 0.04564668 0.036748102 0.020773083 0.023338157 0.012906847 0.039380367 0.031162008 0.039287395 0.05114701 0.039770325 0.040958253 0.038681808 0.028224798 0.105562016 0.064358665 0.1101 0.093361157 0.05012558 0.066368862 0.1002
60 | 0.069 0.045824678 0.01909501 0.038128443 0.023794118 0.04417654 0.021744761 0.005864873 0.005081358 -0.004767757 0.035359475 0.062714448 0.031526713 0.045904925 0.089855018 0.037792163 0.033361604 0.009287713 0.069738567 0.065469801 0.1107 0.151739207 0.07958734 0.06292895 0.1017
61 | 0.073 0.027523303 0.017430725 0.029217311 -0.005199719 0.023731869 0.018386058 0.009020289 -0.005580681 -0.013646868 0.028462348 0.056493326 0.033958445 0.031214973 0.091567689 0.028397742 0.028491529 0.006136646 0.079338739 0.047944956 0.1334 0.167034069 0.049143571 0.058841044 0.1238
62 |
--------------------------------------------------------------------------------
/hcw.dta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jgreathouse9/FDIDTutorial/a398e2d7be812bca54bf3163329e10072834b7a7/hcw.dta
--------------------------------------------------------------------------------
/kh-data.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jgreathouse9/FDIDTutorial/a398e2d7be812bca54bf3163329e10072834b7a7/kh-data.txt
--------------------------------------------------------------------------------
/smoking.dta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jgreathouse9/FDIDTutorial/a398e2d7be812bca54bf3163329e10072834b7a7/smoking.dta
--------------------------------------------------------------------------------
/stata.toc:
--------------------------------------------------------------------------------
1 | v 3
2 | p fdid Stata module to estimate forward difference in differences.
3 |
--------------------------------------------------------------------------------
/turnout.dta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jgreathouse9/FDIDTutorial/a398e2d7be812bca54bf3163329e10072834b7a7/turnout.dta
--------------------------------------------------------------------------------