├── .gitignore
├── LICENSE
├── PythonCodeBoxes.ipynb
├── RCodeBoxes.R
├── README.md
├── Results.RData
├── StataCodeBoxes.do
├── rhc.Rdata
├── rhc.csv
└── rhc.dta
/.gitignore:
--------------------------------------------------------------------------------
1 | ### Python ###
2 | .ipynb_checkpoints/*
3 | ### R ###
4 | # History files
5 | .Rhistory
6 | .Rapp.history
7 | # User-specific files
8 | .Ruserdata
9 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Miguel Angel Luque Fernandez
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6 |
7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8 |
9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON INFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
10 |
--------------------------------------------------------------------------------
/PythonCodeBoxes.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Tutorial: causal inference methods made easy for applied researchers/epidemiologists/statisticians \n",
8 | "\n",
9 | "### ICON-LSHTM, LONDON, 16th October 2020\n",
10 | "\n",
11 | "Miguel Angel Luque Fernandez PhD, Assistant Professor of Epidemiology and Biostatistics\n",
12 | "\n",
13 | "Matthew Smith PhD, Research Fellow Inequalities in Cancer Outcomes Network, LSHTM, London, UK\n",
14 | "\n",
15 | "Paul Zivich, University of North Carolina at Chapel Hill\n",
16 | "\n",
17 | "Copyright (c) 2020 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the \"Software\"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.\n",
18 | "\n",
19 | "THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n",
20 | "\n",
21 | "Bug reports: miguel-angel.luque at lshtm.ac.uk\n",
22 | "\n",
23 | "The rhc dataset can be downloaded at http://biostat.mc.vanderbilt.edu/wiki/Main/DataSets"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 1,
29 | "metadata": {},
30 | "outputs": [
31 | {
32 | "name": "stdout",
33 | "output_type": "stream",
34 | "text": [
35 | "patsy 0.5.1\n",
36 | "scipy 1.5.2\n",
37 | "numpy 1.19.1\n",
38 | "pandas 1.1.0\n",
39 | "statsmodels 0.11.1\n",
40 | "matplotlib 3.3.1\n",
41 | "zepid 0.9.0\n"
42 | ]
43 | }
44 | ],
45 | "source": [
46 | "%matplotlib inline\n",
47 | "\n",
48 | "# Importing libraries for the tutorial\n",
49 | "import patsy\n",
50 | "import scipy\n",
51 | "import numpy as np\n",
52 | "import pandas as pd\n",
53 | "import statsmodels.api as sm\n",
54 | "import statsmodels.formula.api as smf\n",
55 | "import matplotlib\n",
56 | "import matplotlib.pyplot as plt\n",
57 | "import zepid\n",
58 | "\n",
59 | "from scipy.stats.kde import gaussian_kde\n",
60 | "from scipy.stats import logistic\n",
61 | "from zepid.calc import probability_to_odds, odds_to_probability\n",
62 | "\n",
63 | "print(\"patsy \", patsy.__version__)\n",
64 | "print(\"scipy \", scipy.__version__)\n",
65 | "print(\"numpy \", np.__version__)\n",
66 | "print(\"pandas \", pd.__version__)\n",
67 | "print(\"statsmodels\", sm.__version__)\n",
68 | "print(\"matplotlib \", matplotlib.__version__)\n",
69 | "print(\"zepid \", zepid.__version__)"
70 | ]
71 | },
72 | {
73 | "cell_type": "markdown",
74 | "metadata": {},
75 | "source": [
76 | "## Setting up the Data"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": 2,
82 | "metadata": {},
83 | "outputs": [],
84 | "source": [
85 | "# Box 1: Setting up the data\n",
86 | "data = pd.read_csv(\"rhc.csv\")\n",
87 | "data.rename(columns={\"rhc\": \"A\", \n",
88 | " \"death_d30\": \"Y\",\n",
89 | " \"gender\": \"C\",\n",
90 | " \"age\": \"W1\",\n",
91 | " \"edu\": \"W2\",\n",
92 | " \"race\": \"W3\",\n",
93 | " \"carcinoma\": \"W4\",\n",
94 | " }, inplace=True)\n",
95 | "data['A'] = np.where(data['A'] == \"Yes\", 1, 0)\n",
96 | "data['C'] = np.where(data['C'] == \"Female\", 0, 1)\n",
97 | "\n",
98 | "data = data[[\"Y\", \"A\", \"C\", \"W1\", \"W2\", \"W3\", \"W4\"]].copy()"
99 | ]
100 | },
101 | {
102 | "cell_type": "markdown",
103 | "metadata": {},
104 | "source": [
105 | "## Naive estimate of the ATE"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": 3,
111 | "metadata": {},
112 | "outputs": [
113 | {
114 | "name": "stdout",
115 | "output_type": "stream",
116 | "text": [
117 | "0.07352\n"
118 | ]
119 | },
120 | {
121 | "data": {
122 | "text/html": [
123 | "
\n",
124 | "
OLS Regression Results
\n",
125 | "
\n",
126 | "
Dep. Variable:
Y
R-squared:
0.006
\n",
127 | "
\n",
128 | "
\n",
129 | "
Model:
OLS
Adj. R-squared:
0.005
\n",
130 | "
\n",
131 | "
\n",
132 | "
Method:
Least Squares
F-statistic:
16.59
\n",
133 | "
\n",
134 | "
\n",
135 | "
Date:
Mon, 21 Dec 2020
Prob (F-statistic):
6.58e-08
\n",
136 | "
\n",
137 | "
\n",
138 | "
Time:
06:30:58
Log-Likelihood:
-3812.9
\n",
139 | "
\n",
140 | "
\n",
141 | "
No. Observations:
5735
AIC:
7632.
\n",
142 | "
\n",
143 | "
\n",
144 | "
Df Residuals:
5732
BIC:
7652.
\n",
145 | "
\n",
146 | "
\n",
147 | "
Df Model:
2
\n",
148 | "
\n",
149 | "
\n",
150 | "
Covariance Type:
nonrobust
\n",
151 | "
\n",
152 | "
\n",
153 | "
\n",
154 | "
\n",
155 | "
coef
std err
t
P>|t|
[0.025
0.975]
\n",
156 | "
\n",
157 | "
\n",
158 | "
Intercept
0.3049
0.010
29.354
0.000
0.285
0.325
\n",
159 | "
\n",
160 | "
\n",
161 | "
A
0.0735
0.013
5.739
0.000
0.048
0.099
\n",
162 | "
\n",
163 | "
\n",
164 | "
C
0.0027
0.013
0.219
0.826
-0.022
0.027
\n",
165 | "
\n",
166 | "
\n",
167 | "
\n",
168 | "
\n",
169 | "
Omnibus:
36985.427
Durbin-Watson:
1.981
\n",
170 | "
\n",
171 | "
\n",
172 | "
Prob(Omnibus):
0.000
Jarque-Bera (JB):
993.118
\n",
173 | "
\n",
174 | "
\n",
175 | "
Skew:
0.696
Prob(JB):
2.22e-216
\n",
176 | "
\n",
177 | "
\n",
178 | "
Kurtosis:
1.511
Cond. No.
3.07
\n",
179 | "
\n",
180 | "
Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified."
181 | ],
182 | "text/plain": [
183 | "\n",
184 | "\"\"\"\n",
185 | " OLS Regression Results \n",
186 | "==============================================================================\n",
187 | "Dep. Variable: Y R-squared: 0.006\n",
188 | "Model: OLS Adj. R-squared: 0.005\n",
189 | "Method: Least Squares F-statistic: 16.59\n",
190 | "Date: Mon, 21 Dec 2020 Prob (F-statistic): 6.58e-08\n",
191 | "Time: 06:30:58 Log-Likelihood: -3812.9\n",
192 | "No. Observations: 5735 AIC: 7632.\n",
193 | "Df Residuals: 5732 BIC: 7652.\n",
194 | "Df Model: 2 \n",
195 | "Covariance Type: nonrobust \n",
196 | "==============================================================================\n",
197 | " coef std err t P>|t| [0.025 0.975]\n",
198 | "------------------------------------------------------------------------------\n",
199 | "Intercept 0.3049 0.010 29.354 0.000 0.285 0.325\n",
200 | "A 0.0735 0.013 5.739 0.000 0.048 0.099\n",
201 | "C 0.0027 0.013 0.219 0.826 -0.022 0.027\n",
202 | "==============================================================================\n",
203 | "Omnibus: 36985.427 Durbin-Watson: 1.981\n",
204 | "Prob(Omnibus): 0.000 Jarque-Bera (JB): 993.118\n",
205 | "Skew: 0.696 Prob(JB): 2.22e-216\n",
206 | "Kurtosis: 1.511 Cond. No. 3.07\n",
207 | "==============================================================================\n",
208 | "\n",
209 | "Warnings:\n",
210 | "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
211 | "\"\"\""
212 | ]
213 | },
214 | "execution_count": 3,
215 | "metadata": {},
216 | "output_type": "execute_result"
217 | }
218 | ],
219 | "source": [
220 | "# Box 2: Regression naive approach\n",
221 | "fm = smf.ols(\"Y ~ A + C\", data).fit()\n",
222 | "print(np.round(fm.params['A'], 5)) # ATE = 0.07352\n",
223 | "fm.summary() # Full model results"
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": 4,
229 | "metadata": {},
230 | "outputs": [
231 | {
232 | "name": "stdout",
233 | "output_type": "stream",
234 | "text": [
235 | "Prop. Male 0.56\n",
236 | "Prop. Female 0.44\n"
237 | ]
238 | }
239 | ],
240 | "source": [
241 | "# Box 3: Marginal probabilities\n",
242 | "pr_c1 = np.mean(data['C'])\n",
243 | "pr_c0 = 1 - pr_c1\n",
244 | "print(\"Prop. Male \", np.round(pr_c1, 2))\n",
245 | "print(\"Prop. Female\", np.round(pr_c0, 2))"
246 | ]
247 | },
248 | {
249 | "cell_type": "markdown",
250 | "metadata": {},
251 | "source": [
252 | "## 3. G-Formula\n",
253 | "\n",
254 | "### 3.1 Non-parametric g-formula"
255 | ]
256 | },
257 | {
258 | "cell_type": "code",
259 | "execution_count": 5,
260 | "metadata": {},
261 | "outputs": [
262 | {
263 | "name": "stdout",
264 | "output_type": "stream",
265 | "text": [
266 | "ATE 0.073692\n"
267 | ]
268 | }
269 | ],
270 | "source": [
271 | "# Box 4: Non-parametric g-formula for the ATE\n",
272 | "pr_y_a1c1 = np.mean(data.loc[(data['C'] == 1) & (data['A'] == 1), 'Y'])\n",
273 | "pr_y_a0c1 = np.mean(data.loc[(data['C'] == 1) & (data['A'] == 0), 'Y'])\n",
274 | "pr_y_a1c0 = np.mean(data.loc[(data['C'] == 0) & (data['A'] == 1), 'Y'])\n",
275 | "pr_y_a0c0 = np.mean(data.loc[(data['C'] == 0) & (data['A'] == 0), 'Y'])\n",
276 | "\n",
277 | "ate = (pr_y_a1c1 - pr_y_a0c1)*pr_c1 + (pr_y_a1c0 - pr_y_a0c0)*pr_c0\n",
278 | "print(\"ATE\", np.round(ate, 6))"
279 | ]
280 | },
281 | {
282 | "cell_type": "code",
283 | "execution_count": 6,
284 | "metadata": {},
285 | "outputs": [
286 | {
287 | "name": "stdout",
288 | "output_type": "stream",
289 | "text": [
290 | "ATT 0.073248\n"
291 | ]
292 | }
293 | ],
294 | "source": [
295 | "# Box 5: Non-parametric g-formula for the ATT\n",
296 | "pr_c1_a1 = np.mean(data.loc[data['A'] == 1, 'C'])\n",
297 | "pr_c0_a1 = 1 - pr_c1_a1\n",
298 | "\n",
299 | "att = (pr_y_a1c1 - pr_y_a0c1)*pr_c1_a1 + (pr_y_a1c0 - pr_y_a0c0)*pr_c0_a1\n",
300 | "print(\"ATT\", np.round(att, 6))"
301 | ]
302 | },
303 | {
304 | "cell_type": "code",
305 | "execution_count": 7,
306 | "metadata": {},
307 | "outputs": [
308 | {
309 | "name": "stdout",
310 | "output_type": "stream",
311 | "text": [
312 | "95% Confidence limits for the ATE\n",
313 | "Percentile method: [0.047741 0.099149]\n",
314 | "Normal Approx method: [0.04798 0.099404]\n",
315 | "\n",
316 | "95% Confidence limits for the ATT\n",
317 | "Percentile method: [0.048054 0.098375]\n",
318 | "Normal Approx method: [0.047811 0.098686]\n"
319 | ]
320 | }
321 | ],
322 | "source": [
323 | "# Box 6: Bootstrap the 95% confidence intervals (CI) for the\n",
324 | "# ATE/ATT estimated using the non-parametric G-Formula\n",
325 | "\n",
326 | "def ate_nonparm_gformula(d):\n",
327 | " \"\"\"Function to estimate the ATE using the nonparametric\n",
328 | " g-formula\"\"\"\n",
329 | " pr_c1 = np.mean(d['C'])\n",
330 | " pr_c0 = 1 - pr_c1\n",
331 | "\n",
332 | " pr_y_11 = np.mean(d.loc[(d['C'] == 1) & (d['A'] == 1), 'Y'])\n",
333 | " pr_y_01 = np.mean(d.loc[(d['C'] == 1) & (d['A'] == 0), 'Y'])\n",
334 | " pr_y_10 = np.mean(d.loc[(d['C'] == 0) & (d['A'] == 1), 'Y'])\n",
335 | " pr_y_00 = np.mean(d.loc[(d['C'] == 0) & (d['A'] == 0), 'Y'])\n",
336 | " \n",
337 | " return (pr_y_11 - pr_y_01)*pr_c1 + (pr_y_10 - pr_y_00)*pr_c0\n",
338 | "\n",
339 | "## ATE ##\n",
340 | "ate_rs = []\n",
341 | "for i in range(1000): # Drawing 1000 bootstrapped samples\n",
342 | " d_star = data.sample(n=data.shape[0], # Same size as input data\n",
343 | " replace=True) # Draw with replacement\n",
344 | " ate_rs.append(ate_nonparm_gformula(d=d_star))\n",
345 | "\n",
346 | "print(\"95% Confidence limits for the ATE\")\n",
347 | "ci_perc = np.percentile(ate_rs, q=[2.5, 97.5])\n",
348 | "print(\"Percentile method: \", np.round(ci_perc, 6))\n",
349 | "ate_se = np.std(ate_rs, ddof=1)\n",
350 | "print(\"Normal Approx method:\", np.round([ate - 1.96*ate_se,\n",
351 | " ate + 1.96*ate_se], 6))\n",
352 | "\n",
353 | "\n",
354 | "def att_nonparm_gformula(d):\n",
355 | " \"\"\"Function to estimate the ATT using the nonparametric\n",
356 | " g-formula\"\"\"\n",
357 | " pr_c1_a1 = np.mean(d.loc[data['A'] == 1, 'C'])\n",
358 | " pr_c0_a1 = 1 - pr_c1_a1\n",
359 | "\n",
360 | " pr_y_11 = np.mean(d.loc[(d['C'] == 1) & (d['A'] == 1), 'Y'])\n",
361 | " pr_y_01 = np.mean(d.loc[(d['C'] == 1) & (d['A'] == 0), 'Y'])\n",
362 | " pr_y_10 = np.mean(d.loc[(d['C'] == 0) & (d['A'] == 1), 'Y'])\n",
363 | " pr_y_00 = np.mean(d.loc[(d['C'] == 0) & (d['A'] == 0), 'Y'])\n",
364 | " \n",
365 | " return (pr_y_11 - pr_y_01)*pr_c1_a1 + (pr_y_10 - pr_y_00)*pr_c0_a1\n",
366 | "\n",
367 | "\n",
368 | "## ATT ##\n",
369 | "att_rs = []\n",
370 | "for i in range(1000): # Drawing 1000 bootstrapped samples\n",
371 | " d_star = data.sample(n=data.shape[0], # Same size as input data\n",
372 | " replace=True) # Draw with replacement\n",
373 | " att_rs.append(att_nonparm_gformula(d=d_star))\n",
374 | "\n",
375 | "print(\"\\n95% Confidence limits for the ATT\")\n",
376 | "ci_perc = np.percentile(att_rs, q=[2.5, 97.5])\n",
377 | "print(\"Percentile method: \", np.round(ci_perc, 6))\n",
378 | "att_se = np.std(att_rs, ddof=1)\n",
379 | "print(\"Normal Approx method:\", np.round([att - 1.96*att_se,\n",
380 | " att + 1.96*att_se], 6)) "
381 | ]
382 | },
383 | {
384 | "cell_type": "code",
385 | "execution_count": 8,
386 | "metadata": {},
387 | "outputs": [
388 | {
389 | "name": "stdout",
390 | "output_type": "stream",
391 | "text": [
392 | "ATE 0.073692\n"
393 | ]
394 | }
395 | ],
396 | "source": [
397 | "# Box 7: Non-parametric g-formula using saturated regression model (A)\n",
398 | "data[\"A1\"] = np.where(data['A'] == 1, 1, 0)\n",
399 | "data[\"A0\"] = np.where(data['A'] == 0, 1, 0)\n",
400 | "fm = smf.ols(\"Y ~ A1 + A0 + A1:C + A0:C - 1\", data).fit()\n",
401 | "betas = fm.params\n",
402 | "ate = np.mean((betas[\"A1\"] + betas[\"A1:C\"]*data[\"C\"]) -\n",
403 | " (betas[\"A0\"] + betas[\"A0:C\"]*data[\"C\"]))\n",
404 | "\n",
405 | "print(\"ATE\", np.round(ate, 6))"
406 | ]
407 | },
408 | {
409 | "cell_type": "code",
410 | "execution_count": 9,
411 | "metadata": {},
412 | "outputs": [
413 | {
414 | "name": "stdout",
415 | "output_type": "stream",
416 | "text": [
417 | "ATE 0.073692\n"
418 | ]
419 | }
420 | ],
421 | "source": [
422 | "# Box 8: G-formula with saturated regression model using zEpid\n",
423 | "g_formula = zepid.causal.gformula.TimeFixedGFormula(data, \n",
424 | " exposure=\"A\", \n",
425 | " outcome=\"Y\")\n",
426 | "g_formula.outcome_model(\"A + C + A:C\", # Estimating model\n",
427 | " print_results=False)\n",
428 | "\n",
429 | "g_formula.fit(\"all\") # all sets A=1\n",
430 | "y_a1 = g_formula.marginal_outcome\n",
431 | "\n",
432 | "g_formula.fit(\"none\") # none sets A=0\n",
433 | "y_a0 = g_formula.marginal_outcome\n",
434 | "\n",
435 | "print(\"ATE\", np.round(y_a1 - y_a0, 6))"
436 | ]
437 | },
438 | {
439 | "cell_type": "markdown",
440 | "metadata": {},
441 | "source": [
442 | "### 3.2 Parametric g-formula"
443 | ]
444 | },
445 | {
446 | "cell_type": "code",
447 | "execution_count": 10,
448 | "metadata": {},
449 | "outputs": [
450 | {
451 | "name": "stdout",
452 | "output_type": "stream",
453 | "text": [
454 | "ATE 0.073692\n"
455 | ]
456 | }
457 | ],
458 | "source": [
459 | "# Box 9: Parametric g-formula by hand\n",
460 | "f = sm.families.family.Binomial() # Using logit model unlike prev\n",
461 | "fm_a1 = smf.glm(\"Y ~ C\", data.loc[data[\"A\"] == 1], family=f).fit()\n",
462 | "fm_a0 = smf.glm(\"Y ~ C\", data.loc[data[\"A\"] == 0], family=f).fit()\n",
463 | "\n",
464 | "y_a1 = fm_a1.predict(data['C'])\n",
465 | "y_a0 = fm_a0.predict(data['C'])\n",
466 | "ate = np.mean(y_a1 - y_a0)\n",
467 | "\n",
468 | "print(\"ATE\", np.round(ate, 6))"
469 | ]
470 | },
471 | {
472 | "cell_type": "code",
473 | "execution_count": 11,
474 | "metadata": {},
475 | "outputs": [
476 | {
477 | "name": "stdout",
478 | "output_type": "stream",
479 | "text": [
480 | "ATE 0.073514\n"
481 | ]
482 | }
483 | ],
484 | "source": [
485 | "# Box 10: Parametric Regression Adjustment\n",
486 | "f = sm.families.family.Binomial()\n",
487 | "fm = smf.glm(\"Y ~ A + C\", data, family=f).fit()\n",
488 | "\n",
489 | "da1 = data.copy()\n",
490 | "da1['A'] = 1\n",
491 | "y_a1 = fm.predict(da1)\n",
492 | "\n",
493 | "da0 = data.copy()\n",
494 | "da0['A'] = 0\n",
495 | "y_a0 = fm.predict(da0)\n",
496 | "ate = np.mean(y_a1 - y_a0)\n",
497 | "\n",
498 | "print(\"ATE\", np.round(ate, 6))"
499 | ]
500 | },
501 | {
502 | "cell_type": "code",
503 | "execution_count": 12,
504 | "metadata": {},
505 | "outputs": [
506 | {
507 | "name": "stdout",
508 | "output_type": "stream",
509 | "text": [
510 | "95% Confidence limits for the ATE\n",
511 | "Percentile method: [0.048036 0.100379]\n",
512 | "Normal approx method: [0.047652 0.099376]\n"
513 | ]
514 | }
515 | ],
516 | "source": [
517 | "# Box 11: Bootstrap for the parametric regression adjustment\n",
518 | "ate_rs = []\n",
519 | "for i in range(1000): # Drawing 1000 bootstrapped samples\n",
520 | " d_star = data.sample(n=data.shape[0], # Same size as input data\n",
521 | " replace=True) # Draw with replacement\n",
522 | " fm = smf.glm(\"Y ~ A + C\", d_star, family=f).fit()\n",
523 | " da = d_star.copy()\n",
524 | " da['A'] = 1\n",
525 | " y_a1 = fm.predict(da)\n",
526 | " da['A'] = 0\n",
527 | " y_a0 = fm.predict(da)\n",
528 | " ate_rs.append(np.mean(y_a1 - y_a0))\n",
529 | "\n",
530 | "print(\"95% Confidence limits for the ATE\")\n",
531 | "ci_perc = np.percentile(ate_rs, q=[2.5, 97.5])\n",
532 | "print(\"Percentile method: \", np.round(ci_perc, 6))\n",
533 | "ate_se = np.std(ate_rs, ddof=1)\n",
534 | "print(\"Normal approx method:\", np.round([ate - 1.96*ate_se,\n",
535 | " ate + 1.96*ate_se], 6))"
536 | ]
537 | },
538 | {
539 | "cell_type": "code",
540 | "execution_count": 13,
541 | "metadata": {},
542 | "outputs": [
543 | {
544 | "name": "stdout",
545 | "output_type": "stream",
546 | "text": [
547 | "ATE 0.083929\n"
548 | ]
549 | }
550 | ],
551 | "source": [
552 | "# Box 12: Parametric multivariate regression adjustment implementation\n",
553 | "f = sm.families.family.Binomial()\n",
554 | "fm_a1 = smf.glm(\"Y ~ C + W1 + W2 + W3 + W4\", \n",
555 | " data.loc[data[\"A\"] == 1], family=f).fit()\n",
556 | "fm_a0 = smf.glm(\"Y ~ C + W1 + W2 + W3 + W4\", \n",
557 | " data.loc[data[\"A\"] == 0], family=f).fit()\n",
558 | "\n",
559 | "y_a1 = fm_a1.predict(data)\n",
560 | "y_a0 = fm_a0.predict(data)\n",
561 | "ate = np.mean(y_a1 - y_a0)\n",
562 | "\n",
563 | "print(\"ATE\", np.round(ate, 6))"
564 | ]
565 | },
566 | {
567 | "cell_type": "code",
568 | "execution_count": 14,
569 | "metadata": {},
570 | "outputs": [
571 | {
572 | "name": "stdout",
573 | "output_type": "stream",
574 | "text": [
575 | "ATE 0.083929\n"
576 | ]
577 | }
578 | ],
579 | "source": [
580 | "# Box 13: Multivariate regression with zEpid\n",
581 | "g_formula = zepid.causal.gformula.TimeFixedGFormula(data, \n",
582 | " exposure=\"A\", \n",
583 | " outcome=\"Y\")\n",
584 | "g_formula.outcome_model(\"A + C + W1 + W2 + W3 + W4 + \"\n",
585 | " \"A:C + A:W1 + A:W2 + A:W3 + A:W4\",\n",
586 | " print_results=False)\n",
587 | "\n",
588 | "g_formula.fit(\"all\") # all sets A=1\n",
589 | "y_a1 = g_formula.marginal_outcome\n",
590 | "\n",
591 | "g_formula.fit(\"none\") # none sets A=0\n",
592 | "y_a0 = g_formula.marginal_outcome\n",
593 | "\n",
594 | "print(\"ATE\", np.round(y_a1 - y_a0, 6))"
595 | ]
596 | },
597 | {
598 | "cell_type": "code",
599 | "execution_count": 15,
600 | "metadata": {},
601 | "outputs": [],
602 | "source": [
603 | "# Box 14: Not Available for Python\n",
604 | "# zEpid does not support two versions of the parametric g-formula"
605 | ]
606 | },
607 | {
608 | "cell_type": "code",
609 | "execution_count": 16,
610 | "metadata": {},
611 | "outputs": [
612 | {
613 | "name": "stdout",
614 | "output_type": "stream",
615 | "text": [
616 | "95% Confidence limits for the ATE\n",
617 | "Percentile method: [0.059649 0.106662]\n",
618 | "Normal approx method: [0.058851 0.109006]\n"
619 | ]
620 | }
621 | ],
622 | "source": [
623 | "# Box 15: Bootstrap for multivariate adjustment\n",
624 | "ate_rs = []\n",
625 | "for i in range(1000): # Drawing 1000 bootstrapped samples\n",
626 | " d_star = data.sample(n=data.shape[0], # Same size as input data\n",
627 | " replace=True) # Draw with replacement\n",
628 | " fm_a1 = smf.glm(\"Y ~ C + W1 + W2 + W3 + W4\", \n",
629 | " d_star.loc[d_star[\"A\"] == 1], \n",
630 | " family=f).fit()\n",
631 | " fm_a0 = smf.glm(\"Y ~ C + W1 + W2 + W3 + W4\", \n",
632 | " d_star.loc[d_star[\"A\"] == 0], \n",
633 | " family=f).fit()\n",
634 | " ate_rs.append(np.mean(fm_a1.predict(data) - fm_a0.predict(data)))\n",
635 | "\n",
636 | "print(\"95% Confidence limits for the ATE\")\n",
637 | "ci_perc = np.percentile(ate_rs, q=[2.5, 97.5])\n",
638 | "print(\"Percentile method: \", np.round(ci_perc, 6))\n",
639 | "ate_se = np.std(ate_rs, ddof=1)\n",
640 | "print(\"Normal approx method:\", np.round([ate - 1.96*ate_se,\n",
641 | " ate + 1.96*ate_se], 6))"
642 | ]
643 | },
644 | {
645 | "cell_type": "code",
646 | "execution_count": 17,
647 | "metadata": {},
648 | "outputs": [
649 | {
650 | "name": "stdout",
651 | "output_type": "stream",
652 | "text": [
653 | "RR 1.2766\n"
654 | ]
655 | }
656 | ],
657 | "source": [
658 | "# Box 16: Computing the parametric marginal risk ratio\n",
659 | "f = sm.families.family.Binomial()\n",
660 | "fm_a1 = smf.glm(\"Y ~ C + W1 + W2 + W3 + W4\", \n",
661 | " data.loc[data[\"A\"] == 1], family=f).fit()\n",
662 | "fm_a0 = smf.glm(\"Y ~ C + W1 + W2 + W3 + W4\", \n",
663 | " data.loc[data[\"A\"] == 0], family=f).fit()\n",
664 | "\n",
665 | "y_a1 = fm_a1.predict(data)\n",
666 | "y_a0 = fm_a0.predict(data)\n",
667 | "risk_ratio = np.mean(y_a1) / np.mean(y_a0)\n",
668 | "\n",
669 | "print(\"RR\", np.round(risk_ratio, 4))"
670 | ]
671 | },
672 | {
673 | "cell_type": "markdown",
674 | "metadata": {},
675 | "source": [
676 | "## 4. Inverse Probability of Treatment Weighting\n",
677 | "\n",
678 | "### 4.1 Inverse probability of treatment weighting based on the propensity score plus regression adjustment"
679 | ]
680 | },
681 | {
682 | "cell_type": "code",
683 | "execution_count": 18,
684 | "metadata": {},
685 | "outputs": [
686 | {
687 | "name": "stdout",
688 | "output_type": "stream",
689 | "text": [
690 | "ATE 0.083294\n"
691 | ]
692 | }
693 | ],
694 | "source": [
695 | "# Box 17: Computation of the IPTW estimator for the ATE\n",
696 | "f = sm.families.family.Binomial()\n",
697 | "fm_pa = smf.glm(\"A ~ C + W1 + W2 + W3 + W4\", \n",
698 | " data, family=f).fit()\n",
699 | "p_score = fm_pa.predict(data) # Calculating propensity scores\n",
700 | "data['p_score'] = p_score\n",
701 | "\n",
702 | "iptw = 1 / np.where(data['A'] == 1, p_score, 1 - p_score) # IPTW\n",
703 | "data['iptw'] = iptw\n",
704 | "\n",
705 | "d_a1 = data.loc[data[\"A\"] == 1].copy()\n",
706 | "d_a0 = data.loc[data[\"A\"] == 0].copy()\n",
707 | "ate = (np.average(d_a1['Y'], weights=d_a1['iptw']) - \n",
708 | " np.average(d_a0['Y'], weights=d_a0['iptw']))\n",
709 | "print(\"ATE\", np.round(ate, 6))"
710 | ]
711 | },
712 | {
713 | "cell_type": "code",
714 | "execution_count": 19,
715 | "metadata": {},
716 | "outputs": [
717 | {
718 | "name": "stdout",
719 | "output_type": "stream",
720 | "text": [
721 | "95% Confidence limits for the ATE\n",
722 | "Percentile method: [0.057431 0.106584]\n",
723 | "Normal approx method: [0.058198 0.10839 ]\n"
724 | ]
725 | }
726 | ],
727 | "source": [
728 | "# Box 18: Bootstrap computation for the IPTW estimator\n",
729 | "ate_rs = []\n",
730 | "for i in range(1000): # Drawing 1000 bootstrapped samples\n",
731 | " d_star = data.sample(n=data.shape[0], # Same size as input data\n",
732 | " replace=True) # Draw with replacement\n",
733 | " fm_pa = smf.glm(\"A ~ C + W1 + W2 + W3 + W4\", \n",
734 | " d_star, family=f).fit()\n",
735 | " ps_score = fm_pa.predict(d_star) # Calculating propensity scores\n",
736 | " d_star['iptw'] = 1 / np.where(d_star['A'] == 1, \n",
737 | " ps_score, 1 - ps_score) \n",
738 | " ds_a1 = d_star.loc[d_star[\"A\"] == 1].copy()\n",
739 | " ds_a0 = d_star.loc[d_star[\"A\"] == 0].copy()\n",
740 | " ate_rs.append(np.average(ds_a1['Y'], weights=ds_a1['iptw']) - \n",
741 | " np.average(ds_a0['Y'], weights=ds_a0['iptw']))\n",
742 | "\n",
743 | "print(\"95% Confidence limits for the ATE\")\n",
744 | "ci_perc = np.percentile(ate_rs, q=[2.5, 97.5])\n",
745 | "print(\"Percentile method: \", np.round(ci_perc, 6))\n",
746 | "ate_se = np.std(ate_rs, ddof=1)\n",
747 | "print(\"Normal approx method:\", np.round([ate - 1.96*ate_se,\n",
748 | " ate + 1.96*ate_se], 6))"
749 | ]
750 | },
751 | {
752 | "cell_type": "code",
753 | "execution_count": 20,
754 | "metadata": {},
755 | "outputs": [
756 | {
757 | "name": "stdout",
758 | "output_type": "stream",
759 | "text": [
760 | " RD SE(RD) 95%LCL 95%UCL\n",
761 | "labels \n",
762 | "Intercept 0.303444 0.007712 0.288328 0.318559\n",
763 | "A 0.083294 0.013046 0.057723 0.108864\n"
764 | ]
765 | }
766 | ],
767 | "source": [
768 | "# Box 19: IPTW estimator using zEpid\n",
769 | "ipw = zepid.causal.ipw.IPTW(data, treatment=\"A\", outcome=\"Y\")\n",
770 | "ipw.treatment_model(\"C + W1 + W2 + W3 + W4\", \n",
771 | " stabilized=False, # Set to True for stabilized\n",
772 | " print_results=False)\n",
773 | "ipw.marginal_structural_model(\"A\")\n",
774 | "ipw.fit()\n",
775 | "print(ipw.risk_difference)"
776 | ]
777 | },
778 | {
779 | "cell_type": "code",
780 | "execution_count": 21,
781 | "metadata": {},
782 | "outputs": [
783 | {
784 | "name": "stdout",
785 | "output_type": "stream",
786 | "text": [
787 | " Confounder Raw Weighted\n",
788 | "2 C 0.093144 0.000325\n",
789 | "3 W1 -0.061352 -0.003754\n",
790 | "4 W2 0.091364 -0.002439\n",
791 | "0 W3 0.035606 0.002426\n",
792 | "1 W4 0.071853 0.000404\n"
793 | ]
794 | },
795 | {
796 | "data": {
797 | "image/png": "\n",
798 | "text/plain": [
799 | ""
800 | ]
801 | },
802 | "metadata": {
803 | "needs_background": "light"
804 | },
805 | "output_type": "display_data"
806 | }
807 | ],
808 | "source": [
809 | "# Box 20: Assessing IPTW balance\n",
810 | "rename_cols = {\"smd_w\": \"Weighted\", \"smd_u\": \"Raw\", \n",
811 | " \"labels\": \"Confounder\"}\n",
812 | "\n",
813 | "smd = ipw.standardized_mean_differences().rename(columns=rename_cols)\n",
814 | "smd = smd.sort_values(by='Confounder')\n",
815 | "print(smd[['Confounder', 'Raw', 'Weighted']])\n",
816 | "\n",
817 | "# zEpid plotting functionality\n",
818 | "ipw.plot_love()\n",
819 | "plt.show()"
820 | ]
821 | },
822 | {
823 | "cell_type": "code",
824 | "execution_count": 22,
825 | "metadata": {},
826 | "outputs": [
827 | {
828 | "data": {
829 | "image/png": "\n",
830 | "text/plain": [
831 | ""
832 | ]
833 | },
834 | "metadata": {
835 | "needs_background": "light"
836 | },
837 | "output_type": "display_data"
838 | }
839 | ],
840 | "source": [
841 | "# Box 21: Assessing IPTW overlap by hand\n",
842 | "density_t = gaussian_kde(1 - data.loc[data[\"A\"] == 1, 'p_score'])\n",
843 | "density_u = gaussian_kde(1 - data.loc[data[\"A\"] == 0, 'p_score'])\n",
844 | "\n",
845 | "x = np.linspace(0, 1, 10000)\n",
846 | "\n",
847 | "ax = plt.gca()\n",
848 | "ax.fill_between(x, density_t(x), color=\"b\", alpha=0.2, label=None)\n",
849 | "ax.plot(x, density_t(x), color=\"b\", label='RHC = Y')\n",
850 | "ax.fill_between(x, density_u(x), color=\"r\", alpha=0.2, label=None)\n",
851 | "ax.plot(x, density_u(x), color=\"r\", label='RHC = N')\n",
852 | "ax.set_ylim([0, 10])\n",
853 | "ax.set_ylabel(\"density\")\n",
854 | "ax.set_xlim([0.45, 0.8])\n",
855 | "ax.set_xlabel(\"1 - Propensity Score\")\n",
856 | "ax.legend()\n",
857 | "plt.show()"
858 | ]
859 | },
860 | {
861 | "cell_type": "code",
862 | "execution_count": 23,
863 | "metadata": {},
864 | "outputs": [
865 | {
866 | "data": {
867 | "image/png": "\n",
868 | "text/plain": [
869 | ""
870 | ]
871 | },
872 | "metadata": {
873 | "needs_background": "light"
874 | },
875 | "output_type": "display_data"
876 | }
877 | ],
878 | "source": [
879 | "# Box 22: Assessing IPTW overlap using zEpid\n",
880 | "ipw.plot_kde()\n",
881 | "plt.ylim([0, 10])\n",
882 | "plt.xlim([0.2, 0.55])\n",
883 | "plt.show()"
884 | ]
885 | },
886 | {
887 | "cell_type": "markdown",
888 | "metadata": {},
889 | "source": [
890 | "### 4.2 Marginal structural model with stabilised weights"
891 | ]
892 | },
893 | {
894 | "cell_type": "code",
895 | "execution_count": 24,
896 | "metadata": {},
897 | "outputs": [
898 | {
899 | "name": "stderr",
900 | "output_type": "stream",
901 | "text": [
902 | "/home/pzivich/.pyenv/versions/3.6.5/lib/python3.6/site-packages/statsmodels/genmod/generalized_estimating_equations.py:501: DomainWarning: The identity link function does not respect the domain of the Binomial family.\n",
903 | " DomainWarning)\n",
904 | "/home/pzivich/.pyenv/versions/3.6.5/lib/python3.6/site-packages/statsmodels/genmod/generalized_linear_model.py:278: DomainWarning: The identity link function does not respect the domain of the Binomial family.\n",
905 | " DomainWarning)\n"
906 | ]
907 | },
908 | {
909 | "name": "stdout",
910 | "output_type": "stream",
911 | "text": [
912 | "Unstabilized Weights\n",
913 | "ATE 0.083294\n",
914 | "95% CL [0.05772325 0.10886425]\n"
915 | ]
916 | },
917 | {
918 | "name": "stderr",
919 | "output_type": "stream",
920 | "text": [
921 | "/home/pzivich/.pyenv/versions/3.6.5/lib/python3.6/site-packages/statsmodels/genmod/generalized_estimating_equations.py:501: DomainWarning: The identity link function does not respect the domain of the Binomial family.\n",
922 | " DomainWarning)\n",
923 | "/home/pzivich/.pyenv/versions/3.6.5/lib/python3.6/site-packages/statsmodels/genmod/generalized_linear_model.py:278: DomainWarning: The identity link function does not respect the domain of the Binomial family.\n",
924 | " DomainWarning)\n"
925 | ]
926 | },
927 | {
928 | "name": "stdout",
929 | "output_type": "stream",
930 | "text": [
931 | "\n",
932 | "Stabilized Weights\n",
933 | "ATE 0.083294\n",
934 | "95% CL [0.05772325 0.10886425]\n"
935 | ]
936 | }
937 | ],
938 | "source": [
939 | "# Box 23: Computation of the IPTW estimator using a MSM\n",
940 | "\n",
941 | "### Unstabilized IPTW ###\n",
942 | "fm_pa = smf.glm(\"A ~ C + W1 + W2 + W3 + W4\", \n",
943 | " data, family=f).fit()\n",
944 | "p_score = fm_pa.predict(data) # Calculating propensity scores\n",
945 | "iptw = 1 / np.where(data['A'] == 1, p_score, 1 - p_score) # IPTW\n",
946 | "# Estimating Marginal Structural Model\n",
947 | "f = sm.families.family.Binomial(sm.families.links.identity())\n",
948 | "fm = smf.gee(\"Y ~ A\", data.index, data,\n",
949 | " cov_struct=sm.cov_struct.Independence(), \n",
950 | " family=f, weights=iptw).fit()\n",
951 | "print(\"Unstabilized Weights\")\n",
952 | "print(\"ATE \", np.round(fm.params['A'], 6))\n",
953 | "print(\"95% CL\", np.asarray(fm.conf_int().loc[\"A\"]))\n",
954 | "\n",
955 | "### Stabilized IPTW ###\n",
956 | "f = sm.families.family.Binomial()\n",
957 | "# Numerator\n",
958 | "fm_ma = smf.glm(\"A ~ 1\", data, family=f).fit()\n",
959 | "num = np.where(data['A'] == 1, fm_ma.predict(data), \n",
960 | " 1 - fm_ma.predict(data))\n",
961 | "# Denominator\n",
962 | "fm_pa = smf.glm(\"A ~ C + W1 + W2 + W3 + W4\", \n",
963 | " data, family=f).fit()\n",
964 | "den = np.where(data['A'] == 1, fm_pa.predict(data), \n",
965 | " 1 - fm_pa.predict(data))\n",
966 | "# IPTW\n",
967 | "iptw = num / den\n",
968 | "# Estimating Marginal Structural Model\n",
969 | "f = sm.families.family.Binomial(sm.families.links.identity())\n",
970 | "fm = smf.gee(\"Y ~ A\", data.index, data,\n",
971 | " cov_struct=sm.cov_struct.Independence(), \n",
972 | " family=f, weights=iptw).fit()\n",
973 | "print(\"\\nStabilized Weights\")\n",
974 | "print(\"ATE \", np.round(fm.params['A'], 6))\n",
975 | "print(\"95% CL\", np.asarray(fm.conf_int().loc[\"A\"]))"
976 | ]
977 | },
978 | {
979 | "cell_type": "markdown",
980 | "metadata": {},
981 | "source": [
982 | "### 4.3 IPTW with regression adjustment"
983 | ]
984 | },
985 | {
986 | "cell_type": "code",
987 | "execution_count": 25,
988 | "metadata": {},
989 | "outputs": [
990 | {
991 | "name": "stdout",
992 | "output_type": "stream",
993 | "text": [
994 | "ATE 0.083929\n",
995 | "ATE 0.083426\n"
996 | ]
997 | }
998 | ],
999 | "source": [
1000 | "# Box 24: Computation of the IPTW-RA estimator\n",
1001 | "f = sm.families.family.Binomial()\n",
1002 | "\n",
1003 | "fm_a1 = smf.glm(\"Y ~ C + W1 + W2 + W3 + W4\", \n",
1004 | " data.loc[data[\"A\"] == 1], # Only A=1\n",
1005 | " weights=data.loc[data[\"A\"] == 1, 'iptw'], # Box 17\n",
1006 | " family=f).fit()\n",
1007 | "y_a1 = fm_a1.predict(data)\n",
1008 | "\n",
1009 | "fm_a0 = smf.glm(\"Y ~ C + W1 + W2 + W3 + W4\", \n",
1010 | " data.loc[data[\"A\"] == 0], # Only A=0\n",
1011 | " weights=data.loc[data[\"A\"] == 0, 'iptw'], # Box 17\n",
1012 | " family=f).fit()\n",
1013 | "y_a0 = fm_a0.predict(data)\n",
1014 | "\n",
1015 | "ate = np.mean(y_a1 - y_a0)\n",
1016 | "print(\"ATE\", np.round(ate, 6))\n",
1017 | "ate = (np.mean(data['iptw']*data['A']*y_a1) / np.mean(data['iptw']*data['A']) - \n",
1018 | " np.mean(data['iptw']*(1-data['A'])*y_a0) / np.mean(data['iptw']*(1-data['A'])))\n",
1019 | "print(\"ATE\", np.round(ate, 6))"
1020 | ]
1021 | },
1022 | {
1023 | "cell_type": "code",
1024 | "execution_count": 26,
1025 | "metadata": {},
1026 | "outputs": [],
1027 | "source": [
1028 | "# Box 25: IPTW-RA \n",
1029 | "# Not supported by zEpid"
1030 | ]
1031 | },
1032 | {
1033 | "cell_type": "markdown",
1034 | "metadata": {},
1035 | "source": [
1036 | "## 5. Augmented Inverse Probability Weighting"
1037 | ]
1038 | },
1039 | {
1040 | "cell_type": "code",
1041 | "execution_count": 27,
1042 | "metadata": {},
1043 | "outputs": [
1044 | {
1045 | "name": "stdout",
1046 | "output_type": "stream",
1047 | "text": [
1048 | "ATE 0.083796\n",
1049 | "95% Confidence limits for the ATE\n",
1050 | "Percentile method: [0.058572 0.109738]\n",
1051 | "Normal approx method: [0.058901 0.108691]\n"
1052 | ]
1053 | }
1054 | ],
1055 | "source": [
1056 | "# Box 26: Computation of the AIPTW estimator\n",
1057 | "f = sm.families.family.Binomial()\n",
1058 | "\n",
1059 | "# Step 1: g-computation\n",
1060 | "fm_a1 = smf.glm(\"Y ~ C + W1 + W2 + W3 + W4\", \n",
1061 | " data.loc[data[\"A\"] == 1], family=f).fit()\n",
1062 | "fm_a0 = smf.glm(\"Y ~ C + W1 + W2 + W3 + W4\", \n",
1063 | " data.loc[data[\"A\"] == 0], family=f).fit()\n",
1064 | "y_a1 = fm_a1.predict(data)\n",
1065 | "y_a0 = fm_a0.predict(data)\n",
1066 | "\n",
1067 | "# Step 2: propensity scores\n",
1068 | "fm_pa = smf.glm(\"A ~ C + W1 + W2 + W3 + W4\", \n",
1069 | " data, family=f).fit()\n",
1070 | "p_score = fm_pa.predict(data)\n",
1071 | "\n",
1072 | "# Step 3: analytic formula\n",
1073 | "ys_a1 = ((data['A'] * data['Y'])/ (p_score) + \n",
1074 | " (y_a1*(p_score-data['A'])) / p_score)\n",
1075 | "ys_a0 = (((1-data['A']) * data['Y'])/ (1-p_score) + \n",
1076 | " (y_a0*(data['A']-p_score)) / (1-p_score))\n",
1077 | "ate = np.mean(ys_a1 - ys_a0)\n",
1078 | "print(\"ATE\", np.round(ate, 6))\n",
1079 | "\n",
1080 | "# Step 4: bootstrap for inference\n",
1081 | "ate_rs = []\n",
1082 | "for i in range(1000):\n",
1083 | " d_star = data.sample(n=data.shape[0], # Same size as input data\n",
1084 | " replace=True) # Draw with replacement\n",
1085 | " fm_a1 = smf.glm(\"Y ~ C + W1 + W2 + W3 + W4\", \n",
1086 | " d_star.loc[d_star[\"A\"] == 1], family=f).fit()\n",
1087 | " fm_a0 = smf.glm(\"Y ~ C + W1 + W2 + W3 + W4\", \n",
1088 | " d_star.loc[d_star[\"A\"] == 0], family=f).fit()\n",
1089 | " y_a1 = fm_a1.predict(d_star)\n",
1090 | " y_a0 = fm_a0.predict(d_star)\n",
1091 | " fm_pa = smf.glm(\"A ~ C + W1 + W2 + W3 + W4\", \n",
1092 | " d_star, family=f).fit()\n",
1093 | " p_score = fm_pa.predict(d_star)\n",
1094 | " ys_a1 = ((d_star['A'] * d_star['Y'])/ (p_score) + \n",
1095 | " (y_a1*(p_score-d_star['A'])) / p_score)\n",
1096 | " ys_a0 = (((1-d_star['A']) * d_star['Y'])/ (1-p_score) + \n",
1097 | " (y_a0*(d_star['A']-p_score)) / (1-p_score))\n",
1098 | " ate_rs.append(np.mean(ys_a1 - ys_a0))\n",
1099 | "\n",
1100 | "\n",
1101 | "print(\"95% Confidence limits for the ATE\")\n",
1102 | "ci_perc = np.percentile(ate_rs, q=[2.5, 97.5])\n",
1103 | "print(\"Percentile method: \", np.round(ci_perc, 6))\n",
1104 | "ate_se = np.std(ate_rs, ddof=1)\n",
1105 | "print(\"Normal approx method:\", np.round([ate - 1.96*ate_se,\n",
1106 | " ate + 1.96*ate_se], 6))"
1107 | ]
1108 | },
1109 | {
1110 | "cell_type": "code",
1111 | "execution_count": 28,
1112 | "metadata": {},
1113 | "outputs": [
1114 | {
1115 | "name": "stdout",
1116 | "output_type": "stream",
1117 | "text": [
1118 | "ATE 0.083796\n",
1119 | "95% CL [0.058546 0.109046]\n"
1120 | ]
1121 | }
1122 | ],
1123 | "source": [
1124 | "# Box 27: AIPTW estimator with zEpid\n",
1125 | "aipw = zepid.causal.doublyrobust.AIPTW(data, \n",
1126 | " exposure=\"A\", \n",
1127 | " outcome=\"Y\")\n",
1128 | "aipw.exposure_model(\"C + W1 + W2 + W3 + W4\", \n",
1129 | " print_results=False)\n",
1130 | "aipw.outcome_model(\"A + C + W1 + W2 + W3 + W4 + \"\n",
1131 | " \"A:C + A:W1 + A:W2 + A:W3 + A:W4\", \n",
1132 | " print_results=False)\n",
1133 | "aipw.fit()\n",
1134 | "\n",
1135 | "print(\"ATE \", np.round(aipw.risk_difference, 6))\n",
1136 | "print(\"95% CL\", np.round(aipw.risk_difference_ci, 6))\n",
1137 | "# zEpid calculates the variance using influence curves"
1138 | ]
1139 | },
1140 | {
1141 | "cell_type": "markdown",
1142 | "metadata": {},
1143 | "source": [
1144 | "## 6. Data-Adaptive Estimation: Ensemble Learning Targeted Maximum Likelihood Estimation"
1145 | ]
1146 | },
1147 | {
1148 | "cell_type": "code",
1149 | "execution_count": 29,
1150 | "metadata": {},
1151 | "outputs": [
1152 | {
1153 | "name": "stdout",
1154 | "output_type": "stream",
1155 | "text": [
1156 | "ATE 0.083796\n",
1157 | "95% CL [0.058546 0.109047]\n"
1158 | ]
1159 | }
1160 | ],
1161 | "source": [
1162 | "# Box 28: Computation of TMLE by hand\n",
1163 | "f = sm.families.family.Binomial()\n",
1164 | "n = data.shape[0]\n",
1165 | "\n",
1166 | "# Step 1: g-computation\n",
1167 | "fm_a1 = smf.glm(\"Y ~ C + W1 + W2 + W3 + W4\", \n",
1168 | " data.loc[data[\"A\"] == 1], family=f).fit()\n",
1169 | "fm_a0 = smf.glm(\"Y ~ C + W1 + W2 + W3 + W4\", \n",
1170 | " data.loc[data[\"A\"] == 0], family=f).fit()\n",
1171 | "y_a1 = fm_a1.predict(data)\n",
1172 | "y_a0 = fm_a0.predict(data)\n",
1173 | "y_a_ = np.where(data['A'] == 1, y_a1, y_a0)\n",
1174 | "\n",
1175 | "# Step 2: propensity scores\n",
1176 | "fm_pa = smf.glm(\"A ~ C + W1 + W2 + W3 + W4\", \n",
1177 | " data, family=f).fit()\n",
1178 | "p_score = fm_pa.predict(data)\n",
1179 | "\n",
1180 | "# Step 3: targeting step\n",
1181 | "logodds_y1 = np.log(probability_to_odds(y_a1))\n",
1182 | "logodds_y0 = np.log(probability_to_odds(y_a0))\n",
1183 | "logodds_ya = np.log(probability_to_odds(y_a_))\n",
1184 | "\n",
1185 | "clever_cov_a1 = data['A']/p_score\n",
1186 | "clever_cov_a0 = - (1-data['A'])/(1-p_score)\n",
1187 | "\n",
1188 | "submodel = sm.GLM(data['Y'], \n",
1189 | " np.column_stack((clever_cov_a1, clever_cov_a0)), \n",
1190 | " offset=logodds_ya,\n",
1191 | " family=f).fit()\n",
1192 | "epsilon = submodel.params\n",
1193 | "\n",
1194 | "# Step 4: calculating ATE\n",
1195 | "ys_a1 = logistic.cdf(logodds_y1 + epsilon[0] / p_score)\n",
1196 | "ys_a0 = logistic.cdf(logodds_y0 - epsilon[1] / (1-p_score))\n",
1197 | "ate = np.mean(ys_a1 - ys_a0)\n",
1198 | "print(\"ATE\", np.round(ate, 6))\n",
1199 | "\n",
1200 | "# Step 5: inference via influence curve\n",
1201 | "ic = (clever_cov_a1 + clever_cov_a0) * (data['Y'] - y_a_) + (y_a1 - y_a0) - ate\n",
1202 | "sd = np.sqrt(np.nanvar(ic, ddof=1) / n)\n",
1203 | "cl = [ate - 1.96*sd, ate + 1.96*sd]\n",
1204 | "print(\"95% CL\", np.round(cl, 6))"
1205 | ]
1206 | },
1207 | {
1208 | "cell_type": "code",
1209 | "execution_count": 30,
1210 | "metadata": {},
1211 | "outputs": [
1212 | {
1213 | "name": "stdout",
1214 | "output_type": "stream",
1215 | "text": [
1216 | "ATE 0.083796\n",
1217 | "95% CL [0.058546 0.109047]\n"
1218 | ]
1219 | }
1220 | ],
1221 | "source": [
1222 | "# Box 29: TMLE with zEpid\n",
1223 | "tmle = zepid.causal.doublyrobust.TMLE(data, \n",
1224 | " exposure=\"A\", \n",
1225 | " outcome=\"Y\")\n",
1226 | "tmle.exposure_model(\"C + W1 + W2 + W3 + W4\", \n",
1227 | " print_results=False)\n",
1228 | "tmle.outcome_model(\"A + C + W1 + W2 + W3 + W4 + \"\n",
1229 | " \"A:C + A:W1 + A:W2 + A:W3 + A:W4\", \n",
1230 | " print_results=False)\n",
1231 | "tmle.fit()\n",
1232 | "\n",
1233 | "print(\"ATE \", np.round(tmle.risk_difference, 6))\n",
1234 | "print(\"95% CL\", np.round(tmle.risk_difference_ci, 6))"
1235 | ]
1236 | },
1237 | {
1238 | "cell_type": "markdown",
1239 | "metadata": {},
1240 | "source": [
1241 | "## 7. Simulation"
1242 | ]
1243 | },
1244 | {
1245 | "cell_type": "code",
1246 | "execution_count": 31,
1247 | "metadata": {},
1248 | "outputs": [],
1249 | "source": [
1250 | "def data_generator(n, true_ate=False):\n",
1251 | " \"\"\"Function to generate data consisting of `n` observations\"\"\"\n",
1252 | " d = pd.DataFrame()\n",
1253 | " # Confounders\n",
1254 | " d['W1'] = np.round(np.random.uniform(low=1, high=5, size=n))\n",
1255 | " d['W2'] = np.random.binomial(n=1, p=0.45, size=n)\n",
1256 | " d['W3'] = np.round(np.random.uniform(low=0, high=1, size=n)\n",
1257 | " + 0.8*d['W1'] + 0.75*d['W2'])\n",
1258 | " d['W3'] = np.where(d['W3'] > 4, 1, d['W3'])\n",
1259 | " d['W4'] = np.round(np.random.uniform(low=0, high=1, size=n)\n",
1260 | " + 0.2*d['W1'] + 0.75*d['W2'])\n",
1261 | " # Treatment\n",
1262 | " pr_a = logistic.cdf(-1 - 0.15*d['W4'] + 1.5*d['W2'] + \n",
1263 | " 0.75*d['W3'] + 0.25*d['W1'] + \n",
1264 | " 0.8*d['W2']*d['W4'])\n",
1265 | " d['A'] = np.random.binomial(n=1, \n",
1266 | " p=pr_a, \n",
1267 | " size=n)\n",
1268 | " # Potential outcomes\n",
1269 | " pr_y1 = logistic.cdf(-3 + 1 + 0.25*d['W4'] + 0.75*d['W3'] + \n",
1270 | " 0.8*d['W2']*d['W4'] + 0.05*d['W1'])\n",
1271 | " y1 = np.random.binomial(n=1, p=pr_y1, size=n)\n",
1272 | " pr_y0 = logistic.cdf(-3 + 0 + 0.25*d['W4'] + 0.75*d['W3'] + \n",
1273 | " 0.8*d['W2']*d['W4'] + 0.05*d['W1'])\n",
1274 | " y0 = np.random.binomial(n=1, p=pr_y0, size=n)\n",
1275 | " # Causal consistency\n",
1276 | " d['Y'] = np.where(d['A'] == 1, y1, y0)\n",
1277 | " # Return generated data\n",
1278 | " if true_ate:\n",
1279 | " return np.mean(y1 - y0), np.mean(y1) / np.mean(y0)\n",
1280 | " else:\n",
1281 | " return d\n",
1282 | "\n",
1283 | "\n",
1284 | "# Generating true value from super-population\n",
1285 | "true_ate, true_rr = data_generator(n=1000000, true_ate=True)\n",
1286 | "\n",
1287 | "# Setting up SuperLearner libraries\n",
1288 | "from zepid.superlearner import SuperLearner, StepwiseSL\n",
1289 | "from sklearn.linear_model import LogisticRegression\n",
1290 | "from pygam import LogisticGAM, f, s\n",
1291 | "\n",
1292 | "import warnings\n",
1293 | "warnings.simplefilter('ignore', RuntimeWarning) # Hides some NumPy errors for sparse models\n",
1294 | "\n",
1295 | "family = sm.families.family.Binomial()\n",
1296 | "\n",
1297 | "sl_library_main = [LogisticRegression(penalty='none', solver='lbfgs'),\n",
1298 | " StepwiseSL(family, selection=\"backward\"), \n",
1299 | " StepwiseSL(family, selection=\"forward\", order_interaction=1)]\n",
1300 | "sl_main_labs = [\"LogR\", \"Step.zero\", \"Step.one\"]\n",
1301 | "sl_main = SuperLearner(sl_library_main, sl_main_labs, folds=5,\n",
1302 | " loss_function='nloglik')\n",
1303 | "\n",
1304 | "sl_library_alt = [LogisticRegression(penalty='none', solver='lbfgs'),\n",
1305 | " StepwiseSL(family, selection=\"backward\"), \n",
1306 | " StepwiseSL(family, selection=\"forward\", order_interaction=1),\n",
1307 | " LogisticGAM(f(0) + s(1) + f(2) + s(3) + s(4), \n",
1308 | " lam=0.6)]\n",
1309 | "sl_alt_labs = [\"LogR\", \"Step.zero\", \"Step.one\", \"GAM\"]\n",
1310 | "sl_alt = SuperLearner(sl_library_alt, sl_alt_labs, folds=5,\n",
1311 | " loss_function='nloglik')\n",
1312 | "\n",
1313 | "# Simulation\n",
1314 | "naive_rd, gform_rd, iptw_rd, aipw_rd, tmle_rd = [], [], [], [], []\n",
1315 | "aipw_slm_rd, aipw_sla_rd, tmle_slm_rd, tmle_sla_rd = [], [], [], []\n",
1316 | "\n",
1317 | "for i in range(1000): \n",
1318 | " data = data_generator(n=1000)\n",
1319 | "\n",
1320 | " # Naive\n",
1321 | " fm = smf.ols(\"Y ~ A + W1 + W2 + W3 + W4\", data).fit()\n",
1322 | " naive_rd.append(fm.params['A'])\n",
1323 | " \n",
1324 | " # G-formula\n",
1325 | " g_formula = zepid.causal.gformula.TimeFixedGFormula(data, \n",
1326 | " exposure=\"A\", \n",
1327 | " outcome=\"Y\")\n",
1328 | " g_formula.outcome_model(\"A + W1 + W2 + W3 + W4\",\n",
1329 | " print_results=False)\n",
1330 | " g_formula.fit(\"all\") # all sets A=1\n",
1331 | " y_a1 = g_formula.marginal_outcome\n",
1332 | " g_formula.fit(\"none\") # none sets A=0\n",
1333 | " y_a0 = g_formula.marginal_outcome\n",
1334 | " gform_rd.append(y_a1 - y_a0)\n",
1335 | " \n",
1336 | " # IPTW\n",
1337 | " ipw = zepid.causal.ipw.IPTW(data, treatment=\"A\", outcome=\"Y\")\n",
1338 | " ipw.treatment_model(\"W1 + W2 + W3 + W4\", bound=0.01,\n",
1339 | " stabilized=True, print_results=False)\n",
1340 | " ipw.marginal_structural_model(\"A\")\n",
1341 | " ipw.fit()\n",
1342 | " iptw_rd.append(ipw.risk_difference)\n",
1343 | "\n",
1344 | " # AIPW\n",
1345 | " aipw = zepid.causal.doublyrobust.AIPTW(data, \n",
1346 | " exposure=\"A\", \n",
1347 | " outcome=\"Y\")\n",
1348 | " aipw.exposure_model(\"W1 + W2 + W3 + W4\", bound=0.01,\n",
1349 | " print_results=False)\n",
1350 | " aipw.outcome_model(\"A + W1 + W2 + W3 + W4\", \n",
1351 | " print_results=False)\n",
1352 | " aipw.fit()\n",
1353 | " aipw_rd.append(aipw.risk_difference)\n",
1354 | " \n",
1355 | " # AIPW -- Super Learner main\n",
1356 | " aipw = zepid.causal.doublyrobust.AIPTW(data, \n",
1357 | " exposure=\"A\", \n",
1358 | " outcome=\"Y\")\n",
1359 | " aipw.exposure_model(\"W1 + W2 + W3 + W4\", bound=0.01,\n",
1360 | " custom_model=sl_main,\n",
1361 | " print_results=False)\n",
1362 | " aipw.outcome_model(\"A + W1 + W2 + W3 + W4\", \n",
1363 | " custom_model=sl_main,\n",
1364 | " print_results=False)\n",
1365 | " aipw.fit()\n",
1366 | " aipw_slm_rd.append(aipw.risk_difference)\n",
1367 | " \n",
1368 | " # AIPW -- Super Learner alternative\n",
1369 | " aipw = zepid.causal.doublyrobust.AIPTW(data, \n",
1370 | " exposure=\"A\", \n",
1371 | " outcome=\"Y\")\n",
1372 | " aipw.exposure_model(\"W1 + W2 + W3 + W4\", bound=0.01,\n",
1373 | " custom_model=sl_alt,\n",
1374 | " print_results=False)\n",
1375 | " aipw.outcome_model(\"A + W1 + W2 + W3 + W4\", \n",
1376 | " custom_model=sl_alt,\n",
1377 | " print_results=False)\n",
1378 | " aipw.fit()\n",
1379 | " aipw_sla_rd.append(aipw.risk_difference)\n",
1380 | " \n",
1381 | " # TMLE\n",
1382 | " tmle = zepid.causal.doublyrobust.TMLE(data, \n",
1383 | " exposure=\"A\", \n",
1384 | " outcome=\"Y\")\n",
1385 | " tmle.exposure_model(\"W1 + W2 + W3 + W4\", bound=0.01,\n",
1386 | " print_results=False)\n",
1387 | " tmle.outcome_model(\"A + W1 + W2 + W3 + W4\", \n",
1388 | " print_results=False)\n",
1389 | " tmle.fit()\n",
1390 | " tmle_rd.append(tmle.risk_difference)\n",
1391 | "\n",
1392 | " # TMLE -- Super Learner main\n",
1393 | " tmle = zepid.causal.doublyrobust.TMLE(data, \n",
1394 | " exposure=\"A\", \n",
1395 | " outcome=\"Y\")\n",
1396 | " tmle.exposure_model(\"W1 + W2 + W3 + W4\", bound=0.01,\n",
1397 | " custom_model=sl_main,\n",
1398 | " print_results=False)\n",
1399 | " tmle.outcome_model(\"A + W1 + W2 + W3 + W4\", \n",
1400 | " custom_model=sl_main,\n",
1401 | " print_results=False)\n",
1402 | " tmle.fit()\n",
1403 | " tmle_slm_rd.append(tmle.risk_difference)\n",
1404 | " \n",
1405 | " # TMLE -- Super Learner alternative\n",
1406 | " tmle = zepid.causal.doublyrobust.TMLE(data, \n",
1407 | " exposure=\"A\", \n",
1408 | " outcome=\"Y\")\n",
1409 | " tmle.exposure_model(\"W1 + W2 + W3 + W4\", bound=0.01,\n",
1410 | " custom_model=sl_alt,\n",
1411 | " print_results=False)\n",
1412 | " tmle.outcome_model(\"A + W1 + W2 + W3 + W4\", \n",
1413 | " custom_model=sl_alt,\n",
1414 | " print_results=False)\n",
1415 | " tmle.fit()\n",
1416 | " tmle_sla_rd.append(tmle.risk_difference)\n",
1417 | " \n"
1418 | ]
1419 | },
1420 | {
1421 | "cell_type": "code",
1422 | "execution_count": 32,
1423 | "metadata": {},
1424 | "outputs": [
1425 | {
1426 | "name": "stdout",
1427 | "output_type": "stream",
1428 | "text": [
1429 | "=============================\n",
1430 | "Naive\n",
1431 | "-----------------------------\n",
1432 | "Abs. Bias: 0.048\n",
1433 | "Rel. Bias: 26.7\n",
1434 | "=============================\n",
1435 | "=============================\n",
1436 | "G-formula\n",
1437 | "-----------------------------\n",
1438 | "Abs. Bias: 0.002\n",
1439 | "Rel. Bias: 0.8\n",
1440 | "=============================\n",
1441 | "=============================\n",
1442 | "IPTW\n",
1443 | "-----------------------------\n",
1444 | "Abs. Bias: 0.106\n",
1445 | "Rel. Bias: 58.5\n",
1446 | "=============================\n",
1447 | "=============================\n",
1448 | "AIPW\n",
1449 | "-----------------------------\n",
1450 | "Abs. Bias: 0.003\n",
1451 | "Rel. Bias: 1.9\n",
1452 | "=============================\n",
1453 | "=============================\n",
1454 | "AIPW-SL1\n",
1455 | "-----------------------------\n",
1456 | "Abs. Bias: 0.003\n",
1457 | "Rel. Bias: 1.5\n",
1458 | "=============================\n",
1459 | "=============================\n",
1460 | "AIPW-SL2\n",
1461 | "-----------------------------\n",
1462 | "Abs. Bias: 0.003\n",
1463 | "Rel. Bias: 1.7\n",
1464 | "=============================\n",
1465 | "=============================\n",
1466 | "TMLE\n",
1467 | "-----------------------------\n",
1468 | "Abs. Bias: 0.009\n",
1469 | "Rel. Bias: 4.8\n",
1470 | "=============================\n",
1471 | "=============================\n",
1472 | "TMLE-SL1\n",
1473 | "-----------------------------\n",
1474 | "Abs. Bias: 0.006\n",
1475 | "Rel. Bias: 3.5\n",
1476 | "=============================\n",
1477 | "=============================\n",
1478 | "TMLE-SL2\n",
1479 | "-----------------------------\n",
1480 | "Abs. Bias: 0.008\n",
1481 | "Rel. Bias: 4.5\n",
1482 | "=============================\n"
1483 | ]
1484 | }
1485 | ],
1486 | "source": [
1487 | "# Results\n",
1488 | "result = [naive_rd, gform_rd, iptw_rd, aipw_rd, aipw_slm_rd, \n",
1489 | " aipw_sla_rd, tmle_rd, tmle_slm_rd, tmle_sla_rd]\n",
1490 | "labels = [\"Naive\", \"G-formula\", \"IPTW\", \"AIPW\", \"AIPW-SL1\", \n",
1491 | " \"AIPW-SL2\", \"TMLE\", \"TMLE-SL1\", \"TMLE-SL2\"]\n",
1492 | "for x, y in zip(result, labels):\n",
1493 | " print(\"=============================\")\n",
1494 | " print(y)\n",
1495 | " print(\"-----------------------------\")\n",
1496 | " print(\"Abs. Bias:\", np.round(np.mean(x - true_ate), 3))\n",
1497 | " rel_bias = np.abs(np.mean((x - true_ate) / true_ate)*100)\n",
1498 | " print(\"Rel. Bias:\", np.round(rel_bias, 1))\n",
1499 | " print(\"=============================\")\n"
1500 | ]
1501 | },
1502 | {
1503 | "cell_type": "markdown",
1504 | "metadata": {},
1505 | "source": [
1506 | "END"
1507 | ]
1508 | }
1509 | ],
1510 | "metadata": {
1511 | "kernelspec": {
1512 | "display_name": "Python 3",
1513 | "language": "python",
1514 | "name": "python3"
1515 | },
1516 | "language_info": {
1517 | "codemirror_mode": {
1518 | "name": "ipython",
1519 | "version": 3
1520 | },
1521 | "file_extension": ".py",
1522 | "mimetype": "text/x-python",
1523 | "name": "python",
1524 | "nbconvert_exporter": "python",
1525 | "pygments_lexer": "ipython3",
1526 | "version": "3.6.5"
1527 | }
1528 | },
1529 | "nbformat": 4,
1530 | "nbformat_minor": 4
1531 | }
1532 |
--------------------------------------------------------------------------------
/RCodeBoxes.R:
--------------------------------------------------------------------------------
1 | ###################################################################################################
2 |
3 | # Tutorial: causal inference methods made easy for applied researchers/epidemiologists/statisticians
4 |
5 | # ICON-LSHTM, LONDON, 16th October 2020
6 |
7 | # Miguel Angel Luque Fernandez, PhD
8 | # Assistant Professor of Epidemiology and Biostatistics
9 | # Matthew Smith, PhD
10 | # Research Fellow
11 |
12 | # Inequalities in Cancer Outcomes Network, LSHTM, London, UK
13 |
14 | # Copyright (c) 2020 Permission is hereby granted, free of charge, to any person obtaining a copy
15 | # of this software and associated documentation files (the "Software"), to deal in the Software
16 | # without restriction, including without limitation the rights to use, copy, modify, merge,
17 | # publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to
18 | # whom the Software is furnished to do so, subject to the following conditions: The above
19 | # copyright notice and this permission notice shall be included in all copies or substantial
20 | # portions of the Software.
21 |
22 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
23 | # BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON
24 | # INFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
25 | # OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
26 | # IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 |
28 | # Bug reports: miguel-angel.luque at lshtm.ac.uk
29 |
30 | # The rhc dataset can be downloaded at http://biostat.mc.vanderbilt.edu/wiki/Main/DataSets
31 |
32 | ###################################################################################################
33 |
34 |
35 | # Preliminaries
36 | rm(list=ls())
37 |
38 | ### Box 1: Setting the data
39 | setwd("your path")
40 | #setwd("~/Dropbox/ESTIMATORSCIproject/R_Stata_master_files/Data")
41 | library(haven)
42 | #data <- read_dta("~/Dropbox/ESTIMATORSCIproject/R_Stata_master_files/Data/rhc.dta")
43 | data <- read_dta("rhc.dta")
44 | # Define the outcome (Y), exposure (A), main confounder (C), and additional confounders (w1-w4)
45 | data$Y <- data$death_d30; data$Y <- as.numeric(data$Y); Y <- data$Y
46 | data$A <- data$rhc; data$A <- as.numeric(data$A); A <- data$A
47 | data$C <- data$gender; data$C <- as.numeric(data$C); C <- data$C
48 | data$w1 <- data$age; data$w1 <- as.numeric(data$w1); w1 <- data$w1
49 | data$w2 <- data$edu; data$w2 <- as.numeric(data$w2); w2 <- data$w2
50 | data$w3 <- data$race; data$w3 <- as.numeric(data$w3); w3 <- data$w3
51 | data$w4 <- data$carcinoma; data$w4 <- as.numeric(data$w4); w4 <- data$w4
52 | data2 <- as.data.frame(Y); data2$A <- A; data2$C <- C; data2$w1 <- w1; data2$w2 <- w2; data2$w3 <- w3; data2$w4 <- w4
53 |
54 |
55 | ### Box 2: Naive estimate of the ATE
56 | naive <- lm(Y ~ A + C, data=data); naive # Naive estimate of the ATE is 0.07352
57 |
58 |
59 | # 3. G-Formula
60 |
61 | ## 3.1 Non-parametric G-formula
62 |
63 | ### Box 3: Non-parametric G-formula for the ATE
64 | mean(data$A[data$C==1], na.rm=TRUE) #
65 | mean(data$A[data$C==0], na.rm=TRUE) #
66 | mean(data$Y[data$A==1], na.rm=TRUE) - mean(data$Y[data$A==0],na.rm=TRUE) # Unadjusted Estimate
67 | reg <- lm(Y ~ A, data=data); reg # Unadjusted Estimate Regression
68 | pr.l <- prop.table(table(data$C)); pr.l # Marginal probability of C
69 | tab.out <- aggregate(Y ~ A + C, data, mean); tab.out # Table of mean Y within each A-by-C stratum
70 | ATE <- ((mean(data$Y[data$A==1 & data$C==1]) - mean(data$Y[data$A==0 & data$C==1]))*pr.l[2]) +
71 | (mean(data$Y[data$A==1 & data$C==0]) - mean(data$Y[data$A==0 & data$C==0]))*pr.l[1] # G-formula Non-parametric ATE
72 | ATE; rm(ATE) # The ATE from the non-parametric estimator is 0.073692
73 |
74 | ### Box 4: Non-parametric G-formula for the ATT
75 | ATTm <- mean(data$C[data$A==1], na.rm=TRUE) # Proportion of those who are male amongst treated
76 | ATTf <- 1-mean(data$C[data$A==1], na.rm=TRUE) # Proportion of those who are female amongst treated
77 | ATT <-((mean(data$Y[data$A==1 & data$C==1]) - mean(data$Y[data$A==0 & data$C==1]))*ATTm) +
78 | (mean(data$Y[data$A==1 & data$C==0]) - mean(data$Y[data$A==0 & data$C==0]))*ATTf # G-formula Non-parametric ATT
79 | ATT # The ATT from the non-parametric estimator is 0.073248
80 | rm(ATT)
81 |
82 | ### Box 5: Bootstrap the 95% confidence intervals (CI) for the ATE/ATT estimated using the non-parametric G-Formula
83 | # ATE
84 | library(boot)
85 | g.comp = function(data,indices) # Bootstrap statistic: non-parametric g-formula estimate of the ATE
86 | {
87 | resample <- data[indices,] # rows selected for this bootstrap replicate
88 | p.c <- prop.table(table(resample$C)) # marginal distribution of C within the resample
89 | rd.c1 <- mean(resample$Y[resample$A==1 & resample$C==1]) - mean(resample$Y[resample$A==0 & resample$C==1]) # risk difference in stratum C==1
90 | rd.c0 <- mean(resample$Y[resample$A==1 & resample$C==0]) - mean(resample$Y[resample$A==0 & resample$C==0]) # risk difference in stratum C==0
91 | ATE = (rd.c1*p.c[2]) + rd.c0*p.c[1] ; ATE # stratum risk differences weighted by the marginal distribution of C
92 | }
93 | g.comp(data,indices=1:nrow(data)) # Can get original estimate, by plugging in indices 1:n
94 | boot.out=boot(data,g.comp,200) # Draw 200 bootstrap sample estimates
95 | boot.ci(boot.out,type="perc",conf=0.95) # compute confidence intervals using percentile method
96 | boot.ci(boot.out,type="norm",conf=0.95)
97 |
98 | # ATT
99 | g.comp = function(data,indices) # Bootstrap statistic: non-parametric g-formula estimate of the ATT
100 | {
101 | resample <- data[indices,] # rows selected for this bootstrap replicate
102 | treated.C <- resample$C[resample$A==1] # values of C among the treated (A==1)
103 | w.c1 <- mean(treated.C, na.rm=TRUE) # mean of C among the treated (the share with C==1 when C is coded 0/1)
104 | w.c0 <- 1-mean(treated.C, na.rm=TRUE) # complement: weight given to stratum C==0
105 | rd.c1 <- mean(resample$Y[resample$A==1 & resample$C==1]) - mean(resample$Y[resample$A==0 & resample$C==1]) # risk difference in stratum C==1
106 | rd.c0 <- mean(resample$Y[resample$A==1 & resample$C==0]) - mean(resample$Y[resample$A==0 & resample$C==0]) # risk difference in stratum C==0
107 | (rd.c1*w.c1) + rd.c0*w.c0 # stratum risk differences weighted by the distribution of C among the treated
108 | }
109 | g.comp(data,indices=1:nrow(data)) # Can get original estimate, by plugging in indices 1:n
110 | boot.out=boot(data,g.comp,200) # Draw 200 bootstrap sample estimates
111 | boot.ci(boot.out,type="perc",conf=0.95) # compute confidence intervals using percentile method
112 | boot.ci(boot.out,type="norm",conf=0.95)
113 |
114 |
115 | ### Box 6: Non-parametric G-Formula using a fully saturated regression model in Stata (A)
116 | # Method 1: conditional probabilities
117 | data$A1 <- ifelse(data$A == 1, 1, 0)
118 | data$A0 <- ifelse(data$A == 0, 1, 0)
119 | data$C1 <- ifelse(data$C == 1, 1, 0)
120 | data$C0 <- ifelse(data$C == 0, 1, 0)
121 | reg <- glm(Y ~ -1 + (A1 + A0) + A1:(C1) + A0:(C1), data=data); summary(reg)
122 | ATE <- mean((reg$coefficients[1] + reg$coefficients[3]*C) - (reg$coefficients[2] + reg$coefficients[4]*C)); ATE
123 | rm(ATE)
124 |
125 | ### Box 7: Non-parametric G-Formula using a fully saturated regression model in Stata (B)
126 | # Method 2: Marginal probabilities
127 | install.packages("margins")
128 | library(margins)
129 | reg <- glm(Y ~ -1 + (A1 + A0) + A1:(C1) + A0:(C1), data=data); summary(reg)
130 | Y1 <- margins(reg, variables="A1"); Y1
131 | Y0 <- margins(reg, variables="A0"); Y0
132 | ATE <- Y1$fitted[A==1]-Y0$fitted[A==0]; mean(ATE)
133 | rm(ATE)
134 |
135 | ## 3.2 Parametric G-formula
136 | ### Box 8: Parametric G-formula by hand
137 | mod1 <- glm(Y ~ C, family="binomial", data=data[data$A==1,]) # Expected probability amongst those with RHC
138 | mod0 <- glm(Y ~ C, family="binomial", data=data[data$A==0,]) # Expected probability amongst those without RHC
139 | GcompRA <- cbind(Y1 = predict(mod1, newdata=data.frame(A = 1, C), type="response"),
140 | Y0 = predict(mod0, newdata=data.frame(A = 0, C), type="response"))
141 | GcompRA <- as.data.frame(GcompRA)
142 | Y.1 <- GcompRA$Y1
143 | Y.0 <- GcompRA$Y0
144 | ATE <- mean((Y.1) - (Y.0), na.rm=TRUE); ATE # Difference between expected probabilities (ATE)
145 | rm(ATE)
146 |
147 |
148 | ### Box 9: Parametric regression adjustment (one confounder) using stdReg R-package
149 | install.packages("stdReg")
150 | library(stdReg)
151 | reg <- glm(Y ~ A + C, data = data, family = poisson(link="log")); summary(reg)
152 | reg.std <- stdGlm(fit=reg, data = data, X = "A", x=seq(0,1))
153 | print(summary(reg.std, contrast = "difference", reference=0))
154 | plot(reg.std)
155 |
156 | ### Box 10: Bootstrap for the parametric regression adjustment (one confounder)
157 | library(boot) # Load the boot (bootstrap) package
158 | attach(data)
159 | g.comp=function(data,indices) # Bootstrap statistic: parametric g-formula ATE (risk difference) adjusting for C
160 | {
161 | dat=data[indices,] # rows of `data` selected for this bootstrap replicate
162 | glm1 <- glm(Y ~ C, family="binomial", data=dat[dat$A==1,]) # outcome model fitted among the treated (A==1) in the resample
163 | glm2 <- glm(Y ~ C, family="binomial", data=dat[dat$A==0,]) # outcome model fitted among the untreated (A==0) in the resample
164 | Y.1 = predict(glm1, newdata=data.frame(A = 1, C = dat$C), type="response") # predict over the resampled C, not the global/attached full-sample C
165 | Y.0 = predict(glm2, newdata=data.frame(A = 0, C = dat$C), type="response") # same resampled rows, untreated model
166 | ATE <- mean(Y.1 - Y.0); ATE # mean difference of predicted risks; this value is what the function returns
167 | }
168 | g.comp(data,indices=1:nrow(data)) # Can get original estimate, by plugging in indices 1:n
169 | boot.out=boot(data,g.comp,200) # Draw 200 bootstrap sample estimates of RD
170 | boot.ci(boot.out,type="norm",conf=0.95) # Bootstrapped 95% CI based on normal approximation
171 | boot.ci(boot.out,type="perc",conf=0.95) # Bootstrapped 95% CI based on percentiles of the bootstrap replicates
172 |
173 | # Now with more than one confounder
174 |
175 | ### Box 11: Parametric multivariate regression adjustment implementation of the G-Formula
176 | mod1 <- glm(Y ~ C + w1 + w2 + w3 + w4, family="binomial", data=data[data$A==1,]) # Expected probability amongst those with RHC
177 | mod0 <- glm(Y ~ C + w1 + w2 + w3 + w4, family="binomial", data=data[data$A==0,]) # Expected probability amongst those without RHC
178 | GcompRA <- cbind(Y1 = predict(mod1, newdata=data.frame(A = 1, C, w1, w2, w3, w4), type="response"),
179 | Y0 = predict(mod0, newdata=data.frame(A = 0, C, w1, w2, w3, w4), type="response"))
180 | GcompRA <- as.data.frame(GcompRA)
181 | Y.1 <- GcompRA$Y1
182 | Y.0 <- GcompRA$Y0
183 | ATE <- mean((Y.1) - (Y.0), na.rm=TRUE); ATE # ATE
184 | rm(ATE)
185 |
186 |
187 | ### Box 12: Parametric multivariate regression adjustment using "stdReg" R-package
188 | install.packages("stdReg")
189 | library(stdReg)
190 | reg <- glm(Y ~ A + C + w1 + w2 + w3 + w4, data = data, family = poisson(link="log")); summary(reg)
191 | reg.std <- stdGlm(fit=reg, data=data, X="A", x=seq(0,1))
192 | print(summary(reg.std, contrast="difference", reference=0))
193 | plot(reg.std)
194 |
195 |
196 | ### Box 13: Parametric multivariate regression adjustment using "margins" R-package
197 | reg1 <- glm(Y ~ -1 + (A1 + A0) + A1:(C1 + w1 + w2 + w3 + w4) + A0:(C0 + w1 + w2 + w3 + w4) , data=data); summary(reg1) # no intercept, no family argument; A1, A0, C1, C0 are not defined in this section - presumably created earlier, TODO confirm
198 | poY1m <- margins(reg1, variables="A1"); poY1m # margins() is not loaded in this section - presumably library(margins) was called earlier, TODO confirm
199 | poY0m <- margins(reg1, variables="A0"); poY0m
200 | ATE2 <- poY1m$fitted[A==1] - poY0m$fitted[A==0]; mean(ATE2) # NOTE(review): the two subsets have different lengths unless the groups are equal in size, so R recycles - confirm this is intended
201 |
202 | ### Box 14 Bootstrap for the multivariate parametric regression adjustment
203 | library(boot) # Install the Bootstrap package
204 | attach(data)
205 | g.comp=function(data,indices) # boot() statistic: ATE by parametric g-computation on the rows of `data` selected by `indices`; returns one number
206 | {
207 | dat=data[indices,] # the bootstrap resample
208 | glm1 <- glm(Y ~ C + w1 + w2 + w3 + w4, family="binomial", data=dat[dat$A==1,]) # outcome model fitted on the resampled rows with A == 1
209 | glm2 <- glm(Y ~ C + w1 + w2 + w3 + w4, family="binomial", data=dat[dat$A==0,]) # outcome model fitted on the resampled rows with A == 0
210 | Y.1 = predict(glm1, newdata=dat, type="response") # predict over the resample itself, not over the attached full-sample covariates
211 | Y.0 = predict(glm2, newdata=dat, type="response") # same rows, A == 0 model
212 | mean(Y.1 - Y.0) # risk difference averaged over the resampled covariate distribution
213 | }
214 | g.comp(data,indices=1:nrow(data)) # Point estimate on the original sample: indices 1:n select every row once
215 | boot.out=boot(data,g.comp,200) # Draw 200 bootstrap replicates of the statistic returned by g.comp
216 | boot.ci(boot.out,type="norm",conf=0.95) # Bootstrapped 95% CI based on normal approximation
217 | boot.ci(boot.out,type="perc",conf=0.95) # Bootstrapped 95% CI based on percentiles of the bootstrap replicates
218 |
219 |
220 | ### Box 15 Computing the parametric marginal risk ratio after regression adjustment
221 | reg <- glm(Y ~ A + C + w1 + w2 + w3 + w4, data=data2, family = binomial(link="logit")); summary(reg) # logistic outcome model fitted on data2
222 | reg.std <- stdGlm(fit=reg, data=data2, X="A", x=seq(0,1)) # regression standardisation at A = 0 and A = 1
223 | print(summary(reg.std, contrast="ratio", reference=0)) # ratio contrast vs A = 0; authors report 27% (95% CI 1.18-1.37) increase in relative risk
224 | plot(reg.std) # plot the standardised estimates
225 |
226 | # 4. Inverse Probability of Treatment Weighting
227 |
228 | ## 4.1 Inverse probability of treatment weighting based on the propensity score plus regression adjustment
229 |
230 | # Box 16 (IPTW by hand)
231 | p.s <- glm(A ~ as.factor(C) + w1 + w2 + w3 + w4, data=data, family=binomial) # Propensity score model for the exposure
232 | p.score <- ifelse(data$A == 0, 1 - predict(p.s, type = "response"), predict(p.s, type = "response")) # Fitted probability of the exposure level each subject actually has
233 | #table(p.score) # Table of Propensity Scores
234 | data$w <- 1/p.score # Generate IP Weights
235 | data2$w <- 1/p.score # same weights copied to data2 - assumes data and data2 hold the same rows in the same order, TODO confirm
236 | #table(data$w); summary(data$w); sd(data$w)
237 |
238 | ATE <- mean(data$w*as.numeric(data$A==1)*data$Y) - mean(data$w*as.numeric(data$A==0)*data$Y);ATE # Estimate ATE: difference of the two weighted means, each divided by the full sample size
239 | rm(ATE) # remove the scalar so later boxes can reuse the name
240 |
241 |
242 | # Box 17 Bootstrap computation for the IPTW estimator
243 | library(boot)
244 | iptw.w = function(data,indices) # boot() statistic: weighted-mean difference in Y (A == 1 minus A == 0) on the selected rows, using the stored column w
245 | {
246 | s <- data[indices,]; wt <- s$w; y <- s$Y # resampled rows, their weights and outcomes
247 | mean(wt*as.numeric(s$A==1)*y) - mean(wt*as.numeric(s$A==0)*y)
248 | }
249 | iptw.w(data,indices=1:nrow(data)) # Point estimate on the original sample: indices 1:n select every row once
250 | boot.out=boot(data,iptw.w,100) # Draw 100 bootstrap replicates; the weights in data$w are resampled as stored, the propensity model is not refitted
251 | boot.ci(boot.out,type="perc",conf=0.95) # compute confidence intervals using percentile method
252 | boot.ci(boot.out,type="norm",conf=0.95) # confidence interval based on the normal approximation
253 |
254 |
255 | ### Box 18: Computation of the IPTW estimator for the ATE using IPW R-package
256 | install.packages(c("ipw", "survey")) # package names must be one character vector; a second positional argument is `lib` (the install directory)
257 | library(ipw)
258 | library(survey)
259 |
260 | # Univariable
261 | ipw.ATE <- ipwpoint(exposure = A, family = "binomial", link = "logit", # point-treatment weights from logistic exposure models
262 | numerator = ~ 1, # NOTE(review): a numerator model is supplied, while the plot title and `usw` say "unstabilized" - confirm against the ipwpoint documentation
263 | denominator = ~ C, # denominator model conditions on C only
264 | data = data2)
265 | summary(ipw.ATE$ipw.weights) # distribution of the estimated weights
266 | ipwplot(weights = ipw.ATE$ipw.weights, logscale = FALSE, main = "Unstabilized weights", xlim = c(0.5, 2))
267 | summary(ipw.ATE$num.mod) # fitted numerator model
268 | summary(ipw.ATE$den.mod) # fitted denominator model
269 | data2$usw <- ipw.ATE$ipw.weights # store the weights for the survey design below
270 | msm <- (svyglm(Y ~ A, design = svydesign(~ 1, weights = ~ usw, data = data2))) # weighted regression of Y on A (no family argument given)
271 | coef(msm); confint(msm) # coefficients and their confidence intervals
272 |
273 | # Multivariable
274 | ipw.ATE <- ipwpoint(exposure = A, family = "binomial", link = "logit", # point-treatment weights from logistic exposure models
275 | numerator = ~ 1, # NOTE(review): a numerator model is supplied, while the plot title and `usw` say "unstabilized" - confirm against the ipwpoint documentation
276 | denominator = ~ C + w1 + w2 + w3 + w4, # denominator model conditions on all five covariates
277 | data = data2)
278 | summary(ipw.ATE$ipw.weights) # distribution of the estimated weights
279 | ipwplot(weights = ipw.ATE$ipw.weights, logscale = FALSE, main = "Unstabilized weights", xlim = c(0.5, 2))
280 | summary(ipw.ATE$num.mod) # fitted numerator model
281 | summary(ipw.ATE$den.mod) # fitted denominator model
282 | data2$usw <- ipw.ATE$ipw.weights # overwrites the univariable weights stored earlier in data2$usw
283 | msm <- (svyglm(Y ~ A, design = svydesign(~ 1, weights = ~ usw, data = data2))) # weighted regression of Y on A (no family argument given)
284 | coef(msm); confint(msm) # coefficients and their confidence intervals
285 |
286 |
287 | ### Box 19: Assessing IPTW balance
288 | install.packages("twang")
289 | library(twang)
290 | ps.balance <- ps(A ~ C + w1 + w2 + w3 + w4, data = data2, # propensity score model fitted with twang::ps on data2
291 | n.trees=1000, interaction.depth=2, shrinkage=0.01, perm.test.iters=0,
292 | stop.method=c("es.mean","ks.max"), estimand = "ATE", verbose=FALSE) # two stopping rules requested; estimand is the ATE
293 | plot(ps.balance) # default diagnostic plot of the ps object
294 | summary(ps.balance$gbm.obj, n.trees=ps.balance$desc$ks.max.ATE$n.trees, plot=FALSE) # summary of the underlying gbm fit at the tree count stored for ks.max
295 | data2.balance <- bal.table(ps.balance); data2.balance # balance tables; reused in Box 20
296 |
297 |
298 | ### Box 20: Assessing IPTW overlap by hand
299 | install.packages("xtable")
300 | library(xtable)
301 | pretty.tab <- data2.balance$ks.max.ATE[,c("tx.mn","ct.mn","ks")] # three columns of the ks.max balance table
302 | pretty.tab <- cbind(pretty.tab, data2.balance$unw[,"ct.mn"]) # append ct.mn from the unweighted table
303 | names(pretty.tab) <- c("E(Y1|t=1)","E(Y0|t=1)","KS","E(Y0|t=0)") # display labels for the four columns, in the order built above
304 | xtable(pretty.tab, caption = "Balance of the treatment and comparison groups",
305 | label = "tab:balance", digits = c(0, 2, 2, 2, 2), align=c("l","r","r","r","r")) # LaTeX table: 2 decimals, right-aligned numeric columns
306 | plot(ps.balance, plots = 6) # plot type 6 of the ps object
307 |
308 |
309 | ### Box 21: Assessing overlap using plots
310 | # Fit a propensity score model
311 | m_PS<-glm(A ~ C + w1 + w2 + w3 + w4, data = data2, family=binomial(link="logit")) # logistic propensity score model fitted on data2
312 | summary(m_PS)
313 |
314 | # Estimate the propensity score
315 | data$PS<-fitted.values(m_PS) # NOTE(review): model is fitted on data2 but the scores are stored in data - assumes identical rows and order, confirm
316 |
317 | # Histogram of the PS
318 | hist(data$PS[data$rhc==0]) # NOTE(review): groups are split on data$rhc here while the models use A - confirm data still has an rhc column
319 | hist(data$PS[data$rhc==1])
320 | plot(density(data$PS[data$rhc==0]),col="red",lwd=2, xlab="PS") # density of the PS where rhc == 0
321 | lines(density(data$PS[data$rhc==1]),col="blue",lwd=2) # overlay the density where rhc == 1
322 | legend("topright", legend=c("No RHC", "RHC"), pch="--", col=c("red","blue"), bty="n", lwd=2)
323 |
324 | # Look at minimum and maximum PS in each exposure group
325 | min(data$PS[data$rhc==0])
326 | min(data$PS[data$rhc==1])
327 | max(data$PS[data$rhc==0])
328 | max(data$PS[data$rhc==1])
329 |
330 | # Investigate overlap (i.e. positivity)
331 | data$overlap <- ifelse(data$PS>=min(data$PS[data$rhc==1]) & data$PS<=max(data$PS[data$rhc==0]),1,0); table(data$overlap,data$rhc) # 1 when the PS lies between the smallest PS with rhc == 1 and the largest PS with rhc == 0; cross-tabulated by rhc
332 |
333 | ## 4.2 Marginal structural model with stabilised weights
334 | ### Box 22: Computation of the IPTW estimator for the ATE using a MSM
335 |
336 | # Unstabilized weights
337 | msm <- lm(Y ~ A + C + w1 + w2 + w3 + w4, data = data, weights = data$w) # MSM: weighted linear model using the Box 16 weights; covariates are also in the model
338 | library(sandwich)
339 | SE <-sqrt(diag(vcovHC(msm, type="HC0"))) # robust standard errors
340 | beta <- coef(msm)
341 | lcl <- beta-1.96*SE # lower 95% limit (normal approximation)
342 | ucl <- beta+1.96*SE # upper 95% limit
343 | cbind(beta, lcl, ucl)[2,] # second coefficient = A (first term after the intercept)
344 |
345 | # Stabilized weights
346 | denom.fit <- glm(A ~ as.factor(C) + w1 + w2 + w3 + w4,
347 | family = binomial(), data = data) # denominator model: exposure given the covariates
348 | denom.p <- predict(denom.fit, type = "response") # denominator of the stabilized weights: fitted P(A = 1 | covariates)
349 |
350 | numer.fit <- glm(A ~ 1, family = binomial(), data = data) # numerator model: intercept only
351 | summary(numer.fit)
352 | numer.p <- predict(numer.fit, type = "response") # estimation of numerator of ip weights
353 |
354 | data$sw <- ifelse(data$A == 0, ((1-numer.p)/(1-denom.p)), (numer.p/denom.p)) # stabilized weight: numerator probability over denominator probability, for the exposure level actually observed
355 |
356 | msm <- lm(Y ~ A, data = data, weights = sw) # MSM: weighted regression of Y on A only
357 |
358 | SE <-sqrt(diag(vcovHC(msm, type="HC0"))) # robust standard errors
359 | beta <- coef(msm)
360 | lcl <- beta-1.96*SE # lower 95% limit (normal approximation)
361 | ucl <- beta+1.96*SE # upper 95% limit
362 | cbind(beta, lcl, ucl)[2,] # second coefficient = A
363 |
364 | ## 4.3 IPTW with regression adjustment
365 | ### Box 23: Computation of the IPTW-RA estimator for the ATE and bootstrap for statistical inference
366 | glm1 <- glm(Y ~ C + w1 + w2 + w3 + w4, weights = data$w[data$A==1], data=data[data$A==1,]) # weighted outcome model among A == 1; no family argument is given
367 | Y.1 = predict(glm1, newdata=data.frame(A = 1, C, w1, w2, w3, w4), type="response") # bare C, w1..w4 are looked up outside `data` (search path / workspace)
368 | glm2 <- glm(Y ~ C + w1 + w2 + w3 + w4, weights = data$w[data$A==0], data=data[data$A==0,]) # weighted outcome model among A == 0
369 | Y.0 = predict(glm2, newdata=data.frame(A = 0, C, w1, w2, w3, w4), type="response")
370 | ATE <- mean(Y.1 - Y.0); ATE # mean difference of the two sets of predictions
371 | ATE2 <- mean(data$w*as.numeric(data$A==1)*Y.1)/mean( data$w*as.numeric(data$A==1)) - mean(data$w*as.numeric(data$A==0)*Y.0)/mean(data$w*as.numeric(data$A==0));ATE2 # alternative: weight-normalised means of the predictions within each exposure group
372 | rm(ATE, ATE2) # remove the scalars so later boxes can reuse the names
373 |
374 | ### Box 24: Computation of the IPTW-RA estimator for the ATE using the ipw R-package
375 | library(ipw)
376 | ipw.ATE <- ipwpoint(exposure = A, family = "binomial", link = "logit", # point-treatment weights from logistic exposure models
377 | numerator = ~ C, # numerator model conditions on C
378 | denominator = ~ C + w1 + w2 + w3 + w4, # denominator model conditions on all five covariates
379 | data = data2)
380 | summary(ipw.ATE$ipw.weights) # distribution of the estimated weights
381 | ipwplot(weights = ipw.ATE$ipw.weights, logscale = FALSE, main = "Stabilized weights", xlim = c(0.5, 2))
382 | summary(ipw.ATE$num.mod) # fitted numerator model
383 | summary(ipw.ATE$den.mod) # fitted denominator model
384 |
385 | data2$sw <- ipw.ATE$ipw.weights # store the weights for the survey design below
386 | msm <- (svyglm(Y ~ A, design = svydesign(~ 1, weights = ~ sw, data = data2))) # weighted regression of Y on A only, although the numerator conditions on C
387 | coef(msm); confint(msm) # coefficients and their confidence intervals
388 |
389 |
390 | # 5. Augmented inverse probability weighting
391 |
392 | ### Box 25: Computation of the AIPTW estimator for the ATE and bootstrap for statistical inference
393 | mod <- glm(Y ~ A + C + w1 + w2 + w3 + w4, family="binomial", data=data) # logistic outcome model with exposure and covariates
394 | PO <- cbind(Yhat = predict(mod), # no type given: linear predictor (logit scale), converted with plogis() further down
395 | Y1 = predict(mod, newdata=data.frame(A = 1, C, w1, w2, w3, w4), type="response"), # predicted probability with A set to 1
396 | Y0 = predict(mod, newdata=data.frame(A = 0, C, w1, w2, w3, w4), type="response")) # predicted probability with A set to 0
397 | RA <- as.data.frame(PO) # Potential Outcomes
398 | Yhat <- RA$Yhat
399 | Y.1a <- RA$Y1
400 | Y.0a <- RA$Y0
401 |
402 | g <- glm(A ~ C + w1 + w2 + w3 + w4, family = binomial(), data = data) # propensity score model
403 | gw <- predict(g, type = "response") # fitted P(A = 1 | covariates)
404 | gws <- ifelse(data$A == 0, (-(1 - data$A)/(1 - gw)),(data$A/gw)); sum(gws) # signed inverse-probability weights: 1/gw if A == 1, -1/(1 - gw) if A == 0
405 | AIPTW <- mean(gws*(data$Y - plogis(RA$Yhat)) + ((Y.1a) - (Y.0a))); AIPTW # ATE: weighted residual correction plus the outcome-model contrast
406 | RR <- mean(data$A*(data$Y - Y.1a)/gw + Y.1a) / mean((1 - data$A)*(data$Y - Y.0a)/(1 - gw) + Y.0a); RR # marginal RR: ratio of the two AIPTW means (was the mean of individual ratios)
407 |
408 | IC <- (gws*(data$Y - plogis(RA$Yhat)) + ((Y.1a) - (Y.0a)))-AIPTW # Estimate the influence function (functional Delta method): per-subject contribution minus the estimate
409 | n <- nrow(data)
410 | varHat.IC <- var(IC)/n; varHat.IC # variance of the estimator = sample variance of the influence function over n
411 | lci <- AIPTW-1.96*sqrt(varHat.IC) # lower 95% limit (normal approximation)
412 | uci <- AIPTW+1.96*sqrt(varHat.IC) # upper 95% limit
413 | cat(AIPTW,lci,uci) # Inference Influence function: print estimate and interval
414 |
415 | AIPTW.b = function(data,indices) # boot() statistic: AIPTW estimate of the ATE on the rows of `data` selected by `indices`; returns one number
416 | {
417 | dat=data[indices,] # the bootstrap resample; every step below must use it, otherwise all replicates are identical
418 | mod <- glm(Y ~ A + C + w1 + w2 + w3 + w4, family="binomial", data=dat) # outcome model refitted on the resample
419 | Yhat = predict(mod) # linear predictor (logit scale) at the observed exposure
420 | Y1 = predict(mod, newdata=transform(dat, A = 1)) # logit-scale prediction with A set to 1
421 | Y0 = predict(mod, newdata=transform(dat, A = 0)) # logit-scale prediction with A set to 0
422 | g <- glm(A ~ C + w1 + w2 + w3 + w4, family="binomial", data = dat) # propensity score model refitted on the resample
423 | gw <- predict(g,type="response") # fitted P(A = 1 | covariates)
424 | gws <- ifelse(dat$A == 0, (-(1 - dat$A)/(1 - gw)),(dat$A/gw)) # signed inverse-probability weights from the resampled exposure
425 | mean(gws*(dat$Y - plogis(Yhat)) + (plogis(Y1) - plogis(Y0))) # weighted residual correction plus the outcome-model contrast
426 | }
427 | AIPTW.b(data,indices=1:nrow(data)) # Point estimate on the original sample: indices 1:n select every row once
428 | boot.out=boot(data,AIPTW.b,200) # Draw 200 bootstrap sample estimates
429 | boot.ci(boot.out,type="perc",conf=0.95) # compute confidence intervals using percentile method
430 | boot.ci(boot.out,type="norm",conf=0.95) # confidence interval based on the normal approximation
431 |
432 | ### Box 26: Computation of the AIPTW estimator for the ATE and marginal risk ratio
433 | w <- subset(data, select=c(C, w1, w2, w3 , w4)) # covariate data frame passed to drtmle
434 | fit1 <- drtmle(W = w, A = A, Y = Y, # input data
435 | a_0 = c(0, 1), # return estimates for A = 0 and A = 1
436 | SL_Q = "SL.npreg", # use kernel regression for E(Y | A = a, W)
437 | glm_g = "C + w1 + w2 + w3 + w4", # main-terms glm for E(A | W)
438 | SL_Qr = "SL.npreg", # use kernel regression to guard against
439 | # misspecification of outcome regression
440 | #SL_gr = "SL.npreg", # use kernel regression to guard against
441 | # misspecification of propensity score
442 | returnModels = TRUE # for visualizing fits later
443 | )
444 | ATE <- ci(fit1, contrast = c(-1,1)); ATE # est[2] - est[1], i.e. the A = 1 estimate minus the A = 0 estimate given a_0 = c(0, 1)
445 | RR <- riskRatio <- list(f = function(eff){ log(eff) }, # interval is built on the log scale
446 | f_inv = function(eff){ exp(eff) }, # and transformed back
447 | h = function(est){ est[2]/est[1] }, # ratio of the A = 1 estimate to the A = 0 estimate
448 | fh_grad = function(est){ c(-1/est[1],1/est[2]) }) # gradient of log(est[2]/est[1]); the previous signs belonged to log(est[1]/est[2])
449 | ci(fit1, contrast = riskRatio) # confidence interval for the risk ratio
450 | rm(ATE, RR) # remove the objects so later boxes can reuse the names
451 |
452 | # 6. DATA-ADAPTIVE ESTIMATION: ENSEMBLE LEARNING TARGETED MAXIMUM LIKELIHOOD ESTIMATION
453 |
454 | ### Box 27: Computational implementation of TMLE by hand
455 | # Step 1
456 | Gcomp <- glm(Y ~ A + C + w1 + w2 + w3 + w4, family="binomial", data=data2) # initial logistic outcome model
457 | # Predictions (logit scale) at the observed A, at A=1 and at A=0
458 | QAW <- predict(Gcomp)
459 | Q1W = predict(Gcomp, newdata=data.frame(A = 1, data2[,c("C", "w1","w2","w3","w4")]))
460 | Q0W = predict(Gcomp, newdata=data.frame(A = 0, data2[,c("C", "w1","w2","w3","w4")]))
461 | # Step 2 estimation of the propensity score (ps)
462 | psm <- glm(A ~ C + w1 + w2 + w3 + w4, family = binomial, data=data2)
463 | gW = predict(psm, type = "response") # fitted P(A = 1 | covariates)
464 | g1W = (1 / gW) # same value as H1W below; not referenced again in this box
465 | g0W = (-1 / (1-gW)) # same value as H0W below; not referenced again in this box
466 | # Step 3 computation of H and estimation of epsilon
467 | HAW <- (data2$A / gW -(1-data2$A) / (1 - gW)) # covariate H at the observed exposure
468 | H1W = (1/gW) # H with A set to 1
469 | H0W = (-1 / (1 - gW)) # H with A set to 0
470 | epsilon <- coef(glm(data2$Y ~ -1 + HAW + offset(QAW), family = "binomial")) # no-intercept logistic fit of Y on H with the initial prediction as offset
471 | # Step 4 ATE
472 | ATE<- mean(plogis(Q1W + epsilon * H1W) - plogis(Q0W + epsilon * H0W)); ATE # difference of the updated predictions, back on the probability scale
473 | # Step 5 Marginal RR
474 | T1.EY1 <- mean(plogis(Q1W + epsilon * H1W)) # updated mean prediction with A set to 1
475 | T1.EY0 <- mean(plogis(Q0W + epsilon * H0W)) # updated mean prediction with A set to 0
476 | RR <- (T1.EY1/T1.EY0); RR
477 | rm(ATE, RR) # remove the scalars so later boxes can reuse the names
478 |
479 | ### Box 28: TMLE with data-adaptive estimation using the R package
480 | set.seed(777) # fixed seed before the Super Learner fits
481 | library(tmle)
482 | w <- subset(data, select=c(C, w1, w2, w3 , w4)) # covariate data frame passed as W
483 | fittmle <- tmle(data$Y, data$A, W=w, family="binomial", # outcome, exposure and covariates, binary outcome family
484 | Q.SL.library = c("SL.glm","SL.glm.interaction","SL.step.interaction","SL.gam","SL.randomForest"), # learners for the outcome model
485 | g.SL.library = c("SL.glm","SL.glm.interaction","SL.step.interaction","SL.gam","SL.randomForest")) # learners for the exposure model
486 | fittmle # print the fitted object
487 |
488 | # 7. Simulation
489 | ### Box 29: Data generation for the Monte Carlo experiment
490 |
491 | rm(list=ls())
492 |
493 | # Super Learner libraries
494 | SL.library <- c("SL.glm","SL.step","SL.step.interaction","SL.glm.interaction","SL.gam") #"SL.randomForest","SL.glmnet"
495 |
496 | # Data generation A: dual misspecification for the model of the outcome and treatment
497 | set.seed(7777)
498 | generateData <- function(n){ # simulate n rows: covariates w1-w4, exposure A, both potential outcomes and the observed outcome Y
499 | w1 <- round(runif(n, min=1, max=5), digits=0) # integer-valued covariate between 1 and 5
500 | w2 <- rbinom(n, size=1, prob=0.45) # binary covariate
501 | w3 <- round(runif(n, min=0, max=1) + 0.75*w2 + 0.8*w1, digits=0) # fix: the w2/w1 terms sat inside `digits=` and only changed the rounding precision; NOTE(review): confirm the intended placement against the Stata/Python versions
502 | w4 <- round(runif(n, min=0, max=1) + 0.75*w2 + 0.2*w1, digits=0) # same fix as w3
503 | A <- rbinom(n, size=1, prob= plogis(-1 - 0.15*w4 + 1.5*w2 + 0.75*w3 + 0.25*w1 + 0.8*w2*w4)) # exposure depends on all four covariates and a w2*w4 interaction
504 | # Counterfactuals
505 | Y.1 <- rbinom(n, size=1, prob = plogis(-3 + 1 + 0.25*w4 + 0.75*w3 + 0.8*w2*w4 + 0.05*w1)) # outcome under exposure (the +1 term)
506 | Y.0 <- rbinom(n, size=1, prob = plogis(-3 + 0 + 0.25*w4 + 0.75*w3 + 0.8*w2*w4 + 0.05*w1)) # outcome under no exposure
507 | # Observed outcome
508 | Y <- Y.1*A + Y.0*(1 - A)
509 | # return data.frame
510 | data.frame(w1, w2, w3, w4, A, Y, Y.1, Y.0)
511 | }
512 |
513 | # True ATE
514 | ObsDataTrueATE <- generateData(n=5000000) # one very large simulated sample used as the reference
515 | True_ATE <- mean(ObsDataTrueATE$Y.1 - ObsDataTrueATE$Y.0);True_ATE # mean difference of the two potential outcomes
516 | True_EY.1 <- mean(ObsDataTrueATE$Y.1) # mean potential outcome under exposure
517 | True_EY.0 <- mean(ObsDataTrueATE$Y.0) # mean potential outcome under no exposure
518 | True_RR <- (True_EY.1 / True_EY.0);True_RR # ratio of the two means
519 |
520 | #Simulations
521 | library(tmle)
522 | library(SuperLearner)
523 | #install.packages("dbarts")
524 | R <- 1000
525 | #Empty vectors
526 | naive_RR <- rep(NA,R)
527 | ATEtmle1 <- rep(NA,R)
528 | RRtmle1 <- rep(NA,R)
529 | ATE_AIPTW <- rep(NA,R)
530 | RR_AIPTW <- rep(NA,R)
531 | ATEtmle2 <- rep(NA,R)
532 | RRtmle2 <- rep(NA,R)
533 | ATEtmle3 <- rep(NA,R)
534 | RRtmle3 <- rep(NA,R)
535 | for(r in 1:R){
536 | print(paste("This is simulation run number",r))
537 | CancerData <- generateData(n=1000) # fresh simulated sample of 1000 rows per run
538 | # Naive RR: exponentiated coefficient of A from a log-link Poisson regression
539 | naive_RR[r] <- exp(glm(data = CancerData, Y ~ A + w1 + w2 + w3 + w4, family = poisson(link="log"))$coef[2])
540 | # TMLE implementation by hand
541 | # Step 1
542 | gm <- glm(Y ~ A + w1 + w2 + w3 + w4, family="binomial", data=CancerData)
543 | # Predictions (logit scale) at the observed A, at A=1 and at A=0
544 | QAW <- predict(gm)
545 | Q1W = predict(gm, newdata=data.frame(A = 1, CancerData[,c("w1","w2","w3","w4")]))
546 | Q0W = predict(gm, newdata=data.frame(A = 0, CancerData[,c("w1","w2","w3","w4")]))
547 | # Step 2 estimation of the propensity score (ps)
548 | psm <- glm(A ~ w1 + w2 + w3 + w4, family = binomial, data=CancerData)
549 | gW = predict(psm, type = "response")
550 | g1W = (1 / gW) # same value as H1W below; not referenced again in the loop
551 | g0W = (-1 / (1-gW)) # same value as H0W below; not referenced again in the loop
552 | # Step 3 computation of H and estimation of epsilon
553 | HAW <- (CancerData$A / gW -(1-CancerData$A) / (1 - gW))
554 | H1W = (1/gW)
555 | H0W = (-1 / (1 - gW))
556 | epsilon <- coef(glm(CancerData$Y ~ -1 + HAW + offset(QAW), family = "binomial"))
557 | # Step 4 updated ATE
558 | ATEtmle1[r] <- mean(plogis(Q1W + epsilon * H1W) - plogis(Q0W + epsilon * H0W))
559 | # Step 5 updated marginal RR
560 | T1.EY1 <- mean(plogis(Q1W + epsilon * H1W))
561 | T1.EY0 <- mean(plogis(Q0W + epsilon * H0W))
562 | RRtmle1[r] <- (T1.EY1 / T1.EY0)
563 |
564 | # Augmented inverse probability treatment weight (AIPTW) estimator
565 | ATE_AIPTW[r] <- mean((HAW*(CancerData$Y - plogis(QAW)) + (plogis(Q1W)-plogis(Q0W))))
566 | AIPTW1 <- mean(CancerData$A * (CancerData$Y - plogis(Q1W)) / gW + plogis(Q1W) ) # AIPTW mean outcome with A set to 1
567 | AIPTW0 <- mean((1- CancerData$A) * (CancerData$Y - plogis(Q0W)) / (1-gW) + plogis(Q0W)) # AIPTW mean outcome with A set to 0
568 | RR_AIPTW[r] <- mean( AIPTW1 / AIPTW0) # ratio of the two scalars; mean() of a single value leaves it unchanged
569 |
570 | # R-package tmle (base implementation includes SL.step, SL.glm and SL.glm.interaction)
571 | ATE2 <- tmle(Y=CancerData$Y, A=CancerData$A, W=CancerData[,c("w1","w2","w3","w4")], family="binomial")
572 | ATEtmle2[r] <- ATE2$estimates$ATE$psi
573 | RRtmle2[r] <- ATE2$estimates$RR$psi
574 |
575 | # Improved Super learner
576 | ATE3 <- tmle(Y = CancerData$Y, A=CancerData$A, W=CancerData[,c("w1","w2","w3","w4")], family="binomial", Q.SL.library=SL.library, g.SL.library=SL.library)
577 | ATEtmle3[r] <- ATE3$estimates$ATE$psi
578 | RRtmle3[r] <- ATE3$estimates$RR$psi
579 | }
580 | # Mean naive
581 | mean(naive_RR)
582 | # Mean AIPTW
583 | mean(ATE_AIPTW)
584 | mean(RR_AIPTW)
585 | # Estimate of TMLE by hand
586 | mean(ATEtmle1)
587 | mean(RRtmle1)
588 | # Estimate of TMLE + SL default implementation
589 | mean(ATEtmle2)
590 | mean(RRtmle2)
591 | # Estimate of TMLE + SL2 default plus more algorithms
592 | mean(ATEtmle3)
593 | mean(RRtmle3)
594 | save.image("your path/results.RData") # forward slash: "\r" inside an R string is a carriage-return escape, not a path separator
595 |
596 | # Relative Bias ATE
597 | abs(mean((True_ATE - ATE_AIPTW) / True_ATE)*100) # absolute relative bias (%) of the AIPTW ATE, averaged over the runs
598 | abs(mean((True_ATE - ATEtmle1) / True_ATE)*100) # TMLE by hand
599 | abs(mean((True_ATE - ATEtmle2) / True_ATE)*100) # tmle package, default libraries
600 | abs(mean((True_ATE - ATEtmle3) / True_ATE)*100) # tmle package, SL.library
601 |
602 | # Relative Bias RR
603 | abs(mean((True_RR - naive_RR) / True_RR)*100) # naive Poisson regression
604 | abs(mean((True_RR - RR_AIPTW) / True_RR)*100) # AIPTW
605 | abs(mean((True_RR - RRtmle1) / True_RR)*100) # TMLE by hand
606 | abs(mean((True_RR - RRtmle2) / True_RR)*100) # tmle package, default libraries
607 | abs(mean((True_RR - RRtmle3) / True_RR)*100) # tmle package, SL.library
608 |
609 |
610 |
611 |
612 |
613 |
614 |
615 |
616 |
617 |
618 |
619 |
620 |
621 |
622 |
623 |
624 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Tutorial: Introduction to computational causal inference for applied researchers and epidemiologists
2 |
3 | ### Matthew James Smith, Camille Maringe, Bernard Rachet, Mohammad A. Mansournia, Paul Zivich, Stephen R. Cole, Miguel Angel Luque Fernandez
4 |
5 | ### This repository makes available to the scientific community the data and code used in the preprint manuscript available at
6 |
7 | [Link to the preprint article](https://arxiv.org/abs/2012.09920)
8 |
9 | ### CITE this repository:
10 |
11 | [DOI](https://zenodo.org/badge/latestdoi/272439035)
12 |
13 | ### Matthew James Smith, Camille Maringe, Bernard Rachet, Mohammad A. Mansournia, Paul Zivich, Stephen R. Cole, Miguel Angel Luque Fernandez
14 |
15 | ### ABSTRACT
16 | The purpose of many health studies is to estimate the effect of an exposure on an outcome. It is not always ethical to assign an exposure to individuals in randomised controlled trials, instead observational data and appropriate study design must be used. There are major challenges with observational studies, one of which is confounding that can lead to biased estimates of the causal effects. Controlling for confounding is commonly performed by simple adjustment for measured confounders; although, often this is not enough. Recent advances in the field of causal inference have dealt with confounding by building on classical standardisation methods. However, these recent advances have progressed quickly with a relative paucity of computational-oriented applied tutorials contributing to some confusion in the use of these methods among applied researchers. In this tutorial, we show the computational implementation of different causal inference estimators from a historical perspective where different estimators were developed to overcome the limitations of the previous one. Furthermore, we also briefly introduce the potential outcomes framework, illustrate the use of different methods using an illustration from the health care setting, and most importantly, we provide reproducible and commented code in Stata, R and Python for researchers to apply in their own observational study. The code can be accessed at
17 |
18 | [https://github.com/migariane/TutorialCausalInferenceEstimators](https://github.com/migariane/TutorialCausalInferenceEstimators)
19 |
20 | KEYWORDS: Causal Inference; Regression adjustment; G-methods; G-formula; Propensity score; Inverse probability weighting; Double-robust methods; Machine learning; Targeted maximum likelihood estimation; Epidemiology; Statistics; Tutorial
21 |
--------------------------------------------------------------------------------
/Results.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/migariane/TutorialCausalInferenceEstimators/d809b657b382e227837d3032b1517612c478818d/Results.RData
--------------------------------------------------------------------------------
/StataCodeBoxes.do:
--------------------------------------------------------------------------------
1 | /*
2 | Tutorial: causal inference methods made easy for applied researchers/epidemiologists/statisticians
3 | =================================================================================================
4 |
5 | ICON-LSHTM, LONDON, 16th October 2020
6 |
7 | Miguel Angel Luque Fernandez, PhD
8 | Assistant Professor of Epidemiology and Biostatistics
9 | Camille Maringe, PhD
10 | Assistant Professor
11 |
12 | Inequalities in Cancer Outcomes Network, LSHTM, London, UK
13 |
14 | Copyright (c) 2020 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON INFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
17 |
18 | Bug reports: miguel-angel.luque@lshtm.ac.uk
19 |
20 | The rhc dataset can be downloaded at http://biostat.mc.vanderbilt.edu/wiki/Main/DataSets
21 | */
22 |
23 |
24 |
25 |
26 | *** Preliminaries
27 | clear
28 | set more off
29 | cd "C:\Data" // this path should point to where the RHC data are
30 | use "rhc.dta", clear
31 | describe
32 | count
33 | * 83 variables and 5,735 observations
34 |
35 | /* Box 1: Setting the data */
36 | * Define the outcome (Y), exposure (A), confounder (C), and confounders (W)
37 | global Y death_d30
38 | global A rhc
39 | global C gender
40 | global W gender age edu race carcinoma
41 |
42 | /* Box 2: Naive estimate of the ATE */
43 | * Naive approach to estimate the causal effect
44 | regr $Y $A $C
45 | * The naive estimate of the causal effect is 0.07352
46 |
47 | /* 3. G-formula */
48 | /* 3.1 Non-parametric G-formula */
49 |
50 | * 1) ATE
51 | /* Box 3: Non-parametric G-Formula for the ATE */
52 | proportion $C
53 | matrix m=e(b) // row vector of the estimated proportions, one column per level of $C
54 | gen genderf = m[1,1] // constant variable holding the first proportion
55 | sum genderf
56 | gen genderm = m[1,2] // constant variable holding the second proportion
57 | sum genderm
58 | * you may need to install the command sumup, type:
59 | * ssc install sumup
60 | sumup $Y, by($A $C)
61 | * from the sumup command, extract the conditional means for the given A and C levels, i.e. zero and one
62 | * see matrix list y00: position subscript [3,1] is the one of interest
63 | matrix y00 = r(Stat1)
64 | matrix y01 = r(Stat2)
65 | matrix y10 = r(Stat3)
66 | matrix y11 = r(Stat4)
67 | gen EY1 = ((y11[3,1]-y01[3,1]))*genderm // y11 minus y01, weighted by the second proportion
68 | gen EY0 = ((y10[3,1]-y00[3,1]))*genderf // y10 minus y00, weighted by the first proportion
69 | qui: mean EY1 EY0
70 | matrix ATE = r(table)
71 | display "The ATE is: " ATE[1,1] + ATE[1,2] // sum of the two weighted differences
72 | drop EY*
73 | * The ATE from non-parametric estimator is: 0.073692
74 | // Alternatively, compute the same sum in one step
75 | gen ATE = ((y11[3,1]-y01[3,1]))*genderm + ((y10[3,1]-y00[3,1]))*genderf
76 | qui sum ATE
77 | drop ATE
78 |
79 | * Check that Stata "teffects" command obtains the same estimate
80 | teffects ra ($Y $C) ($A)
81 | * The ATE from "teffects" implementation is: 0.073692
82 |
83 | * 2) ATT
84 | /* Box 4: Non-parametric G-Formula for the ATT */
85 | * Estimate the marginal probabilities
86 | proportion $C if $A==1
87 | matrix m=e(b)
88 | gen genderfatet = m[1,1]
89 | gen gendermatet = m[1,2]
90 | gen EY1 = ((y11[3,1]-y01[3,1]))*gendermatet
91 | gen EY0 = ((y10[3,1]-y00[3,1]))*genderfatet
92 | qui: mean EY1 EY0
93 | matrix ATT = r(table)
94 | display "The ATT is: " ATT[1,1] + ATT[1,2] // Applying the G-formula
95 | drop EY*
96 | * The ATT from non-parametric estimator is: 0.073248
97 | // Also one can try
98 | gen ATT = ((y11[3,1]-y01[3,1]))*gendermatet + ((y10[3,1]-y00[3,1]))*genderfatet
99 | qui sum ATT
100 | drop ATT
101 |
102 | * Check using Stata "teffects" command
103 | teffects ra ($Y $C) ($A), atet
104 | * The ATT from "teffects" implementation is: 0.073248
105 |
106 | /* Box 5: Bootstrap 95% Confidence Intervals (CI) for the ATE/ATT estimated using the Non-parametric G-Formula */
107 |
108 | * 1) For the ATE
109 | capture program drop ATE
110 | program define ATE, rclass
111 | capture drop y1 // y1 and y0 are not created in this program; the drops only clear leftovers
112 | capture drop y0
113 | capture drop ATE // remove the variable left by a previous call
114 | sumup $Y, by($A $C) // recomputed on the current (resampled) data
115 | matrix y00 = r(Stat1)
116 | matrix y01 = r(Stat2)
117 | matrix y10 = r(Stat3)
118 | matrix y11 = r(Stat4)
119 | gen ATE = ((y11[3,1]-y01[3,1]))*genderm + ((y10[3,1]-y00[3,1]))*genderf // genderm/genderf are the variables built in Box 3; they are not recomputed here
120 | qui sum ATE
121 | return scalar ate = `r(mean)' // value collected by bootstrap as r(ate)
122 | end
123 |
124 | qui bootstrap r(ate), reps(1000): ATE
125 | estat boot, all
126 |
127 | * 2) For the ATT
128 | capture program drop ATT
129 | program define ATT, rclass
130 | capture drop y1 // y1 and y0 are not created in this program; the drops only clear leftovers
131 | capture drop y0
132 | capture drop ATT // remove the variable left by a previous call
133 | sumup $Y, by($A $C) // recomputed on the current (resampled) data
134 | matrix y00 = r(Stat1)
135 | matrix y01 = r(Stat2)
136 | matrix y10 = r(Stat3)
137 | matrix y11 = r(Stat4)
138 | gen ATT = ((y11[3,1]-y01[3,1]))*gendermatet + ((y10[3,1]-y00[3,1]))*genderfatet // gendermatet/genderfatet are the variables built in Box 4; they are not recomputed here
139 | qui sum ATT
140 | return scalar att = `r(mean)' // value collected by bootstrap as r(att)
141 | end
142 |
143 | qui bootstrap r(att), reps(1000): ATT
144 | estat boot, all
145 |
146 | drop ATE ATT
147 |
148 | /* Box 6: Non-parametric G-Formula using a fully saturated regression model in Stata (A) */
149 | * method 1: conditional probabilities
150 | regress $Y ibn.$A ibn.$A#c.($C) , noconstant vce(robust) coeflegend
151 | predictnl ATE = (_b[1.rhc] + _b[1.rhc#c.gender]*gender) - (_b[0bn.rhc] + _b[0bn.rhc#c.gender]*gender)
152 | qui: sum ATE
153 | display "The ATE is: " "`r(mean)'"
154 | drop ATE
155 |
156 | /* Box 7: Non-parametric G-Formula using a fully saturated regression model in Stata (B) */
157 | * method 2: marginal probabilities
158 | regress $Y ibn.$A ibn.$A#c.($C) , noconstant vce(robust) coeflegend
159 |
160 | * Marginal probability in each treatment group
161 | margins $A , vce(unconditional)
162 |
163 | * Difference in marginal probability between treatment groups
164 | margins r.$A , contrast(nowald)
165 |
166 | /* 3.2 PARAMETRIC G-FORMULA */
167 |
168 | * One confounder
169 |
170 | /* Box 8: Parametric G-formula */
171 | * Calculations by hand
172 | * Expected probability amongst treated
173 | regress $Y $C if $A==1
174 | predict double y1hat
175 |
176 | * Expected probability amongst untreated
177 | regress $Y $C if $A==0
178 | predict double y0hat
179 | mean y1hat y0hat
180 |
181 | * Difference between expected probabilities (ATE) and biased confidence interval
182 | lincom _b[y1hat] - _b[y0hat]
183 |
184 | /* Box 9: Parametric regression adjustment using Stata's teffects (one confounder) */
185 | teffects ra ($Y $C) ($A)
186 |
187 | /* Box 10: Bootstrap for the parametric regression adjustment */
188 | capture program drop ATE
189 | program define ATE, rclass
190 | capture drop y1 // remove predictions left by a previous call
191 | capture drop y0
192 | reg $Y $C if $A==1 // outcome model among A == 1
193 | predict double y1, xb // linear prediction for every observation
194 | quiet sum y1 // result is not used below
195 | reg $Y $C if $A==0 // outcome model among A == 0
196 | predict double y0, xb
197 | quiet sum y0 // result is not used below
198 | mean y1 y0
199 | lincom _b[y1]-_b[y0] // difference of the two means
200 | return scalar ace =`r(estimate)' // value collected by bootstrap as r(ace)
201 | end
202 | qui bootstrap r(ace), reps(1000): ATE
203 | estat boot, all
204 |
205 | * More than one confounder
206 | capture drop y1hat y0hat // Box 8 already created y1hat/y0hat; predict cannot overwrite an existing variable
207 | /* Box 11: Parametric multivariate regression adjustment implementation of the G-Formula */
208 | regress $Y $W if $A==1 // outcome model among A == 1
209 | predict double y1hat // prediction for every observation
210 | regress $Y $W if $A==0 // outcome model among A == 0
211 | predict double y0hat
212 | mean y1hat y0hat
213 | lincom _b[y1hat] - _b[y0hat] // difference of the two means
214 |
215 | /* Box 12: Parametric multivariate regression adjustment using Stata’s teffects command */
216 | teffects ra ($Y $W) ($A)
217 |
218 | /* Box 13: Parametric multivariate regression adjustment using Stata’s margins command */
219 | regress $Y ibn.$A ibn.$A#c.($W) , noconstant vce(robust)
220 | margins $A, vce(unconditional)
221 | margins r.$A, contrast(nowald)
222 |
223 | /* Box 14: Bootstrap for the multivariate parametric regression adjustment */
224 | capture program drop ATE
225 | program define ATE, rclass
226 | capture drop y1 // remove predictions left by a previous call
227 | capture drop y0
228 | reg $Y $W if $A==1 // outcome model among A == 1
229 | predict double y1, xb // linear prediction for every observation
230 | quiet sum y1 // result is not used below
231 | reg $Y $W if $A==0 // outcome model among A == 0
232 | predict double y0, xb
233 | quiet sum y0 // result is not used below
234 | mean y1 y0
235 | lincom _b[y1]-_b[y0] // difference of the two means
236 | return scalar ace =`r(estimate)' // value collected by bootstrap as r(ace)
237 | end
238 | qui bootstrap r(ace), reps(1000): ATE // removed the stray "dots" after the command name: there it is passed to ATE as an unused argument, not read as a bootstrap option
239 | estat boot, all
240 |
241 | /* Box 15: Computing the parametric marginal risk ratio after regression adjustment */
242 | teffects ra ($Y $W) ($A), aequations
243 | teffects ra ($Y $W) ($A), coeflegend
244 | nlcom 100*_b[ATE:r1vs0.$A]/_b[POmean:0.$A]
245 | * 27.4% increase in relative risk
246 | teffects ra ($Y $W) ($A), pom coeflegend
247 | nlcom _b[POmeans:1.rhc]/ _b[POmeans:0bn.rhc]
248 | * 27.4% increase in relative risk
249 |
250 | /* 4 Inverse probability of treatment weighting */
251 | /* 4.1 Inverse probability of treatment weighting based on the propensity score plus regression adjustment */
252 |
253 | /* Box 16: Computation of the IPTW estimator for the ATE */
254 | * Fit the exposure (propensity score) model
255 | logit $A $W, vce(robust) nolog
256 |
257 | * Keep the fitted probability of exposure
258 | predict double ps, pr
259 |
260 | * Weight 1/ps on treated rows, zero elsewhere
261 | gen double ipw1 = ($A==1)/ps
262 |
263 | * Intercept-only weighted regression: its constant is the weighted mean of $Y
264 | reg $Y [pw=ipw1]
265 | scalar Y1 = _b[_cons]
266 |
267 | * Weight 1/(1-ps) on non-treated rows, zero elsewhere
268 | gen double ipw0 = ($A==0)/(1-ps)
269 | reg $Y [pw=ipw0]
270 | scalar Y0 = _b[_cons]
271 | di "ATE =" Y1 - Y0
272 |
273 | /* Box 17: Bootstrap computation for the IPTW estimator */
274 | * Bootstrap the confidence intervals: ATE returns r(ace), the difference between the two weighted means of $Y
275 | capture program drop ATE
276 | program define ATE, rclass
277 | capture drop y1
278 | capture drop y0
279 | regress $Y [pw=ipw1]   // NOTE(review): ipw1/ipw0 are read from the data, not re-estimated inside the program - confirm intended
280 | matrix y1 = e(b)
281 | gen double y1 = y1[1,1]   // constant column holding the intercept of the ipw1-weighted fit
282 | regress $Y [pw=ipw0]
283 | matrix y0 = e(b)
284 | gen double y0 = y0[1,1]   // constant column holding the intercept of the ipw0-weighted fit
285 | mean y1 y0
286 | lincom _b[y1]-_b[y0]
287 | return scalar ace = `r(estimate)'
288 | end
289 | qui bootstrap r(ace), reps(1000): ATE
290 | estat boot, all
291 |
292 | /* Box 18: Computation of the IPTW estimator for the ATE using Stata’s teffects command */
293 | teffects ipw ($Y) ($A $W, logit), nolog vsquish   // outcome $Y; treatment $A modelled on $W with a logit
294 |
295 | /* Box 19: Assessing IPTW balance */
296 | * Stata teffects and tebalance commands
297 | qui teffects ipw ($Y) ($A $W)
298 | tebalance summarize
299 |
300 | * By hand - with the example of gender
301 | egen genderst = std(gender) // Standardized copy of gender
302 | logistic $A $W // Propensity score model
303 | capture drop ps
304 | predict double ps
305 | gen ipw = .
306 | replace ipw=($A==1)/ps if $A==1
307 | replace ipw=($A==0)/(1-ps) if $A==0
308 | regress genderst $A // Raw (unweighted) difference in standardized gender between levels of $A
309 | regress genderst $A [pw=ipw] // Same difference after weighting by ipw
310 |
311 | /* Box 20: Assessing IPTW overlap by hand */
312 | * Propensity-score summary within each level of $A, then one density curve per level
313 | bysort $A: summarize ps
314 | kdensity ps if $A==1, generate(x1pointsa d1A) nograph n(10000)
315 | kdensity ps if $A==0, generate(x0pointsa d0A) nograph n(10000)
316 | lab var d1A "density for RHC=1"
317 | lab var d0A "density for RHC=0"
318 | twoway line d0A x0pointsa, yaxis(1) || line d1A x1pointsa, yaxis(2)
319 |
320 | /* Box 21: Assessing overlap using Stata's teffects overlap */
321 | qui: teffects ipw ($Y) ($A $W, logit), nolog vsquish   // fitted quietly; teffects overlap uses these results
322 | teffects overlap
323 |
324 |
325 | /* 4.2 Marginal structural model with stabilized weights */
326 | /* Box 22: Computation of the IPTW estimator for the ATE using a MSM */
327 | * Numerator: marginal probability of treatment (intercept-only logit)
328 | logit $A, vce(robust) nolog
329 | predict double nps, pr
330 |
331 | * Denominator: probability of treatment given $W
332 | logit $A $W, vce(robust) nolog
333 | predict double dps, pr
334 |
335 | * Unstabilized weight: 1/dps if treated, 1/(1-dps) if not
336 | capture drop ipw
337 | generate ipw = .
338 | replace ipw = 1/dps if $A==1
339 | replace ipw = 1/(1-dps) if $A==0
340 | summarize ipw
341 |
342 | * Stabilized weight: marginal probability over conditional probability
343 | generate sws = .
344 | replace sws = nps/dps if $A==1
345 | replace sws = (1-nps)/(1-dps) if $A==0
346 | summarize sws
347 |
348 | * MSM: weighted regression of $Y on $A
349 | regress $Y $A [pw=ipw], vce(robust) // unstabilized weights
350 | regress $Y $A [pw=sws], vce(robust) // stabilized weights
351 |
352 |
353 | /* 4.3 IPTW with regression adjustment */
354 |
355 | /* Box 23: Computation of the IPTW-RA estimator for the ATE and bootstrap for statistical inference */
356 | capture program drop ATE
357 | program define ATE, rclass
358 | capture drop y1
359 | capture drop y0
360 | reg $Y $W if $A==1 [pw=sws]   // outcome model on rows with $A==1, weighted by sws
361 | predict double y1, xb
362 | quiet sum y1
363 | return scalar y1=`r(mean)'
364 | reg $Y $W if $A==0 [pw=sws]   // outcome model on rows with $A==0, weighted by sws
365 | predict double y0, xb
366 | quiet sum y0
367 | return scalar y0=`r(mean)'
368 | mean y1 y0
369 | lincom _b[y1]-_b[y0]
370 | return scalar ace =`r(estimate)'
371 | end
372 | qui bootstrap r(ace), reps(1000): ATE   // 1000 replicates, as in the other bootstrap boxes of this file
373 | estat boot, all
374 |
375 | /* Box 24: Computation of the IPTW-RA estimator for the ATE using Stata’s teffects */
376 | teffects ipwra ($Y $W) ($A $W)
377 | nlcom 100*_b[ATE:r1vs0.$A]/_b[POmean:0.$A]   // ATE as a percentage of the potential-outcome mean at $A==0
378 | teffects ipwra ($Y $W) ($A $W), pom coeflegend
379 | nlcom _b[POmeans:1.$A]/ _b[POmeans:0bn.$A]   // ratio of the two potential-outcome means
380 | preserve   // eltmle to check marginal RR; wrapped in preserve/restore like the other eltmle calls in this file
381 | eltmle $Y $A $W, tmle
382 | restore
383 | /* 5. Augmented inverse probability weighting */
384 | /* Box 25: Computation of the AIPTW estimator for the ATE and bootstrap for statistical inference */
385 | * Step (i) prediction model for the outcome
386 | qui glm $Y $A $W, fam(bin)
387 | predict double QAW, mu
388 | qui glm $Y $W if $A==1, fam(bin)
389 | predict double Q1W, mu
390 | qui glm $Y $W if $A==0, fam(bin)
391 | predict double Q0W, mu
392 |
393 | * Step (ii): prediction model for the treatment
394 | cap drop dps nps sws y1 y0
395 | qui logit $A $W
396 | predict double dps, pr
397 | qui logit $A
398 | predict double nps, pr
399 | gen sws = .
400 | replace sws = nps/dps if $A==1
401 | replace sws = (1-nps)/(1-dps) if $A==0
402 |
403 | * Step (iii): Estimation equation - each arm gets its own inverse-probability-weighted residual, so the terms no longer cancel in y1 - y0
404 | gen double y1 = ($A==1)*($Y - Q1W)/dps + Q1W
405 | quiet sum y1
406 | scalar y1=`r(mean)'
407 | gen double y0 = ($A==0)*($Y - Q0W)/(1 - dps) + Q0W
408 | quiet sum y0
409 | scalar y0=`r(mean)'
410 | mean y1 y0
411 | lincom _b[y1] - _b[y0]
412 |
413 | * Step (iv): Bootstrap confidence intervals
414 | capture program drop ATE
415 | program define ATE, rclass
416 | capture drop y1
417 | capture drop y0
418 | capture drop Q1W
419 | capture drop Q0W
420 | * dps is the propensity score generated in Step (ii); it is read from the data, not re-fitted here
421 | qui glm $Y $W if $A==1, fam(bin)
422 | predict double Q1W, mu
423 | qui glm $Y $W if $A==0, fam(bin)
424 | predict double Q0W, mu
425 | gen double y1 = ($A==1)*($Y - Q1W)/dps + Q1W
426 | quiet sum y1
427 | return scalar y1=`r(mean)'
428 | gen double y0 = ($A==0)*($Y - Q0W)/(1 - dps) + Q0W
429 | quiet sum y0
430 | return scalar y0=`r(mean)'
431 | mean y1 y0
432 | lincom _b[y1] - _b[y0]
433 | return scalar ace =`r(estimate)'
434 | end
435 | qui bootstrap r(ace), reps(1000): ATE
436 | estat boot, all
437 |
438 | /* Box 26: Computation of the AIPTW estimator for the ATE and marginal risk ratio using Stata’s teffects */
439 | teffects aipw ($Y $W) ($A $W, logit)
440 | * marginal relative risk increase in percent: 100 * ATE / potential-outcome mean at $A==0
441 | nlcom 100*_b[ATE:r1vs0.$A]/_b[POmean:0.$A]
442 | * another way to compute it: ratio of the two potential-outcome means (risk ratio)
443 | teffects aipw ($Y $W) ($A $W, logit), pom coeflegend
444 | nlcom _b[POmeans:1.$A]/ _b[POmeans:0bn.$A]
445 |
446 | /* 6. DATA-ADAPTIVE ESTIMATION: ENSEMBLE LEARNING TARGETED MAXIMUM LIKELIHOOD ESTIMATION */
447 | /* Box 27: Computational implementation of TMLE by hand */
448 |
449 | * Step 1: prediction model for the outcome Q0 (g-computation)
450 | glm $Y $A $W, fam(binomial)
451 | predict double QAW_0, mu
452 | gen aa=$A   // back up the observed treatment before overwriting it
453 | replace $A = 0
454 | predict double Q0W_0, mu   // prediction with everyone set to untreated
455 | replace $A= 1
456 | predict double Q1W_0, mu   // prediction with everyone set to treated
457 | replace $A = aa
458 | drop aa
459 |
460 | // Q to logit scale, using the Step 1 predictions (QAW_0, Q1W_0, Q0W_0) rather than the Q* variables left over from Box 25
461 | gen double logQAW = log(QAW_0 / (1 - QAW_0))
462 | gen double logQ1W = log(Q1W_0 / (1 - Q1W_0))
463 | gen double logQ0W = log(Q0W_0 / (1 - Q0W_0))
464 |
465 | * Step 2: prediction model for the treatment g0 (IPTW)
466 | glm $A $W, fam(binomial)
467 | predict double gw, mu   // stored as double, like every other prediction in this file
468 | gen double H1W = $A / gw
469 | gen double H0W = (1 - $A ) / (1 - gw)
470 |
471 | * Step 3: Computing the clever covariate H(A,W) and estimating the parameter (epsilon) (MLE)
472 | glm $Y H1W H0W, fam(binomial) offset(logQAW) noconstant
473 | mat a = e(b)
474 | gen double eps1 = a[1,1]   // coefficient on H1W
475 | gen double eps2 = a[1,2]   // coefficient on H0W
476 |
477 | * Step 4: shift each initial prediction on the logit scale by its epsilon term, then map back to a probability
478 | generate double Q1W_1 = exp(eps1 / gw + logQ1W) / (1 + exp(eps1 / gw + logQ1W))
479 | generate double Q0W_1 = exp(eps2 / (1 - gw) + logQ0W) / (1 + exp(eps2 / (1 - gw) + logQ0W))
480 |
481 | * Step 5: targeted ATE = mean of the updated differences, saved in global ATE
482 | generate ATE = (Q1W_1 - Q0W_1)
483 | summarize ATE
484 | global ATE = r(mean)
485 | drop ATE
486 |
487 | * Step 6: inference from the efficient influence curve
488 | quietly summarize Q1W_1
489 | generate EY1tmle = r(mean)
490 | quietly summarize Q0W_1
491 | generate EY0tmle = r(mean)
492 |
493 | generate d1 = $A * ($Y - Q1W_1)/gw + Q1W_1 - EY1tmle
494 | generate d0 = (1 - $A) * ($Y - Q0W_1)/(1 - gw) + Q0W_1 - EY0tmle
495 |
496 | generate IC = d1 - d0
497 | quietly summarize IC
498 | generate varIC = r(Var) / r(N)
499 | drop d1 d0 IC
500 |
501 | global LCI = $ATE - 1.96*sqrt(varIC)
502 | global UCI = $ATE + 1.96*sqrt(varIC)
503 | di "ATE:" %05.4f $ATE _col(15) "95%CI: " %05.4f $LCI "," %05.4f $UCI
504 |
505 | /* Box 28: TMLE with data-adaptive estimation using Stata’s user-written eltmle command */
506 | * if not already installed, type:
507 | * ssc install eltmle
508 | preserve
509 | eltmle $Y $A $W, tmle
510 | restore
511 |
512 |
513 | /* 7. Simulation */
514 | /* Box 29: Data generation for the Monte Carlo experiment */
515 |
516 | * Data generation
517 | clear
518 | set obs 1000
519 | set seed 777
520 | gen w1 = round(runiform(1, 5)) //Quintiles of Socioeconomic Deprivation
521 | gen w2 = rbinomial(1, 0.45) //Binary: probability age >65 = 0.45
522 | gen w3 = round(runiform(0, 1) + 0.75*(w2) + 0.8*(w1)) //Stage
523 | recode w3 (5/6=1) //Stage (TNM): values 5 and 6 are recoded to 1, leaving 4 levels
524 | gen w4 = round(runiform(0, 1) + 0.75*(w2) + 0.2*(w1)) //Comorbidities: categorical four levels
525 | gen A = (rbinomial(1,invlogit(-1 - 0.15*(w4) + 1.5*(w2) + 0.75*(w3) + 0.25*(w1) + 0.8*(w2)*(w4)))) //Binary treatment
526 | gen Y1 = (invlogit(-3 + 1 + 0.25*(w4) + 0.75*(w3) + 0.8*(w2)*(w4) + 0.05*(w1))) // Potential outcome under treatment (stored as a probability)
527 | gen Y0 = (invlogit(-3 + 0 + 0.25*(w4) + 0.75*(w3) + 0.8*(w2)*(w4) + 0.05*(w1))) // Potential outcome under no treatment (stored as a probability)
528 | gen psi = Y1-Y0 // Row-level effect; its mean is the simulated ATE
529 | gen Y = A*(Y1) + (1 - A)*Y0 //Observed outcome. NOTE(review): this is a probability in (0,1), not a 0/1 draw - confirm intended
530 |
531 |
532 | * True simulated ATE: mean of the row-level effects
533 | mean psi
534 |
535 | * ATE estimation, one stored result per estimator
536 | * (1) Regression adjustment
537 | teffects ra (Y w1 w2 w3 w4) (A)
538 | est store ra
539 |
540 | * (2) IPTW
541 | teffects ipw (Y) (A w1 w2 w3 w4)
542 | est store ipw
543 |
544 | * (3) IPTW-RA
545 | teffects ipwra (Y w1 w2 w3 w4) (A w1 w2 w3 w4)
546 | est store ipwra
547 |
548 | * (4) AIPTW
549 | teffects aipw (Y w1 w2 w3 w4) (A w1 w2 w3 w4)
550 | est store aipw
551 |
552 | * Side-by-side table; the intercept-only regression stores the mean of psi as a model
553 | quietly regress psi
554 | est store psi
555 | estout psi ra ipw ipwra aipw
556 |
557 | // Ensemble learning targeted maximum likelihood estimation (eltmle) on the simulated data; preserve/restore puts the data back afterwards
558 | preserve
559 | eltmle Y A w1 w2 w3 w4, tmle
560 | restore
561 |
562 | scalar ate_ref = 0.1787   // Relative bias of each ATE against this hard-coded reference value
563 | * Regression adjustment
564 | di abs(scalar(ate_ref) - 0.203419)/scalar(ate_ref)
565 |
566 | * IPTW
567 | di abs(scalar(ate_ref) - 0.2776)/scalar(ate_ref)
568 |
569 | * IPTW-RA
570 | di abs(scalar(ate_ref) - .2052088)/scalar(ate_ref)
571 |
572 | * AIPTW
573 | di abs(scalar(ate_ref) - 0.2030)/scalar(ate_ref)
574 |
575 | * ELTMLE
576 | di abs(scalar(ate_ref) - 0.1784)/scalar(ate_ref)
577 |
578 |
--------------------------------------------------------------------------------
/rhc.Rdata:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/migariane/TutorialCausalInferenceEstimators/d809b657b382e227837d3032b1517612c478818d/rhc.Rdata
--------------------------------------------------------------------------------
/rhc.dta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/migariane/TutorialCausalInferenceEstimators/d809b657b382e227837d3032b1517612c478818d/rhc.dta
--------------------------------------------------------------------------------