├── .gitignore ├── LICENSE ├── PythonCodeBoxes.ipynb ├── RCodeBoxes.R ├── README.md ├── Results.RData ├── StataCodeBoxes.do ├── rhc.Rdata ├── rhc.csv └── rhc.dta /.gitignore: -------------------------------------------------------------------------------- 1 | ### Python ### 2 | .ipynb_checkpoints/* 3 | ### R ### 4 | # History files 5 | .Rhistory 6 | .Rapp.history 7 | # User-specific files 8 | .Ruserdata 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Miguel Angel Luque Fernandez 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON INFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
10 | -------------------------------------------------------------------------------- /PythonCodeBoxes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Tutorial: causal inference methods made easy for applied researchers/epidemiologists/statisticians \n", 8 | "\n", 9 | "### ICON-LSHTM, LONDON, 16th October 2020\n", 10 | "\n", 11 | "Miguel Angel Luque Fernandez PhD, Assistant Professor of Epidemiology and Biostatistics\n", 12 | "\n", 13 | "Matthew Smith PhD, Research Fellow Inequalities in Cancer Outcomes Network, LSHTM, London, UK\n", 14 | "\n", 15 | "Paul Zivich, University of North Carolina at Chapel Hill\n", 16 | "\n", 17 | "Copyright (c) 2020 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the \"Software\"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.\n", 18 | "\n", 19 | "THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n", 20 | "\n", 21 | "Bug reports: miguel-angel.luque at lshtm.ac.uk\n", 22 | "\n", 23 | "The rhc dataset can be downloaded at http://biostat.mc.vanderbilt.edu/wiki/Main/DataSets" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 1, 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "name": "stdout", 33 | "output_type": "stream", 34 | "text": [ 35 | "patsy 0.5.1\n", 36 | "scipy 1.5.2\n", 37 | "numpy 1.19.1\n", 38 | "pandas 1.1.0\n", 39 | "statsmodels 0.11.1\n", 40 | "matplotlib 3.3.1\n", 41 | "zepid 0.9.0\n" 42 | ] 43 | } 44 | ], 45 | "source": [ 46 | "%matplotlib inline\n", 47 | "\n", 48 | "# Importing libraries for the tutorial\n", 49 | "import patsy\n", 50 | "import scipy\n", 51 | "import numpy as np\n", 52 | "import pandas as pd\n", 53 | "import statsmodels.api as sm\n", 54 | "import statsmodels.formula.api as smf\n", 55 | "import matplotlib\n", 56 | "import matplotlib.pyplot as plt\n", 57 | "import zepid\n", 58 | "\n", 59 | "from scipy.stats.kde import gaussian_kde\n", 60 | "from scipy.stats import logistic\n", 61 | "from zepid.calc import probability_to_odds, odds_to_probability\n", 62 | "\n", 63 | "print(\"patsy \", patsy.__version__)\n", 64 | "print(\"scipy \", scipy.__version__)\n", 65 | "print(\"numpy \", np.__version__)\n", 66 | "print(\"pandas \", pd.__version__)\n", 67 | "print(\"statsmodels\", sm.__version__)\n", 68 | "print(\"matplotlib \", matplotlib.__version__)\n", 69 | "print(\"zepid \", zepid.__version__)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "## Setting up the Data" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 2, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "# Box 1: 
Setting up the data\n", 86 | "data = pd.read_csv(\"rhc.csv\")\n", 87 | "data.rename(columns={\"rhc\": \"A\", \n", 88 | " \"death_d30\": \"Y\",\n", 89 | " \"gender\": \"C\",\n", 90 | " \"age\": \"W1\",\n", 91 | " \"edu\": \"W2\",\n", 92 | " \"race\": \"W3\",\n", 93 | " \"carcinoma\": \"W4\",\n", 94 | " }, inplace=True)\n", 95 | "data['A'] = np.where(data['A'] == \"Yes\", 1, 0)\n", 96 | "data['C'] = np.where(data['C'] == \"Female\", 0, 1)\n", 97 | "\n", 98 | "data = data[[\"Y\", \"A\", \"C\", \"W1\", \"W2\", \"W3\", \"W4\"]].copy()" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "## Naive estimate of the ATE" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 3, 111 | "metadata": {}, 112 | "outputs": [ 113 | { 114 | "name": "stdout", 115 | "output_type": "stream", 116 | "text": [ 117 | "0.07352\n" 118 | ] 119 | }, 120 | { 121 | "data": { 122 | "text/html": [ 123 | "\n", 124 | "\n", 125 | "\n", 126 | " \n", 127 | "\n", 128 | "\n", 129 | " \n", 130 | "\n", 131 | "\n", 132 | " \n", 133 | "\n", 134 | "\n", 135 | " \n", 136 | "\n", 137 | "\n", 138 | " \n", 139 | "\n", 140 | "\n", 141 | " \n", 142 | "\n", 143 | "\n", 144 | " \n", 145 | "\n", 146 | "\n", 147 | " \n", 148 | "\n", 149 | "\n", 150 | " \n", 151 | "\n", 152 | "
OLS Regression Results
Dep. Variable: Y R-squared: 0.006
Model: OLS Adj. R-squared: 0.005
Method: Least Squares F-statistic: 16.59
Date: Mon, 21 Dec 2020 Prob (F-statistic): 6.58e-08
Time: 06:30:58 Log-Likelihood: -3812.9
No. Observations: 5735 AIC: 7632.
Df Residuals: 5732 BIC: 7652.
Df Model: 2
Covariance Type: nonrobust
\n", 153 | "\n", 154 | "\n", 155 | " \n", 156 | "\n", 157 | "\n", 158 | " \n", 159 | "\n", 160 | "\n", 161 | " \n", 162 | "\n", 163 | "\n", 164 | " \n", 165 | "\n", 166 | "
coef std err t P>|t| [0.025 0.975]
Intercept 0.3049 0.010 29.354 0.000 0.285 0.325
A 0.0735 0.013 5.739 0.000 0.048 0.099
C 0.0027 0.013 0.219 0.826 -0.022 0.027
\n", 167 | "\n", 168 | "\n", 169 | " \n", 170 | "\n", 171 | "\n", 172 | " \n", 173 | "\n", 174 | "\n", 175 | " \n", 176 | "\n", 177 | "\n", 178 | " \n", 179 | "\n", 180 | "
Omnibus: 36985.427 Durbin-Watson: 1.981
Prob(Omnibus): 0.000 Jarque-Bera (JB): 993.118
Skew: 0.696 Prob(JB): 2.22e-216
Kurtosis: 1.511 Cond. No. 3.07


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified." 181 | ], 182 | "text/plain": [ 183 | "\n", 184 | "\"\"\"\n", 185 | " OLS Regression Results \n", 186 | "==============================================================================\n", 187 | "Dep. Variable: Y R-squared: 0.006\n", 188 | "Model: OLS Adj. R-squared: 0.005\n", 189 | "Method: Least Squares F-statistic: 16.59\n", 190 | "Date: Mon, 21 Dec 2020 Prob (F-statistic): 6.58e-08\n", 191 | "Time: 06:30:58 Log-Likelihood: -3812.9\n", 192 | "No. Observations: 5735 AIC: 7632.\n", 193 | "Df Residuals: 5732 BIC: 7652.\n", 194 | "Df Model: 2 \n", 195 | "Covariance Type: nonrobust \n", 196 | "==============================================================================\n", 197 | " coef std err t P>|t| [0.025 0.975]\n", 198 | "------------------------------------------------------------------------------\n", 199 | "Intercept 0.3049 0.010 29.354 0.000 0.285 0.325\n", 200 | "A 0.0735 0.013 5.739 0.000 0.048 0.099\n", 201 | "C 0.0027 0.013 0.219 0.826 -0.022 0.027\n", 202 | "==============================================================================\n", 203 | "Omnibus: 36985.427 Durbin-Watson: 1.981\n", 204 | "Prob(Omnibus): 0.000 Jarque-Bera (JB): 993.118\n", 205 | "Skew: 0.696 Prob(JB): 2.22e-216\n", 206 | "Kurtosis: 1.511 Cond. No. 
3.07\n", 207 | "==============================================================================\n", 208 | "\n", 209 | "Warnings:\n", 210 | "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", 211 | "\"\"\"" 212 | ] 213 | }, 214 | "execution_count": 3, 215 | "metadata": {}, 216 | "output_type": "execute_result" 217 | } 218 | ], 219 | "source": [ 220 | "# Box 2: Regression naive approach\n", 221 | "fm = smf.ols(\"Y ~ A + C\", data).fit()\n", 222 | "print(np.round(fm.params['A'], 5)) # ATE = 0.07352\n", 223 | "fm.summary() # Full model results" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 4, 229 | "metadata": {}, 230 | "outputs": [ 231 | { 232 | "name": "stdout", 233 | "output_type": "stream", 234 | "text": [ 235 | "Prop. Male 0.56\n", 236 | "Prop. Female 0.44\n" 237 | ] 238 | } 239 | ], 240 | "source": [ 241 | "# Box 3: Marginal probabilities\n", 242 | "pr_c1 = np.mean(data['C'])\n", 243 | "pr_c0 = 1 - pr_c1\n", 244 | "print(\"Prop. Male \", np.round(pr_c1, 2))\n", 245 | "print(\"Prop. Female\", np.round(pr_c0, 2))" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "## 3. 
G-Formula\n", 253 | "\n", 254 | "### 3.1 Non-parametric g-formula" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 5, 260 | "metadata": {}, 261 | "outputs": [ 262 | { 263 | "name": "stdout", 264 | "output_type": "stream", 265 | "text": [ 266 | "ATE 0.073692\n" 267 | ] 268 | } 269 | ], 270 | "source": [ 271 | "# Box 4: Non-parametric g-formula for the ATE\n", 272 | "pr_y_a1c1 = np.mean(data.loc[(data['C'] == 1) & (data['A'] == 1), 'Y'])\n", 273 | "pr_y_a0c1 = np.mean(data.loc[(data['C'] == 1) & (data['A'] == 0), 'Y'])\n", 274 | "pr_y_a1c0 = np.mean(data.loc[(data['C'] == 0) & (data['A'] == 1), 'Y'])\n", 275 | "pr_y_a0c0 = np.mean(data.loc[(data['C'] == 0) & (data['A'] == 0), 'Y'])\n", 276 | "\n", 277 | "ate = (pr_y_a1c1 - pr_y_a0c1)*pr_c1 + (pr_y_a1c0 - pr_y_a0c0)*pr_c0\n", 278 | "print(\"ATE\", np.round(ate, 6))" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 6, 284 | "metadata": {}, 285 | "outputs": [ 286 | { 287 | "name": "stdout", 288 | "output_type": "stream", 289 | "text": [ 290 | "ATT 0.073248\n" 291 | ] 292 | } 293 | ], 294 | "source": [ 295 | "# Box 5: Non-parametric g-formula for the ATT\n", 296 | "pr_c1_a1 = np.mean(data.loc[data['A'] == 1, 'C'])\n", 297 | "pr_c0_a1 = 1 - pr_c1_a1\n", 298 | "\n", 299 | "att = (pr_y_a1c1 - pr_y_a0c1)*pr_c1_a1 + (pr_y_a1c0 - pr_y_a0c0)*pr_c0_a1\n", 300 | "print(\"ATT\", np.round(att, 6))" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 7, 306 | "metadata": {}, 307 | "outputs": [ 308 | { 309 | "name": "stdout", 310 | "output_type": "stream", 311 | "text": [ 312 | "95% Confidence limits for the ATE\n", 313 | "Percentile method: [0.047741 0.099149]\n", 314 | "Normal Approx method: [0.04798 0.099404]\n", 315 | "\n", 316 | "95% Confidence limits for the ATT\n", 317 | "Percentile method: [0.048054 0.098375]\n", 318 | "Normal Approx method: [0.047811 0.098686]\n" 319 | ] 320 | } 321 | ], 322 | "source": [ 323 | "# Box 6: Bootstrap the 95% 
confidence intervals (CI) for the\n", 324 | "# ATE/ATT estimated using the non-parametric G-Formula\n", 325 | "\n", 326 | "def ate_nonparm_gformula(d):\n", 327 | " \"\"\"Function to estimate the ATE using the nonparametric\n", 328 | " g-formula\"\"\"\n", 329 | " pr_c1 = np.mean(d['C'])\n", 330 | " pr_c0 = 1 - pr_c1\n", 331 | "\n", 332 | " pr_y_11 = np.mean(d.loc[(d['C'] == 1) & (d['A'] == 1), 'Y'])\n", 333 | " pr_y_01 = np.mean(d.loc[(d['C'] == 1) & (d['A'] == 0), 'Y'])\n", 334 | " pr_y_10 = np.mean(d.loc[(d['C'] == 0) & (d['A'] == 1), 'Y'])\n", 335 | " pr_y_00 = np.mean(d.loc[(d['C'] == 0) & (d['A'] == 0), 'Y'])\n", 336 | " \n", 337 | " return (pr_y_11 - pr_y_01)*pr_c1 + (pr_y_10 - pr_y_00)*pr_c0\n", 338 | "\n", 339 | "## ATE ##\n", 340 | "ate_rs = []\n", 341 | "for i in range(1000): # Drawing 1000 bootstrapped samples\n", 342 | " d_star = data.sample(n=data.shape[0], # Same size as input data\n", 343 | " replace=True) # Draw with replacement\n", 344 | " ate_rs.append(ate_nonparm_gformula(d=d_star))\n", 345 | "\n", 346 | "print(\"95% Confidence limits for the ATE\")\n", 347 | "ci_perc = np.percentile(ate_rs, q=[2.5, 97.5])\n", 348 | "print(\"Percentile method: \", np.round(ci_perc, 6))\n", 349 | "ate_se = np.std(ate_rs, ddof=1)\n", 350 | "print(\"Normal Approx method:\", np.round([ate - 1.96*ate_se,\n", 351 | " ate + 1.96*ate_se], 6))\n", 352 | "\n", 353 | "\n", 354 | "def att_nonparm_gformula(d):\n", 355 | " \"\"\"Function to estimate the ATT using the nonparametric\n", 356 | " g-formula\"\"\"\n", 357 | " pr_c1_a1 = np.mean(d.loc[d['A'] == 1, 'C']) # Pr(C=1 | A=1) taken from d itself, not the global data\n", 358 | " pr_c0_a1 = 1 - pr_c1_a1\n", 359 | "\n", 360 | " pr_y_11 = np.mean(d.loc[(d['C'] == 1) & (d['A'] == 1), 'Y'])\n", 361 | " pr_y_01 = np.mean(d.loc[(d['C'] == 1) & (d['A'] == 0), 'Y'])\n", 362 | " pr_y_10 = np.mean(d.loc[(d['C'] == 0) & (d['A'] == 1), 'Y'])\n", 363 | " pr_y_00 = np.mean(d.loc[(d['C'] == 0) & (d['A'] == 0), 'Y'])\n", 364 | " \n", 365 | " return (pr_y_11 - pr_y_01)*pr_c1_a1 + (pr_y_10 - 
pr_y_00)*pr_c0_a1\n", 366 | "\n", 367 | "\n", 368 | "## ATT ##\n", 369 | "att_rs = []\n", 370 | "for i in range(1000): # Drawing 1000 bootstrapped samples\n", 371 | " d_star = data.sample(n=data.shape[0], # Same size as input data\n", 372 | " replace=True) # Draw with replacement\n", 373 | " att_rs.append(att_nonparm_gformula(d=d_star))\n", 374 | "\n", 375 | "print(\"\\n95% Confidence limits for the ATT\")\n", 376 | "ci_perc = np.percentile(att_rs, q=[2.5, 97.5])\n", 377 | "print(\"Percentile method: \", np.round(ci_perc, 6))\n", 378 | "att_se = np.std(att_rs, ddof=1)\n", 379 | "print(\"Normal Approx method:\", np.round([att - 1.96*att_se,\n", 380 | " att + 1.96*att_se], 6)) " 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": 8, 386 | "metadata": {}, 387 | "outputs": [ 388 | { 389 | "name": "stdout", 390 | "output_type": "stream", 391 | "text": [ 392 | "ATE 0.073692\n" 393 | ] 394 | } 395 | ], 396 | "source": [ 397 | "# Box 7: Non-parametric g-formula using saturated regression model (A)\n", 398 | "data[\"A1\"] = np.where(data['A'] == 1, 1, 0)\n", 399 | "data[\"A0\"] = np.where(data['A'] == 0, 1, 0)\n", 400 | "fm = smf.ols(\"Y ~ A1 + A0 + A1:C + A0:C - 1\", data).fit()\n", 401 | "betas = fm.params\n", 402 | "ate = np.mean((betas[\"A1\"] + betas[\"A1:C\"]*data[\"C\"]) -\n", 403 | " (betas[\"A0\"] + betas[\"A0:C\"]*data[\"C\"]))\n", 404 | "\n", 405 | "print(\"ATE\", np.round(ate, 6))" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": 9, 411 | "metadata": {}, 412 | "outputs": [ 413 | { 414 | "name": "stdout", 415 | "output_type": "stream", 416 | "text": [ 417 | "ATE 0.073692\n" 418 | ] 419 | } 420 | ], 421 | "source": [ 422 | "# Box 8: G-formula with saturated regression model using zEpid\n", 423 | "g_formula = zepid.causal.gformula.TimeFixedGFormula(data, \n", 424 | " exposure=\"A\", \n", 425 | " outcome=\"Y\")\n", 426 | "g_formula.outcome_model(\"A + C + A:C\", # Estimating model\n", 427 | " 
print_results=False)\n", 428 | "\n", 429 | "g_formula.fit(\"all\") # all sets A=1\n", 430 | "y_a1 = g_formula.marginal_outcome\n", 431 | "\n", 432 | "g_formula.fit(\"none\") # none sets A=0\n", 433 | "y_a0 = g_formula.marginal_outcome\n", 434 | "\n", 435 | "print(\"ATE\", np.round(y_a1 - y_a0, 6))" 436 | ] 437 | }, 438 | { 439 | "cell_type": "markdown", 440 | "metadata": {}, 441 | "source": [ 442 | "### 3.2 Parametric g-formula" 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": 10, 448 | "metadata": {}, 449 | "outputs": [ 450 | { 451 | "name": "stdout", 452 | "output_type": "stream", 453 | "text": [ 454 | "ATE 0.073692\n" 455 | ] 456 | } 457 | ], 458 | "source": [ 459 | "# Box 9: Parametric g-formula by hand\n", 460 | "f = sm.families.family.Binomial() # Using logit model unlike prev\n", 461 | "fm_a1 = smf.glm(\"Y ~ C\", data.loc[data[\"A\"] == 1], family=f).fit()\n", 462 | "fm_a0 = smf.glm(\"Y ~ C\", data.loc[data[\"A\"] == 0], family=f).fit()\n", 463 | "\n", 464 | "y_a1 = fm_a1.predict(data['C'])\n", 465 | "y_a0 = fm_a0.predict(data['C'])\n", 466 | "ate = np.mean(y_a1 - y_a0)\n", 467 | "\n", 468 | "print(\"ATE\", np.round(ate, 6))" 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": 11, 474 | "metadata": {}, 475 | "outputs": [ 476 | { 477 | "name": "stdout", 478 | "output_type": "stream", 479 | "text": [ 480 | "ATE 0.073514\n" 481 | ] 482 | } 483 | ], 484 | "source": [ 485 | "# Box 10: Parametric Regression Adjustment\n", 486 | "f = sm.families.family.Binomial()\n", 487 | "fm = smf.glm(\"Y ~ A + C\", data, family=f).fit()\n", 488 | "\n", 489 | "da1 = data.copy()\n", 490 | "da1['A'] = 1\n", 491 | "y_a1 = fm.predict(da1)\n", 492 | "\n", 493 | "da0 = data.copy()\n", 494 | "da0['A'] = 0\n", 495 | "y_a0 = fm.predict(da0)\n", 496 | "ate = np.mean(y_a1 - y_a0)\n", 497 | "\n", 498 | "print(\"ATE\", np.round(ate, 6))" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": 12, 504 | "metadata": {}, 
505 | "outputs": [ 506 | { 507 | "name": "stdout", 508 | "output_type": "stream", 509 | "text": [ 510 | "95% Confidence limits for the ATE\n", 511 | "Percentile method: [0.048036 0.100379]\n", 512 | "Normal approx method: [0.047652 0.099376]\n" 513 | ] 514 | } 515 | ], 516 | "source": [ 517 | "# Box 11: Bootstrap for the parametric regression adjustment\n", 518 | "ate_rs = []\n", 519 | "for i in range(1000): # Drawing 1000 bootstrapped samples\n", 520 | " d_star = data.sample(n=data.shape[0], # Same size as input data\n", 521 | " replace=True) # Draw with replacement\n", 522 | " fm = smf.glm(\"Y ~ A + C\", d_star, family=f).fit()\n", 523 | " da = d_star.copy()\n", 524 | " da['A'] = 1\n", 525 | " y_a1 = fm.predict(da)\n", 526 | " da['A'] = 0\n", 527 | " y_a0 = fm.predict(da)\n", 528 | " ate_rs.append(np.mean(y_a1 - y_a0))\n", 529 | "\n", 530 | "print(\"95% Confidence limits for the ATE\")\n", 531 | "ci_perc = np.percentile(ate_rs, q=[2.5, 97.5])\n", 532 | "print(\"Percentile method: \", np.round(ci_perc, 6))\n", 533 | "ate_se = np.std(ate_rs, ddof=1)\n", 534 | "print(\"Normal approx method:\", np.round([ate - 1.96*ate_se,\n", 535 | " ate + 1.96*ate_se], 6))" 536 | ] 537 | }, 538 | { 539 | "cell_type": "code", 540 | "execution_count": 13, 541 | "metadata": {}, 542 | "outputs": [ 543 | { 544 | "name": "stdout", 545 | "output_type": "stream", 546 | "text": [ 547 | "ATE 0.083929\n" 548 | ] 549 | } 550 | ], 551 | "source": [ 552 | "# Box 12: Parametric multivariate regression adjustment implementation\n", 553 | "f = sm.families.family.Binomial()\n", 554 | "fm_a1 = smf.glm(\"Y ~ C + W1 + W2 + W3 + W4\", \n", 555 | " data.loc[data[\"A\"] == 1], family=f).fit()\n", 556 | "fm_a0 = smf.glm(\"Y ~ C + W1 + W2 + W3 + W4\", \n", 557 | " data.loc[data[\"A\"] == 0], family=f).fit()\n", 558 | "\n", 559 | "y_a1 = fm_a1.predict(data)\n", 560 | "y_a0 = fm_a0.predict(data)\n", 561 | "ate = np.mean(y_a1 - y_a0)\n", 562 | "\n", 563 | "print(\"ATE\", np.round(ate, 6))" 564 | ] 565 | }, 566 
| { 567 | "cell_type": "code", 568 | "execution_count": 14, 569 | "metadata": {}, 570 | "outputs": [ 571 | { 572 | "name": "stdout", 573 | "output_type": "stream", 574 | "text": [ 575 | "ATE 0.083929\n" 576 | ] 577 | } 578 | ], 579 | "source": [ 580 | "# Box 13: Multivariate regression with zEpid\n", 581 | "g_formula = zepid.causal.gformula.TimeFixedGFormula(data, \n", 582 | " exposure=\"A\", \n", 583 | " outcome=\"Y\")\n", 584 | "g_formula.outcome_model(\"A + C + W1 + W2 + W3 + W4 + \"\n", 585 | " \"A:C + A:W1 + A:W2 + A:W3 + A:W4\",\n", 586 | " print_results=False)\n", 587 | "\n", 588 | "g_formula.fit(\"all\") # all sets A=1\n", 589 | "y_a1 = g_formula.marginal_outcome\n", 590 | "\n", 591 | "g_formula.fit(\"none\") # none sets A=0\n", 592 | "y_a0 = g_formula.marginal_outcome\n", 593 | "\n", 594 | "print(\"ATE\", np.round(y_a1 - y_a0, 6))" 595 | ] 596 | }, 597 | { 598 | "cell_type": "code", 599 | "execution_count": 15, 600 | "metadata": {}, 601 | "outputs": [], 602 | "source": [ 603 | "# Box 14: Not Available for Python\n", 604 | "# zEpid does not support two version of the parametric g-formula" 605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": 16, 610 | "metadata": {}, 611 | "outputs": [ 612 | { 613 | "name": "stdout", 614 | "output_type": "stream", 615 | "text": [ 616 | "95% Confidence limits for the ATE\n", 617 | "Percentile method: [0.059649 0.106662]\n", 618 | "Normal approx method: [0.058851 0.109006]\n" 619 | ] 620 | } 621 | ], 622 | "source": [ 623 | "# Box 15: Bootstrap for multivariate adjustment\n", 624 | "ate_rs = []\n", 625 | "for i in range(1000): # Drawing 1000 bootstrapped samples\n", 626 | " d_star = data.sample(n=data.shape[0], # Same size as input data\n", 627 | " replace=True) # Draw with replacement\n", 628 | " fm_a1 = smf.glm(\"Y ~ C + W1 + W2 + W3 + W4\", \n", 629 | " d_star.loc[d_star[\"A\"] == 1], \n", 630 | " family=f).fit()\n", 631 | " fm_a0 = smf.glm(\"Y ~ C + W1 + W2 + W3 + W4\", \n", 632 | " 
d_star.loc[d_star[\"A\"] == 0], \n", 633 | " family=f).fit()\n", 634 | " ate_rs.append(np.mean(fm_a1.predict(d_star) - fm_a0.predict(d_star)))\n", 635 | "\n", 636 | "print(\"95% Confidence limits for the ATE\")\n", 637 | "ci_perc = np.percentile(ate_rs, q=[2.5, 97.5])\n", 638 | "print(\"Percentile method: \", np.round(ci_perc, 6))\n", 639 | "ate_se = np.std(ate_rs, ddof=1)\n", 640 | "print(\"Normal approx method:\", np.round([ate - 1.96*ate_se,\n", 641 | " ate + 1.96*ate_se], 6))" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": 17, 647 | "metadata": {}, 648 | "outputs": [ 649 | { 650 | "name": "stdout", 651 | "output_type": "stream", 652 | "text": [ 653 | "RR 1.2766\n" 654 | ] 655 | } 656 | ], 657 | "source": [ 658 | "# Box 16: Computing the parametric marginal risk ratio\n", 659 | "f = sm.families.family.Binomial()\n", 660 | "fm_a1 = smf.glm(\"Y ~ C + W1 + W2 + W3 + W4\", \n", 661 | " data.loc[data[\"A\"] == 1], family=f).fit()\n", 662 | "fm_a0 = smf.glm(\"Y ~ C + W1 + W2 + W3 + W4\", \n", 663 | " data.loc[data[\"A\"] == 0], family=f).fit()\n", 664 | "\n", 665 | "y_a1 = fm_a1.predict(data)\n", 666 | "y_a0 = fm_a0.predict(data)\n", 667 | "risk_ratio = np.mean(y_a1) / np.mean(y_a0)\n", 668 | "\n", 669 | "print(\"RR\", np.round(risk_ratio, 4))" 670 | ] 671 | }, 672 | { 673 | "cell_type": "markdown", 674 | "metadata": {}, 675 | "source": [ 676 | "## 4. 
Inverse Probability of Treatment Weighting\n", 677 | "\n", 678 | "### 4.1 Inverse probability of treatment weighting based on the propensity score plus regression adjustment" 679 | ] 680 | }, 681 | { 682 | "cell_type": "code", 683 | "execution_count": 18, 684 | "metadata": {}, 685 | "outputs": [ 686 | { 687 | "name": "stdout", 688 | "output_type": "stream", 689 | "text": [ 690 | "ATE 0.083294\n" 691 | ] 692 | } 693 | ], 694 | "source": [ 695 | "# Box 17: Computation of the IPTW estimator for the ATE\n", 696 | "f = sm.families.family.Binomial()\n", 697 | "fm_pa = smf.glm(\"A ~ C + W1 + W2 + W3 + W4\", \n", 698 | " data, family=f).fit()\n", 699 | "p_score = fm_pa.predict(data) # Calculating propensity scores\n", 700 | "data['p_score'] = p_score\n", 701 | "\n", 702 | "iptw = 1 / np.where(data['A'] == 1, p_score, 1 - p_score) # IPTW\n", 703 | "data['iptw'] = iptw\n", 704 | "\n", 705 | "d_a1 = data.loc[data[\"A\"] == 1].copy()\n", 706 | "d_a0 = data.loc[data[\"A\"] == 0].copy()\n", 707 | "ate = (np.average(d_a1['Y'], weights=d_a1['iptw']) - \n", 708 | " np.average(d_a0['Y'], weights=d_a0['iptw']))\n", 709 | "print(\"ATE\", np.round(ate, 6))" 710 | ] 711 | }, 712 | { 713 | "cell_type": "code", 714 | "execution_count": 19, 715 | "metadata": {}, 716 | "outputs": [ 717 | { 718 | "name": "stdout", 719 | "output_type": "stream", 720 | "text": [ 721 | "95% Confidence limits for the ATE\n", 722 | "Percentile method: [0.057431 0.106584]\n", 723 | "Normal approx method: [0.058198 0.10839 ]\n" 724 | ] 725 | } 726 | ], 727 | "source": [ 728 | "# Box 18: Bootstrap computation for the IPTW estimator\n", 729 | "ate_rs = []\n", 730 | "for i in range(1000): # Drawing 1000 bootstrapped samples\n", 731 | " d_star = data.sample(n=data.shape[0], # Same size as input data\n", 732 | " replace=True) # Draw with replacement\n", 733 | " fm_pa = smf.glm(\"A ~ C + W1 + W2 + W3 + W4\", \n", 734 | " d_star, family=f).fit()\n", 735 | " ps_score = fm_pa.predict(d_star) # Calculating propensity 
scores\n", 736 | " d_star['iptw'] = 1 / np.where(d_star['A'] == 1, \n", 737 | " ps_score, 1 - ps_score) \n", 738 | " ds_a1 = d_star.loc[d_star[\"A\"] == 1].copy()\n", 739 | " ds_a0 = d_star.loc[d_star[\"A\"] == 0].copy()\n", 740 | " ate_rs.append(np.average(ds_a1['Y'], weights=ds_a1['iptw']) - \n", 741 | " np.average(ds_a0['Y'], weights=ds_a0['iptw']))\n", 742 | "\n", 743 | "print(\"95% Confidence limits for the ATE\")\n", 744 | "ci_perc = np.percentile(ate_rs, q=[2.5, 97.5])\n", 745 | "print(\"Percentile method: \", np.round(ci_perc, 6))\n", 746 | "ate_se = np.std(ate_rs, ddof=1)\n", 747 | "print(\"Normal approx method:\", np.round([ate - 1.96*ate_se,\n", 748 | " ate + 1.96*ate_se], 6))" 749 | ] 750 | }, 751 | { 752 | "cell_type": "code", 753 | "execution_count": 20, 754 | "metadata": {}, 755 | "outputs": [ 756 | { 757 | "name": "stdout", 758 | "output_type": "stream", 759 | "text": [ 760 | " RD SE(RD) 95%LCL 95%UCL\n", 761 | "labels \n", 762 | "Intercept 0.303444 0.007712 0.288328 0.318559\n", 763 | "A 0.083294 0.013046 0.057723 0.108864\n" 764 | ] 765 | } 766 | ], 767 | "source": [ 768 | "# Box 19: IPTW estimator using zEpid\n", 769 | "ipw = zepid.causal.ipw.IPTW(data, treatment=\"A\", outcome=\"Y\")\n", 770 | "ipw.treatment_model(\"C + W1 + W2 + W3 + W4\", \n", 771 | " stabilized=False, # Set to True for stabilized\n", 772 | " print_results=False)\n", 773 | "ipw.marginal_structural_model(\"A\")\n", 774 | "ipw.fit()\n", 775 | "print(ipw.risk_difference)" 776 | ] 777 | }, 778 | { 779 | "cell_type": "code", 780 | "execution_count": 21, 781 | "metadata": {}, 782 | "outputs": [ 783 | { 784 | "name": "stdout", 785 | "output_type": "stream", 786 | "text": [ 787 | " Confounder Raw Weighted\n", 788 | "2 C 0.093144 0.000325\n", 789 | "3 W1 -0.061352 -0.003754\n", 790 | "4 W2 0.091364 -0.002439\n", 791 | "0 W3 0.035606 0.002426\n", 792 | "1 W4 0.071853 0.000404\n" 793 | ] 794 | }, 795 | { 796 | "data": { 797 | "image/png": "\n", 798 | "text/plain": [ 799 | "
" 800 | ] 801 | }, 802 | "metadata": { 803 | "needs_background": "light" 804 | }, 805 | "output_type": "display_data" 806 | } 807 | ], 808 | "source": [ 809 | "# Box 20: Assessing IPTW balance\n", 810 | "rename_cols = {\"smd_w\": \"Weighted\", \"smd_u\": \"Raw\", \n", 811 | " \"labels\": \"Confounder\"}\n", 812 | "\n", 813 | "smd = ipw.standardized_mean_differences().rename(columns=rename_cols)\n", 814 | "smd = smd.sort_values(by='Confounder')\n", 815 | "print(smd[['Confounder', 'Raw', 'Weighted']])\n", 816 | "\n", 817 | "# zEpid plotting functionality\n", 818 | "ipw.plot_love()\n", 819 | "plt.show()" 820 | ] 821 | }, 822 | { 823 | "cell_type": "code", 824 | "execution_count": 22, 825 | "metadata": {}, 826 | "outputs": [ 827 | { 828 | "data": { 829 | "image/png": "\n", 830 | "text/plain": [ 831 | "
" 832 | ] 833 | }, 834 | "metadata": { 835 | "needs_background": "light" 836 | }, 837 | "output_type": "display_data" 838 | } 839 | ], 840 | "source": [ 841 | "# Box 21: Assessing IPTW overlap by hand\n", 842 | "density_t = gaussian_kde(1 - data.loc[data[\"A\"] == 1, 'p_score'])\n", 843 | "density_u = gaussian_kde(1 - data.loc[data[\"A\"] == 0, 'p_score'])\n", 844 | "\n", 845 | "x = np.linspace(0, 1, 10000)\n", 846 | "\n", 847 | "ax = plt.gca()\n", 848 | "ax.fill_between(x, density_t(x), color=\"b\", alpha=0.2, label=None)\n", 849 | "ax.plot(x, density_t(x), color=\"b\", label='RHC = Y')\n", 850 | "ax.fill_between(x, density_u(x), color=\"r\", alpha=0.2, label=None)\n", 851 | "ax.plot(x, density_u(x), color=\"r\", label='RHC = N')\n", 852 | "ax.set_ylim([0, 10])\n", 853 | "ax.set_ylabel(\"density\")\n", 854 | "ax.set_xlim([0.45, 0.8])\n", 855 | "ax.set_xlabel(\"1 - Propensity Score\")\n", 856 | "ax.legend()\n", 857 | "plt.show()" 858 | ] 859 | }, 860 | { 861 | "cell_type": "code", 862 | "execution_count": 23, 863 | "metadata": {}, 864 | "outputs": [ 865 | { 866 | "data": { 867 | "image/png": "\n", 868 | "text/plain": [ 869 | "
" 870 | ] 871 | }, 872 | "metadata": { 873 | "needs_background": "light" 874 | }, 875 | "output_type": "display_data" 876 | } 877 | ], 878 | "source": [ 879 | "# Box 22: Assessing IPTW overlap using zEpid\n", 880 | "ipw.plot_kde()\n", 881 | "plt.ylim([0, 10])\n", 882 | "plt.xlim([0.2, 0.55])\n", 883 | "plt.show()" 884 | ] 885 | }, 886 | { 887 | "cell_type": "markdown", 888 | "metadata": {}, 889 | "source": [ 890 | "### 4.2 Marginal structural model with stabilised weights" 891 | ] 892 | }, 893 | { 894 | "cell_type": "code", 895 | "execution_count": 24, 896 | "metadata": {}, 897 | "outputs": [ 898 | { 899 | "name": "stderr", 900 | "output_type": "stream", 901 | "text": [ 902 | "/home/pzivich/.pyenv/versions/3.6.5/lib/python3.6/site-packages/statsmodels/genmod/generalized_estimating_equations.py:501: DomainWarning: The identity link function does not respect the domain of the Binomial family.\n", 903 | " DomainWarning)\n", 904 | "/home/pzivich/.pyenv/versions/3.6.5/lib/python3.6/site-packages/statsmodels/genmod/generalized_linear_model.py:278: DomainWarning: The identity link function does not respect the domain of the Binomial family.\n", 905 | " DomainWarning)\n" 906 | ] 907 | }, 908 | { 909 | "name": "stdout", 910 | "output_type": "stream", 911 | "text": [ 912 | "Unstabilized Weights\n", 913 | "ATE 0.083294\n", 914 | "95% CL [0.05772325 0.10886425]\n" 915 | ] 916 | }, 917 | { 918 | "name": "stderr", 919 | "output_type": "stream", 920 | "text": [ 921 | "/home/pzivich/.pyenv/versions/3.6.5/lib/python3.6/site-packages/statsmodels/genmod/generalized_estimating_equations.py:501: DomainWarning: The identity link function does not respect the domain of the Binomial family.\n", 922 | " DomainWarning)\n", 923 | "/home/pzivich/.pyenv/versions/3.6.5/lib/python3.6/site-packages/statsmodels/genmod/generalized_linear_model.py:278: DomainWarning: The identity link function does not respect the domain of the Binomial family.\n", 924 | " DomainWarning)\n" 925 | ] 926 | }, 927 | { 
928 | "name": "stdout", 929 | "output_type": "stream", 930 | "text": [ 931 | "\n", 932 | "Stabilized Weights\n", 933 | "ATE 0.083294\n", 934 | "95% CL [0.05772325 0.10886425]\n" 935 | ] 936 | } 937 | ], 938 | "source": [ 939 | "# Box 23: Computation of the IPTW estimator using a MSM\n", 940 | "\n", 941 | "### Unstabilized IPTW ###\n", 942 | "fm_pa = smf.glm(\"A ~ C + W1 + W2 + W3 + W4\", \n", 943 | " data, family=f).fit()\n", 944 | "p_score = fm_pa.predict(data) # Calculating propensity scores\n", 945 | "iptw = 1 / np.where(data['A'] == 1, p_score, 1 - p_score) # IPTW\n", 946 | "# Estimating Marginal Structural Model\n", 947 | "f = sm.families.family.Binomial(sm.families.links.identity())\n", 948 | "fm = smf.gee(\"Y ~ A\", data.index, data,\n", 949 | " cov_struct=sm.cov_struct.Independence(), \n", 950 | " family=f, weights=iptw).fit()\n", 951 | "print(\"Unstabilized Weights\")\n", 952 | "print(\"ATE \", np.round(fm.params['A'], 6))\n", 953 | "print(\"95% CL\", np.asarray(fm.conf_int().loc[\"A\"]))\n", 954 | "\n", 955 | "### Stabilized IPTW ###\n", 956 | "f = sm.families.family.Binomial()\n", 957 | "# Numerator\n", 958 | "fm_ma = smf.glm(\"A ~ 1\", data, family=f).fit()\n", 959 | "num = np.where(data['A'] == 1, fm_ma.predict(data), \n", 960 | " 1 - fm_ma.predict(data))\n", 961 | "# Denominator\n", 962 | "fm_pa = smf.glm(\"A ~ C + W1 + W2 + W3 + W4\", \n", 963 | " data, family=f).fit()\n", 964 | "den = np.where(data['A'] == 1, fm_pa.predict(data), \n", 965 | " 1 - fm_pa.predict(data))\n", 966 | "# IPTW\n", 967 | "iptw = num / den\n", 968 | "# Estimating Marginal Structural Model\n", 969 | "f = sm.families.family.Binomial(sm.families.links.identity())\n", 970 | "fm = smf.gee(\"Y ~ A\", data.index, data,\n", 971 | " cov_struct=sm.cov_struct.Independence(), \n", 972 | " family=f, weights=iptw).fit()\n", 973 | "print(\"\\nStabilized Weights\")\n", 974 | "print(\"ATE \", np.round(fm.params['A'], 6))\n", 975 | "print(\"95% CL\", np.asarray(fm.conf_int().loc[\"A\"]))" 976 | 
# ### 4.3 IPTW with regression adjustment

# Box 24: Computation of the IPTW-RA estimator
#
# Fits one weighted outcome regression per treatment arm, predicts both
# potential outcomes for every row of `data`, and reports two ATE estimates:
# the plain average of the predicted differences and a weighted (Hajek-style)
# contrast of the predictions.
# Requires the column data['iptw'] created in an earlier cell (Box 17).
f = sm.families.family.Binomial()

treated = data["A"] == 1
untreated = data["A"] == 0

# statsmodels' GLM takes observation weights through `freq_weights` (or
# `var_weights`); it has no `weights` argument, so passing `weights=` does not
# weight the fit.
fm_a1 = smf.glm("Y ~ C + W1 + W2 + W3 + W4",
                data.loc[treated],                          # Only A=1
                freq_weights=data.loc[treated, 'iptw'],     # Box 17
                family=f).fit()
y_a1 = fm_a1.predict(data)

fm_a0 = smf.glm("Y ~ C + W1 + W2 + W3 + W4",
                data.loc[untreated],                        # Only A=0
                freq_weights=data.loc[untreated, 'iptw'],   # Box 17
                family=f).fit()
y_a0 = fm_a0.predict(data)

# Estimate 1: average predicted risk difference over all observations
ate = np.mean(y_a1 - y_a0)
print("ATE", np.round(ate, 6))
# Estimate 2: weighted means of the predictions within each observed arm
ate = (np.mean(data['iptw']*data['A']*y_a1) / np.mean(data['iptw']*data['A']) -
       np.mean(data['iptw']*(1-data['A'])*y_a0) / np.mean(data['iptw']*(1-data['A'])))
print("ATE", np.round(ate, 6))
# NOTE(review): the outputs recorded in the notebook (0.083929 / 0.083426) were
# produced with the `weights=` keyword; expect them to shift once the weights
# are actually applied.

# Box 25: IPTW-RA
# Not supported by zEpid

# ## 5.
# Augmented Inverse Probability Weighting

# Box 26: Computation of the AIPTW estimator
f = sm.families.family.Binomial()


def _aiptw_ate(df, family):
    """Return the AIPTW estimate of the average treatment effect for `df`.

    The same three steps are used for the point estimate and for every
    bootstrap replicate, so they live in one place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns Y, A, C, W1, W2, W3 and W4.
    family : statsmodels family
        Family used for both the outcome and the propensity models.

    Returns
    -------
    float
        Mean difference of the augmented pseudo-outcomes.
    """
    # Step 1: g-computation (arm-specific outcome models, predicted for all rows)
    fm_a1 = smf.glm("Y ~ C + W1 + W2 + W3 + W4",
                    df.loc[df["A"] == 1], family=family).fit()
    fm_a0 = smf.glm("Y ~ C + W1 + W2 + W3 + W4",
                    df.loc[df["A"] == 0], family=family).fit()
    y_a1 = fm_a1.predict(df)
    y_a0 = fm_a0.predict(df)

    # Step 2: propensity scores
    fm_pa = smf.glm("A ~ C + W1 + W2 + W3 + W4",
                    df, family=family).fit()
    p_score = fm_pa.predict(df)

    # Step 3: analytic formula (IPW term plus augmentation term, per arm)
    ys_a1 = ((df['A'] * df['Y']) / p_score +
             (y_a1 * (p_score - df['A'])) / p_score)
    ys_a0 = (((1 - df['A']) * df['Y']) / (1 - p_score) +
             (y_a0 * (df['A'] - p_score)) / (1 - p_score))
    return np.mean(ys_a1 - ys_a0)


# Steps 1-3 on the observed data
ate = _aiptw_ate(data, f)
print("ATE", np.round(ate, 6))

# Step 4: bootstrap for inference
ate_rs = []
for i in range(1000):
    d_star = data.sample(n=data.shape[0],  # Same size as input data
                         replace=True)     # Draw with replacement
    ate_rs.append(_aiptw_ate(d_star, f))


print("95% Confidence limits for the ATE")
ci_perc = np.percentile(ate_rs, q=[2.5, 97.5])
print("Percentile method: ", np.round(ci_perc, 6))
ate_se = np.std(ate_rs, ddof=1)
print("Normal approx method:", np.round([ate - 1.96*ate_se,
                                         ate + 1.96*ate_se], 6))

# Box 27: AIPTW estimator with zEpid
aipw = zepid.causal.doublyrobust.AIPTW(data,
                                       exposure="A",
                                       outcome="Y")
aipw.exposure_model("C + W1 + W2 + W3 + W4",
                    print_results=False)
# Full A-by-covariate interactions in the outcome model
aipw.outcome_model("A + C + W1 + W2 + W3 + W4 + "
                   "A:C + A:W1 + A:W2 + A:W3 + A:W4",
                   print_results=False)
aipw.fit()

print("ATE ", np.round(aipw.risk_difference, 6))
print("95% CL", np.round(aipw.risk_difference_ci, 6))
# zEpid calculates the variance using influence curves

# ## 6.
# Data-Adaptive Estimation: Ensemble Learning Targeted Maximum Likelihood Estimation

# Box 28: Computation of TMLE by hand
# Relies on `probability_to_odds` and `logistic` being imported by an earlier cell.
f = sm.families.family.Binomial()
n = data.shape[0]

# Step 1: g-computation
fm_a1 = smf.glm("Y ~ C + W1 + W2 + W3 + W4",
                data.loc[data["A"] == 1], family=f).fit()
fm_a0 = smf.glm("Y ~ C + W1 + W2 + W3 + W4",
                data.loc[data["A"] == 0], family=f).fit()
y_a1 = fm_a1.predict(data)
y_a0 = fm_a0.predict(data)
# Initial prediction under the treatment actually received
y_a_ = np.where(data['A'] == 1, y_a1, y_a0)

# Step 2: propensity scores
fm_pa = smf.glm("A ~ C + W1 + W2 + W3 + W4",
                data, family=f).fit()
p_score = fm_pa.predict(data)

# Step 3: targeting step
logodds_y1 = np.log(probability_to_odds(y_a1))
logodds_y0 = np.log(probability_to_odds(y_a0))
logodds_ya = np.log(probability_to_odds(y_a_))

clever_cov_a1 = data['A']/p_score
clever_cov_a0 = - (1-data['A'])/(1-p_score)

# Intercept-free logistic regression of Y on the two clever covariates, with
# the initial log-odds as an offset.
submodel = sm.GLM(data['Y'],
                  np.column_stack((clever_cov_a1, clever_cov_a0)),
                  offset=logodds_ya,
                  family=f).fit()
# Plain array so that [0] / [1] are unambiguous positions; integer lookup on a
# label-indexed pandas Series is deprecated.
epsilon = np.asarray(submodel.params)

# Step 4: calculating ATE
ys_a1 = logistic.cdf(logodds_y1 + epsilon[0] / p_score)
ys_a0 = logistic.cdf(logodds_y0 - epsilon[1] / (1-p_score))
ate = np.mean(ys_a1 - ys_a0)
print("ATE", np.round(ate, 6))

# Step 5: inference via influence curve
# The influence curve is evaluated at the targeted (updated) predictions, i.e.
# the same quantities that define `ate`, not at the initial g-computation fit.
ys_a_ = np.where(data['A'] == 1, ys_a1, ys_a0)
ic = (clever_cov_a1 + clever_cov_a0) * (data['Y'] - ys_a_) + (ys_a1 - ys_a0) - ate
sd = np.sqrt(np.nanvar(ic, ddof=1) / n)
cl = [ate - 1.96*sd, ate + 1.96*sd]
print("95% CL", np.round(cl, 6))

# Box 29: TMLE with zEpid
tmle = zepid.causal.doublyrobust.TMLE(data,
                                      exposure="A",
                                      outcome="Y")
tmle.exposure_model("C + W1 + W2 + W3 + W4",
                    print_results=False)
tmle.outcome_model("A + C + W1 + W2 + W3 + W4 + "
                   "A:C + A:W1 + A:W2 + A:W3 + A:W4",
                   print_results=False)
tmle.fit()

print("ATE ", np.round(tmle.risk_difference, 6))
print("95% CL", np.round(tmle.risk_difference_ci, 6))

# ## 7.
# Simulation

def data_generator(n, true_ate=False):
    """Function to generate data consisting of `n` observations

    Draws four confounders (W1-W4), a binary treatment A whose probability
    depends on them, both binary potential outcomes, and the observed outcome
    Y (the potential outcome matching the assigned treatment).

    Parameters
    ----------
    n : int
        Number of observations to simulate.
    true_ate : bool, default False
        When True, return the simulated causal contrasts instead of the data.

    Returns
    -------
    pandas.DataFrame
        Columns W1, W2, W3, W4, A, Y (when `true_ate` is False).
    tuple of float
        ``(mean(y1 - y0), mean(y1) / mean(y0))`` (when `true_ate` is True).
    """
    d = pd.DataFrame()
    # Confounders
    d['W1'] = np.round(np.random.uniform(low=1, high=5, size=n))
    d['W2'] = np.random.binomial(n=1, p=0.45, size=n)
    d['W3'] = np.round(np.random.uniform(low=0, high=1, size=n)
                       + 0.8*d['W1'] + 0.75*d['W2'])
    # Values above 4 are recoded to 1
    d['W3'] = np.where(d['W3'] > 4, 1, d['W3'])
    d['W4'] = np.round(np.random.uniform(low=0, high=1, size=n)
                       + 0.2*d['W1'] + 0.75*d['W2'])
    # Treatment
    pr_a = logistic.cdf(-1 - 0.15*d['W4'] + 1.5*d['W2'] +
                        0.75*d['W3'] + 0.25*d['W1'] +
                        0.8*d['W2']*d['W4'])
    d['A'] = np.random.binomial(n=1,
                                p=pr_a,
                                size=n)
    # Potential outcomes (the two linear predictors differ only by the +1 / +0 term)
    pr_y1 = logistic.cdf(-3 + 1 + 0.25*d['W4'] + 0.75*d['W3'] +
                         0.8*d['W2']*d['W4'] + 0.05*d['W1'])
    y1 = np.random.binomial(n=1, p=pr_y1, size=n)
    pr_y0 = logistic.cdf(-3 + 0 + 0.25*d['W4'] + 0.75*d['W3'] +
                         0.8*d['W2']*d['W4'] + 0.05*d['W1'])
    y0 = np.random.binomial(n=1, p=pr_y0, size=n)
    # Causal consistency
    d['Y'] = np.where(d['A'] == 1, y1, y0)
    # Return generated data
    if true_ate:
        return np.mean(y1 - y0), np.mean(y1) / np.mean(y0)
    else:
        return d


def _dr_risk_difference(estimator, df, custom_model=None):
    """Fit a zEpid doubly-robust estimator on `df` and return its risk difference.

    `estimator` is the estimator class (AIPTW or TMLE). `custom_model` is
    passed to both nuisance models; None keeps zEpid's default model.
    """
    est = estimator(df, exposure="A", outcome="Y")
    est.exposure_model("W1 + W2 + W3 + W4", bound=0.01,
                       custom_model=custom_model,
                       print_results=False)
    est.outcome_model("A + W1 + W2 + W3 + W4",
                      custom_model=custom_model,
                      print_results=False)
    est.fit()
    return est.risk_difference


# Generating true value from super-population
true_ate, true_rr = data_generator(n=1000000, true_ate=True)

# Setting up SuperLearner libraries
from zepid.superlearner import SuperLearner, StepwiseSL
from sklearn.linear_model import LogisticRegression
# Aliased so that pygam's term constructors do not overwrite the notebook-wide
# name `f`, which the other cells use for the statsmodels family.
from pygam import LogisticGAM, f as gam_factor, s as gam_spline

import warnings
warnings.simplefilter('ignore', RuntimeWarning)  # Hides some NumPy errors for sparse models

family = sm.families.family.Binomial()

sl_library_main = [LogisticRegression(penalty='none', solver='lbfgs'),
                   StepwiseSL(family, selection="backward"),
                   StepwiseSL(family, selection="forward", order_interaction=1)]
sl_main_labs = ["LogR", "Step.zero", "Step.one"]
sl_main = SuperLearner(sl_library_main, sl_main_labs, folds=5,
                       loss_function='nloglik')

sl_library_alt = [LogisticRegression(penalty='none', solver='lbfgs'),
                  StepwiseSL(family, selection="backward"),
                  StepwiseSL(family, selection="forward", order_interaction=1),
                  LogisticGAM(gam_factor(0) + gam_spline(1) + gam_factor(2)
                              + gam_spline(3) + gam_spline(4),
                              lam=0.6)]
sl_alt_labs = ["LogR", "Step.zero", "Step.one", "GAM"]
sl_alt = SuperLearner(sl_library_alt, sl_alt_labs, folds=5,
                      loss_function='nloglik')

# Simulation
naive_rd, gform_rd, iptw_rd, aipw_rd, tmle_rd = [], [], [], [], []
aipw_slm_rd, aipw_sla_rd, tmle_slm_rd, tmle_sla_rd = [], [], [], []

AIPTW = zepid.causal.doublyrobust.AIPTW
TMLE = zepid.causal.doublyrobust.TMLE

for i in range(1000):
    # Kept in its own name so the notebook's `data` (used by the earlier boxes)
    # is not replaced by a simulated sample.
    sim_data = data_generator(n=1000)

    # Naive
    fm = smf.ols("Y ~ A + W1 + W2 + W3 + W4", sim_data).fit()
    naive_rd.append(fm.params['A'])

    # G-formula
    g_formula = zepid.causal.gformula.TimeFixedGFormula(sim_data,
                                                        exposure="A",
                                                        outcome="Y")
    g_formula.outcome_model("A + W1 + W2 + W3 + W4",
                            print_results=False)
    g_formula.fit("all")  # all sets A=1
    risk_a1 = g_formula.marginal_outcome
    g_formula.fit("none")  # none sets A=0
    risk_a0 = g_formula.marginal_outcome
    gform_rd.append(risk_a1 - risk_a0)

    # IPTW
    ipw = zepid.causal.ipw.IPTW(sim_data, treatment="A", outcome="Y")
    ipw.treatment_model("W1 + W2 + W3 + W4", bound=0.01,
                        stabilized=True, print_results=False)
    ipw.marginal_structural_model("A")
    ipw.fit()
    iptw_rd.append(ipw.risk_difference)

    # The estimators below are fitted in the same order as before.
    # AIPW
    aipw_rd.append(_dr_risk_difference(AIPTW, sim_data))
    # AIPW -- Super Learner main
    aipw_slm_rd.append(_dr_risk_difference(AIPTW, sim_data, custom_model=sl_main))
    # AIPW -- Super Learner alternative
    aipw_sla_rd.append(_dr_risk_difference(AIPTW, sim_data, custom_model=sl_alt))

    # TMLE
    tmle_rd.append(_dr_risk_difference(TMLE, sim_data))
    # TMLE -- Super Learner main
    tmle_slm_rd.append(_dr_risk_difference(TMLE, sim_data, custom_model=sl_main))
    # TMLE -- Super Learner alternative
    tmle_sla_rd.append(_dr_risk_difference(TMLE, sim_data, custom_model=sl_alt))
Bias: 0.002\n", 1439 | "Rel. Bias: 0.8\n", 1440 | "=============================\n", 1441 | "=============================\n", 1442 | "IPTW\n", 1443 | "-----------------------------\n", 1444 | "Abs. Bias: 0.106\n", 1445 | "Rel. Bias: 58.5\n", 1446 | "=============================\n", 1447 | "=============================\n", 1448 | "AIPW\n", 1449 | "-----------------------------\n", 1450 | "Abs. Bias: 0.003\n", 1451 | "Rel. Bias: 1.9\n", 1452 | "=============================\n", 1453 | "=============================\n", 1454 | "AIPW-SL1\n", 1455 | "-----------------------------\n", 1456 | "Abs. Bias: 0.003\n", 1457 | "Rel. Bias: 1.5\n", 1458 | "=============================\n", 1459 | "=============================\n", 1460 | "AIPW-SL2\n", 1461 | "-----------------------------\n", 1462 | "Abs. Bias: 0.003\n", 1463 | "Rel. Bias: 1.7\n", 1464 | "=============================\n", 1465 | "=============================\n", 1466 | "TMLE\n", 1467 | "-----------------------------\n", 1468 | "Abs. Bias: 0.009\n", 1469 | "Rel. Bias: 4.8\n", 1470 | "=============================\n", 1471 | "=============================\n", 1472 | "TMLE-SL1\n", 1473 | "-----------------------------\n", 1474 | "Abs. Bias: 0.006\n", 1475 | "Rel. Bias: 3.5\n", 1476 | "=============================\n", 1477 | "=============================\n", 1478 | "TMLE-SL2\n", 1479 | "-----------------------------\n", 1480 | "Abs. Bias: 0.008\n", 1481 | "Rel. 
# Results
# Prints, for every estimator, the mean bias of the simulated risk differences
# relative to `true_ate`, and the relative bias in percent.
result = [naive_rd, gform_rd, iptw_rd, aipw_rd, aipw_slm_rd,
          aipw_sla_rd, tmle_rd, tmle_slm_rd, tmle_sla_rd]
labels = ["Naive", "G-formula", "IPTW", "AIPW", "AIPW-SL1",
          "AIPW-SL2", "TMLE", "TMLE-SL1", "TMLE-SL2"]
for x, y in zip(result, labels):
    # The estimates are collected in plain Python lists; convert explicitly so
    # the subtraction does not depend on `true_ate` being a NumPy scalar.
    estimates = np.asarray(x, dtype=float)
    print("=============================")
    print(y)
    print("-----------------------------")
    # Mean of the signed differences (no absolute value is taken here) ...
    print("Abs. Bias:", np.round(np.mean(estimates - true_ate), 3))
    # ... whereas the relative bias is reported as a magnitude, in percent.
    rel_bias = np.abs(np.mean((estimates - true_ate) / true_ate)*100)
    print("Rel. Bias:", np.round(rel_bias, 1))
    print("=============================")

# END
#
# ---- /RCodeBoxes.R ----
###################################################################################################
# Tutorial: causal inference methods made easy for applied researchers/epidemiologists/statisticians
# ICON-LSHTM, LONDON, 16th October 2020
# Miguel Angel Luque Fernandez, PhD
#
Assistant Professor of Epidemiology and Biostatistics 9 | # Matthew Smith, PhD 10 | # Research Fellow 11 | 12 | # Inequalities in Cancer Outcomes Network, LSHTM, London, UK 13 | 14 | # Copyright (c) 2020 Permission is hereby granted, free of charge, to any person obtaining a copy 15 | # of this software and associated documentation files (the "Software"), to deal in the Software 16 | # without restriction, including without limitation the rights to use, copy, modify, merge, 17 | # publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to 18 | # whom the Software is furnished to do so, subject to the following conditions: The above 19 | # copyright notice and this permission notice shall be included in all copies or substantial 20 | # portions of the Software. 21 | 22 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING 23 | # BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON 24 | # INFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES 25 | # OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 26 | # IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
# Bug reports: miguel-angel.luque at lshtm.ac.uk

# The rhc dataset can be downloaded at http://biostat.mc.vanderbilt.edu/wiki/Main/DataSets

###################################################################################################


# Preliminaries
rm(list=ls())

### Box 1: Setting the data
setwd("your path")
#setwd("~/Dropbox/ESTIMATORSCIproject/R_Stata_master_files/Data")
library(haven)
#data <- read_dta("~/Dropbox/ESTIMATORSCIproject/R_Stata_master_files/Data/rhc.dta")
data <- read_dta("rhc.dta")

# Define the outcome (Y), exposure (A), confounder (C), and confounders (W).
# Each variable is stored twice: as a numeric column of `data` and as a
# free-standing vector of the same name in the workspace.
data$Y  <- as.numeric(data$death_d30)
Y  <- data$Y
data$A  <- as.numeric(data$rhc)
A  <- data$A
data$C  <- as.numeric(data$gender)
C  <- data$C
data$w1 <- as.numeric(data$age)
w1 <- data$w1
data$w2 <- as.numeric(data$edu)
w2 <- data$w2
data$w3 <- as.numeric(data$race)
w3 <- data$w3
data$w4 <- as.numeric(data$carcinoma)
w4 <- data$w4

# Compact analysis data set holding only the seven derived variables
data2 <- data.frame(Y = Y, A = A, C = C, w1 = w1, w2 = w2, w3 = w3, w4 = w4)


### Box 2: Naive estimate of the ATE
naive <- lm(Y ~ A + C, data=data)
naive # Naive estimate of the ATE is 0.07352


# 3.
# G-Formula

## 3.1 Non-parametric G-formula

### Box 3: Non-parametric G-formula for the ATE
# Standardises the stratum-specific risk differences (strata of C) to the
# marginal distribution of C.
mean(data$A[data$C==1], na.rm=TRUE) # Proportion treated among C == 1
mean(data$A[data$C==0], na.rm=TRUE) # Proportion treated among C == 0
mean(data$Y[data$A==1], na.rm=TRUE) - mean(data$Y[data$A==0],na.rm=TRUE) # Unadjusted Estimate
reg <- lm(Y ~ A, data=data); reg # Unadjusted Estimate Regression
pr.l <- prop.table(table(data$C)); pr.l # Marginal probability of C
tab.out <- aggregate(Y ~ A + C, data, mean); tab.out # Table of mean Y by A and C
# pr.l[2] / pr.l[1] are used as Pr(C = 1) / Pr(C = 0); this assumes C is coded 0/1 -- TODO confirm
ATE <- ((mean(data$Y[data$A==1 & data$C==1]) - mean(data$Y[data$A==0 & data$C==1]))*pr.l[2]) +
  (mean(data$Y[data$A==1 & data$C==0]) - mean(data$Y[data$A==0 & data$C==0]))*pr.l[1] # G-formula Non-parametric ATE
ATE; rm(ATE) # The ATE from the non-parametric estimator is 0.073692

### Box 4: Non-parametric G-formula for the ATT
# Same stratum-specific contrasts, weighted by the distribution of C among the treated.
ATTm <- mean(data$C[data$A==1], na.rm=TRUE) # Proportion of those who are male amongst treated
ATTf <- 1-mean(data$C[data$A==1], na.rm=TRUE) # Proportion of those who are female amongst treated
ATT <-((mean(data$Y[data$A==1 & data$C==1]) - mean(data$Y[data$A==0 & data$C==1]))*ATTm) +
  (mean(data$Y[data$A==1 & data$C==0]) - mean(data$Y[data$A==0 & data$C==0]))*ATTf # G-formula Non-parametric ATT
ATT # The ATT from the non-parametric estimator is 0.073248
rm(ATT)

### Box 5: Bootstrap the 95% confidence intervals (CI) for the ATE/ATT estimated using the non-parametric G-Formula
# ATE
library(boot)
# Statistic for boot(): recomputes the Box 3 estimator on the rows `indices` of `data`.
g.comp = function(data,indices) # Define the function to estimate the ATE
{
  dat=data[indices,]
  pr.l <- prop.table(table(dat$C))

  ATE = ((mean(dat$Y[dat$A==1 & dat$C==1]) - mean(dat$Y[dat$A==0 & dat$C==1]))*pr.l[2]) +
    (mean(dat$Y[dat$A==1 & dat$C==0]) - mean(dat$Y[dat$A==0 & dat$C==0]))*pr.l[1] ; ATE
}
g.comp(data,indices=1:nrow(data)) # Can get original estimate, by plugging in indices 1:n
boot.out=boot(data,g.comp,200) # Draw 200 bootstrap sample estimates
boot.ci(boot.out,type="perc",conf=0.95) # compute confidence intervals using percentile method
boot.ci(boot.out,type="norm",conf=0.95) # confidence interval using the normal approximation

# ATT
# Redefines g.comp: from here on it returns the Box 4 (ATT) estimator.
g.comp = function(data,indices) # Define the function to estimate the ATT
{
  dat=data[indices,]

  ATTm <- mean(dat$C[dat$A==1], na.rm=TRUE) # Proportion of those who are male among treated
  ATTf <- 1-mean(dat$C[dat$A==1], na.rm=TRUE)

  ((mean(dat$Y[dat$A==1 & dat$C==1]) - mean(dat$Y[dat$A==0 & dat$C==1]))*ATTm) +
    (mean(dat$Y[dat$A==1 & dat$C==0]) - mean(dat$Y[dat$A==0 & dat$C==0]))*ATTf
}
g.comp(data,indices=1:nrow(data)) # Can get original estimate, by plugging in indices 1:n
boot.out=boot(data,g.comp,200) # Draw 200 bootstrap sample estimates
boot.ci(boot.out,type="perc",conf=0.95) # compute confidence intervals using percentile method
boot.ci(boot.out,type="norm",conf=0.95) # confidence interval using the normal approximation


### Box 6: Non-parametric G-Formula using a fully saturated regression model in Stata (A)
# Method 1: conditional probabilities
# Indicator columns for each level of A and C
data$A1 <- ifelse(data$A == 1, 1, 0)
data$A0 <- ifelse(data$A == 0, 1, 0)
data$C1 <- ifelse(data$C == 1, 1, 0)
data$C0 <- ifelse(data$C == 0, 1, 0)
# No `family` is given, so glm() fits its default (gaussian) model; no intercept (-1)
reg <- glm(Y ~ -1 + (A1 + A0) + A1:(C1) + A0:(C1), data=data); summary(reg)
# Coefficients are picked by position (1: A1, 2: A0, 3: A1:C1, 4: A0:C1) and
# averaged over the workspace vector C created in Box 1.
ATE <- mean((reg$coefficients[1] + reg$coefficients[3]*C) - (reg$coefficients[2] + reg$coefficients[4]*C)); ATE
rm(ATE)

### Box 7: Non-parametric G-Formula using a fully saturated regression model in Stata (B)
# Method 2: Marginal probabilities
install.packages("margins")
library(margins)
reg <- glm(Y ~ -1 + (A1 + A0) + A1:(C1) + A0:(C1), data=data); summary(reg)
Y1 <- margins(reg, variables="A1"); Y1
Y0 <- margins(reg, variables="A0"); Y0
# NOTE(review): the two subsets below are selected with different conditions
# (A==1 vs A==0), so they will generally differ in length and the subtraction
# recycles the shorter vector. Confirm the intended contrast before relying on it.
ATE <- Y1$fitted[A==1]-Y0$fitted[A==0]; mean(ATE)
rm(ATE)

## 3.2 Parametric G-formula
### Box 8:
# Parametric G-formula by hand
# Arm-specific logistic models for Y given C; both are then predicted for
# every observation and the predictions are averaged.
mod1 <- glm(Y ~ C, family="binomial", data=data[data$A==1,]) # Expected probability amongst those with RHC
mod0 <- glm(Y ~ C, family="binomial", data=data[data$A==0,]) # Expected probability amongst those without RHC
GcompRA <- cbind(Y1 = predict(mod1, newdata=data.frame(A = 1, C), type="response"),
                 Y0 = predict(mod0, newdata=data.frame(A = 0, C), type="response"))
GcompRA <- as.data.frame(GcompRA)
Y.1 <- GcompRA$Y1
Y.0 <- GcompRA$Y0
ATE <- mean((Y.1) - (Y.0), na.rm=TRUE); ATE # Difference between expected probabilities (ATE)
rm(ATE)


### Box 9: Parametric regression adjustment (one confounder) using stdReg R-package
# Install only when the package is missing, instead of on every run
if (!requireNamespace("stdReg", quietly = TRUE)) install.packages("stdReg")
library(stdReg)
reg <- glm(Y ~ A + C, data = data, family = poisson(link="log")); summary(reg)
reg.std <- stdGlm(fit=reg, data = data, X = "A", x=seq(0,1))
print(summary(reg.std, contrast = "difference", reference=0))
plot(reg.std)

### Box 10: Bootstrap for the parametric regression adjustment (one confounder)
library(boot) # Load the Bootstrap package
# attach() is kept because code further down the script may refer to columns
# of `data` by bare name; the bootstrap function itself no longer relies on it.
attach(data)
# Statistic for boot(): refits both arm-specific models on the resampled rows
# and standardises over the covariates of that same resample.
g.comp=function(data,indices) # Define the function to estimate the ATE
{
  dat <- data[indices,]
  glm1 <- glm(Y ~ C, family="binomial", data=dat[dat$A==1,])
  glm2 <- glm(Y ~ C, family="binomial", data=dat[dat$A==0,])
  # Predict for the resampled rows (not the workspace-level C), so every
  # replicate reflects the resample's own covariate distribution.
  Y.1 <- predict(glm1, newdata=dat, type="response")
  Y.0 <- predict(glm2, newdata=dat, type="response")
  mean(Y.1 - Y.0)
}
g.comp(data,indices=1:nrow(data)) # Can get original estimate, by plugging in indices 1:n
boot.out=boot(data,g.comp,200) # Draw 200 bootstrap sample estimates of RD
boot.ci(boot.out,type="norm",conf=0.95) # Bootstrapped 95% CI based on normal approximation
boot.ci(boot.out,type="perc",conf=0.95) # Bootstrapped 95% CI based on percentiles of the bootstrap replicates

# Now with more than one confounder

### Box 11: Parametric multivariate regression adjustment implementation of the G-Formula
mod1 <- glm(Y ~ C + w1 + w2 + w3 + w4, family="binomial", data=data[data$A==1,]) # Expected probability amongst those with RHC
mod0 <- glm(Y ~ C + w1 + w2 + w3 + w4, family="binomial", data=data[data$A==0,]) # Expected probability amongst those without RHC
GcompRA <- cbind(Y1 = predict(mod1, newdata=data.frame(A = 1, C, w1, w2, w3, w4), type="response"),
                 Y0 = predict(mod0, newdata=data.frame(A = 0, C, w1, w2, w3, w4), type="response"))
GcompRA <- as.data.frame(GcompRA)
Y.1 <- GcompRA$Y1
Y.0 <- GcompRA$Y0
ATE <- mean((Y.1) - (Y.0), na.rm=TRUE); ATE # ATE
rm(ATE)


### Box 12: Parametric multivariate regression adjustment using "stdReg" R-package
if (!requireNamespace("stdReg", quietly = TRUE)) install.packages("stdReg")
library(stdReg)
reg <- glm(Y ~ A + C + w1 + w2 + w3 + w4, data = data, family = poisson(link="log")); summary(reg)
reg.std <- stdGlm(fit=reg, data=data, X="A", x=seq(0,1))
print(summary(reg.std, contrast="difference", reference=0))
plot(reg.std)


### Box 13: Parametric multivariate regression adjustment using "margins" R-package
# Uses the indicator columns A1, A0, C1, C0 created in Box 6 and the margins
# package loaded in Box 7.
reg1 <- glm(Y ~ -1 + (A1 + A0) + A1:(C1 + w1 + w2 + w3 + w4) + A0:(C0 + w1 + w2 + w3 + w4) , data=data); summary(reg1)
poY1m <- margins(reg1, variables="A1"); poY1m
poY0m <- margins(reg1, variables="A0"); poY0m
# NOTE(review): the two subsets are selected with different conditions (A==1 vs
# A==0), so they will generally differ in length and the subtraction recycles
# the shorter vector. Confirm the intended contrast.
ATE2 <- poY1m$fitted[A==1] - poY0m$fitted[A==0]; mean(ATE2)

### Box 14 Bootstrap for the multivariate parametric regression adjustment
library(boot) # Load the Bootstrap package
attach(data) # kept for bare-name column references elsewhere in the script
# Statistic for boot(): multivariable version of the Box 10 function.
g.comp=function(data,indices) # Define the function to estimate the ATE
{
  dat <- data[indices,]
  glm1 <- glm(Y ~ C + w1 + w2 + w3 + w4, family="binomial", data=dat[dat$A==1,])
  glm2 <- glm(Y ~ C + w1 + w2 + w3 + w4, family="binomial", data=dat[dat$A==0,])
  # Standardise over the resampled rows rather than the workspace-level vectors
  Y.1 <- predict(glm1, newdata=dat, type="response")
  Y.0 <- predict(glm2, newdata=dat, type="response")
  mean(Y.1 - Y.0)
}
g.comp(data,indices=1:nrow(data)) # Can get original estimate, by plugging in indices 1:n
boot.out=boot(data,g.comp,200) # Draw 200 bootstrap sample estimates of RD
boot.ci(boot.out,type="norm",conf=0.95) # Bootstrapped 95% CI based on normal approximation
boot.ci(boot.out,type="perc",conf=0.95) # Bootstrapped 95% CI based on percentiles of the bootstrap replicates


### Box 15 Computing the parametric marginal risk ratio after regression adjustment
reg <- glm(Y ~ A + C + w1 + w2 + w3 + w4, data=data2, family = binomial(link="logit")); summary(reg)
reg.std <- stdGlm(fit=reg, data=data2, X="A", x=seq(0,1))
print(summary(reg.std, contrast="ratio", reference=0)) # 27% (95% CI 1.18-1.37) increase in relative risk
plot(reg.std)

# 4.
Inverse Probability of Treatment Weighting

## 4.1 Inverse probability of treatment weighting based on the propensity score plus regression adjustment

# Box 16 (IPTW by hand)
p.s <- glm(A ~ as.factor(C) + w1 + w2 + w3 + w4, data=data, family=binomial) # Propensity score model for the exposure
# Probability of the exposure level each subject actually received: P(A = a_i | covariates)
p.score <- ifelse(data$A == 0, 1 - predict(p.s, type = "response"), predict(p.s, type = "response"))
#table(p.score) # Table of Propensity Scores
data$w <- 1/p.score # Generate IP Weights
data2$w <- 1/p.score
#table(data$w); summary(data$w); sd(data$w)

ATE <- mean(data$w*as.numeric(data$A==1)*data$Y) - mean(data$w*as.numeric(data$A==0)*data$Y);ATE # Estimate ATE
rm(ATE)


# Box 17 Bootstrap computation for the IPTW estimator
library(boot)
# Statistic for boot(): the IPTW ATE on one bootstrap resample.
#   data    : data frame holding Y, A and the precomputed weight column w
#   indices : row indices of the resample, supplied by boot()
# The weights in `w` are reused as-is; the propensity model is not refitted per resample.
iptw.w <- function(data, indices)
{
  dat <- data[indices, ]
  mean(dat$w*as.numeric(dat$A==1)*dat$Y) - mean(dat$w*as.numeric(dat$A==0)*dat$Y)
}
iptw.w(data,indices=1:nrow(data)) # Can get original estimate, by plugging in indices 1:n
boot.out=boot(data,iptw.w,100) # Draw 100 bootstrap sample estimates
boot.ci(boot.out,type="perc",conf=0.95) # compute confidence intervals using percentile method
boot.ci(boot.out,type="norm",conf=0.95)


### Box 18: Computation of the IPTW estimator for the ATE using IPW R-package
# Package names must be passed as one character vector: the second positional
# argument of install.packages() is the library directory, not another package.
install.packages(c("ipw", "survey"))
library(ipw)
library(survey)

# Univariable
# NOTE(review): a numerator of ~ 1 is supplied although the plot below is titled "Unstabilized weights" -- confirm the intended weight type.
ipw.ATE <- ipwpoint(exposure = A, family = "binomial", link = "logit",
                    numerator = ~ 1,
                    denominator = ~ C,
                    data = data2)
summary(ipw.ATE$ipw.weights)
ipwplot(weights = ipw.ATE$ipw.weights, logscale = FALSE, main = "Unstabilized weights", xlim = c(0.5, 2))
summary(ipw.ATE$num.mod)
summary(ipw.ATE$den.mod)
data2$usw <- ipw.ATE$ipw.weights
# Weighted outcome model (Y on A) using the univariable weights stored in data2$usw
msm <- svyglm(Y ~ A, design = svydesign(ids = ~ 1, weights = ~ usw, data = data2))
coef(msm)
confint(msm)

# Multivariable
ipw.ATE <- ipwpoint(
  exposure    = A,
  family      = "binomial",
  link        = "logit",
  numerator   = ~ 1,
  denominator = ~ C + w1 + w2 + w3 + w4,
  data        = data2
)
summary(ipw.ATE$ipw.weights)
ipwplot(weights = ipw.ATE$ipw.weights, logscale = FALSE, main = "Unstabilized weights", xlim = c(0.5, 2))
summary(ipw.ATE$num.mod)
summary(ipw.ATE$den.mod)
data2$usw <- ipw.ATE$ipw.weights
msm <- svyglm(Y ~ A, design = svydesign(ids = ~ 1, weights = ~ usw, data = data2))
coef(msm)
confint(msm)


### Box 19: Assessing IPTW balance
install.packages("twang")
library(twang)
# Boosted propensity score model; balance is evaluated under both stopping rules
ps.balance <- ps(
  A ~ C + w1 + w2 + w3 + w4,
  data              = data2,
  n.trees           = 1000,
  interaction.depth = 2,
  shrinkage         = 0.01,
  perm.test.iters   = 0,
  stop.method       = c("es.mean", "ks.max"),
  estimand          = "ATE",
  verbose           = FALSE
)
plot(ps.balance)
summary(ps.balance$gbm.obj, n.trees = ps.balance$desc$ks.max.ATE$n.trees, plot = FALSE)
data2.balance <- bal.table(ps.balance)
data2.balance


### Box 20: Assessing IPTW overlap by hand
install.packages("xtable")
library(xtable)
# Weighted group means and KS statistic, plus the unweighted comparison-group mean
pretty.tab <- data2.balance$ks.max.ATE[, c("tx.mn", "ct.mn", "ks")]
pretty.tab <- cbind(pretty.tab, data2.balance$unw[, "ct.mn"])
names(pretty.tab) <- c("E(Y1|t=1)","E(Y0|t=1)","KS","E(Y0|t=0)")
xtable(
  pretty.tab,
  caption = "Balance of the treatment and comparison groups",
  label   = "tab:balance",
  digits  = c(0, 2, 2, 2, 2),
  align   = c("l","r","r","r","r")
)
plot(ps.balance, plots = 6)


### Box 21: Assessing overlap using plots
# Fit a propensity score model
m_PS <- glm(A ~ C + w1 + w2 + w3 + w4, data = data2, family = binomial(link = "logit"))
summary(m_PS)

# Estimate the propensity score
# NOTE(review): the model is fitted on data2 but the scores are stored in data, and the
# groups below are split on data$rhc rather than A -- assumes both frames share row order
# and that data has an rhc column; confirm.
data$PS <- fitted.values(m_PS)

# Histogram of the PS
hist(data$PS[data$rhc == 0])
hist(data$PS[data$rhc == 1])
plot(density(data$PS[data$rhc == 0]), col = "red", lwd = 2, xlab = "PS")
lines(density(data$PS[data$rhc == 1]), col = "blue", lwd = 2)
legend("topright", legend = c("No RHC", "RHC"), pch = "--", col = c("red","blue"), bty = "n", lwd = 2)

# Look at minimum and maximum PS in each exposure group
min(data$PS[data$rhc == 0])
min(data$PS[data$rhc == 1])
max(data$PS[data$rhc == 0])
max(data$PS[data$rhc == 1])

# Investigate overlap (i.e. positivity)
data$overlap <- ifelse(data$PS >= min(data$PS[data$rhc == 1]) & data$PS <= max(data$PS[data$rhc == 0]), 1, 0)
table(data$overlap, data$rhc)

## 4.2 Marginal structural model with stabilised weights
### Box 22: Computation of the IPTW estimator for the ATE using a MSM

# Unstabilized weights
# NOTE(review): this weighted model also adjusts for C and w1-w4, whereas the stabilized
# fit below regresses Y on A only -- confirm this is intended.
msm <- lm(Y ~ A + C + w1 + w2 + w3 + w4, data = data, weights = data$w) # MSM
library(sandwich)
SE   <- sqrt(diag(vcovHC(msm, type = "HC0"))) # robust standard errors
beta <- coef(msm)
lcl  <- beta - 1.96*SE
ucl  <- beta + 1.96*SE
cbind(beta, lcl, ucl)[2,]

# Stabilized weights
denom.fit <- glm(A ~ as.factor(C) + w1 + w2 + w3 + w4, family = binomial(), data = data)
denom.p   <- predict(denom.fit, type = "response") # denominator of the stabilized weights

numer.fit <- glm(A ~ 1, family = binomial(), data = data)
summary(numer.fit)
numer.p <- predict(numer.fit, type = "response") # estimation of numerator of ip weights

data$sw <- ifelse(data$A == 0, (1 - numer.p)/(1 - denom.p), numer.p/denom.p)

msm <- lm(Y ~ A, data = data, weights = sw)

SE   <- sqrt(diag(vcovHC(msm, type = "HC0"))) # robust standard errors
beta <- coef(msm)
lcl  <- beta - 1.96*SE
ucl  <- beta + 1.96*SE
cbind(beta, lcl, ucl)[2,]

## 4.3 IPTW with regression adjustment
### Box 23: Computation of the IPTW-RA estimator for the ATE and bootstrap for statistical inference
# Weighted outcome regressions fitted within each exposure arm (default gaussian family)
glm1 <- glm(Y ~ C + w1 + w2 + w3 + w4, weights = data$w[data$A == 1], data = data[data$A == 1, ])
Y.1  <- predict(glm1, newdata = data.frame(A = 1, C, w1, w2, w3, w4), type = "response")
glm2 <- glm(Y ~ C + w1 + w2 + w3 + w4, weights = data$w[data$A == 0], data = data[data$A == 0, ])
Y.0  <- predict(glm2, newdata = data.frame(A = 0, C, w1, w2, w3, w4), type = "response")
ATE <- mean(Y.1 - Y.0)
ATE
ATE2 <- mean(data$w*as.numeric(data$A == 1)*Y.1)/mean(data$w*as.numeric(data$A == 1)) -
  mean(data$w*as.numeric(data$A == 0)*Y.0)/mean(data$w*as.numeric(data$A == 0))
ATE2
rm(ATE, ATE2)

### Box 24: Computation of the IPTW-RA estimator for the ATE using the ipw R-package
library(ipw)
ipw.ATE <- ipwpoint(
  exposure    = A,
  family      = "binomial",
  link        = "logit",
  numerator   = ~ C,
  denominator = ~ C + w1 + w2 + w3 + w4,
  data        = data2
)
summary(ipw.ATE$ipw.weights)
ipwplot(weights = ipw.ATE$ipw.weights, logscale = FALSE, main = "Stabilized weights", xlim = c(0.5, 2))
summary(ipw.ATE$num.mod)
summary(ipw.ATE$den.mod)

data2$sw <- ipw.ATE$ipw.weights
msm <- svyglm(Y ~ A, design = svydesign(ids = ~ 1, weights = ~ sw, data = data2))
coef(msm)
confint(msm)


# 5.
Augmented inverse probability weighting

### Box 25: Computation of the AIPTW estimator for the ATE and bootstrap for statistical inference
# Outcome model; Yhat is kept on the logit scale, Y1/Y0 are predicted risks under A = 1 and A = 0
mod <- glm(Y ~ A + C + w1 + w2 + w3 + w4, family="binomial", data=data)
PO <- cbind(Yhat = predict(mod),
            Y1 = predict(mod, newdata=data.frame(A = 1, C, w1, w2, w3, w4), type="response"),
            Y0 = predict(mod, newdata=data.frame(A = 0, C, w1, w2, w3, w4), type="response"))
RA <- as.data.frame(PO) # Potential Outcomes
Yhat <- RA$Yhat
Y.1a <- RA$Y1
Y.0a <- RA$Y0

# Propensity score model and signed inverse-probability weights (+1/g if A = 1, -1/(1-g) if A = 0)
g <- glm(A ~ C + w1 + w2 + w3 + w4, family = binomial(), data = data)
gw <- predict(g, type = "response")
gws <- ifelse(data$A == 0, (-(1 - data$A)/(1 - gw)),(data$A/gw)); sum(gws) # estimation of weights
AIPTW <- mean(gws*(data$Y - plogis(RA$Yhat)) + ((Y.1a) - (Y.0a))); AIPTW # ATE
# NOTE(review): this is the mean of individual risk ratios, not mean(Y.1a)/mean(Y.0a) -- confirm which is intended.
RR <- mean(Y.1a/Y.0a); RR # RR

IC <- (gws*(data$Y - plogis(RA$Yhat)) + ((Y.1a) - (Y.0a)))-AIPTW # Estimate the influence function (functional Delta method)
n <- nrow(data)
varHat.IC <- var(IC)/n; varHat.IC
lci <- AIPTW-1.96*sqrt(varHat.IC)
uci <- AIPTW+1.96*sqrt(varHat.IC)
cat(AIPTW,lci,uci) # Inference Influence function

# Statistic for boot(): the AIPTW ATE on one bootstrap resample.
#   data    : analysis data frame with columns Y, A, C, w1, w2, w3, w4
#   indices : row indices of the resample, supplied by boot()
# Both nuisance models are refitted on the resample. The previous version built
# `dat` but fitted and evaluated everything on the full data, so every replicate
# returned the same value.
AIPTW.b <- function(data, indices) # Inference using Bootstrap
{
  dat <- data[indices, ]
  covs <- dat[, c("C", "w1", "w2", "w3", "w4")]
  mod <- glm(Y ~ A + C + w1 + w2 + w3 + w4, family="binomial", data=dat)
  Yhat <- predict(mod)                                   # logit scale, observed exposure
  Y1 <- predict(mod, newdata=data.frame(A = 1, covs))    # logit scale, everyone exposed
  Y0 <- predict(mod, newdata=data.frame(A = 0, covs))    # logit scale, everyone unexposed
  g <- glm(A ~ C + w1 + w2 + w3 + w4, family="binomial", data = dat)
  gw <- predict(g, type="response")
  gws <- ifelse(dat$A == 0, (-(1 - dat$A)/(1 - gw)), (dat$A/gw))
  mean(gws*(dat$Y - plogis(Yhat)) + (plogis(Y1) - plogis(Y0)))
}
AIPTW.b(data,indices=1:nrow(data)) # Can get original estimate, by plugging in indices 1:n
boot.out=boot(data,AIPTW.b,200) # Draw 200 bootstrap sample estimates
boot.ci(boot.out,type="perc",conf=0.95) # compute confidence intervals using percentile method
boot.ci(boot.out,type="norm",conf=0.95)

### Box 26: Computation of the AIPTW estimator for the ATE and marginal risk ratio
w <- subset(data, select=c(C, w1, w2, w3 , w4))
fit1 <- drtmle(W = w, A = A, Y = Y, # input data
               a_0 = c(0, 1), # return estimates for A = 0 and A = 1
               SL_Q = "SL.npreg", # use kernel regression for E(Y | A = a, W)
               glm_g = "C + w1 + w2 + w3 + w4", # use misspecified main terms glm for E(A | W)
               SL_Qr = "SL.npreg", # use kernel regression to guard against
               # misspecification of outcome regression
               #SL_gr = "SL.npreg", # use kernel regression to guard against
               # misspecification of propensity score
               returnModels = TRUE # for visualizing fits later
)
ATE <- ci(fit1, contrast = c(-1,1)); ATE
# Risk-ratio contrast est[2]/est[1] (A = 1 versus A = 0), with the interval built on the log scale.
# fh_grad is the gradient of log(est[2]) - log(est[1]), so that it matches h.
RR <- riskRatio <- list(f = function(eff){ log(eff) },
                        f_inv = function(eff){ exp(eff) },
                        h = function(est){ est[2]/est[1] },
                        fh_grad = function(est){ c(-1/est[1], 1/est[2]) })
ci(fit1, contrast = riskRatio)
rm(ATE, RR)

# 6.
DATA-ADAPTIVE ESTIMATION: ENSEMBLE LEARNING TARGETED MAXIMUMLIKELIHOOD ESTIMATION

### Box 27: Computational implementation of TMLE by hand
# Step 1: initial outcome model
Gcomp <- glm(Y ~ A + C + w1 + w2 + w3 + w4, family = "binomial", data = data2)
# Linear-predictor (logit scale) predictions for the observed A, for A=1 and for A=0
QAW <- predict(Gcomp)
Q1W <- predict(Gcomp, newdata = data.frame(A = 1, data2[, c("C", "w1", "w2", "w3", "w4")]))
Q0W <- predict(Gcomp, newdata = data.frame(A = 0, data2[, c("C", "w1", "w2", "w3", "w4")]))
# Step 2: estimation of the propensity score (ps)
psm <- glm(A ~ C + w1 + w2 + w3 + w4, family = binomial, data = data2)
gW  <- predict(psm, type = "response")
g1W <- 1 / gW
g0W <- -1 / (1 - gW)
# Step 3: computation of H (clever covariate) and estimation of epsilon
HAW <- data2$A / gW - (1 - data2$A) / (1 - gW)
H1W <- 1 / gW
H0W <- -1 / (1 - gW)
# No-intercept logistic regression of Y on HAW with the initial fit as offset
epsilon <- coef(glm(data2$Y ~ -1 + HAW + offset(QAW), family = "binomial"))
# Step 4: ATE from the updated predictions
ATE <- mean(plogis(Q1W + epsilon * H1W) - plogis(Q0W + epsilon * H0W))
ATE
# Step 5: marginal RR
T1.EY1 <- mean(plogis(Q1W + epsilon * H1W))
T1.EY0 <- mean(plogis(Q0W + epsilon * H0W))
RR <- T1.EY1 / T1.EY0
RR
rm(ATE, RR)

### Box 28: TMLE with data-adaptive estimation using the R package
set.seed(777)
library(tmle)
w <- subset(data, select = c(C, w1, w2, w3, w4))
# The same Super Learner library is used for the outcome (Q) and treatment (g) models
fittmle <- tmle(
  data$Y, data$A, W = w, family = "binomial",
  Q.SL.library = c("SL.glm","SL.glm.interaction","SL.step.interaction","SL.gam","SL.randomForest"),
  g.SL.library = c("SL.glm","SL.glm.interaction","SL.step.interaction","SL.gam","SL.randomForest")
)
fittmle

# 7.
Simulation
### Box 29: Data generation for the Monte Carlo experiment

rm(list=ls())

# Super Learner libraries
SL.library <- c("SL.glm","SL.step","SL.step.interaction","SL.glm.interaction","SL.gam") #"SL.randomForest","SL.glmnet"

# Data generation A: dual misspecification for the model of the outcome and treatment
set.seed(7777)
# Simulate n subjects: covariates w1-w4, exposure A, both counterfactual outcomes
# (Y.1, Y.0) and the observed outcome Y = Y.1*A + Y.0*(1 - A). Returns a data.frame.
generateData <- function(n){
  w1 <- round(runif(n, min=1, max=5), digits=0)
  w2 <- rbinom(n, size=1, prob=0.45)
  # NOTE(review): the covariate terms sit inside `digits=`, so they set the rounding
  # precision rather than shifting w3/w4. This may be a misplaced parenthesis; left
  # unchanged because it defines the simulated truth (True_ATE / True_RR).
  w3 <- round(runif(n, min=0, max=1), digits=0 + 0.75*w2 + 0.8*w1)
  w4 <- round(runif(n, min=0, max=1), digits=0 + 0.75*w2 + 0.2*w1)
  A <- rbinom(n, size=1, prob= plogis(-1 - 0.15*w4 + 1.5*w2 + 0.75*w3 + 0.25*w1 + 0.8*w2*w4))
  # Counterfactuals
  Y.1 <- rbinom(n, size=1, prob = plogis(-3 + 1 + 0.25*w4 + 0.75*w3 + 0.8*w2*w4 + 0.05*w1))
  Y.0 <- rbinom(n, size=1, prob = plogis(-3 + 0 + 0.25*w4 + 0.75*w3 + 0.8*w2*w4 + 0.05*w1))
  # Observed outcome
  Y <- Y.1*A + Y.0*(1 - A)
  # return data.frame
  data.frame(w1, w2, w3, w4, A, Y, Y.1, Y.0)
}

# True ATE
ObsDataTrueATE <- generateData(n=5000000)
True_ATE <- mean(ObsDataTrueATE$Y.1 - ObsDataTrueATE$Y.0);True_ATE
True_EY.1 <- mean(ObsDataTrueATE$Y.1)
True_EY.0 <- mean(ObsDataTrueATE$Y.0)
True_RR <- (True_EY.1 / True_EY.0);True_RR

#Simulations
library(tmle)
library(SuperLearner)
#install.packages("dbarts")
R <- 1000
#Empty vectors
naive_RR <- rep(NA,R)
ATEtmle1 <- rep(NA,R)
RRtmle1 <- rep(NA,R)
ATE_AIPTW <- rep(NA,R)
RR_AIPTW <- rep(NA,R)
ATEtmle2 <- rep(NA,R)
RRtmle2 <- rep(NA,R)
ATEtmle3 <- rep(NA,R)
RRtmle3 <- rep(NA,R)
for(r in seq_len(R)){
  print(paste("This is simulation run number",r))
  CancerData <- generateData(n=1000)
  # Naive RR: exponentiated coefficient of A from a Poisson (log-link) regression
  naive_RR[r] <- exp(glm(data = CancerData, Y ~ A + w1 + w2 + w3 + w4, family = poisson(link="log"))$coef[2])
  # TMLE implementation by hand
  # Step 1
  gm <- glm(Y ~ A + w1 + w2 + w3 + w4, family="binomial", data=CancerData)
  # Prediction for A, A=1 and, A=0 (logit scale)
  QAW <- predict(gm)
  Q1W = predict(gm, newdata=data.frame(A = 1, CancerData[,c("w1","w2","w3","w4")]))
  Q0W = predict(gm, newdata=data.frame(A = 0, CancerData[,c("w1","w2","w3","w4")]))
  # Step 2 estimation of the propensity score (ps)
  psm <- glm(A ~ w1 + w2 + w3 + w4, family = binomial, data=CancerData)
  gW = predict(psm, type = "response")
  g1W = (1 / gW)
  g0W = (-1 / (1-gW))
  # Step 3 computation of H and estimation of epsilon
  HAW <- (CancerData$A / gW -(1-CancerData$A) / (1 - gW))
  H1W = (1/gW)
  H0W = (-1 / (1 - gW))
  epsilon <- coef(glm(CancerData$Y ~ -1 + HAW + offset(QAW), family = "binomial"))
  # Step 4 updated ATE
  ATEtmle1[r] <- mean(plogis(Q1W + epsilon * H1W) - plogis(Q0W + epsilon * H0W))
  # Step 5 updated marginal RR
  T1.EY1 <- mean(plogis(Q1W + epsilon * H1W))
  T1.EY0 <- mean(plogis(Q0W + epsilon * H0W))
  RRtmle1[r] <- (T1.EY1 / T1.EY0)

  # Augmented inverse probability treatment weight (AIPTW) estimator
  ATE_AIPTW[r] <- mean((HAW*(CancerData$Y - plogis(QAW)) + (plogis(Q1W)-plogis(Q0W))))
  AIPTW1 <- mean(CancerData$A * (CancerData$Y - plogis(Q1W)) / gW + plogis(Q1W) )
  AIPTW0 <- mean((1- CancerData$A) * (CancerData$Y - plogis(Q0W)) / (1-gW) + plogis(Q0W))
  RR_AIPTW[r] <- mean( AIPTW1 / AIPTW0)

  # R-package tmle (base implementation includes SL.step, SL.glm and SL.glm.interaction)
  ATE2 <- tmle(Y=CancerData$Y, A=CancerData$A, W=CancerData[,c("w1","w2","w3","w4")], family="binomial")
  ATEtmle2[r] <- ATE2$estimates$ATE$psi
  RRtmle2[r] <- ATE2$estimates$RR$psi

  # Improved Super learner
  ATE3 <- tmle(Y = CancerData$Y, A=CancerData$A, W=CancerData[,c("w1","w2","w3","w4")], family="binomial", Q.SL.library=SL.library, g.SL.library=SL.library)
  ATEtmle3[r] <- ATE3$estimates$ATE$psi
  RRtmle3[r] <- ATE3$estimates$RR$psi
}
# Mean naive
mean(naive_RR)
# Mean AIPTW
mean(ATE_AIPTW)
mean(RR_AIPTW)
# Estimate of TMLE by hand
mean(ATEtmle1)
mean(RRtmle1)
# Estimate of TMLE + SL default implementation
mean(ATEtmle2)
mean(RRtmle2)
# Estimate of TMLE + SL2 default plus more algorithms
mean(ATEtmle3)
mean(RRtmle3)
# Replace "your path" with the output directory. The path is built with file.path()
# because "\r" inside an R string is a carriage-return escape, not a path separator.
save.image(file.path("your path", "results.RData"))

# Relative Bias ATE
abs(mean((True_ATE - ATE_AIPTW) / True_ATE)*100)
abs(mean((True_ATE - ATEtmle1) / True_ATE)*100)
abs(mean((True_ATE - ATEtmle2) / True_ATE)*100)
abs(mean((True_ATE - ATEtmle3) / True_ATE)*100)

# Relative Bias RR
abs(mean((True_RR - naive_RR) / True_RR)*100)
abs(mean((True_RR - RR_AIPTW) / True_RR)*100)
abs(mean((True_RR - RRtmle1) / True_RR)*100)
abs(mean((True_RR - RRtmle2) / True_RR)*100)
abs(mean((True_RR - RRtmle3) / True_RR)*100)


-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tutorial: Introduction to computational causal inference for applied researchers and epidemiologists 2 | 3 | ### Matthew James Smith, Camille Maringe, Bernard Rachet, Mohammad A. Mansournia, Paul Zivich, Stephen R.
Cole, Miguel Angel Luque Fernandez 4 | 5 | ### This repository makes available to the scientific community the data and code used in the preprint manuscript available at 6 | 7 | [Link to the preprint article](https://arxiv.org/abs/2012.09920) 8 | 9 | ### CITE this repository: 10 | 11 | [![DOI](https://zenodo.org/badge/272439035.svg)](https://zenodo.org/badge/latestdoi/272439035) 12 | 13 | ### Matthew James Smith, Camille Maringe, Bernard Rachet, Mohammad A. Mansournia, Paul Zivich, Stephen R. Cole, Miguel Angel Luque Fernandez 14 | 15 | ### ABSTRACT 16 | The purpose of many health studies is to estimate the effect of an exposure on an outcome. It is not always ethical to assign an exposure to individuals in randomised controlled trials, instead observational data and appropriate study design must be used. There are major challenges with observational studies, one of which is confounding that can lead to biased estimates of the causal effects. Controlling for confounding is commonly performed by simple adjustment for measured confounders; although, often this is not enough. Recent advances in the field of causal inference have dealt with confounding by building on classical standardisation methods. However, these recent advances have progressed quickly with a relative paucity of computational-oriented applied tutorials contributing to some confusion in the use of these methods among applied researchers. In this tutorial, we show the computational implementation of different causal inference estimators from a historical perspective where different estimators were developed to overcome the limitations of the previous one. Furthermore, we also briefly introduce the potential outcomes framework, illustrate the use of different methods using an illustration from the health care setting, and most importantly, we provide reproducible and commented code in Stata, R and Python for researchers to apply in their own observational study. 
The code can be accessed at 17 | 18 | [https://github.com/migariane/TutorialCausalInferenceEstimators](https://github.com/migariane/TutorialCausalInferenceEstimators) 19 | 20 | KEYWORDS: Causal Inference; Regression adjustment; G-methods; G-formula; Propensity score; Inverse probability weighting; Double-robust methods; Machine learning; Targeted maximum likelihood estimation; Epidemiology; Statistics; Tutorial 21 | -------------------------------------------------------------------------------- /Results.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/migariane/TutorialCausalInferenceEstimators/d809b657b382e227837d3032b1517612c478818d/Results.RData -------------------------------------------------------------------------------- /StataCodeBoxes.do: -------------------------------------------------------------------------------- 1 | /* 2 | Tutorial: causal inference methods made easy for applied resarchers/epidemiologists/statisticians 3 | ================================================================================================= 4 | 5 | ICON-LSHTM, LONDON, 16th October 2020 6 | 7 | Miguel Angel Luque Fernandez, PhD 8 | Assistant Professor of Epidemiology and Biostatistics 9 | Camille Maringe, PhD 10 | Assistant Professor 11 | 12 | Inequalities in Cancer Outcomes Network, LSHTM, London, UK 13 | 14 | Copyright (c) 2020 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON INFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 17 | 18 | Bug reports: miguel-angel.luque@lshtm.ac.uk 19 | 20 | The rhc dataset can be dowloaded at http://biostat.mc.vanderbilt.edu/wiki/Main/DataSets 21 | */ 22 | 23 | 24 | 25 | 26 | *** Preliminaries 27 | clear 28 | set more off 29 | cd "C:\Data" // this path should point to where the RHC data are 30 | use "rhc.dta", clear 31 | describe 32 | count 33 | * 83 variables and 5,735 observations 34 | 35 | /* Box 1: Setting the data */ 36 | * Define the outcome (Y), exposure (A), confounder (C), and confounders (W) 37 | global Y death_d30 38 | global A rhc 39 | global C gender 40 | global W gender age edu race carcinoma 41 | 42 | /* Box 2: Naive estimate of the ATE */ 43 | * Naive approach to estimate the causal effect 44 | regr $Y $A $C 45 | * The naive estimate of the causal effect is 0.07352 46 | 47 | /* 3. G-formula */ 48 | /* 3.1 Non-parametric G-formula */ 49 | 50 | * 1) ATE 51 | /* Box 3: Non-parametric G-Formula for the ATE */ 52 | proportion $C 53 | matrix m=e(b) 54 | gen genderf = m[1,1] 55 | sum genderf 56 | gen genderm = m[1,2] 57 | sum genderm 58 | * you may need to install the command sumup, type: 59 | * ssc install sumup 60 | sumup $Y, by($A $C) 61 | * from sumup command extract the conditinal means by the given A and C levels i.e. 
zero and one 62 | * see matrix list y00: position subscript [3,1] is th one of interest 63 | matrix y00 = r(Stat1) 64 | matrix y01 = r(Stat2) 65 | matrix y10 = r(Stat3) 66 | matrix y11 = r(Stat4) 67 | gen EY1 = ((y11[3,1]-y01[3,1]))*genderm 68 | gen EY0 = ((y10[3,1]-y00[3,1]))*genderf 69 | qui: mean EY1 EY0 70 | matrix ATE = r(table) 71 | display "The ATE is: " ATE[1,1] + ATE[1,2] 72 | drop EY* 73 | * The ATE from non-parametric estimator is: 0.073692 74 | // Also one can try 75 | gen ATE = ((y11[3,1]-y01[3,1]))*genderm + ((y10[3,1]-y00[3,1]))*genderf 76 | qui sum ATE 77 | drop ATE 78 | 79 | * Check that Stata "teffects" command obtains the same estimate 80 | teffects ra ($Y $C) ($A) 81 | * The ATE from "teffects" implementation is: 0.073692 82 | 83 | * 2) ATT 84 | /* Box 4: Non-parametric G-Formula for the ATT */ 85 | * Estimate the marginal probabilities 86 | proportion $C if $A==1 87 | matrix m=e(b) 88 | gen genderfatet = m[1,1] 89 | gen gendermatet = m[1,2] 90 | gen EY1 = ((y11[3,1]-y01[3,1]))*gendermatet 91 | gen EY0 = ((y10[3,1]-y00[3,1]))*genderfatet 92 | qui: mean EY1 EY0 93 | matrix ATT = r(table) 94 | display "The ATT is: " ATT[1,1] + ATT[1,2] // Applying the G-formula 95 | drop EY* 96 | * The ATT from non-parametric estimator is: 0.073248 97 | // Also one can try 98 | gen ATT = ((y11[3,1]-y01[3,1]))*gendermatet + ((y10[3,1]-y00[3,1]))*genderfatet 99 | qui sum ATT 100 | drop ATT 101 | 102 | * Check using Stata "teffects" command 103 | teffects ra ($Y $C) ($A), atet 104 | * The ATT from "teffects" implementation is: 0.073248 105 | 106 | /* Box 5: Bootstrap 95% Confidence Intervals (CI) for the ATE/ATT estimated using the Non-parametric G-Formula */ 107 | 108 | * 1) For the ATE 109 | capture program drop ATE 110 | program define ATE, rclass 111 | capture drop y1 112 | capture drop y0 113 | capture drop ATE 114 | sumup $Y, by($A $C) 115 | matrix y00 = r(Stat1) 116 | matrix y01 = r(Stat2) 117 | matrix y10 = r(Stat3) 118 | matrix y11 = r(Stat4) 119 | gen ATE = 
((y11[3,1]-y01[3,1]))*genderm + ((y10[3,1]-y00[3,1]))*genderf 120 | qui sum ATE 121 | return scalar ate = `r(mean)' 122 | end 123 | 124 | qui bootstrap r(ate), reps(1000): ATE 125 | estat boot, all 126 | 127 | * 2) For the ATT 128 | capture program drop ATT 129 | program define ATT, rclass 130 | capture drop y1 131 | capture drop y0 132 | capture drop ATT 133 | sumup $Y, by($A $C) 134 | matrix y00 = r(Stat1) 135 | matrix y01 = r(Stat2) 136 | matrix y10 = r(Stat3) 137 | matrix y11 = r(Stat4) 138 | gen ATT = ((y11[3,1]-y01[3,1]))*gendermatet + ((y10[3,1]-y00[3,1]))*genderfatet 139 | qui sum ATT 140 | return scalar att = `r(mean)' 141 | end 142 | 143 | qui bootstrap r(att), reps(1000): ATT 144 | estat boot, all 145 | 146 | drop ATE ATT 147 | 148 | /* Box 6: Non-parametric G-Formula using a fully saturated regression model in Stata (A) */ 149 | * method 1: conditional probabilities 150 | regress $Y ibn.$A ibn.$A#c.($C) , noconstant vce(robust) coeflegend 151 | predictnl ATE = (_b[1.rhc] + _b[1.rhc#c.gender]*gender) - (_b[0bn.rhc] + _b[0bn.rhc#c.gender]*gender) 152 | qui: sum ATE 153 | display "The ATE is: " "`r(mean)'" 154 | drop ATE 155 | 156 | /* Box 7: Non-parametric G-Formula using a fully saturated regression model in Stata (B) */ 157 | * method 2: marginal probabilities 158 | regress $Y ibn.$A ibn.$A#c.($C) , noconstant vce(robust) coeflegend 159 | 160 | * Marginal probability in each treatment group 161 | margins $A , vce(unconditional) 162 | 163 | * Difference in marginal probability between treatment groups 164 | margins r.$A , contrast(nowald) 165 | 166 | /* 3.2 PARAMETRIC G-FORMULA */ 167 | 168 | * One confounder 169 | 170 | /* Box 8: Parametric G-formula */ 171 | * Calculations by hand 172 | * Expected probability amongst treated 173 | regress $Y $C if $A==1 174 | predict double y1hat 175 | 176 | * Expected probability amongst untreated 177 | regress $Y $C if $A==0 178 | predict double y0hat 179 | mean y1hat y0hat 180 | 181 | * Difference between expected 
probabilities (ATE) and biased confidence interval 182 | lincom _b[y1hat] - _b[y0hat] 183 | 184 | /* Box 9: Parametric regression adjustment using Stata's teffects (one confounder) */ 185 | teffects ra ($Y $C) ($A) 186 | 187 | /* Box 10: Bootstrap for the parametric regression adjustment */ 188 | capture program drop ATE 189 | program define ATE, rclass 190 | capture drop y1 191 | capture drop y0 192 | reg $Y $C if $A==1 193 | predict double y1, xb 194 | quiet sum y1 195 | reg $Y $C if $A==0 196 | predict double y0, xb 197 | quiet sum y0 198 | mean y1 y0 199 | lincom _b[y1]-_b[y0] 200 | return scalar ace =`r(estimate)' 201 | end 202 | qui bootstrap r(ace), reps(1000): ATE 203 | estat boot, all 204 | 205 | * More than one confounder 206 | 207 | /* Box 11: Parametric multivariate regression adjustment implementation of the G-Formula */ 208 | regress $Y $W if $A==1 209 | predict double y1hat 210 | regress $Y $W if $A==0 211 | predict double y0hat 212 | mean y1hat y0hat 213 | lincom _b[y1hat] - _b[y0hat] 214 | 215 | /* Box 12: Parametric multivariate regression adjustment using Stata’s teffects command */ 216 | teffects ra ($Y $W) ($A) 217 | 218 | /* Box 13: Parametric multivariate regression adjustment using Stata’s margins command */ 219 | regress $Y ibn.$A ibn.$A#c.($W) , noconstant vce(robust) 220 | margins $A, vce(unconditional) 221 | margins r.$A, contrast(nowald) 222 | 223 | /* Box 14: Bootstrap for the multivariate parametric regression adjustment */ 224 | capture program drop ATE 225 | program define ATE, rclass 226 | capture drop y1 227 | capture drop y0 228 | reg $Y $W if $A==1 229 | predict double y1, xb 230 | quiet sum y1 231 | reg $Y $W if $A==0 232 | predict double y0, xb 233 | quiet sum y0 234 | mean y1 y0 235 | lincom _b[y1]-_b[y0] 236 | return scalar ace =`r(estimate)' 237 | end 238 | qui bootstrap r(ace), reps(1000): ATE dots 239 | estat boot, all 240 | 241 | /* Box 15: Computing the parametric marginal risk ratio after regression adjustment */ 242 | 
teffects ra ($Y $W) ($A), aequations 243 | teffects ra ($Y $W) ($A), coeflegend 244 | nlcom 100*_b[ATE:r1vs0.$A]/_b[POmean:0.$A] 245 | * 27.4% increase in relative risk 246 | teffects ra ($Y $W) ($A), pom coeflegend 247 | nlcom _b[POmeans:1.rhc]/ _b[POmeans:0bn.rhc] 248 | * 27.4% increase in relative risk 249 | 250 | /* 4 Inverse probability of treatment weighting */ 251 | /* 4.1 Inverse probability of treatment weighting based on the propensity score plus regression adjustment */ 252 | 253 | /* Box 16: Computation of the IPTW estimator for the ATE */ 254 | * propensity score model for the exposure 255 | logit $A $W, vce(robust) nolog 256 | 257 | * propensity score predictions 258 | predict double ps 259 | 260 | * Sampling weights for the treated group 261 | generate double ipw1 = ($A==1)/ps 262 | 263 | * Weighted outcome probability among treated 264 | regress $Y [pw=ipw1] 265 | scalar Y1 = _b[_cons] 266 | 267 | * Sampling weights for the non-treated group 268 | generate double ipw0 = ($A==0)/(1-ps) 269 | regress $Y [pw=ipw0] 270 | scalar Y0 = _b[_cons] 271 | display "ATE =" Y1 - Y0 272 | 273 | /* Box 17: Bootstrap computation for the IPTW estimator */ 274 | * Bootstrap the confidence intervals 275 | capture program drop ATE 276 | program define ATE, rclass 277 | capture drop y1 278 | capture drop y0 279 | regress $Y [pw=ipw1] 280 | matrix y1 = e(b) 281 | gen double y1 = y1[1,1] 282 | regress $Y [pw=ipw0] 283 | matrix y0 = e(b) 284 | gen double y0 = y0[1,1] 285 | mean y1 y0 286 | lincom _b[y1]-_b[y0] 287 | return scalar ace = `r(estimate)' 288 | end 289 | qui bootstrap r(ace), reps(1000): ATE 290 | estat boot, all 291 | 292 | /* Box 18: Computation of the IPTW estimator for the ATE using Stata’s teffects command */ 293 | teffects ipw ($Y) ($A $W, logit), nolog vsquish 294 | 295 | /* Box 19: Assessing IPTW balance */ 296 | * Stata teffects and tebalance commands 297 | qui teffects ipw ($Y) ($A $W) 298 | tebalance summarize 299 | 300 | * By hand - with the example 
of gender 301 | egen genderst = std(gender) // Standardization 302 | logistic $A $W // Propensity score 303 | capture drop ps 304 | predict double ps 305 | gen ipw = . 306 | replace ipw=($A==1)/ps if $A==1 307 | replace ipw=($A==0)/(1-ps) if $A==0 308 | regress genderst $A // Raw difference 309 | regress genderst $A [pw=ipw] // Standardized difference 310 | 311 | /* Box 20: Assessing IPTW overlap by hand */ 312 | sort $A 313 | by $A: summarize ps 314 | kdensity ps if $A==1, generate(x1pointsa d1A) nograph n(10000) 315 | kdensity ps if $A==0, generate(x0pointsa d0A) nograph n(10000) 316 | label variable d1A "density for RHC=1" 317 | label variable d0A "density for RHC=0" 318 | twoway (line d0A x0pointsa , yaxis(1))(line d1A x1pointsa, yaxis(2)) 319 | 320 | /* Box 21: Assessing overlap using Stata's teffects overlap */ 321 | qui: teffects ipw ($Y) ($A $W, logit), nolog vsquish 322 | teffects overlap 323 | 324 | 325 | /* 4.2 Marginal structural model with stabilized weights */ 326 | /* Box 22: Computation of the IPTW estimator for the ATE using a MSM */ 327 | * Baseline treatment probabilities 328 | logit $A, vce(robust) nolog 329 | predict double nps, pr 330 | 331 | * propensity score model 332 | logit $A $W, vce(robust) nolog 333 | predict double dps, pr 334 | 335 | * Unstabilized weight 336 | cap drop ipw 337 | gen ipw = . 338 | replace ipw=($A==1)/dps if $A==1 339 | replace ipw=($A==0)/(1-dps) if $A==0 340 | sum ipw 341 | 342 | * Stabilized weight 343 | gen sws = . 
replace sws = nps/dps if $A==1
replace sws = (1-nps)/(1-dps) if $A==0
sum sws

* MSM
reg $Y $A [pw=ipw], vce(robust) // MSM unstabilized weight
reg $Y $A [pw=sws], vce(robust) // MSM stabilized weight


/* 4.3 IPTW with regression adjustment */

/* Box 23: Computation of the IPTW-RA estimator for the ATE and bootstrap for statistical inference */
* The stabilized weights are re-estimated inside the program so that each
* bootstrap replicate repeats the whole procedure on the resampled data.
* Returns: r(y1), r(y0) = mean predicted outcome from the weighted outcome
*          regression fitted among treated / non-treated;
*          r(ace) = r(y1) - r(y0).
capture program drop ATE
program define ATE, rclass
    tempvar num den w mu1 mu0
    tempname m1 m0
    * numerator (marginal) and denominator (conditional) treatment probabilities
    quietly logit $A
    quietly predict double `num', pr
    quietly logit $A $W
    quietly predict double `den', pr
    * stabilized weight
    quietly generate double `w' = `num'/`den' if $A==1
    quietly replace `w' = (1-`num')/(1-`den') if $A==0
    * weighted outcome regression among the treated, predicted for everyone
    quietly regress $Y $W if $A==1 [pw=`w']
    quietly predict double `mu1', xb
    quietly summarize `mu1'
    scalar `m1' = r(mean)
    * weighted outcome regression among the non-treated, predicted for everyone
    quietly regress $Y $W if $A==0 [pw=`w']
    quietly predict double `mu0', xb
    quietly summarize `mu0'
    scalar `m0' = r(mean)
    return scalar y1 = `m1'
    return scalar y0 = `m0'
    return scalar ace = `m1' - `m0'
end
* 1000 replicates, as in the other bootstrap boxes (10 is too few for confidence intervals)
qui bootstrap r(ace), reps(1000): ATE
estat boot, all

/* Box 24: Computation of the IPTW-RA estimator for the ATE using Stata’s teffects */
teffects ipwra ($Y $W) ($A $W)
nlcom 100*_b[r1vs0.$A]/_b[POmean:0.$A]
teffects ipwra ($Y $W) ($A $W), pom coeflegend
nlcom _b[POmeans:1.rhc]/ _b[POmeans:0bn.rhc]
*eltmle to check marginal RR
* preserve/restore keeps the data in memory unchanged, as done around the
* other eltmle calls in this file
preserve
eltmle $Y $A $W, tmle
restore

/* 5. Augmented inverse probability weighting */
/* Box 25: Computation of the AIPTW estimator for the ATE and bootstrap for statistical inference */
* Step (i) prediction model for the outcome
qui glm $Y $A $W, fam(bin)
predict double QAW, mu
qui glm $Y $W if $A==1, fam(bin)
predict double Q1W, mu
qui glm $Y $W if $A==0, fam(bin)
predict double Q0W, mu

* Step (ii): prediction model for the treatment
* Drop one variable at a time: a single "capture drop" with a list removes
* nothing when any one of the listed variables does not exist.
foreach v in dps nps sws y1 y0 {
    capture drop `v'
}
qui logit $A $W
predict double dps, pr
qui logit $A
predict double nps, pr
gen sws = .
replace sws = nps/dps if $A==1
replace sws = (1-nps)/(1-dps) if $A==0

* Step (iii): Estimation equation
* AIPTW: outcome-model prediction plus an inverse-probability-weighted residual
* that is specific to each arm. (Adding one and the same weighted residual to
* both y1 and y0 would cancel in y1 - y0 and leave only Q1W - Q0W.)
gen double y1 = ($A==1)*($Y - Q1W)/dps + Q1W
quiet sum y1
scalar y1 = r(mean)
gen double y0 = ($A==0)*($Y - Q0W)/(1 - dps) + Q0W
quiet sum y0
scalar y0 = r(mean)
mean y1 y0
lincom _b[y1] - _b[y0]

* Step (iv): Bootstrap confidence intervals
* Outcome models and propensity score are all re-fitted on each resample.
* Returns: r(y1), r(y0) = AIPTW means under treatment / no treatment;
*          r(ace) = r(y1) - r(y0).
capture program drop ATE
program define ATE, rclass
    tempvar q1 q0 g d1 d0
    tempname m1 m0
    * outcome models fitted within each arm, predicted for everyone
    quietly glm $Y $W if $A==1, fam(bin)
    quietly predict double `q1', mu
    quietly glm $Y $W if $A==0, fam(bin)
    quietly predict double `q0', mu
    * propensity score
    quietly logit $A $W
    quietly predict double `g', pr
    * AIPTW estimating equations
    quietly generate double `d1' = ($A==1)*($Y - `q1')/`g' + `q1'
    quietly generate double `d0' = ($A==0)*($Y - `q0')/(1 - `g') + `q0'
    quietly summarize `d1'
    scalar `m1' = r(mean)
    quietly summarize `d0'
    scalar `m0' = r(mean)
    return scalar y1 = `m1'
    return scalar y0 = `m0'
    return scalar ace = `m1' - `m0'
end
qui bootstrap r(ace), reps(1000): ATE
estat boot, all

/* Box 26: Computation of the AIPTW estimator for the ATE and marginal risk ratio using Stata’s teffects */
teffects aipw ($Y $W) ($A $W, logit)
* marginal Relative Risk
nlcom 100*_b[r1vs0.$A]/_b[POmean:0.$A]
* another way to compute it
teffects aipw ($Y $W) ($A $W, logit), pom coeflegend
nlcom _b[POmeans:1.rhc]/ _b[POmeans:0bn.rhc]

/* 6.
DATA-ADAPTIVE ESTIMATION: ENSEMBLE LEARNING TARGETED MAXIMUM LIKELIHOOD ESTIMATION */
/* Box 27: Computational implementation of TMLE by hand */

* Step 1: prediction model for the outcome Q0 (g-computation)
glm $Y $A $W, fam(binomial)
predict double QAW_0, mu
* Predict with treatment set to 0 and then to 1 for everyone, then put the
* observed treatment back
gen aa=$A
replace $A = 0
predict double Q0W_0, mu
replace $A= 1
predict double Q1W_0, mu
replace $A = aa
drop aa

// Q to logit scale
// Uses the step-1 predictions (QAW_0, Q1W_0, Q0W_0), which all come from the
// same fitted model, rather than variables left in memory by an earlier box.
gen double logQAW = log(QAW_0 / (1 - QAW_0))
gen double logQ1W = log(Q1W_0 / (1 - Q1W_0))
gen double logQ0W = log(Q0W_0 / (1 - Q0W_0))

* Step 2: prediction model for the treatment g0 (IPTW)
glm $A $W, fam(binomial)
predict double gw, mu
gen double H1W = $A / gw
gen double H0W = (1 - $A ) / (1 - gw)

* Step 3: Computing the clever covariate H(A,W) and estimating the parameter (epsilon) (MLE)
* Initial prediction enters as a fixed offset; only the two epsilons are estimated
glm $Y H1W H0W, fam(binomial) offset(logQAW) noconstant
mat a = e(b)
gen double eps1 = a[1,1]
gen double eps2 = a[1,2]

* Step 4: update from Q0 to Q1
gen double Q1W_1 = exp(eps1 / gw + logQ1W) / (1 + exp(eps1 / gw + logQ1W))
gen double Q0W_1 = exp(eps2 / (1 - gw) + logQ0W) / (1 + exp(eps2 / (1 - gw) + logQ0W))

* Step 5: Targeted estimate of the ATE
gen double ATE = (Q1W_1 - Q0W_1)
summ ATE
global ATE = r(mean)
drop ATE

* Step 6: Statistical inference (efficient influence curve)
qui sum(Q1W_1)
gen double EY1tmle = r(mean)
qui sum(Q0W_1)
gen double EY0tmle = r(mean)

gen double d1 = (($A * ($Y - Q1W_1)/gw)) + Q1W_1 - EY1tmle
gen double d0 = ((1 - $A ) * ($Y - Q0W_1)/(1 - gw)) + Q0W_1 - EY0tmle

gen double IC = d1 - d0
qui sum IC
* Variance of the ATE estimate: sample variance of the influence curve over N
gen double varIC = r(Var) / r(N)
drop d1 d0 IC

global LCI = $ATE - 1.96*sqrt(varIC)
global UCI = $ATE + 1.96*sqrt(varIC)
display "ATE:" %05.4f $ATE _col(15) "95%CI: " %05.4f $LCI "," %05.4f $UCI

/* Box 28: TMLE with data-adaptive
estimation using the Stata’s user written eltmle */
* if not already installed, type:
* ssc install eltmle
preserve
eltmle $Y $A $W, tmle
restore


/* 7. Simulation */
/* Box 29: Data generation for the Monte Carlo experiment */

* Data generation
clear
set obs 1000
set seed 777
gen w1 = round(runiform(1, 5)) //Quintiles of Socioeconomic Deprivation
gen w2 = rbinomial(1, 0.45) //Binary: probability age >65 = 0.45
gen w3 = round(runiform(0, 1) + 0.75*(w2) + 0.8*(w1)) //Stage
recode w3 (5/6=1) //Stage (TNM): categorical 4 levels
gen w4 = round(runiform(0, 1) + 0.75*(w2) + 0.2*(w1)) //Comorbidities: categorical four levels
gen A = (rbinomial(1,invlogit(-1 - 0.15*(w4) + 1.5*(w2) + 0.75*(w3) + 0.25*(w1) + 0.8*(w2)*(w4)))) //Binary treatment
gen Y1 = (invlogit(-3 + 1 + 0.25*(w4) + 0.75*(w3) + 0.8*(w2)*(w4) + 0.05*(w1))) // Potential outcome under A=1
gen Y0 = (invlogit(-3 + 0 + 0.25*(w4) + 0.75*(w3) + 0.8*(w2)*(w4) + 0.05*(w1))) // Potential outcome under A=0
gen psi = Y1-Y0 // Simulated ATE
gen Y = A*(Y1) + (1 - A)*Y0 // Observed outcome: the potential outcome of the arm actually received (a probability, not a 0/1 draw)


// Estimate the true simulated ATE
mean psi
scalar psi_true = _b[psi]

// ATE estimation
// After each fit the ATE coefficient is saved in a scalar so that the relative
// bias below is computed from the fitted values instead of typed-in numbers.
* Regression adjustment
teffects ra (Y w1 w2 w3 w4) (A)
estimates store ra
scalar ate_ra = _b[ATE:r1vs0.A]

* IPTW
teffects ipw (Y) (A w1 w2 w3 w4)
estimates store ipw
scalar ate_ipw = _b[ATE:r1vs0.A]

* IPTW-RA
teffects ipwra (Y w1 w2 w3 w4) (A w1 w2 w3 w4)
estimates store ipwra
scalar ate_ipwra = _b[ATE:r1vs0.A]

* AIPTW
teffects aipw (Y w1 w2 w3 w4) (A w1 w2 w3 w4)
estimates store aipw
scalar ate_aipw = _b[ATE:r1vs0.A]

* Results
qui reg psi
estimates store psi
estout psi ra ipw ipwra aipw

// Ensemble learning maximum likelihood estimation
preserve
eltmle Y A w1 w2 w3 w4, tmle
restore

// Relative bias of each ATE: |truth - estimate| / truth
* Regression adjustment
display abs(scalar(psi_true) - scalar(ate_ra))/scalar(psi_true)

* IPTW
display abs(scalar(psi_true) - scalar(ate_ipw))/scalar(psi_true)

* IPTW-RA
display abs(scalar(psi_true) - scalar(ate_ipwra))/scalar(psi_true)

* AIPTW
display abs(scalar(psi_true) - scalar(ate_aipw))/scalar(psi_true)

* ELTMLE
* NOTE(review): both numbers below are typed in from a previous run's output;
* update them by hand if the seed or the data generation changes.
display abs(0.1787 - 0.1784)/0.1787

-------------------------------------------------------------------------------- /rhc.Rdata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/migariane/TutorialCausalInferenceEstimators/d809b657b382e227837d3032b1517612c478818d/rhc.Rdata -------------------------------------------------------------------------------- /rhc.dta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/migariane/TutorialCausalInferenceEstimators/d809b657b382e227837d3032b1517612c478818d/rhc.dta --------------------------------------------------------------------------------