├── 9781484228715.jpg ├── Contributing.md ├── Gender_StatsData_worldbank.org_ccby40.xlsx ├── LICENSE.txt ├── README.md ├── errata.md ├── gam.R ├── glm1.R ├── glm2.R ├── glmma.R ├── glmmi.R ├── glmml.R ├── iiv.R ├── intro.R ├── md.R ├── mdv.R ├── mli.R ├── mls.R ├── mlu.R └── udv.R /9781484228715.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/advanced-r-statistical-programming-and-data-models/ace9ee821d60d591091c7dc0a97c8c4620406b30/9781484228715.jpg -------------------------------------------------------------------------------- /Contributing.md: -------------------------------------------------------------------------------- 1 | # Contributing to Apress Source Code 2 | 3 | Copyright for Apress source code belongs to the author(s). However, under fair use you are encouraged to fork and contribute minor corrections and updates for the benefit of the author(s) and other readers. 4 | 5 | ## How to Contribute 6 | 7 | 1. Make sure you have a GitHub account. 8 | 2. Fork the repository for the relevant book. 9 | 3. Create a new branch on which to make your change, e.g. 10 | `git checkout -b my_code_contribution` 11 | 4. Commit your change. Include a commit message describing the correction. Please note that if your commit message is not clear, the correction will not be accepted. 12 | 5. Submit a pull request. 13 | 14 | Thank you for your contribution! -------------------------------------------------------------------------------- /Gender_StatsData_worldbank.org_ccby40.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/advanced-r-statistical-programming-and-data-models/ace9ee821d60d591091c7dc0a97c8c4620406b30/Gender_StatsData_worldbank.org_ccby40.xlsx -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Freeware License, some rights reserved 2 | 3 | Copyright (c) 2019 Matt Wiley and Joshua F. Wiley 4 | 5 | Permission is hereby granted, free of charge, to anyone obtaining a copy 6 | of this software and associated documentation files (the "Software"), 7 | to work with the Software within the limits of freeware distribution and fair use. 8 | This includes the rights to use, copy, and modify the Software for personal use. 9 | Users are also allowed and encouraged to submit corrections and modifications 10 | to the Software for the benefit of other users. 11 | 12 | It is not allowed to reuse, modify, or redistribute the Software for 13 | commercial use in any way, or for a user’s educational materials such as books 14 | or blog articles without prior permission from the copyright holder. 15 | 16 | The above copyright notice and this permission notice need to be included 17 | in all copies or substantial portions of the software. 18 | 19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 | AUTHORS OR COPYRIGHT HOLDERS OR APRESS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | SOFTWARE. 26 | 27 | 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Apress Source Code 2 | 3 | This repository accompanies [*Advanced R Statistical Programming and Data Models*](https://www.apress.com/9781484228715) by Matt Wiley and Joshua F. Wiley (Apress, 2019). 4 | 5 | [comment]: #cover 6 | ![Cover image](9781484228715.jpg) 7 | 8 | Download the files as a zip using the green button, or clone the repository to your machine using Git. 9 | 10 | ## Releases 11 | 12 | Release v1.0 corresponds to the code in the published book, without corrections or updates. 13 | 14 | ## Contributions 15 | 16 | See the file Contributing.md for more information on how you can contribute to this repository. -------------------------------------------------------------------------------- /errata.md: -------------------------------------------------------------------------------- 1 | # Errata for *Book Title* 2 | 3 | On **page xx** [Summary of error]: 4 | 5 | Details of error here. Highlight key pieces in **bold**. 6 | 7 | *** 8 | 9 | On **page xx** [Summary of error]: 10 | 11 | Details of error here. Highlight key pieces in **bold**. 12 | 13 | *** -------------------------------------------------------------------------------- /gam.R: -------------------------------------------------------------------------------- 1 | ## ----setup, include=TRUE, results='hide', message = FALSE, warning = FALSE, cache=FALSE---- 2 | 3 | library(checkpoint) 4 | checkpoint("2018-09-28", R.version = "3.5.1", 5 | project = book_directory, 6 | checkpointLocation = checkpoint_directory, 7 | scanForPackages = FALSE, 8 | scan.rnw.with.knitr = TRUE, use.knitr = TRUE) 9 | 10 | library(knitr) 11 | library(data.table) 12 | library(ggplot2) 13 | library(ggthemes) 14 | library(scales) 15 | library(viridis) 16 | library(car) 17 | library(mgcv) 18 | library(VGAM) 19 | library(ipw) 20 | library(JWileymisc) 21 | library(xtable) 22 | 23 | options( 24 | width = 70, 25 | stringsAsFactors = FALSE, 26 | datatable.print.nrows = 20, 27 | datatable.print.topn = 3, 28 | digits = 2) 29 | 30 | 31 | ## ----fgam-poly1, fig.width=7, fig.height=6, out.width='.6\\linewidth', fig.pos="!h", fig.cap = "Graph showing an intercept only (flat line) and progressively higher order polynomials."---- 32 | 33 | acl <- readRDS("advancedr_acl_data.RDS") 34 | 35 | ggplot(acl, aes(AGE_W1, CESD11_W1)) + 36 | stat_smooth(method = "lm", formula = y ~ 1, 37 | colour = viridis(6)[1], linetype = 1, se = FALSE) + 38 | stat_smooth(method = "lm", formula = y ~ x, 39 | colour = viridis(6)[2], linetype = 4, se = FALSE) + 40 | stat_smooth(method = "lm", formula = y ~ poly(x, 2), 41 | colour = viridis(6)[3], linetype = 2, se = FALSE) + 42 | stat_smooth(method = "lm", formula = y ~ poly(x, 3), 43 | colour = viridis(6)[4], linetype = 3, se = FALSE) + 44 | stat_smooth(method = "lm", formula = y ~ poly(x, 4), 45 | colour = viridis(6)[5], linetype = 1, se = FALSE) + 46 | stat_smooth(method = "lm", formula = y ~ poly(x, 10), 47 | colour = viridis(6)[6], linetype = 5, se = FALSE) 48 | 49 | 50 | ## ------------------------------------------------------------------------ 51 | 52 | ## > and < 53 | 1:5 %gl% c(2, 4) 54 | 55 | ## > and <= 56 | 1:5 %gle% c(2, 4) 57 | 58 | ## >= and < 59 | 1:5 %gel% c(2, 4) 60 | 61 | ## >= and <= 62 | 1:5 %gele% c(2, 4) 63 | 64 | 65 | 66 | ## ----fgam-spline1, fig.width=7, fig.height=6, out.width='.6\\linewidth', fig.pos="!h", fig.cap = "Graph showing an step function spline, linear splines, and quadratic splines, all with two inner knots."---- 67 | 68 | ggplot(acl, aes(AGE_W1, CESD11_W1)) + 69 | stat_smooth(method = "lm", 70 | formula = y ~ 1 + 71 | ifelse(x %gle% c(42, 65), 1, 0) + 72 | ifelse(x %gle% c(65, 96), 1, 0), 73 | colour = viridis(6)[1], linetype = 1, se = FALSE) + 74 | stat_smooth(method = "lm", 75 | formula = y ~ bs(x, df = 3, degree = 1L), 76 | colour = viridis(6)[2], linetype = 2, se = FALSE) + 77 | stat_smooth(method = "lm", 78 | formula = y ~ bs(x, df = 4, degree = 2L), 79 | colour = viridis(6)[3], linetype = 3, se = FALSE) + 80 | stat_smooth(method = "lm", 81 | formula = y ~ bs(x, df = 5, degree = 3L), 82 | colour = viridis(6)[4], linetype = 4, se = FALSE) 83 | 84 | 85 | ## ----fgam-spline2, fig.width=10, fig.height=9, out.width='1\\linewidth', fig.pos="!h", fig.cap = "Graph showing B-Splines (basis splines)."---- 86 | 87 | knots <- c(33, 42, 57, 65, 72) 88 | x <- seq(from = min(acl$AGE_W1), 89 | to = max(acl$AGE_W1), by = .01) 90 | 91 | p1 <- ggplot(melt(bs(x, degree = 1, 92 | knots = knots, intercept = TRUE)), 93 | aes(Var1, value, colour = factor(Var2))) + 94 | geom_line() + 95 | scale_color_viridis("Basis", discrete = TRUE) + 96 | theme_tufte() 97 | 98 | plot_grid( 99 | p1 + 100 | ggtitle("5 Knots, Degree = 1"), 101 | p1 %+% melt(bs(x, degree = 2, 102 | knots = knots, intercept = TRUE)) + 103 | ggtitle("5 Knots, Degree = 2"), 104 | p1 %+% melt(bs(x, degree = 3, 105 | knots = knots, intercept = TRUE)) + 106 | ggtitle("5 Knots, Degree = 3"), 107 | p1 %+% melt(bs(x, degree = 4, 108 | knots = knots, intercept = TRUE)) + 109 | ggtitle("5 Knots, Degree = 4"), 110 | ncol = 2) 111 | 112 | 113 | ## ------------------------------------------------------------------------ 114 | 115 | mgam <- vgam(CESD11_W1 ~ Sex + s(AGE_W1, df = 3), data = acl, 116 | family = uninormal(), model = TRUE) 117 | 118 | summary(mgam) 119 | 120 | 121 | ## ------------------------------------------------------------------------ 122 | 123 | coef(mgam) 124 | 125 | 126 | ## ------------------------------------------------------------------------ 127 | 128 | ## test parametric coefficient for sex 129 | linearHypothesis(mgam, "Sex(2) FEMALE", 130 | coef. = coef(mgam), vcov = vcov(mgam)) 131 | 132 | 133 | ## ------------------------------------------------------------------------ 134 | 135 | ## test parametric coefficient for 136 | ## intercept and sex simultaneously 137 | linearHypothesis(mgam, 138 | c("(Intercept):1", "Sex(2) FEMALE"), 139 | coef. = coef(mgam), vcov = vcov(mgam)) 140 | 141 | 142 | ## ----fgam-agegam1, fig.width=9, fig.height=5, out.width='1\\linewidth', fig.pos="!h", fig.cap = "Plot of model results for a generalized additive model with sex as a parametric term and age as a smooth spline.", warning=FALSE---- 143 | 144 | par(mfrow = c(1, 2)) 145 | plot(mgam, se = TRUE, 146 | lcol = viridis(4)[1], scol = viridis(4)[2]) 147 | 148 | 149 | ## ------------------------------------------------------------------------ 150 | 151 | mlin <- vglm(CESD11_W1 ~ Sex + AGE_W1, data = acl, 152 | family = uninormal(), model = TRUE) 153 | mquad <- vglm(CESD11_W1 ~ Sex + poly(AGE_W1, 2), data = acl, 154 | family = uninormal(), model = TRUE) 155 | 156 | 157 | ## ----fgam-agegam2, fig.width=9, fig.height=5, out.width='1\\linewidth', fig.pos="!h", fig.cap = "Two panel plot showing the predicted depression symptom level by age from a generalized additive model versus a linear fit on the left and a quadratic fit on the right.", warning=FALSE---- 158 | 159 | par(mfrow = c(1, 2)) 160 | plot(mgam, se = TRUE, which.term = 2, 161 | lcol = viridis(4)[1], scol = viridis(4)[1]) 162 | plot(as(mlin, "vgam"), se = TRUE, which.term = 2, 163 | lcol = viridis(4)[2], scol = viridis(4)[2], 164 | overlay = TRUE, add = TRUE) 165 | 166 | plot(mgam, se = TRUE, which.term = 2, 167 | lcol = viridis(4)[1], scol = viridis(4)[1]) 168 | plot(as(mquad, "vgam"), se = TRUE, which.term = 2, 169 | lcol = viridis(4)[3], scol = viridis(4)[3], 170 | overlay = TRUE, add = TRUE) 171 | 172 | 173 | ## ------------------------------------------------------------------------ 174 | 175 | mgam2 <- vgam(CESD11_W2 ~ Sex + 176 | s(CESD11_W1, df = 3) + 177 | s(AGE_W1, df = 3), data = acl, 178 | family = uninormal(), model = TRUE) 179 | 180 | summary(mgam2) 181 | 182 | 183 | ## ------------------------------------------------------------------------ 184 | 185 | mgam3 <- vgam(CESD11_W2 ~ Sex + 186 | s(CESD11_W1, df = 3) + 187 | AGE_W1, data = acl, 188 | family = uninormal(), model = TRUE) 189 | 190 | summary(mgam3) 191 | 192 | 193 | ## ------------------------------------------------------------------------ 194 | 195 | names(coef(mgam3)) 196 | 197 | linearHypothesis(mgam3, 198 | "Sex(2) FEMALE", 199 | coef. = coef(mgam3), vcov = vcov(mgam3)) 200 | 201 | linearHypothesis(mgam3, 202 | "AGE_W1", 203 | coef. = coef(mgam3), vcov = vcov(mgam3)) 204 | 205 | 206 | ## ----fgam-gam3, fig.width=10, fig.height=10, out.width='1\\linewidth', fig.pos="!h", fig.cap = "Plot of model results for a generalized additive model with sex and age as parametric terms and wave 1 depression symptoms as a smooth spline.", warning=FALSE---- 207 | 208 | par(mfrow = c(2, 2)) 209 | plot(mgam3, se = TRUE, 210 | lcol = viridis(4)[1], 211 | scol = viridis(4)[2]) 212 | 213 | 214 | ## ------------------------------------------------------------------------ 215 | 216 | 217 | ## generate new data for prediction 218 | ## use the whole range of sex and depression symptoms 219 | ## and a five number summary of age 220 | ## (min, 25th 50th 75th percentiles and max) 221 | newdat <- as.data.table(expand.grid( 222 | Sex = levels(acl$Sex), 223 | CESD11_W1 = seq( 224 | from = min(acl$CESD11_W1, na.rm=TRUE), 225 | to = max(acl$CESD11_W1, na.rm=TRUE), 226 | length.out = 1000), 227 | AGE_W1 = fivenum(acl$AGE_W1))) 228 | 229 | newdat$yhat <- predict(mgam3, newdata = newdat) 230 | 231 | 232 | ## ----fgam-gampred, fig.width=10, fig.height=6, out.width='1\\linewidth', fig.pos="!h", fig.cap = "Predicted depression symptoms at wave 2 across levels of wave 1 depression symptoms at varying ages and sex.", warning=FALSE---- 233 | 234 | ggplot(newdat, 235 | aes(CESD11_W1, yhat, 236 | colour = factor(AGE_W1), 237 | linetype = factor(AGE_W1))) + 238 | geom_line() + 239 | scale_color_viridis("Age", discrete = TRUE) + 240 | scale_linetype_discrete("Age") + 241 | facet_wrap(~ Sex) + 242 | theme(legend.position = c(.75, .2), 243 | legend.key.width = unit(1.5, "cm")) + 244 | xlab("Depression Symptoms (Wave 1)") + 245 | ylab("Depression Symptoms (Wave 2)") 246 | 247 | 248 | ## ------------------------------------------------------------------------ 249 | 250 | detach("package:VGAM") 251 | library(mgcv) 252 | 253 | 254 | ## ------------------------------------------------------------------------ 255 | 256 | mgam4 <- gam(CESD11_W2 ~ Sex + 257 | s(CESD11_W1, k = 3) + 258 | s(AGE_W1, k = 3), data = acl, 259 | family = gaussian()) 260 | 261 | 262 | ## ------------------------------------------------------------------------ 263 | 264 | summary(mgam4) 265 | 266 | 267 | ## ------------------------------------------------------------------------ 268 | 269 | mgam5 <- gam(CESD11_W2 ~ Sex + 270 | s(CESD11_W1, k = 4) + 271 | s(AGE_W1, k = 4), data = acl, 272 | family = gaussian()) 273 | 274 | summary(mgam5) 275 | 276 | 277 | ## ----fgam-gam5, fig.width=10, fig.height=10, out.width='1\\linewidth', fig.pos="!h", fig.cap = "Plot of model results for two generalized additive models varying the maximum flexibility of the smooth splines.", warning=FALSE---- 278 | 279 | par(mfrow = c(2, 2)) 280 | plot(mgam4, se = TRUE, scale = 0, main = "k = 3") 281 | plot(mgam5, se = TRUE, scale = 0, main = "k = 4") 282 | 283 | 284 | ## ----fgam-gam6, fig.width=10, fig.height=10, out.width='1\\linewidth', fig.pos="!h", fig.cap = "Plot of model results for a generalized additive models allowing splines to vary by sex.", warning=FALSE---- 285 | 286 | mgam6 <- gam(CESD11_W2 ~ Sex + 287 | s(CESD11_W1, k = 4, by = Sex) + 288 | s(AGE_W1, k = 4, by = Sex), 289 | data = acl, 290 | family = gaussian()) 291 | 292 | summary(mgam6) 293 | 294 | par(mfrow = c(2, 2)) 295 | plot(mgam6, ask = FALSE, scale = 0) 296 | 297 | 298 | ## ------------------------------------------------------------------------ 299 | 300 | AIC(mgam5, mgam6) 301 | BIC(mgam5, mgam6) 302 | 303 | 304 | ## ------------------------------------------------------------------------ 305 | 306 | mgam7 <- gam(CESD11_W2 ~ Sex + 307 | te(CESD11_W1, SelfEsteem_W1, k = 4^2), 308 | data = acl, 309 | family = gaussian()) 310 | 311 | summary(mgam7) 312 | 313 | 314 | ## ----fgam-persp3d, fig.width=10, fig.height=10, out.width='1\\linewidth', fig.pos="!h", fig.cap = "3D perspective plots showing the result of a tensor product smooth between depression symptoms and self-esteem at wave 1 predicting depression at wave 2.", warning=FALSE---- 315 | 316 | par(mfrow = c(2, 2), mar = c(.1, .1, .1, .1)) 317 | vis.gam(mgam7, 318 | view = c("CESD11_W1", "SelfEsteem_W1"), 319 | theta = 210, phi = 40, 320 | color = "topo", 321 | plot.type = "persp") 322 | vis.gam(mgam7, 323 | view = c("CESD11_W1", "SelfEsteem_W1"), 324 | theta = 150, phi = 40, 325 | color = "topo", 326 | plot.type = "persp") 327 | vis.gam(mgam7, 328 | view = c("CESD11_W1", "SelfEsteem_W1"), 329 | theta = 60, phi = 40, 330 | color = "topo", 331 | plot.type = "persp") 332 | vis.gam(mgam7, 333 | view = c("CESD11_W1", "SelfEsteem_W1"), 334 | theta = 10, phi = 40, 335 | color = "topo", 336 | plot.type = "persp") 337 | 338 | 339 | ## ----fgam-contour, fig.width=7, fig.height=7, out.width='0.6\\linewidth', fig.pos="!h", fig.cap = "Contour plot showing the result of a tensor product smooth between depression symptoms and self-esteem at wave 1 predicting depression at wave 2.", warning=FALSE---- 340 | 341 | par(mfrow = c(1, 1), mar = c(5.1, 4.1, 4.1, 2.1)) 342 | vis.gam(mgam7, 343 | view = c("CESD11_W1", "SelfEsteem_W1"), 344 | color = "topo", 345 | plot.type = "contour") 346 | 347 | 348 | ## ------------------------------------------------------------------------ 349 | 350 | mgam8 <- gam(CESD11_W2 ~ Sex + 351 | ti(CESD11_W1, k = 4) + 352 | ti(SelfEsteem_W1, k = 4) + 353 | ti(CESD11_W1, SelfEsteem_W1, k = 4^2), 354 | data = acl, 355 | family = gaussian()) 356 | 357 | summary(mgam8) 358 | 359 | 360 | ## ------------------------------------------------------------------------ 361 | 362 | mgam5 <- gam(CESD11_W2 ~ Sex + 363 | s(CESD11_W1, k = 4) + 364 | s(AGE_W1, k = 4), data = acl, 365 | family = gaussian()) 366 | 367 | summary(mgam5) 368 | 369 | 370 | ## ----fgam-checkit, fig.width=7, fig.height=7, out.width='0.7\\linewidth', fig.pos="!h", fig.cap = "Diagnostics plots from generalized additive model.", warning=FALSE---- 371 | 372 | par(mfrow = c(2, 2)) 373 | set.seed(12345) 374 | gam.check(mgam5) 375 | 376 | 377 | ## ------------------------------------------------------------------------ 378 | 379 | mgam5b <- gam(CESD11_W2 ~ Sex + 380 | s(CESD11_W1, k = 20) + 381 | s(AGE_W1, k = 20), data = acl, 382 | family = gaussian()) 383 | 384 | summary(mgam5b) 385 | 386 | 387 | ## ----fgam-gam5b, fig.width=10, fig.height=6, out.width='1\\linewidth', fig.pos="!h", fig.cap = "Plot of model results for generalized additive model after increasing k for depression symptoms."---- 388 | 389 | par(mfrow = c(1, 2)) 390 | plot(mgam5b, se = TRUE, scale = 0) 391 | 392 | 393 | ## ------------------------------------------------------------------------ 394 | 395 | library(VGAM) 396 | acl$CurSmoke <- as.integer(acl$Smoke_W1 == "(1) Cur Smok") 397 | 398 | mgam.lr1 <- vgam(CurSmoke ~ s(AGE_W1, df = 3), 399 | family = binomialff(link = "logit"), 400 | data = acl, model = TRUE) 401 | 402 | summary(mgam.lr1) 403 | 404 | 405 | ## ----fgam-lr1, fig.width=7, fig.height=7, out.width='0.7\\linewidth', fig.pos="!h", fig.cap = "Generalized additive model for age and current smoking status.", warning = FALSE---- 406 | 407 | par(mfrow = c(1, 1)) 408 | plot(mgam.lr1, se = TRUE, 409 | lcol = viridis(4)[1], 410 | scol = viridis(4)[2]) 411 | 412 | 413 | ## ------------------------------------------------------------------------ 414 | 415 | ## generate new data for prediction 416 | ## use the whole range of age 417 | newdat <- as.data.table(expand.grid( 418 | AGE_W1 = seq( 419 | from = min(acl$AGE_W1, na.rm=TRUE), 420 | to = max(acl$AGE_W1, na.rm=TRUE), 421 | length.out = 1000))) 422 | 423 | newdat$yhat <- predict(mgam.lr1, 424 | newdata = newdat, 425 | type = "response") 426 | 427 | 428 | ## ----fgam-lr1pred, fig.width=6, fig.height=5, out.width='1\\linewidth', fig.pos="!h", fig.cap = "Predicted probability of smoking across ages."---- 429 | 430 | ggplot(newdat, aes(AGE_W1, yhat)) + 431 | geom_line() + 432 | scale_y_continuous(labels = percent) + 433 | xlab("Age (years)") + 434 | ylab("Probability of Smoking") + 435 | coord_cartesian(xlim = range(acl$AGE_W1), 436 | ylim = c(0, .4), 437 | expand = FALSE) 438 | 439 | 440 | ## ------------------------------------------------------------------------ 441 | 442 | nboot <- 500 443 | 444 | out <- matrix(NA_real_, ncol = nboot, nrow = nrow(newdat)) 445 | 446 | start.time <- proc.time() 447 | set.seed(12345) 448 | for (i in 1:500) { 449 | tmp <- vgam(CurSmoke ~ s(AGE_W1, df = 3), 450 | family = binomialff(link = "logit"), 451 | data = acl[sample(nrow(acl), replace = TRUE)], model = TRUE) 452 | out[, i] <- predict(tmp, 453 | newdata = newdat, 454 | type = "response") 455 | } 456 | stop.time <- proc.time() 457 | 458 | ## time to bootstrap 500 times 459 | stop.time - start.time 460 | 461 | 462 | ## ------------------------------------------------------------------------ 463 | 464 | mean(abs(newdat$yhat - rowMeans(out))) 465 | 466 | 467 | ## ------------------------------------------------------------------------ 468 | 469 | newdat$LL <- apply(out, 1, quantile, 470 | probs = .025, na.rm = TRUE) 471 | 472 | newdat$UL <- apply(out, 1, quantile, 473 | probs = .975, na.rm = TRUE) 474 | 475 | 476 | ## ----fgam-lr1predboot, fig.width=6, fig.height=5, out.width='1\\linewidth', fig.pos="!h", fig.cap = "Predicted probability of smoking across ages."---- 477 | 478 | ggplot(newdat, aes(AGE_W1, yhat)) + 479 | geom_ribbon(aes(ymin = LL, ymax = UL), fill = "grey80") + 480 | geom_line(size = 2) + 481 | scale_y_continuous(labels = percent) + 482 | xlab("Age (years)") + 483 | ylab("Probability of Smoking") + 484 | theme_tufte() + 485 | coord_cartesian(xlim = range(acl$AGE_W1), 486 | ylim = c(0, .5), 487 | expand = FALSE) 488 | 489 | 490 | ## ------------------------------------------------------------------------ 491 | 492 | acl[, EmployG_W2 := as.character(Employment_W2)] 493 | acl[EmployG_W2 %in% c( 494 | "(2) 2500+HRS", "(3) 15002499", 495 | "(4) 500-1499", "(5) 1-499HRS"), 496 | EmployG_W2 := "(2) EMPLOYED"] 497 | acl[, EmployG_W2 := factor(EmployG_W2)] 498 | 499 | mgam.mr1 <- vgam(EmployG_W2 ~ s(AGE_W1, k = 5), 500 | family = multinomial(), 501 | data = acl, model = TRUE) 502 | 503 | summary(mgam.mr1) 504 | 505 | 506 | ## ----fgam-mr1, fig.width=10, fig.height=10, out.width='1\\linewidth', fig.pos="!h", fig.cap = "Generalized additive model for age and employment status as a 5 level unordered categorical outcome resulting in four distinct effects of age."---- 507 | 508 | par(mfrow = c(2, 2)) 509 | plot(mgam.mr1, se = TRUE, 510 | lcol = viridis(4)[1], 511 | scol = viridis(4)[2]) 512 | 513 | 514 | ## ------------------------------------------------------------------------ 515 | 516 | ## generate new data for prediction 517 | ## use the whole range of age 518 | newdat <- as.data.table(expand.grid( 519 | AGE_W1 = seq( 520 | from = min(acl$AGE_W1, na.rm=TRUE), 521 | to = max(acl$AGE_W1, na.rm=TRUE), 522 | length.out = 1000))) 523 | 524 | newdat <- cbind(newdat, predict(mgam.mr1, 525 | newdata = newdat, 526 | type = "response")) 527 | 528 | newdatlong <- melt(newdat, id.vars = "AGE_W1") 529 | 530 | summary(newdatlong) 531 | 532 | 533 | ## ----fgam-mr1pred, fig.width=8, fig.height=7, out.width='.8\\linewidth', fig.pos="!h", fig.cap = "Predicted probability of employment status across ages."---- 534 | 535 | ggplot(newdatlong, aes( 536 | AGE_W1, value, 537 | colour = variable, linetype = variable)) + 538 | geom_line(size = 2) + 539 | scale_color_viridis(discrete = TRUE) + 540 | scale_x_continuous("Age (years)") + 541 | scale_y_continuous("Probability", label = percent) + 542 | coord_cartesian(ylim = c(0, 1), expand = FALSE) + 543 | theme_tufte() + 544 | theme(legend.position = c(.2, .5), 545 | legend.key.width = unit(2, "cm")) 546 | 547 | 548 | ## ------------------------------------------------------------------------ 549 | 550 | ## negative binomial regression model 551 | mgam.nbr1 <- vgam(NChronic12_W2 ~ Sex + s(AGE_W1, k = 5), 552 | family = negbinomial(), 553 | data = acl, model = TRUE) 554 | 555 | summary(mgam.nbr1) 556 | 557 | 558 | ## ----fgam-nbr1, fig.width=10, fig.height=6, out.width='1\\linewidth', fig.pos="!h", fig.cap = "Generalized additive model for sex an age and number of chronic conditions.", warning=FALSE---- 559 | 560 | par(mfrow = c(1, 2)) 561 | plot(mgam.nbr1, se = TRUE, 562 | lcol = viridis(4)[1], 563 | scol = viridis(4)[2]) 564 | 565 | 566 | ## ------------------------------------------------------------------------ 567 | 568 | ## generate new data for prediction 569 | ## use the whole range of age and sex 570 | newdat <- as.data.table(expand.grid( 571 | Sex = levels(acl$Sex), 572 | AGE_W1 = seq( 573 | from = min(acl$AGE_W1, na.rm=TRUE), 574 | to = max(acl$AGE_W1, na.rm=TRUE), 575 | length.out = 1000))) 576 | 577 | newdat$yhat <- predict(mgam.nbr1, 578 | newdata = newdat, 579 | type = "response") 580 | 581 | 582 | ## ----fgam-nbr1pred, fig.width=6, fig.height=5, out.width='.7\\linewidth', fig.pos="!h", fig.cap = "Predicted number of chronic conditions across ages by sex."---- 583 | 584 | ggplot(newdat, aes(AGE_W1, yhat, colour = Sex)) + 585 | geom_line(size = 2) + 586 | scale_color_viridis(discrete = TRUE) + 587 | xlab("Age (years)") + 588 | ylab("Number Chronic Conditions") + 589 | theme_tufte() + 590 | coord_cartesian(xlim = range(acl$AGE_W1), 591 | ylim = c(0, 2.5), 592 | expand = FALSE) + 593 | theme(legend.position = c(.2, .8), 594 | legend.key.width = unit(1, "cm")) 595 | 596 | 597 | ## ------------------------------------------------------------------------ 598 | 599 | detach("package:VGAM") 600 | library(mgcv) 601 | 602 | mgam.nbr2 <- gam(NChronic12_W2 ~ Sex + s(AGE_W1, k = 10, by = Sex), 603 | family = nb(), data = acl) 604 | 605 | summary(mgam.nbr2) 606 | 607 | 608 | ## ------------------------------------------------------------------------ 609 | 610 | ## generate new data for prediction 611 | ## use the whole range of age and sex 612 | newdat <- as.data.table(expand.grid( 613 | Sex = levels(acl$Sex), 614 | AGE_W1 = seq( 615 | from = min(acl$AGE_W1, na.rm=TRUE), 616 | to = max(acl$AGE_W1, na.rm=TRUE), 617 | length.out = 1000))) 618 | 619 | newdat$yhat <- predict(mgam.nbr2, 620 | newdata = newdat, 621 | type = "response") 622 | 623 | 624 | ## ----fgam-nbr2pred, fig.width=6, fig.height=5, out.width='.7\\linewidth', fig.pos="!h", fig.cap = "Predicted number of chronic conditions across ages by sex from an interaction model."---- 625 | 626 | ggplot(newdat, aes(AGE_W1, yhat, colour = Sex)) + 627 | geom_line(size = 2) + 628 | scale_color_viridis(discrete = TRUE) + 629 | xlab("Age (years)") + 630 | ylab("Number Chronic Conditions") + 631 | theme_tufte() + 632 | coord_cartesian(xlim = range(acl$AGE_W1), 633 | ylim = c(0, 2.7), 634 | expand = FALSE) + 635 | theme(legend.position = c(.2, .8), 636 | legend.key.width = unit(1, "cm")) 637 | 638 | 639 | ## ------------------------------------------------------------------------ 640 | 641 | nboot <- 500 642 | 643 | out <- matrix(NA_real_, ncol = nboot, nrow = nrow(newdat)) 644 | 645 | start.time <- proc.time() 646 | set.seed(12345) 647 | for (i in 1:500) { 648 | tmp <- gam(NChronic12_W2 ~ Sex + s(AGE_W1, k = 10, by = Sex), 649 | family = nb(), 650 | data = acl[sample(nrow(acl), replace = TRUE)]) 651 | out[, i] <- predict(tmp, 652 | newdata = newdat, 653 | type = "response") 654 | } 655 | stop.time <- proc.time() 656 | 657 | ## time to bootstrap 500 times 658 | stop.time - start.time 659 | 660 | 661 | ## ------------------------------------------------------------------------ 662 | 663 | mean(abs(newdat$yhat - rowMeans(out))) 664 | 665 | 666 | ## ------------------------------------------------------------------------ 667 | 668 | newdat$LL <- apply(out, 1, quantile, 669 | probs = .025, na.rm = TRUE) 670 | 671 | newdat$UL <- apply(out, 1, quantile, 672 | probs = .975, na.rm = TRUE) 673 | 674 | 675 | ## ----fgam-nbr1predboot, fig.width=7, fig.height=6, out.width='1\\linewidth', fig.pos="!h", fig.cap = "Predicted count of chronic conditions across ages by sex with bootstrapped confidence intervals."---- 676 | 677 | ggplot(newdat, aes(AGE_W1, yhat)) + 678 | geom_ribbon(aes(ymin = LL, ymax = UL, fill = Sex), alpha = .2) + 679 | geom_line(aes(colour = Sex), size = 2) + 680 | scale_color_viridis(discrete = TRUE) + 681 | scale_fill_viridis(discrete = TRUE) + 682 | xlab("Age (years)") + 683 | ylab("Number Chronic Conditions") + 684 | theme_tufte() + 685 | coord_cartesian(xlim = range(acl$AGE_W1), 686 | ylim = c(0, 4), 687 | expand = FALSE) + 688 | theme(legend.position = c(.2, .8), 689 | legend.key.width = unit(2, "cm")) 690 | 691 | 692 | ## ------------------------------------------------------------------------ 693 | 694 | xtabs(~Sex + I(AGE_W1 > 80), data = acl[!is.na(NChronic12_W2)]) 695 | 696 | 697 | -------------------------------------------------------------------------------- /glm1.R: -------------------------------------------------------------------------------- 1 | ## ----setup, include=TRUE, results='hide', message = FALSE, warning = FALSE, cache=FALSE---- 2 | library(checkpoint) 3 | checkpoint("2018-09-28", R.version = "3.5.1", 4 | project = book_directory, 5 | checkpointLocation = checkpoint_directory, 6 | scanForPackages = FALSE, 7 | scan.rnw.with.knitr = TRUE, use.knitr = TRUE) 8 | 9 | library(knitr) 10 | library(data.table) 11 | library(ggplot2) 12 | library(visreg) 13 | library(ez) 14 | library(emmeans) 15 | library(rms) 16 | library(ipw) 17 | library(JWileymisc) 18 | library(RcppEigen) 19 | library(texreg) 20 | 21 | options( 22 | width = 70, 23 | stringsAsFactors = FALSE, 24 | datatable.print.nrows = 20, 25 | datatable.print.topn = 3, 26 | digits = 2) 27 | 28 | 29 | ## ------------------------------------------------------------------------ 30 | f1 <- y ~ x1 + x2 + x1:x2 31 | 32 | update(f1, . ~ .) 33 | 34 | update(f1, w ~ .) 35 | 36 | update(f1, . ~ . + x3) 37 | 38 | update(f1, . ~ . - x1:x2) 39 | 40 | 41 | ## ------------------------------------------------------------------------ 42 | 43 | set.seed(1234) 44 | example <- data.table( 45 | y = rnorm(9), 46 | Condition = factor(rep(c("A", "B", "Control"), each = 3), 47 | levels = c("Control", "A", "B"))) 48 | 49 | coef(lm(y ~ Condition, data = example)) 50 | 51 | 52 | ## ------------------------------------------------------------------------ 53 | 54 | example[, .(M = mean(y)), by = Condition] 55 | 56 | 57 | ## ----include=FALSE------------------------------------------------------- 58 | 59 | printq <- sprintf("$%0.2f = %0.2f - %0.2f$", 60 | coef(lm(y ~ Condition, data = example))[["ConditionA"]], 61 | mean(example[Condition == "A", y]), 62 | mean(example[Condition == "Control", y])) 63 | 64 | 65 | ## ------------------------------------------------------------------------ 66 | 67 | model.matrix(~ 0 + Condition, data = example) 68 | 69 | 70 | ## ------------------------------------------------------------------------ 71 | 72 | coef(lm(y ~ 0 + Condition, data = example)) 73 | 74 | 75 | ## ------------------------------------------------------------------------ 76 | 77 | pf(.72, df1 = 1, df2 = 6, lower.tail = FALSE) 78 | 79 | 80 | ## ------------------------------------------------------------------------ 81 | 82 | mtcars <- as.data.table(mtcars) 83 | mtcars[, ID := factor(1:.N)] 84 | mtcars[, vs := factor(vs)] 85 | mtcars[, am := factor(am)] 86 | 87 | head(model.matrix(~ vs * am, data = mtcars)) 88 | 89 | 90 | ## ------------------------------------------------------------------------ 91 | 92 | example[, ID := factor(1:.N)] 93 | 94 | print(ezANOVA( 95 | data = example, 96 | dv = y, 97 | wid = ID, 98 | between = Condition, 99 | type = 3, 100 | detailed = TRUE)) 101 | 102 | 103 | ## ------------------------------------------------------------------------ 104 | 105 | print(ezANOVA( 106 | data = mtcars, 107 | dv = mpg, 108 | wid = ID, 109 | between = vs * am, 110 | type = 3, 111 | detailed = TRUE)) 112 | 113 | 114 | ## ----fglm1-tukeyhsd, fig.width=4, fig.height=4, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = "Graph of cell means with confidence intervals. Cells that share letters are not statistically significantly different based on Tukey's Honestly Significant Difference."---- 115 | 116 | mtcars[, Cells := factor(sprintf("vs=%s, am=%s", vs, am))] 117 | TukeyHSDgg("Cells", "hp", mtcars) + 118 | theme(axis.text.x = element_text(angle=45, hjust=1, vjust=1)) + 119 | xlab("") 120 | 121 | 122 | ## ----fglm1-density, fig.width=5, fig.height=5, out.width='.5\\linewidth', fig.pos="!h", fig.cap = "Density plot of satisfaction with life (black line) with normal density overlayed (blue line)."---- 123 | 124 | acl <- readRDS("advancedr_acl_data.RDS") 125 | 126 | testdistr(acl$SWL_W1, "normal", 127 | varlab = "Satisfaction with Life", plot = FALSE, 128 | extremevalues = "theoretical", 129 | adjust = 2)$DensityPlot 130 | 131 | 132 | ## ------------------------------------------------------------------------ 133 | 134 | m.ols <- ols(SWL_W1 ~ Sex + AGE_W1 + SESCategory, data = acl, x = TRUE) 135 | m.ols 136 | 137 | 138 | ## ----echo=TRUE, eval=FALSE----------------------------------------------- 139 | ## 140 | ## texreg(m.ols, single.row = TRUE, label = "tglm1-olstex") 141 | ## 142 | 143 | ## ----echo=FALSE, results='asis', listings=FALSE-------------------------- 144 | 145 | texreg(m.ols, single.row = TRUE, label = "tglm1-olstex", float.pos = "!hb") 146 | 147 | 148 | ## ------------------------------------------------------------------------ 149 | 150 | vif(m.ols) 151 | 152 | 153 | ## ----fglm1-quantreg, fig.width=5, fig.height=5, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = c("Graphing residuals to assess normality.", "Examining residuals versus fitted values with quantile regression to explore heteroskedasticity.")---- 154 | 155 | diagnostic.data <- data.table( 156 | fitted = fitted(m.ols), 157 | resid = residuals(m.ols)) 158 | 159 | testdistr(diagnostic.data$resid, 160 | "normal", 161 | varlab = "Satisfaction with Life Residuals", plot = FALSE, 162 | extremevalues = "theoretical", 163 | adjust = 2)$DensityPlot 164 | 165 | ggplot(diagnostic.data, aes(fitted, resid)) + 166 | geom_point(alpha = .2, colour = "grey50") + 167 | geom_quantile(quantiles = .5, colour = 'black', size = 1) + 168 | geom_quantile(quantiles = c(.25, .75), 169 | colour = 'blue', linetype = 2, size = 1) + 170 | geom_quantile(quantiles = c(.05, .95), 171 | colour = 'black', linetype = 3, size = 1) 172 | 173 | 174 | ## ------------------------------------------------------------------------ 175 | 176 | m.glm <- glm(SWL_W1 ~ Sex + AGE_W1 + SESCategory, 177 | data=acl, family = gaussian(link="identity")) 178 | m.glm 179 | 180 | summary(m.glm) 181 | 182 | 183 | ## ------------------------------------------------------------------------ 184 | 185 | (m.ols2 <- update(m.ols, . ~ . + Employment_W1)) 186 | 187 | 188 | ## ------------------------------------------------------------------------ 189 | 190 | anova(m.ols2) 191 | 192 | 193 | ## ------------------------------------------------------------------------ 194 | 195 | (m.ols3 <- update(m.ols2, . ~ . + AGE_W1 * SESCategory - Sex)) 196 | 197 | 198 | ## ----fglm1-visreg1, fig.width=6, fig.height=6, out.width='.6\\linewidth', fig.pos="!h", fig.cap = "Estimated satisfaction with life across age by SES category. Shaded region indicates 95\\% confidence intervals for regression estimates."---- 199 | 200 | plot(visreg(m.ols3, xvar = "AGE_W1", by = "SESCategory", 201 | plot = FALSE), 202 | overlay = TRUE, partial = FALSE, rug = FALSE, 203 | xlab = "Age (years)", ylab = "Predicted Life Satisfaction", 204 | line = list(lty = 1:4)) 205 | 206 | 207 | ## ----fglm1-visreg2, fig.width=6, fig.height=6, out.width='.6\\linewidth', fig.pos="!h", fig.cap = "Estimated satisfaction with life across age by SES category. Confidence intervals removed."---- 208 | 209 | plot(visreg(m.ols3, xvar = "AGE_W1", by = "SESCategory", 210 | plot = FALSE), 211 | overlay = TRUE, partial = FALSE, rug = FALSE, 212 | xlab = "Age (years)", ylab = "Predicted Life Satisfaction", 213 | line = list( 214 | lty = 1:4, 215 | col = c("black", "grey75", "grey50", "grey25")), 216 | band = FALSE) 217 | 218 | 219 | ## ------------------------------------------------------------------------ 220 | 221 | newdata <- as.data.table(expand.grid( 222 | AGE_W1=quantile(acl$AGE_W1, .1):quantile(acl$AGE_W1, .9), 223 | SESCategory = factor(1:4, levels = levels(acl$SESCategory)), 224 | Employment_W1 = factor("(3) 15002499", 225 | levels = levels(acl$Employment_W1)))) 226 | newdata 227 | 228 | 229 | ## ------------------------------------------------------------------------ 230 | 231 | newdata[, c("SWL_W1", "SE") := 232 | predict(m.ols3, newdata = newdata, se.fit = TRUE)] 233 | newdata 234 | 235 | 236 | ## ------------------------------------------------------------------------ 237 | 238 | print(qnorm(.05/2), digits = 7) 239 | 240 | print(qnorm(1 - (.05/2)), digits = 7) 241 | 242 | 243 | ## ----fglm1-intplot, fig.width=8, fig.height=5, out.width='.7\\linewidth', fig.pos="!h", fig.cap = "Estimated satisfaction with life across age by SES category. Shaded region indicates 95\\% confidence intervals for regression estimates."---- 244 | 245 | ggplot(newdata, aes(AGE_W1, SWL_W1, linetype=SESCategory)) + 246 | geom_ribbon(aes(ymin = SWL_W1 + SE * qnorm(.025), 247 | ymax = SWL_W1 + SE * qnorm(.975)), 248 | alpha = .2) + 249 | geom_line(size = 1) + 250 | scale_x_continuous("Age (years)") + 251 | ylab("Satisfaction with Life") + 252 | theme_cowplot() + 253 | theme( 254 | legend.position = c(.8, .16), 255 | legend.key.width = unit(2, "cm")) 256 | 257 | 258 | ## ------------------------------------------------------------------------ 259 | 260 | confint(m.ols3) 261 | 262 | 263 | ## ------------------------------------------------------------------------ 264 | 265 | tmpdat <- na.omit(acl[, .(SWL_W1, AGE_W1, SESCategory, Employment_W1)]) 266 | ## use if using Microsoft R Open with Intel's MKL linear algebra library 267 | setMKLthreads(1) 268 | 269 | 270 | ## ------------------------------------------------------------------------ 271 | 272 | set.seed(12345) 273 | t1 <- system.time(ols.boot <- sapply(1:500, function(i) { 274 | index <- sample(nrow(tmpdat), 275 | size = nrow(tmpdat), replace = TRUE) 276 | coef(ols(SWL_W1 ~ AGE_W1 * SESCategory + Employment_W1, 277 | data = tmpdat[index])) 278 | })) 279 | 280 | t1 281 | 282 | 283 | ## ------------------------------------------------------------------------ 284 | 285 | set.seed(12345) 286 | t2 <- system.time(rcpp.boot1 <- sapply(1:500, function(i) { 287 | index <- sample(nrow(tmpdat), size = nrow(tmpdat), replace = TRUE) 288 | coef(fastLm(SWL_W1 ~ AGE_W1 * SESCategory + Employment_W1, data = tmpdat[index])) 289 | })) 290 | 291 | t2 292 | 293 | 294 | ## ------------------------------------------------------------------------ 295 | 296 | set.seed(12345) 297 | t3 <- system.time({ 298 | y <- tmpdat[, SWL_W1] 299 | X <- model.matrix(~ AGE_W1 * SESCategory + Employment_W1, data = tmpdat) 300 | N <- nrow(tmpdat) 301 | rcpp.boot2 <- sapply(1:500, function(i) { 302 | index <- sample.int(N, size = N, replace = TRUE) 303 | fastLmPure(X = X[index, ], y = y[index])$coefficients 304 | }) 305 | }) 306 | 307 | t3 308 | 309 | 310 | ## ------------------------------------------------------------------------ 311 | 312 | set.seed(12345) 313 | t4 <- system.time({ 314 | y <- tmpdat[, SWL_W1] 315 | X <- model.matrix(~ AGE_W1 * SESCategory + Employment_W1, data = tmpdat) 316 | N <- nrow(tmpdat) 317 | rcpp.boot3 <- sapply(1:10000, function(i) { 318 | index <- sample.int(N, size = N, replace = TRUE) 319 | fastLmPure(X = X[index, ], y = y[index])$coefficients 320 | }) 321 | }) 322 | 323 | t4 324 | 325 | 326 | ## ------------------------------------------------------------------------ 327 | 328 | all.equal(ols.boot, rcpp.boot1, check.attributes = FALSE) 329 | 330 | all.equal(ols.boot, rcpp.boot2, check.attributes = FALSE) 331 | 332 | 333 | ## ------------------------------------------------------------------------ 334 | m0 <- ols(CESD11_W2 ~ SelfEfficacy_W1, data = acl) 335 | 336 | ## ----echo=TRUE, eval=FALSE----------------------------------------------- 337 | ## texreg(m0, label = "tglm1-olsunadj") 338 | 339 | ## ----echo=FALSE, results='asis', listings=FALSE-------------------------- 340 | texreg(m0, label = "tglm1-olsunadj", float.pos = "!hb") 341 | 342 | ## ----fglm1-weights, fig.width=7, fig.height=8, out.width='.8\\linewidth', fig.pos="!h", fig.cap = "Raw and trimmed inverse probability weights for self-efficacy."---- 343 | 344 | ## weights 345 | w <- ipwpoint( 346 | exposure = SelfEfficacy_W1, 347 | family = "gaussian", 348 | numerator = ~ 1, 349 | denominator = ~ 1 + Sex + RaceEthnicity + AGE_W1 + NChronic12_W1, 350 | data = acl) 351 | 352 | plot_grid( 353 | testdistr(w$ipw.weights, plot = FALSE)$DensityPlot, 354 | testdistr(winsorizor(w$ipw.weights, .01), 355 | plot = FALSE)$DensityPlot, 356 | ncol = 1) 357 | 358 | 359 | ## ----results = 'asis'---------------------------------------------------- 360 | 361 | ## unweighted, unadjusted 362 | m0 <- ols(CESD11_W2 ~ SelfEfficacy_W1, data = acl) 363 | 364 | ## weighted, adjusted 365 | m1 <- ols(CESD11_W2 ~ SelfEfficacy_W1, data = acl, 366 | weights = winsorizor(w$ipw.weights, .01)) 367 | 368 | ## ----echo=TRUE, eval=FALSE----------------------------------------------- 369 | ## texreg(list(m0, m1), 370 | ## label = "tglm1-weight1") 371 | ## 372 | 373 | ## ----echo=FALSE, results='asis', listings=FALSE-------------------------- 374 | texreg(list(m0, m1), 375 | label = "tglm1-weight1", float.pos = "!hb") 376 | 377 | ## ----results = 'asis'---------------------------------------------------- 378 | 379 | # weighted, fully adjusted 380 | w2 <- ipwpoint( 381 | exposure = SelfEfficacy_W1, 382 | family = "gaussian", 383 | numerator = ~ 1, 384 | denominator = ~ 1 + Sex + RaceEthnicity + AGE_W1 + NChronic12_W1 + 385 | SESCategory + Employment_W1 + BMI_W1 + Smoke_W1 + PhysActCat_W1, 386 | data = acl) 387 | 388 | m2 <- ols(CESD11_W2 ~ SelfEfficacy_W1, data = acl, 389 | weights = winsorizor(w2$ipw.weights, .01)) 390 | 391 | ## ----echo=TRUE, eval=FALSE----------------------------------------------- 392 | ## texreg(list(m0, m1, m2), 393 | ## label = "tglm1-weight2") 394 | ## 395 | 396 | ## ----echo=FALSE, results='asis', listings=FALSE-------------------------- 397 | texreg(list(m0, m1, m2), 398 | label = "tglm1-weight2", float.pos = "!hb") 399 | 400 | ## ------------------------------------------------------------------------ 401 | 402 | m1b <- ols(CESD11_W2 ~ Sex + RaceEthnicity + AGE_W1 + 403 | NChronic12_W1 + SelfEfficacy_W1, 404 | data = acl) 405 | 406 | m2b <- ols(CESD11_W2 ~ Sex + RaceEthnicity + AGE_W1 + 407 | NChronic12_W1 + SESCategory + 408 | Employment_W1 + BMI_W1 + Smoke_W1 + PhysActCat_W1 + 409 | SelfEfficacy_W1, data = acl) 410 | 411 | 412 | ## ------------------------------------------------------------------------ 413 | 414 | m1c <- ols(CESD11_W2 ~ Sex + RaceEthnicity + AGE_W1 + 415 | NChronic12_W1 + SelfEfficacy_W1, 416 | data = acl, 417 | weights = winsorizor(w$ipw.weights, .01)) 418 | 419 | m2c <- ols(CESD11_W2 ~ Sex + RaceEthnicity + AGE_W1 + 420 | NChronic12_W1 + SESCategory + 421 | Employment_W1 + BMI_W1 + Smoke_W1 + PhysActCat_W1 + 422 | SelfEfficacy_W1, data = acl, 423 | weights = winsorizor(w2$ipw.weights, .01)) 424 | 425 | 426 | ## ----fglm1-adjustcompare, fig.width=6, fig.height=6, out.width='.6\\linewidth', fig.pos="!h", fig.cap = "Comparison of the estimate and confidence interval for the association of self-efficacy with depression symptoms from various models. Covs = covariate adjusted models. IPW = inverse probability weight adjusted models. Covs + IPW = models that both include inverse probability weights and the same potential confounds explicitly in the model again."---- 427 | 428 | ## write an extract function 429 | extractor <- function(obj, label) { 430 | b <- coef(obj) 431 | ci <- confint(obj) 432 | data.table( 433 | Type = label, 434 | B = b[["SelfEfficacy_W1"]], 435 | LL = ci["SelfEfficacy_W1", "2.5 %"], 436 | UL = ci["SelfEfficacy_W1", "97.5 %"]) 437 | } 438 | 439 | allresults <- rbind( 440 | extractor(m0, "M0: Unadjusted"), 441 | extractor(m1, "M1: Partial IPW"), 442 | extractor(m1b, "M1: Partial Covs"), 443 | extractor(m1c, "M1: Partial Covs + IPW"), 444 | extractor(m2, "M2: Full IPW"), 445 | extractor(m2b, "M2: Full Covs"), 446 | extractor(m2c, "M2: Full Covs + IPW")) 447 | allresults[, Type := factor(Type, levels = Type)] 448 | 449 | 450 | ggplot(allresults, aes(Type, y = B, ymin = LL, ymax = UL)) + 451 | geom_pointrange() + 452 | coord_flip() + 453 | xlab("") + ylab("Estimate + 95% CI") 454 | 455 | 456 | ## ------------------------------------------------------------------------ 457 | 458 | set.seed(12345) 459 | adosleep <- data.table( 460 | SOLacti = rnorm(150, 4.4, 1.3)^2, 461 | DBAS = rnorm(150, 72, 26), 462 | DAS = rnorm(150, 125, 32), 463 | Female = rbinom(150, 1, .53), 464 | Stress = rnorm(150, 32, 11)) 465 | adosleep[, SSQ := rnorm(150, 466 | (.36 * 3 / 12.5) * SOLacti + 467 | (.16 * 3 / 26) * DBAS + 468 | (.18 * 3 / .5) * Female + 469 | (.20 * 3 / 11) * Stress, 2.6)] 470 | adosleep[, MOOD := rnorm(150, 471 | (-.07 / 12.5) * SOLacti + 472 | (.29 / 3) * SSQ + 473 | (.14 / 26) * DBAS + 474 | (.21 / 32) * DAS + 475 | (.12 / 32) * SSQ * (DAS-50) + 476 | (.44 / .5) * Female + 477 | (.28 / 11) * Stress, 2)] 478 | adosleep[, Female := factor(Female, levels = 0:1, 479 | labels = c("Males", "Females"))] 480 | 481 | 482 | ## ----include=FALSE------------------------------------------------------- 483 | ## ols(scale(MOOD) ~ scale(SOLacti) + scale(SSQ) + scale(DBAS) + scale(DAS) + scale(SSQ):scale(DAS) + scale(Female) + scale(Stress), data = adosleep) 484 | ## ols(MOOD ~ SOLacti + SSQ + DBAS + DAS + SSQ:DAS + Female + Stress, data = adosleep) 485 | 486 | ## ----fglm1-case1, fig.width=8, fig.height=8, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "Distributions of case study variables."---- 487 | 488 | plot_grid( 489 | testdistr(adosleep$MOOD, extremevalues = "theoretical", 490 | plot=FALSE, varlab = "MOOD")$Density, 491 | testdistr(adosleep$SSQ, extremevalues = "theoretical", 492 | plot=FALSE, varlab = "SSQ")$Density, 493 | testdistr(adosleep$SOLacti, extremevalues = "theoretical", 494 | plot=FALSE, varlab = "SOLacti")$Density, 495 | testdistr(adosleep$DAS, extremevalues = "theoretical", 496 | plot=FALSE, varlab = "DAS")$Density, 497 | ncol = 2) 498 | 499 | 500 | ## ----fglm1-case2, fig.width=6, fig.height=4.5, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "Heatmap of the correlations between study variables."---- 501 | 502 | plot(SEMSummary( 503 | ~ MOOD + SOLacti + DBAS + DAS + Stress + SSQ, 504 | data = adosleep), plot = "cor") + 505 | theme(axis.text.x = element_text( 506 | angle = 45, hjust = 1, vjust = 1)) 507 | 508 | 509 | ## ------------------------------------------------------------------------ 510 | 511 | egltable(c("SOLacti", "SSQ", "MOOD", "Stress", 512 | "DBAS", "DAS", "Female"), 513 | data = as.data.frame(adosleep)) 514 | 515 | 516 | ## ------------------------------------------------------------------------ 517 | 518 | adosleep[, zMOOD := as.vector(scale(MOOD))] 519 | adosleep[, zDBAS := as.vector(scale(DBAS))] 520 | adosleep[, zDAS := as.vector(scale(DAS))] 521 | adosleep[, zSSQ := as.vector(scale(SSQ))] 522 | adosleep[, zSOLacti := as.vector(scale(SOLacti))] 523 | adosleep[, zStress := as.vector(scale(Stress))] 524 | 525 | 526 | ## ------------------------------------------------------------------------ 527 | 528 | m.adosleep1 <- ols(zMOOD ~ zSOLacti + zDBAS + Female + zStress, 529 | data = adosleep) 530 | m.adosleep2 <- update(m.adosleep1, . ~ . + zSSQ + zDAS) 531 | m.adosleep3 <- update(m.adosleep2, . ~ . + zSSQ:zDAS) 532 | 533 | screenreg(list(m.adosleep1, m.adosleep2, m.adosleep3)) 534 | 535 | 536 | ## ----fglm1-case-resids, fig.width=5, fig.height=5, out.width='.5\\linewidth', fig.pos="!h", fig.cap = "Distribution of model residuals."---- 537 | 538 | vif(m.adosleep3) 539 | 540 | testdistr(resid(m.adosleep3), plot=FALSE, varlab = "Residuals")$QQPlot 541 | 542 | 543 | ## ------------------------------------------------------------------------ 544 | 545 | ## refit model on raw data 546 | m.adosleep.raw <- ols(MOOD ~ SOLacti + DBAS + Female + 547 | Stress + SSQ * DAS, 548 | data = adosleep) 549 | 550 | ## create a dataset 551 | adosleep.newdat <- as.data.table(with(adosleep, expand.grid( 552 | SOLacti = mean(SOLacti), 553 | DBAS = mean(DBAS), 554 | Female = factor("Females", levels(Female)), 555 | Stress = mean(Stress), 556 | SSQ = seq(from = min(SSQ), to = max(SSQ), length.out = 100), 557 | DAS = mean(DAS) + c(1, -1) * sd(DAS)))) 558 | 559 | adosleep.newdat$MOOD <- predict(m.adosleep.raw, 560 | newdata = adosleep.newdat, 561 | se.fit = FALSE) 562 | 563 | adosleep.newdat[, DAS := factor(round(DAS), 564 | levels = c(100, 161), 565 | labels = c("M - 1 SD", "M + 1 SD"))] 566 | 567 | 568 | ## ----fglm1-case-intplot, fig.width=5, fig.height=5, out.width='.5\\linewidth', fig.pos="!h", fig.cap = "Interaction between subjective sleep quality and overall dysfunctional beliefs predicting negative mood."---- 569 | 570 | ggplot(adosleep.newdat, aes(SSQ, MOOD, linetype=DAS)) + 571 | geom_line(size = 2) + 572 | scale_x_continuous("Subjective sleep quality\n(higher is worse)") + 573 | ylab("Negative Mood") + 574 | theme_cowplot() + 575 | theme( 576 | legend.position = c(.85, .15), 577 | legend.key.width = unit(2, "cm")) 578 | 579 | 580 | -------------------------------------------------------------------------------- /glm2.R: -------------------------------------------------------------------------------- 1 | ## ----setup, include=TRUE, results='hide', message = FALSE, warning = FALSE, cache=FALSE---- 2 | library(checkpoint) 3 | checkpoint("2018-09-28", R.version = "3.5.1", 4 | project = book_directory, 5 | checkpointLocation = checkpoint_directory, 6 | scanForPackages = FALSE, 7 | scan.rnw.with.knitr = TRUE, use.knitr = TRUE) 8 | 9 | library(knitr) 10 | library(data.table) 11 | library(ggplot2) 12 | library(ggthemes) 13 | library(scales) 14 | library(viridis) 15 | library(VGAM) 16 | library(ipw) 17 | library(JWileymisc) 18 | library(xtable) 19 | library(texreg) 20 | 21 | options( 22 | width = 70, 23 | stringsAsFactors = FALSE, 24 | datatable.print.nrows = 20, 25 | datatable.print.topn = 3, 26 | digits = 2) 27 | 28 | 29 | ## ----fglm2-poisson, fig.width=6, fig.height=6, out.width='.6\\linewidth', fig.pos="!ht", fig.cap = "Density for a Poisson distribution with lambda = 2 and lambda = 6."---- 30 | 31 | dpoisson <- data.table(X = 0:20) 32 | dpoisson[, Lambda2 := dpois(X, lambda = 2)] 33 | dpoisson[, Lambda6 := dpois(X, lambda = 6)] 34 | 35 | ggplot(melt(dpoisson, id.vars = "X"), 36 | aes(X, value, fill = variable)) + 37 | geom_col(position = "dodge") + 38 | scale_fill_viridis(discrete = TRUE) + 39 | theme(legend.position = c(.7, .8)) + 40 | xlab("Y Score") + ylab("Poisson Density") 41 | 42 | 43 | ## ------------------------------------------------------------------------ 44 | 45 | acl <- readRDS("advancedr_acl_data.RDS") 46 | 47 | 48 | ## ------------------------------------------------------------------------ 49 | 50 | acl$CurSmoke <- as.integer(acl$Smoke_W1 == "(1) Cur Smok") 51 | 52 | m.lr <- vglm(CurSmoke ~ Sex, 53 | family = binomialff(link = "logit"), 54 | data = acl, model = TRUE) 55 | summary(m.lr) 56 | 57 | 58 | ## ----echo=TRUE, eval=FALSE----------------------------------------------- 59 | ## or.tab <- xtabs(~ Sex + CurSmoke, data = acl) 60 | ## or.tab.res <- (or.tab[1,1]/or.tab[2,1])/(or.tab[1,2]/or.tab[2,2]) 61 | ## xtable(or.tab, caption = "Observed frequency table", 62 | ## label = "tglm2-obsfreq") 63 | 64 | ## ----echo=FALSE, results='asis', listings=FALSE-------------------------- 65 | or.tab <- xtabs(~ Sex + CurSmoke, data = acl) 66 | or.tab.res <- (or.tab[1,1]/or.tab[2,1])/(or.tab[1,2]/or.tab[2,2]) 67 | xtable(or.tab, caption = "Observed frequency table", 68 | label = "tglm2-obsfreq") 69 | 70 | ## ----echo=FALSE, results='asis', listings=FALSE-------------------------- 71 | 72 | cat(sprintf(" 73 | \\begin{equation*} 74 | \\frac{\\frac{%d}{%d}}{\\frac{%d}{%d}} = %0.2f 75 | \\end{equation*} 76 | ", or.tab[1,1], or.tab[2,1], or.tab[1,2], or.tab[2,2], 77 | or.tab.res 78 | )) 79 | 80 | 81 | ## ------------------------------------------------------------------------ 82 | 83 | preddat <- data.table(Sex = levels(acl$Sex)) 84 | preddat$yhat <- predict(m.lr, newdata = preddat, 85 | type = "response") 86 | 87 | 88 | ## ----fglm2-predprob0, fig.width=5, fig.height=5, out.width='.5\\linewidth', fig.pos="!h", fig.cap = "Graph showing the probability of smoking by sex."---- 89 | 90 | ggplot(preddat, aes(Sex, yhat)) + 91 | geom_bar(stat = "identity") + 92 | scale_y_continuous("Smoking Probability", labels = percent) + 93 | theme_tufte() 94 | 95 | 96 | ## ----echo=TRUE, eval=FALSE----------------------------------------------- 97 | ## xtable(coef(summary(m.lr)), digits = 2, 98 | ## caption = paste( 99 | ## "Summary of logistic regression model", 100 | ## "including coefficients, standard errors", 101 | ## "andd p-values."), label = "tglm2-orsimple") 102 | 103 | ## ----echo=FALSE, results='asis', listings=FALSE-------------------------- 104 | xtable(coef(summary(m.lr)), digits = 2, 105 | caption = paste( 106 | "Summary of logistic regression model", 107 | "including coefficients, standard errors", 108 | "andd p-values."), label = "tglm2-orsimple") 109 | 110 | ## ------------------------------------------------------------------------ 111 | 112 | ## unadjusted model 113 | m0.lr <- vglm(CurSmoke ~ SelfEfficacy_W1, 114 | family = binomialff(link = "logit"), 115 | data = acl, model = TRUE) 116 | 117 | ## estimate IPWs 118 | w <- ipwpoint( 119 | exposure = SelfEfficacy_W1, 120 | family = "gaussian", 121 | numerator = ~ 1, 122 | denominator = ~ 1 + Sex + RaceEthnicity + AGE_W1, 123 | data = acl) 124 | 125 | ## adjusted logistic regression model 126 | m1.lr <- vglm(CurSmoke ~ SelfEfficacy_W1, 127 | family = binomialff(link = "logit"), 128 | data = acl, model = TRUE, 129 | weights = winsorizor(w$ipw.weights, .01)) 130 | 131 | ## ----echo=TRUE, eval=FALSE----------------------------------------------- 132 | ## xtable(rbind( 133 | ## data.table(Type = "Raw", coef(summary(m0.lr))), 134 | ## data.table(Type = "Adj", coef(summary(m1.lr)))), 135 | ## digits = 2, 136 | ## caption = paste("Comparison of unadjusted (raw)", 137 | ## "and adjusted regression models"), 138 | ## label = "tglm2-lrcompare") 139 | 140 | ## ----echo=FALSE, results='asis', listings=FALSE-------------------------- 141 | xtable(rbind( 142 | data.table(Type = "Raw", coef(summary(m0.lr))), 143 | data.table(Type = "Adj", coef(summary(m1.lr)))), 144 | digits = 2, 145 | caption = paste("Comparison of unadjusted (raw)", 146 | "and adjusted regression models"), 147 | label = "tglm2-lrcompare") 148 | 149 | ## ----fglm2-predprob1, fig.width=5, fig.height=5, out.width='.6\\linewidth', fig.pos="!h", fig.cap = c("Graph showing the probability of smoking by self-efficacy")---- 150 | 151 | preddat2 <- data.table(SelfEfficacy_W1 = 152 | seq(from = min(acl$SelfEfficacy_W1, na.rm = TRUE), 153 | to = max(acl$SelfEfficacy_W1, na.rm = TRUE), 154 | length.out = 1000)) 155 | preddat2$yhat <- predict(m1.lr, newdata = preddat2, 156 | type = "response") 157 | 158 | ggplot(preddat2, aes(SelfEfficacy_W1, yhat)) + 159 | geom_line() + 160 | scale_x_continuous("Self-Efficacy") + 161 | scale_y_continuous("Smoking Probability", label = percent) + 162 | theme_tufte() + coord_cartesian(ylim = c(.25, .40)) 163 | 164 | 165 | ## ------------------------------------------------------------------------ 166 | 167 | ## delta value for change in self efficacy 168 | delta <- .01 169 | 170 | ## create a copy of the dataset 171 | ## where we increase everyone's self-efficacy by delta 172 | aclalt <- copy(acl) 173 | aclalt$SelfEfficacy_W1 <- aclalt$SelfEfficacy_W1 + delta 174 | 175 | ## calculate predicted probabilities 176 | p1 <- predict(m1.lr, newdata = acl, type = "response") 177 | p2 <- predict(m1.lr, newdata = aclalt, type = "response") 178 | 179 | ## calculate the average, marginal change in probabilities 180 | ## per unit change in self efficacy 181 | ## in percents and rounded 182 | round(mean((p2 - p1) / delta) * 100, 1) 183 | 184 | 185 | ## ------------------------------------------------------------------------ 186 | 187 | acl$PhysActCat_W2 <- factor(acl$PhysActCat_W2, ordered = TRUE) 188 | 189 | ## adjusted ordered logistic regression model 190 | m0.or <- vglm(PhysActCat_W2 ~ SelfEfficacy_W1, 191 | family = propodds(), 192 | data = acl) 193 | 194 | ## estimate IPWs 195 | w <- ipwpoint( 196 | exposure = SelfEfficacy_W1, 197 | family = "gaussian", 198 | numerator = ~ 1, 199 | denominator = ~ 1 + Sex + RaceEthnicity + AGE_W1, 200 | data = acl) 201 | 202 | ## adjusted ordered logistic regression model 203 | m1.or <- vglm(PhysActCat_W2 ~ SelfEfficacy_W1, 204 | family = propodds(), 205 | data = acl, model = TRUE, 206 | weights = winsorizor(w$ipw.weights, .01)) 207 | 208 | 209 | ## ----echo=TRUE, eval=FALSE----------------------------------------------- 210 | ## xtable(rbind( 211 | ## data.table(Type = "Raw", 212 | ## Labels = rownames(coef(summary(m0.or))), 213 | ## coef(summary(m0.or))), 214 | ## data.table(Type = "Adj", 215 | ## Labels = rownames(coef(summary(m1.or))), 216 | ## coef(summary(m1.or)))), 217 | ## digits = 2, 218 | ## caption = paste("Comparison of unadjusted (raw) and", 219 | ## "adjusted ordered logistic regression models"), 220 | ## label = "tglm2-orcompare") 221 | 222 | ## ----echo=FALSE, results='asis', listings=FALSE-------------------------- 223 | xtable(rbind( 224 | data.table(Type = "Raw", 225 | Labels = rownames(coef(summary(m0.or))), 226 | coef(summary(m0.or))), 227 | data.table(Type = "Adj", 228 | Labels = rownames(coef(summary(m1.or))), 229 | coef(summary(m1.or)))), 230 | digits = 2, 231 | caption = paste("Comparison of unadjusted (raw) and", 232 | "adjusted ordered logistic regression models"), 233 | label = "tglm2-orcompare") 234 | 235 | ## ----fglm2-predprob2, fig.width=5, fig.height=5, out.width='.6\\linewidth', fig.pos="!h", fig.cap = c("Graph showing the probability of different physical activity categories by self-efficacy")---- 236 | 237 | preddat3 <- data.table(SelfEfficacy_W1 = 238 | seq(from = min(acl$SelfEfficacy_W1, na.rm = TRUE), 239 | to = max(acl$SelfEfficacy_W1, na.rm = TRUE), 240 | length.out = 1000)) 241 | preddat3 <- cbind(preddat3, 242 | predict(m1.or, newdata = preddat3, 243 | type = "response")) 244 | preddat3 <- melt(preddat3, id.vars = "SelfEfficacy_W1") 245 | 246 | ggplot(preddat3, aes(SelfEfficacy_W1, value, 247 | colour = variable, linetype = variable)) + 248 | geom_line(size = 2) + 249 | scale_color_viridis(discrete = TRUE) + 250 | scale_x_continuous("Self-Efficacy") + 251 | scale_y_continuous("Activity Probability", label = percent) + 252 | coord_cartesian(ylim = c(0, .6), expand = FALSE) + 253 | theme_tufte() + 254 | theme(legend.position = c(.7, .8), 255 | legend.key.width = unit(2, "cm")) 256 | 257 | 258 | ## ------------------------------------------------------------------------ 259 | 260 | ## delta value for change in self efficacy 261 | delta <- .01 262 | 263 | ## create a copy of the dataset 264 | ## where we increase everyone's self-efficacy by delta 265 | aclalt <- copy(acl) 266 | aclalt$SelfEfficacy_W1 <- aclalt$SelfEfficacy_W1 + delta 267 | 268 | ## calculate predicted probabilities 269 | p1 <- predict(m1.or, newdata = acl, type = "response") 270 | p2 <- predict(m1.or, newdata = aclalt, type = "response") 271 | 272 | ## average marginal change in probability of 273 | ## membership in each category 274 | round(colMeans((p2 - p1) / delta) * 100, 1) 275 | 276 | 277 | ## ------------------------------------------------------------------------ 278 | 279 | acl[, EmployG_W2 := as.character(Employment_W2)] 280 | acl[EmployG_W2 %in% c( 281 | "(2) 2500+HRS", "(3) 15002499", 282 | "(4) 500-1499", "(5) 1-499HRS"), 283 | EmployG_W2 := "(2) EMPLOYED"] 284 | acl[, EmployG_W2 := factor(EmployG_W2)] 285 | 286 | 287 | ## ----echo=TRUE, eval=FALSE----------------------------------------------- 288 | ## xtable(as.data.frame(table(acl$EmployG_W2)), 289 | ## caption = "Frequency table of employment", 290 | ## label = "tglm2-freqtab") 291 | 292 | ## ----echo=FALSE, results='asis', listings=FALSE-------------------------- 293 | xtable(as.data.frame(table(acl$EmployG_W2)), 294 | caption = "Frequency table of employment", 295 | label = "tglm2-freqtab") 296 | 297 | ## ------------------------------------------------------------------------ 298 | 299 | ## unadjusted multinomial logistic regression model 300 | m0.mr <- vglm(EmployG_W2 ~ SelfEfficacy_W1, 301 | family = multinomial(), 302 | data = acl, model = TRUE) 303 | 304 | ## estimate IPWs 305 | w <- ipwpoint( 306 | exposure = SelfEfficacy_W1, 307 | family = "gaussian", 308 | numerator = ~ 1, 309 | denominator = ~ 1 + Sex + RaceEthnicity + AGE_W1, 310 | data = acl) 311 | 312 | ## adjusted multinomial logistic regression model 313 | m1.mr <- vglm(EmployG_W2 ~ SelfEfficacy_W1, 314 | family = multinomial(), 315 | data = acl, model = TRUE, 316 | weights = winsorizor(w$ipw.weights, .01)) 317 | 318 | 319 | ## ----echo=TRUE, eval=FALSE----------------------------------------------- 320 | ## xtable(rbind( 321 | ## data.table(Type = "Raw", 322 | ## Labels = rownames(coef(summary(m0.mr))), 323 | ## coef(summary(m0.mr))), 324 | ## data.table(Type = "Adj", 325 | ## Labels = rownames(coef(summary(m1.mr))), 326 | ## coef(summary(m1.mr)))), 327 | ## digits = 2, 328 | ## caption = paste("Comparison of unadjusted (raw) and", 329 | ## "adjusted multinomial logistic regression models"), 330 | ## label = "tglm2-mrcompare") 331 | 332 | ## ----echo=FALSE, results='asis', listings=FALSE-------------------------- 333 | xtable(rbind( 334 | data.table(Type = "Raw", 335 | Labels = rownames(coef(summary(m0.mr))), 336 | coef(summary(m0.mr))), 337 | data.table(Type = "Adj", 338 | Labels = rownames(coef(summary(m1.mr))), 339 | coef(summary(m1.mr)))), 340 | digits = 2, 341 | caption = paste("Comparison of unadjusted (raw) and", 342 | "adjusted multinomial logistic regression models"), 343 | label = "tglm2-mrcompare") 344 | 345 | ## ----fglm2-predprob3, fig.width=5, fig.height=5, out.width='.6\\linewidth', fig.pos="!h", fig.cap = c("Graph showing the probability of different employment categories by self-efficacy")---- 346 | 347 | preddat4 <- data.table(SelfEfficacy_W1 = 348 | seq(from = min(acl$SelfEfficacy_W1, na.rm = TRUE), 349 | to = max(acl$SelfEfficacy_W1, na.rm = TRUE), 350 | length.out = 1000)) 351 | preddat4 <- cbind(preddat4, 352 | predict(m1.mr, newdata = preddat4, 353 | type = "response")) 354 | preddat4 <- melt(preddat4, id.vars = "SelfEfficacy_W1") 355 | 356 | ggplot(preddat4, aes( 357 | SelfEfficacy_W1, value, 358 | colour = variable, linetype = variable)) + 359 | geom_line(size = 2) + 360 | scale_color_viridis(discrete = TRUE) + 361 | scale_x_continuous("Self-Efficacy") + 362 | scale_y_continuous("Probability", label = percent) + 363 | coord_cartesian(ylim = c(0, .65), expand = FALSE) + 364 | theme_tufte() + 365 | theme(legend.position = c(.18, .82), 366 | legend.key.width = unit(2, "cm")) 367 | 368 | 369 | ## ------------------------------------------------------------------------ 370 | 371 | ## delta value for change in self efficacy 372 | delta <- .01 373 | 374 | ## create a copy of the dataset 375 | ## where we increase everyone's self-efficacy by delta 376 | aclalt <- copy(acl) 377 | aclalt$SelfEfficacy_W1 <- aclalt$SelfEfficacy_W1 + delta 378 | 379 | ## calculate predicted probabilities 380 | p1 <- predict(m1.mr, newdata = acl, type = "response") 381 | p2 <- predict(m1.mr, newdata = aclalt, type = "response") 382 | 383 | ## average marginal change in probability of 384 | ## membership in each category 385 | round(colMeans((p2 - p1) / delta) * 100, 1) 386 | 387 | 388 | ## ------------------------------------------------------------------------ 389 | 390 | egltable(c("NChronic12_W1", "NChronic12_W2"), 391 | data = acl, parametric = FALSE) 392 | 393 | 394 | ## ----fglm2-freqplot, fig.width=5, fig.height=6, out.width='.6\\linewidth', fig.pos="!h", fig.cap = c("Graph showing the frequency of each number of chronic conditions at each wave in the ACL data")---- 395 | 396 | plot_grid( 397 | ggplot(acl, aes(NChronic12_W1)) + 398 | geom_bar() + theme_tufte(), 399 | ggplot(acl, aes(NChronic12_W2)) + 400 | geom_bar() + theme_tufte(), 401 | ncol = 1, 402 | labels = c("Wave 1", "Wave 2"), 403 | label_x = .8) 404 | 405 | 406 | ## ------------------------------------------------------------------------ 407 | 408 | ## unadjusted poisson regression model 409 | m0.pr <- vglm(NChronic12_W2 ~ SelfEfficacy_W1, 410 | family = poissonff(), 411 | data = acl, model = TRUE) 412 | 413 | summary(m0.pr) 414 | 415 | 416 | ## ------------------------------------------------------------------------ 417 | 418 | ## unadjusted negative binomial regression model 419 | m0.nbr <- vglm(NChronic12_W2 ~ SelfEfficacy_W1, 420 | family = negbinomial(), 421 | data = acl, model = TRUE) 422 | 423 | AIC(m0.nbr) - AIC(m0.pr) 424 | BIC(m0.nbr) - BIC(m0.pr) 425 | 426 | 427 | ## ----fglm2-simplot, fig.width=6, fig.height=5, out.width='.6\\linewidth', fig.pos="!h", fig.cap = c("Graph showing the frequency of each number of chronic conditions based on the true data, simulations from the negative binomial model, and simulations from the poisson regression model.")---- 428 | 429 | test.pr <- simulate(m0.pr, nsim = 1, seed = 1234)$sim_1 430 | test.nbr <- simulate(m0.nbr, nsim = 1, seed = 1234)$sim_1 431 | test.all <- data.table( 432 | Type = rep(c("Truth", "Poisson", "Negative\nBinomial"), 433 | times = c( 434 | nrow(model.frame(m0.pr)), 435 | length(test.pr), 436 | length(test.nbr))), 437 | Score = c( 438 | model.frame(m0.pr)$NChronic12_W2, 439 | test.pr, 440 | test.nbr)) 441 | 442 | ggplot(test.all, aes(Score, fill = Type)) + 443 | geom_bar(position = "dodge") + 444 | scale_fill_viridis(discrete = TRUE) + 445 | theme_tufte() + 446 | theme(legend.position = c(.8, .8)) 447 | 448 | 449 | ## ------------------------------------------------------------------------ 450 | 451 | ## estimate IPWs 452 | w <- ipwpoint( 453 | exposure = SelfEfficacy_W1, 454 | family = "gaussian", 455 | numerator = ~ 1, 456 | denominator = ~ 1 + Sex + RaceEthnicity + AGE_W1, 457 | data = acl) 458 | 459 | ## adjusted negative binomial regression model 460 | m1.nbr <- vglm(NChronic12_W2 ~ SelfEfficacy_W1, 461 | family = negbinomial(), 462 | data = acl, model = TRUE, 463 | weights = winsorizor(w$ipw.weights, .01)) 464 | 465 | 466 | ## ----echo=TRUE, eval=FALSE----------------------------------------------- 467 | ## xtable(rbind( 468 | ## data.table(Type = "Raw", 469 | ## Labels = rownames(coef(summary(m0.nbr))), 470 | ## coef(summary(m0.nbr))), 471 | ## data.table(Type = "Adj", 472 | ## Labels = rownames(coef(summary(m1.nbr))), 473 | ## coef(summary(m1.nbr)))), 474 | ## digits = 2, 475 | ## caption = paste("Comparison of unadjusted (raw) and", 476 | ## "adjusted negative binomial regression models"), 477 | ## label = "tglm2-nbrcompare") 478 | 479 | ## ----echo=FALSE, results='asis', listings=FALSE-------------------------- 480 | xtable(rbind( 481 | data.table(Type = "Raw", 482 | Labels = rownames(coef(summary(m0.nbr))), 483 | coef(summary(m0.nbr))), 484 | data.table(Type = "Adj", 485 | Labels = rownames(coef(summary(m1.nbr))), 486 | coef(summary(m1.nbr)))), 487 | digits = 2, 488 | caption = paste("Comparison of unadjusted (raw) and", 489 | "adjusted negative binomial regression models"), 490 | label = "tglm2-nbrcompare") 491 | 492 | ## ----fglm2-predcount1, fig.width=5, fig.height=5, out.width='.6\\linewidth', fig.pos="!h", fig.cap = c("Graph showing the predicted number of chronic conditions as a function of self-efficacy.")---- 493 | 494 | preddat5 <- data.table(SelfEfficacy_W1 = 495 | seq(from = min(acl$SelfEfficacy_W1, na.rm = TRUE), 496 | to = max(acl$SelfEfficacy_W1, na.rm = TRUE), 497 | length.out = 1000)) 498 | preddat5$yhat <- predict(m1.nbr, newdata = preddat5, 499 | type = "response") 500 | 501 | ggplot(preddat5, aes(SelfEfficacy_W1, yhat)) + 502 | geom_line() + 503 | scale_x_continuous("Self-Efficacy") + 504 | scale_y_continuous("Expected Number Conditions") + 505 | theme_tufte() 506 | 507 | 508 | ## ------------------------------------------------------------------------ 509 | 510 | acl[, Smoke_W2W1 := NA_character_] 511 | acl[Smoke_W1 == "(3) Nevr Smo" & 512 | Smoke_W2 == "(3) W2 Never Smoker", 513 | Smoke_W2W1 := "Stable Never Smoker"] 514 | acl[Smoke_W1 == "(2) Past Smo" & 515 | Smoke_W2 == "(2) W2 Former Smoker", 516 | Smoke_W2W1 := "Stable Former Smoker"] 517 | acl[Smoke_W1 == "(1) Cur Smok" & 518 | Smoke_W2 == "(1) W2 Current Smoker", 519 | Smoke_W2W1 := "Stable Current Smoker"] 520 | acl[Smoke_W1 %in% c("(2) Past Smo", "(3) Nevr Smo") & 521 | Smoke_W2 == "(1) W2 Current Smoker", 522 | Smoke_W2W1 := "New Smoker"] 523 | acl[Smoke_W1 == "(1) Cur Smok" & 524 | Smoke_W2 == "(2) W2 Former Smoker", 525 | Smoke_W2W1 := "Recently Quit Smoker"] 526 | 527 | acl[, Smoke_W2W1 := factor(Smoke_W2W1, 528 | levels = c("Stable Never Smoker", "Stable Former Smoker", 529 | "Stable Current Smoker", "Recently Quit Smoker", 530 | "New Smoker"))] 531 | 532 | ## ----echo=TRUE, eval=FALSE----------------------------------------------- 533 | ## xtable(as.data.frame(table(acl$Smoke_W2W1)), 534 | ## caption = "Frequency table of smoking over time", 535 | ## label = "tglm2-freqtab-smoke") 536 | 537 | ## ----echo=FALSE, results='asis', listings=FALSE-------------------------- 538 | xtable(as.data.frame(table(acl$Smoke_W2W1)), 539 | caption = "Frequency table of smoking over time", 540 | label = "tglm2-freqtab-smoke") 541 | 542 | ## ------------------------------------------------------------------------ 543 | 544 | acl[, SES := as.numeric(SESCategory)] 545 | 546 | mr.ses <- vglm(Smoke_W2W1 ~ Sex + SES + AGE_W1, 547 | family = multinomial(), 548 | data = acl, model = TRUE) 549 | 550 | mr.psych <- vglm(Smoke_W2W1 ~ SWL_W1 + InformalSI_W1 + 551 | FormalSI_W1 + SelfEfficacy_W1 + CESD11_W1, 552 | family = multinomial(), 553 | data = acl, model = TRUE) 554 | 555 | mr.health <- vglm(Smoke_W2W1 ~ PhysActCat_W1 + 556 | BMI_W1 + NChronic12_W1, 557 | family = multinomial(), 558 | data = acl, model = TRUE) 559 | 560 | 561 | ## ----echo=TRUE, eval=FALSE----------------------------------------------- 562 | ## xtable( 563 | ## data.table( 564 | ## Model = c("Sociodemographics", "Psychosocial", "Health"), 565 | ## AIC = c(AIC(mr.ses), AIC(mr.psych), AIC(mr.health)), 566 | ## BIC = c(BIC(mr.ses), BIC(mr.psych), BIC(mr.health))), 567 | ## caption = "Model Comparisons", 568 | ## label = "tglm2-modelcomparisons") 569 | 570 | ## ----echo=FALSE, results='asis', listings=FALSE-------------------------- 571 | xtable( 572 | data.table( 573 | Model = c("Sociodemographics", "Psychosocial", "Health"), 574 | AIC = c(AIC(mr.ses), AIC(mr.psych), AIC(mr.health)), 575 | BIC = c(BIC(mr.ses), BIC(mr.psych), BIC(mr.health))), 576 | caption = "Model Comparisons", 577 | label = "tglm2-modelcomparisons") 578 | 579 | ## ------------------------------------------------------------------------ 580 | 581 | summary(mr.ses) 582 | 583 | 584 | ## ------------------------------------------------------------------------ 585 | 586 | acl[, AGE_W1 := AGE_W1 / 10] 587 | 588 | 589 | ## ------------------------------------------------------------------------ 590 | 591 | mr.ses1 <- vglm(Smoke_W2W1 ~ Sex + SES + AGE_W1, 592 | family = multinomial(refLevel = 1), 593 | data = acl, model = TRUE) 594 | mr.ses2 <- update(mr.ses1, 595 | family = multinomial(refLevel = 2)) 596 | mr.ses3 <- update(mr.ses1, 597 | family = multinomial(refLevel = 3)) 598 | 599 | 600 | ## ------------------------------------------------------------------------ 601 | data.table( 602 | Ref = "Stable Never Smoker", 603 | Term = names(coef(mr.ses1)), 604 | OR = exp(coef(mr.ses1)), 605 | exp(confint(mr.ses1))) 606 | 607 | data.table( 608 | Ref = "Stable Current Smoker", 609 | Term = names(coef(mr.ses3)), 610 | OR = exp(coef(mr.ses3)), 611 | exp(confint(mr.ses3))) 612 | 613 | 614 | ## ------------------------------------------------------------------------ 615 | 616 | ## delta value for change in age and SES 617 | delta <- .01 618 | 619 | ## create a copy of the dataset 620 | ## where we increase everyone's age by delta 621 | aclage <- copy(acl) 622 | aclage[, AGE_W1 := AGE_W1 + delta] 623 | 624 | ## create a copy of the dataset 625 | ## where we increase everyone's SES by delta 626 | aclses <- copy(acl) 627 | aclses[, SES := SES + delta] 628 | 629 | ## create two copies of the data 630 | ## one where se set everyone to "female" and another to "male" 631 | aclfemale <- copy(acl) 632 | aclfemale[, Sex := factor("(2) FEMALE", 633 | levels = levels(acl$Sex))] 634 | 635 | aclmale <- copy(acl) 636 | aclmale[, Sex := factor("(1) MALE", 637 | levels = levels(acl$Sex))] 638 | 639 | ## calculate predicted probabilities 640 | p.ref <- predict(mr.ses1, newdata = acl, 641 | type = "response") 642 | p.age <- predict(mr.ses1, newdata = aclage, 643 | type = "response") 644 | p.ses <- predict(mr.ses1, newdata = aclses, 645 | type = "response") 646 | p.female <- predict(mr.ses1, newdata = aclfemale, 647 | type = "response") 648 | p.male <- predict(mr.ses1, newdata = aclmale, 649 | type = "response") 650 | 651 | 652 | ## ----echo=TRUE, eval=FALSE----------------------------------------------- 653 | ## xtable( 654 | ## data.table( 655 | ## Level = colnames(p.ref), 656 | ## Age = colMeans((p.age - p.ref) / delta) * 100, 657 | ## SES = colMeans((p.ses - p.ref) / delta) * 100, 658 | ## Female = colMeans(p.female - p.male) * 100), 659 | ## digits = 2, 660 | ## caption = "Average marginal change in predicted probability", 661 | ## label = "tglm2-margprobs") 662 | 663 | ## ----echo=FALSE, results='asis', listings=FALSE-------------------------- 664 | xtable( 665 | data.table( 666 | Level = colnames(p.ref), 667 | Age = colMeans((p.age - p.ref) / delta) * 100, 668 | SES = colMeans((p.ses - p.ref) / delta) * 100, 669 | Female = colMeans(p.female - p.male) * 100), 670 | digits = 2, 671 | caption = "Average marginal change in predicted probability", 672 | label = "tglm2-margprobs") 673 | 674 | -------------------------------------------------------------------------------- /glmma.R: -------------------------------------------------------------------------------- 1 | ## ----setup, include=TRUE, results='hide', message = FALSE, warning = FALSE, cache=FALSE---- 2 | 3 | library(checkpoint) 4 | checkpoint("2018-09-28", R.version = "3.5.1", 5 | project = book_directory, 6 | checkpointLocation = checkpoint_directory, 7 | scanForPackages = FALSE, 8 | scan.rnw.with.knitr = TRUE, use.knitr = TRUE) 9 | 10 | library(knitr) 11 | library(ggplot2) 12 | library(cowplot) 13 | library(viridis) 14 | library(JWileymisc) 15 | library(data.table) 16 | library(lme4) 17 | library(lmerTest) 18 | library(chron) 19 | library(zoo) 20 | library(pander) 21 | library(texreg) 22 | library(xtable) 23 | library(splines) 24 | library(parallel) 25 | library(boot) 26 | library(optimx) 27 | library(dfoptim) 28 | 29 | options(width = 70, digits = 2) 30 | 31 | 32 | ## ------------------------------------------------------------------------ 33 | 34 | data(aces_daily) 35 | draw <- as.data.table(aces_daily) 36 | d <- readRDS("aces_daily_sim_processed.RDS") 37 | 38 | 39 | ## ------------------------------------------------------------------------ 40 | 41 | d[, SOLs30 := as.integer(SOLs >= 30)] 42 | 43 | 44 | ## ------------------------------------------------------------------------ 45 | 46 | m1.glmm <- glmer(SOLs30 ~ BCOPEDis + BWASONs + (1 | UserID), 47 | family = binomial(link = logit), 48 | data = d, nAGQ = 9) 49 | summary(m1.glmm) 50 | 51 | 52 | ## ------------------------------------------------------------------------ 53 | 54 | plogis(fixef(m1.glmm)[["(Intercept)"]]) 55 | 56 | 57 | ## ------------------------------------------------------------------------ 58 | 59 | exp(cbind( 60 | B = fixef(m1.glmm), 61 | confint(m1.glmm, parm = "beta_", method = "Wald"))) 62 | 63 | 64 | ## ------------------------------------------------------------------------ 65 | 66 | preddat <- as.data.table(expand.grid( 67 | BCOPEDis = seq( 68 | from = min(d$BCOPEDis, na.rm=TRUE), 69 | to = max(d$BCOPEDis, na.rm = TRUE), 70 | length.out = 1000), 71 | BWASONs = quantile(d$BWASONs, probs = c(.2, .8), 72 | na.rm = TRUE))) 73 | 74 | ## predictions based on average random effects 75 | preddat$yhat <- predict(m1.glmm, 76 | newdata = preddat, 77 | type = "response", 78 | re.form = ~ 0) 79 | 80 | 81 | ## ------------------------------------------------------------------------ 82 | 83 | preddat2 <- as.data.table(expand.grid( 84 | UserID = unique(d$UserID), 85 | BCOPEDis = seq( 86 | from = min(d$BCOPEDis, na.rm=TRUE), 87 | to = max(d$BCOPEDis, na.rm = TRUE), 88 | length.out = 1000), 89 | BWASONs = quantile(d$BWASONs, probs = c(.2, .8), 90 | na.rm = TRUE))) 91 | 92 | ## predictions based on average random effects 93 | preddat2$yhat <- predict(m1.glmm, 94 | newdata = preddat2, 95 | type = "response", 96 | re.form = NULL) 97 | 98 | 99 | ## ------------------------------------------------------------------------ 100 | 101 | ## calculate predicted probabilities 102 | ## averaging across participants 103 | preddat3 <- preddat2[, .(yhat = mean(yhat)), 104 | by = .(BCOPEDis, BWASONs)] 105 | 106 | 107 | ## ----fglmma-probs1, fig.width=10, fig.height=5, out.width='1\\linewidth', fig.pos="!h", fig.cap = "Graph of the predicted probabilities setting random effects to zero (average on the logit scale) and averaging all of them."---- 108 | 109 | ggplot(rbind( 110 | cbind(preddat, Type = "Zero"), 111 | cbind(preddat3, Type = "Average")), 112 | aes(BCOPEDis, yhat, colour = Type)) + 113 | geom_line(size = 1) + 114 | scale_color_viridis(discrete = TRUE) + 115 | facet_wrap(~ round(BWASONs, 1)) + 116 | theme( 117 | legend.key.width = unit(1, "cm"), 118 | legend.position = c(.1, .9)) + 119 | xlab("Average disengagement coping") + 120 | ylab("Probability of sleep onset latency 30+ min") + 121 | coord_cartesian( 122 | xlim = c(1, 4), 123 | ylim = c(0, .6), 124 | expand = FALSE) 125 | 126 | 127 | 128 | ## ----fglmma-probs2, fig.width=10, fig.height=5, out.width='1\\linewidth', fig.pos="!h", fig.cap = "Graph of the individual predicted probabilities across levels of disengagement coping and average number of awakenings."---- 129 | 130 | ggplot(preddat2, 131 | aes(BCOPEDis, yhat, group = UserID)) + 132 | geom_line(alpha = .2) + 133 | facet_wrap(~ round(BWASONs, 1))+ 134 | xlab("Average disengagement coping") + 135 | ylab("Probability of sleep onset latency 30+ min") + 136 | coord_cartesian( 137 | xlim = c(1, 4), 138 | ylim = c(0, 1), 139 | expand = FALSE) 140 | 141 | 142 | 143 | ## ------------------------------------------------------------------------ 144 | 145 | m2.glmm <- glmer(SOLs30 ~ BPosAff + WPosAff + 146 | (1 + WPosAff | UserID), 147 | family = binomial(link = logit), 148 | data = d, nAGQ = 1) 149 | 150 | summary(m2.glmm) 151 | 152 | 153 | ## ------------------------------------------------------------------------ 154 | 155 | exp(cbind( 156 | B = fixef(m2.glmm), 157 | confint(m2.glmm, parm = "beta_", method = "Wald"))) 158 | 159 | 160 | ## ------------------------------------------------------------------------ 161 | 162 | bpa.low <- quantile(d$BPosAff, probs = .2, na.rm=TRUE) 163 | bpa.high <- quantile(d$BPosAff, probs = .8, na.rm=TRUE) 164 | 165 | preddat4.low <- as.data.table(expand.grid( 166 | UserID = unique(d$UserID), 167 | WPosAff = seq( 168 | from = min(d[BPosAff <= bpa.low]$WPosAff, 169 | na.rm = TRUE), 170 | to = max(d[BPosAff <= bpa.low]$WPosAff, 171 | na.rm = TRUE), 172 | length.out = 1000), 173 | BPosAff = bpa.low)) 174 | 175 | preddat4.high <- as.data.table(expand.grid( 176 | UserID = unique(d$UserID), 177 | WPosAff = seq( 178 | from = min(d[BPosAff >= bpa.high]$WPosAff, 179 | na.rm = TRUE), 180 | to = max(d[BPosAff >= bpa.high]$WPosAff, 181 | na.rm = TRUE), 182 | length.out = 1000), 183 | BPosAff = bpa.high)) 184 | 185 | preddat4 <- rbind( 186 | preddat4.low, 187 | preddat4.high) 188 | 189 | ## predictions including random effects 190 | preddat4$yhat <- predict(m2.glmm, 191 | newdata = preddat4, 192 | type = "response", 193 | re.form = NULL) 194 | 195 | ## calculate predicted probabilities 196 | ## averaging across participants 197 | preddat4b <- preddat4[, .(yhat = mean(yhat)), 198 | by = .(WPosAff, BPosAff)] 199 | 200 | 201 | ## ----fglmma-probs3, fig.width=7, fig.height=6, out.width='.7\\linewidth', fig.pos="!h", fig.cap = "Graph of the predicted probabilities averaging across individuals accounting for random intercept and slope of within person positive affect."---- 202 | 203 | ggplot(preddat4b, 204 | aes(WPosAff, yhat, colour = factor(round(BPosAff, 1)))) + 205 | geom_line(size = 1) + 206 | scale_color_viridis("Average\nPositive Affect", 207 | discrete = TRUE) + 208 | theme( 209 | legend.key.width = unit(1.5, "cm"), 210 | legend.position = c(.7, .9)) + 211 | coord_cartesian( 212 | xlim = c(-4, 4), 213 | ylim = c(0, .45), 214 | expand = FALSE) + 215 | xlab(paste0("Within person positive affect\n", 216 | "(deviations from own mean)")) + 217 | ylab("Probability of sleep onset latency 30+ min") 218 | 219 | 220 | ## ------------------------------------------------------------------------ 221 | 222 | m3.glmm <- glmer(WASONs ~ Age + BornAUS + 223 | (1 | UserID), 224 | family = poisson(link = log), 225 | data = d, nAGQ = 9) 226 | 227 | summary(m3.glmm) 228 | 229 | 230 | ## ------------------------------------------------------------------------ 231 | 232 | exp(cbind( 233 | B = fixef(m3.glmm), 234 | confint(m3.glmm, parm = "beta_", method = "Wald"))) 235 | 236 | ## ------------------------------------------------------------------------ 237 | 238 | preddat5 <- as.data.table(expand.grid( 239 | UserID = unique(d[!is.na(BornAUS) & !is.na(Age)]$UserID), 240 | Age = seq( 241 | from = min(d$Age, na.rm=TRUE), 242 | to = max(d$Age, na.rm = TRUE), 243 | length.out = 1000), 244 | BornAUS = 0:1)) 245 | 246 | ## predictions based on average random effects 247 | preddat5$yhat <- predict(m3.glmm, 248 | newdata = preddat5, 249 | type = "response", 250 | re.form = NULL) 251 | 252 | ## calculate predicted counts 253 | ## averaging across participants 254 | preddat5 <- preddat5[, .(yhat = mean(yhat)), 255 | by = .(Age, BornAUS)] 256 | 257 | 258 | ## ----fglmma-probs5, fig.width=6, fig.height=6, out.width='.6\\linewidth', fig.pos="!h", fig.cap = "Graph of the average predicted counts of wakenings after sleep onset by age and whether people were born in Australia or not."---- 259 | 260 | ggplot(preddat5, 261 | aes(Age, yhat, colour = factor(BornAUS))) + 262 | geom_line(size = 2) + 263 | scale_colour_viridis("Born in Australia", discrete = TRUE) + 264 | xlab("Age (years)") + 265 | ylab("Predicted # wakenings after sleep onset") + 266 | theme( 267 | legend.key.width = unit(1.5, "cm"), 268 | legend.position = c(.1, .9)) + 269 | coord_cartesian( 270 | xlim = c(18, 26.5), 271 | ylim = c(0, 2), 272 | expand = FALSE) 273 | 274 | 275 | ## ------------------------------------------------------------------------ 276 | 277 | m3.glmm.nb <- glmer.nb(formula(m3.glmm), 278 | data = d) 279 | 280 | 281 | ## ------------------------------------------------------------------------ 282 | 283 | ## load R code shipped with lme4 to provide the allFit() 284 | source(system.file("utils", "allFit.R", package="lme4")) 285 | m3.all <- allFit(m3.glmm.nb) 286 | 287 | 288 | ## ------------------------------------------------------------------------ 289 | 290 | m3.all.sum <- summary(m3.all) 291 | 292 | m3.all.sum$fixef 293 | m3.all.sum$llik 294 | m3.all.sum$theta 295 | 296 | 297 | ## ------------------------------------------------------------------------ 298 | 299 | screenreg( 300 | list(Poisson = m3.glmm, 301 | NegBin = m3.glmm.nb)) 302 | 303 | 304 | ## ------------------------------------------------------------------------ 305 | 306 | exp(cbind( 307 | fixef(m3.glmm), 308 | confint(m3.glmm, parm = "beta_", method = "Wald"), 309 | fixef(m3.glmm.nb), 310 | confint(m3.glmm.nb, parm = "beta_", method = "Wald"))) 311 | 312 | 313 | ## ------------------------------------------------------------------------ 314 | 315 | theta <- getME(m3.glmm.nb, "glmer.nb.theta") 316 | 317 | density <- data.table( 318 | X = as.integer(names(table(d$WASONs))), 319 | Observed = as.vector(prop.table(table(d$WASONs)))) 320 | 321 | density$NegBin <- colMeans(do.call(rbind, lapply(fitted(m3.glmm.nb), function(mu) { 322 | dnbinom(density$X, size = theta, mu = mu) 323 | }))) 324 | 325 | density$Poisson <- colMeans(do.call(rbind, lapply(fitted(m3.glmm), function(mu) { 326 | dpois(density$X, lambda = mu) 327 | }))) 328 | 329 | 330 | ## ----fglmma-pnbdens, fig.width=6, fig.height=6, out.width='.6\\linewidth', fig.pos="!h", fig.cap = "Observed and expected average densities of number of awakenings at night based on a poisson and negative binomial GLMM."---- 331 | 332 | ggplot(melt(density, id.vars = "X"), 333 | aes(X, value, fill = variable)) + 334 | geom_col(position = "dodge") + 335 | scale_fill_viridis("Type", discrete = TRUE) + 336 | theme(legend.position = c(.8, .8)) + 337 | xlab("Number of awakenings") + 338 | ylab("Density") + 339 | coord_cartesian( 340 | xlim = c(-.5, 4.5), 341 | ylim = c(0, .5), 342 | expand = FALSE) 343 | 344 | 345 | ## ------------------------------------------------------------------------ 346 | 347 | getME(m3.glmm.nb, "glmer.nb.theta") 348 | 349 | 350 | ## ----fglmma-wasodist, fig.width=6, fig.height=6, out.width='.6\\linewidth', fig.pos="!h", fig.cap = "Distribution of within person deviations from typical number of awakenings after sleep onset lagged to the previous day."---- 351 | 352 | testdistr(d[, WWASONsLag1], 353 | varlab = "Within WASONs lag 1") 354 | 355 | 356 | ## ------------------------------------------------------------------------ 357 | 358 | m4.glmm <- glmer(WASONs ~ Age + BornAUS + 359 | WWASONsLag1 + 360 | (1 + WWASONsLag1 | UserID), 361 | family = poisson(link = log), 362 | data = d, nAGQ = 1) 363 | 364 | summary(m4.glmm) 365 | 366 | 367 | ## ------------------------------------------------------------------------ 368 | 369 | exp(cbind( 370 | B = fixef(m4.glmm), 371 | confint(m4.glmm, parm = "beta_", method = "Wald"))) 372 | 373 | 374 | ## ----fglmma-wasodist2, fig.width=6, fig.height=6, out.width='.6\\linewidth', fig.pos="!h", fig.cap = "For nearly all people, when they have more than usual awakenings one night, they tend to have fewer awakenings the following night."---- 375 | 376 | testdistr(exp(coef(m4.glmm)$UserID$WWASONsLag1)) 377 | 378 | 379 | ## ------------------------------------------------------------------------ 380 | 381 | preddat.boot <- as.data.table(expand.grid( 382 | UserID = unique(model.frame(m4.glmm)$UserID), 383 | WWASONsLag1 = seq( 384 | from = min(d$WWASONsLag1, na.rm = TRUE), 385 | to = max(d$WWASONsLag1, na.rm = TRUE), 386 | length.out = 100), 387 | Age = quantile(d[!duplicated(UserID)]$Age, 388 | probs = c(.2, .8), na.rm = TRUE), 389 | BornAUS = 0:1)) 390 | 391 | preddat.boot$yhat <- predict(m4.glmm, 392 | newdata = preddat.boot) 393 | 394 | 395 | 396 | ## ------------------------------------------------------------------------ 397 | 398 | genPred <- function(m) { 399 | predict(m, 400 | newdata = preddat.boot) 401 | } 402 | 403 | cl <- makeCluster(4) 404 | clusterExport(cl, c("book_directory", 405 | "checkpoint_directory", 406 | "preddat.boot", "d", "genPred")) 407 | 408 | clusterEvalQ(cl, { 409 | library(checkpoint) 410 | checkpoint("2018-09-28", R.version = "3.5.1", 411 | project = book_directory, 412 | checkpointLocation = checkpoint_directory, 413 | scanForPackages = FALSE, 414 | scan.rnw.with.knitr = TRUE, use.knitr = TRUE) 415 | 416 | library(data.table) 417 | library(lme4) 418 | library(lmerTest) 419 | }) 420 | 421 | 422 | 423 | ## ------------------------------------------------------------------------ 424 | 425 | system.time(bootres <- bootMer(m4.glmm, 426 | FUN = genPred, 427 | nsim = 100, 428 | seed = 12345, 429 | use.u = FALSE, 430 | type = "parametric", 431 | parallel = "snow", 432 | ncpus = 4, 433 | cl = cl)) 434 | 435 | 436 | ## ------------------------------------------------------------------------ 437 | 438 | preddat.boot[, Index := rep(1L:400L, 439 | each = length(unique(UserID)))] 440 | 441 | 442 | ## ------------------------------------------------------------------------ 443 | 444 | preddat.boot.avg <- preddat.boot[, .(yhat = mean(exp(yhat))), 445 | by = .(WWASONsLag1, Age, BornAUS)] 446 | 447 | 448 | ## ------------------------------------------------------------------------ 449 | 450 | dim(bootres$t) 451 | 452 | for (i in 1:400) { 453 | ## find which indices to use 454 | ok <- which(preddat.boot$Index == i) 455 | 456 | ## now average across people 457 | tmp_avg <- rowMeans(exp(bootres$t[, ok])) 458 | 459 | ## lower confidence interval 460 | preddat.boot.avg[i, 461 | LL := quantile(tmp_avg, probs = .025, na.rm = TRUE)] 462 | preddat.boot.avg[i, 463 | UL := quantile(tmp_avg, probs = .975, na.rm = TRUE)] 464 | } 465 | 466 | 467 | ## ----fglmml-poispredboot, fig.width = 10, fig.height = 6, out.width='1\\linewidth', fig.pos="!h", fig.cap = "Plot of predicted number of awakenings by previous night awakenings (relative to own average), separated by age (years) at the 20th and 80th percentiles (19.4y and 25y, respectively) and born in Australia (0 = no, 1 = yes). Bootstrap confidence intervals around the mean predicted count are show through shading."---- 468 | 469 | ggplot(preddat.boot.avg, aes(WWASONsLag1, yhat, 470 | colour = factor(BornAUS), fill = factor(BornAUS))) + 471 | geom_ribbon(aes(ymin = LL, ymax = UL), 472 | alpha = .25, colour = NA) + 473 | geom_line(size = 1) + 474 | ylab("Predicted Awakenings") + 475 | xlab("Within person awakenings lag 1") + 476 | scale_color_viridis("Born in Australia", discrete = TRUE) + 477 | scale_fill_viridis("Born in Australia", discrete = TRUE) + 478 | theme( 479 | legend.position = "bottom", 480 | legend.key.width = unit(1, "cm")) + 481 | facet_wrap(~ Age) + 482 | coord_cartesian( 483 | xlim = c(-3, 3), 484 | ylim = c(0, 2.5), 485 | expand = FALSE) 486 | 487 | 488 | -------------------------------------------------------------------------------- /glmmi.R: -------------------------------------------------------------------------------- 1 | ## ----setup, include=TRUE, results='hide', message = FALSE, warning = FALSE, cache=FALSE---- 2 | 3 | 4 | library(checkpoint) 5 | checkpoint("2018-09-28", R.version = "3.5.1", 6 | project = book_directory, 7 | checkpointLocation = checkpoint_directory, 8 | scanForPackages = FALSE, 9 | scan.rnw.with.knitr = TRUE, use.knitr = TRUE) 10 | 11 | 12 | library(knitr) 13 | library(ggplot2) 14 | library(cowplot) 15 | library(viridis) 16 | library(JWileymisc) 17 | library(data.table) 18 | library(lme4) 19 | library(lmerTest) 20 | library(chron) 21 | library(zoo) 22 | library(pander) 23 | library(texreg) 24 | 25 | options(width = 70, digits = 2) 26 | 27 | 28 | ## ------------------------------------------------------------------------ 29 | 30 | ex.wide <- data.table( 31 | ID = c(1, 2, 3), 32 | SBPT1 = c(135, 120, 121), 33 | SBPT2 = c(130, 125, 125), 34 | SBPT3 = c(125, 121, NA)) 35 | 36 | print(ex.wide) 37 | 38 | 39 | reshape( 40 | data = ex.wide, 41 | varying = list(paste0("SBPT", 1:3)), 42 | v.names = c("SBP"), 43 | idvar = "ID", 44 | direction = "long") 45 | 46 | 47 | ## ------------------------------------------------------------------------ 48 | 49 | ex.long <- data.table( 50 | ID = c(1, 1, 1, 2, 2, 2, 3, 3), 51 | SBP = c(135, 130, 125, 120, 125, 121, 121, 125), 52 | Time = c(1, 2, 3, 1, 2, 3, 1, 2)) 53 | 54 | print(ex.long) 55 | 56 | reshape( 57 | data = ex.long, 58 | v.names = "SBP", 59 | timevar = "Time", 60 | sep = "T", 61 | idvar = "ID", 62 | direction = "wide") 63 | 64 | 65 | ## ------------------------------------------------------------------------ 66 | 67 | data(aces_daily) 68 | str(aces_daily) 69 | 70 | 71 | ## ----cache=FALSE--------------------------------------------------------- 72 | 73 | draw <- as.data.table(aces_daily) 74 | draw <- draw[order(UserID, SurveyDay, SurveyInteger)] 75 | draw[, UserID := factor(UserID)] 76 | 77 | tmpdata <- draw[!is.na(SurveyDay) & !is.na(SurveyInteger)][, .( 78 | MinD = min(SurveyDay), 79 | MinS = min(SurveyInteger[SurveyDay == min(SurveyDay)]), 80 | MaxD = max(SurveyDay), 81 | MaxS = max(SurveyInteger[SurveyDay == max(SurveyDay)])), 82 | by = UserID] 83 | 84 | tmpdata <- tmpdata[, .( 85 | SurveyInteger = c( 86 | MinS:3L, #first day 87 | rep(1L:3L, times = MaxD - MinD - 1), #all days between first/last 88 | 1L:MaxS), #last day 89 | SurveyDay = as.Date(rep(MinD:MaxD, c( 90 | 4L - MinS, #first day 91 | rep(3, MaxD - MinD - 1), #all days between first/last 92 | MaxS)), origin = "1970-01-01")), #lastday 93 | by = UserID] 94 | 95 | d <- merge(draw, tmpdata, by = c("UserID", "SurveyDay", "SurveyInteger"), 96 | all = TRUE) 97 | 98 | nrow(draw) 99 | nrow(d) 100 | 101 | nrow(draw)/nrow(d) 102 | 103 | 104 | ## ----fglmmi-varplot, fig.width=4, fig.height=6, out.width='.8\\linewidth', fig.pos="!h", fig.cap = "Plot showing hypothetical data with high between variance and low between variance. In the high between variance, observations within a person vary little, but there are large individual differences. In low between variance, there are not many individual differences, but large variability within each person"---- 105 | 106 | set.seed(1234) 107 | ex.data.1 <- data.table( 108 | ID = factor(rep(1:4, each = 10)), 109 | time = rep(1:10, times = 4), 110 | y = rnorm(40, rep(1:4, each = 10), .2)) 111 | 112 | ex.data.2 <- data.table( 113 | ID = factor(rep(1:4, each = 10)), 114 | time = rep(1:10, times = 4), 115 | y = rnorm(40, 2.5, 1)) 116 | 117 | plot_grid( 118 | ggplot(ex.data.1, 119 | aes(time, y, colour = ID, shape = ID)) + 120 | stat_smooth(method = "lm", formula = y ~ 1, se=FALSE) + 121 | geom_point() + 122 | scale_color_viridis(discrete = TRUE), 123 | ggplot(ex.data.2, 124 | aes(time, y, colour = ID, shape = ID)) + 125 | stat_smooth(method = "lm", formula = y ~ 1, se=FALSE) + 126 | geom_point() + 127 | scale_color_viridis(discrete = TRUE), 128 | ncol = 1, 129 | labels = c( 130 | "High Between Variance", 131 | "Low Between Variance"), 132 | align = "hv") 133 | 134 | 135 | ## ------------------------------------------------------------------------ 136 | 137 | ## mean and SD on all observations 138 | egltable("PosAff", data = d) 139 | 140 | ## mean and SD first averaging within ID 141 | egltable("PosAff", 142 | data = d[, .( 143 | PosAff = mean(PosAff, na.rm = TRUE)), 144 | by = UserID]) 145 | 146 | ## mean and SD on first observations 147 | egltable("PosAff", data = d[ 148 | order(UserID, SurveyDay, SurveyInteger)][, 149 | .(PosAff = PosAff[1]), by = UserID]) 150 | 151 | 152 | ## ------------------------------------------------------------------------ 153 | 154 | tab <- egltable(c("Female", "Age", "BornAUS", "SES_1", "EDU"), 155 | data = d[!duplicated(UserID)], 156 | strict = FALSE) 157 | tab 158 | 159 | 160 | ## ----fglmmi-likplot, fig.width=7, fig.height=4, out.width='.8\\linewidth', fig.pos="!h", fig.cap = "Plot showing average coping ratings for women and men."---- 161 | 162 | ## create a dataset of the means and labels by gender 163 | copeplotdata <- d[!is.na(Female), .( 164 | M = c( 165 | mean(COPEPrb, na.rm = TRUE), 166 | mean(COPEPrc, na.rm = TRUE), 167 | mean(COPEExp, na.rm = TRUE), 168 | mean(COPEDis, na.rm = TRUE)), 169 | Var = 1:4, 170 | Low = sprintf("I usually don't do this at all\n[%s]", 171 | c("Problem Focused", "Emotional Processing", 172 | "Emotional Expression", "Disengagement")), 173 | High = sprintf("I usually do this a lot\n[%s]", 174 | c("Problem Focused", "Emotional Processing", 175 | "Emotional Expression", "Disengagement"))), 176 | by = Female] 177 | 178 | ## coded 0/1 but for plotting, R needs to know 179 | ## it is discrete not a continuous number 180 | copeplotdata[, Female := factor(Female)] 181 | 182 | ## create a plot 183 | gglikert(x = "M", y = "Var", leftLab = "Low", rightLab = "High", 184 | data = copeplotdata, colour = "Female", 185 | xlim = c(1, 4), title = "Average Coping") + 186 | scale_colour_manual(values = 187 | c("1" = "grey70", "0" = "grey30")) 188 | 189 | 190 | ## ----fglmmi-likplotaf, fig.width=7, fig.height=3, out.width='.8\\linewidth', fig.pos="!h", fig.cap = "Plot showing average coping ratings for women and men."---- 191 | 192 | ## create a dataset of the means and labels by stress 193 | afplotdata <- d[!is.na(STRESS), .( 194 | M = c( 195 | mean(PosAff, na.rm = TRUE), 196 | mean(NegAff, na.rm = TRUE)), 197 | Var = 1:2, 198 | Low = sprintf("Very Slightly or\nNot at all\n[%s]", 199 | c("Positive Affect", "Negative Affect")), 200 | High = sprintf("Extremely\n\n[%s]", 201 | c("Positive Affect", "Negative Affect"))), 202 | by = .(Stress = STRESS > 5)] 203 | 204 | ## add labels to understand stress 205 | afplotdata[, Stress := factor(Stress, levels = c(FALSE, TRUE), 206 | labels = c("<= 5", "> 5"))] 207 | 208 | ## create a plot 209 | gglikert(x = "M", y = "Var", leftLab = "Low", rightLab = "High", 210 | data = afplotdata, colour = "Stress", 211 | xlim = c(1, 5), title = "Affect by Stress") + 212 | scale_colour_manual(values = 213 | c("<= 5" = "grey70", "> 5" = "grey30")) 214 | 215 | 216 | ## ------------------------------------------------------------------------ 217 | 218 | d[, Survey := factor(SurveyInteger, levels = 1:3, 219 | labels = c("Morning", "Afternoon", "Evening"))] 220 | 221 | egltable(c("PosAff", "NegAff", "STRESS"), g = "Survey", 222 | data = d[, .( 223 | PosAff = mean(PosAff, na.rm = TRUE), 224 | NegAff = mean(NegAff, na.rm = TRUE), 225 | STRESS = mean(STRESS, na.rm = TRUE) 226 | ), by = .(UserID, Survey)]) 227 | 228 | 229 | ## ------------------------------------------------------------------------ 230 | 231 | d[, BPosAff := mean(PosAff, na.rm = TRUE), by = UserID] 232 | d[, WPosAff := PosAff - BPosAff] 233 | 234 | egltable("BPosAff", data = d[!duplicated(UserID)]) 235 | egltable("WPosAff", data = d) 236 | 237 | 238 | ## ------------------------------------------------------------------------ 239 | 240 | ## define a new function 241 | bwmean <- function(x, na.rm = TRUE) { 242 | m <- mean(x, na.rm = na.rm) 243 | list(m, x - m) 244 | } 245 | 246 | ## apply it to affect, support, and stress, by ID 247 | d[, c("BNegAff", "WNegAff") := bwmean(NegAff), by = UserID] 248 | d[, c("BSUPPORT", "WSUPPORT") := bwmean(SUPPORT), by = UserID] 249 | d[, c("BSTRESS", "WSTRESS") := bwmean(STRESS), by = UserID] 250 | 251 | 252 | ## ------------------------------------------------------------------------ 253 | 254 | d[, .( 255 | NCope = sum(!is.na(COPEPrb)), 256 | NSOLs = sum(!is.na(SOLs))), 257 | by = Survey] 258 | 259 | 260 | ## ------------------------------------------------------------------------ 261 | 262 | d[, BCOPEPrb := mean(COPEPrb, na.rm = TRUE), by = UserID] 263 | d[, WCOPEPrb := na.omit(COPEPrb) - BCOPEPrb, 264 | by = .(UserID, SurveyDay)] 265 | d[, BCOPEPrc := mean(COPEPrc, na.rm = TRUE), by = UserID] 266 | d[, WCOPEPrc := na.omit(COPEPrc) - BCOPEPrc, 267 | by = .(UserID, SurveyDay)] 268 | d[, BCOPEExp := mean(COPEExp, na.rm = TRUE), by = UserID] 269 | d[, WCOPEExp := na.omit(COPEExp) - BCOPEExp, 270 | by = .(UserID, SurveyDay)] 271 | d[, BCOPEDis := mean(COPEDis, na.rm = TRUE), by = UserID] 272 | d[, WCOPEDis := na.omit(COPEDis) - BCOPEDis, 273 | by = .(UserID, SurveyDay)] 274 | 275 | d[, BSOLs := mean(SOLs, na.rm = TRUE), by = UserID] 276 | d[, WSOLs := na.omit(SOLs) - BSOLs, 277 | by = .(UserID, SurveyDay)] 278 | d[, BWASONs := mean(WASONs, na.rm = TRUE), by = UserID] 279 | d[, WWASONs := na.omit(WASONs) - BWASONs, 280 | by = .(UserID, SurveyDay)] 281 | 282 | 283 | ## ------------------------------------------------------------------------ 284 | 285 | iccMixed("NegAff", "UserID", d) 286 | 287 | iccMixed("PosAff", "UserID", d) 288 | 289 | 290 | ## ------------------------------------------------------------------------ 291 | 292 | ## number of units 293 | n <- length(unique(d$UserID)) 294 | 295 | ## average observations per unit 296 | k <- nrow(d[!is.na(NegAff)])/n 297 | 298 | ## effective sample size 299 | nEffective(n, k, dv = "NegAff", id = "UserID", data = d) 300 | 301 | k <- nrow(d[!is.na(PosAff)])/n 302 | nEffective(n, k, dv = "PosAff", id = "UserID", data = d) 303 | 304 | 305 | ## ------------------------------------------------------------------------ 306 | 307 | tmp <- meanDecompose(PosAff ~ UserID, data = d) 308 | str(tmp, max.level = 1) 309 | 310 | 311 | ## ----fglmmi-bdistpa, fig.width=4, fig.height=5, out.width='.4\\linewidth', fig.pos="!h", fig.cap = "Between person positive affect against a normal distribution."---- 312 | 313 | testdistr(tmp[[1]]$X, varlab = names(tmp)[1], 314 | extremevalues = "theoretical", robust=TRUE) 315 | 316 | 317 | ## ----fglmmi-mdistpa, fig.width=8, fig.height=8, out.width='.7\\linewidth', fig.pos="!h", fig.cap = "Between and within person positive affect against a normal distribution."---- 318 | 319 | plots <- lapply(names(tmp), function(x) { 320 | testdistr(tmp[[x]]$X, plot = FALSE, varlab = x, 321 | extremevalues = "theoretical", robust=TRUE)[1:2] 322 | }) 323 | 324 | do.call(plot_grid, c(unlist(plots, FALSE), ncol = 2)) 325 | 326 | 327 | ## ----fglmmi-mdistna, fig.width=8, fig.height=12, out.width='.8\\linewidth', fig.pos="!h", fig.cap = "Different levels of negative affect against a normal distribution."---- 328 | 329 | tmp <- meanDecompose(NegAff ~ UserID + SurveyDay, data = d) 330 | do.call(plot_grid, c(unlist(lapply(names(tmp), function(x) { 331 | testdistr(tmp[[x]]$X, plot = FALSE, varlab = x, 332 | extremevalues = "theoretical", robust=TRUE)[1:2] 333 | }), FALSE), ncol = 2)) 334 | 335 | 336 | ## ----fglmmi-mdistlogna, fig.width=8, fig.height=12, out.width='.8\\linewidth', fig.pos="!h", fig.cap = "Different levels of the natural logarithm of negative affect against a normal distribution."---- 337 | 338 | d[, logNegAff := log(NegAff)] 339 | tmp <- meanDecompose(logNegAff ~ UserID + SurveyDay, data = d) 340 | do.call(plot_grid, c(unlist(lapply(names(tmp), function(x) { 341 | testdistr(tmp[[x]]$X, plot = FALSE, varlab = x, 342 | extremevalues = "theoretical", robust=TRUE)[1:2] 343 | }), FALSE), ncol = 2)) 344 | 345 | 346 | ## ----fglmmi-timetrends, fig.width=8, fig.height=6, out.width='.8\\linewidth', fig.pos="!h", fig.cap = "Trends in variables over time with a gam smooth.", warning=FALSE---- 347 | 348 | dt <- d[, .( 349 | WPosAff = mean(WPosAff, na.rm = TRUE), 350 | WNegAff = mean(WNegAff, na.rm = TRUE), 351 | WSTRESS = mean(WSTRESS, na.rm = TRUE), 352 | WSUPPORT = mean(WSUPPORT, na.rm = TRUE), 353 | WSOLs = mean(WSOLs, na.rm = TRUE), 354 | WWASONs = mean(WWASONs, na.rm = TRUE)) , by = SurveyDay] 355 | dt <- melt(dt, id.var = "SurveyDay") 356 | 357 | ggplot(dt, aes(SurveyDay, value)) + 358 | geom_point() + 359 | stat_smooth(method = "gam", formula = y ~ s(x, k = 10)) + 360 | facet_wrap(~ variable, scales = "free") 361 | 362 | 363 | ## ----fglmmi-weekend, fig.width=8, fig.height=6, out.width='.8\\linewidth', fig.pos="!h", fig.cap = "Trends in variables over time with a gam smooth."---- 364 | 365 | dt[, Weekend := weekdays(SurveyDay) %in% c("Saturday", "Sunday")] 366 | 367 | ggplot(dt, aes(Weekend, value)) + 368 | stat_summary(fun.data = mean_cl_boot) + 369 | facet_wrap(~ variable, scales = "free") 370 | 371 | 372 | ## ------------------------------------------------------------------------ 373 | 374 | d[, StartTimec11Alt := ifelse(is.na(SurveyStartTimec11), 375 | mean(SurveyStartTimec11, na.rm = TRUE), 376 | SurveyStartTimec11), 377 | by = .(UserID, Survey)] 378 | d[, StartDayTimec11Alt := chron( 379 | dates. = format(SurveyDay, "%m/%d/%Y"), 380 | times. = StartTimec11Alt)] 381 | 382 | 383 | ## ----fglmmi-acf1, fig.width=5, fig.height=4, out.width='.4\\linewidth', fig.pos="!h", fig.cap = "Autocorrelation for one participant.", warning=FALSE---- 384 | 385 | tmpd <- d[UserID == 1] 386 | acf(na.approx(zoo(tmpd$PosAff, 387 | order.by = tmpd$StartDayTimec11Alt)), 388 | lag.max = 10) 389 | 390 | 391 | ## ----fglmmi-acfall, fig.width=4, fig.height=8, out.width='.6\\linewidth', fig.pos="!h", fig.cap = "Autocorrelation for all participant for positive and negative affect and stress.", message = FALSE, warning = FALSE---- 392 | 393 | acf.posaff <- acfByID("PosAff", "StartDayTimec11Alt", 394 | "UserID", d) 395 | 396 | print(acf.posaff) 397 | 398 | ## make for other measures 399 | acf.negaff <- acfByID("NegAff", "StartDayTimec11Alt", 400 | "UserID", d) 401 | acf.stress <- acfByID("STRESS", "StartDayTimec11Alt", 402 | "UserID", d) 403 | 404 | ## put into one dataset for plotting a panel 405 | acf.all <- rbind( 406 | acf.posaff, acf.negaff, 407 | acf.stress) 408 | 409 | ggplot(acf.all, 410 | aes(factor(Lag), y = AutoCorrelation)) + 411 | geom_hline(yintercept = 0, colour = "grey50", size = 1) + 412 | geom_hline(yintercept = c(-.5, .5), 413 | linetype = 2, colour = "grey50", size = 1) + 414 | geom_boxplot() + ylab("Auto Correlation") + 415 | facet_wrap(~ Variable, ncol = 1) 416 | 417 | 418 | ## ------------------------------------------------------------------------ 419 | 420 | ## ensure data ordered by ID, date, and time 421 | d <- d[order(UserID, SurveyDay, SurveyInteger)] 422 | ## calculate a number for the survey from 1 to total 423 | d[, USURVEYID := 1:.N, by = .(UserID)] 424 | 425 | d[, 426 | c("NegAffLag1", "WNegAffLag1", 427 | "PosAffLag1", "WPosAffLag1", 428 | "STRESSLag1", "WSTRESSLag1") := 429 | .SD[.(UserID = UserID, USURVEYID = USURVEYID - 1), 430 | .(NegAff, WNegAff, 431 | PosAff, WPosAff, 432 | STRESS, WSTRESS), 433 | on = c("UserID", "USURVEYID")]] 434 | 435 | d[, 436 | c("WCOPEPrbLag1", "WCOPEPrcLag1", 437 | "WCOPEExpLag1", "WCOPEDisLag1", 438 | "WSOLsLag1", "WWASONsLag1") := 439 | .SD[.(UserID = UserID, Survey = Survey, SurveyDay = SurveyDay - 1), 440 | .(WCOPEPrb, WCOPEPrc, WCOPEExp, WCOPEDis, 441 | WSOLs, WWASONs), 442 | on = c("UserID", "Survey", "SurveyDay")]] 443 | 444 | ## save data after processing, with compression 445 | ## for use in subsequent chapters 446 | saveRDS(d, file = "aces_daily_sim_processed.RDS", 447 | compress = "xz") 448 | 449 | 450 | ## ----fglmmi-glmmdiagnegaff, fig.width=8, fig.height=10, out.width='.7\\linewidth', fig.pos="!h", fig.cap = "Negative affect mixed effects model diagnostic plots showing the distribution of residuals (top left), the residuals versus fitted values to assess homogeneity of variance (top right), the distribution of the random intercept (middle left), the distribution of the random slope (middle right), and whether the random effects are multivariate normal (bottom left).", warning = FALSE---- 451 | 452 | m.negaff <- lmer(NegAff ~ 1 + BSTRESS + WSTRESS + 453 | (1 + WSTRESS | UserID), data = d) 454 | assumptiontests <- plotDiagnosticsLMER(m.negaff, plot = FALSE) 455 | do.call(plot_grid, c( 456 | assumptiontests[c("ResPlot", "ResFittedPlot")], 457 | assumptiontests$RanefPlot, ncol = 2)) 458 | 459 | 460 | ## ----fglmmi-glmmdiagposaff, fig.width=8, fig.height=10, out.width='.8\\linewidth', fig.pos="!h", fig.cap = "Positive affect mixed effects model diagnostic plots showing the distribution of residuals (top left), the residuals versus fitted values to assess homogeneity of variance (top right), the distribution of the random intercept (middle left), the distribution of the random slope (middle right), and whether the random effects are multivariate normal (bottom left).", warning = FALSE---- 461 | 462 | m.posaff <- lmer(PosAff ~ 1 + BSTRESS + WSTRESS + 463 | (1 + WSTRESS | UserID), data = d) 464 | 465 | assumptiontests <- plotDiagnosticsLMER(m.posaff, plot = FALSE) 466 | do.call(plot_grid, c( 467 | assumptiontests[c("ResPlot", "ResFittedPlot")], 468 | assumptiontests$RanefPlot, ncol = 2)) 469 | 470 | 471 | ## ----fglmmi-glmmdiagposaff2, fig.width=8, fig.height=10, out.width='.8\\linewidth', fig.pos="!h", fig.cap = "Positive affect mixed effects model diagnostic plots showing the distribution of residuals (top left), the residuals versus fitted values to assess homogeneity of variance (top right), the distribution of the random intercept (middle left), the distribution of the random slope (middle right), and whether the random effects are multivariate normal (bottom left). Results after removing two multivariate outliers, IDs 57 and 123.", warning = FALSE---- 472 | 473 | assumptiontests$ExtremeValues[ 474 | EffectType == "Multivariate Random Effect UserID"] 475 | 476 | m.posaff <- lmer(PosAff ~ 1 + BSTRESS + WSTRESS + 477 | (1 + WSTRESS | UserID), 478 | data = d[!UserID %in% c(57, 123)]) 479 | 480 | assumptiontests <- plotDiagnosticsLMER(m.posaff, plot = FALSE) 481 | do.call(plot_grid, c( 482 | assumptiontests[c("ResPlot", "ResFittedPlot")], 483 | assumptiontests$RanefPlot, ncol = 2)) 484 | 485 | 486 | 487 | 488 | ## ----fglmmi-posaffext, fig.width=5, fig.height=5, out.width='.5\\linewidth', fig.pos="!h", fig.cap = "Positive affect and stress associations, highlight extreme cases", warning = FALSE---- 489 | 490 | ggplot() + 491 | stat_smooth(aes(WSTRESS, PosAff, group = UserID), 492 | data = d[!UserID %in% c(123)], method = "lm", 493 | se = FALSE, colour = "grey50") + 494 | stat_smooth(aes(WSTRESS, PosAff, group = UserID), 495 | data = d[UserID %in% c(123)], method = "lm", 496 | se = FALSE, colour = "blue", size = 2) + 497 | geom_point(aes(WSTRESS, PosAff), 498 | data = d[UserID %in% c(123)], colour = "blue", size = 2) + 499 | stat_smooth(aes(WSTRESS, PosAff, group = UserID), 500 | data = d[UserID %in% c(57)], method = "lm", 501 | se = FALSE, colour = "orange", size = 2) + 502 | geom_point(aes(WSTRESS, PosAff), 503 | data = d[UserID %in% c(57)], colour = "orange", size = 2) 504 | 505 | 506 | -------------------------------------------------------------------------------- /iiv.R: -------------------------------------------------------------------------------- 1 | ## ----setup, include=TRUE, results='hide', message = FALSE, warning = FALSE, cache=FALSE---- 2 | 3 | library(checkpoint) 4 | checkpoint("2018-09-28", R.version = "3.5.1", 5 | project = book_directory, 6 | checkpointLocation = checkpoint_directory, 7 | scanForPackages = FALSE, 8 | scan.rnw.with.knitr = TRUE, use.knitr = TRUE) 9 | 10 | library(knitr) 11 | library(ggplot2) 12 | library(cowplot) 13 | library(viridis) 14 | library(data.table) 15 | library(JWileymisc) 16 | library(varian) 17 | library(mice) 18 | library(parallel) 19 | 20 | options(width = 70, digits = 2) 21 | 22 | 23 | ## ------------------------------------------------------------------------ 24 | 25 | ## ordered 26 | sd(c(1, 3, 5, 7, 9)) 27 | rmssd(c(1, 3, 5, 7, 9)) 28 | 29 | ## randomized 30 | sd(c(3, 1, 9, 5, 7)) 31 | rmssd(c(3, 1, 9, 5, 7)) 32 | 33 | 34 | ## ------------------------------------------------------------------------ 35 | 36 | data(aces_daily) 37 | draw <- as.data.table(aces_daily) 38 | d <- readRDS("aces_daily_sim_processed.RDS") 39 | 40 | variability_measures <- function(x) { 41 | x <- na.omit(x) 42 | list( 43 | SD = sd(x), 44 | VAR = sd(x)^2, 45 | RMSSD = rmssd(x), 46 | MSSD = rmssd(x)^2, 47 | MAD = median(abs(x - median(x))), 48 | RANGE = range(x), 49 | IQR = abs(diff(quantile(x, probs = c(.25, .75)))), 50 | CV = sd(x) / mean(x)) 51 | } 52 | 53 | 54 | ## ----fiiv-varplot, fig.width=14, fig.height=14, out.width='1\\linewidth', fig.pos="!h", fig.cap = "Correlations between variability measures for applied to individuals across four different measures."---- 55 | 56 | plot_grid( 57 | plot(SEMSummary(~ ., 58 | data = d[, variability_measures(PosAff), by = UserID][,-1]), 59 | order = "asis") + 60 | ggtitle("PosAff"), 61 | plot(SEMSummary(~ ., 62 | data = d[, variability_measures(NegAff), by = UserID][,-1]), 63 | order = "asis") + 64 | ggtitle("NegAff"), 65 | plot(SEMSummary(~ ., 66 | data = d[, variability_measures(COPEPrc), by = UserID][,-1]), 67 | order = "asis") + 68 | ggtitle("COPEPrc"), 69 | plot(SEMSummary(~ ., 70 | data = d[, variability_measures(SOLs), by = UserID][,-1]), 71 | order = "asis") + 72 | ggtitle("SOLs"), 73 | ncol = 2) 74 | 75 | 76 | ## ----fiiv-varplot2, fig.width=7, fig.height=7, out.width='.6\\linewidth', fig.pos="!h", fig.cap = "Two hypothetical people given an intervention, Person A (purple) and Person B (yellow), both improve at about the same rate, but Person B is less consistent."---- 77 | 78 | iivdat <- data.table( 79 | Assessment = 0:15, 80 | PersonA = c(1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10), 81 | PersonB = c(2, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11, 8, 12)) 82 | 83 | ggplot(iivdat, aes(Assessment)) + 84 | stat_smooth(aes(y = PersonA), method = "lm", se=FALSE, 85 | colour = viridis(2)[1], linetype = 2) + 86 | geom_line(aes(y = PersonA), 87 | colour = viridis(2)[1], size = 1) + 88 | stat_smooth(aes(y = PersonB), method = "lm", se=FALSE, 89 | colour = viridis(2)[2], linetype = 2) + 90 | geom_line(aes(y = PersonB), 91 | colour = viridis(2)[2], size = 1) + 92 | ylab("Outcome Scores") 93 | 94 | 95 | ## ------------------------------------------------------------------------ 96 | 97 | ## ISD 98 | sd(iivdat$PersonA) 99 | sd(iivdat$PersonB) 100 | 101 | 102 | ## ------------------------------------------------------------------------ 103 | 104 | ## ISD, after removing systematic improvements 105 | sd(resid(lm(PersonA ~ Assessment, data = iivdat))) 106 | sd(resid(lm(PersonB ~ Assessment, data = iivdat))) 107 | 108 | 109 | ## ------------------------------------------------------------------------ 110 | 111 | cl <- makeCluster(2) 112 | clusterExport(cl, c("book_directory", "checkpoint_directory" )) 113 | 114 | clusterEvalQ(cl, { 115 | library(checkpoint) 116 | checkpoint("2018-09-28", R.version = "3.5.1", 117 | project = book_directory, 118 | checkpointLocation = checkpoint_directory, 119 | scanForPackages = FALSE, 120 | scan.rnw.with.knitr = TRUE, use.knitr = TRUE) 121 | 122 | library(varian) 123 | }) 124 | 125 | system.time(m <- varian( 126 | y.formula = BNegAff ~ 1, 127 | v.formula = PosAff ~ 1 | UserID, 128 | data = d, 129 | design = "V -> Y", 130 | useU = TRUE, 131 | totaliter = 10000, 132 | warmup = 500, thin = 5, 133 | chains = 2, verbose=TRUE, 134 | cl = cl)) 135 | 136 | 137 | ## ----fiiv-bvmdiag1, fig.width=9, fig.height=14, out.width='.6\\linewidth', fig.pos="!h", fig.cap = "Diagnostics including percent scale reduction factor (Rhat), effective sample size, distribution of the individual standard deviations, distribution of the individual means, and individual estimates of individual standard deviations and means with credible intervals."---- 138 | 139 | ## check diagnostics 140 | vm_diagnostics(m) 141 | 142 | 143 | ## ----fiiv-bvmres1, fig.width=8, fig.height=8, out.width='.6\\linewidth', fig.pos="!h", fig.cap = "Plots of the distributions, bivariate scatter plot, and proportion of cases on each side of zero for empirical p-values."---- 144 | 145 | ## extract MCMC samples 146 | mcmc.samples <- extract(m$results, 147 | permute = TRUE) 148 | 149 | ## examine MCMC samples of 150 | ## the alpha regression coefficients 151 | vmp_plot(mcmc.samples$Yalpha) 152 | 153 | 154 | ## ------------------------------------------------------------------------ 155 | 156 | ## intercept of average negative affect 157 | param_summary(mcmc.samples$YB[, 1]) 158 | 159 | ## IIV on average negative affect 160 | param_summary(mcmc.samples$Yalpha[, 1]) 161 | 162 | ## individual mean on average negative affect 163 | param_summary(mcmc.samples$Yalpha[, 2]) 164 | 165 | ## residual error of average negative affect 166 | param_summary(mcmc.samples$sigma_Y) 167 | 168 | 169 | ## ------------------------------------------------------------------------ 170 | 171 | ## intercept of positive affect 172 | param_summary(mcmc.samples$VB[, 1]) 173 | 174 | ## positive affect random intercept standard deviation 175 | param_summary(mcmc.samples$sigma_U) 176 | 177 | ## estimate of the gamma rate parameter for IIVs 178 | param_summary(mcmc.samples$rate) 179 | 180 | ## estimate of the gamma shape parameter for IIVs 181 | param_summary(mcmc.samples$shape) 182 | 183 | ## ------------------------------------------------------------------------ 184 | 185 | dim(mcmc.samples$Sigma_V) 186 | str(mcmc.samples$Sigma_V) 187 | 188 | 189 | ## ------------------------------------------------------------------------ 190 | 191 | avg_dataset <- cbind( 192 | d[!duplicated(UserID), .(BNegAff)], 193 | IIV = colMeans(mcmc.samples$Sigma_V), 194 | IIM = colMeans(mcmc.samples$U)) 195 | 196 | avg_model <- lm(BNegAff ~ IIV + IIM, data = avg_dataset) 197 | 198 | summary(avg_model) 199 | 200 | 201 | ## ------------------------------------------------------------------------ 202 | 203 | ind_dataset <- lapply(seq(1, 1000, by = 10), function(i) { 204 | cbind( 205 | d[!duplicated(UserID), .(BNegAff)], 206 | IIV = mcmc.samples$Sigma_V[i, ], 207 | IIM = mcmc.samples$U[i, ]) 208 | }) 209 | 210 | 211 | ind_model <- lapply(ind_dataset, function(tmpdat) { 212 | lm(BNegAff ~ IIV + IIM, data = tmpdat) 213 | }) 214 | 215 | ind_model_pooled <- pool(as.mira(ind_model)) 216 | 217 | 218 | ## ------------------------------------------------------------------------ 219 | 220 | raw_model <- lm(BNegAff ~ IIV + IIM, 221 | data = d[, .(BNegAff = BNegAff[1], 222 | IIV = sd(PosAff, na.rm = TRUE), 223 | IIM = mean(PosAff, na.rm = TRUE)), 224 | by = UserID]) 225 | 226 | 227 | ## ------------------------------------------------------------------------ 228 | 229 | ## Bayesian Results 230 | param_summary(mcmc.samples$YB[, 1]) ## intercept 231 | param_summary(mcmc.samples$Yalpha[, 1]) ## IIV 232 | param_summary(mcmc.samples$Yalpha[, 2]) ## IIM 233 | 234 | ## using averages only 235 | cbind(B = coef(avg_model), confint(avg_model)) 236 | 237 | ## using raw ISDs 238 | cbind(B = coef(raw_model), confint(raw_model)) 239 | 240 | ## treating as multiply imputed 241 | summary(ind_model_pooled, conf.int = TRUE) 242 | 243 | 244 | -------------------------------------------------------------------------------- /intro.R: -------------------------------------------------------------------------------- 1 | ## ----setup, include=TRUE, results='hide', message = FALSE, warning = FALSE, cache=FALSE---- 2 | library(checkpoint) 3 | checkpoint("2018-09-28", R.version = "3.5.1", 4 | project = book_directory, 5 | checkpointLocation = checkpoint_directory, 6 | scanForPackages = FALSE, 7 | scan.rnw.with.knitr = TRUE, use.knitr = TRUE) 8 | 9 | library(data.table) 10 | 11 | options( 12 | width = 70, 13 | stringsAsFactors = FALSE, 14 | digits = 2) 15 | 16 | ## ------------------------------------------------------------------------ 17 | 18 | load("../ICPSR_04690/DS0001/04690-0001-Data.rda") 19 | ls() 20 | 21 | acl <- as.data.table(da04690.0001) 22 | acl <- acl[, .( 23 | V2, V1801, V2101, V2064, 24 | V3007, V2623, V2636, V2640, 25 | V2000, 26 | V2200, V2201, V2202, 27 | V2613, V2614, V2616, 28 | V2618, V2681, 29 | V7007, V6623, V6636, V6640, 30 | V6201, V6202, 31 | V6613, V6614, V6616, 32 | V6618, V6681 33 | )] 34 | 35 | setnames(acl, names(acl), c( 36 | "ID", "Sex", "RaceEthnicity", "SESCategory", 37 | "Employment_W1", "BMI_W1", "Smoke_W1", "PhysActCat_W1", 38 | "AGE_W1", 39 | "SWL_W1", "InformalSI_W1", "FormalSI_W1", 40 | "SelfEsteem_W1", "Mastery_W1", "SelfEfficacy_W1", 41 | "CESD11_W1", "NChronic12_W1", 42 | "Employment_W2", "BMI_W2", "Smoke_W2", "PhysActCat_W2", 43 | "InformalSI_W2", "FormalSI_W2", 44 | "SelfEsteem_W2", "Mastery_W2", "SelfEfficacy_W2", 45 | "CESD11_W2", "NChronic12_W2" 46 | )) 47 | 48 | acl[, ID := factor(ID)] 49 | acl[, SESCategory := factor(SESCategory)] 50 | acl[, SWL_W1 := SWL_W1 * -1] 51 | 52 | saveRDS(acl, "advancedr_acl_data.RDS", compress = "xz") 53 | 54 | 55 | -------------------------------------------------------------------------------- /md.R: -------------------------------------------------------------------------------- 1 | ## ----setup, include=TRUE, results='hide', message = FALSE, warning = FALSE, cache=FALSE---- 2 | 3 | library(checkpoint) 4 | checkpoint("2018-09-28", R.version = "3.5.1", 5 | project = book_directory, 6 | checkpointLocation = checkpoint_directory, 7 | scanForPackages = FALSE, 8 | scan.rnw.with.knitr = TRUE, use.knitr = TRUE) 9 | 10 | library(knitr) 11 | library(ggplot2) 12 | library(cowplot) 13 | library(lattice) 14 | library(viridis) 15 | library(VIM) 16 | 17 | library(mice) 18 | library(micemd) 19 | library(parallel) 20 | 21 | library(data.table) 22 | library(xtable) 23 | library(JWileymisc) # has data 24 | 25 | options(width = 70, digits = 2) 26 | 27 | 28 | ## ----cache=FALSE--------------------------------------------------------- 29 | 30 | ## load example dataset 31 | data("aces_daily") 32 | draw <- as.data.table(aces_daily)[order(UserID)] 33 | davg <- na.omit(draw[, .( 34 | Female = na.omit(Female)[1], 35 | Age = na.omit(Age)[1], 36 | SES_1 = na.omit(SES_1)[1], 37 | EDU = na.omit(EDU)[1], 38 | STRESS = mean(STRESS, na.rm = TRUE), 39 | SUPPORT = mean(SUPPORT, na.rm = TRUE), 40 | PosAff = mean(PosAff, na.rm = TRUE), 41 | NegAff = mean(NegAff, na.rm = TRUE)), 42 | by = UserID]) 43 | 44 | 45 | ## ------------------------------------------------------------------------ 46 | 47 | ## missing depending on support and stress 48 | davg[, MissingProb := ifelse( 49 | SUPPORT < 5, 50 | ifelse(STRESS > 2.5, .4, .0), 51 | ifelse(STRESS > 2.5, 0, .4))] 52 | 53 | set.seed(1234) 54 | davgmiss <- copy(davg) 55 | davgmiss[, PosAff := ifelse(rbinom( 56 | .N, size = 1, prob = MissingProb) == 1, 57 | NA, PosAff)] 58 | davgmiss[, NegAff := ifelse(rbinom( 59 | .N, size = 1, prob = MissingProb) == 1, 60 | NA, NegAff)] 61 | ## random missingness on stress and support 62 | davgmiss[, STRESS := ifelse(rbinom( 63 | .N, size = 1, prob = .1) == 1, 64 | NA, STRESS)] 65 | davgmiss[, SUPPORT := ifelse(rbinom( 66 | .N, size = 1, prob = .1) == 1, 67 | NA, SUPPORT)] 68 | davgmiss[, Age := ifelse(rbinom( 69 | .N, size = 1, prob = .1) == 1, 70 | NA, Age)] 71 | davgmiss[, SES_1 := ifelse(rbinom( 72 | .N, size = 1, prob = .1) == 1, 73 | NA, SES_1)] 74 | davgmiss[, Female := factor(ifelse(rbinom( 75 | .N, size = 1, prob = .1) == 1, 76 | NA, Female), levels = 0:1, 77 | labels = c("Male", "Female"))] 78 | davgmiss[, EDU := factor(ifelse(rbinom( 79 | .N, size = 1, prob = .1) == 1, 80 | NA, EDU), levels = 0:1, 81 | labels = c("< Uni Graduate", "Uni Graduate +"))] 82 | ## drop unneeded variables to make analysis easier 83 | davgmiss[, MissingProb := NULL] 84 | davgmiss[, UserID := NULL] 85 | 86 | 87 | ## ----fmd-missplot, fig.width=12, fig.height=12/1.1, out.width='.8\\linewidth', fig.pos="!h", fig.cap = "Visual summary of the missingness by variable and missingness patterns."---- 88 | 89 | aggr(davgmiss, prop = TRUE, 90 | numbers = TRUE) 91 | 92 | 93 | ## ----fmd-missbi, fig.width=9, fig.height=9, out.width='1\\linewidth', fig.pos="!h", fig.cap = "Bivariate plots of missing data. Central dots show the non-missing data. Margin dots show missing data. Boxplots summarize each variable by whether the other variable is missing or present."---- 94 | 95 | par(mfrow = c(2, 2)) 96 | marginplot(davgmiss[,.(STRESS, NegAff)]) 97 | marginplot(davgmiss[,.(SUPPORT, NegAff)]) 98 | marginplot(davgmiss[,.(STRESS, PosAff)]) 99 | marginplot(davgmiss[,.(SUPPORT, PosAff)]) 100 | 101 | 102 | ## ------------------------------------------------------------------------ 103 | 104 | ## does age differ by missing on negative affect? 105 | t.test(Age ~ is.na(NegAff), data = davgmiss)$p.value 106 | 107 | ## does age differ by missing on positive affect? 108 | t.test(Age ~ is.na(PosAff), data = davgmiss)$p.value 109 | 110 | ## does stress differ by missing on negative affect? 111 | t.test(STRESS ~ is.na(NegAff), data = davgmiss)$p.value 112 | 113 | ## does stress differ by missing on positive affect? 114 | t.test(STRESS ~ is.na(PosAff), data = davgmiss)$p.value 115 | 116 | ## does social support differ by missing on negative affect? 117 | t.test(SUPPORT ~ is.na(NegAff), data = davgmiss)$p.value 118 | 119 | ## does social support differ by missing on positive affect? 120 | t.test(SUPPORT ~ is.na(PosAff), data = davgmiss)$p.value 121 | 122 | 123 | ## ------------------------------------------------------------------------ 124 | 125 | system.time(mi.1 <- mice( 126 | davgmiss, 127 | m = 6, maxit = 10, 128 | defaultMethod = c("norm", "logreg", "polyreg", "polr"), 129 | seed = 1234, printFlag = FALSE) 130 | ) 131 | 132 | 133 | ## ----fmd-micediag, fig.width=7, fig.height=9, out.width='1\\linewidth', fig.pos="!h", fig.cap = c("mice diagnostics for convergence", "mice diagnostics for convergence after more iterations")---- 134 | 135 | ## plot convergence diagnostics 136 | plot(mi.1, PosAff + NegAff + SUPPORT ~ .it | .ms) 137 | 138 | ## run an additional iterations 139 | system.time(mi.1 <- mice.mids( 140 | mi.1, maxit = 10, 141 | printFlag = FALSE) 142 | ) 143 | 144 | ## plot convergence diagnostics 145 | plot(mi.1, PosAff + NegAff + SUPPORT ~ .it | .ms) 146 | 147 | 148 | ## ----fmd-micediag2, fig.width=7, fig.height=7, out.width='1\\linewidth', fig.pos="!h", fig.cap = c("Univariate density plots for observed and imputed data, separated by imputation.", "Bivariate scatter plots with imputed data colored separately by observed and imputed data.")---- 149 | 150 | densityplot(mi.1, ~ PosAff + NegAff + SUPPORT + STRESS) 151 | 152 | xyplot(mi.1, NegAff + PosAff ~ STRESS + SUPPORT) 153 | 154 | 155 | ## ------------------------------------------------------------------------ 156 | 157 | lm.1 <- with(mi.1, lm(PosAff ~ STRESS + Age + EDU + Female)) 158 | 159 | lm.1 160 | 161 | 162 | ## ----fmd-modeldiag, fig.width=6, fig.height=6, out.width='.8\\linewidth', fig.pos="!h", fig.cap = c("Linear regression model diagnostics from first imputed dataset.")---- 163 | 164 | par(mfcol = c(2,2 )) 165 | plot(lm.1$analyses[[1]]) 166 | par(mfcol = c(1,1)) 167 | 168 | 169 | ## ----echo=TRUE, eval=FALSE----------------------------------------------- 170 | ## xtable(summary(pool(lm.1), conf.int=TRUE), 171 | ## digits = 2, 172 | ## caption = "Regression results pooled across multiply imputed data", 173 | ## label = "tmd-pooledres1") 174 | 175 | ## ----echo=FALSE, results='asis', listings=FALSE-------------------------- 176 | xtable(summary(pool(lm.1), conf.int=TRUE), 177 | digits = 2, 178 | caption = "Regression results pooled across multiply imputed data", 179 | label = "tmd-pooledres1") 180 | 181 | ## ------------------------------------------------------------------------ 182 | 183 | pool.r.squared(lm.1) 184 | 185 | 186 | ## ----echo=TRUE, eval=FALSE----------------------------------------------- 187 | ## xtable(summary(pool(lm.1), type = "all", conf.int=TRUE), 188 | ## digits = 2, 189 | ## caption = "Regression results pooled across multiply imputed data with additional information", 190 | ## label = "tmd-pooledres1alt") 191 | 192 | ## ----echo=FALSE, results='asis', listings=FALSE-------------------------- 193 | xtable(summary(pool(lm.1), type = "all", conf.int=TRUE), 194 | digits = 2, 195 | caption = "Regression results pooled across multiply imputed data with additional information", 196 | label = "tmd-pooledres1alt") 197 | 198 | ## ----fmd-predhat, fig.width=5, fig.height=4, out.width='.5\\linewidth', fig.pos="!h", fig.cap = c("Pooled predictions from linear regression models of the association between stress and positive affect.")---- 199 | 200 | newdat <- data.frame( 201 | STRESS = seq(from = 0, to = 6, length.out = 100), 202 | Age = mean(davg$Age), 203 | EDU = factor("< Uni Graduate", levels = levels(davgmiss$EDU)), 204 | Female = factor("Female", levels = levels(davgmiss$Female))) 205 | 206 | newdat$PosAff <- rowMeans(sapply(1:6, function(i) { 207 | predict(lm.1$analyses[[i]], newdata = newdat) 208 | })) 209 | 210 | ggplot(newdat, aes(STRESS, PosAff)) + 211 | geom_line() 212 | 213 | 214 | ## ------------------------------------------------------------------------ 215 | 216 | cl <- makeCluster(2) 217 | clusterExport(cl, c("book_directory", "checkpoint_directory" )) 218 | 219 | clusterEvalQ(cl, { 220 | library(checkpoint) 221 | checkpoint("2018-09-28", R.version = "3.5.1", 222 | project = book_directory, 223 | checkpointLocation = checkpoint_directory, 224 | scanForPackages = FALSE, 225 | scan.rnw.with.knitr = TRUE, use.knitr = TRUE) 226 | library(mice) 227 | library(randomForest) 228 | library(data.table) 229 | }) 230 | 231 | imputation_seeds <- c( 232 | 403L, 2L, 2118700268L, 1567504751L, 233 | -161759579L, -1822093220L) 234 | 235 | clusterExport(cl, c("davgmiss", "imputation_seeds")) 236 | 237 | 238 | ## ------------------------------------------------------------------------ 239 | 240 | system.time(mi.par <- parLapplyLB(cl, 1:6, function(i) { 241 | mice( 242 | davgmiss, 243 | m = 1, maxit = 20, 244 | defaultMethod = c("norm", "logreg", "polyreg", "polr"), 245 | seed = imputation_seeds[i]) 246 | })) 247 | 248 | 249 | ## ------------------------------------------------------------------------ 250 | 251 | ## combine the separate imputations into a single object 252 | mi.par2 <- ibind(mi.par[[1]], mi.par[[2]]) 253 | for (i in 3:6) { 254 | mi.par2 <- ibind(mi.par2, mi.par[[i]]) 255 | } 256 | 257 | mi.par2 258 | 259 | 260 | ## ----fmd-rfdiag, fig.width=7, fig.height=7, out.width='.8\\linewidth', fig.pos="!h", fig.cap = c("Convergence diagnostics for random forest imputation model.", "Density plots of observed and imputed values from random forest model.", "Scatter plots of affect versus stress and social support for observed and imputed values.")---- 261 | 262 | system.time(mi.rfpar <- parLapplyLB(cl, 1:6, function(i) { 263 | mice( 264 | davgmiss, 265 | m = 1, maxit = 30, 266 | method = "rf", 267 | seed = imputation_seeds[i], 268 | ntree = 500, nodesize = 10) 269 | })) 270 | 271 | ## combine into a single object 272 | mi.rf <- ibind(mi.rfpar[[1]], mi.rfpar[[2]]) 273 | for (i in 3:6) { 274 | mi.rf <- ibind(mi.rf, mi.rfpar[[i]]) 275 | } 276 | 277 | ## plot convergence diagnostics 278 | plot(mi.rf, PosAff + NegAff + SUPPORT ~ .it | .ms) 279 | 280 | ## model diagnostics 281 | densityplot(mi.rf, ~ PosAff + NegAff + SUPPORT + STRESS) 282 | 283 | xyplot(mi.rf, NegAff + PosAff ~ STRESS + SUPPORT) 284 | 285 | 286 | ## ----fmd-micompare, fig.width=7, fig.height=5, out.width='.8\\linewidth', fig.pos="!h", fig.cap = c("Pooled predictions from linear regression models of the association between stress and positive affect.")---- 287 | 288 | m.true <- lm(PosAff ~ STRESS + Age + EDU + Female, data = davg) 289 | m.cc <- lm(PosAff ~ STRESS + Age + EDU + Female, data = davgmiss) 290 | m.mireg <- summary(pool(with(mi.1, 291 | lm(PosAff ~ STRESS + Age + EDU + Female))), 292 | conf.int = TRUE) 293 | m.mirf <- summary(pool(with(mi.rf, 294 | lm(PosAff ~ STRESS + Age + EDU + Female))), 295 | conf.int = TRUE) 296 | 297 | res.true <- as.data.table(cbind(coef(m.true), confint(m.true))) 298 | res.cc <- as.data.table(cbind(coef(m.cc), confint(m.cc))) 299 | res.mireg <- as.data.table(m.mireg[, c("estimate", "2.5 %", "97.5 %")]) 300 | res.mirf <- as.data.table(m.mirf[, c("estimate", "2.5 %", "97.5 %")]) 301 | setnames(res.true, c("B", "LL", "UL")) 302 | setnames(res.cc, c("B", "LL", "UL")) 303 | setnames(res.mireg, c("B", "LL", "UL")) 304 | setnames(res.mirf, c("B", "LL", "UL")) 305 | 306 | res.compare <- rbind( 307 | cbind(Type = "Truth", Param = names(coef(m.true)), res.true), 308 | cbind(Type = "CC", Param = names(coef(m.true)), res.cc), 309 | cbind(Type = "MI Reg", Param = names(coef(m.true)), res.mireg), 310 | cbind(Type = "MI RF", Param = names(coef(m.true)), res.mirf)) 311 | 312 | ggplot(res.compare, aes(factor(""), 313 | y = B, ymin = LL, ymax = UL, colour = Type)) + 314 | geom_pointrange(position = position_dodge(.4)) + 315 | scale_color_viridis(discrete = TRUE) + 316 | facet_wrap(~Param, scales = "free") + 317 | theme( 318 | legend.position = c(1, 0), 319 | legend.justification = c("right", "bottom")) 320 | 321 | ## clean up cluster 322 | stopCluster(cl) 323 | rm(cl) 324 | 325 | 326 | ## ------------------------------------------------------------------------ 327 | 328 | cl <- makeCluster(2) 329 | clusterExport(cl, c("book_directory", "checkpoint_directory" )) 330 | 331 | clusterEvalQ(cl, { 332 | library(checkpoint) 333 | checkpoint("2018-09-28", R.version = "3.5.1", 334 | project = book_directory, 335 | checkpointLocation = checkpoint_directory, 336 | scanForPackages = FALSE, 337 | scan.rnw.with.knitr = TRUE, use.knitr = TRUE) 338 | library(mice) 339 | library(randomForest) 340 | library(data.table) 341 | }) 342 | 343 | 344 | ## ------------------------------------------------------------------------ 345 | 346 | ## example of how to have R return some seed values 347 | dput(.Random.seed[1:5]) 348 | 349 | ## random seeds 350 | imputation_seeds <- c(403L, 148L, -1767993668L, 351 | 1417792552L, 298386660L, 1360311820L, 352 | 1356573822L, -1472988872L, 1215046494L, 759520201L, 353 | 1399305648L, -455288776L, 969619279L, 518793662L, 354 | -383967014L, -1983801345L, -698559309L, 1957301883L, 355 | -1457959076L, 1321574932L, -537238757L, 356 | 11573466L, 1466816383L, -2113923363L, 1663041018L) 357 | 358 | clusterExport(cl, c("davgmiss", "imputation_seeds")) 359 | 360 | 361 | ## ----fmd-rfcdx, fig.width=9, fig.height=6, out.width='1\\linewidth', fig.pos="!h", fig.cap = "Density plots for continuous variables to be included in the imputation model"---- 362 | 363 | ggplot(melt(davgmiss[, sapply(davgmiss, is.numeric), 364 | with = FALSE], measure.vars = 1:6), aes(value)) + 365 | geom_density() + geom_rug() + 366 | facet_wrap(~variable, scales = "free") 367 | 368 | 369 | ## ------------------------------------------------------------------------ 370 | 371 | start.time <- proc.time() 372 | mi.rfpar1 <- parLapplyLB(cl, 1:4, function(i) { 373 | mice( 374 | davgmiss, 375 | m = 1, maxit = 5, 376 | method = "rf", 377 | seed = imputation_seeds[i], 378 | ntree = 100, nodesize = 10) 379 | }) 380 | stop.time <- proc.time() 381 | 382 | ## estimate of how long it took 383 | stop.time - start.time 384 | 385 | ## combine into a single object 386 | mi.rf1 <- ibind(mi.rfpar1[[1]], mi.rfpar1[[2]]) 387 | for (i in 3:4) { 388 | mi.rf1 <- ibind(mi.rf1, mi.rfpar1[[i]]) 389 | } 390 | 391 | 392 | ## ----fmd-rfcase1, fig.width=7, fig.height=7, out.width='.8\\linewidth', fig.pos="!h", fig.cap = c("Convergence diagnostics for random forest imputation model.")---- 393 | 394 | ## plot convergence diagnostics 395 | plot(mi.rf1, NegAff + STRESS + Age ~ .it | .ms) 396 | 397 | 398 | ## ----fmd-rfcase2, fig.width=7, fig.height=7, out.width='.8\\linewidth', fig.pos="!h", fig.cap = c("Density plots of observed and imputed values from random forest model.")---- 399 | 400 | ## model diagnostics for continuous study variables 401 | densityplot(mi.rf1, ~ NegAff + STRESS + Age) 402 | 403 | 404 | ## ----fmd-rfcase3, fig.width=7, fig.height=6, out.width='.8\\linewidth', fig.pos="!h", fig.cap = c("Distribution plot (density and QQ Deviates) for model residuals.")---- 405 | 406 | ## fit the models 407 | fit.mirf1 <- with(mi.rf1, 408 | lm(NegAff ~ STRESS + Age + EDU + Female + SES_1)) 409 | 410 | testdistr(unlist(lapply(fit.mirf1$analyses, rstandard))) 411 | 412 | 413 | ## ------------------------------------------------------------------------ 414 | 415 | ## pool results and summarize 416 | m.mirf1 <- summary(pool(fit.mirf1), conf.int = TRUE) 417 | 418 | 419 | ## ----echo=TRUE, eval=FALSE----------------------------------------------- 420 | ## xtable(m.mirf1, 421 | ## digits = 2, 422 | ## caption = "Regression results pooled across multiply imputed data test run", 423 | ## label = "tmd-pooledres2") 424 | 425 | ## ----echo=FALSE, results='asis', listings=FALSE-------------------------- 426 | xtable(m.mirf1, 427 | digits = 2, 428 | caption = "Regression results pooled across multiply imputed data test run", 429 | label = "tmd-pooledres2") 430 | 431 | ## ----fmd-rfcase4, fig.width=7, fig.height=7, out.width='.8\\linewidth', fig.pos="!h", fig.cap = c("Convergence diagnostics for random forest imputation model.", "Density plots of observed and imputed values from random forest model.")---- 432 | 433 | start.time2 <- proc.time() 434 | mi.rfpar2 <- parLapplyLB(cl, 1:10, function(i) { 435 | mice( 436 | davgmiss, 437 | m = 1, maxit = 30, 438 | method = "rf", 439 | seed = imputation_seeds[i], 440 | ntree = 100, nodesize = 10) 441 | }) 442 | stop.time2 <- proc.time() 443 | 444 | ## time taken 445 | stop.time2 - start.time2 446 | 447 | ## combine into a single object 448 | mi.rf2 <- ibind(mi.rfpar2[[1]], mi.rfpar2[[2]]) 449 | for (i in 3:10) { 450 | mi.rf2 <- ibind(mi.rf2, mi.rfpar2[[i]]) 451 | } 452 | 453 | 454 | ## ----fmd-rfcase5, fig.width=7, fig.height=7, out.width='.8\\linewidth', fig.pos="!h", fig.cap = c("Convergence diagnostics for random forest imputation model.")---- 455 | 456 | ## plot convergence diagnostics 457 | plot(mi.rf2, NegAff + STRESS + Age ~ .it | .ms) 458 | 459 | 460 | ## ----fmd-rfcase6, fig.width=7, fig.height=7, out.width='.8\\linewidth', fig.pos="!h", fig.cap = c("Density plots of observed and imputed values from random forest model.")---- 461 | 462 | ## model diagnostics for continuous study variables 463 | densityplot(mi.rf2, ~ NegAff + STRESS + Age) 464 | 465 | 466 | ## ----fmd-rfcase7, fig.width=7, fig.height=6, out.width='.8\\linewidth', fig.pos="!h", fig.cap = c("Distribution plot (density and QQ Deviates) for model residuals.")---- 467 | 468 | ## fit the models 469 | fit.mirf2 <- with(mi.rf2, 470 | lm(NegAff ~ STRESS + Age + EDU + Female + SES_1)) 471 | 472 | testdistr(unlist(lapply(fit.mirf2$analyses, rstandard))) 473 | 474 | 475 | ## ------------------------------------------------------------------------ 476 | 477 | ## pool results and summarize 478 | m.mirf2 <- summary(pool(fit.mirf2), conf.int = TRUE) 479 | 480 | 481 | ## ----echo=TRUE, eval=FALSE----------------------------------------------- 482 | ## xtable(m.mirf2, 483 | ## digits = 2, 484 | ## caption = "Regression results pooled across multiply imputed data final run", 485 | ## label = "tmd-pooledres3") 486 | 487 | ## ----echo=FALSE, results='asis', listings=FALSE-------------------------- 488 | xtable(m.mirf2, 489 | digits = 2, 490 | caption = "Regression results pooled across multiply imputed data final run", 491 | label = "tmd-pooledres3") 492 | 493 | -------------------------------------------------------------------------------- /mdv.R: -------------------------------------------------------------------------------- 1 | ## ----setup, include=TRUE, results='hide', message = FALSE, warning = FALSE, cache=FALSE---- 2 | 3 | library(checkpoint) 4 | checkpoint("2018-09-28", R.version = "3.5.1", 5 | project = book_directory, 6 | checkpointLocation = checkpoint_directory, 7 | scanForPackages = FALSE, 8 | scan.rnw.with.knitr = TRUE, use.knitr = TRUE) 9 | 10 | library(knitr) 11 | library(ggplot2) 12 | library(cowplot) 13 | library(MASS) 14 | library(mvtnorm) 15 | library(mgcv) 16 | library(quantreg) 17 | library(JWileymisc) 18 | library(data.table) 19 | 20 | options(width = 70, digits = 2) 21 | 22 | 23 | ## ----fmdv-density2d, fig.width=4, fig.height=4, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = "2D empirical density plot for multivariate normal data."---- 24 | 25 | mu <- c(0, 0) 26 | sigma <- matrix(c(1, .5, .5, 1), 2) 27 | 28 | set.seed(1234) 29 | d <- as.data.table(rmvnorm(500, mean = mu, sigma = sigma)) 30 | setnames(d, names(d), c("x", "y")) 31 | 32 | ggplot(d, aes(x, y)) + 33 | geom_point(colour = "grey60") + 34 | geom_density2d(size = 1, colour = "black") + 35 | theme_cowplot() 36 | 37 | 38 | ## ----fmdv-density2dnorm, fig.width=4, fig.height=4, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = "2D empirical density versus multivariate normal density plot."---- 39 | 40 | testd <- as.data.table(expand.grid( 41 | x = seq(from = min(d$x), to = max(d$x), length.out = 50), 42 | y = seq(from = min(d$y), to = max(d$y), length.out = 50))) 43 | testd[, Density := dmvnorm(cbind(x, y), mean = colMeans(d), sigma = cov(d))] 44 | 45 | ggplot(d, aes(x, y)) + 46 | geom_contour(aes(x, y, z = Density), data = testd, 47 | colour = "blue", size = 1, linetype = 2) + 48 | geom_density2d(size = 1, colour = "black") + 49 | theme_cowplot() 50 | 51 | 52 | ## ----fmdv-uninorm, fig.width=8, fig.height=4, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "Univariate density plots showing the simulated variables are univariate normal."---- 53 | 54 | set.seed(1234) 55 | d2 <- data.table(x = rnorm(500)) 56 | d2[, y := ifelse(abs(x) > 1, x, -x)] 57 | 58 | plot_grid( 59 | testdistr(d2$x, plot = FALSE)$Density, 60 | testdistr(d2$y, plot = FALSE, varlab = "Y")$Density, 61 | ncol = 2) 62 | 63 | 64 | ## ----fmdv-2dnotnorm, fig.width=8, fig.height=4, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "2D density plot showing data that are not multivariate normal."---- 65 | 66 | testd2 <- as.data.table(expand.grid( 67 | x = seq(from = min(d2$x), to = max(d2$x), length.out = 50), 68 | y = seq(from = min(d2$y), to = max(d2$y), length.out = 50))) 69 | testd2[, Density := dmvnorm(cbind(x, y), mean = colMeans(d2), sigma = cov(d2))] 70 | 71 | ggplot(d2, aes(x, y)) + 72 | geom_contour(aes(x, y, z = Density), data = testd2, 73 | colour = "blue", size = 1, linetype = 2) + 74 | geom_density2d(size = 1, colour = "black") + 75 | theme_cowplot() 76 | 77 | 78 | ## ----fmdv-testdistr-mvnorm, fig.width=8, fig.height=4, out.width='1\\linewidth', fig.pos="!ht", fig.cap = c("Density plot superimposing multivariate normal distribution and QQ plot, showing multivariate normal data.", "Density plot superimposing multivariate normal distribution and QQ plot, showing data that are not multivariate normal.")---- 79 | 80 | testdistr(d, "mvnorm", ncol = 2) 81 | 82 | testdistr(d2, "mvnorm", ncol = 2) 83 | 84 | 85 | ## ----fmdv-mtcars-mvnorm, fig.width=8, fig.height=4, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "Density plot superimposing multivariate normal distribution and QQ plot for mtcars data."---- 86 | 87 | testdistr(mtcars, "mvnorm", ncol = 2) 88 | 89 | 90 | ## ----fmdv-uvoutlier, fig.width=8, fig.height=4, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "Density plot superimposing normal distribution for data with an anomalous value (Panel A) and with the anomalous value removed (Panel B)."---- 91 | 92 | mu <- c(0, 0, 0) 93 | sigma <- matrix(.7, 3, 3) 94 | diag(sigma) <- 1 95 | 96 | set.seed(12345) 97 | d <- as.data.table(rmvnorm(200, mean = mu, sigma = sigma))[order(V1)] 98 | d[c(1, 200), V3 := c(2.2, 50)] 99 | 100 | plot_grid( 101 | testdistr(d$V3, extremevalues = "theoretical", plot=FALSE)$Density, 102 | testdistr(d[V3 < 40]$V3, extremevalues = "theoretical", plot=FALSE)$Density, 103 | ncol = 2, labels = c("A", "B")) 104 | 105 | 106 | ## ----fmdv-mvoutliers, fig.width=8, fig.height=4, out.width='1\\linewidth', fig.pos="!ht", fig.cap = paste0("Graph of multivariate normality and (multivariate) anomalous values", c(".", " with one extreme anomalous value removed."))---- 107 | 108 | testdistr(d, "mvnorm", ncol = 2, extremevalues = "theoretical") 109 | 110 | testdistr(d[V3 < 40], "mvnorm", ncol = 2, extremevalues = "theoretical") 111 | 112 | 113 | ## ----fmdv-mvrobust, fig.width=8, fig.height=4, out.width='1\\linewidth', fig.pos="!ht", fig.cap = paste0("Graph of multivariate normality and (multivariate) anomalous values", c(" using the robust estimator.", " with both anomalous value removed."))---- 114 | 115 | testdistr(d, "mvnorm", ncol = 2, robust = TRUE, extremevalues = "theoretical") 116 | 117 | testdistr(d[-c(1,200)], "mvnorm", ncol = 2, extremevalues = "theoretical") 118 | 119 | 120 | ## ----fmdv-boxplot-cut, fig.width=8, fig.height=8, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "Boxplots from cutting a continuous variable into quartiles showing a non-linear relationship."---- 121 | 122 | set.seed(12345) 123 | d2 <- data.table(x = rnorm(100)) 124 | d2[, y := rnorm(100, mean = 2 + x + 2 * x^2, sd = 3)] 125 | 126 | p.cut3 <- ggplot( 127 | data = d2[, .(y, 128 | xcut = cut(x, quantile(x, 129 | probs = seq(0, 1, by = 1/3)), include.lowest = TRUE))], 130 | aes(xcut, y)) + 131 | geom_boxplot(width=.25) + 132 | theme(axis.text.x = element_text( 133 | angle = 45, hjust = 1, vjust = 1)) + 134 | xlab("") 135 | 136 | p.cut4 <- p.cut3 %+% d2[, .(y, 137 | xcut = cut(x, quantile(x, 138 | probs = seq(0, 1, by = 1/4)), include.lowest = TRUE))] 139 | 140 | p.cut5 <- p.cut3 %+% d2[, .(y, 141 | xcut = cut(x, quantile(x, 142 | probs = seq(0, 1, by = 1/5)), include.lowest = TRUE))] 143 | 144 | p.cut10 <- p.cut3 %+% d2[, .(y, 145 | xcut = cut(x, quantile(x, 146 | probs = seq(0, 1, by= 1/10)), include.lowest = TRUE))] 147 | 148 | plot_grid( 149 | p.cut3, p.cut4, 150 | p.cut5, p.cut10, 151 | ncol = 2, 152 | labels = c("A", "B", "C", "D"), 153 | align = "hv") 154 | 155 | 156 | ## ----fmdv-loessline, fig.width=4, fig.height=4, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = "Loess line of best fit showing a non-linear relationship."---- 157 | 158 | ggplot(d2, aes(x, y)) + 159 | geom_point(colour="grey50") + 160 | stat_smooth(method = "loess", colour = "black") + 161 | stat_smooth(method = "lm", colour = "blue", linetype = 2) 162 | 163 | 164 | ## ----fmdv-loesslineq, fig.width=4, fig.height=4, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = "Loess line and quadratic line."---- 165 | 166 | ggplot(d2, aes(x, y)) + 167 | geom_point(colour="grey50") + 168 | stat_smooth(method = "loess", colour = "black") + 169 | stat_smooth(method = "lm", 170 | formula = y ~ x + I(x^2), 171 | colour = "blue", linetype = 2) 172 | 173 | 174 | ## ----fmdv-loess-span, fig.width=4, fig.height=4, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = "Loess line with varying degree of smoothing."---- 175 | 176 | ggplot(d2, aes(x, y)) + 177 | geom_point(colour="grey50") + 178 | stat_smooth(method = "loess", span = .2, 179 | colour = "black") + 180 | stat_smooth(method = "loess", span = 2, 181 | colour = "black", linetype = 2) 182 | 183 | 184 | ## ----fmdv-loess-mvbv, fig.width=4, fig.height=4, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = "Loess line of bivariate relationship from multivariate data."---- 185 | 186 | set.seed(1234) 187 | d3 <- data.table( 188 | x = rnorm(500), 189 | w = rnorm(500), 190 | z = rbinom(500, 1, .4)) 191 | d3[, y := rnorm(500, mean = 3 + 192 | ifelse(x < 0 & w < 0, -2, 0) * x + 193 | ifelse(x < 0, 0, 2) * w * x^2 + 4 * z * w, 194 | sd = 1)] 195 | d3[, z := factor(z)] 196 | 197 | ggplot(d3, aes(x, y)) + 198 | geom_point(colour="grey50") + 199 | stat_smooth(method = "loess", colour = "black") 200 | 201 | 202 | ## ----fmdv-loess-mv, fig.width=8, fig.height=8, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "Loess lines for multivariate data."---- 203 | 204 | ggplot(d3, aes(x, winsorizor(y, .01), colour = z)) + 205 | geom_point() + 206 | stat_smooth(method = "loess") + 207 | scale_colour_manual(values = c("1" = "black", "0" = "grey40")) + 208 | facet_wrap(~ cut(w, quantile(w), include.lowest = TRUE)) 209 | 210 | 211 | ## ----fmdv-gam-pred, fig.width=8, fig.height=4, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "Contour plots showing predicted y values for different combinations of x and w panelled by z."---- 212 | 213 | m <- gam(winsorizor(y, .01) ~ z + te(x, w, k = 7, by = z), data = d3) 214 | 215 | newdat <- expand.grid( 216 | x = seq(min(d3$x), max(d3$x), length.out = 100), 217 | w = seq(min(d3$w), max(d3$w), length.out = 100), 218 | z = factor(0:1, levels = levels(d3$z))) 219 | 220 | newdat$yhat <- predict(m, newdata = newdat) 221 | 222 | ggplot(newdat, aes(x = x, y = w, z = yhat)) + 223 | geom_raster(aes(fill = yhat)) + 224 | geom_contour(colour = "white", binwidth = 1, alpha = .5) + 225 | facet_wrap(~ z) 226 | 227 | 228 | ## ------------------------------------------------------------------------ 229 | 230 | diris <- as.data.table(iris) 231 | diris[, .(V = var(Sepal.Length)), by = Species] 232 | 233 | 234 | ## ----fmdv-slboxplot, fig.width=8, fig.height=4, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "Box and Whisker Diagrams of sepal length by species. Outliers appear as dots."---- 235 | 236 | plot_grid( 237 | ggplot(diris, aes(Species, Sepal.Length)) + 238 | geom_boxplot() + 239 | xlab(""), 240 | ggplot(diris[, .(Sepal.Length = Sepal.Length - 241 | median(Sepal.Length)), by = Species], 242 | aes(Species, Sepal.Length)) + 243 | geom_boxplot() + 244 | xlab(""), 245 | ncol = 2, labels = c("A", "B"), align = "hv") 246 | 247 | 248 | ## ----fmdv-violinplot, fig.width=4, fig.height=4, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = "Violin plots with box and whisker diagrams in center."---- 249 | 250 | ggplot(diris, aes(Species, Sepal.Length)) + 251 | geom_violin() + 252 | geom_boxplot(width = .1) + 253 | xlab("") 254 | 255 | 256 | ## ----fmdv-mvviolinplot, fig.width=8, fig.height=8, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "Violin plots with box and whisker diagrams in center by quartiles of x, colored by z."---- 257 | 258 | ## create cuts 259 | d3[, xquartile := cut(x, quantile(x), include.lowest = TRUE)] 260 | d3[, wquartile := cut(w, quantile(w), include.lowest = TRUE)] 261 | d3[, yclean := winsorizor(y, .01)] 262 | 263 | ## median center y by group to facilitate comparison 264 | d3[, yclean := yclean - median(yclean), 265 | by = .(xquartile, wquartile, z)] 266 | 267 | p <- position_dodge(.5) 268 | 269 | ggplot(d3, aes(xquartile, yclean, colour = z)) + 270 | geom_violin(position = p) + 271 | geom_boxplot(position = p, width = .1) + 272 | scale_colour_manual(values = c("1" = "black", "0" = "grey40")) + 273 | facet_wrap(~ wquartile) + 274 | theme(axis.text.x = element_text(angle = 45, hjust=1, vjust=1)) + 275 | coord_cartesian(ylim = c(-5, 5), expand = FALSE) 276 | 277 | 278 | ## ------------------------------------------------------------------------ 279 | 280 | set.seed(1234) 281 | d4 <- data.table(x = runif(500, 0, 5)) 282 | d4[, y1 := rnorm(500, mean = 2 + x, sd = 1)] 283 | d4[, y2 := rnorm(500, mean = 2 + x, sd = .25 + x)] 284 | 285 | 286 | ## ----fmdv-quantreg, fig.width=8, fig.height=4, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "Homoskedasticity versus Heteroskedasticity.", warning=FALSE---- 287 | 288 | plot_grid( 289 | ggplot(d4, aes(x, y1)) + 290 | geom_point(colour = "grey70") + 291 | geom_quantile(quantiles = .5, colour = 'black') + 292 | geom_quantile(quantiles = c(.25, .75), 293 | colour = 'blue', linetype = 2) + 294 | geom_quantile(quantiles = c(.05, .95), 295 | colour = 'black', linetype = 3), 296 | ggplot(d4, aes(x, y2)) + 297 | geom_point(colour = "grey70") + 298 | geom_quantile(quantiles = .5, colour = 'black') + 299 | geom_quantile(quantiles = c(.25, .75), 300 | colour = 'blue', linetype = 2) + 301 | geom_quantile(quantiles = c(.05, .95), 302 | colour = 'black', linetype = 3), 303 | ncol = 2, labels = c("A", "B")) 304 | 305 | 306 | -------------------------------------------------------------------------------- /mli.R: -------------------------------------------------------------------------------- 1 | ## ----setup, include=TRUE, results='hide', message = FALSE, warning = FALSE, cache=FALSE---- 2 | 3 | library(checkpoint) 4 | checkpoint("2018-09-28", R.version = "3.5.1", 5 | project = book_directory, 6 | checkpointLocation = checkpoint_directory, 7 | scanForPackages = FALSE, 8 | scan.rnw.with.knitr = TRUE, use.knitr = TRUE) 9 | 10 | library(knitr) 11 | library(tidyverse) 12 | library(rsample) 13 | library(data.table) 14 | library(boot) 15 | library(parallel) 16 | library(foreach) 17 | library(doParallel) 18 | 19 | options(width = 70, digits = 3) 20 | 21 | 22 | ## ------------------------------------------------------------------------ 23 | 24 | set.seed(5) 25 | case_data <- initial_split(data = iris, prop = 0.8) 26 | case_data 27 | 28 | 29 | ## ------------------------------------------------------------------------ 30 | data_train <- training(case_data) 31 | data_test <- testing(case_data) 32 | glimpse(data_train) 33 | 34 | ## ------------------------------------------------------------------------ 35 | unique(data_train$Species) 36 | 37 | ## ------------------------------------------------------------------------ 38 | length.lm = lm(Petal.Length ~ Sepal.Length + 39 | Sepal.Width + Petal.Width, 40 | data = data_train) 41 | length.lm 42 | summary(length.lm) 43 | 44 | ## ------------------------------------------------------------------------ 45 | sqrt( 46 | sum( 47 | (fitted(length.lm)-data_train$Petal.Length)^2 48 | )/(nrow(data_train)-4) 49 | ) 50 | 51 | ## ------------------------------------------------------------------------ 52 | mse_train<- mean(length.lm$residuals^2) 53 | mse_train 54 | 55 | ## ------------------------------------------------------------------------ 56 | sqrt( 57 | sum( 58 | (predict(length.lm, data_test)-data_test$Petal.Length)^2 59 | )/(nrow(data_test)-2) 60 | ) 61 | 62 | mse_test <- mean((predict(length.lm, data_test) - 63 | data_test$Petal.Length)^2) 64 | mse_test 65 | 66 | ## ----mli-qqplot, fig.width=7, fig.height=6, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "2D and Q-Q plots of some data"---- 67 | par(mfrow=c(2,2)) 68 | plot(data_train$Sepal.Length, data_train$Petal.Length) 69 | plot(data_test$Sepal.Length, data_test$Petal.Length) 70 | qqnorm(data_train$Petal.Length, 71 | xlab = "Theoretical Quantiles Train") 72 | qqnorm(data_test$Petal.Length, 73 | xlab = "Theoretical Quantiles Test") 74 | 75 | ## ------------------------------------------------------------------------ 76 | summary(data_train$Sepal.Length) 77 | summary(data_test$Sepal.Length) 78 | 79 | ## ------------------------------------------------------------------------ 80 | crossData <- iris %>% 81 | sample_n(nrow(iris), replace = FALSE) 82 | crossData <- add_column(crossData, 83 | Bin = cut(1:150, breaks = 5, labels = c(1:5))) 84 | store <- tibble(Fold=1:5, MSE=NA_integer_) 85 | 86 | ## ------------------------------------------------------------------------ 87 | for(i in 1:5){ 88 | data_train<-crossData %>% filter(Bin != i) 89 | data_test<-crossData %>% filter(Bin == i) 90 | lengthFold.lm = lm(Petal.Length ~ Sepal.Length + 91 | Sepal.Width + Petal.Width, 92 | data = data_train) 93 | store[i,]$MSE <- mean((predict(lengthFold.lm, data_test) - 94 | data_test$Petal.Length)^2) 95 | } 96 | 97 | ## ------------------------------------------------------------------------ 98 | mse_k <- mean(store$MSE) 99 | mse_k 100 | 101 | ## ------------------------------------------------------------------------ 102 | lengthFold.lm <- lm(Petal.Length ~ Sepal.Length + 103 | Sepal.Width + Petal.Width, 104 | data = iris) 105 | lengthFold.lm 106 | mse_ALL <- mean(lengthFold.lm$residuals^2) 107 | mse_ALL 108 | 109 | ## ------------------------------------------------------------------------ 110 | store 111 | 112 | ## ------------------------------------------------------------------------ 113 | mse <- function(data, i) { 114 | lengthBoot.lm <- lm(Petal.Length ~ Sepal.Length + 115 | Sepal.Width + Petal.Width, 116 | data=data[i,]) 117 | return(mean(lengthBoot.lm$residuals^2)) 118 | } 119 | 120 | ## ------------------------------------------------------------------------ 121 | bootResults <- boot(data=iris, statistic=mse, R=10000) 122 | bootResults 123 | 124 | ## ----mli-btplot1, fig.width=6, fig.height=4, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "Boot Strap Results"---- 125 | plot(bootResults) 126 | 127 | ## ------------------------------------------------------------------------ 128 | boot.ci(bootResults, conf = 0.95, type="bca") 129 | 130 | ## ------------------------------------------------------------------------ 131 | detectCores() 132 | detectCores(logical = TRUE) 133 | detectCores(logical = FALSE) 134 | 135 | ## ------------------------------------------------------------------------ 136 | ## notice 10000/2 = 5000 137 | runP <- function(...) boot(data=iris, statistic=mse, R=5000) 138 | 139 | ## makes a cluster with 2 cores as 10000/5000 = 2 140 | cl<-makeCluster(2) 141 | 142 | ## passes along parts of the global environment 143 | ## to each node / part of the cluster 144 | ## again, base is a file path variable to our book's path 145 | ## set book_directory <- "C:/YourPathHere/" 146 | clusterExport(cl, c("runP", "mse", "book_directory", "checkpoint_directory" )) 147 | 148 | ## creates the library and some environment on 149 | ## each of the parts of the cluster 150 | clusterEvalQ(cl, { 151 | 152 | library(checkpoint) 153 | checkpoint("2018-09-28", R.version = "3.5.1", 154 | project = book_directory, 155 | checkpointLocation = checkpoint_directory, 156 | scanForPackages = FALSE, 157 | scan.rnw.with.knitr = TRUE, use.knitr = TRUE) 158 | 159 | library(boot) 160 | }) 161 | 162 | ## similar to set.seed() except for clusters 163 | clusterSetRNGStream(cl, 5) 164 | 165 | ## uses the parLapply() function which works on windows too 166 | pBootResults <- do.call(c, parLapply(cl, seq_len(2), runP)) 167 | 168 | #stop the cluster 169 | stopCluster(cl) 170 | 171 | # view results 172 | pBootResults 173 | 174 | ## get 95% confidence interval of the MSEs 175 | ## (note 0.95 is the default) 176 | boot.ci(pBootResults, conf = 0.95, type="bca") 177 | 178 | ## ----mli-btplot2, fig.width=6, fig.height=4, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "Boot Strap Results"---- 179 | plot(pBootResults) 180 | 181 | ## ------------------------------------------------------------------------ 182 | cl <- makeCluster(2) 183 | registerDoParallel(cl) 184 | 185 | clusterExport(cl, c("book_directory", "checkpoint_directory")) 186 | 187 | clusterEvalQ(cl, { 188 | library(checkpoint) 189 | checkpoint("2018-09-28", R.version = "3.5.1", 190 | project = book_directory, 191 | checkpointLocation = checkpoint_directory, 192 | scanForPackages = FALSE, 193 | scan.rnw.with.knitr = TRUE, use.knitr = TRUE) 194 | 195 | library("tidyverse") 196 | }) 197 | 198 | ## ------------------------------------------------------------------------ 199 | k <- foreach(i=1:5, .combine = c) %dopar% { 200 | data_train <- crossData %>% filter(Bin != i) 201 | data_test <- crossData %>% filter(Bin == i) 202 | lengthFold.lm <- lm(Petal.Length ~ Sepal.Length + 203 | Sepal.Width + Petal.Width, 204 | data = data_train) 205 | mean((predict(lengthFold.lm, data_test) - 206 | data_test$Petal.Length)^2) 207 | } 208 | 209 | ## ------------------------------------------------------------------------ 210 | stopCluster(cl) 211 | mse_Pk<-mean(k) 212 | mse_Pk 213 | 214 | -------------------------------------------------------------------------------- /mls.R: -------------------------------------------------------------------------------- 1 | ## ----setup, include=TRUE, results='hide', message = FALSE, warning = FALSE, cache=FALSE---- 2 | 3 | library(checkpoint) 4 | checkpoint("2018-09-28", R.version = "3.5.1", 5 | project = book_directory, 6 | checkpointLocation = checkpoint_directory, 7 | scanForPackages = FALSE, 8 | scan.rnw.with.knitr = TRUE, use.knitr = TRUE) 9 | 10 | library(ggplot2) 11 | library(cowplot) 12 | library(data.table) 13 | library(readxl) 14 | library(viridis) 15 | 16 | 17 | library(RSNNS) 18 | library(kernlab) 19 | library(rpart) 20 | library(rattle) 21 | library(DALEX) 22 | library(caret) 23 | library(spdep) 24 | library(ranger) 25 | library(e1071) 26 | library(gbm) 27 | library(plyr) 28 | set.seed(1234) 29 | 30 | options(width = 70, digits = 2) 31 | 32 | ## ------------------------------------------------------------------------ 33 | ## Note: download Excel file from publisher website first 34 | dRaw <- read_excel("Gender_StatsData_worldbank.org_ccby40.xlsx") 35 | dRaw <- as.data.table(dRaw) # convert data to data.table format. 36 | 37 | dRaw[,`Indicator Name`:= NULL] 38 | 39 | ## collapse columns into a super long dataset 40 | ## with Year as a new variable 41 | data <- melt(dRaw, measure.vars = 3:20, variable.name = "Year", variable.factor = FALSE) 42 | 43 | ## cast the data wide again 44 | ## this time with separate variables by indicator code 45 | ## keeping a country and time (Year) variable 46 | data <- dcast(data, CountryName + Year ~ IndicatorCode) 47 | rm(dRaw) #remove unneeded variable 48 | 49 | #rename columns with shortened, unique names 50 | x<-colnames(data) 51 | x<-gsub("[[:punct:]]", "", x) 52 | (y <- abbreviate(x, minlength = 4, method = "both.sides")) 53 | names(data) <- y 54 | 55 | #shorten regional names to abbreviations. 56 | data$CntN<-abbreviate(data$CntN, minlength = 5, method = "left.kept") 57 | 58 | ## ------------------------------------------------------------------------ 59 | d <- copy(data) 60 | sort(unique(d$CntN)) 61 | 62 | ## ------------------------------------------------------------------------ 63 | str(d) 64 | d[,Year:=as.numeric(Year)] 65 | ddum <- dummyVars("~.", data = d) 66 | d <- data.table(predict(ddum, newdata = d)) 67 | rm(ddum) #remove ddum as unneeded 68 | str(d) 69 | 70 | ## ------------------------------------------------------------------------ 71 | dScaled<-scale(d[,-c(1:9)]) 72 | dScaled<-as.data.table(dScaled) 73 | d <- cbind(d[,c(1:9)], dScaled) 74 | rm(dScaled) #remove d2 as unneeded 75 | str(d) 76 | 77 | ## ----mls-eda1, fig.width=12, fig.height=5, out.width='.7\\linewidth', fig.pos="!ht", fig.cap = "Looking for significant data misshapes."---- 78 | boxplot(d[,-c(1:9)], las = 2) 79 | 80 | ## ----mls-eda2, fig.width=12, fig.height=5, out.width='.7\\linewidth', fig.pos="!ht", fig.cap = "Looking for significant data misshapes."---- 81 | par(mfrow = c(1,2)) 82 | hist(d$ERMA, 100) 83 | qqnorm(d$ERMA) 84 | par(mfrow = c(1,1)) 85 | 86 | ## ------------------------------------------------------------------------ 87 | shapiro.test(d$ERMA) 88 | 89 | ## ------------------------------------------------------------------------ 90 | range(d$ERMA) 91 | range(data$ERMA) 92 | shapiro.test( log(data$ERMA) ) 93 | 94 | ## ----mls-eda3, fig.width=4, fig.height=4, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "Looking for significant data misshapes."---- 95 | par(mfrow = c(1,2)) 96 | hist(data$ERMA, 100) 97 | hist( log (data$ERMA) , 100) 98 | par(mfrow = c(1,1)) 99 | 100 | ## ------------------------------------------------------------------------ 101 | d2 <- copy(data[,.(SPAD, ERMA)]) 102 | d2[, Log.ERMA := log(ERMA)] 103 | cor(d2) 104 | rm(d2) #no longer needed 105 | 106 | ## ------------------------------------------------------------------------ 107 | lapply(data[,-c(1:2)], shapiro.test) 108 | 109 | dlog <- copy(data) 110 | dlog <- sapply(dlog[,-c(1:2)], log) 111 | dlog<-as.data.table(dlog) 112 | colnames(dlog) <- paste(colnames(dlog), "LOG", sep = ".") 113 | 114 | dlog<-cbind(data, dlog) 115 | View(cor(dlog[,-c(1:2)])) 116 | rm(dlog) #remove as we will not use. 117 | 118 | ## ------------------------------------------------------------------------ 119 | set.seed(1234) 120 | index <- createDataPartition(data$CntN, p = 0.8, list = FALSE) 121 | trainData <- data[index, ] 122 | validationData <- data[-index, ] 123 | 124 | ## ------------------------------------------------------------------------ 125 | #source("https://bioconductor.org/biocLite.R") 126 | #biocLite("pcaMethods") 127 | library(pcaMethods) 128 | 129 | ## ------------------------------------------------------------------------ 130 | #confirm structure 131 | str(trainData[,c(3:8,10:13)]) 132 | 133 | #base R / traditional method 134 | pc <- prcomp(trainData[,c(3:8,10:13)], center = TRUE, scale. = TRUE) 135 | summary(pc) 136 | pcValidationData1 <- predict(pc, newdata = validationData[,c(3:8,10:13)]) 137 | 138 | #scalable method using PcaMethods 139 | pc<-pca(trainData[,c(1:8,10:13)], method = "svd",nPcs = 4, scale = "uv", center = TRUE) 140 | pc 141 | summary(pc) 142 | pcValidationData2 <- predict(pc, newdata = validationData[,c(3:8,10:13)]) 143 | 144 | #demonstration of how to access transformed validation data 145 | pcValidationData1[,1] 146 | pcValidationData2$scores[,1] 147 | 148 | ## ----mls-svm1, fig.width=8, fig.height=4, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "Per Capita GNP vs Adolescent Fertility Rate."---- 149 | 150 | svmDataTrain <- trainData[,.(SPAD, NYGN)] 151 | svmDataValidate <- validationData[,.(SPAD, NYGN)] 152 | 153 | p1 <- ggplot(data = svmDataTrain, 154 | aes(x = NYGN, y = SPAD)) 155 | ## data poins colored by country 156 | p1 + geom_point(aes(colour = trainData$CntN)) + 157 | scale_colour_viridis(discrete = TRUE) 158 | 159 | 160 | ## ------------------------------------------------------------------------ 161 | set.seed(12345) 162 | 163 | svm <- train(x = svmDataTrain, 164 | y = trainData$CntN, 165 | method = "svmLinear", 166 | preProcess = NULL, 167 | metric = "Accuracy", 168 | trControl = trainControl(method = "cv", 169 | number = 5, 170 | seeds = c(123, 234, 345, 456, 567, 678) 171 | ) 172 | ) 173 | svm 174 | 175 | ## ------------------------------------------------------------------------ 176 | #predict the country name on our training data using our new model 177 | predictOnTrain <- predict(svm, newdata = svmDataTrain) 178 | 179 | mean( predictOnTrain == trainData$CntN) 180 | 181 | ## ------------------------------------------------------------------------ 182 | predictOnTest <- predict(svm, newdata = svmDataValidate) 183 | mean(predictOnTest == validationData$CntN) 184 | 185 | ## ----mls-svm2, fig.width=8, fig.height=6, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "Train vs Test Predictions."---- 186 | 187 | p1 <- ggplot(data = validationData, 188 | aes(x = NYGN, y = SPAD)) 189 | 190 | plot_grid( 191 | ## data poins colored by country 192 | p1 + geom_point(aes(colour = validationData$CntN, size = validationData$CntN)) + 193 | scale_colour_viridis(discrete = TRUE), 194 | 195 | ## data poins colored by predicted country 196 | p1 + geom_point(aes(colour = predictOnTest, size = predictOnTest)) + 197 | scale_colour_viridis(discrete = TRUE), 198 | ncol = 1 199 | ) 200 | 201 | ## ------------------------------------------------------------------------ 202 | rm(p1) 203 | rm(svm) 204 | rm(svmDataTrain) 205 | rm(svmDataValidate) 206 | rm(pcValidationData1) 207 | rm(pcValidationData2) 208 | rm(predictOnTest) 209 | rm(predictOnTrain) 210 | rm(pc) 211 | rm(d) 212 | 213 | ## ------------------------------------------------------------------------ 214 | # set up training & validation data 215 | svmDataTrain <- trainData[,-1] 216 | svmDataTrain[,Year:=as.numeric(Year)] 217 | svmDataValidation <- validationData[,-1] 218 | svmDataValidation[,Year:=as.numeric(Year)] 219 | #run linear SVM on the full data set 220 | set.seed(12345) 221 | svmLinear <- train(x = svmDataTrain, 222 | y = trainData$CntN, 223 | method = "svmLinear", 224 | preProcess = c("scale", "center", "pca"), 225 | metric = "Accuracy", 226 | trControl = trainControl(method = "cv", 227 | number = 5, 228 | seeds = c(123, 234, 345, 456, 567, 678) 229 | ) 230 | ) 231 | svmLinear 232 | 233 | ## ------------------------------------------------------------------------ 234 | #run polynomial SVM on the full data set 235 | set.seed(12345) 236 | svmPoly <- train(x = svmDataTrain, 237 | y = trainData$CntN, 238 | method = "svmPoly", 239 | preProcess = c("scale", "center", "pca"), 240 | metric = "Accuracy", 241 | trControl = trainControl(method = "cv", 242 | number = 5 243 | ) 244 | ) 245 | 246 | svmPoly 247 | 248 | ## ------------------------------------------------------------------------ 249 | predictOnTrainL <- predict(svmLinear, newdata = svmDataTrain) 250 | mean( predictOnTrainL == trainData$CntN) 251 | 252 | predictOnTrainP <- predict(svmPoly, newdata = svmDataTrain) 253 | mean( predictOnTrainP == trainData$CntN) 254 | 255 | ## ------------------------------------------------------------------------ 256 | predictOnTestL <- predict(svmLinear, newdata = svmDataValidation) 257 | mean(predictOnTestL == validationData$CntN) 258 | 259 | ## ------------------------------------------------------------------------ 260 | cartDataTrain <- copy(trainData[,-1]) 261 | cartDataTrain[,Year:=as.numeric(Year)] 262 | cartDataValidation <- copy(validationData[,-1]) 263 | cartDataValidation[,Year:=as.numeric(Year)] 264 | 265 | set.seed(12345) 266 | cartModel <- train(x = cartDataTrain, 267 | y = trainData$CntN, 268 | method = "rpart", 269 | preProcess = c("scale", "center", "pca"), 270 | metric = "Accuracy", 271 | tuneLength = 10, 272 | trControl = trainControl(method = "cv", 273 | number = 5 274 | ) 275 | ) 276 | 277 | cartModel 278 | 279 | ## ----mls-crt1, fig.width=9, fig.height=7, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "Classification Tree Graph."---- 280 | plot(cartModel$finalModel) 281 | text(cartModel$finalModel, cex = 0.5) 282 | 283 | ## ----mls-crt2, fig.width=6, fig.height=7, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "Fancy Classification Tree Graph"---- 284 | fancyRpartPlot(cartModel$finalModel, cex = 0.4, main = "") 285 | 286 | ## ------------------------------------------------------------------------ 287 | predictOnTrainT <- predict(cartModel, newdata = cartDataTrain) 288 | mean( predictOnTrainT == trainData$CntN) 289 | 290 | predictOnTestT <- predict(cartModel, newdata = cartDataValidation) 291 | mean(predictOnTestT == validationData$CntN) 292 | 293 | ## ------------------------------------------------------------------------ 294 | confusionMatrix(predictOnTestT, as.factor(validationData$CntN)) 295 | 296 | ## ------------------------------------------------------------------------ 297 | rfDataTrain <- copy(trainData[,-1]) 298 | rfDataTrain[,Year:=as.numeric(Year)] 299 | rfDataValidation <- copy(validationData[,-1]) 300 | rfDataValidation[,Year:=as.numeric(Year)] 301 | 302 | set.seed(12345) 303 | 304 | rfModel <- train(x = rfDataTrain, 305 | y = trainData$CntN, 306 | method = "ranger", 307 | preProcess = c("scale", "center", "pca"), 308 | metric = "Accuracy", 309 | num.trees = 20, 310 | trControl = trainControl(method = "cv", 311 | number = 5 312 | ) 313 | ) 314 | 315 | rfModel 316 | rfModel$finalModel$num.trees 317 | 318 | ## ------------------------------------------------------------------------ 319 | predictOnTrainR <- predict(rfModel, newdata = rfDataTrain) 320 | mean( predictOnTrainR == trainData$CntN) 321 | 322 | predictOnTestR <- predict(rfModel, newdata = rfDataValidation) 323 | mean(predictOnTestR == validationData$CntN) 324 | 325 | ## ------------------------------------------------------------------------ 326 | set.seed(12345) 327 | rfModel <- train(x = rfDataTrain, 328 | y = trainData$CntN, 329 | method = "ranger", 330 | preProcess = c("scale", "center", "pca"), 331 | metric = "Accuracy", 332 | num.trees = 50, 333 | trControl = trainControl(method = "cv", 334 | number = 5 335 | ) 336 | ) 337 | rfModel 338 | rfModel$finalModel$num.trees 339 | 340 | ## ------------------------------------------------------------------------ 341 | predictOnTrainR <- predict(rfModel, newdata = rfDataTrain) 342 | mean( predictOnTrainR == trainData$CntN) 343 | 344 | predictOnTestR <- predict(rfModel, newdata = rfDataValidation) 345 | mean(predictOnTestR == validationData$CntN) 346 | 347 | ## ------------------------------------------------------------------------ 348 | 349 | set.seed(12345) 350 | 351 | rfModel <- train(x = rfDataTrain, 352 | y = trainData$CntN, 353 | method = "ranger", 354 | preProcess = c("scale", "center", "pca"), 355 | metric = "Accuracy", 356 | num.trees = 20, 357 | trControl = trainControl(method = "cv", 358 | number = 5 359 | ), 360 | tuneGrid = expand.grid(mtry = c(1, 2, 3, 4), 361 | splitrule = "extratrees", 362 | min.node.size = c(1, 5, 10, 15)) 363 | ) 364 | 365 | rfModel 366 | rfModel$finalModel$num.trees 367 | rfModel$finalModel$mtry 368 | rfModel$finalModel$splitrule 369 | rfModel$finalModel$min.node.size 370 | 371 | ## ------------------------------------------------------------------------ 372 | predictOnTrainR <- predict(rfModel, newdata = rfDataTrain) 373 | mean( predictOnTrainR == trainData$CntN) 374 | 375 | predictOnTestR <- predict(rfModel, newdata = rfDataValidation) 376 | mean(predictOnTestR == validationData$CntN) 377 | 378 | ## ------------------------------------------------------------------------ 379 | sgbDataTrain <- copy(trainData) 380 | sgbDataTrain[,Year:=as.numeric(Year)] 381 | sgbDataValidation <- copy(validationData) 382 | sgbDataValidation[,Year:=as.numeric(Year)] 383 | 384 | ddum <- dummyVars("~.", data = sgbDataTrain) 385 | sgbDataTrain <- data.table(predict(ddum, newdata = sgbDataTrain)) 386 | sgbDataValidation <- data.table(predict(ddum, newdata = sgbDataValidation)) 387 | rm(ddum) 388 | 389 | ## ------------------------------------------------------------------------ 390 | set.seed(12345) 391 | sgbModel <- train(SPAD ~., 392 | data = sgbDataTrain, 393 | method = "gbm", 394 | preProcess = c("scale", "center"), 395 | metric = "RMSE", 396 | trControl = trainControl(method = "cv", 397 | number = 5 398 | ), 399 | tuneGrid = expand.grid(interaction.depth = 1:3, 400 | shrinkage = 0.1, 401 | n.trees = c(50, 100, 150), 402 | n.minobsinnode = 10), 403 | verbose = FALSE 404 | ) 405 | sgbModel 406 | 407 | ## ----mls-sgb1, fig.width=5, fig.height=10, out.width='.7\\linewidth', fig.pos="!ht", fig.cap = "Relative Influnce Visualisation."---- 408 | summary(sgbModel) 409 | 410 | ## ------------------------------------------------------------------------ 411 | mean(stats::residuals(sgbModel)^2) 412 | 413 | mean((predict(sgbModel, sgbDataValidation) - 414 | sgbDataValidation$SPAD)^2) 415 | 416 | ## ------------------------------------------------------------------------ 417 | explainSGBt <- explain(sgbModel, label = "sgbt", 418 | data = sgbDataTrain, 419 | y = sgbDataTrain$SPAD) 420 | 421 | explainSGBv <- explain(sgbModel, label = "sgbv", 422 | data = sgbDataValidation, 423 | y = sgbDataValidation$SPAD) 424 | 425 | ## ----mls-sgb2, fig.width=8, fig.height=6, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "DALEX residual visualisations."---- 426 | performanceSGBt <- model_performance(explainSGBt) 427 | performanceSGBv <- model_performance(explainSGBv) 428 | 429 | plot_grid( 430 | plot(performanceSGBt, performanceSGBv), 431 | plot(performanceSGBt, performanceSGBv, geom = "boxplot"), 432 | ncol = 2) 433 | 434 | 435 | ## ----mls-sgb3, fig.width=6, fig.height=6, out.width='.7\\linewidth', fig.pos="!ht", fig.cap = "DALEX Drop Out Loss."---- 436 | 437 | importanceSGBt <- variable_importance(explainSGBt) 438 | importanceSGBv <- variable_importance(explainSGBv) 439 | plot(importanceSGBt, importanceSGBv) 440 | 441 | ## ----mls-sgb4, fig.width=6, fig.height=6, out.width='.7\\linewidth', fig.pos="!ht", fig.cap = "DALEX primary school non-attendance count versus teen pregnancy."---- 442 | responseSGBprmt <- variable_response(explainSGBt, variable = "SEPR", type = "pdp") 443 | responseSGBprmv <- variable_response(explainSGBv, variable = "SEPR", type = "pdp") 444 | plot(responseSGBprmt, responseSGBprmv) 445 | 446 | ## ----mls-sgb5, fig.width=6, fig.height=6, out.width='.7\\linewidth', fig.pos="!ht", fig.cap = "DALEX death rate per 1000 versus teen pregnancy."---- 447 | responseSGBdynt <- variable_response(explainSGBt, variable = "SPDY", type = "pdp") 448 | responseSGBdynv <- variable_response(explainSGBv, variable = "SPDY", type = "pdp") 449 | plot(responseSGBdynt, responseSGBdynv) 450 | 451 | ## ------------------------------------------------------------------------ 452 | mlpDataTrain <- copy(trainData) 453 | mlpDataTrain[,Year:=as.numeric(Year)] 454 | mlpDataValidation <- copy(validationData) 455 | mlpDataValidation[,Year:=as.numeric(Year)] 456 | 457 | ddum <- dummyVars("~.", data = mlpDataTrain) 458 | mlpDataTrain <- data.table(predict(ddum, newdata = mlpDataTrain)) 459 | mlpDataValidation <- data.table(predict(ddum, newdata = mlpDataValidation)) 460 | rm(ddum) 461 | 462 | ## ------------------------------------------------------------------------ 463 | set.seed(12345) 464 | suppressWarnings( 465 | mlpModel <- train( 466 | SPAD ~ ., 467 | data = mlpDataTrain, 468 | method = "mlpML", 469 | preProcess = c("scale", "center"), 470 | metric = "RMSE", 471 | trControl = trainControl(method = "cv", 472 | number = 5) 473 | ) 474 | ) 475 | mlpModel 476 | 477 | ## ------------------------------------------------------------------------ 478 | summary(mlpModel) 479 | 480 | ## ------------------------------------------------------------------------ 481 | mean(stats::residuals(mlpModel)^2) 482 | 483 | mean((predict(mlpModel, mlpDataValidation) - 484 | mlpDataValidation$SPAD)^2) 485 | 486 | ## ------------------------------------------------------------------------ 487 | explainMLPt <- explain(mlpModel, label = "mlpt", 488 | data = mlpDataTrain, 489 | y = mlpDataTrain$SPAD) 490 | 491 | explainMLPv <- explain(mlpModel, label = "mlpv", 492 | data = mlpDataValidation, 493 | y = mlpDataValidation$SPAD) 494 | 495 | ## ----mls-mlp1, fig.width=8, fig.height=6, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "Model performance contrasting SGB vs MLP methods."---- 496 | performanceMLPt <- model_performance(explainMLPt) 497 | performanceMLPv <- model_performance(explainMLPv) 498 | 499 | plot_grid( 500 | plot(performanceMLPt, performanceMLPv, performanceSGBt, performanceSGBv), 501 | plot(performanceMLPt, performanceMLPv, performanceSGBt, performanceSGBv, geom = "boxplot"), 502 | ncol = 2 503 | ) 504 | 505 | ## ----mls-mlp2, fig.width=6, fig.height=6, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "Determining top key variables."---- 506 | 507 | importanceMLPt <- variable_importance(explainMLPt) 508 | importanceMLPv <- variable_importance(explainMLPv) 509 | plot(importanceMLPt, importanceMLPv, importanceSGBt, importanceSGBv) 510 | 511 | ## ----mls-mlp3, fig.width=6, fig.height=6, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "Understanding the impact of primary school attendance on teen pregnancy."---- 512 | responseMLPprmt <- variable_response(explainMLPt, variable = "SEPR", type = "pdp") 513 | responseMLPprmv <- variable_response(explainMLPv, variable = "SEPR", type = "pdp") 514 | plot(responseMLPprmt, responseMLPprmv, responseSGBprmt, responseSGBprmv) 515 | 516 | ## ----mls-mlp4, fig.width=6, fig.height=6, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "DALEX death rate per 1000 versus teen pregnancy."---- 517 | responseMLPdynt <- variable_response(explainMLPt, variable = "SPDY", type = "pdp") 518 | responseMLPdynv <- variable_response(explainMLPv, variable = "SPDY", type = "pdp") 519 | plot(responseMLPdynt, responseMLPdynv, responseSGBdynt, responseSGBdynv) 520 | 521 | ## ------------------------------------------------------------------------ 522 | set.seed(12345) 523 | suppressWarnings( 524 | mlpModelb <- train( 525 | SPAD ~ ., 526 | data = mlpDataTrain, 527 | method = "mlpML", 528 | preProcess = c("scale", "center"), 529 | metric = "RMSE", 530 | verbose = FALSE, 531 | trControl = trainControl(method = "cv", 532 | number = 5), 533 | tuneGrid = expand.grid( 534 | layer1 = 0:10, 535 | layer2 = 0:10, 536 | layer3 = 0:10 537 | ) 538 | ) 539 | ) 540 | mlpModelb 541 | 542 | mean(stats::residuals(mlpModelb)^2) 543 | 544 | mean((predict(mlpModelb, mlpDataValidation) - 545 | mlpDataValidation$SPAD)^2) 546 | summary(mlpModelb) 547 | 548 | -------------------------------------------------------------------------------- /mlu.R: -------------------------------------------------------------------------------- 1 | ## ----setup, include=TRUE, results='hide', message = FALSE, warning = FALSE, cache=FALSE---- 2 | 3 | library(checkpoint) 4 | checkpoint("2018-09-28", R.version = "3.5.1", 5 | project = book_directory, 6 | checkpointLocation = checkpoint_directory, 7 | scanForPackages = FALSE, 8 | scan.rnw.with.knitr = TRUE, use.knitr = TRUE) 9 | 10 | library(ggplot2) 11 | library(cowplot) 12 | library(viridis) 13 | library(scales) 14 | library(readxl) 15 | library(data.table) 16 | library(ape) 17 | library(MASS) 18 | library(matrixStats) 19 | 20 | options(width = 70, digits = 2) 21 | 22 | 23 | ## ----eval = FALSE-------------------------------------------------------- 24 | ## 25 | ## source("https://bioconductor.org/biocLite.R") 26 | ## biocLite("pcaMethods") 27 | ## 28 | 29 | ## ----echo=TRUE, message=FALSE, results = "hide"-------------------------- 30 | 31 | library(pcaMethods) 32 | 33 | 34 | ## ------------------------------------------------------------------------ 35 | 36 | ## Note: download Excel file from publisher website first 37 | dRaw <- read_excel("Gender_StatsData_worldbank.org_ccby40.xlsx") 38 | dRaw <- as.data.table(dRaw) # convert data to data.table format. 39 | 40 | 41 | ## ------------------------------------------------------------------------ 42 | 43 | str(dRaw) 44 | 45 | summary(dRaw) 46 | 47 | unique(dRaw$CountryName) 48 | 49 | unique(dRaw$IndicatorCode) 50 | 51 | 52 | ## ------------------------------------------------------------------------ 53 | 54 | dRaw[,`Indicator Name`:= NULL] 55 | 56 | 57 | ## ------------------------------------------------------------------------ 58 | 59 | ## collapse columns into a super long dataset 60 | ## with Year as a new variable 61 | d <- melt(dRaw, measure.vars = 3:20, variable.name = "Year") 62 | head(d) 63 | str(d) 64 | 65 | ## finally cast the data wide again 66 | ## this time with separate variables by indicator code 67 | ## keeping a country and time (Year) variable 68 | d <- dcast(d, CountryName + Year ~ IndicatorCode) 69 | 70 | head(d) 71 | str(d) 72 | 73 | 74 | ## ------------------------------------------------------------------------ 75 | 76 | ## rename columns with shortened, unique names 77 | x<-colnames(d) 78 | x<-gsub("[[:punct:]]", "", x) 79 | (y <- abbreviate(x, minlength = 4, method = "both.sides")) 80 | names(d) <- y 81 | 82 | ## shorten regional names to abbreviations. 83 | d$CntN<-abbreviate(d$CntN, minlength = 5, 84 | method = "left.kept") 85 | 86 | 87 | ## ------------------------------------------------------------------------ 88 | 89 | summary(d) 90 | 91 | str(d) 92 | 93 | d[, Year := as.character(Year)] 94 | 95 | 96 | ## ----mlu-gnpadoplot1, fig.width=7, fig.height=10, out.width='.6\\linewidth', fig.pos="!ht", fig.cap = "Plot of Gross National Product Per Capita and Adolescent Fertility Rate per 1,000 women."---- 97 | 98 | ## ggplot2 plot object indicating x and y variables 99 | p1 <- ggplot(d, aes(NYGN, SPAD)) 100 | 101 | ## make a grid of two plots 102 | plot_grid( 103 | ## first plot data points only 104 | p1 + geom_point(), 105 | ## data poins colored by year 106 | p1 + geom_point(aes(colour = Year)) + 107 | scale_colour_viridis(discrete = TRUE), 108 | ncol = 1 109 | ) 110 | 111 | 112 | ## ----mlu-kmeans1, fig.width=9, fig.height=9, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "Plot of Gross National Product Per Capita and Adolescent Fertility Rate per 1,000 women for different numbers of k clusters."---- 113 | 114 | set.seed(2468) 115 | wgss <- vector("numeric", 8) 116 | plots <- vector("list", 9) 117 | p1 <- ggplot(d, aes(NYGN, SPAD)) 118 | 119 | for(i in 2:9) { 120 | km <- kmeans(d[, .(NYGN, SPAD)], 121 | centers = i) 122 | 123 | wgss[i - 1] <- km$tot.withinss 124 | 125 | plots[[i - 1]] <- p1 + 126 | geom_point(aes_(colour = factor(km$cluster))) + 127 | scale_color_viridis(discrete = TRUE) + 128 | theme(legend.position = "none") + 129 | ggtitle(paste("kmeans centers = ", i)) 130 | } 131 | 132 | plots[[9]] <- ggplot() + 133 | geom_point(aes(x = 2:9, y = wgss)) + 134 | xlab("Number of Clusters") + 135 | ylab("Within SS") + 136 | ggtitle("Scree Plot") 137 | 138 | do.call(plot_grid, c(plots, ncol = 3)) 139 | 140 | 141 | ## ------------------------------------------------------------------------ 142 | 143 | summary(d[,.(NYGN, SPAD)]) 144 | 145 | 146 | ## ------------------------------------------------------------------------ 147 | 148 | x <- scale(d[,.(NYGN, SPAD)]) 149 | summary(x) 150 | 151 | 152 | ## ----mlu-kmeans2, fig.width=9, fig.height=9, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "Plot of Gross National Product Per Capita and Adolescent Fertility Rate per 1,000 women for different numbers of k clusters."---- 153 | 154 | set.seed(2468) 155 | wgss <- vector("numeric", 8) 156 | plots <- vector("list", 9) 157 | p1 <- ggplot(d, aes(NYGN, SPAD)) 158 | 159 | for(i in 2:9) { 160 | km <- kmeans(x, centers = i) 161 | 162 | wgss[i - 1] <- km$tot.withinss 163 | 164 | plots[[i - 1]] <- p1 + 165 | geom_point(aes_(colour = factor(km$cluster))) + 166 | scale_color_viridis(discrete = TRUE) + 167 | theme(legend.position = "none") + 168 | ggtitle(paste("kmeans centers = ", i)) 169 | } 170 | 171 | plots[[9]] <- ggplot() + 172 | geom_point(aes(x = 2:9, y = wgss)) + 173 | xlab("Number of Clusters") + 174 | ylab("Within SS") + 175 | ggtitle("Scree Plot") 176 | 177 | do.call(plot_grid, c(plots, ncol = 3)) 178 | 179 | 180 | ## ----mlu-kmeans3, fig.width=9, fig.height=9, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "Plot of Gross National Product Per Capita and Adolescent Fertility Rate per 1,000 women for different numbers of iterations."---- 181 | 182 | set.seed(2468) 183 | plots <- vector("list", 9) 184 | p1 <- ggplot(d, aes(NYGN, SPAD)) 185 | 186 | for(i in 6:14) { 187 | km <- kmeans(x, centers = 6, iter.max = i) 188 | 189 | plots[[i - 5]] <- p1 + 190 | geom_point(aes_(colour = factor(km$cluster))) + 191 | scale_color_viridis(discrete = TRUE) + 192 | theme(legend.position = "none") + 193 | ggtitle(paste("kmeans iters = ", i)) 194 | } 195 | 196 | do.call(plot_grid, c(plots, ncol = 3)) 197 | 198 | 199 | ## ----mlu-kmeans4, fig.width=9, fig.height=9, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "Plot of Gross National Product Per Capita and Adolescent Fertility Rate per 1,000 women for different nstart values."---- 200 | 201 | set.seed(2468) 202 | plots <- vector("list", 9) 203 | p1 <- ggplot(d, aes(NYGN, SPAD)) 204 | 205 | for(i in 1:9) { 206 | km <- kmeans(x, centers = 6, iter.max = 10, nstart = i) 207 | 208 | plots[[i]] <- p1 + 209 | geom_point(aes_(colour = factor(km$cluster))) + 210 | scale_color_viridis(discrete = TRUE) + 211 | theme(legend.position = "none") + 212 | ggtitle(paste("kmeans nstarts = ", i)) 213 | } 214 | 215 | do.call(plot_grid, c(plots, ncol = 3)) 216 | 217 | 218 | ## ----mlu-kmeans5, fig.width=6, fig.height=6, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "Scree Plot for all ."---- 219 | 220 | x <- scale(d[,-c(1,2)]) 221 | wgss<-0 222 | set.seed(2468) 223 | for( i in 1:11){ 224 | km <- kmeans(x, centers = i) 225 | wgss[i]<-km$tot.withinss 226 | } 227 | 228 | ggplot() + 229 | geom_point(aes(x = 1:11, y = wgss)) + 230 | xlab("Number of Clusters") + 231 | ylab("Within SS") + 232 | ggtitle("Scree Plot - All Variables") 233 | 234 | 235 | ## ------------------------------------------------------------------------ 236 | 237 | kmAll <- kmeans(x, centers = 4, nstart = 25) 238 | x <- cbind(d[, c(1,2)], x, 239 | Cluster = kmAll$cluster) 240 | tail(x) 241 | 242 | 243 | ## ------------------------------------------------------------------------ 244 | 245 | xtabs(~ CntN + Cluster, data = x) 246 | 247 | 248 | ## ------------------------------------------------------------------------ 249 | 250 | unique(x[ 251 | order(CntN, Year, Cluster), 252 | .(CntN, Year, Cluster)][ 253 | CntN=="EsA&P"]) 254 | 255 | unique(x[ 256 | order(CntN, Year, Cluster), 257 | .(CntN, Year, Cluster)][ 258 | CntN == "ErpnU"]) 259 | 260 | 261 | ## ------------------------------------------------------------------------ 262 | 263 | hdist <- dist(d[,.(NYGN, SPAD)]) 264 | str(hdist) 265 | 266 | 267 | ## ----mlu-hclust1, fig.width=14, fig.height=10, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "Cluster Dendrogram with row numbers."---- 268 | 269 | hclust <- hclust(hdist) 270 | plot(hclust) 271 | 272 | 273 | ## ----mlu-hclust2, fig.width=14, fig.height=12, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "Cluster Dendrogram with country names and year."---- 274 | 275 | x <- d[, .(CntN, Year, NYGN, SPAD)] 276 | x[, Key := paste(CntN, Year)] 277 | x[, CntN := NULL] 278 | x[, Year := NULL] 279 | 280 | hdist <- dist(x[,.(NYGN, SPAD)]) 281 | hclust <- hclust(hdist) 282 | plot(hclust, labels = x$Key) 283 | 284 | 285 | ## ----mlu-hclust3, fig.width=14, fig.height=12, out.width='.9\\linewidth', fig.pos="!ht", fig.cap = "Cluster Dendrogram with country names and year and a height line."---- 286 | 287 | plot(hclust, labels = x$Key) 288 | abline(h = 30000, col = "blue") 289 | 290 | 291 | ## ------------------------------------------------------------------------ 292 | 293 | summary(x) 294 | d[, mean(NYGN), by = CntN][order(V1)] 295 | d[, mean(SPAD), by = CntN][order(V1)] 296 | 297 | 298 | ## ----mlu-hclust4, fig.width=14, fig.height=12, out.width='.9\\linewidth', fig.pos="!ht", fig.cap = "Cluster Dendrogram with country names and year and another height line."---- 299 | 300 | plot(hclust, labels = x$Key) 301 | abline(h = 20000, col = "blue") 302 | 303 | 304 | ## ----mlu-hclust5, fig.width=14, fig.height=12, out.width='.9\\linewidth', fig.pos="!ht", fig.cap = "Cluster Dendrogram with country names and year and all dimensions of data."---- 305 | 306 | x <- copy(d) 307 | x[, Key := paste(CntN, Year)] 308 | x[, CntN := NULL] 309 | x[, Year := NULL] 310 | 311 | hdist <- dist(x[, -12]) 312 | hclust <- hclust(hdist) 313 | 314 | plot(hclust, labels = x$Key) 315 | 316 | 317 | ## ----mlu-hclust6, fig.width=14, fig.height=12, out.width='.9\\linewidth', fig.pos="!ht", fig.cap = "Cluster Dendrogram using the ward.D2 method."---- 318 | 319 | hclust <- hclust(hdist, method = "ward.D2") 320 | plot(hclust, labels = x$Key) 321 | 322 | 323 | ## ----mlu-hclust7, fig.width=14, fig.height=12, out.width='.9\\linewidth', fig.pos="!ht", fig.cap = "Cluster Dendrogram with scaling."---- 324 | 325 | x <- scale(d[,-c(1,2)]) 326 | row.names(x) <- paste(d$CntN, d$Year) 327 | hdist <- dist(x) 328 | hclust <- hclust(hdist) 329 | 330 | plot(hclust, labels = paste(d$CntN, d$Year)) 331 | abline(h = 6, col = "blue") 332 | 333 | 334 | ## ------------------------------------------------------------------------ 335 | 336 | cut_hclust <- cutree(hclust, h = 6) 337 | unique(cut_hclust) 338 | 339 | 340 | ## ------------------------------------------------------------------------ 341 | 342 | dcopy <- as.data.table(copy(d)) 343 | dcopy[, cluster:= NA_integer_] 344 | 345 | dcopy$cluster <- cutree(hclust, k = 3) 346 | 347 | tail(dcopy) 348 | 349 | 350 | ## ----mlu-hclust8, fig.width=14, fig.height=14, out.width='.9\\linewidth', fig.pos="!ht", fig.cap = "Variations on Dendrogram via ape package."---- 351 | 352 | plot(as.phylo(hclust), type = "cladogram") 353 | 354 | 355 | ## ----mlu-hclust9, fig.width=14, fig.height=14, out.width='.9\\linewidth', fig.pos="!ht", fig.cap = "Variations on Dendrogram via ape package."---- 356 | 357 | plot(as.phylo(hclust), type = "fan") 358 | 359 | 360 | ## ----mlu-hclust10, fig.width=14, fig.height=14, out.width='.9\\linewidth', fig.pos="!ht", fig.cap = "Variations on Dendrogram via ape package."---- 361 | 362 | plot(as.phylo(hclust), type = "radial") 363 | 364 | 365 | ## ----mlu-hclust11, fig.width=14, fig.height=14, out.width='.9\\linewidth', fig.pos="!ht", fig.cap = "unrooted type on 4 clusters."---- 366 | 367 | hclust4 <- cutree(hclust, k = 4) 368 | plot(as.phylo(hclust), type = "unrooted", label.offset = 1, 369 | tip.color = hclust4, cex = 0.8) 370 | 371 | 372 | ## ----mlu-pca1, fig.width=6, fig.height=6, out.width='.6\\linewidth', fig.pos="!ht", fig.cap = "A highly correlated plot - are there really two dimensions here?"---- 373 | 374 | cor(d$NYGD, d$NYGN) 375 | 376 | summary(d[,.(NYGD, NYGN)]) 377 | 378 | ggplot(d, aes(NYGD, NYGN)) + 379 | geom_point() 380 | 381 | 382 | ## ----mlu-pca2, fig.width=6, fig.height=6, out.width='.6\\linewidth', fig.pos="!ht", fig.cap = "A highly correlated plot - are there really two dimensions here?"---- 383 | 384 | ggplot(d, aes(NYGD, SESC)) + 385 | geom_point() 386 | 387 | cor(d$NYGD, d$SESC) 388 | 389 | 390 | ## ------------------------------------------------------------------------ 391 | 392 | x <- d[,.( NYGN, SPAD)] 393 | res <- pca(x, method="svd", center=TRUE, scale = "uv") 394 | 395 | summary(res) 396 | 397 | 398 | ## ----mlu-pca3, fig.width=7, fig.height=7, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "comparison of raw data and pca"---- 399 | 400 | biplot(res, main = "Biplot of PCA") 401 | 402 | 403 | ## ----mlu-pca4, fig.width=5, fig.height=5, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = "Scree plot for traditional PCA on all features in the data."---- 404 | 405 | x <- d[, -c(1,2)] 406 | res <- pca(x, method="svd", center=TRUE, scale = "uv", 407 | nPcs = ncol(x)) 408 | 409 | summary(res) 410 | 411 | ## reverse scree plot 412 | ggplot() + 413 | geom_bar(aes(1:11, cumsum(res@R2)), 414 | stat = "identity") + 415 | scale_x_continuous("Principal Component", 1:11) + 416 | scale_y_continuous(expression(R^2), labels = percent) + 417 | ggtitle("Scree Plot") + 418 | coord_cartesian(xlim = c(.5, 11.5), ylim = c(.5, 1), 419 | expand = FALSE) 420 | 421 | 422 | ## ----mlu-pca5, fig.width=6, fig.height=6, out.width='.7\\linewidth', fig.pos="!ht", fig.cap = "Biplot for first two principal components."---- 423 | 424 | biplot(res, choices = c(1, 2)) 425 | 426 | 427 | ## ------------------------------------------------------------------------ 428 | 429 | head(scores(res)) 430 | round(cor(scores(res)),2) 431 | 432 | 433 | ## ----mlu-pca6, fig.width=5, fig.height=10, out.width='.6\\linewidth', fig.pos="!ht", fig.cap = "Plot loadings from PCA models with and without outliers using traditional, SVD, PCA and robust PCA."---- 434 | 435 | x <- d[, -c(1,2)] 436 | x <- prep(x, center = TRUE, scale = "uv") 437 | 438 | xout <- copy(x) 439 | xout[1:5, "NYGD"] <- (-10) 440 | 441 | res1 <- pca(x, method = "svd", 442 | center = FALSE, nPcs = 4) 443 | res2 <- pca(xout, method = "svd", 444 | center = FALSE, nPcs = 4) 445 | 446 | res1rob <- pca(x, method = "robustPca", 447 | center = FALSE, nPcs = 4) 448 | res2rob <- pca(xout, method = "robustPca", 449 | center = FALSE, nPcs = 4) 450 | plot_grid( 451 | ggplot() + 452 | geom_point(aes( 453 | x = as.numeric(loadings(res1)), 454 | y = as.numeric(loadings(res2)))) + 455 | xlab("Loadings, SVD, No Outliers") + 456 | ylab("Loadings, SVD, Outliers"), 457 | ggplot() + 458 | geom_point(aes( 459 | x = as.numeric(loadings(res1rob)), 460 | y = as.numeric(loadings(res2rob)))) + 461 | xlab("Loadings, Robust PCA, No Outliers") + 462 | ylab("Loadings, Robust PCA, Outliers"), 463 | ncol = 1) 464 | 465 | 466 | ## ------------------------------------------------------------------------ 467 | 468 | x <- scale(d[, -c(1,2)]) 469 | row.names(x) <- paste(d$CntN, d$Year) 470 | head(x) 471 | 472 | sdist <- dist(x) 473 | 474 | xSammon <- sammon(sdist, k = 2) 475 | head(xSammon$points) 476 | 477 | 478 | ## ----mlu-sam1, fig.width=5, fig.height=5, out.width='.6\\linewidth', fig.pos="!ht", fig.cap = "Plot of Sammon points with text labels"---- 479 | 480 | plot(xSammon$points, type = "n") 481 | text(xSammon$points, labels = row.names(x) ) 482 | 483 | 484 | -------------------------------------------------------------------------------- /udv.R: -------------------------------------------------------------------------------- 1 | ## ----setup, include=TRUE, results='hide', message = FALSE, warning = FALSE, cache=FALSE---- 2 | 3 | library(checkpoint) 4 | checkpoint("2018-09-28", R.version = "3.5.1", 5 | project = book_directory, 6 | checkpointLocation = checkpoint_directory, 7 | scanForPackages = FALSE, 8 | scan.rnw.with.knitr = TRUE, use.knitr = TRUE) 9 | 10 | library(knitr) 11 | library(ggplot2) 12 | library(cowplot) 13 | library(MASS) 14 | library(JWileymisc) 15 | library(data.table) 16 | 17 | options(width = 70, digits = 2) 18 | 19 | 20 | ## ----fudv-dotplot-mpg, fig.width=4, fig.height=1.5, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = "Stacked dot plot of miles per gallon from old cars."---- 21 | 22 | ggplot(mtcars, aes(mpg)) + 23 | geom_dotplot() 24 | 25 | 26 | ## ----fudv-hist-sl, fig.width=4, fig.height=4, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = "Histogram of sepal length from the iris data."---- 27 | 28 | ggplot(iris, aes(Sepal.Length)) + 29 | geom_histogram() 30 | 31 | 32 | ## ----fudv-hist-lynx-raw, fig.width=4, fig.height=4, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = "Histogram of annual Canadian lynx trappings."---- 33 | 34 | ggplot(data.table(lynx = as.vector(lynx)), aes(lynx)) + 35 | geom_histogram() 36 | 37 | 38 | ## ----fudv-hist-lynx-log, fig.width=4, fig.height=4, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = "Histogram of annual Canadian lynx trappings after a natural log transformation."---- 39 | 40 | ggplot(data.table(lynx = as.vector(lynx)), aes(log(lynx))) + 41 | geom_histogram() 42 | 43 | 44 | ## ----fudv-density, fig.width=4, fig.height=4, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = "This is the density plot for our sepal lengths."---- 45 | 46 | ggplot(iris, aes(Sepal.Length)) + 47 | geom_density() 48 | 49 | 50 | ## ----fudv-densityadjust, fig.width=4, fig.height=4, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = c("A noisy density plot.", "A very smooth density plot.")---- 51 | 52 | ggplot(iris, aes(Sepal.Length)) + 53 | geom_density(adjust = .5) 54 | 55 | ggplot(iris, aes(Sepal.Length)) + 56 | geom_density(adjust = 5) 57 | 58 | 59 | ## ----fudv-qqnorm, fig.width=4, fig.height=4, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = "Normal data look like a straight line. Sepal.Length seems fairly normal."---- 60 | 61 | ggplot(iris, aes(sample = Sepal.Length)) + 62 | geom_qq() 63 | 64 | 65 | ## ------------------------------------------------------------------------ 66 | 67 | qnorm(p = .1, mean = 0, sd = 1) 68 | 69 | 70 | ## ------------------------------------------------------------------------ 71 | 72 | qnorm(p = c(.25, .50, .75), mean = 0, sd = 1) 73 | 74 | 75 | ## ------------------------------------------------------------------------ 76 | 77 | ppoints(n = 3, a = 0) 78 | 79 | 80 | ## ------------------------------------------------------------------------ 81 | 82 | ppoints(n = 3) 83 | 84 | 85 | ## ----fudv-manual-qnorm, fig.width=4, fig.height=4, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = "Shows theoretical norms on the x-axis (based on predictions from mean and standard deviation)."---- 86 | 87 | qplot( 88 | x = qnorm( 89 | p = ppoints(length(iris$Sepal.Length)), 90 | mean = mean(iris$Sepal.Length), 91 | sd = sd(iris$Sepal.Length)), 92 | y = sort(iris$Sepal.Length), 93 | xlab = "Theoretical Normal Quantiles", 94 | ylab = "Sepal Length") + 95 | geom_abline(slope = 1, intercept = 0) 96 | 97 | 98 | ## ----fudv-qqlnorm, fig.width=4, fig.height=4, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = c("Testing whether lynx data are consistent with a log normal distribution.", "Testing whether lynx data are consistent with a Poisson distribution.")---- 99 | 100 | ggplot(data.table(lynx = as.vector(lynx)), aes(sample = lynx)) + 101 | geom_qq(distribution = qlnorm) 102 | 103 | ggplot(data.table(lynx = as.vector(lynx)), aes(sample = lynx)) + 104 | geom_qq(distribution = qpois, dparams = list(lambda = mean(lynx))) 105 | 106 | 107 | ## ----fudv-densitynorm, fig.width=4, fig.height=4, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = "A normal curve and our density plot (with default smoothness of 1)."---- 108 | 109 | ggplot(iris, aes(Sepal.Length)) + 110 | geom_density() + 111 | stat_function(fun = dnorm, 112 | args = list( 113 | mean = mean(iris$Sepal.Length), 114 | sd = sd(iris$Sepal.Length)), 115 | colour = "blue") 116 | 117 | 118 | ## ------------------------------------------------------------------------ 119 | 120 | set.seed(1234) 121 | y <- rbeta(150, 1, 4) 122 | head(y) 123 | 124 | 125 | ## ----warning=FALSE------------------------------------------------------- 126 | 127 | y.fit <- fitdistr(y, densfun = "beta", 128 | start = list(shape1 = .5, shape2 = .5)) 129 | 130 | 131 | ## ------------------------------------------------------------------------ 132 | 133 | y.fit 134 | 135 | y.fit$estimate["shape1"] 136 | 137 | y.fit$estimate["shape2"] 138 | 139 | 140 | ## ------------------------------------------------------------------------ 141 | 142 | logLik(y.fit) 143 | 144 | 145 | ## ------------------------------------------------------------------------ 146 | 147 | y.fit2 <- fitdistr(y, densfun = "normal") 148 | logLik(y.fit2) 149 | 150 | 151 | ## ----fudv-testdistr-norm, fig.width=6, fig.height=3, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "Density plot with superimposed normal distributions and normal Q-Q plot."---- 152 | 153 | testdistr(y) 154 | 155 | 156 | ## ----fudv-testdistr, fig.width=6, fig.height=6, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "Shows density plot with superimposed beta or normal distributions along with Q-Q plot fits."---- 157 | 158 | test.beta <- testdistr(y, "beta", 159 | starts = list(shape1 = .5, shape2 = .5), 160 | varlab = "Y", plot = FALSE) 161 | 162 | test.normal <- testdistr(y, "normal", varlab = "Y", plot = FALSE) 163 | 164 | plot_grid( 165 | test.beta$DensityPlot, test.beta$QQPlot, 166 | test.normal$DensityPlot, test.normal$QQPlot, 167 | ncol = 2) 168 | 169 | 170 | ## ----fudv-testdistr-pois, fig.width=6, fig.height=4, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "Discrete observed proportions with the theoretical probabilies from a poisson plotted in blue."---- 171 | 172 | set.seed(1234) 173 | y <- rnbinom(500, mu = 5, size = 2) 174 | testdistr(y, "poisson") 175 | 176 | 177 | ## ----fudv-testdistr-nbinom, fig.width=6, fig.height=4, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "Discrete observed proportions with the theoretical probabilies from a negative binomial distribution plotted in blue."---- 178 | 179 | testdistr(y, "nbinom") 180 | 181 | 182 | ## ------------------------------------------------------------------------ 183 | 184 | pnorm(c(-3, 3)) 185 | 186 | 187 | ## ----fudv-anomaly-sd, fig.width=4, fig.height=4, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = "Panel graph showing stacked dot plots with anomalous values."---- 188 | 189 | set.seed(1234) 190 | d <- data.table( 191 | y1 = rnorm(200, 0, 1), 192 | y2 = rnorm(200, 0, .2) + rep(c(-3, -1, 1, 3), each = 50)) 193 | 194 | plot_grid( 195 | qplot(c(d$y1, rep(5, 3)), geom = "dotplot", binwidth = .1), 196 | qplot(c(d$y2, rep(5, 3)), geom = "dotplot", binwidth = .1), 197 | ncol = 1, labels = c("A", "B")) 198 | 199 | 200 | ## ----fudv-anomaly-gamma, fig.width=4, fig.height=4, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = "Panel graph showing randomly generated (no added anomalous values) data from a Gamma and normal distribution."---- 201 | 202 | set.seed(1234) 203 | d2 <- data.table( 204 | y1 = rgamma(200, 1, .1), 205 | y2 = rnorm(200, 10, 10)) 206 | 207 | plot_grid( 208 | qplot(d2$y1, geom = "dotplot", binwidth = 1), 209 | qplot(d2$y2, geom = "dotplot", binwidth = 1), 210 | ncol = 1, labels = c("A", "B")) 211 | 212 | 213 | ## ----fudv-anomaly-testdistr, fig.width=6, fig.height=3, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "Graph showing highlighting of extreme values."---- 214 | 215 | testdistr(d$y1, extremevalues = "empirical", 216 | ev.perc = .01) 217 | 218 | 219 | ## ----fudv-anomaly-td-gamma, fig.width=6, fig.height=3, out.width='1\\linewidth', fig.pos="!ht", fig.cap = paste("Graph showing highlighting of extreme values based on theoretical percentiles from a", c("normal distribution.", "Gamma distribution."))---- 220 | 221 | testdistr(d2$y1, "normal", extremevalues = "theoretical", 222 | ev.perc = .001) 223 | 224 | testdistr(d2$y1, "gamma", extremevalues = "theoretical", 225 | ev.perc = .001) 226 | 227 | 228 | ## ----fudv-anomaly-masked, fig.width=6, fig.height=3, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "Graph showing an anomalous value of 100 masked by the more extreme anomalous value of 1000."---- 229 | 230 | testdistr(c(d2$y2, 100, 1000), "normal", 231 | extremevalues = "theoretical", 232 | ev.perc = .001) 233 | 234 | 235 | ## ----fudv-anomaly-robust, fig.width=6, fig.height=3, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "Graph highlighting extreme values based on a robust estimator."---- 236 | 237 | testdistr(c(d2$y2, 100, 1000), "normal", 238 | robust = TRUE, 239 | extremevalues = "theoretical", 240 | ev.perc = .001) 241 | 242 | 243 | ## ------------------------------------------------------------------------ 244 | 245 | winsorizor(1:10, .1) 246 | 247 | 248 | ## ----fudv-anomaly-wins, fig.width=6, fig.height=3, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "Panel graph comparing data before (A) and after (B) winsorizing the (empirical) bottom and top 1\\%."---- 249 | 250 | plot_grid( 251 | testdistr(d2$y1, "gamma", extremevalues = "theoretical", 252 | ev.perc = .005, plot=FALSE)$QQPlot, 253 | testdistr(winsorizor(d2$y1, .01), "gamma", extremevalues = "theoretical", 254 | ev.perc = .005, plot=FALSE)$QQPlot, 255 | ncol = 2, labels = c("A", "B"), align = "hv") 256 | 257 | 258 | --------------------------------------------------------------------------------