├── 9781484228715.jpg
├── Contributing.md
├── Gender_StatsData_worldbank.org_ccby40.xlsx
├── LICENSE.txt
├── README.md
├── errata.md
├── gam.R
├── glm1.R
├── glm2.R
├── glmma.R
├── glmmi.R
├── glmml.R
├── iiv.R
├── intro.R
├── md.R
├── mdv.R
├── mli.R
├── mls.R
├── mlu.R
└── udv.R


/9781484228715.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Apress/advanced-r-statistical-programming-and-data-models/ace9ee821d60d591091c7dc0a97c8c4620406b30/9781484228715.jpg


--------------------------------------------------------------------------------
/Contributing.md:
--------------------------------------------------------------------------------
 1 | # Contributing to Apress Source Code
 2 | 
 3 | Copyright for Apress source code belongs to the author(s). However, under fair use you are encouraged to fork and contribute minor corrections and updates for the benefit of the author(s) and other readers.
 4 | 
 5 | ## How to Contribute
 6 | 
 7 | 1. Make sure you have a GitHub account.
 8 | 2. Fork the repository for the relevant book.
 9 | 3. Create a new branch on which to make your change, e.g. 
10 | `git checkout -b my_code_contribution`
11 | 4. Commit your change. Include a commit message describing the correction. Please note that if your commit message is not clear, the correction will not be accepted.
12 | 5. Submit a pull request.
13 | 
14 | Thank you for your contribution!


--------------------------------------------------------------------------------
/Gender_StatsData_worldbank.org_ccby40.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Apress/advanced-r-statistical-programming-and-data-models/ace9ee821d60d591091c7dc0a97c8c4620406b30/Gender_StatsData_worldbank.org_ccby40.xlsx


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | ﻿Freeware License, some rights reserved
 2 | 
 3 | Copyright (c) 2019 Matt Wiley and Joshua F. Wiley
 4 | 
 5 | Permission is hereby granted, free of charge, to anyone obtaining a copy 
 6 | of this software and associated documentation files (the "Software"), 
 7 | to work with the Software within the limits of freeware distribution and fair use. 
 8 | This includes the rights to use, copy, and modify the Software for personal use. 
 9 | Users are also allowed and encouraged to submit corrections and modifications 
10 | to the Software for the benefit of other users.
11 | 
12 | It is not allowed to reuse,  modify, or redistribute the Software for 
13 | commercial use in any way, or for a user’s educational materials such as books 
14 | or blog articles without prior permission from the copyright holder. 
15 | 
16 | The above copyright notice and this permission notice need to be included 
17 | in all copies or substantial portions of the software.
18 | 
19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 | AUTHORS OR COPYRIGHT HOLDERS OR APRESS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25 | SOFTWARE.
26 | 
27 | 
28 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Apress Source Code
 2 | 
 3 | This repository accompanies [*Advanced R Statistical Programming and Data Models*](https://www.apress.com/9781484228715) by Matt Wiley and Joshua F. Wiley (Apress, 2019).
 4 | 
 5 | [comment]: #cover
 6 | ![Cover image](9781484228715.jpg)
 7 | 
 8 | Download the files as a zip using the green button, or clone the repository to your machine using Git.
 9 | 
10 | ## Releases
11 | 
12 | Release v1.0 corresponds to the code in the published book, without corrections or updates.
13 | 
14 | ## Contributions
15 | 
16 | See the file Contributing.md for more information on how you can contribute to this repository.


--------------------------------------------------------------------------------
/errata.md:
--------------------------------------------------------------------------------
 1 | # Errata for *Book Title*
 2 | 
 3 | On **page xx** [Summary of error]:
 4 |  
 5 | Details of error here. Highlight key pieces in **bold**.
 6 | 
 7 | ***
 8 | 
 9 | On **page xx** [Summary of error]:
10 |  
11 | Details of error here. Highlight key pieces in **bold**.
12 | 
13 | ***


--------------------------------------------------------------------------------
/gam.R:
--------------------------------------------------------------------------------
  1 | ## ----setup, include=TRUE, results='hide', message = FALSE, warning = FALSE, cache=FALSE----
  2 | 
  3 | library(checkpoint)
  4 | checkpoint("2018-09-28", R.version = "3.5.1",
  5 |   project = book_directory,
  6 |   checkpointLocation = checkpoint_directory,
  7 |   scanForPackages = FALSE,
  8 |   scan.rnw.with.knitr = TRUE, use.knitr = TRUE)
  9 | 
 10 | library(knitr)
 11 | library(data.table)
 12 | library(ggplot2)
 13 | library(ggthemes)
 14 | library(scales)
 15 | library(viridis)
 16 | library(car)
 17 | library(mgcv)
 18 | library(VGAM)
 19 | library(ipw)
 20 | library(JWileymisc)
 21 | library(xtable)
 22 | 
 23 | options(
 24 |   width = 70,
 25 |   stringsAsFactors = FALSE,
 26 |   datatable.print.nrows = 20,
 27 |   datatable.print.topn = 3,
 28 |   digits = 2)
 29 | 
 30 | 
 31 | ## ----fgam-poly1, fig.width=7, fig.height=6, out.width='.6\\linewidth', fig.pos="!h", fig.cap = "Graph showing an intercept only (flat line) and progressively higher order polynomials."----
 32 | 
 33 | acl <- readRDS("advancedr_acl_data.RDS")
 34 | 
 35 | ggplot(acl, aes(AGE_W1, CESD11_W1)) +
 36 |   stat_smooth(method = "lm", formula = y ~ 1,
 37 |     colour = viridis(6)[1], linetype = 1, se = FALSE) +
 38 |   stat_smooth(method = "lm", formula = y ~ x,
 39 |     colour = viridis(6)[2], linetype = 4, se = FALSE) +
 40 |   stat_smooth(method = "lm", formula = y ~ poly(x, 2),
 41 |     colour = viridis(6)[3], linetype = 2, se = FALSE) +
 42 |   stat_smooth(method = "lm", formula = y ~ poly(x, 3),
 43 |     colour = viridis(6)[4], linetype = 3, se = FALSE) +
 44 |   stat_smooth(method = "lm", formula = y ~ poly(x, 4),
 45 |     colour = viridis(6)[5], linetype = 1, se = FALSE) +
 46 |   stat_smooth(method = "lm", formula = y ~ poly(x, 10),
 47 |     colour = viridis(6)[6], linetype = 5, se = FALSE)
 48 | 
 49 | 
 50 | ## ------------------------------------------------------------------------
 51 | 
 52 | ## > and <
 53 | 1:5 %gl% c(2, 4)
 54 | 
 55 | ## > and <=
 56 | 1:5 %gle% c(2, 4)
 57 | 
 58 | ## >= and <
 59 | 1:5 %gel% c(2, 4)
 60 | 
 61 | ## >= and <=
 62 | 1:5 %gele% c(2, 4)
 63 | 
 64 | 
 65 | 
 66 | ## ----fgam-spline1, fig.width=7, fig.height=6, out.width='.6\\linewidth', fig.pos="!h", fig.cap = "Graph showing an step function spline, linear splines, and quadratic splines, all with two inner knots."----
 67 | 
 68 | ggplot(acl, aes(AGE_W1, CESD11_W1)) +
 69 |   stat_smooth(method = "lm",
 70 |     formula = y ~ 1 +
 71 |       ifelse(x %gle% c(42, 65), 1, 0) +
 72 |       ifelse(x %gle% c(65, 96), 1, 0),
 73 |     colour = viridis(6)[1], linetype = 1, se = FALSE) +
 74 |   stat_smooth(method = "lm",
 75 |     formula = y ~ bs(x, df = 3, degree = 1L),
 76 |     colour = viridis(6)[2], linetype = 2, se = FALSE) +
 77 |   stat_smooth(method = "lm",
 78 |     formula = y ~ bs(x, df = 4, degree = 2L),
 79 |     colour = viridis(6)[3], linetype = 3, se = FALSE) +
 80 |   stat_smooth(method = "lm",
 81 |     formula = y ~ bs(x, df = 5, degree = 3L),
 82 |     colour = viridis(6)[4], linetype = 4, se = FALSE)
 83 | 
 84 | 
 85 | ## ----fgam-spline2, fig.width=10, fig.height=9, out.width='1\\linewidth', fig.pos="!h", fig.cap = "Graph showing B-Splines (basis splines)."----
 86 | 
 87 | knots <- c(33, 42, 57, 65, 72)
 88 | x <- seq(from = min(acl$AGE_W1),
 89 |          to = max(acl$AGE_W1), by = .01)
 90 | 
 91 | p1 <- ggplot(melt(bs(x, degree = 1,
 92 |           knots = knots, intercept = TRUE)),
 93 |           aes(Var1, value, colour = factor(Var2))) +
 94 |   geom_line() +
 95 |   scale_color_viridis("Basis", discrete = TRUE) +
 96 |   theme_tufte()
 97 | 
 98 | plot_grid(
 99 |   p1 +
100 |     ggtitle("5 Knots, Degree = 1"),
101 |   p1 %+% melt(bs(x, degree = 2,
102 |           knots = knots, intercept = TRUE)) +
103 |     ggtitle("5 Knots, Degree = 2"),
104 |   p1 %+% melt(bs(x, degree = 3,
105 |           knots = knots, intercept = TRUE)) +
106 |     ggtitle("5 Knots, Degree = 3"),
107 |   p1 %+% melt(bs(x, degree = 4,
108 |           knots = knots, intercept = TRUE)) +
109 |     ggtitle("5 Knots, Degree = 4"),
110 |   ncol = 2)
111 | 
112 | 
113 | ## ------------------------------------------------------------------------
114 | 
115 | mgam <- vgam(CESD11_W1 ~ Sex + s(AGE_W1, df = 3), data = acl,
116 |         family = uninormal(), model = TRUE)
117 | 
118 | summary(mgam)
119 | 
120 | 
121 | ## ------------------------------------------------------------------------
122 | 
123 | coef(mgam)
124 | 
125 | 
126 | ## ------------------------------------------------------------------------
127 | 
128 | ## test parametric coefficient for sex
129 | linearHypothesis(mgam, "Sex(2) FEMALE",
130 |   coef. = coef(mgam), vcov = vcov(mgam))
131 | 
132 | 
133 | ## ------------------------------------------------------------------------
134 | 
135 | ## test parametric coefficient for
136 | ## intercept and sex simultaneously
137 | linearHypothesis(mgam,
138 |   c("(Intercept):1", "Sex(2) FEMALE"),
139 |   coef. = coef(mgam), vcov = vcov(mgam))
140 | 
141 | 
142 | ## ----fgam-agegam1, fig.width=9, fig.height=5, out.width='1\\linewidth', fig.pos="!h", fig.cap = "Plot of model results for a generalized additive model with sex as a parametric term and age as a smooth spline.", warning=FALSE----
143 | 
144 | par(mfrow = c(1, 2))
145 | plot(mgam, se = TRUE,
146 |      lcol = viridis(4)[1], scol = viridis(4)[2])
147 | 
148 | 
149 | ## ------------------------------------------------------------------------
150 | 
151 | mlin <- vglm(CESD11_W1 ~ Sex + AGE_W1, data = acl,
152 |         family = uninormal(), model = TRUE)
153 | mquad <- vglm(CESD11_W1 ~ Sex + poly(AGE_W1, 2), data = acl,
154 |         family = uninormal(), model = TRUE)
155 | 
156 | 
157 | ## ----fgam-agegam2, fig.width=9, fig.height=5, out.width='1\\linewidth', fig.pos="!h", fig.cap = "Two panel plot showing the predicted depression symptom level by age from a generalized additive model versus a linear fit on the left and a quadratic fit on the right.", warning=FALSE----
158 | 
159 | par(mfrow = c(1, 2))
160 | plot(mgam, se = TRUE, which.term = 2,
161 |      lcol = viridis(4)[1], scol = viridis(4)[1])
162 | plot(as(mlin, "vgam"), se = TRUE, which.term = 2,
163 |      lcol = viridis(4)[2], scol = viridis(4)[2],
164 |      overlay = TRUE, add = TRUE)
165 | 
166 | plot(mgam, se = TRUE, which.term = 2,
167 |      lcol = viridis(4)[1], scol = viridis(4)[1])
168 | plot(as(mquad, "vgam"), se = TRUE, which.term = 2,
169 |      lcol = viridis(4)[3], scol = viridis(4)[3],
170 |      overlay = TRUE, add = TRUE)
171 | 
172 | 
173 | ## ------------------------------------------------------------------------
174 | 
175 | mgam2 <- vgam(CESD11_W2 ~ Sex +
176 |                s(CESD11_W1, df = 3) +
177 |                s(AGE_W1, df = 3), data = acl,
178 |         family = uninormal(), model = TRUE)
179 | 
180 | summary(mgam2)
181 | 
182 | 
183 | ## ------------------------------------------------------------------------
184 | 
185 | mgam3 <- vgam(CESD11_W2 ~ Sex +
186 |                s(CESD11_W1, df = 3) +
187 |                AGE_W1, data = acl,
188 |         family = uninormal(), model = TRUE)
189 | 
190 | summary(mgam3)
191 | 
192 | 
193 | ## ------------------------------------------------------------------------
194 | 
195 | names(coef(mgam3))
196 | 
197 | linearHypothesis(mgam3,
198 |   "Sex(2) FEMALE",
199 |   coef. = coef(mgam3), vcov = vcov(mgam3))
200 | 
201 | linearHypothesis(mgam3,
202 |   "AGE_W1",
203 |   coef. = coef(mgam3), vcov = vcov(mgam3))
204 | 
205 | 
206 | ## ----fgam-gam3, fig.width=10, fig.height=10, out.width='1\\linewidth', fig.pos="!h", fig.cap = "Plot of model results for a generalized additive model with sex and age as parametric terms and wave 1 depression symptoms as a smooth spline.", warning=FALSE----
207 | 
208 | par(mfrow = c(2, 2))
209 | plot(mgam3, se = TRUE,
210 |      lcol = viridis(4)[1],
211 |      scol = viridis(4)[2])
212 | 
213 | 
214 | ## ------------------------------------------------------------------------
215 | 
216 | 
217 | ## generate new data for prediction
218 | ## use the whole range of sex and depression symptoms
219 | ## and a five number summary of age
220 | ## (min, 25th 50th 75th percentiles and max)
221 | newdat <- as.data.table(expand.grid(
222 |   Sex = levels(acl$Sex),
223 |   CESD11_W1 = seq(
224 |     from = min(acl$CESD11_W1, na.rm=TRUE),
225 |     to = max(acl$CESD11_W1, na.rm=TRUE),
226 |     length.out = 1000),
227 |   AGE_W1 = fivenum(acl$AGE_W1)))
228 | 
229 | newdat$yhat <- predict(mgam3, newdata = newdat)
230 | 
231 | 
232 | ## ----fgam-gampred, fig.width=10, fig.height=6, out.width='1\\linewidth', fig.pos="!h", fig.cap = "Predicted depression symptoms at wave 2 across levels of wave 1 depression symptoms at varying ages and sex.", warning=FALSE----
233 | 
234 | ggplot(newdat,
235 |        aes(CESD11_W1, yhat,
236 |            colour = factor(AGE_W1),
237 |            linetype = factor(AGE_W1))) +
238 |   geom_line() +
239 |   scale_color_viridis("Age", discrete = TRUE) +
240 |   scale_linetype_discrete("Age") +
241 |   facet_wrap(~ Sex) +
242 |   theme(legend.position = c(.75, .2),
243 |         legend.key.width = unit(1.5, "cm")) +
244 |   xlab("Depression Symptoms (Wave 1)") +
245 |   ylab("Depression Symptoms (Wave 2)")
246 | 
247 | 
248 | ## ------------------------------------------------------------------------
249 | 
250 | detach("package:VGAM")
251 | library(mgcv)
252 | 
253 | 
254 | ## ------------------------------------------------------------------------
255 | 
256 | mgam4 <- gam(CESD11_W2 ~ Sex +
257 |                s(CESD11_W1, k = 3) +
258 |                s(AGE_W1, k = 3), data = acl,
259 |         family = gaussian())
260 | 
261 | 
262 | ## ------------------------------------------------------------------------
263 | 
264 | summary(mgam4)
265 | 
266 | 
267 | ## ------------------------------------------------------------------------
268 | 
269 | mgam5 <- gam(CESD11_W2 ~ Sex +
270 |                s(CESD11_W1, k = 4) +
271 |                s(AGE_W1, k = 4), data = acl,
272 |         family = gaussian())
273 | 
274 | summary(mgam5)
275 | 
276 | 
277 | ## ----fgam-gam5, fig.width=10, fig.height=10, out.width='1\\linewidth', fig.pos="!h", fig.cap = "Plot of model results for two generalized additive models varying the maximum flexibility of the smooth splines.", warning=FALSE----
278 | 
279 | par(mfrow = c(2, 2))
280 | plot(mgam4, se = TRUE, scale = 0, main = "k = 3")
281 | plot(mgam5, se = TRUE, scale = 0, main = "k = 4")
282 | 
283 | 
284 | ## ----fgam-gam6, fig.width=10, fig.height=10, out.width='1\\linewidth', fig.pos="!h", fig.cap = "Plot of model results for a generalized additive models allowing splines to vary by sex.", warning=FALSE----
285 | 
286 | mgam6 <- gam(CESD11_W2 ~ Sex +
287 |               s(CESD11_W1, k = 4, by = Sex) +
288 |               s(AGE_W1, k = 4, by = Sex),
289 |              data = acl,
290 |         family = gaussian())
291 | 
292 | summary(mgam6)
293 | 
294 | par(mfrow = c(2, 2))
295 | plot(mgam6, ask = FALSE, scale = 0)
296 | 
297 | 
298 | ## ------------------------------------------------------------------------
299 | 
300 | AIC(mgam5, mgam6)
301 | BIC(mgam5, mgam6)
302 | 
303 | 
304 | ## ------------------------------------------------------------------------
305 | 
306 | mgam7 <- gam(CESD11_W2 ~ Sex +
307 |                te(CESD11_W1, SelfEsteem_W1, k = 4^2),
308 |              data = acl,
309 |         family = gaussian())
310 | 
311 | summary(mgam7)
312 | 
313 | 
314 | ## ----fgam-persp3d, fig.width=10, fig.height=10, out.width='1\\linewidth', fig.pos="!h", fig.cap = "3D perspective plots showing the result of a tensor product smooth between depression symptoms and self-esteem at wave 1 predicting depression at wave 2.", warning=FALSE----
315 | 
316 | par(mfrow = c(2, 2), mar = c(.1, .1, .1, .1))
317 | vis.gam(mgam7,
318 |   view = c("CESD11_W1", "SelfEsteem_W1"),
319 |   theta = 210, phi = 40,
320 |   color = "topo",
321 |   plot.type = "persp")
322 | vis.gam(mgam7,
323 |   view = c("CESD11_W1", "SelfEsteem_W1"),
324 |   theta = 150, phi = 40,
325 |   color = "topo",
326 |   plot.type = "persp")
327 | vis.gam(mgam7,
328 |   view = c("CESD11_W1", "SelfEsteem_W1"),
329 |   theta = 60, phi = 40,
330 |   color = "topo",
331 |   plot.type = "persp")
332 | vis.gam(mgam7,
333 |   view = c("CESD11_W1", "SelfEsteem_W1"),
334 |   theta = 10, phi = 40,
335 |   color = "topo",
336 |   plot.type = "persp")
337 | 
338 | 
339 | ## ----fgam-contour, fig.width=7, fig.height=7, out.width='0.6\\linewidth', fig.pos="!h", fig.cap = "Contour plot showing the result of a tensor product smooth between depression symptoms and self-esteem at wave 1 predicting depression at wave 2.", warning=FALSE----
340 | 
341 | par(mfrow = c(1, 1), mar = c(5.1, 4.1, 4.1, 2.1))
342 | vis.gam(mgam7,
343 |   view = c("CESD11_W1", "SelfEsteem_W1"),
344 |   color = "topo",
345 |   plot.type = "contour")
346 | 
347 | 
348 | ## ------------------------------------------------------------------------
349 | 
350 | mgam8 <- gam(CESD11_W2 ~ Sex +
351 |                ti(CESD11_W1, k = 4) +
352 |                ti(SelfEsteem_W1, k = 4) +
353 |                ti(CESD11_W1, SelfEsteem_W1, k = 4^2),
354 |              data = acl,
355 |         family = gaussian())
356 | 
357 | summary(mgam8)
358 | 
359 | 
360 | ## ------------------------------------------------------------------------
361 | 
362 | mgam5 <- gam(CESD11_W2 ~ Sex +
363 |                s(CESD11_W1, k = 4) +
364 |                s(AGE_W1, k = 4), data = acl,
365 |         family = gaussian())
366 | 
367 | summary(mgam5)
368 | 
369 | 
370 | ## ----fgam-checkit, fig.width=7, fig.height=7, out.width='0.7\\linewidth', fig.pos="!h", fig.cap = "Diagnostics plots from generalized additive model.", warning=FALSE----
371 | 
372 | par(mfrow = c(2, 2))
373 | set.seed(12345)
374 | gam.check(mgam5)
375 | 
376 | 
377 | ## ------------------------------------------------------------------------
378 | 
379 | mgam5b <- gam(CESD11_W2 ~ Sex +
380 |                s(CESD11_W1, k = 20) +
381 |                s(AGE_W1, k = 20), data = acl,
382 |         family = gaussian())
383 | 
384 | summary(mgam5b)
385 | 
386 | 
387 | ## ----fgam-gam5b, fig.width=10, fig.height=6, out.width='1\\linewidth', fig.pos="!h", fig.cap = "Plot of model results for generalized additive model after increasing k for depression symptoms."----
388 | 
389 | par(mfrow = c(1, 2))
390 | plot(mgam5b, se = TRUE, scale = 0)
391 | 
392 | 
393 | ## ------------------------------------------------------------------------
394 | 
395 | library(VGAM)
396 | acl$CurSmoke <- as.integer(acl$Smoke_W1 == "(1) Cur Smok")
397 | 
398 | mgam.lr1 <- vgam(CurSmoke ~ s(AGE_W1, df = 3),
399 |              family = binomialff(link = "logit"),
400 |              data = acl, model = TRUE)
401 | 
402 | summary(mgam.lr1)
403 | 
404 | 
405 | ## ----fgam-lr1, fig.width=7, fig.height=7, out.width='0.7\\linewidth', fig.pos="!h", fig.cap = "Generalized additive model for age and current smoking status.", warning = FALSE----
406 | 
407 | par(mfrow = c(1, 1))
408 | plot(mgam.lr1, se = TRUE,
409 |      lcol = viridis(4)[1],
410 |      scol = viridis(4)[2])
411 | 
412 | 
413 | ## ------------------------------------------------------------------------
414 | 
415 | ## generate new data for prediction
416 | ## use the whole range of age
417 | newdat <- as.data.table(expand.grid(
418 |   AGE_W1 = seq(
419 |     from = min(acl$AGE_W1, na.rm=TRUE),
420 |     to = max(acl$AGE_W1, na.rm=TRUE),
421 |     length.out = 1000)))
422 | 
423 | newdat$yhat <- predict(mgam.lr1,
424 |                        newdata = newdat,
425 |                        type = "response")
426 | 
427 | 
428 | ## ----fgam-lr1pred, fig.width=6, fig.height=5, out.width='1\\linewidth', fig.pos="!h", fig.cap = "Predicted probability of smoking across ages."----
429 | 
430 | ggplot(newdat, aes(AGE_W1, yhat)) +
431 |   geom_line() +
432 |   scale_y_continuous(labels = percent) +
433 |   xlab("Age (years)") +
434 |   ylab("Probability of Smoking") +
435 |   coord_cartesian(xlim = range(acl$AGE_W1),
436 |                   ylim = c(0, .4),
437 |                   expand = FALSE)
438 | 
439 | 
440 | ## ------------------------------------------------------------------------
441 | 
442 | nboot <- 500
443 | 
444 | out <- matrix(NA_real_, ncol = nboot, nrow = nrow(newdat))
445 | 
446 | start.time <- proc.time()
447 | set.seed(12345)
448 | for (i in 1:500) {
449 |   tmp <- vgam(CurSmoke ~ s(AGE_W1, df = 3),
450 |              family = binomialff(link = "logit"),
451 |              data = acl[sample(nrow(acl), replace = TRUE)], model = TRUE)
452 |   out[, i] <- predict(tmp,
453 |                       newdata = newdat,
454 |                       type = "response")
455 | }
456 | stop.time <- proc.time()
457 | 
458 | ## time to bootstrap 500 times
459 | stop.time - start.time
460 | 
461 | 
462 | ## ------------------------------------------------------------------------
463 | 
464 | mean(abs(newdat$yhat - rowMeans(out)))
465 | 
466 | 
467 | ## ------------------------------------------------------------------------
468 | 
469 | newdat$LL <- apply(out, 1, quantile,
470 |   probs = .025, na.rm = TRUE)
471 | 
472 | newdat$UL <- apply(out, 1, quantile,
473 |   probs = .975, na.rm = TRUE)
474 | 
475 | 
476 | ## ----fgam-lr1predboot, fig.width=6, fig.height=5, out.width='1\\linewidth', fig.pos="!h", fig.cap = "Predicted probability of smoking across ages."----
477 | 
478 | ggplot(newdat, aes(AGE_W1, yhat)) +
479 |   geom_ribbon(aes(ymin = LL, ymax = UL), fill = "grey80") +
480 |   geom_line(size = 2) +
481 |   scale_y_continuous(labels = percent) +
482 |   xlab("Age (years)") +
483 |   ylab("Probability of Smoking") +
484 |   theme_tufte() +
485 |   coord_cartesian(xlim = range(acl$AGE_W1),
486 |                   ylim = c(0, .5),
487 |                   expand = FALSE)
488 | 
489 | 
490 | ## ------------------------------------------------------------------------
491 | 
492 | acl[, EmployG_W2 := as.character(Employment_W2)]
493 | acl[EmployG_W2 %in% c(
494 |   "(2) 2500+HRS", "(3) 15002499",
495 |   "(4) 500-1499", "(5) 1-499HRS"),
496 |   EmployG_W2 := "(2) EMPLOYED"]
497 | acl[, EmployG_W2 := factor(EmployG_W2)]
498 | 
499 | mgam.mr1 <- vgam(EmployG_W2 ~ s(AGE_W1, k = 5),
500 |                family = multinomial(),
501 |                data = acl, model = TRUE)
502 | 
503 | summary(mgam.mr1)
504 | 
505 | 
506 | ## ----fgam-mr1, fig.width=10, fig.height=10, out.width='1\\linewidth', fig.pos="!h", fig.cap = "Generalized additive model for age and employment status as a 5 level unordered categorical outcome resulting in four distinct effects of age."----
507 | 
508 | par(mfrow = c(2, 2))
509 | plot(mgam.mr1, se = TRUE,
510 |      lcol = viridis(4)[1],
511 |      scol = viridis(4)[2])
512 | 
513 | 
514 | ## ------------------------------------------------------------------------
515 | 
516 | ## generate new data for prediction
517 | ## use the whole range of age
518 | newdat <- as.data.table(expand.grid(
519 |   AGE_W1 = seq(
520 |     from = min(acl$AGE_W1, na.rm=TRUE),
521 |     to = max(acl$AGE_W1, na.rm=TRUE),
522 |     length.out = 1000)))
523 | 
524 | newdat <- cbind(newdat, predict(mgam.mr1,
525 |                 newdata = newdat,
526 |                 type = "response"))
527 | 
528 | newdatlong <- melt(newdat, id.vars = "AGE_W1")
529 | 
530 | summary(newdatlong)
531 | 
532 | 
533 | ## ----fgam-mr1pred, fig.width=8, fig.height=7, out.width='.8\\linewidth', fig.pos="!h", fig.cap = "Predicted probability of employment status across ages."----
534 | 
535 | ggplot(newdatlong, aes(
536 |   AGE_W1, value,
537 |   colour = variable, linetype = variable)) +
538 |   geom_line(size = 2) +
539 |   scale_color_viridis(discrete = TRUE) +
540 |   scale_x_continuous("Age (years)") +
541 |   scale_y_continuous("Probability", label = percent) +
542 |   coord_cartesian(ylim = c(0, 1), expand = FALSE) +
543 |   theme_tufte() +
544 |   theme(legend.position = c(.2, .5),
545 |         legend.key.width = unit(2, "cm"))
546 | 
547 | 
548 | ## ------------------------------------------------------------------------
549 | 
550 | ## negative binomial regression model
551 | mgam.nbr1 <- vgam(NChronic12_W2 ~ Sex + s(AGE_W1, k = 5),
552 |               family = negbinomial(),
553 |               data = acl, model = TRUE)
554 | 
555 | summary(mgam.nbr1)
556 | 
557 | 
558 | ## ----fgam-nbr1, fig.width=10, fig.height=6, out.width='1\\linewidth', fig.pos="!h", fig.cap = "Generalized additive model for sex an age and number of chronic conditions.", warning=FALSE----
559 | 
560 | par(mfrow = c(1, 2))
561 | plot(mgam.nbr1, se = TRUE,
562 |      lcol = viridis(4)[1],
563 |      scol = viridis(4)[2])
564 | 
565 | 
566 | ## ------------------------------------------------------------------------
567 | 
568 | ## generate new data for prediction
569 | ## use the whole range of age and sex
570 | newdat <- as.data.table(expand.grid(
571 |   Sex = levels(acl$Sex),
572 |   AGE_W1 = seq(
573 |     from = min(acl$AGE_W1, na.rm=TRUE),
574 |     to = max(acl$AGE_W1, na.rm=TRUE),
575 |     length.out = 1000)))
576 | 
577 | newdat$yhat <- predict(mgam.nbr1,
578 |                        newdata = newdat,
579 |                        type = "response")
580 | 
581 | 
582 | ## ----fgam-nbr1pred, fig.width=6, fig.height=5, out.width='.7\\linewidth', fig.pos="!h", fig.cap = "Predicted number of chronic conditions across ages by sex."----
583 | 
584 | ggplot(newdat, aes(AGE_W1, yhat, colour = Sex)) +
585 |   geom_line(size = 2) +
586 |   scale_color_viridis(discrete = TRUE) +
587 |   xlab("Age (years)") +
588 |   ylab("Number Chronic Conditions") +
589 |   theme_tufte() +
590 |   coord_cartesian(xlim = range(acl$AGE_W1),
591 |                   ylim = c(0, 2.5),
592 |                   expand = FALSE) +
593 |   theme(legend.position = c(.2, .8),
594 |         legend.key.width = unit(1, "cm"))
595 | 
596 | 
597 | ## ------------------------------------------------------------------------
598 | 
599 | detach("package:VGAM")
600 | library(mgcv)
601 | 
602 | mgam.nbr2 <- gam(NChronic12_W2 ~ Sex + s(AGE_W1, k = 10, by = Sex),
603 |               family = nb(), data = acl)
604 | 
605 | summary(mgam.nbr2)
606 | 
607 | 
608 | ## ------------------------------------------------------------------------
609 | 
610 | ## generate new data for prediction
611 | ## use the whole range of age and sex
612 | newdat <- as.data.table(expand.grid(
613 |   Sex = levels(acl$Sex),
614 |   AGE_W1 = seq(
615 |     from = min(acl$AGE_W1, na.rm=TRUE),
616 |     to = max(acl$AGE_W1, na.rm=TRUE),
617 |     length.out = 1000)))
618 | 
619 | newdat$yhat <- predict(mgam.nbr2,
620 |                        newdata = newdat,
621 |                        type = "response")
622 | 
623 | 
624 | ## ----fgam-nbr2pred, fig.width=6, fig.height=5, out.width='.7\\linewidth', fig.pos="!h", fig.cap = "Predicted number of chronic conditions across ages by sex from an interaction model."----
625 | 
626 | ggplot(newdat, aes(AGE_W1, yhat, colour = Sex)) +
627 |   geom_line(size = 2) +
628 |   scale_color_viridis(discrete = TRUE) +
629 |   xlab("Age (years)") +
630 |   ylab("Number Chronic Conditions") +
631 |   theme_tufte() +
632 |   coord_cartesian(xlim = range(acl$AGE_W1),
633 |                   ylim = c(0, 2.7),
634 |                   expand = FALSE) +
635 |   theme(legend.position = c(.2, .8),
636 |         legend.key.width = unit(1, "cm"))
637 | 
638 | 
639 | ## ------------------------------------------------------------------------
640 | 
641 | nboot <- 500
642 | 
643 | out <- matrix(NA_real_, ncol = nboot, nrow = nrow(newdat))
644 | 
645 | start.time <- proc.time()
646 | set.seed(12345)
647 | for (i in 1:500) {
648 |   tmp <- gam(NChronic12_W2 ~ Sex + s(AGE_W1, k = 10, by = Sex),
649 |               family = nb(),
650 |              data = acl[sample(nrow(acl), replace = TRUE)])
651 |   out[, i] <- predict(tmp,
652 |                       newdata = newdat,
653 |                       type = "response")
654 | }
655 | stop.time <- proc.time()
656 | 
657 | ## time to bootstrap 500 times
658 | stop.time - start.time
659 | 
660 | 
661 | ## ------------------------------------------------------------------------
662 | 
663 | mean(abs(newdat$yhat - rowMeans(out)))
664 | 
665 | 
666 | ## ------------------------------------------------------------------------
667 | 
668 | newdat$LL <- apply(out, 1, quantile,
669 |   probs = .025, na.rm = TRUE)
670 | 
671 | newdat$UL <- apply(out, 1, quantile,
672 |   probs = .975, na.rm = TRUE)
673 | 
674 | 
675 | ## ----fgam-nbr1predboot, fig.width=7, fig.height=6, out.width='1\\linewidth', fig.pos="!h", fig.cap = "Predicted count of chronic conditions across ages by sex with bootstrapped confidence intervals."----
676 | 
677 | ggplot(newdat, aes(AGE_W1, yhat)) +
678 |   geom_ribbon(aes(ymin = LL, ymax = UL, fill = Sex), alpha = .2) +
679 |   geom_line(aes(colour = Sex), size = 2) +
680 |   scale_color_viridis(discrete = TRUE) +
681 |   scale_fill_viridis(discrete = TRUE) +
682 |   xlab("Age (years)") +
683 |   ylab("Number Chronic Conditions") +
684 |   theme_tufte() +
685 |   coord_cartesian(xlim = range(acl$AGE_W1),
686 |                   ylim = c(0, 4),
687 |                   expand = FALSE) +
688 |   theme(legend.position = c(.2, .8),
689 |         legend.key.width = unit(2, "cm"))
690 | 
691 | 
692 | ## ------------------------------------------------------------------------
693 | 
694 | xtabs(~Sex + I(AGE_W1 > 80), data = acl[!is.na(NChronic12_W2)])
695 | 
696 | 
697 | 


--------------------------------------------------------------------------------
/glm1.R:
--------------------------------------------------------------------------------
  1 | ## ----setup, include=TRUE, results='hide', message = FALSE, warning = FALSE, cache=FALSE----
  2 | library(checkpoint)
  3 | checkpoint("2018-09-28", R.version = "3.5.1",
  4 |   project = book_directory,
  5 |   checkpointLocation = checkpoint_directory,
  6 |   scanForPackages = FALSE,
  7 |   scan.rnw.with.knitr = TRUE, use.knitr = TRUE)
  8 | 
  9 | library(knitr)
 10 | library(data.table)
 11 | library(ggplot2)
 12 | library(visreg)
 13 | library(ez)
 14 | library(emmeans)
 15 | library(rms)
 16 | library(ipw)
 17 | library(JWileymisc)
 18 | library(RcppEigen)
 19 | library(texreg)
 20 | 
 21 | options(
 22 |   width = 70,
 23 |   stringsAsFactors = FALSE,
 24 |   datatable.print.nrows = 20,
 25 |   datatable.print.topn = 3,
 26 |   digits = 2)
 27 | 
 28 | 
 29 | ## ------------------------------------------------------------------------
 30 | f1 <- y ~ x1 + x2 + x1:x2
 31 | 
 32 | update(f1, . ~ .)
 33 | 
 34 | update(f1, w ~ .)
 35 | 
 36 | update(f1, . ~ . + x3)
 37 | 
 38 | update(f1, . ~ . - x1:x2)
 39 | 
 40 | 
 41 | ## ------------------------------------------------------------------------
 42 | 
 43 | set.seed(1234)
 44 | example <- data.table(
 45 |   y = rnorm(9),
 46 |   Condition = factor(rep(c("A", "B", "Control"), each = 3),
 47 |                      levels = c("Control", "A", "B")))
 48 | 
 49 | coef(lm(y ~ Condition, data = example))
 50 | 
 51 | 
 52 | ## ------------------------------------------------------------------------
 53 | 
 54 | example[, .(M = mean(y)), by = Condition]
 55 | 
 56 | 
 57 | ## ----include=FALSE-------------------------------------------------------
 58 | 
 59 | printq <- sprintf("$%0.2f = %0.2f - %0.2f$",
 60 |                   coef(lm(y ~ Condition, data = example))[["ConditionA"]],
 61 |                   mean(example[Condition == "A", y]),
 62 |                   mean(example[Condition == "Control", y]))
 63 | 
 64 | 
 65 | ## ------------------------------------------------------------------------
 66 | 
 67 | model.matrix(~ 0 + Condition, data = example)
 68 | 
 69 | 
 70 | ## ------------------------------------------------------------------------
 71 | 
 72 | coef(lm(y ~ 0 + Condition, data = example))
 73 | 
 74 | 
 75 | ## ------------------------------------------------------------------------
 76 | 
 77 | pf(.72, df1 = 1, df2 = 6, lower.tail = FALSE)
 78 | 
 79 | 
 80 | ## ------------------------------------------------------------------------
 81 | 
 82 | mtcars <- as.data.table(mtcars)
 83 | mtcars[, ID := factor(1:.N)]
 84 | mtcars[, vs := factor(vs)]
 85 | mtcars[, am := factor(am)]
 86 | 
 87 | head(model.matrix(~ vs * am, data = mtcars))
 88 | 
 89 | 
 90 | ## ------------------------------------------------------------------------
 91 | 
 92 | example[, ID := factor(1:.N)]
 93 | 
 94 | print(ezANOVA(
 95 |   data = example,
 96 |   dv = y,
 97 |   wid = ID,
 98 |   between = Condition,
 99 |   type = 3,
100 |   detailed = TRUE))
101 | 
102 | 
103 | ## ------------------------------------------------------------------------
104 | 
105 | print(ezANOVA(
106 |   data = mtcars,
107 |   dv = mpg,
108 |   wid = ID,
109 |   between = vs * am,
110 |   type = 3,
111 |   detailed = TRUE))
112 | 
113 | 
114 | ## ----fglm1-tukeyhsd, fig.width=4, fig.height=4, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = "Graph of cell means with confidence intervals. Cells that share letters are not statistically significantly different based on Tukey's Honestly Significant Difference."----
115 | 
116 | mtcars[, Cells := factor(sprintf("vs=%s, am=%s", vs, am))]
117 | TukeyHSDgg("Cells", "hp", mtcars) +
118 |   theme(axis.text.x = element_text(angle=45, hjust=1, vjust=1)) +
119 |   xlab("")
120 | 
121 | 
122 | ## ----fglm1-density, fig.width=5, fig.height=5, out.width='.5\\linewidth', fig.pos="!h", fig.cap = "Density plot of satisfaction with life (black line) with normal density overlayed (blue line)."----
123 | 
124 | acl <- readRDS("advancedr_acl_data.RDS")
125 | 
126 | testdistr(acl$SWL_W1, "normal",
127 |           varlab = "Satisfaction with Life", plot = FALSE,
128 |           extremevalues = "theoretical",
129 |           adjust = 2)$DensityPlot
130 | 
131 | 
132 | ## ------------------------------------------------------------------------
133 | 
134 | m.ols <- ols(SWL_W1 ~ Sex + AGE_W1 + SESCategory, data = acl, x = TRUE)
135 | m.ols
136 | 
137 | 
138 | ## ----echo=TRUE, eval=FALSE-----------------------------------------------
139 | ## 
140 | ## texreg(m.ols, single.row = TRUE, label = "tglm1-olstex")
141 | ## 
142 | 
143 | ## ----echo=FALSE, results='asis', listings=FALSE--------------------------
144 | 
145 | texreg(m.ols, single.row = TRUE, label = "tglm1-olstex", float.pos = "!hb")
146 | 
147 | 
148 | ## ------------------------------------------------------------------------
149 | 
150 | vif(m.ols)
151 | 
152 | 
153 | ## ----fglm1-quantreg, fig.width=5, fig.height=5, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = c("Graphing residuals to assess normality.", "Examining residuals versus fitted values with quantile regression to explore heteroskedasticity.")----
154 | 
155 | diagnostic.data <- data.table(
156 |   fitted = fitted(m.ols),
157 |   resid = residuals(m.ols))
158 | 
159 | testdistr(diagnostic.data$resid,
160 |           "normal",
161 |           varlab = "Satisfaction with Life Residuals", plot = FALSE,
162 |           extremevalues = "theoretical",
163 |           adjust = 2)$DensityPlot
164 | 
165 | ggplot(diagnostic.data, aes(fitted, resid)) +
166 |   geom_point(alpha = .2, colour = "grey50") +
167 |   geom_quantile(quantiles = .5, colour = 'black', size = 1) +
168 |   geom_quantile(quantiles = c(.25, .75),
169 |                 colour = 'blue', linetype = 2, size = 1) +
170 |   geom_quantile(quantiles = c(.05, .95),
171 |                 colour = 'black', linetype = 3, size = 1)
172 | 
173 | 
174 | ## ------------------------------------------------------------------------
175 | 
176 | m.glm <- glm(SWL_W1 ~ Sex + AGE_W1 + SESCategory,
177 |              data=acl, family = gaussian(link="identity"))
178 | m.glm
179 | 
180 | summary(m.glm)
181 | 
182 | 
183 | ## ------------------------------------------------------------------------
184 | 
185 | (m.ols2 <- update(m.ols, . ~ . + Employment_W1))
186 | 
187 | 
188 | ## ------------------------------------------------------------------------
189 | 
190 | anova(m.ols2)
191 | 
192 | 
193 | ## ------------------------------------------------------------------------
194 | 
195 | (m.ols3 <- update(m.ols2, . ~ . + AGE_W1 * SESCategory - Sex))
196 | 
197 | 
198 | ## ----fglm1-visreg1, fig.width=6, fig.height=6, out.width='.6\\linewidth', fig.pos="!h", fig.cap = "Estimated satisfaction with life across age by SES category. Shaded region indicates 95\\% confidence intervals for regression estimates."----
199 | 
200 | plot(visreg(m.ols3, xvar = "AGE_W1", by = "SESCategory",
201 |             plot = FALSE), 
202 |      overlay = TRUE, partial = FALSE, rug = FALSE, 
203 |      xlab = "Age (years)", ylab = "Predicted Life Satisfaction",
204 |      line = list(lty = 1:4))
205 | 
206 | 
207 | ## ----fglm1-visreg2, fig.width=6, fig.height=6, out.width='.6\\linewidth', fig.pos="!h", fig.cap = "Estimated satisfaction with life across age by SES category. Confidence intervals removed."----
208 | 
209 | plot(visreg(m.ols3, xvar = "AGE_W1", by = "SESCategory",
210 |             plot = FALSE),
211 |      overlay = TRUE, partial = FALSE, rug = FALSE, 
212 |      xlab = "Age (years)", ylab = "Predicted Life Satisfaction",
213 |      line = list(
214 |        lty = 1:4, 
215 |        col = c("black", "grey75", "grey50", "grey25")), 
216 |      band = FALSE)
217 | 
218 | 
219 | ## ------------------------------------------------------------------------
220 | 
221 | newdata <- as.data.table(expand.grid(
222 |   AGE_W1=quantile(acl$AGE_W1, .1):quantile(acl$AGE_W1, .9),
223 |   SESCategory = factor(1:4, levels = levels(acl$SESCategory)),
224 |   Employment_W1 = factor("(3) 15002499",
225 |     levels = levels(acl$Employment_W1))))
226 | newdata
227 | 
228 | 
229 | ## ------------------------------------------------------------------------
230 | 
231 | newdata[, c("SWL_W1", "SE") :=
232 |           predict(m.ols3, newdata = newdata, se.fit = TRUE)]
233 | newdata
234 | 
235 | 
236 | ## ------------------------------------------------------------------------
237 | 
238 | print(qnorm(.05/2), digits = 7)
239 | 
240 | print(qnorm(1 - (.05/2)), digits = 7)
241 | 
242 | 
243 | ## ----fglm1-intplot, fig.width=8, fig.height=5, out.width='.7\\linewidth', fig.pos="!h", fig.cap = "Estimated satisfaction with life across age by SES category. Shaded region indicates 95\\% confidence intervals for regression estimates."----
244 | 
245 | ggplot(newdata, aes(AGE_W1, SWL_W1, linetype=SESCategory)) +
246 |   geom_ribbon(aes(ymin = SWL_W1 + SE * qnorm(.025),
247 |                   ymax = SWL_W1 + SE * qnorm(.975)),
248 |               alpha = .2) +
249 |   geom_line(size = 1) +
250 |   scale_x_continuous("Age (years)") +
251 |   ylab("Satisfaction with Life") +
252 |   theme_cowplot() +
253 |   theme(
254 |     legend.position = c(.8, .16),
255 |     legend.key.width = unit(2, "cm"))
256 | 
257 | 
258 | ## ------------------------------------------------------------------------
259 | 
260 | confint(m.ols3)
261 | 
262 | 
263 | ## ------------------------------------------------------------------------
264 | 
265 | tmpdat <- na.omit(acl[, .(SWL_W1, AGE_W1, SESCategory, Employment_W1)])
266 | ## use if using Microsoft R Open with Intel's MKL linear algebra library
267 | setMKLthreads(1)
268 | 
269 | 
270 | ## ------------------------------------------------------------------------
271 | 
272 | set.seed(12345)
273 | t1 <- system.time(ols.boot <- sapply(1:500, function(i) {
274 |   index <- sample(nrow(tmpdat),
275 |                   size = nrow(tmpdat), replace = TRUE)
276 |   coef(ols(SWL_W1 ~ AGE_W1 * SESCategory + Employment_W1,
277 |            data = tmpdat[index]))
278 | }))
279 | 
280 | t1
281 | 
282 | 
283 | ## ------------------------------------------------------------------------
284 | 
285 | set.seed(12345)
286 | t2 <- system.time(rcpp.boot1 <- sapply(1:500, function(i) {
287 |   index <- sample(nrow(tmpdat), size = nrow(tmpdat), replace = TRUE)
288 |   coef(fastLm(SWL_W1 ~ AGE_W1 * SESCategory + Employment_W1, data = tmpdat[index]))
289 | }))
290 | 
291 | t2
292 | 
293 | 
294 | ## ------------------------------------------------------------------------
295 | 
296 | set.seed(12345)
297 | t3 <- system.time({
298 |   y <- tmpdat[, SWL_W1]
299 |   X <- model.matrix(~ AGE_W1 * SESCategory + Employment_W1, data = tmpdat)
300 |   N <- nrow(tmpdat)
301 |   rcpp.boot2 <- sapply(1:500, function(i) {
302 |     index <- sample.int(N, size = N, replace = TRUE)
303 |     fastLmPure(X = X[index, ], y = y[index])$coefficients
304 |   })
305 | })
306 | 
307 | t3
308 | 
309 | 
310 | ## ------------------------------------------------------------------------
311 | 
312 | set.seed(12345)
313 | t4 <- system.time({
314 |   y <- tmpdat[, SWL_W1]
315 |   X <- model.matrix(~ AGE_W1 * SESCategory + Employment_W1, data = tmpdat)
316 |   N <- nrow(tmpdat)
317 |   rcpp.boot3 <- sapply(1:10000, function(i) {
318 |     index <- sample.int(N, size = N, replace = TRUE)
319 |     fastLmPure(X = X[index, ], y = y[index])$coefficients
320 |   })
321 | })
322 | 
323 | t4
324 | 
325 | 
326 | ## ------------------------------------------------------------------------
327 | 
328 | all.equal(ols.boot, rcpp.boot1, check.attributes = FALSE)
329 | 
330 | all.equal(ols.boot, rcpp.boot2, check.attributes = FALSE)
331 | 
332 | 
333 | ## ------------------------------------------------------------------------
334 | m0 <- ols(CESD11_W2 ~ SelfEfficacy_W1, data = acl)
335 | 
336 | ## ----echo=TRUE, eval=FALSE-----------------------------------------------
337 | ## texreg(m0, label = "tglm1-olsunadj")
338 | 
339 | ## ----echo=FALSE, results='asis', listings=FALSE--------------------------
340 | texreg(m0, label = "tglm1-olsunadj", float.pos = "!hb")
341 | 
342 | ## ----fglm1-weights, fig.width=7, fig.height=8, out.width='.8\\linewidth', fig.pos="!h", fig.cap = "Raw and trimmed inverse probability weights for self-efficacy."----
343 | 
344 | ## weights
345 | w <- ipwpoint(
346 |   exposure = SelfEfficacy_W1,
347 |   family = "gaussian",
348 |   numerator = ~ 1,
349 |   denominator = ~ 1 + Sex + RaceEthnicity + AGE_W1 + NChronic12_W1,
350 |   data = acl)
351 |   
352 | plot_grid(
353 |   testdistr(w$ipw.weights, plot = FALSE)$DensityPlot,
354 |   testdistr(winsorizor(w$ipw.weights, .01),
355 |             plot = FALSE)$DensityPlot,
356 |   ncol = 1)
357 | 
358 | 
359 | ## ----results = 'asis'----------------------------------------------------
360 | 
361 | ## unweighted, unadjusted
362 | m0 <- ols(CESD11_W2 ~ SelfEfficacy_W1, data = acl)
363 | 
364 | ## weighted, adjusted 
365 | m1 <- ols(CESD11_W2 ~ SelfEfficacy_W1, data = acl, 
366 |   weights = winsorizor(w$ipw.weights, .01))
367 | 
368 | ## ----echo=TRUE, eval=FALSE-----------------------------------------------
369 | ## texreg(list(m0, m1),
370 | ##        label = "tglm1-weight1")
371 | ## 
372 | 
373 | ## ----echo=FALSE, results='asis', listings=FALSE--------------------------
374 | texreg(list(m0, m1),
375 |        label = "tglm1-weight1", float.pos = "!hb")
376 | 
377 | ## ----results = 'asis'----------------------------------------------------
378 | 
379 | # weighted, fully adjusted
380 | w2 <- ipwpoint(
381 |   exposure = SelfEfficacy_W1,
382 |   family = "gaussian",
383 |   numerator = ~ 1,
384 |   denominator = ~ 1 + Sex + RaceEthnicity + AGE_W1 + NChronic12_W1 + 
385 |     SESCategory + Employment_W1 + BMI_W1 + Smoke_W1 + PhysActCat_W1,
386 |   data = acl)
387 |   
388 | m2 <- ols(CESD11_W2 ~ SelfEfficacy_W1, data = acl, 
389 |   weights = winsorizor(w2$ipw.weights, .01))
390 | 
391 | ## ----echo=TRUE, eval=FALSE-----------------------------------------------
392 | ## texreg(list(m0, m1, m2),
393 | ##        label = "tglm1-weight2")
394 | ## 
395 | 
396 | ## ----echo=FALSE, results='asis', listings=FALSE--------------------------
397 | texreg(list(m0, m1, m2),
398 |        label = "tglm1-weight2", float.pos = "!hb")
399 | 
400 | ## ------------------------------------------------------------------------
401 | 
402 | m1b <- ols(CESD11_W2 ~ Sex + RaceEthnicity + AGE_W1 + 
403 |   NChronic12_W1 + SelfEfficacy_W1, 
404 |   data = acl)
405 | 
406 | m2b <- ols(CESD11_W2 ~ Sex + RaceEthnicity + AGE_W1 + 
407 |   NChronic12_W1 + SESCategory + 
408 |   Employment_W1 + BMI_W1 + Smoke_W1 + PhysActCat_W1 + 
409 |   SelfEfficacy_W1, data = acl)
410 | 
411 | 
412 | ## ------------------------------------------------------------------------
413 | 
414 | m1c <- ols(CESD11_W2 ~ Sex + RaceEthnicity + AGE_W1 + 
415 |   NChronic12_W1 + SelfEfficacy_W1, 
416 |   data = acl, 
417 |   weights = winsorizor(w$ipw.weights, .01))
418 | 
419 | m2c <- ols(CESD11_W2 ~ Sex + RaceEthnicity + AGE_W1 + 
420 |   NChronic12_W1 + SESCategory + 
421 |   Employment_W1 + BMI_W1 + Smoke_W1 + PhysActCat_W1 + 
422 |   SelfEfficacy_W1, data = acl,
423 |   weights = winsorizor(w2$ipw.weights, .01))
424 | 
425 | 
426 | ## ----fglm1-adjustcompare, fig.width=6, fig.height=6, out.width='.6\\linewidth', fig.pos="!h", fig.cap = "Comparison of the estimate and confidence interval for the association of self-efficacy with depression symptoms from various models. Covs = covariate adjusted models. IPW = inverse probability weight adjusted models. Covs + IPW = models that both include inverse probability weights and the same potential confounds explicitly in the model again."----
427 | 
428 | ## write an extract function
429 | extractor <- function(obj, label) {
430 |   b <- coef(obj)
431 |   ci <- confint(obj)
432 |   data.table(
433 |     Type = label,
434 |     B = b[["SelfEfficacy_W1"]],
435 |     LL = ci["SelfEfficacy_W1", "2.5 %"],
436 |     UL = ci["SelfEfficacy_W1", "97.5 %"])
437 | }
438 | 
439 | allresults <- rbind(
440 |   extractor(m0,  "M0: Unadjusted"),
441 |   extractor(m1,  "M1: Partial IPW"),
442 |   extractor(m1b, "M1: Partial Covs"),
443 |   extractor(m1c, "M1: Partial Covs + IPW"),
444 |   extractor(m2,  "M2: Full IPW"),
445 |   extractor(m2b, "M2: Full Covs"),
446 |   extractor(m2c, "M2: Full Covs + IPW"))
447 | allresults[, Type := factor(Type, levels = Type)]
448 | 
449 | 
450 | ggplot(allresults, aes(Type, y = B, ymin = LL, ymax = UL)) + 
451 |   geom_pointrange() + 
452 |   coord_flip() + 
453 |   xlab("") + ylab("Estimate + 95% CI")
454 |   
455 | 
456 | ## ------------------------------------------------------------------------
457 | 
458 | set.seed(12345)
459 | adosleep <- data.table(
460 |   SOLacti = rnorm(150, 4.4, 1.3)^2,
461 |   DBAS = rnorm(150, 72, 26),
462 |   DAS = rnorm(150, 125, 32),
463 |   Female = rbinom(150, 1, .53),
464 |   Stress = rnorm(150, 32, 11))
465 | adosleep[, SSQ := rnorm(150,
466 |              (.36 * 3 / 12.5) * SOLacti +
467 |              (.16 * 3 / 26) * DBAS +
468 |              (.18 * 3 / .5) * Female +
469 |              (.20 * 3 / 11) * Stress, 2.6)]
470 | adosleep[, MOOD := rnorm(150,
471 |              (-.07 / 12.5) * SOLacti +
472 |              (.29  / 3) * SSQ +
473 |              (.14  / 26) * DBAS +
474 |              (.21  / 32) * DAS +
475 |              (.12  / 32) * SSQ * (DAS-50) +
476 |              (.44  / .5) * Female +
477 |              (.28 / 11) * Stress, 2)]
478 | adosleep[, Female := factor(Female, levels = 0:1,
479 |                             labels = c("Males", "Females"))]
480 | 
481 | 
482 | ## ----include=FALSE-------------------------------------------------------
483 | ## ols(scale(MOOD) ~ scale(SOLacti) + scale(SSQ) + scale(DBAS) + scale(DAS) + scale(SSQ):scale(DAS) + scale(Female) + scale(Stress), data = adosleep)
484 | ## ols(MOOD ~ SOLacti + SSQ + DBAS + DAS + SSQ:DAS + Female + Stress, data = adosleep)
485 | 
486 | ## ----fglm1-case1, fig.width=8, fig.height=8, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "Distributions of case study variables."----
487 | 
488 | plot_grid(
489 |   testdistr(adosleep$MOOD, extremevalues = "theoretical",
490 |             plot=FALSE, varlab = "MOOD")$Density,
491 |   testdistr(adosleep$SSQ, extremevalues = "theoretical",
492 |             plot=FALSE, varlab = "SSQ")$Density,
493 |   testdistr(adosleep$SOLacti, extremevalues = "theoretical",
494 |             plot=FALSE, varlab = "SOLacti")$Density,
495 |   testdistr(adosleep$DAS, extremevalues = "theoretical",
496 |             plot=FALSE, varlab = "DAS")$Density,
497 |   ncol = 2)
498 | 
499 | 
500 | ## ----fglm1-case2, fig.width=6, fig.height=4.5, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "Heatmap of the correlations between study variables."----
501 | 
502 | plot(SEMSummary(
503 |   ~ MOOD + SOLacti + DBAS + DAS + Stress + SSQ,
504 |   data = adosleep), plot = "cor") +
505 |   theme(axis.text.x = element_text(
506 |           angle = 45, hjust = 1, vjust = 1))
507 | 
508 | 
509 | ## ------------------------------------------------------------------------
510 | 
511 | egltable(c("SOLacti", "SSQ", "MOOD", "Stress",
512 |            "DBAS", "DAS", "Female"),
513 |          data = as.data.frame(adosleep))
514 | 
515 | 
516 | ## ------------------------------------------------------------------------
517 | 
518 | adosleep[, zMOOD := as.vector(scale(MOOD))]
519 | adosleep[, zDBAS := as.vector(scale(DBAS))]
520 | adosleep[, zDAS := as.vector(scale(DAS))]
521 | adosleep[, zSSQ := as.vector(scale(SSQ))]
522 | adosleep[, zSOLacti := as.vector(scale(SOLacti))]
523 | adosleep[, zStress := as.vector(scale(Stress))]
524 | 
525 | 
526 | ## ------------------------------------------------------------------------
527 | 
528 | m.adosleep1 <- ols(zMOOD ~ zSOLacti + zDBAS + Female + zStress,
529 |                    data = adosleep)
530 | m.adosleep2 <- update(m.adosleep1, . ~ . + zSSQ + zDAS)
531 | m.adosleep3 <- update(m.adosleep2, . ~ . + zSSQ:zDAS)
532 | 
533 | screenreg(list(m.adosleep1, m.adosleep2, m.adosleep3))
534 | 
535 | 
536 | ## ----fglm1-case-resids, fig.width=5, fig.height=5, out.width='.5\\linewidth', fig.pos="!h", fig.cap = "Distribution of model residuals."----
537 | 
538 | vif(m.adosleep3)
539 | 
540 | testdistr(resid(m.adosleep3), plot=FALSE, varlab = "Residuals")$QQPlot
541 | 
542 | 
543 | ## ------------------------------------------------------------------------
544 | 
545 | ## refit model on raw data
546 | m.adosleep.raw <- ols(MOOD ~ SOLacti + DBAS + Female +
547 |                         Stress + SSQ * DAS,
548 |                       data = adosleep)
549 | 
550 | ## create a dataset
551 | adosleep.newdat <- as.data.table(with(adosleep, expand.grid(
552 |   SOLacti = mean(SOLacti),
553 |   DBAS = mean(DBAS),
554 |   Female = factor("Females", levels(Female)),
555 |   Stress = mean(Stress),
556 |   SSQ = seq(from = min(SSQ), to = max(SSQ), length.out = 100),
557 |   DAS = mean(DAS) + c(1, -1) * sd(DAS))))
558 | 
559 | adosleep.newdat$MOOD <- predict(m.adosleep.raw,
560 |                                 newdata = adosleep.newdat,
561 |                                 se.fit = FALSE)
562 | 
563 | adosleep.newdat[, DAS := factor(round(DAS),
564 |   levels = c(100, 161),
565 |   labels = c("M - 1 SD", "M + 1 SD"))]
566 | 
567 | 
568 | ## ----fglm1-case-intplot, fig.width=5, fig.height=5, out.width='.5\\linewidth', fig.pos="!h", fig.cap = "Interaction between subjective sleep quality and overall dysfunctional beliefs predicting negative mood."----
569 | 
570 | ggplot(adosleep.newdat, aes(SSQ, MOOD, linetype=DAS)) +
571 |   geom_line(size = 2) +
572 |   scale_x_continuous("Subjective sleep quality\n(higher is worse)") +
573 |   ylab("Negative Mood") +
574 |   theme_cowplot() +
575 |   theme(
576 |     legend.position = c(.85, .15),
577 |     legend.key.width = unit(2, "cm"))
578 | 
579 | 
580 | 


--------------------------------------------------------------------------------
/glm2.R:
--------------------------------------------------------------------------------
  1 | ## ----setup, include=TRUE, results='hide', message = FALSE, warning = FALSE, cache=FALSE----
  2 | library(checkpoint)
  3 | checkpoint("2018-09-28", R.version = "3.5.1",
  4 |   project = book_directory,
  5 |   checkpointLocation = checkpoint_directory,
  6 |   scanForPackages = FALSE,
  7 |   scan.rnw.with.knitr = TRUE, use.knitr = TRUE)
  8 | 
  9 | library(knitr)
 10 | library(data.table)
 11 | library(ggplot2)
 12 | library(ggthemes)
 13 | library(scales)
 14 | library(viridis)
 15 | library(VGAM)
 16 | library(ipw)
 17 | library(JWileymisc)
 18 | library(xtable)
 19 | library(texreg)
 20 | 
 21 | options(
 22 |   width = 70,
 23 |   stringsAsFactors = FALSE,
 24 |   datatable.print.nrows = 20,
 25 |   datatable.print.topn = 3,
 26 |   digits = 2)
 27 | 
 28 | 
 29 | ## ----fglm2-poisson, fig.width=6, fig.height=6, out.width='.6\\linewidth', fig.pos="!ht", fig.cap = "Density for a Poisson distribution with lambda = 2 and lambda = 6."----
 30 | 
 31 | dpoisson <- data.table(X = 0:20)
 32 | dpoisson[, Lambda2 := dpois(X, lambda = 2)]
 33 | dpoisson[, Lambda6 := dpois(X, lambda = 6)]
 34 | 
 35 | ggplot(melt(dpoisson, id.vars = "X"), 
 36 |        aes(X, value, fill = variable)) + 
 37 |   geom_col(position = "dodge") + 
 38 |   scale_fill_viridis(discrete = TRUE) + 
 39 |   theme(legend.position = c(.7, .8)) + 
 40 |   xlab("Y Score") + ylab("Poisson Density")
 41 | 
 42 | 
 43 | ## ------------------------------------------------------------------------
 44 | 
 45 | acl <- readRDS("advancedr_acl_data.RDS")
 46 | 
 47 | 
 48 | ## ------------------------------------------------------------------------
 49 | 
 50 | acl$CurSmoke <- as.integer(acl$Smoke_W1 == "(1) Cur Smok")
 51 | 
 52 | m.lr <- vglm(CurSmoke ~ Sex, 
 53 |              family = binomialff(link = "logit"),
 54 |              data = acl, model = TRUE)
 55 | summary(m.lr)
 56 | 
 57 | 
 58 | ## ----echo=TRUE, eval=FALSE-----------------------------------------------
 59 | ## or.tab <- xtabs(~ Sex + CurSmoke, data = acl)
 60 | ## or.tab.res <- (or.tab[1,1]/or.tab[2,1])/(or.tab[1,2]/or.tab[2,2])
 61 | ## xtable(or.tab, caption = "Observed frequency table",
 62 | ##        label = "tglm2-obsfreq")
 63 | 
 64 | ## ----echo=FALSE, results='asis', listings=FALSE--------------------------
 65 | or.tab <- xtabs(~ Sex + CurSmoke, data = acl)
 66 | or.tab.res <- (or.tab[1,1]/or.tab[2,1])/(or.tab[1,2]/or.tab[2,2])
 67 | xtable(or.tab, caption = "Observed frequency table",
 68 |        label = "tglm2-obsfreq")
 69 | 
 70 | ## ----echo=FALSE, results='asis', listings=FALSE--------------------------
 71 | 
 72 | cat(sprintf("
 73 | \\begin{equation*}
 74 |   \\frac{\\frac{%d}{%d}}{\\frac{%d}{%d}} = %0.2f
 75 | \\end{equation*}
 76 | ", or.tab[1,1], or.tab[2,1], or.tab[1,2], or.tab[2,2],
 77 | or.tab.res
 78 | ))
 79 | 
 80 | 
 81 | ## ------------------------------------------------------------------------
 82 | 
 83 | preddat <- data.table(Sex = levels(acl$Sex))
 84 | preddat$yhat <- predict(m.lr, newdata = preddat,
 85 |         type = "response")
 86 | 
 87 | 
 88 | ## ----fglm2-predprob0, fig.width=5, fig.height=5, out.width='.5\\linewidth', fig.pos="!h", fig.cap = "Graph showing the probability of smoking by sex."----
 89 | 
 90 | ggplot(preddat, aes(Sex, yhat)) +
 91 |   geom_bar(stat = "identity") +
 92 |   scale_y_continuous("Smoking Probability", labels = percent) +
 93 |   theme_tufte()
 94 | 
 95 | 
 96 | ## ----echo=TRUE, eval=FALSE-----------------------------------------------
 97 | ## xtable(coef(summary(m.lr)), digits = 2,
 98 | ##        caption = paste(
 99 | ##   "Summary of logistic regression model",
100 | ##   "including coefficients, standard errors",
101 | ##   "andd p-values."), label = "tglm2-orsimple")
102 | 
103 | ## ----echo=FALSE, results='asis', listings=FALSE--------------------------
104 | xtable(coef(summary(m.lr)), digits = 2, 
105 |        caption = paste(
106 |   "Summary of logistic regression model",
107 |   "including coefficients, standard errors",
108 |   "andd p-values."), label = "tglm2-orsimple")
109 | 
110 | ## ------------------------------------------------------------------------
111 | 
112 | ## unadjusted model
113 | m0.lr <- vglm(CurSmoke ~ SelfEfficacy_W1, 
114 |              family = binomialff(link = "logit"),
115 |              data = acl, model = TRUE)
116 | 
117 | ## estimate IPWs
118 | w <- ipwpoint(
119 |   exposure = SelfEfficacy_W1,
120 |   family = "gaussian",
121 |   numerator = ~ 1,
122 |   denominator = ~ 1 + Sex + RaceEthnicity + AGE_W1,
123 |   data = acl)
124 | 
125 | ## adjusted logistic regression model
126 | m1.lr <- vglm(CurSmoke ~ SelfEfficacy_W1, 
127 |              family = binomialff(link = "logit"),
128 |              data = acl, model = TRUE, 
129 |              weights = winsorizor(w$ipw.weights, .01))
130 | 
131 | ## ----echo=TRUE, eval=FALSE-----------------------------------------------
132 | ## xtable(rbind(
133 | ##   data.table(Type = "Raw", coef(summary(m0.lr))),
134 | ##   data.table(Type = "Adj", coef(summary(m1.lr)))),
135 | ##   digits = 2,
136 | ##   caption = paste("Comparison of unadjusted (raw)",
137 | ##     "and adjusted regression models"),
138 | ##   label = "tglm2-lrcompare")
139 | 
140 | ## ----echo=FALSE, results='asis', listings=FALSE--------------------------
141 | xtable(rbind(
142 |   data.table(Type = "Raw", coef(summary(m0.lr))),
143 |   data.table(Type = "Adj", coef(summary(m1.lr)))),
144 |   digits = 2, 
145 |   caption = paste("Comparison of unadjusted (raw)",
146 |     "and adjusted regression models"),
147 |   label = "tglm2-lrcompare")
148 | 
149 | ## ----fglm2-predprob1, fig.width=5, fig.height=5, out.width='.6\\linewidth', fig.pos="!h", fig.cap = c("Graph showing the probability of smoking by self-efficacy")----
150 | 
151 | preddat2 <- data.table(SelfEfficacy_W1 = 
152 |   seq(from = min(acl$SelfEfficacy_W1, na.rm = TRUE),
153 |       to = max(acl$SelfEfficacy_W1, na.rm = TRUE),
154 |       length.out = 1000))
155 | preddat2$yhat <- predict(m1.lr, newdata = preddat2,
156 |                         type = "response")
157 | 
158 | ggplot(preddat2, aes(SelfEfficacy_W1, yhat)) +
159 |   geom_line() +
160 |   scale_x_continuous("Self-Efficacy") +
161 |   scale_y_continuous("Smoking Probability", label = percent) +
162 |   theme_tufte() + coord_cartesian(ylim = c(.25, .40))
163 | 
164 | 
165 | ## ------------------------------------------------------------------------
166 | 
167 | ## delta value for change in self efficacy
168 | delta <- .01
169 | 
170 | ## create a copy of the dataset
171 | ## where we increase everyone's self-efficacy by delta
172 | aclalt <- copy(acl)
173 | aclalt$SelfEfficacy_W1 <- aclalt$SelfEfficacy_W1 + delta
174 | 
175 | ## calculate predicted probabilities
176 | p1 <- predict(m1.lr, newdata = acl, type = "response")
177 | p2 <- predict(m1.lr, newdata = aclalt, type = "response")
178 | 
179 | ## calculate the average, marginal change in probabilities 
180 | ## per unit change in self efficacy
181 | ## in percents and rounded
182 | round(mean((p2 - p1) / delta) * 100, 1)
183 | 
184 | 
185 | ## ------------------------------------------------------------------------
186 | 
187 | acl$PhysActCat_W2 <- factor(acl$PhysActCat_W2, ordered = TRUE)
188 | 
189 | ## adjusted ordered logistic regression model
190 | m0.or <- vglm(PhysActCat_W2 ~ SelfEfficacy_W1,
191 |               family = propodds(),
192 |               data = acl)
193 | 
194 | ## estimate IPWs
195 | w <- ipwpoint(
196 |   exposure = SelfEfficacy_W1,
197 |   family = "gaussian",
198 |   numerator = ~ 1,
199 |   denominator = ~ 1 + Sex + RaceEthnicity + AGE_W1,
200 |   data = acl)
201 | 
202 | ## adjusted ordered logistic regression model
203 | m1.or <- vglm(PhysActCat_W2 ~ SelfEfficacy_W1, 
204 |              family = propodds(),
205 |              data = acl, model = TRUE, 
206 |              weights = winsorizor(w$ipw.weights, .01))
207 | 
208 | 
209 | ## ----echo=TRUE, eval=FALSE-----------------------------------------------
210 | ## xtable(rbind(
211 | ##   data.table(Type = "Raw",
212 | ##              Labels = rownames(coef(summary(m0.or))),
213 | ##              coef(summary(m0.or))),
214 | ##   data.table(Type = "Adj",
215 | ##              Labels = rownames(coef(summary(m1.or))),
216 | ##              coef(summary(m1.or)))),
217 | ##   digits = 2,
218 | ##   caption = paste("Comparison of unadjusted (raw) and",
219 | ##    "adjusted ordered logistic regression models"),
220 | ##   label = "tglm2-orcompare")
221 | 
222 | ## ----echo=FALSE, results='asis', listings=FALSE--------------------------
223 | xtable(rbind(
224 |   data.table(Type = "Raw", 
225 |              Labels = rownames(coef(summary(m0.or))),
226 |              coef(summary(m0.or))),
227 |   data.table(Type = "Adj", 
228 |              Labels = rownames(coef(summary(m1.or))),
229 |              coef(summary(m1.or)))),
230 |   digits = 2, 
231 |   caption = paste("Comparison of unadjusted (raw) and",
232 |    "adjusted ordered logistic regression models"),
233 |   label = "tglm2-orcompare")
234 | 
235 | ## ----fglm2-predprob2, fig.width=5, fig.height=5, out.width='.6\\linewidth', fig.pos="!h", fig.cap = c("Graph showing the probability of different physical activity categories by self-efficacy")----
236 | 
237 | preddat3 <- data.table(SelfEfficacy_W1 = 
238 |   seq(from = min(acl$SelfEfficacy_W1, na.rm = TRUE),
239 |       to = max(acl$SelfEfficacy_W1, na.rm = TRUE),
240 |       length.out = 1000))
241 | preddat3 <- cbind(preddat3, 
242 |   predict(m1.or, newdata = preddat3,
243 |           type = "response"))
244 | preddat3 <- melt(preddat3, id.vars = "SelfEfficacy_W1")
245 | 
246 | ggplot(preddat3, aes(SelfEfficacy_W1, value, 
247 |                      colour = variable, linetype = variable)) +
248 |   geom_line(size = 2) +
249 |   scale_color_viridis(discrete = TRUE) + 
250 |   scale_x_continuous("Self-Efficacy") +
251 |   scale_y_continuous("Activity Probability", label = percent) +
252 |   coord_cartesian(ylim = c(0, .6), expand = FALSE) + 
253 |   theme_tufte() + 
254 |   theme(legend.position = c(.7, .8), 
255 |         legend.key.width = unit(2, "cm"))
256 | 
257 | 
258 | ## ------------------------------------------------------------------------
259 | 
260 | ## delta value for change in self efficacy
261 | delta <- .01
262 | 
263 | ## create a copy of the dataset
264 | ## where we increase everyone's self-efficacy by delta
265 | aclalt <- copy(acl)
266 | aclalt$SelfEfficacy_W1 <- aclalt$SelfEfficacy_W1 + delta
267 | 
268 | ## calculate predicted probabilities
269 | p1 <- predict(m1.or, newdata = acl, type = "response")
270 | p2 <- predict(m1.or, newdata = aclalt, type = "response")
271 | 
272 | ## average marginal change in probability of 
273 | ## membership in each category
274 | round(colMeans((p2 - p1) / delta) * 100, 1)
275 | 
276 | 
277 | ## ------------------------------------------------------------------------
278 | 
279 | acl[, EmployG_W2 := as.character(Employment_W2)]
280 | acl[EmployG_W2 %in% c(
281 |   "(2) 2500+HRS", "(3) 15002499", 
282 |   "(4) 500-1499", "(5) 1-499HRS"), 
283 |   EmployG_W2 := "(2) EMPLOYED"]
284 | acl[, EmployG_W2 := factor(EmployG_W2)]
285 | 
286 | 
287 | ## ----echo=TRUE, eval=FALSE-----------------------------------------------
288 | ## xtable(as.data.frame(table(acl$EmployG_W2)),
289 | ##        caption = "Frequency table of employment",
290 | ##        label = "tglm2-freqtab")
291 | 
292 | ## ----echo=FALSE, results='asis', listings=FALSE--------------------------
293 | xtable(as.data.frame(table(acl$EmployG_W2)),
294 |        caption = "Frequency table of employment",
295 |        label = "tglm2-freqtab")
296 | 
297 | ## ------------------------------------------------------------------------
298 | 
299 | ## unadjusted multinomial logistic regression model
300 | m0.mr <- vglm(EmployG_W2 ~ SelfEfficacy_W1,
301 |               family = multinomial(),
302 |               data = acl, model = TRUE)
303 | 
304 | ## estimate IPWs
305 | w <- ipwpoint(
306 |   exposure = SelfEfficacy_W1,
307 |   family = "gaussian",
308 |   numerator = ~ 1,
309 |   denominator = ~ 1 + Sex + RaceEthnicity + AGE_W1,
310 |   data = acl)
311 | 
312 | ## adjusted multinomial logistic regression model
313 | m1.mr <- vglm(EmployG_W2 ~ SelfEfficacy_W1,
314 |               family = multinomial(),
315 |              data = acl, model = TRUE, 
316 |              weights = winsorizor(w$ipw.weights, .01))
317 | 
318 | 
319 | ## ----echo=TRUE, eval=FALSE-----------------------------------------------
320 | ## xtable(rbind(
321 | ##   data.table(Type = "Raw",
322 | ##              Labels = rownames(coef(summary(m0.mr))),
323 | ##              coef(summary(m0.mr))),
324 | ##   data.table(Type = "Adj",
325 | ##              Labels = rownames(coef(summary(m1.mr))),
326 | ##              coef(summary(m1.mr)))),
327 | ##   digits = 2,
328 | ##   caption = paste("Comparison of unadjusted (raw) and",
329 | ##    "adjusted multinomial  logistic regression models"),
330 | ##   label = "tglm2-mrcompare")
331 | 
332 | ## ----echo=FALSE, results='asis', listings=FALSE--------------------------
333 | xtable(rbind(
334 |   data.table(Type = "Raw", 
335 |              Labels = rownames(coef(summary(m0.mr))),
336 |              coef(summary(m0.mr))),
337 |   data.table(Type = "Adj", 
338 |              Labels = rownames(coef(summary(m1.mr))),
339 |              coef(summary(m1.mr)))),
340 |   digits = 2, 
341 |   caption = paste("Comparison of unadjusted (raw) and",
342 |    "adjusted multinomial  logistic regression models"),
343 |   label = "tglm2-mrcompare")
344 | 
345 | ## ----fglm2-predprob3, fig.width=5, fig.height=5, out.width='.6\\linewidth', fig.pos="!h", fig.cap = c("Graph showing the probability of different employment categories by self-efficacy")----
346 | 
347 | preddat4 <- data.table(SelfEfficacy_W1 = 
348 |   seq(from = min(acl$SelfEfficacy_W1, na.rm = TRUE),
349 |       to = max(acl$SelfEfficacy_W1, na.rm = TRUE),
350 |       length.out = 1000))
351 | preddat4 <- cbind(preddat4, 
352 |   predict(m1.mr, newdata = preddat4,
353 |           type = "response"))
354 | preddat4 <- melt(preddat4, id.vars = "SelfEfficacy_W1")
355 | 
356 | ggplot(preddat4, aes(
357 |   SelfEfficacy_W1, value, 
358 |   colour = variable, linetype = variable)) +
359 |   geom_line(size = 2) +
360 |   scale_color_viridis(discrete = TRUE) + 
361 |   scale_x_continuous("Self-Efficacy") +
362 |   scale_y_continuous("Probability", label = percent) +
363 |   coord_cartesian(ylim = c(0, .65), expand = FALSE) + 
364 |   theme_tufte() + 
365 |   theme(legend.position = c(.18, .82), 
366 |         legend.key.width = unit(2, "cm"))
367 | 
368 | 
369 | ## ------------------------------------------------------------------------
370 | 
371 | ## delta value for change in self efficacy
372 | delta <- .01
373 | 
374 | ## create a copy of the dataset
375 | ## where we increase everyone's self-efficacy by delta
376 | aclalt <- copy(acl)
377 | aclalt$SelfEfficacy_W1 <- aclalt$SelfEfficacy_W1 + delta
378 | 
379 | ## calculate predicted probabilities
380 | p1 <- predict(m1.mr, newdata = acl, type = "response")
381 | p2 <- predict(m1.mr, newdata = aclalt, type = "response")
382 | 
383 | ## average marginal change in probability of 
384 | ## membership in each category
385 | round(colMeans((p2 - p1) / delta) * 100, 1)
386 | 
387 | 
388 | ## ------------------------------------------------------------------------
389 | 
390 | egltable(c("NChronic12_W1", "NChronic12_W2"), 
391 |          data = acl, parametric = FALSE)
392 | 
393 | 
394 | ## ----fglm2-freqplot, fig.width=5, fig.height=6, out.width='.6\\linewidth', fig.pos="!h", fig.cap = c("Graph showing the frequency of each number of chronic conditions at each wave in the ACL data")----
395 | 
396 | plot_grid(
397 |   ggplot(acl, aes(NChronic12_W1)) + 
398 |   geom_bar() + theme_tufte(),
399 |   ggplot(acl, aes(NChronic12_W2)) + 
400 |   geom_bar() + theme_tufte(),
401 |   ncol = 1, 
402 |   labels = c("Wave 1", "Wave 2"),
403 |   label_x = .8)
404 | 
405 | 
406 | ## ------------------------------------------------------------------------
407 | 
408 | ## unadjusted poisson regression model
409 | m0.pr <- vglm(NChronic12_W2 ~ SelfEfficacy_W1,
410 |               family = poissonff(),
411 |               data = acl, model = TRUE)
412 | 
413 | summary(m0.pr)
414 | 
415 | 
416 | ## ------------------------------------------------------------------------
417 | 
418 | ## unadjusted negative binomial regression model
419 | m0.nbr <- vglm(NChronic12_W2 ~ SelfEfficacy_W1,
420 |               family = negbinomial(),
421 |               data = acl, model = TRUE)
422 | 
423 | AIC(m0.nbr) - AIC(m0.pr)
424 | BIC(m0.nbr) - BIC(m0.pr)
425 | 
426 | 
427 | ## ----fglm2-simplot, fig.width=6, fig.height=5, out.width='.6\\linewidth', fig.pos="!h", fig.cap = c("Graph showing the frequency of each number of chronic conditions based on the true data, simulations from the negative binomial model, and simulations from the poisson regression model.")----
428 | 
429 | test.pr <- simulate(m0.pr, nsim = 1, seed = 1234)$sim_1
430 | test.nbr <- simulate(m0.nbr, nsim = 1, seed = 1234)$sim_1
431 | test.all <- data.table(
432 |   Type = rep(c("Truth", "Poisson", "Negative\nBinomial"),
433 |              times = c(
434 |                nrow(model.frame(m0.pr)), 
435 |                length(test.pr),
436 |                length(test.nbr))),
437 |   Score = c(
438 |     model.frame(m0.pr)$NChronic12_W2,
439 |     test.pr,
440 |     test.nbr))
441 | 
442 | ggplot(test.all, aes(Score, fill = Type)) + 
443 |   geom_bar(position = "dodge") + 
444 |   scale_fill_viridis(discrete = TRUE) + 
445 |   theme_tufte() + 
446 |   theme(legend.position = c(.8, .8))
447 | 
448 | 
449 | ## ------------------------------------------------------------------------
450 | 
451 | ## estimate IPWs
452 | w <- ipwpoint(
453 |   exposure = SelfEfficacy_W1,
454 |   family = "gaussian",
455 |   numerator = ~ 1,
456 |   denominator = ~ 1 + Sex + RaceEthnicity + AGE_W1,
457 |   data = acl)
458 | 
459 | ## adjusted negative binomial regression model
460 | m1.nbr <- vglm(NChronic12_W2 ~ SelfEfficacy_W1,
461 |               family = negbinomial(),
462 |              data = acl, model = TRUE, 
463 |              weights = winsorizor(w$ipw.weights, .01))
464 | 
465 | 
466 | ## ----echo=TRUE, eval=FALSE-----------------------------------------------
467 | ## xtable(rbind(
468 | ##   data.table(Type = "Raw",
469 | ##              Labels = rownames(coef(summary(m0.nbr))),
470 | ##              coef(summary(m0.nbr))),
471 | ##   data.table(Type = "Adj",
472 | ##              Labels = rownames(coef(summary(m1.nbr))),
473 | ##              coef(summary(m1.nbr)))),
474 | ##   digits = 2,
475 | ##   caption = paste("Comparison of unadjusted (raw) and",
476 | ##    "adjusted negative binomial regression models"),
477 | ##   label = "tglm2-nbrcompare")
478 | 
479 | ## ----echo=FALSE, results='asis', listings=FALSE--------------------------
480 | xtable(rbind(
481 |   data.table(Type = "Raw", 
482 |              Labels = rownames(coef(summary(m0.nbr))),
483 |              coef(summary(m0.nbr))),
484 |   data.table(Type = "Adj", 
485 |              Labels = rownames(coef(summary(m1.nbr))),
486 |              coef(summary(m1.nbr)))),
487 |   digits = 2, 
488 |   caption = paste("Comparison of unadjusted (raw) and",
489 |    "adjusted negative binomial regression models"),
490 |   label = "tglm2-nbrcompare")
491 | 
492 | ## ----fglm2-predcount1, fig.width=5, fig.height=5, out.width='.6\\linewidth', fig.pos="!h", fig.cap = c("Graph showing the predicted number of chronic conditions as a function of self-efficacy.")----
493 | 
494 | preddat5 <- data.table(SelfEfficacy_W1 = 
495 |   seq(from = min(acl$SelfEfficacy_W1, na.rm = TRUE),
496 |       to = max(acl$SelfEfficacy_W1, na.rm = TRUE),
497 |       length.out = 1000))
498 | preddat5$yhat <- predict(m1.nbr, newdata = preddat5,
499 |           type = "response")
500 | 
501 | ggplot(preddat5, aes(SelfEfficacy_W1, yhat)) + 
502 |   geom_line() +
503 |   scale_x_continuous("Self-Efficacy") +
504 |   scale_y_continuous("Expected Number Conditions") + 
505 |   theme_tufte()
506 | 
507 | 
508 | ## ------------------------------------------------------------------------
509 | 
510 | acl[, Smoke_W2W1 := NA_character_]
511 | acl[Smoke_W1 == "(3) Nevr Smo" & 
512 |     Smoke_W2 == "(3) W2 Never Smoker", 
513 |     Smoke_W2W1 := "Stable Never Smoker"]
514 | acl[Smoke_W1 == "(2) Past Smo" & 
515 |     Smoke_W2 == "(2) W2 Former Smoker", 
516 |     Smoke_W2W1 := "Stable Former Smoker"]
517 | acl[Smoke_W1 == "(1) Cur Smok" & 
518 |     Smoke_W2 == "(1) W2 Current Smoker", 
519 |     Smoke_W2W1 := "Stable Current Smoker"]
520 | acl[Smoke_W1 %in% c("(2) Past Smo", "(3) Nevr Smo") & 
521 |     Smoke_W2 == "(1) W2 Current Smoker", 
522 |     Smoke_W2W1 := "New Smoker"]
523 | acl[Smoke_W1 == "(1) Cur Smok" & 
524 |     Smoke_W2 == "(2) W2 Former Smoker", 
525 |     Smoke_W2W1 := "Recently Quit Smoker"]
526 | 
527 | acl[, Smoke_W2W1 := factor(Smoke_W2W1,
528 |   levels = c("Stable Never Smoker", "Stable Former Smoker",
529 |              "Stable Current Smoker", "Recently Quit Smoker", 
530 |              "New Smoker"))]
531 | 
532 | ## ----echo=TRUE, eval=FALSE-----------------------------------------------
533 | ## xtable(as.data.frame(table(acl$Smoke_W2W1)),
534 | ##        caption = "Frequency table of smoking over time",
535 | ##        label = "tglm2-freqtab-smoke")
536 | 
537 | ## ----echo=FALSE, results='asis', listings=FALSE--------------------------
538 | xtable(as.data.frame(table(acl$Smoke_W2W1)),
539 |        caption = "Frequency table of smoking over time",
540 |        label = "tglm2-freqtab-smoke")
541 | 
542 | ## ------------------------------------------------------------------------
543 | 
544 | acl[, SES := as.numeric(SESCategory)]
545 | 
546 | mr.ses <- vglm(Smoke_W2W1 ~ Sex + SES + AGE_W1,
547 |   family = multinomial(),
548 |   data = acl, model = TRUE)
549 | 
550 | mr.psych <- vglm(Smoke_W2W1 ~ SWL_W1 + InformalSI_W1 + 
551 |   FormalSI_W1 + SelfEfficacy_W1 + CESD11_W1,
552 |   family = multinomial(),
553 |   data = acl, model = TRUE)
554 | 
555 | mr.health <- vglm(Smoke_W2W1 ~ PhysActCat_W1 + 
556 |   BMI_W1 + NChronic12_W1,
557 |   family = multinomial(),
558 |   data = acl, model = TRUE)
559 | 
560 | 
561 | ## ----echo=TRUE, eval=FALSE-----------------------------------------------
562 | ## xtable(
563 | ##  data.table(
564 | ##   Model = c("Sociodemographics", "Psychosocial",  "Health"),
565 | ##   AIC = c(AIC(mr.ses), AIC(mr.psych), AIC(mr.health)),
566 | ##   BIC = c(BIC(mr.ses), BIC(mr.psych), BIC(mr.health))),
567 | ##   caption = "Model Comparisons",
568 | ##   label = "tglm2-modelcomparisons")
569 | 
570 | ## ----echo=FALSE, results='asis', listings=FALSE--------------------------
571 | xtable(
572 |  data.table(
573 |   Model = c("Sociodemographics", "Psychosocial",  "Health"),
574 |   AIC = c(AIC(mr.ses), AIC(mr.psych), AIC(mr.health)),
575 |   BIC = c(BIC(mr.ses), BIC(mr.psych), BIC(mr.health))),
576 |   caption = "Model Comparisons",
577 |   label = "tglm2-modelcomparisons")
578 | 
579 | ## ------------------------------------------------------------------------
580 | 
581 | summary(mr.ses)
582 | 
583 | 
584 | ## ------------------------------------------------------------------------
585 | 
586 | acl[, AGE_W1 := AGE_W1 / 10]
587 | 
588 | 
589 | ## ------------------------------------------------------------------------
590 | 
591 | mr.ses1 <- vglm(Smoke_W2W1 ~ Sex + SES + AGE_W1,
592 |               family = multinomial(refLevel = 1),
593 |               data = acl, model = TRUE)
594 | mr.ses2 <- update(mr.ses1, 
595 |                   family = multinomial(refLevel = 2))
596 | mr.ses3 <- update(mr.ses1, 
597 |                   family = multinomial(refLevel = 3))
598 | 
599 | 
600 | ## ------------------------------------------------------------------------
601 | data.table(
602 |   Ref = "Stable Never Smoker",
603 |   Term = names(coef(mr.ses1)),
604 |   OR = exp(coef(mr.ses1)),
605 |   exp(confint(mr.ses1)))
606 | 
607 | data.table(
608 |   Ref = "Stable Current Smoker",
609 |   Term = names(coef(mr.ses3)),
610 |   OR = exp(coef(mr.ses3)),
611 |   exp(confint(mr.ses3)))
612 | 
613 | 
614 | ## ------------------------------------------------------------------------
615 | 
616 | ## delta value for change in age and SES
617 | delta <- .01
618 | 
619 | ## create a copy of the dataset
620 | ## where we increase everyone's age by delta
621 | aclage <- copy(acl)
622 | aclage[, AGE_W1 := AGE_W1 + delta]
623 | 
624 | ## create a copy of the dataset
625 | ## where we increase everyone's SES by delta
626 | aclses <- copy(acl)
627 | aclses[, SES := SES + delta]
628 | 
629 | ## create two copies of the data
630 | ## one where se set everyone to "female" and another to "male"
631 | aclfemale <- copy(acl)
632 | aclfemale[, Sex := factor("(2) FEMALE", 
633 |                           levels = levels(acl$Sex))]
634 | 
635 | aclmale <- copy(acl)
636 | aclmale[, Sex := factor("(1) MALE", 
637 |                         levels = levels(acl$Sex))]
638 | 
639 | ## calculate predicted probabilities
640 | p.ref <- predict(mr.ses1, newdata = acl, 
641 |                  type = "response")
642 | p.age <- predict(mr.ses1, newdata = aclage, 
643 |                  type = "response")
644 | p.ses <- predict(mr.ses1, newdata = aclses, 
645 |                  type = "response")
646 | p.female <- predict(mr.ses1, newdata = aclfemale, 
647 |                     type = "response")
648 | p.male <- predict(mr.ses1, newdata = aclmale, 
649 |                     type = "response")
650 | 
651 | 
652 | ## ----echo=TRUE, eval=FALSE-----------------------------------------------
653 | ## xtable(
654 | ## data.table(
655 | ##   Level = colnames(p.ref),
656 | ##   Age = colMeans((p.age - p.ref) / delta) * 100,
657 | ##   SES = colMeans((p.ses - p.ref) / delta) * 100,
658 | ##   Female = colMeans(p.female - p.male) * 100),
659 | ##   digits = 2,
660 | ##   caption = "Average marginal change in predicted probability",
661 | ##   label = "tglm2-margprobs")
662 | 
663 | ## ----echo=FALSE, results='asis', listings=FALSE--------------------------
664 | xtable(
665 | data.table(
666 |   Level = colnames(p.ref),
667 |   Age = colMeans((p.age - p.ref) / delta) * 100,
668 |   SES = colMeans((p.ses - p.ref) / delta) * 100,
669 |   Female = colMeans(p.female - p.male) * 100),
670 |   digits = 2,
671 |   caption = "Average marginal change in predicted probability",
672 |   label = "tglm2-margprobs")
673 | 
674 | 


--------------------------------------------------------------------------------
/glmma.R:
--------------------------------------------------------------------------------
  1 | ## ----setup, include=TRUE, results='hide', message = FALSE, warning = FALSE, cache=FALSE----
  2 | 
  3 | library(checkpoint)
  4 | checkpoint("2018-09-28", R.version = "3.5.1",
  5 |   project = book_directory,
  6 |   checkpointLocation = checkpoint_directory,
  7 |   scanForPackages = FALSE,
  8 |   scan.rnw.with.knitr = TRUE, use.knitr = TRUE)
  9 | 
 10 | library(knitr)
 11 | library(ggplot2)
 12 | library(cowplot)
 13 | library(viridis)
 14 | library(JWileymisc)
 15 | library(data.table)
 16 | library(lme4)
 17 | library(lmerTest)
 18 | library(chron)
 19 | library(zoo)
 20 | library(pander)
 21 | library(texreg)
 22 | library(xtable)
 23 | library(splines)
 24 | library(parallel)
 25 | library(boot)
 26 | library(optimx)
 27 | library(dfoptim)
 28 | 
 29 | options(width = 70, digits = 2)
 30 | 
 31 | 
 32 | ## ------------------------------------------------------------------------
 33 | 
 34 | data(aces_daily)
 35 | draw <- as.data.table(aces_daily)
 36 | d <- readRDS("aces_daily_sim_processed.RDS")
 37 | 
 38 | 
 39 | ## ------------------------------------------------------------------------
 40 | 
 41 | d[, SOLs30 := as.integer(SOLs >= 30)]
 42 | 
 43 | 
 44 | ## ------------------------------------------------------------------------
 45 | 
 46 | m1.glmm <- glmer(SOLs30 ~ BCOPEDis + BWASONs + (1 | UserID),
 47 |                  family = binomial(link = logit),
 48 |                  data = d, nAGQ = 9)
 49 | summary(m1.glmm)
 50 | 
 51 | 
 52 | ## ------------------------------------------------------------------------
 53 | 
 54 | plogis(fixef(m1.glmm)[["(Intercept)"]])
 55 | 
 56 | 
 57 | ## ------------------------------------------------------------------------
 58 | 
 59 | exp(cbind(
 60 |   B = fixef(m1.glmm),
 61 |   confint(m1.glmm, parm = "beta_", method = "Wald")))
 62 | 
 63 | 
 64 | ## ------------------------------------------------------------------------
 65 | 
 66 | preddat <- as.data.table(expand.grid(
 67 |   BCOPEDis = seq(
 68 |     from = min(d$BCOPEDis, na.rm=TRUE),
 69 |     to = max(d$BCOPEDis, na.rm = TRUE),
 70 |     length.out = 1000),
 71 |   BWASONs = quantile(d$BWASONs, probs = c(.2, .8),
 72 |                      na.rm = TRUE)))
 73 | 
 74 | ## predictions based on average random effects
 75 | preddat$yhat <- predict(m1.glmm,
 76 |   newdata = preddat,
 77 |   type = "response",
 78 |   re.form = ~ 0)
 79 | 
 80 | 
 81 | ## ------------------------------------------------------------------------
 82 | 
 83 | preddat2 <- as.data.table(expand.grid(
 84 |   UserID = unique(d$UserID),
 85 |   BCOPEDis = seq(
 86 |     from = min(d$BCOPEDis, na.rm=TRUE),
 87 |     to = max(d$BCOPEDis, na.rm = TRUE),
 88 |     length.out = 1000),
 89 |   BWASONs = quantile(d$BWASONs, probs = c(.2, .8),
 90 |                      na.rm = TRUE)))
 91 | 
 92 | ## predictions based on average random effects
 93 | preddat2$yhat <- predict(m1.glmm,
 94 |   newdata = preddat2,
 95 |   type = "response",
 96 |   re.form = NULL)
 97 | 
 98 | 
 99 | ## ------------------------------------------------------------------------
100 | 
101 | ## calculate predicted probabilities
102 | ## averaging across participants
103 | preddat3 <- preddat2[, .(yhat = mean(yhat)),
104 |          by = .(BCOPEDis, BWASONs)]
105 | 
106 | 
107 | ## ----fglmma-probs1, fig.width=10, fig.height=5, out.width='1\\linewidth', fig.pos="!h", fig.cap = "Graph of the predicted probabilities setting random effects to zero (average on the logit scale) and averaging all of them."----
108 | 
109 | ggplot(rbind(
110 |   cbind(preddat, Type = "Zero"),
111 |   cbind(preddat3, Type = "Average")),
112 |   aes(BCOPEDis, yhat, colour = Type)) +
113 |   geom_line(size = 1) +
114 |   scale_color_viridis(discrete = TRUE) +
115 |   facet_wrap(~ round(BWASONs, 1)) +
116 |   theme(
117 |     legend.key.width = unit(1, "cm"),
118 |     legend.position = c(.1, .9)) +
119 |   xlab("Average disengagement coping") +
120 |   ylab("Probability of sleep onset latency 30+ min") +
121 |   coord_cartesian(
122 |     xlim = c(1, 4),
123 |     ylim = c(0, .6),
124 |     expand = FALSE)
125 | 
126 | 
127 | 
128 | ## ----fglmma-probs2, fig.width=10, fig.height=5, out.width='1\\linewidth', fig.pos="!h", fig.cap = "Graph of the individual predicted probabilities across levels of disengagement coping and average number of awakenings."----
129 | 
130 | ggplot(preddat2,
131 |   aes(BCOPEDis, yhat, group = UserID)) +
132 |   geom_line(alpha = .2) +
133 |   facet_wrap(~ round(BWASONs, 1))+
134 |   xlab("Average disengagement coping") +
135 |   ylab("Probability of sleep onset latency 30+ min") +
136 |   coord_cartesian(
137 |     xlim = c(1, 4),
138 |     ylim = c(0, 1),
139 |     expand = FALSE)
140 | 
141 | 
142 | 
143 | ## ------------------------------------------------------------------------
144 | 
145 | m2.glmm <- glmer(SOLs30 ~ BPosAff + WPosAff +
146 |   (1 + WPosAff | UserID),
147 |   family = binomial(link = logit),
148 |   data = d, nAGQ = 1)
149 | 
150 | summary(m2.glmm)
151 | 
152 | 
153 | ## ------------------------------------------------------------------------
154 | 
155 | exp(cbind(
156 |   B = fixef(m2.glmm),
157 |   confint(m2.glmm, parm = "beta_", method = "Wald")))
158 | 
159 | 
160 | ## ------------------------------------------------------------------------
161 | 
162 | bpa.low <- quantile(d$BPosAff, probs = .2, na.rm=TRUE)
163 | bpa.high <- quantile(d$BPosAff, probs = .8, na.rm=TRUE)
164 | 
165 | preddat4.low <- as.data.table(expand.grid(
166 |   UserID = unique(d$UserID),
167 |   WPosAff = seq(
168 |     from = min(d[BPosAff <= bpa.low]$WPosAff,
169 |                na.rm = TRUE),
170 |     to = max(d[BPosAff <= bpa.low]$WPosAff,
171 |              na.rm = TRUE),
172 |     length.out = 1000),
173 |   BPosAff = bpa.low))
174 | 
175 | preddat4.high <- as.data.table(expand.grid(
176 |   UserID = unique(d$UserID),
177 |   WPosAff = seq(
178 |     from = min(d[BPosAff >= bpa.high]$WPosAff,
179 |                na.rm = TRUE),
180 |     to = max(d[BPosAff >= bpa.high]$WPosAff,
181 |              na.rm = TRUE),
182 |     length.out = 1000),
183 |   BPosAff = bpa.high))
184 | 
185 | preddat4 <- rbind(
186 |   preddat4.low,
187 |   preddat4.high)
188 | 
189 | ## predictions including random effects
190 | preddat4$yhat <- predict(m2.glmm,
191 |   newdata = preddat4,
192 |   type = "response",
193 |   re.form = NULL)
194 | 
195 | ## calculate predicted probabilities
196 | ## averaging across participants
197 | preddat4b <- preddat4[, .(yhat = mean(yhat)),
198 |          by = .(WPosAff, BPosAff)]
199 | 
200 | 
201 | ## ----fglmma-probs3, fig.width=7, fig.height=6, out.width='.7\\linewidth', fig.pos="!h", fig.cap = "Graph of the predicted probabilities averaging across individuals accounting for random intercept and slope of within person positive affect."----
202 | 
203 | ggplot(preddat4b,
204 |   aes(WPosAff, yhat, colour = factor(round(BPosAff, 1)))) +
205 |   geom_line(size = 1) +
206 |   scale_color_viridis("Average\nPositive Affect",
207 |                       discrete = TRUE) +
208 |   theme(
209 |     legend.key.width = unit(1.5, "cm"),
210 |     legend.position = c(.7, .9)) +
211 |   coord_cartesian(
212 |     xlim = c(-4, 4),
213 |     ylim = c(0, .45),
214 |     expand = FALSE) +
215 |   xlab(paste0("Within person positive affect\n",
216 |               "(deviations from own mean)")) +
217 |   ylab("Probability of sleep onset latency 30+ min")
218 | 
219 | 
220 | ## ------------------------------------------------------------------------
221 | 
222 | m3.glmm <- glmer(WASONs ~ Age + BornAUS +
223 |   (1 | UserID),
224 |   family = poisson(link = log),
225 |   data = d, nAGQ = 9)
226 | 
227 | summary(m3.glmm)
228 | 
229 | 
230 | ## ------------------------------------------------------------------------
231 | 
232 | exp(cbind(
233 |   B = fixef(m3.glmm),
234 |   confint(m3.glmm, parm = "beta_", method = "Wald")))
235 | 
236 | ## ------------------------------------------------------------------------
237 | 
238 | preddat5 <- as.data.table(expand.grid(
239 |   UserID = unique(d[!is.na(BornAUS) & !is.na(Age)]$UserID),
240 |   Age = seq(
241 |     from = min(d$Age, na.rm=TRUE),
242 |     to = max(d$Age, na.rm = TRUE),
243 |     length.out = 1000),
244 |   BornAUS = 0:1))
245 | 
246 | ## predictions based on average random effects
247 | preddat5$yhat <- predict(m3.glmm,
248 |   newdata = preddat5,
249 |   type = "response",
250 |   re.form = NULL)
251 | 
252 | ## calculate predicted counts
253 | ## averaging across participants
254 | preddat5 <- preddat5[, .(yhat = mean(yhat)),
255 |          by = .(Age, BornAUS)]
256 | 
257 | 
258 | ## ----fglmma-probs5, fig.width=6, fig.height=6, out.width='.6\\linewidth', fig.pos="!h", fig.cap = "Graph of the average predicted counts of wakenings after sleep onset by age and whether people were born in Australia or not."----
259 | 
260 | ggplot(preddat5,
261 |   aes(Age, yhat, colour = factor(BornAUS))) +
262 |   geom_line(size = 2) +
263 |   scale_colour_viridis("Born in Australia", discrete = TRUE) +
264 |   xlab("Age (years)") +
265 |   ylab("Predicted # wakenings after sleep onset") +
266 |   theme(
267 |     legend.key.width = unit(1.5, "cm"),
268 |     legend.position = c(.1, .9)) +
269 |   coord_cartesian(
270 |     xlim = c(18, 26.5),
271 |     ylim = c(0, 2),
272 |     expand = FALSE)
273 | 
274 | 
275 | ## ------------------------------------------------------------------------
276 | 
277 | m3.glmm.nb <- glmer.nb(formula(m3.glmm),
278 |   data = d)
279 | 
280 | 
281 | ## ------------------------------------------------------------------------
282 | 
283 | ## load R code shipped with lme4 to provide the allFit()
284 | source(system.file("utils", "allFit.R", package="lme4"))
285 | m3.all <- allFit(m3.glmm.nb)
286 | 
287 | 
288 | ## ------------------------------------------------------------------------
289 | 
290 | m3.all.sum <- summary(m3.all)
291 | 
292 | m3.all.sum$fixef
293 | m3.all.sum$llik
294 | m3.all.sum$theta
295 | 
296 | 
297 | ## ------------------------------------------------------------------------
298 | 
299 | screenreg(
300 |   list(Poisson = m3.glmm,
301 |        NegBin = m3.glmm.nb))
302 | 
303 | 
304 | ## ------------------------------------------------------------------------
305 | 
306 | exp(cbind(
307 |   fixef(m3.glmm),
308 |   confint(m3.glmm, parm = "beta_", method = "Wald"),
309 |   fixef(m3.glmm.nb),
310 |   confint(m3.glmm.nb, parm = "beta_", method = "Wald")))
311 | 
312 | 
313 | ## ------------------------------------------------------------------------
314 | 
315 | theta <- getME(m3.glmm.nb, "glmer.nb.theta")
316 | 
317 | density <- data.table(
318 |   X = as.integer(names(table(d$WASONs))),
319 |   Observed = as.vector(prop.table(table(d$WASONs))))
320 | 
321 | density$NegBin <- colMeans(do.call(rbind, lapply(fitted(m3.glmm.nb), function(mu) {
322 |   dnbinom(density$X, size = theta, mu = mu)
323 |   })))
324 | 
325 | density$Poisson <- colMeans(do.call(rbind, lapply(fitted(m3.glmm), function(mu) {
326 |   dpois(density$X, lambda = mu)
327 |   })))
328 | 
329 | 
330 | ## ----fglmma-pnbdens, fig.width=6, fig.height=6, out.width='.6\\linewidth', fig.pos="!h", fig.cap = "Observed and expected average densities of number of awakenings at night based on a poisson and negative binomial GLMM."----
331 | 
332 | ggplot(melt(density, id.vars = "X"),
333 |   aes(X, value, fill = variable)) +
334 |   geom_col(position = "dodge") +
335 |   scale_fill_viridis("Type", discrete = TRUE) +
336 |   theme(legend.position = c(.8, .8)) +
337 |   xlab("Number of awakenings") +
338 |   ylab("Density") +
339 |   coord_cartesian(
340 |     xlim = c(-.5, 4.5),
341 |     ylim = c(0, .5),
342 |     expand = FALSE)
343 | 
344 | 
345 | ## ------------------------------------------------------------------------
346 | 
347 | getME(m3.glmm.nb, "glmer.nb.theta")
348 | 
349 | 
350 | ## ----fglmma-wasodist, fig.width=6, fig.height=6, out.width='.6\\linewidth', fig.pos="!h", fig.cap = "Distribution of within person deviations from typical number of awakenings after sleep onset lagged to the previous day."----
351 | 
352 | testdistr(d[, WWASONsLag1],
353 |           varlab = "Within WASONs lag 1")
354 | 
355 | 
356 | ## ------------------------------------------------------------------------
357 | 
358 | m4.glmm <- glmer(WASONs ~ Age + BornAUS +
359 |    WWASONsLag1 +
360 |   (1 + WWASONsLag1  | UserID),
361 |   family = poisson(link = log),
362 |   data = d, nAGQ = 1)
363 | 
364 | summary(m4.glmm)
365 | 
366 | 
367 | ## ------------------------------------------------------------------------
368 | 
369 | exp(cbind(
370 |   B = fixef(m4.glmm),
371 |   confint(m4.glmm, parm = "beta_", method = "Wald")))
372 | 
373 | 
374 | ## ----fglmma-wasodist2, fig.width=6, fig.height=6, out.width='.6\\linewidth', fig.pos="!h", fig.cap = "For nearly all people, when they have more than usual awakenings one night, they tend to have fewer awakenings the following night."----
375 | 
376 | testdistr(exp(coef(m4.glmm)$UserID$WWASONsLag1))
377 | 
378 | 
379 | ## ------------------------------------------------------------------------
380 | 
381 | preddat.boot <- as.data.table(expand.grid(
382 |   UserID = unique(model.frame(m4.glmm)$UserID),
383 |   WWASONsLag1 = seq(
384 |     from = min(d$WWASONsLag1, na.rm = TRUE),
385 |     to = max(d$WWASONsLag1, na.rm = TRUE),
386 |     length.out = 100),
387 |   Age = quantile(d[!duplicated(UserID)]$Age,
388 |                  probs = c(.2, .8), na.rm = TRUE),
389 |   BornAUS = 0:1))
390 | 
391 | preddat.boot$yhat <- predict(m4.glmm,
392 |   newdata = preddat.boot)
393 | 
394 | 
395 | 
396 | ## ------------------------------------------------------------------------
397 | 
398 | genPred <- function(m) {
399 |   predict(m,
400 |     newdata = preddat.boot)
401 | }
402 | 
403 | cl <- makeCluster(4)
404 | clusterExport(cl, c("book_directory",
405 |                     "checkpoint_directory",
406 |                     "preddat.boot", "d", "genPred"))
407 | 
408 | clusterEvalQ(cl, {
409 |   library(checkpoint)
410 |   checkpoint("2018-09-28", R.version = "3.5.1",
411 |     project = book_directory,
412 |     checkpointLocation = checkpoint_directory,
413 |     scanForPackages = FALSE,
414 |     scan.rnw.with.knitr = TRUE, use.knitr = TRUE)
415 | 
416 |   library(data.table)
417 |   library(lme4)
418 |   library(lmerTest)
419 | })
420 | 
421 | 
422 | 
423 | ## ------------------------------------------------------------------------
424 | 
425 | system.time(bootres <- bootMer(m4.glmm,
426 |     FUN = genPred,
427 |     nsim = 100,
428 |     seed = 12345,
429 |     use.u = FALSE,
430 |     type = "parametric",
431 |     parallel = "snow",
432 |     ncpus = 4,
433 |     cl = cl))
434 | 
435 | 
436 | ## ------------------------------------------------------------------------
437 | 
438 | preddat.boot[, Index := rep(1L:400L,
439 |   each = length(unique(UserID)))]
440 | 
441 | 
442 | ## ------------------------------------------------------------------------
443 | 
444 | preddat.boot.avg <- preddat.boot[, .(yhat = mean(exp(yhat))),
445 |   by = .(WWASONsLag1, Age, BornAUS)]
446 | 
447 | 
448 | ## ------------------------------------------------------------------------
449 | 
450 | dim(bootres$t)
451 | 
452 | for (i in 1:400) {
453 |   ## find which indices to use
454 |   ok <- which(preddat.boot$Index == i)
455 | 
456 |   ## now average across people
457 |   tmp_avg <- rowMeans(exp(bootres$t[, ok]))
458 | 
459 |   ## lower confidence interval
460 |   preddat.boot.avg[i,
461 |     LL := quantile(tmp_avg, probs = .025, na.rm = TRUE)]
462 |   preddat.boot.avg[i,
463 |     UL := quantile(tmp_avg, probs = .975, na.rm = TRUE)]
464 | }
465 | 
466 | 
467 | ## ----fglmml-poispredboot, fig.width = 10, fig.height = 6, out.width='1\\linewidth', fig.pos="!h", fig.cap = "Plot of predicted number of awakenings by previous night awakenings (relative to own average), separated by age (years) at the 20th and 80th percentiles (19.4y and 25y, respectively) and born in Australia (0 = no, 1 = yes). Bootstrap confidence intervals around the mean predicted count are show through shading."----
468 | 
469 | ggplot(preddat.boot.avg, aes(WWASONsLag1, yhat,
470 |   colour = factor(BornAUS), fill = factor(BornAUS))) +
471 |   geom_ribbon(aes(ymin = LL, ymax = UL),
472 |               alpha = .25, colour = NA) +
473 |   geom_line(size = 1) +
474 |   ylab("Predicted Awakenings") +
475 |   xlab("Within person awakenings lag 1") +
476 |   scale_color_viridis("Born in Australia", discrete = TRUE) +
477 |   scale_fill_viridis("Born in Australia", discrete = TRUE) +
478 |   theme(
479 |     legend.position = "bottom",
480 |     legend.key.width = unit(1, "cm")) +
481 |   facet_wrap(~ Age) +
482 |   coord_cartesian(
483 |    xlim = c(-3, 3),
484 |    ylim = c(0, 2.5),
485 |    expand = FALSE)
486 | 
487 | 
488 | 


--------------------------------------------------------------------------------
/glmmi.R:
--------------------------------------------------------------------------------
  1 | ## ----setup, include=TRUE, results='hide', message = FALSE, warning = FALSE, cache=FALSE----
  2 | 
  3 | 
  4 | library(checkpoint)
  5 | checkpoint("2018-09-28", R.version = "3.5.1",
  6 |   project = book_directory,
  7 |   checkpointLocation = checkpoint_directory,
  8 |   scanForPackages = FALSE,
  9 |   scan.rnw.with.knitr = TRUE, use.knitr = TRUE)
 10 | 
 11 | 
 12 | library(knitr)
 13 | library(ggplot2)
 14 | library(cowplot)
 15 | library(viridis)
 16 | library(JWileymisc)
 17 | library(data.table)
 18 | library(lme4)
 19 | library(lmerTest)
 20 | library(chron)
 21 | library(zoo)
 22 | library(pander)
 23 | library(texreg)
 24 | 
 25 | options(width = 70, digits = 2)
 26 | 
 27 | 
 28 | ## ------------------------------------------------------------------------
 29 | 
 30 | ex.wide <- data.table(
 31 |   ID = c(1, 2, 3),
 32 |   SBPT1 = c(135, 120, 121),
 33 |   SBPT2 = c(130, 125, 125),
 34 |   SBPT3 = c(125, 121, NA))
 35 | 
 36 | print(ex.wide)
 37 | 
 38 | 
 39 | reshape(
 40 |   data = ex.wide,
 41 |   varying = list(paste0("SBPT", 1:3)),
 42 |   v.names = c("SBP"),
 43 |   idvar = "ID",
 44 |   direction = "long")
 45 | 
 46 | 
 47 | ## ------------------------------------------------------------------------
 48 | 
 49 | ex.long <- data.table(
 50 |   ID = c(1, 1, 1, 2, 2, 2, 3, 3),
 51 |   SBP = c(135, 130, 125, 120, 125, 121, 121, 125),
 52 |   Time = c(1, 2, 3, 1, 2, 3, 1, 2))
 53 | 
 54 | print(ex.long)
 55 | 
 56 | reshape(
 57 |   data = ex.long,
 58 |   v.names = "SBP",
 59 |   timevar = "Time",
 60 |   sep = "T",
 61 |   idvar = "ID",
 62 |   direction = "wide")
 63 | 
 64 | 
 65 | ## ------------------------------------------------------------------------
 66 | 
 67 | data(aces_daily)
 68 | str(aces_daily)
 69 | 
 70 | 
 71 | ## ----cache=FALSE---------------------------------------------------------
 72 | 
 73 | draw <- as.data.table(aces_daily)
 74 | draw <- draw[order(UserID, SurveyDay, SurveyInteger)]
 75 | draw[, UserID := factor(UserID)]
 76 | 
 77 | tmpdata <- draw[!is.na(SurveyDay) & !is.na(SurveyInteger)][, .(
 78 |   MinD = min(SurveyDay),
 79 |   MinS = min(SurveyInteger[SurveyDay == min(SurveyDay)]),
 80 |   MaxD = max(SurveyDay),
 81 |   MaxS = max(SurveyInteger[SurveyDay == max(SurveyDay)])),
 82 |   by = UserID]
 83 | 
 84 | tmpdata <- tmpdata[, .(
 85 |   SurveyInteger = c(
 86 |     MinS:3L, #first day
 87 |     rep(1L:3L, times = MaxD - MinD - 1), #all days between first/last
 88 |     1L:MaxS), #last day
 89 |   SurveyDay = as.Date(rep(MinD:MaxD, c(
 90 |       4L - MinS, #first day
 91 |       rep(3, MaxD - MinD - 1), #all days between first/last
 92 |       MaxS)), origin = "1970-01-01")), #lastday
 93 |   by = UserID]
 94 | 
 95 | d <- merge(draw, tmpdata, by = c("UserID", "SurveyDay", "SurveyInteger"),
 96 |            all = TRUE)
 97 | 
 98 | nrow(draw)
 99 | nrow(d)
100 | 
101 | nrow(draw)/nrow(d)
102 | 
103 | 
104 | ## ----fglmmi-varplot, fig.width=4, fig.height=6, out.width='.8\\linewidth', fig.pos="!h", fig.cap = "Plot showing hypothetical data with high between variance and low between variance. In the high between variance, observations within a person vary little, but there are large individual differences. In low between variance, there are not many individual differences, but large variability within each person"----
105 | 
106 | set.seed(1234)
107 | ex.data.1 <- data.table(
108 |   ID = factor(rep(1:4, each = 10)),
109 |   time = rep(1:10, times = 4),
110 |   y = rnorm(40, rep(1:4, each = 10), .2))
111 | 
112 | ex.data.2 <- data.table(
113 |   ID = factor(rep(1:4, each = 10)),
114 |   time = rep(1:10, times = 4),
115 |   y = rnorm(40, 2.5, 1))
116 | 
117 | plot_grid(
118 |  ggplot(ex.data.1,
119 |         aes(time, y, colour = ID, shape = ID)) +
120 |   stat_smooth(method = "lm", formula = y ~ 1, se=FALSE) +
121 |   geom_point() +
122 |   scale_color_viridis(discrete = TRUE),
123 |  ggplot(ex.data.2,
124 |         aes(time, y, colour = ID, shape = ID)) +
125 |   stat_smooth(method = "lm", formula = y ~ 1, se=FALSE) +
126 |   geom_point() +
127 |   scale_color_viridis(discrete = TRUE),
128 |  ncol = 1,
129 |  labels = c(
130 |    "High Between Variance",
131 |    "Low Between Variance"),
132 |  align = "hv")
133 | 
134 | 
135 | ## ------------------------------------------------------------------------
136 | 
137 | ## mean and SD on all observations
138 | egltable("PosAff", data = d)
139 | 
140 | ## mean and SD first averaging within ID
141 | egltable("PosAff",
142 |   data = d[, .(
143 |     PosAff = mean(PosAff, na.rm = TRUE)),
144 |     by = UserID])
145 | 
146 | ## mean and SD on first observations
147 | egltable("PosAff", data = d[
148 |   order(UserID, SurveyDay, SurveyInteger)][,
149 |     .(PosAff = PosAff[1]), by = UserID])
150 | 
151 | 
152 | ## ------------------------------------------------------------------------
153 | 
154 | tab <- egltable(c("Female", "Age", "BornAUS", "SES_1", "EDU"),
155 |                 data = d[!duplicated(UserID)],
156 |                 strict = FALSE)
157 | tab
158 | 
159 | 
160 | ## ----fglmmi-likplot, fig.width=7, fig.height=4, out.width='.8\\linewidth', fig.pos="!h", fig.cap = "Plot showing average coping ratings for women and men."----
161 | 
162 | ## create a dataset of the means and labels by gender
163 | copeplotdata <- d[!is.na(Female), .(
164 |   M = c(
165 |     mean(COPEPrb, na.rm = TRUE),
166 |     mean(COPEPrc, na.rm = TRUE),
167 |     mean(COPEExp, na.rm = TRUE),
168 |     mean(COPEDis, na.rm = TRUE)),
169 |   Var = 1:4,
170 |   Low = sprintf("I usually don't do this at all\n[%s]",
171 |                 c("Problem Focused", "Emotional Processing",
172 |                   "Emotional Expression", "Disengagement")),
173 |   High = sprintf("I usually do this a lot\n[%s]",
174 |                  c("Problem Focused", "Emotional Processing",
175 |                    "Emotional Expression", "Disengagement"))),
176 |   by = Female]
177 | 
178 | ## coded 0/1 but for plotting, R needs to know
179 | ## it is discrete not a continuous number
180 | copeplotdata[, Female := factor(Female)]
181 | 
182 | ## create a plot
183 | gglikert(x = "M", y = "Var", leftLab = "Low", rightLab = "High",
184 |          data = copeplotdata, colour = "Female",
185 |   xlim = c(1, 4), title = "Average Coping") +
186 |   scale_colour_manual(values =
187 |     c("1" = "grey70", "0" = "grey30"))
188 | 
189 | 
190 | ## ----fglmmi-likplotaf, fig.width=7, fig.height=3, out.width='.8\\linewidth', fig.pos="!h", fig.cap = "Plot showing average coping ratings for women and men."----
191 | 
192 | ## create a dataset of the means and labels by stress
193 | afplotdata <- d[!is.na(STRESS), .(
194 |   M = c(
195 |     mean(PosAff, na.rm = TRUE),
196 |     mean(NegAff, na.rm = TRUE)),
197 |   Var = 1:2,
198 |   Low = sprintf("Very Slightly or\nNot at all\n[%s]",
199 |                 c("Positive Affect", "Negative Affect")),
200 |   High = sprintf("Extremely\n\n[%s]",
201 |                 c("Positive Affect", "Negative Affect"))),
202 |   by = .(Stress = STRESS > 5)]
203 | 
204 | ## add labels to understand stress
205 | afplotdata[, Stress := factor(Stress, levels = c(FALSE, TRUE),
206 |                               labels = c("<= 5", "> 5"))]
207 | 
208 | ## create a plot
209 | gglikert(x = "M", y = "Var", leftLab = "Low", rightLab = "High",
210 |          data = afplotdata, colour = "Stress",
211 |   xlim = c(1, 5), title = "Affect by Stress") +
212 |   scale_colour_manual(values =
213 |     c("<= 5" = "grey70", "> 5" = "grey30"))
214 | 
215 | 
216 | ## ------------------------------------------------------------------------
217 | 
218 | d[, Survey := factor(SurveyInteger, levels = 1:3,
219 |     labels = c("Morning", "Afternoon", "Evening"))]
220 | 
221 | egltable(c("PosAff", "NegAff", "STRESS"), g = "Survey",
222 |   data = d[, .(
223 |     PosAff = mean(PosAff, na.rm = TRUE),
224 |     NegAff = mean(NegAff, na.rm = TRUE),
225 |     STRESS = mean(STRESS, na.rm = TRUE)
226 |     ), by = .(UserID, Survey)])
227 | 
228 | 
229 | ## ------------------------------------------------------------------------
230 | 
231 | d[, BPosAff := mean(PosAff, na.rm = TRUE), by = UserID]
232 | d[, WPosAff := PosAff - BPosAff]
233 | 
234 | egltable("BPosAff", data = d[!duplicated(UserID)])
235 | egltable("WPosAff", data = d)
236 | 
237 | 
238 | ## ------------------------------------------------------------------------
239 | 
240 | ## define a new function
241 | bwmean <- function(x, na.rm = TRUE) {
242 |   m <- mean(x, na.rm = na.rm)
243 |   list(m, x - m)
244 | }
245 | 
246 | ## apply it to affect, support, and stress, by ID
247 | d[, c("BNegAff", "WNegAff") := bwmean(NegAff), by = UserID]
248 | d[, c("BSUPPORT", "WSUPPORT") := bwmean(SUPPORT), by = UserID]
249 | d[, c("BSTRESS", "WSTRESS") := bwmean(STRESS), by = UserID]
250 | 
251 | 
252 | ## ------------------------------------------------------------------------
253 | 
254 | d[, .(
255 |   NCope = sum(!is.na(COPEPrb)),
256 |   NSOLs = sum(!is.na(SOLs))),
257 |   by = Survey]
258 | 
259 | 
260 | ## ------------------------------------------------------------------------
261 | 
262 | d[, BCOPEPrb := mean(COPEPrb, na.rm = TRUE), by = UserID]
263 | d[, WCOPEPrb := na.omit(COPEPrb) - BCOPEPrb,
264 |   by = .(UserID, SurveyDay)]
265 | d[, BCOPEPrc := mean(COPEPrc, na.rm = TRUE), by = UserID]
266 | d[, WCOPEPrc := na.omit(COPEPrc) - BCOPEPrc,
267 |   by = .(UserID, SurveyDay)]
268 | d[, BCOPEExp := mean(COPEExp, na.rm = TRUE), by = UserID]
269 | d[, WCOPEExp := na.omit(COPEExp) - BCOPEExp,
270 |   by = .(UserID, SurveyDay)]
271 | d[, BCOPEDis := mean(COPEDis, na.rm = TRUE), by = UserID]
272 | d[, WCOPEDis := na.omit(COPEDis) - BCOPEDis,
273 |   by = .(UserID, SurveyDay)]
274 | 
275 | d[, BSOLs := mean(SOLs, na.rm = TRUE), by = UserID]
276 | d[, WSOLs := na.omit(SOLs) - BSOLs,
277 |   by = .(UserID, SurveyDay)]
278 | d[, BWASONs := mean(WASONs, na.rm = TRUE), by = UserID]
279 | d[, WWASONs := na.omit(WASONs) - BWASONs,
280 |   by = .(UserID, SurveyDay)]
281 | 
282 | 
283 | ## ------------------------------------------------------------------------
284 | 
285 | iccMixed("NegAff", "UserID", d)
286 | 
287 | iccMixed("PosAff", "UserID", d)
288 | 
289 | 
290 | ## ------------------------------------------------------------------------
291 | 
292 | ## number of units
293 | n <- length(unique(d$UserID))
294 | 
295 | ## average observations per unit
296 | k <- nrow(d[!is.na(NegAff)])/n
297 | 
298 | ## effective sample size
299 | nEffective(n, k, dv = "NegAff", id = "UserID", data = d)
300 | 
301 | k <- nrow(d[!is.na(PosAff)])/n
302 | nEffective(n, k, dv = "PosAff", id = "UserID", data = d)
303 | 
304 | 
305 | ## ------------------------------------------------------------------------
306 | 
307 | tmp <- meanDecompose(PosAff ~ UserID, data = d)
308 | str(tmp, max.level = 1)
309 | 
310 | 
311 | ## ----fglmmi-bdistpa, fig.width=4, fig.height=5, out.width='.4\\linewidth', fig.pos="!h", fig.cap = "Between person positive affect against a normal distribution."----
312 | 
313 | testdistr(tmp[[1]]$X, varlab = names(tmp)[1],
314 |           extremevalues = "theoretical", robust=TRUE)
315 | 
316 | 
317 | ## ----fglmmi-mdistpa, fig.width=8, fig.height=8, out.width='.7\\linewidth', fig.pos="!h", fig.cap = "Between and within person positive affect against a normal distribution."----
318 | 
319 | plots <- lapply(names(tmp), function(x) {
320 |   testdistr(tmp[[x]]$X, plot = FALSE, varlab = x,
321 |             extremevalues = "theoretical", robust=TRUE)[1:2]
322 | })
323 | 
324 | do.call(plot_grid, c(unlist(plots, FALSE), ncol = 2))
325 | 
326 | 
327 | ## ----fglmmi-mdistna, fig.width=8, fig.height=12, out.width='.8\\linewidth', fig.pos="!h", fig.cap = "Different levels of negative affect against a normal distribution."----
328 | 
329 | tmp <- meanDecompose(NegAff ~ UserID + SurveyDay, data = d)
330 | do.call(plot_grid, c(unlist(lapply(names(tmp), function(x) {
331 |   testdistr(tmp[[x]]$X, plot = FALSE, varlab = x,
332 |             extremevalues = "theoretical", robust=TRUE)[1:2]
333 | }), FALSE), ncol = 2))
334 | 
335 | 
336 | ## ----fglmmi-mdistlogna, fig.width=8, fig.height=12, out.width='.8\\linewidth', fig.pos="!h", fig.cap = "Different levels of the natural logarithm of negative affect against a normal distribution."----
337 | 
338 | d[, logNegAff := log(NegAff)]
339 | tmp <- meanDecompose(logNegAff ~ UserID + SurveyDay, data = d)
340 | do.call(plot_grid, c(unlist(lapply(names(tmp), function(x) {
341 |   testdistr(tmp[[x]]$X, plot = FALSE, varlab = x,
342 |             extremevalues = "theoretical", robust=TRUE)[1:2]
343 | }), FALSE), ncol = 2))
344 | 
345 | 
346 | ## ----fglmmi-timetrends, fig.width=8, fig.height=6, out.width='.8\\linewidth', fig.pos="!h", fig.cap = "Trends in variables over time with a gam smooth.", warning=FALSE----
347 | 
348 | dt <- d[, .(
349 |   WPosAff = mean(WPosAff, na.rm = TRUE),
350 |   WNegAff = mean(WNegAff, na.rm = TRUE),
351 |   WSTRESS = mean(WSTRESS, na.rm = TRUE),
352 |   WSUPPORT = mean(WSUPPORT, na.rm = TRUE),
353 |   WSOLs = mean(WSOLs, na.rm = TRUE),
354 |   WWASONs = mean(WWASONs, na.rm = TRUE)) , by = SurveyDay]
355 | dt <- melt(dt, id.var = "SurveyDay")
356 | 
357 | ggplot(dt, aes(SurveyDay, value)) +
358 |   geom_point() +
359 |   stat_smooth(method = "gam", formula = y ~ s(x, k = 10)) +
360 |   facet_wrap(~ variable, scales = "free")
361 | 
362 | 
363 | ## ----fglmmi-weekend, fig.width=8, fig.height=6, out.width='.8\\linewidth', fig.pos="!h", fig.cap = "Trends in variables over time with a gam smooth."----
364 | 
365 | dt[, Weekend := weekdays(SurveyDay) %in% c("Saturday", "Sunday")]
366 | 
367 | ggplot(dt, aes(Weekend, value)) +
368 |   stat_summary(fun.data = mean_cl_boot) +
369 |   facet_wrap(~ variable, scales = "free")
370 | 
371 | 
372 | ## ------------------------------------------------------------------------
373 | 
374 | d[, StartTimec11Alt := ifelse(is.na(SurveyStartTimec11),
375 |                               mean(SurveyStartTimec11, na.rm = TRUE),
376 |                               SurveyStartTimec11),
377 |   by = .(UserID, Survey)]
378 | d[, StartDayTimec11Alt := chron(
379 |       dates. = format(SurveyDay, "%m/%d/%Y"),
380 |       times. = StartTimec11Alt)]
381 | 
382 | 
383 | ## ----fglmmi-acf1, fig.width=5, fig.height=4, out.width='.4\\linewidth', fig.pos="!h", fig.cap = "Autocorrelation for one participant.", warning=FALSE----
384 | 
385 | tmpd <- d[UserID == 1]
386 | acf(na.approx(zoo(tmpd$PosAff,
387 |     order.by = tmpd$StartDayTimec11Alt)),
388 |     lag.max = 10)
389 | 
390 | 
391 | ## ----fglmmi-acfall, fig.width=4, fig.height=8, out.width='.6\\linewidth', fig.pos="!h", fig.cap = "Autocorrelation for all participant for positive and negative affect and stress.", message = FALSE, warning = FALSE----
392 | 
393 | acf.posaff <- acfByID("PosAff", "StartDayTimec11Alt",
394 |                       "UserID", d)
395 | 
396 | print(acf.posaff)
397 | 
398 | ## make for other measures
399 | acf.negaff <- acfByID("NegAff", "StartDayTimec11Alt",
400 |                       "UserID", d)
401 | acf.stress <- acfByID("STRESS", "StartDayTimec11Alt",
402 |                       "UserID", d)
403 | 
404 | ## put into one dataset for plotting a panel
405 | acf.all <- rbind(
406 |   acf.posaff, acf.negaff,
407 |   acf.stress)
408 | 
409 | ggplot(acf.all,
410 |     aes(factor(Lag), y = AutoCorrelation)) +
411 |   geom_hline(yintercept = 0, colour = "grey50", size = 1) +
412 |   geom_hline(yintercept = c(-.5, .5),
413 |              linetype = 2, colour = "grey50", size = 1) +
414 |   geom_boxplot() + ylab("Auto Correlation") +
415 |   facet_wrap(~ Variable, ncol = 1)
416 | 
417 | 
418 | ## ------------------------------------------------------------------------
419 | 
420 | ## ensure data ordered by ID, date, and time
421 | d <- d[order(UserID, SurveyDay, SurveyInteger)]
422 | ## calculate a number for the survey from 1 to total
423 | d[, USURVEYID := 1:.N, by = .(UserID)]
424 | 
425 | d[,
426 |   c("NegAffLag1", "WNegAffLag1",
427 |     "PosAffLag1", "WPosAffLag1",
428 |     "STRESSLag1", "WSTRESSLag1") :=
429 |     .SD[.(UserID = UserID, USURVEYID = USURVEYID - 1),
430 |     .(NegAff, WNegAff,
431 |       PosAff, WPosAff,
432 |       STRESS, WSTRESS),
433 |       on = c("UserID", "USURVEYID")]]
434 | 
435 | d[,
436 |   c("WCOPEPrbLag1", "WCOPEPrcLag1",
437 |     "WCOPEExpLag1", "WCOPEDisLag1",
438 |     "WSOLsLag1", "WWASONsLag1") :=
439 |   .SD[.(UserID = UserID, Survey = Survey, SurveyDay = SurveyDay - 1),
440 |       .(WCOPEPrb, WCOPEPrc, WCOPEExp, WCOPEDis,
441 |         WSOLs, WWASONs),
442 |       on = c("UserID", "Survey", "SurveyDay")]]
443 | 
444 | ## save data after processing, with compression
445 | ## for use in subsequent chapters
446 | saveRDS(d, file = "aces_daily_sim_processed.RDS",
447 |         compress = "xz")
448 | 
449 | 
450 | ## ----fglmmi-glmmdiagnegaff, fig.width=8, fig.height=10, out.width='.7\\linewidth', fig.pos="!h", fig.cap = "Negative affect mixed effects model diagnostic plots showing the distribution of residuals (top left), the residuals versus fitted values to assess homogeneity of variance (top right), the distribution of the random intercept (middle left), the distribution of the random slope (middle right), and whether the random effects are multivariate normal (bottom left).", warning = FALSE----
451 | 
452 | m.negaff <- lmer(NegAff ~ 1 + BSTRESS + WSTRESS +
453 |             (1 + WSTRESS | UserID), data = d)
454 | assumptiontests <- plotDiagnosticsLMER(m.negaff, plot = FALSE)
455 | do.call(plot_grid, c(
456 |   assumptiontests[c("ResPlot", "ResFittedPlot")],
457 |   assumptiontests$RanefPlot, ncol = 2))
458 | 
459 | 
460 | ## ----fglmmi-glmmdiagposaff, fig.width=8, fig.height=10, out.width='.8\\linewidth', fig.pos="!h", fig.cap = "Positive affect mixed effects model diagnostic plots showing the distribution of residuals (top left), the residuals versus fitted values to assess homogeneity of variance (top right), the distribution of the random intercept (middle left), the distribution of the random slope (middle right), and whether the random effects are multivariate normal (bottom left).", warning = FALSE----
461 | 
462 | m.posaff <- lmer(PosAff ~ 1 + BSTRESS + WSTRESS +
463 |             (1 + WSTRESS | UserID), data = d)
464 | 
465 | assumptiontests <- plotDiagnosticsLMER(m.posaff, plot = FALSE)
466 | do.call(plot_grid, c(
467 |   assumptiontests[c("ResPlot", "ResFittedPlot")],
468 |   assumptiontests$RanefPlot, ncol = 2))
469 | 
470 | 
471 | ## ----fglmmi-glmmdiagposaff2, fig.width=8, fig.height=10, out.width='.8\\linewidth', fig.pos="!h", fig.cap = "Positive affect mixed effects model diagnostic plots showing the distribution of residuals (top left), the residuals versus fitted values to assess homogeneity of variance (top right), the distribution of the random intercept (middle left), the distribution of the random slope (middle right), and whether the random effects are multivariate normal (bottom left). Results after removing two multivariate outliers, IDs 57 and 123.", warning = FALSE----
472 | 
473 | assumptiontests$ExtremeValues[
474 |   EffectType == "Multivariate Random Effect UserID"]
475 | 
476 | m.posaff <- lmer(PosAff ~ 1 + BSTRESS + WSTRESS +
477 |             (1 + WSTRESS | UserID),
478 |             data = d[!UserID %in% c(57, 123)])
479 | 
480 | assumptiontests <- plotDiagnosticsLMER(m.posaff, plot = FALSE)
481 | do.call(plot_grid, c(
482 |   assumptiontests[c("ResPlot", "ResFittedPlot")],
483 |   assumptiontests$RanefPlot, ncol = 2))
484 | 
485 | 
486 | 
487 | 
488 | ## ----fglmmi-posaffext, fig.width=5, fig.height=5, out.width='.5\\linewidth', fig.pos="!h", fig.cap = "Positive affect and stress associations, highlight extreme cases", warning = FALSE----
489 | 
490 | ggplot() +
491 |   stat_smooth(aes(WSTRESS, PosAff, group = UserID),
492 |     data = d[!UserID %in% c(123)], method = "lm",
493 |    se = FALSE, colour = "grey50") +
494 |   stat_smooth(aes(WSTRESS, PosAff, group = UserID),
495 |     data = d[UserID %in% c(123)], method = "lm",
496 |    se = FALSE, colour = "blue", size = 2) +
497 |   geom_point(aes(WSTRESS, PosAff),
498 |     data = d[UserID %in% c(123)], colour = "blue", size = 2) +
499 |   stat_smooth(aes(WSTRESS, PosAff, group = UserID),
500 |     data = d[UserID %in% c(57)], method = "lm",
501 |    se = FALSE, colour = "orange", size = 2) +
502 |   geom_point(aes(WSTRESS, PosAff),
503 |     data = d[UserID %in% c(57)], colour = "orange", size = 2)
504 | 
505 | 
506 | 


--------------------------------------------------------------------------------
/iiv.R:
--------------------------------------------------------------------------------
  1 | ## ----setup, include=TRUE, results='hide', message = FALSE, warning = FALSE, cache=FALSE----
  2 | 
  3 | library(checkpoint)
  4 | checkpoint("2018-09-28", R.version = "3.5.1",
  5 |   project = book_directory,
  6 |   checkpointLocation = checkpoint_directory,
  7 |   scanForPackages = FALSE,
  8 |   scan.rnw.with.knitr = TRUE, use.knitr = TRUE)
  9 | 
 10 | library(knitr)
 11 | library(ggplot2)
 12 | library(cowplot)
 13 | library(viridis)
 14 | library(data.table)
 15 | library(JWileymisc)
 16 | library(varian)
 17 | library(mice)
 18 | library(parallel)
 19 | 
 20 | options(width = 70, digits = 2)
 21 | 
 22 | 
 23 | ## ------------------------------------------------------------------------
 24 | 
 25 | ## ordered
 26 | sd(c(1, 3, 5, 7, 9))
 27 | rmssd(c(1, 3, 5, 7, 9))
 28 | 
 29 | ## randomized
 30 | sd(c(3, 1, 9, 5, 7))
 31 | rmssd(c(3, 1, 9, 5, 7))
 32 | 
 33 | 
 34 | ## ------------------------------------------------------------------------
 35 | 
 36 | data(aces_daily)
 37 | draw <- as.data.table(aces_daily)
 38 | d <- readRDS("aces_daily_sim_processed.RDS")
 39 | 
 40 | variability_measures <- function(x) {
 41 |   x <- na.omit(x)
 42 |   list(
 43 |     SD = sd(x),
 44 |     VAR = sd(x)^2,
 45 |     RMSSD = rmssd(x),
 46 |     MSSD = rmssd(x)^2,
 47 |     MAD = median(abs(x - median(x))),
 48 |     RANGE = range(x),
 49 |     IQR = abs(diff(quantile(x, probs = c(.25, .75)))),
 50 |     CV = sd(x) / mean(x))
 51 | }
 52 | 
 53 | 
 54 | ## ----fiiv-varplot, fig.width=14, fig.height=14, out.width='1\\linewidth', fig.pos="!h", fig.cap = "Correlations between variability measures for applied to individuals across four different measures."----
 55 | 
 56 | plot_grid(
 57 |   plot(SEMSummary(~ .,
 58 |     data = d[, variability_measures(PosAff), by = UserID][,-1]),
 59 |     order = "asis") +
 60 |     ggtitle("PosAff"),
 61 |   plot(SEMSummary(~ .,
 62 |     data = d[, variability_measures(NegAff), by = UserID][,-1]),
 63 |     order = "asis") +
 64 |     ggtitle("NegAff"),
 65 |   plot(SEMSummary(~ .,
 66 |     data = d[, variability_measures(COPEPrc), by = UserID][,-1]),
 67 |     order = "asis") +
 68 |     ggtitle("COPEPrc"),
 69 |   plot(SEMSummary(~ .,
 70 |     data = d[, variability_measures(SOLs), by = UserID][,-1]),
 71 |     order = "asis") +
 72 |     ggtitle("SOLs"),
 73 | ncol = 2)
 74 | 
 75 | 
 76 | ## ----fiiv-varplot2, fig.width=7, fig.height=7, out.width='.6\\linewidth', fig.pos="!h", fig.cap = "Two hypothetical people given an intervention, Person A (purple) and Person B (yellow), both improve at about the same rate, but Person B is less consistent."----
 77 | 
 78 | iivdat <- data.table(
 79 |   Assessment = 0:15,
 80 |   PersonA = c(1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10),
 81 |   PersonB = c(2, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11, 8, 12))
 82 | 
 83 | ggplot(iivdat, aes(Assessment)) +
 84 |   stat_smooth(aes(y = PersonA), method = "lm", se=FALSE,
 85 |               colour = viridis(2)[1], linetype = 2) +
 86 |   geom_line(aes(y = PersonA),
 87 |             colour = viridis(2)[1], size = 1) +
 88 |   stat_smooth(aes(y = PersonB), method = "lm", se=FALSE,
 89 |               colour = viridis(2)[2], linetype = 2) +
 90 |   geom_line(aes(y = PersonB),
 91 |             colour = viridis(2)[2], size = 1) +
 92 |   ylab("Outcome Scores")
 93 | 
 94 | 
 95 | ## ------------------------------------------------------------------------
 96 | 
 97 | ## ISD
 98 | sd(iivdat$PersonA)
 99 | sd(iivdat$PersonB)
100 | 
101 | 
102 | ## ------------------------------------------------------------------------
103 | 
104 | ## ISD, after removing systematic improvements
105 | sd(resid(lm(PersonA ~ Assessment, data = iivdat)))
106 | sd(resid(lm(PersonB ~ Assessment, data = iivdat)))
107 | 
108 | 
109 | ## ------------------------------------------------------------------------
110 | 
111 | cl <- makeCluster(2)
112 | clusterExport(cl, c("book_directory", "checkpoint_directory" ))
113 | 
114 | clusterEvalQ(cl, {
115 |   library(checkpoint)
116 |   checkpoint("2018-09-28", R.version = "3.5.1",
117 |   project = book_directory,
118 |   checkpointLocation = checkpoint_directory,
119 |   scanForPackages = FALSE,
120 |   scan.rnw.with.knitr = TRUE, use.knitr = TRUE)
121 | 
122 |   library(varian)
123 | })
124 | 
125 | system.time(m <- varian(
126 |   y.formula = BNegAff ~ 1,
127 |   v.formula = PosAff ~ 1 | UserID,
128 |   data = d,
129 |   design = "V -> Y",
130 |   useU = TRUE,
131 |   totaliter = 10000,
132 |   warmup = 500, thin = 5,
133 |   chains = 2, verbose=TRUE,
134 |   cl = cl))
135 | 
136 | 
137 | ## ----fiiv-bvmdiag1, fig.width=9, fig.height=14, out.width='.6\\linewidth', fig.pos="!h", fig.cap = "Diagnostics including percent scale reduction factor (Rhat), effective sample size, distribution of the individual standard deviations, distribution of the individual means, and individual estimates of individual standard deviations and means with credible intervals."----
138 | 
139 | ## check diagnostics
140 | vm_diagnostics(m)
141 | 
142 | 
143 | ## ----fiiv-bvmres1, fig.width=8, fig.height=8, out.width='.6\\linewidth', fig.pos="!h", fig.cap = "Plots of the distributions, bivariate scatter plot, and proportion of cases on each side of zero for empirical p-values."----
144 | 
145 | ## extract MCMC samples
146 | mcmc.samples <- extract(m$results,
147 |   permute = TRUE)
148 | 
149 | ## examine MCMC samples of
150 | ## the alpha regression coefficients
151 | vmp_plot(mcmc.samples$Yalpha)
152 | 
153 | 
154 | ## ------------------------------------------------------------------------
155 | 
156 | ## intercept of average negative affect
157 | param_summary(mcmc.samples$YB[, 1])
158 | 
159 | ## IIV on average negative affect
160 | param_summary(mcmc.samples$Yalpha[, 1])
161 | 
162 | ## individual mean on average negative affect
163 | param_summary(mcmc.samples$Yalpha[, 2])
164 | 
165 | ## residual error of average negative affect
166 | param_summary(mcmc.samples$sigma_Y)
167 | 
168 | 
169 | ## ------------------------------------------------------------------------
170 | 
171 | ## intercept of positive affect
172 | param_summary(mcmc.samples$VB[, 1])
173 | 
174 | ## positive affect random intercept standard deviation
175 | param_summary(mcmc.samples$sigma_U)
176 | 
177 | ## estimate of the gamma rate parameter for IIVs
178 | param_summary(mcmc.samples$rate)
179 | 
180 | ## estimate of the gamma shape parameter for IIVs
181 | param_summary(mcmc.samples$shape)
182 | 
183 | ## ------------------------------------------------------------------------
184 | 
185 | dim(mcmc.samples$Sigma_V)
186 | str(mcmc.samples$Sigma_V)
187 | 
188 | 
189 | ## ------------------------------------------------------------------------
190 | 
191 | avg_dataset <- cbind(
192 |   d[!duplicated(UserID), .(BNegAff)],
193 |   IIV = colMeans(mcmc.samples$Sigma_V),
194 |   IIM = colMeans(mcmc.samples$U))
195 | 
196 | avg_model <- lm(BNegAff ~ IIV + IIM, data = avg_dataset)
197 | 
198 | summary(avg_model)
199 | 
200 | 
201 | ## ------------------------------------------------------------------------
202 | 
203 | ind_dataset <- lapply(seq(1, 1000, by = 10), function(i) {
204 |   cbind(
205 |   d[!duplicated(UserID), .(BNegAff)],
206 |   IIV = mcmc.samples$Sigma_V[i, ],
207 |   IIM = mcmc.samples$U[i, ])
208 | })
209 | 
210 | 
211 | ind_model <- lapply(ind_dataset, function(tmpdat) {
212 |   lm(BNegAff ~ IIV + IIM, data = tmpdat)
213 | })
214 | 
215 | ind_model_pooled <- pool(as.mira(ind_model))
216 | 
217 | 
218 | ## ------------------------------------------------------------------------
219 | 
220 | raw_model <- lm(BNegAff ~ IIV + IIM,
221 |  data = d[, .(BNegAff = BNegAff[1],
222 |               IIV = sd(PosAff, na.rm = TRUE),
223 |               IIM = mean(PosAff, na.rm = TRUE)),
224 |           by = UserID])
225 | 
226 | 
227 | ## ------------------------------------------------------------------------
228 | 
229 | ## Bayesian Results
230 | param_summary(mcmc.samples$YB[, 1]) ## intercept
231 | param_summary(mcmc.samples$Yalpha[, 1]) ## IIV
232 | param_summary(mcmc.samples$Yalpha[, 2]) ## IIM
233 | 
234 | ## using averages only
235 | cbind(B = coef(avg_model), confint(avg_model))
236 | 
237 | ## using raw ISDs
238 | cbind(B = coef(raw_model), confint(raw_model))
239 | 
240 | ## treating as multiply imputed
241 | summary(ind_model_pooled, conf.int = TRUE)
242 | 
243 | 
244 | 


--------------------------------------------------------------------------------
/intro.R:
--------------------------------------------------------------------------------
 1 | ## ----setup, include=TRUE, results='hide', message = FALSE, warning = FALSE, cache=FALSE----
 2 | library(checkpoint)
 3 | checkpoint("2018-09-28", R.version = "3.5.1",
 4 |   project = book_directory,
 5 |   checkpointLocation = checkpoint_directory,
 6 |   scanForPackages = FALSE,
 7 |   scan.rnw.with.knitr = TRUE, use.knitr = TRUE)
 8 | 
 9 | library(data.table)
10 | 
11 | options(
12 |   width = 70,
13 |   stringsAsFactors = FALSE,
14 |   digits = 2)
15 | 
16 | ## ------------------------------------------------------------------------
17 | 
18 | load("../ICPSR_04690/DS0001/04690-0001-Data.rda")
19 | ls()
20 | 
21 | acl <- as.data.table(da04690.0001)
22 | acl <- acl[, .(
23 |   V2, V1801, V2101, V2064,
24 |   V3007, V2623, V2636, V2640,
25 |   V2000,
26 |   V2200, V2201, V2202,
27 |   V2613, V2614, V2616,
28 |   V2618, V2681,
29 |   V7007, V6623, V6636, V6640,
30 |   V6201, V6202,
31 |   V6613, V6614, V6616,
32 |   V6618, V6681
33 | )]
34 | 
35 | setnames(acl, names(acl), c(
36 |   "ID", "Sex", "RaceEthnicity", "SESCategory",
37 |   "Employment_W1", "BMI_W1", "Smoke_W1", "PhysActCat_W1",
38 |   "AGE_W1",
39 |   "SWL_W1", "InformalSI_W1", "FormalSI_W1",
40 |   "SelfEsteem_W1", "Mastery_W1", "SelfEfficacy_W1",
41 |   "CESD11_W1", "NChronic12_W1",
42 |   "Employment_W2", "BMI_W2", "Smoke_W2", "PhysActCat_W2",
43 |   "InformalSI_W2", "FormalSI_W2",
44 |   "SelfEsteem_W2", "Mastery_W2", "SelfEfficacy_W2",
45 |   "CESD11_W2", "NChronic12_W2"
46 |            ))
47 | 
48 | acl[, ID := factor(ID)]
49 | acl[, SESCategory := factor(SESCategory)]
50 | acl[, SWL_W1 := SWL_W1 * -1]
51 | 
52 | saveRDS(acl, "advancedr_acl_data.RDS", compress = "xz")
53 | 
54 | 
55 | 


--------------------------------------------------------------------------------
/md.R:
--------------------------------------------------------------------------------
  1 | ## ----setup, include=TRUE, results='hide', message = FALSE, warning = FALSE, cache=FALSE----
  2 | 
  3 | library(checkpoint)
  4 | checkpoint("2018-09-28", R.version = "3.5.1",
  5 |   project = book_directory,
  6 |   checkpointLocation = checkpoint_directory,
  7 |   scanForPackages = FALSE,
  8 |   scan.rnw.with.knitr = TRUE, use.knitr = TRUE)
  9 | 
 10 | library(knitr)
 11 | library(ggplot2)
 12 | library(cowplot)
 13 | library(lattice)
 14 | library(viridis)
 15 | library(VIM)
 16 | 
 17 | library(mice)
 18 | library(micemd)
 19 | library(parallel)
 20 | 
 21 | library(data.table)
 22 | library(xtable)
 23 | library(JWileymisc) # has data
 24 | 
 25 | options(width = 70, digits = 2)
 26 | 
 27 | 
 28 | ## ----cache=FALSE---------------------------------------------------------
 29 | 
 30 | ## load example dataset
 31 | data("aces_daily")
 32 | draw <- as.data.table(aces_daily)[order(UserID)]
 33 | davg <- na.omit(draw[, .(
 34 |   Female = na.omit(Female)[1],
 35 |   Age = na.omit(Age)[1],
 36 |   SES_1 = na.omit(SES_1)[1],
 37 |   EDU = na.omit(EDU)[1],
 38 |   STRESS = mean(STRESS, na.rm = TRUE),
 39 |   SUPPORT = mean(SUPPORT, na.rm = TRUE),
 40 |   PosAff = mean(PosAff, na.rm = TRUE),
 41 |   NegAff = mean(NegAff, na.rm = TRUE)),
 42 |   by = UserID])
 43 | 
 44 | 
 45 | ## ------------------------------------------------------------------------
 46 | 
 47 | ## missing depending on support and stress
 48 | davg[, MissingProb := ifelse(
 49 |          SUPPORT < 5,
 50 |            ifelse(STRESS > 2.5, .4, .0),
 51 |            ifelse(STRESS > 2.5, 0, .4))]
 52 | 
 53 | set.seed(1234)
 54 | davgmiss <- copy(davg)
 55 | davgmiss[, PosAff := ifelse(rbinom(
 56 |              .N, size = 1, prob = MissingProb) == 1,
 57 |              NA, PosAff)]
 58 | davgmiss[, NegAff := ifelse(rbinom(
 59 |              .N, size = 1, prob = MissingProb) == 1,
 60 |              NA, NegAff)]
 61 | ## random missingness on stress and support
 62 | davgmiss[, STRESS := ifelse(rbinom(
 63 |              .N, size = 1, prob = .1) == 1,
 64 |              NA, STRESS)]
 65 | davgmiss[, SUPPORT := ifelse(rbinom(
 66 |              .N, size = 1, prob = .1) == 1,
 67 |              NA, SUPPORT)]
 68 | davgmiss[, Age := ifelse(rbinom(
 69 |              .N, size = 1, prob = .1) == 1,
 70 |              NA, Age)]
 71 | davgmiss[, SES_1 := ifelse(rbinom(
 72 |              .N, size = 1, prob = .1) == 1,
 73 |              NA, SES_1)]
 74 | davgmiss[, Female := factor(ifelse(rbinom(
 75 |              .N, size = 1, prob = .1) == 1,
 76 |              NA, Female), levels = 0:1,
 77 |              labels = c("Male", "Female"))]
 78 | davgmiss[, EDU := factor(ifelse(rbinom(
 79 |              .N, size = 1, prob = .1) == 1,
 80 |              NA, EDU), levels = 0:1,
 81 |              labels = c("< Uni Graduate", "Uni Graduate +"))]
 82 | ## drop unneeded variables to make analysis easier
 83 | davgmiss[, MissingProb := NULL]
 84 | davgmiss[, UserID := NULL]
 85 | 
 86 | 
 87 | ## ----fmd-missplot, fig.width=12, fig.height=12/1.1, out.width='.8\\linewidth', fig.pos="!h", fig.cap = "Visual summary of the missingness by variable and missingness patterns."----
 88 | 
 89 | aggr(davgmiss, prop = TRUE,
 90 |      numbers = TRUE)
 91 | 
 92 | 
 93 | ## ----fmd-missbi, fig.width=9, fig.height=9, out.width='1\\linewidth', fig.pos="!h", fig.cap = "Bivariate plots of missing data. Central dots show the non-missing data. Margin dots show missing data. Boxplots summarize each variable by whether the other variable is missing or present."----
 94 | 
 95 | par(mfrow = c(2, 2))
 96 | marginplot(davgmiss[,.(STRESS, NegAff)])
 97 | marginplot(davgmiss[,.(SUPPORT, NegAff)])
 98 | marginplot(davgmiss[,.(STRESS, PosAff)])
 99 | marginplot(davgmiss[,.(SUPPORT, PosAff)])
100 | 
101 | 
102 | ## ------------------------------------------------------------------------
103 | 
104 | ## does age differ by missing on negative affect?
105 | t.test(Age ~ is.na(NegAff), data = davgmiss)$p.value
106 | 
107 | ## does age differ by missing on positive affect?
108 | t.test(Age ~ is.na(PosAff), data = davgmiss)$p.value
109 | 
110 | ## does stress differ by missing on negative affect?
111 | t.test(STRESS ~ is.na(NegAff), data = davgmiss)$p.value
112 | 
113 | ## does stress differ by missing on positive affect?
114 | t.test(STRESS ~ is.na(PosAff), data = davgmiss)$p.value
115 | 
116 | ## does social support differ by missing on negative affect?
117 | t.test(SUPPORT ~ is.na(NegAff), data = davgmiss)$p.value
118 | 
119 | ## does social support differ by missing on positive affect?
120 | t.test(SUPPORT ~ is.na(PosAff), data = davgmiss)$p.value
121 | 
122 | 
123 | ## ------------------------------------------------------------------------
124 | 
125 | system.time(mi.1 <- mice(
126 |   davgmiss,
127 |   m = 6,   maxit = 10,
128 |   defaultMethod = c("norm", "logreg", "polyreg", "polr"),
129 |   seed = 1234, printFlag = FALSE)
130 | )
131 | 
132 | 
133 | ## ----fmd-micediag, fig.width=7, fig.height=9, out.width='1\\linewidth', fig.pos="!h", fig.cap = c("mice diagnostics for convergence", "mice diagnostics for convergence after more iterations")----
134 | 
135 | ## plot convergence diagnostics
136 | plot(mi.1, PosAff + NegAff + SUPPORT ~ .it | .ms)
137 | 
138 | ## run an additional iterations
139 | system.time(mi.1 <- mice.mids(
140 |   mi.1, maxit = 10,
141 |   printFlag = FALSE)
142 | )
143 | 
144 | ## plot convergence diagnostics
145 | plot(mi.1, PosAff + NegAff + SUPPORT ~ .it | .ms)
146 | 
147 | 
148 | ## ----fmd-micediag2, fig.width=7, fig.height=7, out.width='1\\linewidth', fig.pos="!h", fig.cap = c("Univariate density plots for observed and imputed data, separated by imputation.", "Bivariate scatter plots with imputed data colored separately by observed and imputed data.")----
149 | 
150 | densityplot(mi.1, ~ PosAff + NegAff + SUPPORT + STRESS)
151 | 
152 | xyplot(mi.1, NegAff + PosAff ~ STRESS + SUPPORT)
153 | 
154 | 
155 | ## ------------------------------------------------------------------------
156 | 
157 | lm.1 <- with(mi.1, lm(PosAff ~ STRESS + Age + EDU + Female))
158 | 
159 | lm.1
160 | 
161 | 
162 | ## ----fmd-modeldiag, fig.width=6, fig.height=6, out.width='.8\\linewidth', fig.pos="!h", fig.cap = c("Linear regression model diagnostics from first imputed dataset.")----
163 | 
164 | par(mfcol = c(2,2 ))
165 | plot(lm.1$analyses[[1]])
166 | par(mfcol = c(1,1))
167 | 
168 | 
169 | ## ----echo=TRUE, eval=FALSE-----------------------------------------------
170 | ## xtable(summary(pool(lm.1), conf.int=TRUE),
171 | ##   digits = 2,
172 | ##   caption = "Regression results pooled across multiply imputed data",
173 | ##   label = "tmd-pooledres1")
174 | 
175 | ## ----echo=FALSE, results='asis', listings=FALSE--------------------------
176 | xtable(summary(pool(lm.1), conf.int=TRUE),
177 |   digits = 2,
178 |   caption = "Regression results pooled across multiply imputed data",
179 |   label = "tmd-pooledres1")
180 | 
181 | ## ------------------------------------------------------------------------
182 | 
183 | pool.r.squared(lm.1)
184 | 
185 | 
186 | ## ----echo=TRUE, eval=FALSE-----------------------------------------------
187 | ## xtable(summary(pool(lm.1), type = "all", conf.int=TRUE),
188 | ##   digits = 2,
189 | ##   caption = "Regression results pooled across multiply imputed data with additional information",
190 | ##   label = "tmd-pooledres1alt")
191 | 
192 | ## ----echo=FALSE, results='asis', listings=FALSE--------------------------
193 | xtable(summary(pool(lm.1), type = "all", conf.int=TRUE),
194 |   digits = 2,
195 |   caption = "Regression results pooled across multiply imputed data with additional information",
196 |   label = "tmd-pooledres1alt")
197 | 
198 | ## ----fmd-predhat, fig.width=5, fig.height=4, out.width='.5\\linewidth', fig.pos="!h", fig.cap = c("Pooled predictions from linear regression models of the association between stress and positive affect.")----
199 | 
200 | newdat <- data.frame(
201 |   STRESS = seq(from = 0, to = 6, length.out = 100),
202 |   Age = mean(davg$Age),
203 |   EDU = factor("< Uni Graduate", levels = levels(davgmiss$EDU)),
204 |   Female = factor("Female", levels = levels(davgmiss$Female)))
205 | 
206 | newdat$PosAff <- rowMeans(sapply(1:6, function(i) {
207 |   predict(lm.1$analyses[[i]], newdata = newdat)
208 | }))
209 | 
210 | ggplot(newdat, aes(STRESS, PosAff)) +
211 |   geom_line()
212 | 
213 | 
214 | ## ------------------------------------------------------------------------
215 | 
216 | cl <- makeCluster(2)
217 | clusterExport(cl, c("book_directory", "checkpoint_directory" ))
218 | 
219 | clusterEvalQ(cl, {
220 |   library(checkpoint)
221 |   checkpoint("2018-09-28", R.version = "3.5.1",
222 |   project = book_directory,
223 |   checkpointLocation = checkpoint_directory,
224 |   scanForPackages = FALSE,
225 |   scan.rnw.with.knitr = TRUE, use.knitr = TRUE)
226 |   library(mice)
227 |   library(randomForest)
228 |   library(data.table)
229 | })
230 | 
231 | imputation_seeds <- c(
232 |   403L, 2L, 2118700268L, 1567504751L,
233 |   -161759579L, -1822093220L)
234 | 
235 | clusterExport(cl, c("davgmiss", "imputation_seeds"))
236 | 
237 | 
238 | ## ------------------------------------------------------------------------
239 | 
240 | system.time(mi.par <- parLapplyLB(cl, 1:6, function(i) {
241 | mice(
242 |   davgmiss,
243 |   m = 1,   maxit = 20,
244 |   defaultMethod = c("norm", "logreg", "polyreg", "polr"),
245 |   seed = imputation_seeds[i])
246 | }))
247 | 
248 | 
249 | ## ------------------------------------------------------------------------
250 | 
251 | ## combine the separate imputations into a single object
252 | mi.par2 <- ibind(mi.par[[1]], mi.par[[2]])
253 | for (i in 3:6) {
254 |   mi.par2 <- ibind(mi.par2, mi.par[[i]])
255 | }
256 | 
257 | mi.par2
258 | 
259 | 
260 | ## ----fmd-rfdiag, fig.width=7, fig.height=7, out.width='.8\\linewidth', fig.pos="!h", fig.cap = c("Convergence diagnostics for random forest imputation model.", "Density plots of observed and imputed values from random forest model.", "Scatter plots of affect versus stress and social support for observed and imputed values.")----
261 | 
262 | system.time(mi.rfpar <- parLapplyLB(cl, 1:6, function(i) {
263 |   mice(
264 |     davgmiss,
265 |     m = 1, maxit = 30,
266 |     method = "rf",
267 |     seed = imputation_seeds[i],
268 |     ntree = 500, nodesize = 10)
269 | }))
270 | 
271 | ## combine into a single object
272 | mi.rf <- ibind(mi.rfpar[[1]], mi.rfpar[[2]])
273 | for (i in 3:6) {
274 |   mi.rf <- ibind(mi.rf, mi.rfpar[[i]])
275 | }
276 | 
277 | ## plot convergence diagnostics
278 | plot(mi.rf, PosAff + NegAff + SUPPORT ~ .it | .ms)
279 | 
280 | ## model diagnostics
281 | densityplot(mi.rf, ~ PosAff + NegAff + SUPPORT + STRESS)
282 | 
283 | xyplot(mi.rf, NegAff + PosAff ~ STRESS + SUPPORT)
284 | 
285 | 
286 | ## ----fmd-micompare, fig.width=7, fig.height=5, out.width='.8\\linewidth', fig.pos="!h", fig.cap = c("Pooled predictions from linear regression models of the association between stress and positive affect.")----
287 | 
288 | m.true <- lm(PosAff ~ STRESS + Age + EDU + Female, data = davg)
289 | m.cc <- lm(PosAff ~ STRESS + Age + EDU + Female, data = davgmiss)
290 | m.mireg <- summary(pool(with(mi.1,
291 |   lm(PosAff ~ STRESS + Age + EDU + Female))),
292 |   conf.int = TRUE)
293 | m.mirf <- summary(pool(with(mi.rf,
294 |   lm(PosAff ~ STRESS + Age + EDU + Female))),
295 |   conf.int = TRUE)
296 | 
297 | res.true <- as.data.table(cbind(coef(m.true), confint(m.true)))
298 | res.cc <- as.data.table(cbind(coef(m.cc), confint(m.cc)))
299 | res.mireg <- as.data.table(m.mireg[, c("estimate", "2.5 %", "97.5 %")])
300 | res.mirf <- as.data.table(m.mirf[, c("estimate", "2.5 %", "97.5 %")])
301 | setnames(res.true, c("B", "LL", "UL"))
302 | setnames(res.cc, c("B", "LL", "UL"))
303 | setnames(res.mireg, c("B", "LL", "UL"))
304 | setnames(res.mirf, c("B", "LL", "UL"))
305 | 
306 | res.compare <- rbind(
307 |   cbind(Type = "Truth", Param = names(coef(m.true)), res.true),
308 |   cbind(Type = "CC", Param = names(coef(m.true)), res.cc),
309 |   cbind(Type = "MI Reg", Param = names(coef(m.true)), res.mireg),
310 |   cbind(Type = "MI RF", Param = names(coef(m.true)), res.mirf))
311 | 
312 | ggplot(res.compare, aes(factor(""),
313 |    y = B, ymin = LL, ymax = UL, colour = Type)) +
314 |   geom_pointrange(position = position_dodge(.4)) +
315 |   scale_color_viridis(discrete = TRUE) +
316 |   facet_wrap(~Param, scales = "free") +
317 |   theme(
318 |     legend.position = c(1, 0),
319 |     legend.justification = c("right", "bottom"))
320 | 
321 | ## clean up cluster
322 | stopCluster(cl)
323 | rm(cl)
324 | 
325 | 
326 | ## ------------------------------------------------------------------------
327 | 
328 | cl <- makeCluster(2)
329 | clusterExport(cl, c("book_directory", "checkpoint_directory" ))
330 | 
331 | clusterEvalQ(cl, {
332 |   library(checkpoint)
333 |   checkpoint("2018-09-28", R.version = "3.5.1",
334 |   project = book_directory,
335 |   checkpointLocation = checkpoint_directory,
336 |   scanForPackages = FALSE,
337 |   scan.rnw.with.knitr = TRUE, use.knitr = TRUE)
338 |   library(mice)
339 |   library(randomForest)
340 |   library(data.table)
341 | })
342 | 
343 | 
344 | ## ------------------------------------------------------------------------
345 | 
346 | ## example of how to have R return some seed values
347 | dput(.Random.seed[1:5])
348 | 
349 | ## random seeds
350 | imputation_seeds <- c(403L, 148L, -1767993668L,
351 |   1417792552L, 298386660L, 1360311820L,
352 | 1356573822L, -1472988872L, 1215046494L, 759520201L,
353 | 1399305648L, -455288776L, 969619279L, 518793662L,
354 | -383967014L, -1983801345L, -698559309L, 1957301883L,
355 | -1457959076L, 1321574932L, -537238757L,
356 | 11573466L, 1466816383L, -2113923363L, 1663041018L)
357 | 
358 | clusterExport(cl, c("davgmiss", "imputation_seeds"))
359 | 
360 | 
361 | ## ----fmd-rfcdx, fig.width=9, fig.height=6, out.width='1\\linewidth', fig.pos="!h", fig.cap = "Density plots for continuous variables to be included in the imputation model"----
362 | 
363 | ggplot(melt(davgmiss[, sapply(davgmiss, is.numeric),
364 |               with = FALSE], measure.vars = 1:6), aes(value)) +
365 |          geom_density() + geom_rug() +
366 |          facet_wrap(~variable, scales = "free")
367 | 
368 | 
369 | ## ------------------------------------------------------------------------
370 | 
371 | start.time <- proc.time()
372 | mi.rfpar1 <- parLapplyLB(cl, 1:4, function(i) {
373 |   mice(
374 |     davgmiss,
375 |     m = 1, maxit = 5,
376 |     method = "rf",
377 |     seed = imputation_seeds[i],
378 |     ntree = 100, nodesize = 10)
379 | })
380 | stop.time <- proc.time()
381 | 
382 | ## estimate of how long it took
383 | stop.time - start.time
384 | 
385 | ## combine into a single object
386 | mi.rf1 <- ibind(mi.rfpar1[[1]], mi.rfpar1[[2]])
387 | for (i in 3:4) {
388 |   mi.rf1 <- ibind(mi.rf1, mi.rfpar1[[i]])
389 | }
390 | 
391 | 
392 | ## ----fmd-rfcase1, fig.width=7, fig.height=7, out.width='.8\\linewidth', fig.pos="!h", fig.cap = c("Convergence diagnostics for random forest imputation model.")----
393 | 
394 | ## plot convergence diagnostics
395 | plot(mi.rf1, NegAff + STRESS + Age ~ .it | .ms)
396 | 
397 | 
398 | ## ----fmd-rfcase2, fig.width=7, fig.height=7, out.width='.8\\linewidth', fig.pos="!h", fig.cap = c("Density plots of observed and imputed values from random forest model.")----
399 | 
400 | ## model diagnostics for continuous study variables
401 | densityplot(mi.rf1, ~ NegAff + STRESS + Age)
402 | 
403 | 
404 | ## ----fmd-rfcase3, fig.width=7, fig.height=6, out.width='.8\\linewidth', fig.pos="!h", fig.cap = c("Distribution plot (density and QQ Deviates) for model residuals.")----
405 | 
406 | ## fit the models
407 | fit.mirf1 <- with(mi.rf1,
408 |   lm(NegAff ~ STRESS + Age + EDU + Female + SES_1))
409 | 
410 | testdistr(unlist(lapply(fit.mirf1$analyses, rstandard)))
411 | 
412 | 
413 | ## ------------------------------------------------------------------------
414 | 
415 | ## pool results and summarize
416 | m.mirf1 <- summary(pool(fit.mirf1), conf.int = TRUE)
417 | 
418 | 
419 | ## ----echo=TRUE, eval=FALSE-----------------------------------------------
420 | ## xtable(m.mirf1,
421 | ##   digits = 2,
422 | ##   caption = "Regression results pooled across multiply imputed data test run",
423 | ##   label = "tmd-pooledres2")
424 | 
425 | ## ----echo=FALSE, results='asis', listings=FALSE--------------------------
426 | xtable(m.mirf1,
427 |   digits = 2,
428 |   caption = "Regression results pooled across multiply imputed data test run",
429 |   label = "tmd-pooledres2")
430 | 
431 | ## ----fmd-rfcase4, fig.width=7, fig.height=7, out.width='.8\\linewidth', fig.pos="!h", fig.cap = c("Convergence diagnostics for random forest imputation model.", "Density plots of observed and imputed values from random forest model.")----
432 | 
433 | start.time2 <- proc.time()
434 | mi.rfpar2 <- parLapplyLB(cl, 1:10, function(i) {
435 |   mice(
436 |     davgmiss,
437 |     m = 1, maxit = 30,
438 |     method = "rf",
439 |     seed = imputation_seeds[i],
440 |     ntree = 100, nodesize = 10)
441 | })
442 | stop.time2 <- proc.time()
443 | 
444 | ## time taken
445 | stop.time2 - start.time2
446 | 
447 | ## combine into a single object
448 | mi.rf2 <- ibind(mi.rfpar2[[1]], mi.rfpar2[[2]])
449 | for (i in 3:10) {
450 |   mi.rf2 <- ibind(mi.rf2, mi.rfpar2[[i]])
451 | }
452 | 
453 | 
454 | ## ----fmd-rfcase5, fig.width=7, fig.height=7, out.width='.8\\linewidth', fig.pos="!h", fig.cap = c("Convergence diagnostics for random forest imputation model.")----
455 | 
456 | ## plot convergence diagnostics
457 | plot(mi.rf2, NegAff + STRESS + Age ~ .it | .ms)
458 | 
459 | 
460 | ## ----fmd-rfcase6, fig.width=7, fig.height=7, out.width='.8\\linewidth', fig.pos="!h", fig.cap = c("Density plots of observed and imputed values from random forest model.")----
461 | 
462 | ## model diagnostics for continuous study variables
463 | densityplot(mi.rf2, ~ NegAff + STRESS + Age)
464 | 
465 | 
466 | ## ----fmd-rfcase7, fig.width=7, fig.height=6, out.width='.8\\linewidth', fig.pos="!h", fig.cap = c("Distribution plot (density and QQ Deviates) for model residuals.")----
467 | 
468 | ## fit the models
469 | fit.mirf2 <- with(mi.rf2,
470 |   lm(NegAff ~ STRESS + Age + EDU + Female + SES_1))
471 | 
472 | testdistr(unlist(lapply(fit.mirf2$analyses, rstandard)))
473 | 
474 | 
475 | ## ------------------------------------------------------------------------
476 | 
477 | ## pool results and summarize
478 | m.mirf2 <- summary(pool(fit.mirf2), conf.int = TRUE)
479 | 
480 | 
481 | ## ----echo=TRUE, eval=FALSE-----------------------------------------------
482 | ## xtable(m.mirf2,
483 | ##   digits = 2,
484 | ##   caption = "Regression results pooled across multiply imputed data final run",
485 | ##   label = "tmd-pooledres3")
486 | 
487 | ## ----echo=FALSE, results='asis', listings=FALSE--------------------------
488 | xtable(m.mirf2,
489 |   digits = 2,
490 |   caption = "Regression results pooled across multiply imputed data final run",
491 |   label = "tmd-pooledres3")
492 | 
493 | 


--------------------------------------------------------------------------------
/mdv.R:
--------------------------------------------------------------------------------
  1 | ## ----setup, include=TRUE, results='hide', message = FALSE, warning = FALSE, cache=FALSE----
  2 | 
  3 | library(checkpoint)
  4 | checkpoint("2018-09-28", R.version = "3.5.1",
  5 |   project = book_directory,
  6 |   checkpointLocation = checkpoint_directory,
  7 |   scanForPackages = FALSE,
  8 |   scan.rnw.with.knitr = TRUE, use.knitr = TRUE)
  9 | 
 10 | library(knitr)
 11 | library(ggplot2)
 12 | library(cowplot)
 13 | library(MASS)
 14 | library(mvtnorm)
 15 | library(mgcv)
 16 | library(quantreg)
 17 | library(JWileymisc)
 18 | library(data.table)
 19 | 
 20 | options(width = 70, digits = 2)
 21 | 
 22 | 
 23 | ## ----fmdv-density2d, fig.width=4, fig.height=4, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = "2D empirical density plot for multivariate normal data."----
 24 | 
 25 | mu <- c(0, 0)
 26 | sigma <- matrix(c(1, .5, .5, 1), 2)
 27 | 
 28 | set.seed(1234)
 29 | d <- as.data.table(rmvnorm(500, mean = mu, sigma = sigma))
 30 | setnames(d, names(d), c("x", "y"))
 31 | 
 32 | ggplot(d, aes(x, y)) +
 33 |   geom_point(colour = "grey60") +
 34 |   geom_density2d(size = 1, colour = "black") +
 35 |   theme_cowplot()
 36 | 
 37 | 
 38 | ## ----fmdv-density2dnorm, fig.width=4, fig.height=4, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = "2D empirical density versus multivariate normal density plot."----
 39 | 
 40 | testd <- as.data.table(expand.grid(
 41 |   x = seq(from = min(d$x), to = max(d$x), length.out = 50),
 42 |   y = seq(from = min(d$y), to = max(d$y), length.out = 50)))
 43 | testd[, Density := dmvnorm(cbind(x, y), mean = colMeans(d), sigma = cov(d))]
 44 | 
 45 | ggplot(d, aes(x, y)) +
 46 |   geom_contour(aes(x, y, z = Density), data = testd,
 47 |                colour = "blue", size = 1, linetype = 2) +
 48 |   geom_density2d(size = 1, colour = "black") +
 49 |   theme_cowplot()
 50 | 
 51 | 
 52 | ## ----fmdv-uninorm, fig.width=8, fig.height=4, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "Univariate density plots showing the simulated variables are univariate normal."----
 53 | 
 54 | set.seed(1234)
 55 | d2 <- data.table(x = rnorm(500))
 56 | d2[, y := ifelse(abs(x) > 1, x, -x)]
 57 | 
 58 | plot_grid(
 59 |   testdistr(d2$x, plot = FALSE)$Density,
 60 |   testdistr(d2$y, plot = FALSE, varlab = "Y")$Density,
 61 |   ncol = 2)
 62 | 
 63 | 
 64 | ## ----fmdv-2dnotnorm, fig.width=8, fig.height=4, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "2D density plot showing data that are not multivariate normal."----
 65 | 
 66 | testd2 <- as.data.table(expand.grid(
 67 |   x = seq(from = min(d2$x), to = max(d2$x), length.out = 50),
 68 |   y = seq(from = min(d2$y), to = max(d2$y), length.out = 50)))
 69 | testd2[, Density := dmvnorm(cbind(x, y), mean = colMeans(d2), sigma = cov(d2))]
 70 | 
 71 | ggplot(d2, aes(x, y)) +
 72 |   geom_contour(aes(x, y, z = Density), data = testd2,
 73 |                colour = "blue", size = 1, linetype = 2) +
 74 |   geom_density2d(size = 1, colour = "black") +
 75 |   theme_cowplot()
 76 | 
 77 | 
 78 | ## ----fmdv-testdistr-mvnorm, fig.width=8, fig.height=4, out.width='1\\linewidth', fig.pos="!ht", fig.cap = c("Density plot superimposing multivariate normal distribution and QQ plot, showing multivariate normal data.", "Density plot superimposing multivariate normal distribution and QQ plot, showing data that are not multivariate normal.")----
 79 | 
 80 | testdistr(d, "mvnorm", ncol = 2)
 81 | 
 82 | testdistr(d2, "mvnorm", ncol = 2)
 83 | 
 84 | 
 85 | ## ----fmdv-mtcars-mvnorm, fig.width=8, fig.height=4, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "Density plot superimposing multivariate normal distribution and QQ plot for mtcars data."----
 86 | 
 87 | testdistr(mtcars, "mvnorm", ncol = 2)
 88 | 
 89 | 
 90 | ## ----fmdv-uvoutlier, fig.width=8, fig.height=4, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "Density plot superimposing normal distribution for data with an anomalous value (Panel A) and with the anomalous value removed (Panel B)."----
 91 | 
 92 | mu <- c(0, 0, 0)
 93 | sigma <- matrix(.7, 3, 3)
 94 | diag(sigma) <- 1
 95 | 
 96 | set.seed(12345)
 97 | d <- as.data.table(rmvnorm(200, mean = mu, sigma = sigma))[order(V1)]
 98 | d[c(1, 200), V3 := c(2.2, 50)]
 99 | 
100 | plot_grid(
101 |   testdistr(d$V3, extremevalues = "theoretical", plot=FALSE)$Density,
102 |   testdistr(d[V3 < 40]$V3, extremevalues = "theoretical", plot=FALSE)$Density,
103 |   ncol = 2, labels = c("A", "B"))
104 | 
105 | 
106 | ## ----fmdv-mvoutliers, fig.width=8, fig.height=4, out.width='1\\linewidth', fig.pos="!ht", fig.cap = paste0("Graph of multivariate normality and (multivariate) anomalous values", c(".", " with one extreme anomalous value removed."))----
107 | 
108 | testdistr(d, "mvnorm", ncol = 2, extremevalues = "theoretical")
109 | 
110 | testdistr(d[V3 < 40], "mvnorm", ncol = 2, extremevalues = "theoretical")
111 | 
112 | 
113 | ## ----fmdv-mvrobust, fig.width=8, fig.height=4, out.width='1\\linewidth', fig.pos="!ht", fig.cap = paste0("Graph of multivariate normality and (multivariate) anomalous values", c(" using the robust estimator.", " with both anomalous value removed."))----
114 | 
115 | testdistr(d, "mvnorm", ncol = 2, robust = TRUE, extremevalues = "theoretical")
116 | 
117 | testdistr(d[-c(1,200)], "mvnorm", ncol = 2, extremevalues = "theoretical")
118 | 
119 | 
120 | ## ----fmdv-boxplot-cut, fig.width=8, fig.height=8, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "Boxplots from cutting a continuous variable into quartiles showing a non-linear relationship."----
121 | 
122 | set.seed(12345)
123 | d2 <- data.table(x = rnorm(100))
124 | d2[, y := rnorm(100, mean = 2 + x + 2 * x^2, sd = 3)]
125 | 
126 | p.cut3 <- ggplot(
127 |   data = d2[, .(y,
128 |     xcut = cut(x, quantile(x,
129 |       probs = seq(0, 1, by = 1/3)), include.lowest = TRUE))],
130 |   aes(xcut, y)) +
131 |   geom_boxplot(width=.25) +
132 |   theme(axis.text.x = element_text(
133 |           angle = 45, hjust = 1, vjust = 1)) +
134 |   xlab("")
135 | 
136 | p.cut4 <- p.cut3 %+% d2[, .(y,
137 |     xcut = cut(x, quantile(x,
138 |       probs = seq(0, 1, by = 1/4)), include.lowest = TRUE))]
139 | 
140 | p.cut5 <- p.cut3 %+% d2[, .(y,
141 |     xcut = cut(x, quantile(x,
142 |       probs = seq(0, 1, by = 1/5)), include.lowest = TRUE))]
143 | 
144 | p.cut10 <- p.cut3 %+% d2[, .(y,
145 |     xcut = cut(x, quantile(x,
146 |       probs = seq(0, 1, by= 1/10)), include.lowest = TRUE))]
147 | 
148 | plot_grid(
149 |   p.cut3, p.cut4,
150 |   p.cut5, p.cut10,
151 |   ncol = 2,
152 |   labels = c("A", "B", "C", "D"),
153 |   align = "hv")
154 | 
155 | 
156 | ## ----fmdv-loessline, fig.width=4, fig.height=4, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = "Loess line of best fit showing a non-linear relationship."----
157 | 
158 | ggplot(d2, aes(x, y)) +
159 |   geom_point(colour="grey50") +
160 |   stat_smooth(method = "loess", colour = "black") +
161 |   stat_smooth(method = "lm", colour = "blue", linetype = 2)
162 | 
163 | 
164 | ## ----fmdv-loesslineq, fig.width=4, fig.height=4, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = "Loess line and quadratic line."----
165 | 
166 | ggplot(d2, aes(x, y)) +
167 |   geom_point(colour="grey50") +
168 |   stat_smooth(method = "loess", colour = "black") +
169 |   stat_smooth(method = "lm",
170 |               formula = y ~ x + I(x^2),
171 |               colour = "blue", linetype = 2)
172 | 
173 | 
174 | ## ----fmdv-loess-span, fig.width=4, fig.height=4, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = "Loess line with varying degree of smoothing."----
175 | 
176 | ggplot(d2, aes(x, y)) +
177 |   geom_point(colour="grey50") +
178 |   stat_smooth(method = "loess", span = .2,
179 |               colour = "black") +
180 |   stat_smooth(method = "loess", span = 2,
181 |               colour = "black", linetype = 2)
182 | 
183 | 
184 | ## ----fmdv-loess-mvbv, fig.width=4, fig.height=4, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = "Loess line of bivariate relationship from multivariate data."----
185 | 
186 | set.seed(1234)
187 | d3 <- data.table(
188 |   x = rnorm(500),
189 |   w = rnorm(500),
190 |   z = rbinom(500, 1, .4))
191 | d3[, y := rnorm(500, mean = 3 +
192 |        ifelse(x < 0 & w < 0, -2, 0) * x +
193 |        ifelse(x < 0, 0, 2) * w * x^2 + 4 * z * w,
194 |     sd = 1)]
195 | d3[, z := factor(z)]
196 | 
197 | ggplot(d3, aes(x, y)) +
198 |   geom_point(colour="grey50") +
199 |   stat_smooth(method = "loess", colour = "black")
200 | 
201 | 
202 | ## ----fmdv-loess-mv, fig.width=8, fig.height=8, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "Loess lines for multivariate data."----
203 | 
204 | ggplot(d3, aes(x, winsorizor(y, .01), colour = z)) +
205 |   geom_point() +
206 |   stat_smooth(method = "loess") +
207 |   scale_colour_manual(values = c("1" = "black", "0" = "grey40")) +
208 |   facet_wrap(~ cut(w, quantile(w), include.lowest = TRUE))
209 | 
210 | 
211 | ## ----fmdv-gam-pred, fig.width=8, fig.height=4, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "Contour plots showing predicted y values for different combinations of x and w panelled by z."----
212 | 
213 | m <- gam(winsorizor(y, .01) ~ z + te(x, w, k = 7, by = z), data = d3)
214 | 
215 | newdat <- expand.grid(
216 |   x = seq(min(d3$x), max(d3$x), length.out = 100),
217 |   w = seq(min(d3$w), max(d3$w), length.out = 100),
218 |   z = factor(0:1, levels = levels(d3$z)))
219 | 
220 | newdat$yhat <- predict(m, newdata = newdat)
221 | 
222 | ggplot(newdat, aes(x = x, y = w, z = yhat)) +
223 |   geom_raster(aes(fill = yhat)) +
224 |   geom_contour(colour = "white", binwidth = 1, alpha = .5) +
225 |   facet_wrap(~ z)
226 | 
227 | 
228 | ## ------------------------------------------------------------------------
229 | 
230 | diris <- as.data.table(iris)
231 | diris[, .(V = var(Sepal.Length)), by = Species]
232 | 
233 | 
234 | ## ----fmdv-slboxplot, fig.width=8, fig.height=4, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "Box and Whisker Diagrams of sepal length by species.  Outliers appear as dots."----
235 | 
236 | plot_grid(
237 |   ggplot(diris, aes(Species, Sepal.Length)) +
238 |     geom_boxplot() +
239 |     xlab(""),
240 |   ggplot(diris[, .(Sepal.Length = Sepal.Length -
241 |                               median(Sepal.Length)), by = Species],
242 |          aes(Species, Sepal.Length)) +
243 |     geom_boxplot() +
244 |     xlab(""),
245 |   ncol = 2, labels = c("A", "B"), align = "hv")
246 | 
247 | 
248 | ## ----fmdv-violinplot, fig.width=4, fig.height=4, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = "Violin plots with box and whisker diagrams in center."----
249 | 
250 | ggplot(diris, aes(Species, Sepal.Length)) +
251 |   geom_violin() +
252 |   geom_boxplot(width = .1) +
253 |   xlab("")
254 | 
255 | 
256 | ## ----fmdv-mvviolinplot, fig.width=8, fig.height=8, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "Violin plots with box and whisker diagrams in center by quartiles of x, colored by z."----
257 | 
258 | ## create cuts
259 | d3[, xquartile := cut(x, quantile(x), include.lowest = TRUE)]
260 | d3[, wquartile := cut(w, quantile(w), include.lowest = TRUE)]
261 | d3[, yclean := winsorizor(y, .01)]
262 | 
263 | ## median center y by group to facilitate comparison
264 | d3[, yclean := yclean - median(yclean),
265 |    by = .(xquartile, wquartile, z)]
266 | 
267 | p <- position_dodge(.5)
268 | 
269 | ggplot(d3, aes(xquartile, yclean, colour = z)) +
270 |   geom_violin(position = p) +
271 |   geom_boxplot(position = p, width = .1) +
272 |   scale_colour_manual(values = c("1" = "black", "0" = "grey40")) +
273 |   facet_wrap(~ wquartile) +
274 |   theme(axis.text.x = element_text(angle = 45, hjust=1, vjust=1)) +
275 |   coord_cartesian(ylim = c(-5, 5), expand = FALSE)
276 | 
277 | 
278 | ## ------------------------------------------------------------------------
279 | 
280 | set.seed(1234)
281 | d4 <- data.table(x = runif(500, 0, 5))
282 | d4[, y1 := rnorm(500, mean = 2 + x, sd = 1)]
283 | d4[, y2 := rnorm(500, mean = 2 + x, sd = .25 + x)]
284 | 
285 | 
286 | ## ----fmdv-quantreg, fig.width=8, fig.height=4, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "Homoskedasticity versus Heteroskedasticity.", warning=FALSE----
287 | 
288 | plot_grid(
289 |   ggplot(d4, aes(x, y1)) +
290 |     geom_point(colour = "grey70") +
291 |     geom_quantile(quantiles = .5, colour = 'black') +
292 |     geom_quantile(quantiles = c(.25, .75),
293 |                   colour = 'blue', linetype = 2) +
294 |     geom_quantile(quantiles = c(.05, .95),
295 |                   colour = 'black', linetype = 3),
296 |   ggplot(d4, aes(x, y2)) +
297 |     geom_point(colour = "grey70") +
298 |     geom_quantile(quantiles = .5, colour = 'black') +
299 |     geom_quantile(quantiles = c(.25, .75),
300 |                   colour = 'blue', linetype = 2) +
301 |     geom_quantile(quantiles = c(.05, .95),
302 |                   colour = 'black', linetype = 3),
303 |   ncol = 2, labels = c("A", "B"))
304 | 
305 | 
306 | 


--------------------------------------------------------------------------------
/mli.R:
--------------------------------------------------------------------------------
  1 | ## ----setup, include=TRUE, results='hide', message = FALSE, warning = FALSE, cache=FALSE----
  2 | 
  3 | library(checkpoint)
  4 | checkpoint("2018-09-28", R.version = "3.5.1",
  5 |   project = book_directory,
  6 |   checkpointLocation = checkpoint_directory,
  7 |   scanForPackages = FALSE,
  8 |   scan.rnw.with.knitr = TRUE, use.knitr = TRUE)
  9 | 
 10 | library(knitr)
 11 | library(tidyverse)
 12 | library(rsample)
 13 | library(data.table)
 14 | library(boot)
 15 | library(parallel)
 16 | library(foreach)
 17 | library(doParallel)
 18 | 
 19 | options(width = 70, digits = 3)
 20 | 
 21 | 
 22 | ## ------------------------------------------------------------------------
 23 | 
 24 | set.seed(5)
 25 | case_data <- initial_split(data = iris, prop = 0.8)
 26 | case_data
 27 | 
 28 | 
 29 | ## ------------------------------------------------------------------------
 30 | data_train <- training(case_data)
 31 | data_test <- testing(case_data)
 32 | glimpse(data_train)
 33 | 
 34 | ## ------------------------------------------------------------------------
 35 | unique(data_train$Species)
 36 | 
 37 | ## ------------------------------------------------------------------------
 38 | length.lm = lm(Petal.Length ~ Sepal.Length + 
 39 |                  Sepal.Width + Petal.Width, 
 40 |                data = data_train)
 41 | length.lm
 42 | summary(length.lm)
 43 | 
 44 | ## ------------------------------------------------------------------------
 45 | sqrt(
 46 |   sum(
 47 |     (fitted(length.lm)-data_train$Petal.Length)^2
 48 |   )/(nrow(data_train)-4)
 49 | )
 50 | 
 51 | ## ------------------------------------------------------------------------
 52 | mse_train<- mean(length.lm$residuals^2)
 53 | mse_train
 54 | 
 55 | ## ------------------------------------------------------------------------
 56 | sqrt(
 57 |   sum(
 58 |     (predict(length.lm, data_test)-data_test$Petal.Length)^2
 59 |   )/(nrow(data_test)-2)
 60 | )
 61 | 
 62 | mse_test <- mean((predict(length.lm, data_test) - 
 63 |                   data_test$Petal.Length)^2)
 64 | mse_test
 65 | 
 66 | ## ----mli-qqplot, fig.width=7, fig.height=6, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "2D and Q-Q plots of some data"----
 67 | par(mfrow=c(2,2))
 68 | plot(data_train$Sepal.Length, data_train$Petal.Length)
 69 | plot(data_test$Sepal.Length, data_test$Petal.Length)
 70 | qqnorm(data_train$Petal.Length, 
 71 |        xlab = "Theoretical Quantiles Train")
 72 | qqnorm(data_test$Petal.Length, 
 73 |        xlab = "Theoretical Quantiles Test")
 74 | 
 75 | ## ------------------------------------------------------------------------
 76 | summary(data_train$Sepal.Length)
 77 | summary(data_test$Sepal.Length)
 78 | 
 79 | ## ------------------------------------------------------------------------
 80 | crossData <- iris %>% 
 81 |   sample_n(nrow(iris), replace = FALSE)
 82 | crossData <- add_column(crossData, 
 83 |   Bin = cut(1:150, breaks = 5, labels = c(1:5)))
 84 | store <- tibble(Fold=1:5, MSE=NA_integer_)
 85 | 
 86 | ## ------------------------------------------------------------------------
 87 | for(i in 1:5){
 88 |   data_train<-crossData %>% filter(Bin != i)
 89 |   data_test<-crossData %>% filter(Bin == i)
 90 |   lengthFold.lm = lm(Petal.Length ~ Sepal.Length + 
 91 |                       Sepal.Width + Petal.Width, 
 92 |                      data = data_train)
 93 |   store[i,]$MSE <- mean((predict(lengthFold.lm, data_test) - 
 94 |                            data_test$Petal.Length)^2)
 95 | }
 96 | 
 97 | ## ------------------------------------------------------------------------
 98 | mse_k <- mean(store$MSE)
 99 | mse_k
100 | 
101 | ## ------------------------------------------------------------------------
102 | lengthFold.lm <- lm(Petal.Length ~ Sepal.Length + 
103 |                    Sepal.Width + Petal.Width, 
104 |                    data = iris)
105 | lengthFold.lm
106 | mse_ALL <- mean(lengthFold.lm$residuals^2)
107 | mse_ALL
108 | 
109 | ## ------------------------------------------------------------------------
110 | store
111 | 
112 | ## ------------------------------------------------------------------------
113 | mse <- function(data, i) {
114 |   lengthBoot.lm <- lm(Petal.Length ~ Sepal.Length + 
115 |                       Sepal.Width + Petal.Width,
116 |                       data=data[i,])
117 |   return(mean(lengthBoot.lm$residuals^2))
118 | }
119 | 
120 | ## ------------------------------------------------------------------------
121 | bootResults <- boot(data=iris, statistic=mse, R=10000)
122 | bootResults 
123 | 
124 | ## ----mli-btplot1, fig.width=6, fig.height=4, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "Boot Strap Results"----
125 | plot(bootResults)
126 | 
127 | ## ------------------------------------------------------------------------
128 | boot.ci(bootResults, conf = 0.95, type="bca")
129 | 
130 | ## ------------------------------------------------------------------------
131 | detectCores()
132 | detectCores(logical = TRUE)
133 | detectCores(logical = FALSE)
134 | 
135 | ## ------------------------------------------------------------------------
136 | ## notice 10000/2 = 5000
137 | runP <- function(...) boot(data=iris, statistic=mse, R=5000)
138 | 
139 | ## makes a cluster with 2 cores as 10000/5000 = 2
140 | cl<-makeCluster(2) 
141 | 
142 | ## passes along parts of the global environment 
143 | ## to each node / part of the cluster
144 | ## again, base is a file path variable to our book's path
145 | ## set book_directory <- "C:/YourPathHere/"
146 | clusterExport(cl, c("runP", "mse", "book_directory", "checkpoint_directory" ))
147 | 
148 | ## creates the library and some environment on 
149 | ## each of the parts of the cluster
150 | clusterEvalQ(cl, {
151 |   
152 | library(checkpoint)
153 |   checkpoint("2018-09-28", R.version = "3.5.1",
154 |   project = book_directory,
155 |   checkpointLocation = checkpoint_directory,
156 |   scanForPackages = FALSE,
157 |   scan.rnw.with.knitr = TRUE, use.knitr = TRUE)
158 |   
159 |     library(boot)
160 |   })
161 | 
162 | ## similar to set.seed() except for clusters
163 | clusterSetRNGStream(cl, 5) 
164 | 
165 | ## uses the parLapply() function which works on windows too
166 | pBootResults <- do.call(c, parLapply(cl, seq_len(2), runP))
167 |   
168 | #stop the cluster
169 | stopCluster(cl)
170 | 
171 | # view results
172 | pBootResults
173 | 
174 | ## get 95% confidence interval of the MSEs
175 | ## (note 0.95 is the default) 
176 | boot.ci(pBootResults, conf = 0.95, type="bca")
177 | 
178 | ## ----mli-btplot2, fig.width=6, fig.height=4, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "Boot Strap Results"----
179 | plot(pBootResults)
180 | 
181 | ## ------------------------------------------------------------------------
182 | cl <- makeCluster(2)
183 | registerDoParallel(cl)
184 | 
185 | clusterExport(cl, c("book_directory", "checkpoint_directory"))
186 | 
187 | clusterEvalQ(cl, {
188 |   library(checkpoint)
189 |   checkpoint("2018-09-28", R.version = "3.5.1",
190 |   project = book_directory,
191 |   checkpointLocation = checkpoint_directory,
192 |   scanForPackages = FALSE,
193 |   scan.rnw.with.knitr = TRUE, use.knitr = TRUE)
194 |     
195 |   library("tidyverse")
196 |   })
197 | 
198 | ## ------------------------------------------------------------------------
199 | k <- foreach(i=1:5, .combine = c) %dopar% {
200 |   data_train <- crossData %>% filter(Bin != i)
201 |   data_test <- crossData %>% filter(Bin == i)
202 |   lengthFold.lm <- lm(Petal.Length ~ Sepal.Length + 
203 |                       Sepal.Width + Petal.Width,
204 |                       data = data_train)
205 |   mean((predict(lengthFold.lm, data_test) - 
206 |           data_test$Petal.Length)^2)
207 | }
208 | 
209 | ## ------------------------------------------------------------------------
210 | stopCluster(cl)
211 | mse_Pk<-mean(k)
212 | mse_Pk
213 | 
214 | 


--------------------------------------------------------------------------------
/mls.R:
--------------------------------------------------------------------------------
  1 | ## ----setup, include=TRUE, results='hide', message = FALSE, warning = FALSE, cache=FALSE----
  2 | 
  3 |  library(checkpoint)
  4 |  checkpoint("2018-09-28", R.version = "3.5.1",
  5 |    project = book_directory,
  6 |    checkpointLocation = checkpoint_directory,
  7 |    scanForPackages = FALSE,
  8 |    scan.rnw.with.knitr = TRUE, use.knitr = TRUE)
  9 | 
 10 | library(ggplot2)
 11 | library(cowplot)
 12 | library(data.table)
 13 | library(readxl)
 14 | library(viridis)
 15 | 
 16 | 
 17 | library(RSNNS)
 18 | library(kernlab)
 19 | library(rpart)
 20 | library(rattle)
 21 | library(DALEX)
 22 | library(caret)
 23 | library(spdep)
 24 | library(ranger)
 25 | library(e1071)
 26 | library(gbm)
 27 | library(plyr)
 28 | set.seed(1234)
 29 | 
 30 | options(width = 70, digits = 2)
 31 | 
 32 | ## ------------------------------------------------------------------------
 33 | ## Note: download Excel file  from publisher website first
 34 | dRaw <- read_excel("Gender_StatsData_worldbank.org_ccby40.xlsx")
 35 | dRaw <- as.data.table(dRaw) # convert data to data.table format.
 36 | 
 37 | dRaw[,`Indicator Name`:= NULL]
 38 | 
 39 | ## collapse columns into a super long dataset
 40 | ## with Year as a new variable
 41 | data <- melt(dRaw, measure.vars = 3:20, variable.name = "Year", variable.factor = FALSE)
 42 | 
 43 | ## cast the data wide again
 44 | ## this time with separate variables by indicator code
 45 | ## keeping a country and time (Year) variable
 46 | data <- dcast(data, CountryName + Year ~ IndicatorCode)
 47 | rm(dRaw) #remove unneeded variable
 48 | 
 49 | #rename columns with shortened, unique names
 50 | x<-colnames(data)
 51 | x<-gsub("[[:punct:]]", "", x)
 52 | (y <- abbreviate(x, minlength = 4, method = "both.sides"))
 53 | names(data) <- y
 54 | 
 55 | #shorten regional names to abbreviations.
 56 | data$CntN<-abbreviate(data$CntN, minlength = 5, method = "left.kept")
 57 | 
 58 | ## ------------------------------------------------------------------------
 59 | d <- copy(data)
 60 | sort(unique(d$CntN))
 61 | 
 62 | ## ------------------------------------------------------------------------
 63 | str(d)
 64 | d[,Year:=as.numeric(Year)]
 65 | ddum <- dummyVars("~.", data = d)
 66 | d <- data.table(predict(ddum, newdata = d))
 67 | rm(ddum) #remove ddum as unneeded
 68 | str(d)
 69 | 
 70 | ## ------------------------------------------------------------------------
 71 | dScaled<-scale(d[,-c(1:9)])
 72 | dScaled<-as.data.table(dScaled)
 73 | d <- cbind(d[,c(1:9)], dScaled)
 74 | rm(dScaled) #remove d2 as unneeded
 75 | str(d)
 76 | 
 77 | ## ----mls-eda1, fig.width=12, fig.height=5, out.width='.7\\linewidth', fig.pos="!ht", fig.cap = "Looking for significant data misshapes."----
 78 | boxplot(d[,-c(1:9)], las = 2)
 79 | 
 80 | ## ----mls-eda2, fig.width=12, fig.height=5, out.width='.7\\linewidth', fig.pos="!ht", fig.cap = "Looking for significant data misshapes."----
 81 | par(mfrow = c(1,2))
 82 | hist(d$ERMA, 100)
 83 | qqnorm(d$ERMA)
 84 | par(mfrow = c(1,1))
 85 | 
 86 | ## ------------------------------------------------------------------------
 87 | shapiro.test(d$ERMA)
 88 | 
 89 | ## ------------------------------------------------------------------------
 90 | range(d$ERMA)
 91 | range(data$ERMA)
 92 | shapiro.test( log(data$ERMA) )
 93 | 
 94 | ## ----mls-eda3, fig.width=4, fig.height=4, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "Looking for significant data misshapes."----
 95 | par(mfrow = c(1,2))
 96 | hist(data$ERMA, 100)
 97 | hist( log (data$ERMA) , 100)
 98 | par(mfrow = c(1,1))
 99 | 
100 | ## ------------------------------------------------------------------------
101 | d2 <- copy(data[,.(SPAD, ERMA)])
102 | d2[, Log.ERMA := log(ERMA)]
103 | cor(d2)
104 | rm(d2) #no longer needed
105 | 
106 | ## ------------------------------------------------------------------------
107 | lapply(data[,-c(1:2)], shapiro.test)
108 | 
109 | dlog <- copy(data)
110 | dlog <- sapply(dlog[,-c(1:2)], log)
111 | dlog<-as.data.table(dlog)
112 | colnames(dlog) <- paste(colnames(dlog), "LOG", sep = ".")
113 | 
114 | dlog<-cbind(data, dlog)
115 | View(cor(dlog[,-c(1:2)]))
116 | rm(dlog) #remove as we will not use.
117 | 
118 | ## ------------------------------------------------------------------------
119 | set.seed(1234)
120 | index <- createDataPartition(data$CntN, p = 0.8, list = FALSE)
121 | trainData <- data[index, ]
122 | validationData <- data[-index, ]
123 | 
124 | ## ------------------------------------------------------------------------
125 | #source("https://bioconductor.org/biocLite.R")
126 | #biocLite("pcaMethods")
127 | library(pcaMethods)
128 | 
129 | ## ------------------------------------------------------------------------
130 | #confirm structure
131 | str(trainData[,c(3:8,10:13)])
132 | 
133 | #base R / traditional method
134 | pc <- prcomp(trainData[,c(3:8,10:13)], center = TRUE, scale. = TRUE)
135 | summary(pc)
136 | pcValidationData1 <- predict(pc, newdata = validationData[,c(3:8,10:13)])
137 | 
138 | #scalable method using PcaMethods
139 | pc<-pca(trainData[,c(1:8,10:13)], method = "svd",nPcs = 4, scale = "uv", center = TRUE)
140 | pc
141 | summary(pc)
142 | pcValidationData2 <- predict(pc, newdata = validationData[,c(3:8,10:13)])
143 | 
144 | #demonstration of how to access transformed validation data
145 | pcValidationData1[,1]
146 | pcValidationData2$scores[,1]
147 | 
148 | ## ----mls-svm1, fig.width=8, fig.height=4, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "Per Capita GNP vs Adolescent Fertility Rate."----
149 | 
150 | svmDataTrain <- trainData[,.(SPAD, NYGN)]
151 | svmDataValidate <- validationData[,.(SPAD, NYGN)]
152 | 
153 | p1 <- ggplot(data = svmDataTrain,
154 |              aes(x = NYGN, y = SPAD))
155 |   ## data poins colored by country
156 |   p1 + geom_point(aes(colour = trainData$CntN)) + 
157 |     scale_colour_viridis(discrete = TRUE)
158 | 
159 | 
160 | ## ------------------------------------------------------------------------
161 | set.seed(12345)
162 | 
163 |   svm <- train(x = svmDataTrain,
164 |              y = trainData$CntN,
165 |              method = "svmLinear",
166 |              preProcess = NULL,
167 |              metric = "Accuracy",
168 |              trControl = trainControl(method = "cv",
169 |                                       number = 5,
170 |                                       seeds = c(123, 234, 345, 456, 567, 678)
171 |                                       )
172 |              )
173 | svm
174 | 
175 | ## ------------------------------------------------------------------------
176 | #predict the country name on our training data using our new model
177 | predictOnTrain <- predict(svm, newdata = svmDataTrain)
178 | 
179 | mean( predictOnTrain == trainData$CntN)
180 | 
181 | ## ------------------------------------------------------------------------
182 | predictOnTest <- predict(svm, newdata = svmDataValidate)
183 | mean(predictOnTest == validationData$CntN)
184 | 
185 | ## ----mls-svm2, fig.width=8, fig.height=6, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "Train vs Test Predictions."----
186 | 
187 | p1 <- ggplot(data = validationData,
188 |              aes(x = NYGN, y = SPAD))
189 | 
190 | plot_grid(
191 |   ## data poins colored by country
192 |   p1 + geom_point(aes(colour = validationData$CntN, size = validationData$CntN)) + 
193 |     scale_colour_viridis(discrete = TRUE),
194 | 
195 |   ## data poins colored by predicted country
196 |   p1 + geom_point(aes(colour = predictOnTest, size = predictOnTest)) + 
197 |     scale_colour_viridis(discrete = TRUE),
198 | ncol = 1
199 | )  
200 | 
201 | ## ------------------------------------------------------------------------
202 | rm(p1)
203 | rm(svm)
204 | rm(svmDataTrain)
205 | rm(svmDataValidate)
206 | rm(pcValidationData1)
207 | rm(pcValidationData2)
208 | rm(predictOnTest)
209 | rm(predictOnTrain)
210 | rm(pc)
211 | rm(d)
212 | 
213 | ## ------------------------------------------------------------------------
214 | # set up training & validation data
215 | svmDataTrain <- trainData[,-1]
216 | svmDataTrain[,Year:=as.numeric(Year)]
217 | svmDataValidation <- validationData[,-1]
218 | svmDataValidation[,Year:=as.numeric(Year)]
219 | #run linear SVM on the full data set
220 | set.seed(12345)
221 | svmLinear <- train(x = svmDataTrain,
222 |              y = trainData$CntN,
223 |              method = "svmLinear",
224 |              preProcess = c("scale", "center", "pca"),
225 |              metric = "Accuracy",
226 |              trControl = trainControl(method = "cv",
227 |                                       number = 5,
228 |                                       seeds = c(123, 234, 345, 456, 567, 678)
229 |                                       )
230 |              )
231 | svmLinear
232 | 
233 | ## ------------------------------------------------------------------------
234 | #run polynomial SVM on the full data set
235 | set.seed(12345)
236 | svmPoly <- train(x = svmDataTrain,
237 |              y = trainData$CntN,
238 |              method = "svmPoly",
239 |              preProcess = c("scale", "center", "pca"),
240 |              metric = "Accuracy",
241 |              trControl = trainControl(method = "cv",
242 |                                       number = 5
243 |                                       )
244 |              )
245 | 
246 | svmPoly
247 | 
248 | ## ------------------------------------------------------------------------
249 | predictOnTrainL <- predict(svmLinear, newdata = svmDataTrain)
250 | mean( predictOnTrainL == trainData$CntN)
251 | 
252 | predictOnTrainP <- predict(svmPoly, newdata = svmDataTrain)
253 | mean( predictOnTrainP == trainData$CntN)
254 | 
255 | ## ------------------------------------------------------------------------
256 | predictOnTestL <- predict(svmLinear, newdata = svmDataValidation)
257 | mean(predictOnTestL == validationData$CntN)
258 | 
259 | ## ------------------------------------------------------------------------
260 | cartDataTrain <- copy(trainData[,-1])
261 | cartDataTrain[,Year:=as.numeric(Year)]
262 | cartDataValidation <- copy(validationData[,-1])
263 | cartDataValidation[,Year:=as.numeric(Year)]
264 | 
265 | set.seed(12345)
266 | cartModel <- train(x = cartDataTrain,
267 |              y = trainData$CntN,
268 |              method = "rpart",
269 |              preProcess = c("scale", "center", "pca"),
270 |              metric = "Accuracy",
271 |              tuneLength = 10,
272 |              trControl = trainControl(method = "cv",
273 |                                       number = 5
274 |                                       )
275 |              )
276 | 
277 | cartModel
278 | 
279 | ## ----mls-crt1, fig.width=9, fig.height=7, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "Classification Tree Graph."----
280 | plot(cartModel$finalModel)
281 | text(cartModel$finalModel, cex = 0.5)
282 | 
283 | ## ----mls-crt2, fig.width=6, fig.height=7, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "Fancy Classification Tree Graph"----
284 | fancyRpartPlot(cartModel$finalModel, cex = 0.4, main = "")
285 | 
286 | ## ------------------------------------------------------------------------
287 | predictOnTrainT <- predict(cartModel, newdata = cartDataTrain)
288 | mean( predictOnTrainT == trainData$CntN)
289 | 
290 | predictOnTestT <- predict(cartModel, newdata = cartDataValidation)
291 | mean(predictOnTestT == validationData$CntN)
292 | 
293 | ## ------------------------------------------------------------------------
294 | confusionMatrix(predictOnTestT, as.factor(validationData$CntN))
295 | 
296 | ## ------------------------------------------------------------------------
297 | rfDataTrain <- copy(trainData[,-1])
298 | rfDataTrain[,Year:=as.numeric(Year)]
299 | rfDataValidation <- copy(validationData[,-1])
300 | rfDataValidation[,Year:=as.numeric(Year)]
301 | 
302 | set.seed(12345)
303 | 
304 | rfModel <- train(x = rfDataTrain,
305 |              y = trainData$CntN,
306 |              method = "ranger",
307 |              preProcess = c("scale", "center", "pca"),
308 |              metric = "Accuracy",
309 |              num.trees = 20,
310 |              trControl = trainControl(method = "cv",
311 |                                       number = 5
312 |                                       )
313 |              )
314 | 
315 | rfModel
316 | rfModel$finalModel$num.trees
317 | 
318 | ## ------------------------------------------------------------------------
319 | predictOnTrainR <- predict(rfModel, newdata = rfDataTrain)
320 | mean( predictOnTrainR == trainData$CntN)
321 | 
322 | predictOnTestR <- predict(rfModel, newdata = rfDataValidation)
323 | mean(predictOnTestR == validationData$CntN)
324 | 
325 | ## ------------------------------------------------------------------------
326 | set.seed(12345)
327 | rfModel <- train(x = rfDataTrain,
328 |              y = trainData$CntN,
329 |              method = "ranger",
330 |              preProcess = c("scale", "center", "pca"),
331 |              metric = "Accuracy",
332 |              num.trees = 50,
333 |              trControl = trainControl(method = "cv",
334 |                                       number = 5
335 |                                       )
336 |              )
337 | rfModel
338 | rfModel$finalModel$num.trees
339 | 
340 | ## ------------------------------------------------------------------------
341 | predictOnTrainR <- predict(rfModel, newdata = rfDataTrain)
342 | mean( predictOnTrainR == trainData$CntN)
343 | 
344 | predictOnTestR <- predict(rfModel, newdata = rfDataValidation)
345 | mean(predictOnTestR == validationData$CntN)
346 | 
347 | ## ------------------------------------------------------------------------
348 | 
349 | set.seed(12345)
350 | 
351 | rfModel <- train(x = rfDataTrain,
352 |              y = trainData$CntN,
353 |              method = "ranger",
354 |              preProcess = c("scale", "center", "pca"),
355 |              metric = "Accuracy",
356 |              num.trees = 20,
357 |              trControl = trainControl(method = "cv",
358 |                                       number = 5
359 |                                       ),
360 |              tuneGrid = expand.grid(mtry = c(1, 2, 3, 4),
361 |                                     splitrule = "extratrees",
362 |                                     min.node.size = c(1, 5, 10, 15))
363 |              )
364 | 
365 | rfModel
366 | rfModel$finalModel$num.trees
367 | rfModel$finalModel$mtry
368 | rfModel$finalModel$splitrule
369 | rfModel$finalModel$min.node.size
370 | 
371 | ## ------------------------------------------------------------------------
372 | predictOnTrainR <- predict(rfModel, newdata = rfDataTrain)
373 | mean( predictOnTrainR == trainData$CntN)
374 | 
375 | predictOnTestR <- predict(rfModel, newdata = rfDataValidation)
376 | mean(predictOnTestR == validationData$CntN)
377 | 
378 | ## ------------------------------------------------------------------------
379 | sgbDataTrain <- copy(trainData)
380 | sgbDataTrain[,Year:=as.numeric(Year)]
381 | sgbDataValidation <- copy(validationData)
382 | sgbDataValidation[,Year:=as.numeric(Year)]
383 | 
384 | ddum <- dummyVars("~.", data = sgbDataTrain)
385 | sgbDataTrain <- data.table(predict(ddum, newdata = sgbDataTrain))
386 | sgbDataValidation <- data.table(predict(ddum, newdata = sgbDataValidation))
387 | rm(ddum)
388 | 
389 | ## ------------------------------------------------------------------------
390 | set.seed(12345)
391 | sgbModel <- train(SPAD ~.,
392 |                   data = sgbDataTrain,
393 |              method = "gbm",
394 |              preProcess = c("scale", "center"),
395 |              metric = "RMSE",
396 |              trControl = trainControl(method = "cv",
397 |                                       number = 5
398 |                                       ),
399 |              tuneGrid = expand.grid(interaction.depth = 1:3,
400 |                                     shrinkage = 0.1,
401 |                                     n.trees = c(50, 100, 150),
402 |                                     n.minobsinnode = 10),
403 |              verbose = FALSE
404 |              )
405 | sgbModel
406 | 
407 | ## ----mls-sgb1, fig.width=5, fig.height=10, out.width='.7\\linewidth', fig.pos="!ht", fig.cap = "Relative Influnce Visualisation."----
408 | summary(sgbModel)
409 | 
410 | ## ------------------------------------------------------------------------
411 | mean(stats::residuals(sgbModel)^2)
412 | 
413 | mean((predict(sgbModel, sgbDataValidation) - 
414 |                   sgbDataValidation$SPAD)^2)
415 | 
416 | ## ------------------------------------------------------------------------
417 | explainSGBt <- explain(sgbModel, label = "sgbt",
418 |                data = sgbDataTrain,
419 |                y = sgbDataTrain$SPAD)
420 | 
421 | explainSGBv <- explain(sgbModel, label = "sgbv",
422 |                data = sgbDataValidation,
423 |                y = sgbDataValidation$SPAD)
424 | 
425 | ## ----mls-sgb2, fig.width=8, fig.height=6, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "DALEX residual visualisations."----
426 | performanceSGBt <- model_performance(explainSGBt)
427 | performanceSGBv <- model_performance(explainSGBv)
428 | 
429 | plot_grid(
430 |   plot(performanceSGBt, performanceSGBv),
431 |   plot(performanceSGBt, performanceSGBv, geom = "boxplot"),
432 |   ncol = 2)
433 | 
434 | 
435 | ## ----mls-sgb3, fig.width=6, fig.height=6, out.width='.7\\linewidth', fig.pos="!ht", fig.cap = "DALEX Drop Out Loss."----
436 | 
437 | importanceSGBt <- variable_importance(explainSGBt)
438 | importanceSGBv <- variable_importance(explainSGBv)
439 | plot(importanceSGBt, importanceSGBv)
440 | 
441 | ## ----mls-sgb4, fig.width=6, fig.height=6, out.width='.7\\linewidth', fig.pos="!ht", fig.cap = "DALEX primary school non-attendance count versus teen pregnancy."----
442 | responseSGBprmt <- variable_response(explainSGBt, variable = "SEPR", type = "pdp")
443 | responseSGBprmv <- variable_response(explainSGBv, variable = "SEPR", type = "pdp")
444 | plot(responseSGBprmt, responseSGBprmv)
445 | 
446 | ## ----mls-sgb5, fig.width=6, fig.height=6, out.width='.7\\linewidth', fig.pos="!ht", fig.cap = "DALEX death rate per 1000 versus teen pregnancy."----
447 | responseSGBdynt <- variable_response(explainSGBt, variable = "SPDY", type = "pdp")
448 | responseSGBdynv <- variable_response(explainSGBv, variable = "SPDY", type = "pdp")
449 | plot(responseSGBdynt, responseSGBdynv)
450 | 
451 | ## ------------------------------------------------------------------------
452 | mlpDataTrain <- copy(trainData)
453 | mlpDataTrain[,Year:=as.numeric(Year)]
454 | mlpDataValidation <- copy(validationData)
455 | mlpDataValidation[,Year:=as.numeric(Year)]
456 | 
457 | ddum <- dummyVars("~.", data = mlpDataTrain)
458 | mlpDataTrain <- data.table(predict(ddum, newdata = mlpDataTrain))
459 | mlpDataValidation <- data.table(predict(ddum, newdata = mlpDataValidation))
460 | rm(ddum)
461 | 
462 | ## ------------------------------------------------------------------------
463 | set.seed(12345)
464 | suppressWarnings(
465 |   mlpModel <- train(
466 |     SPAD ~ .,
467 |     data = mlpDataTrain,
468 |     method = "mlpML",
469 |     preProcess = c("scale", "center"),
470 |     metric = "RMSE",
471 |     trControl = trainControl(method = "cv",
472 |                              number = 5)
473 |   )
474 | )
475 | mlpModel
476 | 
477 | ## ------------------------------------------------------------------------
478 | summary(mlpModel)
479 | 
480 | ## ------------------------------------------------------------------------
481 | mean(stats::residuals(mlpModel)^2)
482 | 
483 | mean((predict(mlpModel, mlpDataValidation) - 
484 |                   mlpDataValidation$SPAD)^2)
485 | 
486 | ## ------------------------------------------------------------------------
487 | explainMLPt <- explain(mlpModel, label = "mlpt",
488 |                data = mlpDataTrain,
489 |                y = mlpDataTrain$SPAD)
490 | 
491 | explainMLPv <- explain(mlpModel, label = "mlpv",
492 |                data = mlpDataValidation,
493 |                y = mlpDataValidation$SPAD)
494 | 
495 | ## ----mls-mlp1, fig.width=8, fig.height=6, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "Model performance contrasting SGB vs MLP methods."----
496 | performanceMLPt <- model_performance(explainMLPt)
497 | performanceMLPv <- model_performance(explainMLPv)
498 | 
499 | plot_grid(
500 | plot(performanceMLPt, performanceMLPv, performanceSGBt, performanceSGBv),
501 | plot(performanceMLPt, performanceMLPv, performanceSGBt, performanceSGBv, geom = "boxplot"),
502 | ncol = 2
503 | )
504 | 
505 | ## ----mls-mlp2, fig.width=6, fig.height=6, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "Determining top key variables."----
506 | 
507 | importanceMLPt <- variable_importance(explainMLPt)
508 | importanceMLPv <- variable_importance(explainMLPv)
509 | plot(importanceMLPt, importanceMLPv, importanceSGBt, importanceSGBv)
510 | 
511 | ## ----mls-mlp3, fig.width=6, fig.height=6, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "Understanding the impact of primary school attendance on teen pregnancy."----
512 | responseMLPprmt <- variable_response(explainMLPt, variable = "SEPR", type = "pdp")
513 | responseMLPprmv <- variable_response(explainMLPv, variable = "SEPR", type = "pdp")
514 | plot(responseMLPprmt, responseMLPprmv, responseSGBprmt, responseSGBprmv)
515 | 
516 | ## ----mls-mlp4, fig.width=6, fig.height=6, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "DALEX death rate per 1000 versus teen pregnancy."----
517 | responseMLPdynt <- variable_response(explainMLPt, variable = "SPDY", type = "pdp")
518 | responseMLPdynv <- variable_response(explainMLPv, variable = "SPDY", type = "pdp")
519 | plot(responseMLPdynt, responseMLPdynv, responseSGBdynt, responseSGBdynv)
520 | 
521 | ## ------------------------------------------------------------------------
522 | set.seed(12345)
523 | suppressWarnings(
524 |   mlpModelb <- train(
525 |     SPAD ~ .,
526 |     data = mlpDataTrain,
527 |     method = "mlpML",
528 |     preProcess = c("scale", "center"),
529 |     metric = "RMSE",
530 |     verbose = FALSE,
531 |     trControl = trainControl(method = "cv",
532 |                              number = 5),
533 |     tuneGrid = expand.grid(
534 |       layer1 = 0:10,
535 |       layer2 = 0:10,
536 |       layer3 = 0:10
537 |     )
538 |   )
539 | )
540 | mlpModelb
541 | 
542 | mean(stats::residuals(mlpModelb)^2)
543 | 
544 | mean((predict(mlpModelb, mlpDataValidation) -
545 |                   mlpDataValidation$SPAD)^2)
546 | summary(mlpModelb)
547 | 
548 | 


--------------------------------------------------------------------------------
/mlu.R:
--------------------------------------------------------------------------------
  1 | ## ----setup, include=TRUE, results='hide', message = FALSE, warning = FALSE, cache=FALSE----
  2 | 
  3 |  library(checkpoint)
  4 |  checkpoint("2018-09-28", R.version = "3.5.1",
  5 |    project = book_directory,
  6 |    checkpointLocation = checkpoint_directory,
  7 |    scanForPackages = FALSE,
  8 |    scan.rnw.with.knitr = TRUE, use.knitr = TRUE)
  9 | 
 10 | library(ggplot2)
 11 | library(cowplot)
 12 | library(viridis)
 13 | library(scales)
 14 | library(readxl)
 15 | library(data.table)
 16 | library(ape)
 17 | library(MASS)
 18 | library(matrixStats)
 19 | 
 20 | options(width = 70, digits = 2)
 21 | 
 22 | 
 23 | ## ----eval = FALSE--------------------------------------------------------
 24 | ## 
 25 | ## source("https://bioconductor.org/biocLite.R")
 26 | ## biocLite("pcaMethods")
 27 | ## 
 28 | 
 29 | ## ----echo=TRUE, message=FALSE, results = "hide"--------------------------
 30 | 
 31 | library(pcaMethods)
 32 | 
 33 | 
 34 | ## ------------------------------------------------------------------------
 35 | 
 36 | ## Note: download Excel file  from publisher website first
 37 | dRaw <- read_excel("Gender_StatsData_worldbank.org_ccby40.xlsx")
 38 | dRaw <- as.data.table(dRaw) # convert data to data.table format.
 39 | 
 40 | 
 41 | ## ------------------------------------------------------------------------
 42 | 
 43 | str(dRaw)
 44 | 
 45 | summary(dRaw)
 46 | 
 47 | unique(dRaw$CountryName)
 48 | 
 49 | unique(dRaw$IndicatorCode)
 50 | 
 51 | 
 52 | ## ------------------------------------------------------------------------
 53 | 
 54 | dRaw[,`Indicator Name`:= NULL]
 55 | 
 56 | 
 57 | ## ------------------------------------------------------------------------
 58 | 
 59 | ## collapse columns into a super long dataset
 60 | ## with Year as a new variable
 61 | d <- melt(dRaw, measure.vars = 3:20, variable.name = "Year")
 62 | head(d)
 63 | str(d)
 64 | 
 65 | ## finally cast the data wide again
 66 | ## this time with separate variables by indicator code
 67 | ## keeping a country and time (Year) variable
 68 | d <- dcast(d, CountryName + Year ~ IndicatorCode)
 69 | 
 70 | head(d)
 71 | str(d)
 72 | 
 73 | 
 74 | ## ------------------------------------------------------------------------
 75 | 
 76 | ## rename columns with shortened, unique names
 77 | x<-colnames(d)
 78 | x<-gsub("[[:punct:]]", "", x)
 79 | (y <- abbreviate(x, minlength = 4, method = "both.sides"))
 80 | names(d) <- y
 81 | 
 82 | ## shorten regional names to abbreviations.
 83 | d$CntN<-abbreviate(d$CntN, minlength = 5,
 84 |                    method = "left.kept")
 85 | 
 86 | 
 87 | ## ------------------------------------------------------------------------
 88 | 
 89 | summary(d)
 90 | 
 91 | str(d)
 92 | 
 93 | d[, Year := as.character(Year)]
 94 | 
 95 | 
 96 | ## ----mlu-gnpadoplot1, fig.width=7, fig.height=10, out.width='.6\\linewidth', fig.pos="!ht", fig.cap = "Plot of Gross National Product Per Capita and Adolescent Fertility Rate per 1,000 women."----
 97 | 
 98 | ## ggplot2 plot object indicating x and y variables
 99 | p1 <- ggplot(d, aes(NYGN, SPAD))
100 | 
101 | ## make a grid of two plots
102 | plot_grid(
103 |   ## first plot data points only
104 |   p1 + geom_point(),
105 |   ## data poins colored by year
106 |   p1 + geom_point(aes(colour = Year)) +
107 |     scale_colour_viridis(discrete = TRUE),
108 |   ncol = 1
109 | )
110 | 
111 | 
112 | ## ----mlu-kmeans1, fig.width=9, fig.height=9, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "Plot of Gross National Product Per Capita and Adolescent Fertility Rate per 1,000 women for different numbers of k clusters."----
113 | 
114 | set.seed(2468)
115 | wgss <- vector("numeric", 8)
116 | plots <- vector("list", 9)
117 | p1 <- ggplot(d, aes(NYGN, SPAD))
118 | 
119 | for(i in 2:9) {
120 |   km <- kmeans(d[, .(NYGN, SPAD)],
121 |              centers = i)
122 | 
123 |   wgss[i - 1] <- km$tot.withinss
124 | 
125 |   plots[[i - 1]] <- p1 +
126 |     geom_point(aes_(colour = factor(km$cluster))) +
127 |     scale_color_viridis(discrete = TRUE) +
128 |     theme(legend.position = "none") +
129 |     ggtitle(paste("kmeans centers = ", i))
130 | }
131 | 
132 | plots[[9]] <- ggplot() +
133 |   geom_point(aes(x = 2:9, y = wgss)) +
134 |   xlab("Number of Clusters") +
135 |   ylab("Within SS") +
136 |   ggtitle("Scree Plot")
137 | 
138 | do.call(plot_grid, c(plots, ncol = 3))
139 | 
140 | 
141 | ## ------------------------------------------------------------------------
142 | 
143 | summary(d[,.(NYGN, SPAD)])
144 | 
145 | 
146 | ## ------------------------------------------------------------------------
147 | 
148 | x <- scale(d[,.(NYGN, SPAD)])
149 | summary(x)
150 | 
151 | 
152 | ## ----mlu-kmeans2, fig.width=9, fig.height=9, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "Plot of Gross National Product Per Capita and Adolescent Fertility Rate per 1,000 women for different numbers of k clusters."----
153 | 
154 | set.seed(2468)
155 | wgss <- vector("numeric", 8)
156 | plots <- vector("list", 9)
157 | p1 <- ggplot(d, aes(NYGN, SPAD))
158 | 
159 | for(i in 2:9) {
160 |   km <- kmeans(x, centers = i)
161 | 
162 |   wgss[i - 1] <- km$tot.withinss
163 | 
164 |   plots[[i - 1]] <- p1 +
165 |     geom_point(aes_(colour = factor(km$cluster))) +
166 |     scale_color_viridis(discrete = TRUE) +
167 |     theme(legend.position = "none") +
168 |     ggtitle(paste("kmeans centers = ", i))
169 | }
170 | 
171 | plots[[9]] <- ggplot() +
172 |   geom_point(aes(x = 2:9, y = wgss)) +
173 |   xlab("Number of Clusters") +
174 |   ylab("Within SS") +
175 |   ggtitle("Scree Plot")
176 | 
177 | do.call(plot_grid, c(plots, ncol = 3))
178 | 
179 | 
180 | ## ----mlu-kmeans3, fig.width=9, fig.height=9, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "Plot of Gross National Product Per Capita and Adolescent Fertility Rate per 1,000 women for different numbers of iterations."----
181 | 
182 | set.seed(2468)
183 | plots <- vector("list", 9)
184 | p1 <- ggplot(d, aes(NYGN, SPAD))
185 | 
186 | for(i in 6:14) {
187 |   km <- kmeans(x, centers = 6, iter.max = i)
188 | 
189 |   plots[[i - 5]] <- p1 +
190 |     geom_point(aes_(colour = factor(km$cluster))) +
191 |     scale_color_viridis(discrete = TRUE) +
192 |     theme(legend.position = "none") +
193 |     ggtitle(paste("kmeans iters = ", i))
194 | }
195 | 
196 | do.call(plot_grid, c(plots, ncol = 3))
197 | 
198 | 
199 | ## ----mlu-kmeans4, fig.width=9, fig.height=9, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "Plot of Gross National Product Per Capita and Adolescent Fertility Rate per 1,000 women for different nstart values."----
200 | 
201 | set.seed(2468)
202 | plots <- vector("list", 9)
203 | p1 <- ggplot(d, aes(NYGN, SPAD))
204 | 
205 | for(i in 1:9) {
206 |   km <- kmeans(x, centers = 6, iter.max = 10, nstart = i)
207 | 
208 |   plots[[i]] <- p1 +
209 |     geom_point(aes_(colour = factor(km$cluster))) +
210 |     scale_color_viridis(discrete = TRUE) +
211 |     theme(legend.position = "none") +
212 |     ggtitle(paste("kmeans nstarts = ", i))
213 | }
214 | 
215 | do.call(plot_grid, c(plots, ncol = 3))
216 | 
217 | 
218 | ## ----mlu-kmeans5, fig.width=6, fig.height=6, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "Scree Plot for all ."----
219 | 
220 | x <- scale(d[,-c(1,2)])
221 | wgss<-0
222 | set.seed(2468)
223 | for( i in 1:11){
224 |   km <- kmeans(x, centers = i)
225 |   wgss[i]<-km$tot.withinss
226 | }
227 | 
228 | ggplot() +
229 |   geom_point(aes(x = 1:11, y = wgss)) +
230 |   xlab("Number of Clusters") +
231 |   ylab("Within SS") +
232 |   ggtitle("Scree Plot - All Variables")
233 | 
234 | 
235 | ## ------------------------------------------------------------------------
236 | 
237 | kmAll <- kmeans(x, centers = 4, nstart = 25)
238 | x <- cbind(d[, c(1,2)], x,
239 |            Cluster = kmAll$cluster)
240 | tail(x)
241 | 
242 | 
243 | ## ------------------------------------------------------------------------
244 | 
245 | xtabs(~ CntN + Cluster, data = x)
246 | 
247 | 
248 | ## ------------------------------------------------------------------------
249 | 
250 | unique(x[
251 |   order(CntN, Year, Cluster),
252 |   .(CntN, Year, Cluster)][
253 |     CntN=="EsA&P"])
254 | 
255 | unique(x[
256 |   order(CntN, Year, Cluster),
257 |   .(CntN, Year, Cluster)][
258 |     CntN == "ErpnU"])
259 | 
260 | 
261 | ## ------------------------------------------------------------------------
262 | 
263 | hdist <- dist(d[,.(NYGN, SPAD)])
264 | str(hdist)
265 | 
266 | 
267 | ## ----mlu-hclust1, fig.width=14, fig.height=10, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "Cluster Dendrogram with row numbers."----
268 | 
269 | hclust <- hclust(hdist)
270 | plot(hclust)
271 | 
272 | 
273 | ## ----mlu-hclust2, fig.width=14, fig.height=12, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "Cluster Dendrogram with country names and year."----
274 | 
275 | x <- d[, .(CntN, Year, NYGN, SPAD)]
276 | x[, Key := paste(CntN, Year)]
277 | x[, CntN := NULL]
278 | x[, Year := NULL]
279 | 
280 | hdist <- dist(x[,.(NYGN, SPAD)])
281 | hclust <- hclust(hdist)
282 | plot(hclust, labels = x$Key)
283 | 
284 | 
285 | ## ----mlu-hclust3, fig.width=14, fig.height=12, out.width='.9\\linewidth', fig.pos="!ht", fig.cap = "Cluster Dendrogram with country names and year and a height line."----
286 | 
287 | plot(hclust, labels = x$Key)
288 | abline(h = 30000, col = "blue")
289 | 
290 | 
291 | ## ------------------------------------------------------------------------
292 | 
293 | summary(x)
294 | d[, mean(NYGN), by = CntN][order(V1)]
295 | d[, mean(SPAD), by = CntN][order(V1)]
296 | 
297 | 
298 | ## ----mlu-hclust4, fig.width=14, fig.height=12, out.width='.9\\linewidth', fig.pos="!ht", fig.cap = "Cluster Dendrogram with country names and year and another height line."----
299 | 
300 | plot(hclust, labels = x$Key)
301 | abline(h = 20000, col = "blue")
302 | 
303 | 
304 | ## ----mlu-hclust5, fig.width=14, fig.height=12, out.width='.9\\linewidth', fig.pos="!ht", fig.cap = "Cluster Dendrogram with country names and year and all dimensions of data."----
305 | 
306 | x <- copy(d)
307 | x[, Key := paste(CntN, Year)]
308 | x[, CntN := NULL]
309 | x[, Year := NULL]
310 | 
311 | hdist <- dist(x[, -12])
312 | hclust <- hclust(hdist)
313 | 
314 | plot(hclust, labels = x$Key)
315 | 
316 | 
317 | ## ----mlu-hclust6, fig.width=14, fig.height=12, out.width='.9\\linewidth', fig.pos="!ht", fig.cap = "Cluster Dendrogram using the ward.D2 method."----
318 | 
319 | hclust <- hclust(hdist, method = "ward.D2")
320 | plot(hclust, labels = x$Key)
321 | 
322 | 
323 | ## ----mlu-hclust7, fig.width=14, fig.height=12, out.width='.9\\linewidth', fig.pos="!ht", fig.cap = "Cluster Dendrogram with scaling."----
324 | 
325 | x <- scale(d[,-c(1,2)])
326 | row.names(x) <- paste(d$CntN, d$Year)
327 | hdist <- dist(x)
328 | hclust <- hclust(hdist)
329 | 
330 | plot(hclust, labels = paste(d$CntN, d$Year))
331 | abline(h = 6, col = "blue")
332 | 
333 | 
334 | ## ------------------------------------------------------------------------
335 | 
336 | cut_hclust <- cutree(hclust, h = 6)
337 | unique(cut_hclust)
338 | 
339 | 
340 | ## ------------------------------------------------------------------------
341 | 
342 | dcopy <- as.data.table(copy(d))
343 | dcopy[, cluster:= NA_integer_]
344 | 
345 | dcopy$cluster <- cutree(hclust, k = 3)
346 | 
347 | tail(dcopy)
348 | 
349 | 
350 | ## ----mlu-hclust8, fig.width=14, fig.height=14, out.width='.9\\linewidth', fig.pos="!ht", fig.cap = "Variations on Dendrogram via ape package."----
351 | 
352 | plot(as.phylo(hclust), type = "cladogram")
353 | 
354 | 
355 | ## ----mlu-hclust9, fig.width=14, fig.height=14, out.width='.9\\linewidth', fig.pos="!ht", fig.cap = "Variations on Dendrogram via ape package."----
356 | 
357 | plot(as.phylo(hclust), type = "fan")
358 | 
359 | 
360 | ## ----mlu-hclust10, fig.width=14, fig.height=14, out.width='.9\\linewidth', fig.pos="!ht", fig.cap = "Variations on Dendrogram via ape package."----
361 | 
362 | plot(as.phylo(hclust), type = "radial")
363 | 
364 | 
365 | ## ----mlu-hclust11, fig.width=14, fig.height=14, out.width='.9\\linewidth', fig.pos="!ht", fig.cap = "unrooted type on 4 clusters."----
366 | 
367 | hclust4 <- cutree(hclust, k = 4)
368 | plot(as.phylo(hclust), type = "unrooted", label.offset = 1,
369 |      tip.color = hclust4, cex = 0.8)
370 | 
371 | 
372 | ## ----mlu-pca1, fig.width=6, fig.height=6, out.width='.6\\linewidth', fig.pos="!ht", fig.cap = "A highly correlated plot - are there really two dimensions here?"----
373 | 
374 | cor(d$NYGD, d$NYGN)
375 | 
376 | summary(d[,.(NYGD, NYGN)])
377 | 
378 | ggplot(d, aes(NYGD, NYGN)) +
379 |   geom_point()
380 | 
381 | 
382 | ## ----mlu-pca2, fig.width=6, fig.height=6, out.width='.6\\linewidth', fig.pos="!ht", fig.cap = "A highly correlated plot - are there really two dimensions here?"----
383 | 
384 | ggplot(d, aes(NYGD, SESC)) +
385 |   geom_point()
386 | 
387 | cor(d$NYGD, d$SESC)
388 | 
389 | 
390 | ## ------------------------------------------------------------------------
391 | 
392 | x <- d[,.( NYGN, SPAD)]
393 | res <- pca(x, method="svd", center=TRUE, scale = "uv")
394 | 
395 | summary(res)
396 | 
397 | 
398 | ## ----mlu-pca3, fig.width=7, fig.height=7, out.width='.8\\linewidth', fig.pos="!ht", fig.cap = "comparison of raw data and pca"----
399 | 
400 | biplot(res, main = "Biplot of PCA")
401 | 
402 | 
403 | ## ----mlu-pca4, fig.width=5, fig.height=5, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = "Scree plot for traditional PCA on all features in the data."----
404 | 
405 | x <- d[, -c(1,2)]
406 | res <- pca(x, method="svd", center=TRUE, scale = "uv",
407 |            nPcs = ncol(x))
408 | 
409 | summary(res)
410 | 
411 | ## reverse scree plot
412 | ggplot() +
413 |   geom_bar(aes(1:11, cumsum(res@R2)),
414 |            stat = "identity") +
415 |   scale_x_continuous("Principal Component", 1:11) +
416 |   scale_y_continuous(expression(R^2), labels = percent) +
417 |   ggtitle("Scree Plot") +
418 |   coord_cartesian(xlim = c(.5, 11.5), ylim = c(.5, 1),
419 |                   expand = FALSE)
420 | 
421 | 
422 | ## ----mlu-pca5, fig.width=6, fig.height=6, out.width='.7\\linewidth', fig.pos="!ht", fig.cap = "Biplot for first two principal components."----
423 | 
424 | biplot(res, choices = c(1, 2))
425 | 
426 | 
427 | ## ------------------------------------------------------------------------
428 | 
429 | head(scores(res))
430 | round(cor(scores(res)),2)
431 | 
432 | 
433 | ## ----mlu-pca6, fig.width=5, fig.height=10, out.width='.6\\linewidth', fig.pos="!ht", fig.cap = "Plot loadings from PCA models with and without outliers using traditional, SVD, PCA and robust PCA."----
434 | 
435 | x <- d[, -c(1,2)]
436 | x <- prep(x, center = TRUE, scale = "uv")
437 | 
438 | xout <- copy(x)
439 | xout[1:5, "NYGD"] <- (-10)
440 | 
441 | res1 <- pca(x, method = "svd",
442 |             center = FALSE, nPcs = 4)
443 | res2 <- pca(xout, method = "svd",
444 |             center = FALSE, nPcs = 4)
445 | 
446 | res1rob <- pca(x, method = "robustPca",
447 |                center = FALSE, nPcs = 4)
448 | res2rob <- pca(xout, method = "robustPca",
449 |                center = FALSE, nPcs = 4)
450 | plot_grid(
451 |   ggplot() +
452 |     geom_point(aes(
453 |       x = as.numeric(loadings(res1)),
454 |      y = as.numeric(loadings(res2)))) +
455 |     xlab("Loadings, SVD, No Outliers") +
456 |     ylab("Loadings, SVD, Outliers"),
457 |   ggplot() +
458 |     geom_point(aes(
459 |       x = as.numeric(loadings(res1rob)),
460 |      y = as.numeric(loadings(res2rob)))) +
461 |     xlab("Loadings, Robust PCA, No Outliers") +
462 |     ylab("Loadings, Robust PCA, Outliers"),
463 |   ncol = 1)
464 | 
465 | 
466 | ## ------------------------------------------------------------------------
467 | 
468 | x <- scale(d[, -c(1,2)])
469 | row.names(x) <- paste(d$CntN, d$Year)
470 | head(x)
471 | 
472 | sdist <- dist(x)
473 | 
474 | xSammon <- sammon(sdist, k = 2)
475 | head(xSammon$points)
476 | 
477 | 
478 | ## ----mlu-sam1, fig.width=5, fig.height=5, out.width='.6\\linewidth', fig.pos="!ht", fig.cap = "Plot of Sammon points with text labels"----
479 | 
480 | plot(xSammon$points, type = "n")
481 | text(xSammon$points, labels = row.names(x) )
482 | 
483 | 
484 | 


--------------------------------------------------------------------------------
/udv.R:
--------------------------------------------------------------------------------
  1 | ## ----setup, include=TRUE, results='hide', message = FALSE, warning = FALSE, cache=FALSE----
  2 | 
  3 | library(checkpoint)
  4 | checkpoint("2018-09-28", R.version = "3.5.1",
  5 |   project = book_directory,
  6 |   checkpointLocation = checkpoint_directory,
  7 |   scanForPackages = FALSE,
  8 |   scan.rnw.with.knitr = TRUE, use.knitr = TRUE)
  9 | 
 10 | library(knitr)
 11 | library(ggplot2)
 12 | library(cowplot)
 13 | library(MASS)
 14 | library(JWileymisc)
 15 | library(data.table)
 16 | 
 17 | options(width = 70, digits = 2)
 18 | 
 19 | 
 20 | ## ----fudv-dotplot-mpg, fig.width=4, fig.height=1.5, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = "Stacked dot plot of miles per gallon from old cars."----
 21 | 
 22 | ggplot(mtcars, aes(mpg)) +
 23 |   geom_dotplot()
 24 | 
 25 | 
 26 | ## ----fudv-hist-sl, fig.width=4, fig.height=4, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = "Histogram of sepal length from the iris data."----
 27 | 
 28 | ggplot(iris, aes(Sepal.Length)) +
 29 |   geom_histogram()
 30 | 
 31 | 
 32 | ## ----fudv-hist-lynx-raw, fig.width=4, fig.height=4, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = "Histogram of annual Canadian lynx trappings."----
 33 | 
 34 | ggplot(data.table(lynx = as.vector(lynx)), aes(lynx)) +
 35 |   geom_histogram()
 36 | 
 37 | 
 38 | ## ----fudv-hist-lynx-log, fig.width=4, fig.height=4, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = "Histogram of annual Canadian lynx trappings after a natural log transformation."----
 39 | 
 40 | ggplot(data.table(lynx = as.vector(lynx)), aes(log(lynx))) +
 41 |   geom_histogram()
 42 | 
 43 | 
 44 | ## ----fudv-density, fig.width=4, fig.height=4, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = "This is the density plot for our sepal lengths."----
 45 | 
 46 | ggplot(iris, aes(Sepal.Length)) +
 47 |   geom_density()
 48 | 
 49 | 
 50 | ## ----fudv-densityadjust, fig.width=4, fig.height=4, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = c("A noisy density plot.", "A very smooth density plot.")----
 51 | 
 52 | ggplot(iris, aes(Sepal.Length)) +
 53 |   geom_density(adjust = .5)
 54 | 
 55 | ggplot(iris, aes(Sepal.Length)) +
 56 |   geom_density(adjust = 5)
 57 | 
 58 | 
 59 | ## ----fudv-qqnorm, fig.width=4, fig.height=4, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = "Normal data look like a straight line.  Sepal.Length seems fairly normal."----
 60 | 
 61 | ggplot(iris, aes(sample = Sepal.Length)) +
 62 |   geom_qq()
 63 | 
 64 | 
 65 | ## ------------------------------------------------------------------------
 66 | 
 67 | qnorm(p = .1, mean = 0, sd = 1)
 68 | 
 69 | 
 70 | ## ------------------------------------------------------------------------
 71 | 
 72 | qnorm(p = c(.25, .50, .75), mean = 0, sd = 1)
 73 | 
 74 | 
 75 | ## ------------------------------------------------------------------------
 76 | 
 77 | ppoints(n = 3, a = 0)
 78 | 
 79 | 
 80 | ## ------------------------------------------------------------------------
 81 | 
 82 | ppoints(n = 3)
 83 | 
 84 | 
 85 | ## ----fudv-manual-qnorm, fig.width=4, fig.height=4, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = "Shows theoretical norms on the x-axis (based on predictions from mean and standard deviation)."----
 86 | 
 87 | qplot(
 88 |   x = qnorm(
 89 |     p = ppoints(length(iris$Sepal.Length)),
 90 |     mean = mean(iris$Sepal.Length),
 91 |     sd = sd(iris$Sepal.Length)),
 92 |   y = sort(iris$Sepal.Length),
 93 |   xlab = "Theoretical Normal Quantiles",
 94 |   ylab = "Sepal Length") +
 95 |   geom_abline(slope = 1, intercept = 0)
 96 | 
 97 | 
 98 | ## ----fudv-qqlnorm, fig.width=4, fig.height=4, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = c("Testing whether lynx data are consistent with a log normal distribution.", "Testing whether lynx data are consistent with a Poisson distribution.")----
 99 | 
100 | ggplot(data.table(lynx = as.vector(lynx)), aes(sample = lynx)) +
101 |   geom_qq(distribution = qlnorm)
102 | 
103 | ggplot(data.table(lynx = as.vector(lynx)), aes(sample = lynx)) +
104 |   geom_qq(distribution = qpois, dparams = list(lambda = mean(lynx)))
105 | 
106 | 
107 | ## ----fudv-densitynorm, fig.width=4, fig.height=4, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = "A normal curve and our density plot (with default smoothness of 1)."----
108 | 
109 | ggplot(iris, aes(Sepal.Length)) +
110 |   geom_density() +
111 |   stat_function(fun = dnorm,
112 |                 args = list(
113 |                   mean = mean(iris$Sepal.Length),
114 |                   sd = sd(iris$Sepal.Length)),
115 |                 colour = "blue")
116 | 
117 | 
118 | ## ------------------------------------------------------------------------
119 | 
120 | set.seed(1234)
121 | y <- rbeta(150, 1, 4)
122 | head(y)
123 | 
124 | 
125 | ## ----warning=FALSE-------------------------------------------------------
126 | 
127 | y.fit <- fitdistr(y, densfun = "beta",
128 |                   start = list(shape1 = .5, shape2 = .5))
129 | 
130 | 
131 | ## ------------------------------------------------------------------------
132 | 
133 | y.fit
134 | 
135 | y.fit$estimate["shape1"]
136 | 
137 | y.fit$estimate["shape2"]
138 | 
139 | 
140 | ## ------------------------------------------------------------------------
141 | 
142 | logLik(y.fit)
143 | 
144 | 
145 | ## ------------------------------------------------------------------------
146 | 
147 | y.fit2 <- fitdistr(y, densfun = "normal")
148 | logLik(y.fit2)
149 | 
150 | 
151 | ## ----fudv-testdistr-norm, fig.width=6, fig.height=3, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "Density plot with superimposed normal distributions and normal Q-Q plot."----
152 | 
153 | testdistr(y)
154 | 
155 | 
156 | ## ----fudv-testdistr, fig.width=6, fig.height=6, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "Shows density plot with superimposed beta or normal distributions along with Q-Q plot fits."----
157 | 
158 | test.beta <- testdistr(y, "beta",
159 |                        starts = list(shape1 = .5, shape2 = .5),
160 |                        varlab = "Y", plot = FALSE)
161 | 
162 | test.normal <- testdistr(y, "normal", varlab = "Y", plot = FALSE)
163 | 
164 | plot_grid(
165 |    test.beta$DensityPlot, test.beta$QQPlot,
166 |    test.normal$DensityPlot, test.normal$QQPlot,
167 |    ncol = 2)
168 | 
169 | 
170 | ## ----fudv-testdistr-pois, fig.width=6, fig.height=4, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "Discrete observed proportions with the theoretical probabilies from a poisson plotted in blue."----
171 | 
172 | set.seed(1234)
173 | y <- rnbinom(500, mu = 5, size = 2)
174 | testdistr(y, "poisson")
175 | 
176 | 
177 | ## ----fudv-testdistr-nbinom, fig.width=6, fig.height=4, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "Discrete observed proportions with the theoretical probabilies from a negative binomial distribution plotted in blue."----
178 | 
179 | testdistr(y, "nbinom")
180 | 
181 | 
182 | ## ------------------------------------------------------------------------
183 | 
184 | pnorm(c(-3, 3))
185 | 
186 | 
187 | ## ----fudv-anomaly-sd, fig.width=4, fig.height=4, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = "Panel graph showing stacked dot plots with anomalous values."----
188 | 
189 | set.seed(1234)
190 | d <- data.table(
191 |   y1 = rnorm(200, 0, 1),
192 |   y2 = rnorm(200, 0, .2) + rep(c(-3, -1, 1, 3), each = 50))
193 | 
194 | plot_grid(
195 |   qplot(c(d$y1, rep(5, 3)), geom = "dotplot", binwidth = .1),
196 |   qplot(c(d$y2, rep(5, 3)), geom = "dotplot", binwidth = .1),
197 |   ncol = 1, labels = c("A", "B"))
198 | 
199 | 
200 | ## ----fudv-anomaly-gamma, fig.width=4, fig.height=4, out.width='.5\\linewidth', fig.pos="!ht", fig.cap = "Panel graph showing randomly generated (no added anomalous values) data from a Gamma and normal distribution."----
201 | 
202 | set.seed(1234)
203 | d2 <- data.table(
204 |   y1 = rgamma(200, 1, .1),
205 |   y2 = rnorm(200, 10, 10))
206 | 
207 | plot_grid(
208 |   qplot(d2$y1, geom = "dotplot", binwidth = 1),
209 |   qplot(d2$y2, geom = "dotplot", binwidth = 1),
210 |   ncol = 1, labels = c("A", "B"))
211 | 
212 | 
213 | ## ----fudv-anomaly-testdistr, fig.width=6, fig.height=3, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "Graph showing highlighting of extreme values."----
214 | 
215 | testdistr(d$y1, extremevalues = "empirical",
216 |           ev.perc = .01)
217 | 
218 | 
219 | ## ----fudv-anomaly-td-gamma, fig.width=6, fig.height=3, out.width='1\\linewidth', fig.pos="!ht", fig.cap = paste("Graph showing highlighting of extreme values based on theoretical percentiles from a", c("normal distribution.", "Gamma distribution."))----
220 | 
221 | testdistr(d2$y1, "normal", extremevalues = "theoretical",
222 |           ev.perc = .001)
223 | 
224 | testdistr(d2$y1, "gamma", extremevalues = "theoretical",
225 |           ev.perc = .001)
226 | 
227 | 
228 | ## ----fudv-anomaly-masked, fig.width=6, fig.height=3, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "Graph showing an anomalous value of 100 masked by the more extreme anomalous value of 1000."----
229 | 
230 | testdistr(c(d2$y2, 100, 1000), "normal",
231 |           extremevalues = "theoretical",
232 |           ev.perc = .001)
233 | 
234 | 
235 | ## ----fudv-anomaly-robust, fig.width=6, fig.height=3, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "Graph highlighting extreme values based on a robust estimator."----
236 | 
237 | testdistr(c(d2$y2, 100, 1000), "normal",
238 |           robust = TRUE,
239 |           extremevalues = "theoretical",
240 |           ev.perc = .001)
241 | 
242 | 
243 | ## ------------------------------------------------------------------------
244 | 
245 | winsorizor(1:10, .1)
246 | 
247 | 
248 | ## ----fudv-anomaly-wins, fig.width=6, fig.height=3, out.width='1\\linewidth', fig.pos="!ht", fig.cap = "Panel graph comparing data before (A) and after (B) winsorizing the (empirical) bottom and top 1\\%."----
249 | 
250 | plot_grid(
251 |   testdistr(d2$y1, "gamma", extremevalues = "theoretical",
252 |             ev.perc = .005, plot=FALSE)$QQPlot,
253 |   testdistr(winsorizor(d2$y1, .01), "gamma", extremevalues = "theoretical",
254 |             ev.perc = .005, plot=FALSE)$QQPlot,
255 |   ncol = 2, labels = c("A", "B"), align = "hv")
256 | 
257 | 
258 | 


--------------------------------------------------------------------------------