├── Image
    └── Chrysanthemum.jpg
├── LICENSE
├── README.md
├── Chapter 11
    └── chapter11.R
├── Chapter 3
    └── chapter3.R
├── Chapter 5
    └── chapter5.R
├── Chapter 9
    └── chapter9.R
├── Chapter 6
    └── chapter6.R
├── Chapter 10
    └── chapter10.R
├── Chapter 7
    └── chapter7.R
├── Chapter 4
    └── chapter4.R
├── Chapter 8
    └── chapter8.R
└── Chapter 2
    └── chapter2.R


/Image/Chrysanthemum.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Simulation-for-Data-Science-with-R/HEAD/Image/Chrysanthemum.jpg


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2016 Packt Publishing
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | 
  4 | ## $5 Tech Unlocked 2021!
  5 | [Buy and download this Book for only $5 on PacktPub.com](https://www.packtpub.com/product/simulation-for-data-science-with-r/9781785881169)
  6 | -----
  7 | *If you have read this book, please leave a review on [Amazon.com](https://www.amazon.com/gp/product/1785881167).     Potential readers can then use your unbiased opinion to help them make purchase decisions. Thank you. The $5 campaign         runs from __December 15th 2020__ to __January 13th 2021.__*
  8 | 
  9 | # Simulation for Data Science with R
 10 | 
 11 | ## Code Testing Specifications
 12 | 
 13 | **Chapter 2: Required packages**
 14 | * boot_1.3-17
 15 | * colorspace_1.2-6
 16 | * data.table_1.9.6
 17 | * dplyr_0.4.3
 18 | * Formula_1.2-1
 19 | * ggplot2_2.1.0
 20 | * Hmisc_3.17-1
 21 | * laeken_0.4.6
 22 | * lattice_0.20-33
 23 | * MASS_7.3-43
 24 | * microbenchmark_1.4-2.1
 25 | * plyr_1.8.3
 26 | * profr_0.3.1
 27 | * Rcpp_0.12.4
 28 | * robustbase_0.92-5
 29 | * snow_0.4-1
 30 | * survival_2.38-3
 31 | * vcd_1.4-1
 32 | * VIM_4.4.1
 33 | 
 34 | **Chapter 3: Required packages**
 35 | * knitr_1.12.3
 36 | * Matrix_1.2-6
 37 | 
 38 | **Chapter 4: Required packages**
 39 | * coda_0.18-1
 40 | * ggplot2_2.1.0
 41 | * knitr_1.12.3
 42 | * MASS_7.3-43
 43 | 
 44 | **Chapter 5: Required packages**
 45 | * mvtnorm_1.0-5
 46 | * nloptr_1.0.4
 47 | * RCEIM_0.2
 48 | 
 49 | **Chapter 6: Required packages**
 50 | * car_2.1-2
 51 | 
 52 | 
 53 | **Chapter 7: Required packages**
 54 | * boot_1.3-17
 55 | * bootstrap_2015.2
 56 | * cvTools_0.3.2
 57 | * lattice_0.20-33
 58 | * MASS_7.3-43
 59 | * robustbase_0.92-5 car_2.1-2
 60 | * vcd_1.4-1
 61 | 
 62 | 
 63 | **Chapter 8: Required packages**
 64 | * boot_1.3-17
 65 | * colorspace_1.2-6
 66 | * data.table_1.9.6
 67 | * dplyr_0.4.3
 68 | * e1071_1.6-7
 69 | * ggplot2_2.1.0
 70 | * laeken_0.4.6
 71 | * MASS_7.3-43
 72 | * robustbase_0.92-5
 73 | * VIM_4.4.1
 74 | 
 75 | 
 76 | **Chapter 9: Required packages**
 77 | * colorspace_1.2-6
 78 | * data.table_1.9.6
 79 | * ggplot2_2.1.0
 80 | * MASS_7.3-43
 81 | * mice_2.25
 82 | * Rcpp_0.12.4
 83 | * robustbase_0.92-5
 84 | * VIM_4.4.1
 85 | 
 86 | 
 87 | **Chapter 10: Required packages** 
 88 | * boot_1.3-17
 89 | * colorspace_1.2-6
 90 | * data.table_1.9.6
 91 | * dplyr_0.4.3
 92 | * ggplot2_2.1.0
 93 | * laeken_0.4.6
 94 | * lattice_0.20-33
 95 | * MASS_7.3-43
 96 | * matrixcalc_1.0-3
 97 | * mice_2.25
 98 | * mvtnorm_1.0-5
 99 | * pls_2.5-0
100 | * Rcpp_0.12.4
101 | * reshape2_1.4.1
102 | * robCompositions_2.0.1
103 | * robustbase_0.92-5
104 | * simFrame_0.5.3
105 | * simPop_0.2.4
106 | * vcd_1.4-1
107 | * VIM_4.4.1
108 | 
109 | 
110 | **Chapter 11: Required packages**
111 | * lattice_0.20-33
112 | * Rcpp_0.12.4
113 | * simFrame_0.5.3
114 | 
115 | 
116 | 
117 | ##Related R Books:
118 | * [R for Data Science](https://www.packtpub.com/big-data-and-business-intelligence/r-data-science?utm_source=github&utm_medium=repository&utm_campaign=9781784390860)
119 | * [Learning R for Geospatial Analysis](https://www.packtpub.com/big-data-and-business-intelligence/learning-r-geospatial-analysis?utm_source=github&utm_medium=repository&utm_campaign=9781783984367)
120 | * [R Graphs Cookbook Second Edition](https://www.packtpub.com/big-data-and-business-intelligence/r-graph-cookbook-%E2%80%93-second-edition?utm_source=github&utm_medium=repository&utm_campaign=9781783988785)
121 | ### Download a free PDF
122 | 
123 |  <i>If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost.<br>Simply click on the link to claim your free PDF.</i>
124 | <p align="center"> <a href="https://packt.link/free-ebook/9781785881169">https://packt.link/free-ebook/9781785881169 </a> </p>


--------------------------------------------------------------------------------
/Chapter 11/chapter11.R:
--------------------------------------------------------------------------------
  1 | ## ------------------------------------------------------------------------
  2 | fert2Rate <- function(age, time, lastbirth){ 
  3 |  a <- ifelse(time <= 2020, 32, 33) 
  4 |  b <- ifelse(time <= 2020, 6.0, 5.7) 
  5 |  frate <- (b / a) * (a  / age) ^ (3 / 2) * exp(-b ^ 2 * (a / age + age / a - 2)) 
  6 |  frate[age <= 15 | age >= 45 | lastbirth < 0.75] <- 0 
  7 |  return(frate) 
  8 | } 
  9 | 
 10 | ## ------------------------------------------------------------------------
 11 | fert2Rate(30, 2030, lastbirth = 3)
 12 | 
 13 | ## ------------------------------------------------------------------------
 14 | fert2Rate(40, 2030, lastbirth = 3)
 15 | 
 16 | ## ------------------------------------------------------------------------
 17 | mortRate <- function(age, time){ 
 18 |   a <- 0.0003 
 19 |   b <- ifelse(time <= 2020, 0.1, 0.097) 
 20 |   mrate <- a * exp(b * age) 
 21 |   return(mrate) 
 22 | } 
 23 | 
 24 | ## ------------------------------------------------------------------------
 25 | mortRate(40, 2056)
 26 | 
 27 | ## ---- message=FALSE, warning=FALSE---------------------------------------
 28 | library("simFrame")
 29 | data(eusilcP, package = "simFrame")
 30 | pop <- eusilcP[, c("age", "gender", "hsize", "hid")]
 31 | 
 32 | ## ------------------------------------------------------------------------
 33 | pop$nchildWomen <- ifelse(pop$gender == "female" & as.integer(pop$hsize) > 2 & pop$age > 17, as.integer(pop$hsize) - 2, 0)
 34 | pop$partnered <- factor(ifelse(as.integer(pop$hsize) >= 2 & pop$age > 17, "P", "A"))
 35 | 
 36 | ## ------------------------------------------------------------------------
 37 | head(pop)
 38 | 
 39 | ## ------------------------------------------------------------------------
 40 | stateSpace <- expand.grid(sex = levels(pop$gender), 
 41 |                           partnered =  levels(pop$partnered)) 
 42 | 
 43 | ## ------------------------------------------------------------------------
 44 | stateSpace
 45 | 
 46 | ## ------------------------------------------------------------------------
 47 | trMatrix_f <- cbind(c("female/A->female/P", "female/P->female/A"),
 48 |                     c("rates1", "rates2")) 
 49 | trMatrix_m <- cbind(c("male/A-male/P", "male/P-male/A"), 
 50 |                     c("rates3", "rates4")) 
 51 | allTransitions <- rbind(trMatrix_f, trMatrix_m) 
 52 | absTransitions <- rbind(c("female/dead", "mortRate"), 
 53 |                         c("male/dead", "mortRate")) 
 54 | 
 55 | ## ---- message=FALSE, warning=FALSE---------------------------------------
 56 | library("MicSim")
 57 | transitionMatrix <- buildTransitionMatrix(allTransitions = allTransitions, absTransitions = absTransitions, stateSpace = stateSpace) 
 58 | 
 59 | ## ------------------------------------------------------------------------
 60 | transitionMatrix[1,3] <- "rates3"
 61 | transitionMatrix[3,1] <- "rates4"
 62 | transitionMatrix
 63 | 
 64 | ## ------------------------------------------------------------------------
 65 | maxAge <- 100 
 66 | 
 67 | ## ------------------------------------------------------------------------
 68 | love <- function(t, x, parms){
 69 |   with(as.list(c(parms, x)), {
 70 |     dPrince_Harry <- a * Chelsy_Davy
 71 |     dChelsy_Davy <- -b * Prince_Harry
 72 |     res <- c(dPrince_Harry, dChelsy_Davy)
 73 |     list(res)
 74 |   })
 75 | }
 76 | 
 77 | ## ------------------------------------------------------------------------
 78 | parms  <- c(a = 1, b = 2)
 79 | times  <- seq(0, 30, length = 31)
 80 | ## Start values for steady state
 81 | y <- xstart <- c(Prince_Harry = 1, Chelsy_Davy = -1)
 82 | 
 83 | ## ---- message=FALSE, warning=FALSE---------------------------------------
 84 | library("deSolve")
 85 | out <-  ode(xstart, times, love, parms = parms) 
 86 | 
 87 | ## ----B05113_11_01--------------------------------------------------------
 88 | matplot(out)
 89 | 
 90 | ## ----B05113_11_02--------------------------------------------------------
 91 | y <- xstart <- c(Prince_Harry = 0.2, Chelsy_Davy = 1)
 92 | parms  <- c(a = 0.3, b = 0.7)
 93 | out <-  ode(xstart, times, love, parms = parms) 
 94 | matplot(out)
 95 | 
 96 | ## ------------------------------------------------------------------------
 97 | lv_mod <- function (time, state, parms) {
 98 |     with(as.list(c(state, parms)), {
 99 |         dx <- k1 * x - k2 * x * y
100 |         dy <- -k3 * y + k4 * x * y
101 |         return(list(c(dx, dy)))
102 |     })
103 | }
104 | 
105 | ## ------------------------------------------------------------------------
106 | parms <- c(k1 = 1, k2 = 1.5, k3 = .2, k4 = .6)
107 | state <- c(x = 10, y = 10)
108 | time <- seq(0, 200, by = 1)
109 | 
110 | ## ------------------------------------------------------------------------
111 | res <- ode(func = lv_mod, y = state, parms = parms, times = time)
112 | res <- as.data.frame(res)
113 | 
114 | ## ----B05113_11_03--------------------------------------------------------
115 | par(mar = c(4,4,0.5,0.1))
116 | matplot(res[,-1], type = "l", xlab = "time", ylab = "population")
117 | legend("topright", c("Toads", "Snakes"), lty = c(1,2), col = c(1,2), box.lwd = 0)
118 | 
119 | ## ------------------------------------------------------------------------
120 | sessionInfo()
121 | 
122 | 


--------------------------------------------------------------------------------
/Chapter 3/chapter3.R:
--------------------------------------------------------------------------------
  1 | ## ------------------------------------------------------------------------
  2 | trunc (2.3 * 100)
  3 | 
  4 | ## ------------------------------------------------------------------------
  5 | round(1.49999999999999)
  6 | round(1.4999999999999999)
  7 | 
  8 | ## ----match---------------------------------------------------------------
  9 | a <- seq(0.6, 1, by = 0.01)
 10 | match(0.88, a)
 11 | match(0.89, a)
 12 | match(0.94, a)
 13 | 
 14 | ## ----excel---------------------------------------------------------------
 15 | excel_round <- function (x, digits) round (x * (1 + 1e-15), digits)
 16 | round(0.5, digits = 0)
 17 | excel_round(0.5, digits = 0)
 18 | 
 19 | ## ----machine_eps---------------------------------------------------------
 20 | .Machine$double.eps
 21 | .Machine$double.digits
 22 | .Machine$double.xmax
 23 | 
 24 | ## ----B05113_03_01, echo=FALSE, fig.align='center'------------------------
 25 | a <- c(0.65, 0.62, 0.41, 0.43, 0.61, 0.49, 0.44, 0.58, 0.47, 0.53, 0.52, 0.48, 0.51)
 26 | par(mar=c(4,4,0.2,0.2))
 27 | plot(1:length(a), a, ylim = c(0,1), xlab="n", yaxt="n", ylab="")
 28 | axis(2, at=0.5, labels = "a", las=1)
 29 | abline(h=0.5 + 0.03)
 30 | abline(h=0.5 - 0.03)
 31 | axis(2, at=0.5+0.03, labels = expression(a + epsilon), las=1)
 32 | axis(2, at=0.5-0.03, labels = expression(a - epsilon), las=1)
 33 | polygon(x=c(0,14,14,0,0), c(0.5-0.03,0.5-0.03,0.5+0.03,0.5+0.03,0.5-0.03), col="lightgrey")
 34 | points(1:length(a), a)
 35 | 
 36 | ## ----convergence1, cache=TRUE--------------------------------------------
 37 | masch <- function(maxit=10000){
 38 |   summe <- 0
 39 |   summeNeu <- n <- 1
 40 |   ptm <- proc.time()
 41 |   while(summe != summeNeu & n < maxit){
 42 |     summe <- summeNeu
 43 |     summeNeu <- summe + 1/n
 44 |     n <- n + 1
 45 |   }
 46 |   time <- proc.time() - ptm
 47 |   list(summe=summeNeu, time=time[3])
 48 | }
 49 | masch(10000)$summe
 50 | masch(1000000)$summe
 51 | masch(10000000)$summe
 52 | 
 53 | ## ---- echo=FALSE---------------------------------------------------------
 54 | library(knitr)
 55 | 
 56 | ## ----result, cache=TRUE--------------------------------------------------
 57 | SEQ <- c(10,1000, seq(100000,10000000,length=10))
 58 | df <- cbind(SEQ, t(as.data.frame(sapply(SEQ, masch))))
 59 | kable(df)
 60 | 
 61 | ## ----convergence2, cache=TRUE--------------------------------------------
 62 | x <- 1
 63 | oldsum = 0
 64 | newsum = n = term = 1
 65 | while(oldsum != newsum){
 66 |   oldsum = newsum
 67 |   term = 1/factorial(n) 
 68 |   n = n + 1
 69 |   newsum = oldsum + term
 70 |   print(paste("n = ", n, ". Diff = ", term, ". Sum = ", newsum, sep=""))
 71 | }
 72 | 
 73 | ## ----sdev----------------------------------------------------------------
 74 | ## first formula for the standard deviation:
 75 | s1 <- function(x){
 76 |   s <- sum((x - mean(x))^2)
 77 | 	return(sqrt(1/(length(x)-1) * s))
 78 | }
 79 | ## second formula for the standard deviation:
 80 | s2 <- function(x){
 81 | #	s <- 1/(length(x)-1) * sum(x^2) - mean(x)^2
 82 | 	s <- sum(x^2) - 1/length(x) * sum(x)^2
 83 | 	return(sqrt(1/(length(x)-1) * s))
 84 | }
 85 | ## wrapper function:
 86 | st <- function(x, type) {
 87 |   switch(type,
 88 |          precise = s1(x),
 89 |          oldexcel = s2(x)
 90 |          )
 91 | }
 92 | ## generating 1000 random numbers from standard normal distribution:
 93 | x <- rnorm(1000)
 94 | ## show more digits:
 95 | options(digits=16)
 96 | ## results:
 97 | st(x, "precise")
 98 | st(x, "oldexcel")
 99 | 
100 | ## ----sdev_results, warning=FALSE, message=FALSE, tidy=FALSE--------------
101 | stall <- function(x){
102 | 	c(precise=st(x, "precise"), excel=st(x, "oldexcel"))
103 | }
104 | ## generate numbers (zeros and ones)
105 | x <- rep(0:1,100)
106 | stall(x)
107 | X <- matrix(nrow=length(x), ncol=10)
108 | X[,1] <- 1
109 | for(n in 2:ncol(X)) X[, n] <- x + 10^(2 * (n - 1))
110 | colnames(X) <- 2 * (1:ncol(X) - 1)
111 | dim(X)
112 | ## first four observations:
113 | head(X,4)
114 | options(digits=5)
115 | apply(X, 2, stall)
116 | 
117 | ## ----convergence3--------------------------------------------------------
118 | konv <- function(q = 2){ 	
119 |   s <-  0
120 | 	snew <- term <- i <- 1
121 | 	while(s != snew){	
122 | 	  s <- snew
123 | 		snew <-  s + q^(-i)
124 | 		i <- i + 1
125 | 	}
126 | 	list(iteration = i, total = snew, lastterm = 2^(-i))	
127 | }
128 | konv()
129 | 
130 | ## ------------------------------------------------------------------------
131 | konv(q = 0.5)
132 | 
133 | ## ----expsum--------------------------------------------------------------
134 | expsum <- function(x)
135 | {	oldsum <- 0
136 | 	newsum <- n <- term <- 1
137 | 	while( oldsum != newsum )
138 | 	{	oldsum <- newsum
139 | 		term <- term * x/n
140 | 		n <- n + 1
141 | 		newsum <- oldsum + term 
142 | 	}
143 | 	list(iteration = n, summe = newsum)
144 | }
145 | 
146 | ## ----error, cache=TRUE---------------------------------------------------
147 | x <- 1:1000
148 | absError <- sapply (x, function (x) abs (exp (x) - expsum (x) $ sum))
149 | relError <- absError / exp (x)
150 | 
151 | ## ----error2--------------------------------------------------------------
152 | roundingError <- sapply(x, function(x) 2^(-53)*exp(x))
153 | 
154 | ## ----B05113_03_02--------------------------------------------------------
155 | plot(x[1: 600], roundingError[1:600], log = "xy", xlab = "log (x)",
156 | ylab = "log (rounding errors)", type = "l")
157 | 
158 | ## ----B05113_03_03--------------------------------------------------------
159 | x <- seq(0, 20, by=0.1)
160 | iter <- sapply(x, function(x) expsum(x)$iteration)
161 | plot(x, iter, xlab="x", ylab="No. of iterationen until convergence", type="l") 
162 | 
163 | ## ----condition-----------------------------------------------------------
164 | library("Matrix")
165 | ## reciprocal approximate condition number
166 | rcond(Hilbert(9)) ## worse
167 | ## reciprocal condition number
168 | x1 <- cbind(1, 1:10)
169 | head(x1, 3)
170 | rcond(x1) ## much better
171 | 
172 | ## ------------------------------------------------------------------------
173 | sessionInfo()
174 | 
175 | 


--------------------------------------------------------------------------------
/Chapter 5/chapter5.R:
--------------------------------------------------------------------------------
  1 | ## ----rosenbroeck---------------------------------------------------------
  2 | mountains <- function(v) { 
  3 |   (1 - v[1])^2 + 100 * (v[2] - v[1]*v[1])^2 +
  4 |    0.3*(0.2 - 2*v[2])^2 + 100 * (v[1] - v[2]*v[2])^2 - 
  5 |    0.5*(v[1]^2 +5*v[2]^2) 
  6 | }
  7 | 
  8 | ## ---- eval=FALSE---------------------------------------------------------
  9 | ## library("animation")
 10 | ## grad.desc()
 11 | 
 12 | ## ---- eval=FALSE---------------------------------------------------------
 13 | ## ani.options(nmax = 70)
 14 | ## par(mar = c(4, 4, 2, 0.1))
 15 | ## f2 = function(x, y) sin(1/2 * x^2 - 1/4 * y^2 + 3) * cos(2 * x + 1 -
 16 | ##   exp(y))
 17 | ## grad.desc(f2, c(-2, -2, 2, 2), c(-1, 0.5), gamma = 0.3, tol = 1e-04)
 18 | 
 19 | ## ----B05113_05_02--------------------------------------------------------
 20 | n <- 300
 21 | ## to define a grid
 22 | x <- seq(-1, 2, length.out = n)
 23 | y <- seq(-1, 2, length.out = n)
 24 | ## evaluate on each grid point
 25 | z <- mountains(expand.grid(x, y))
 26 | ## contour plot
 27 | par(mar = c(4,4,0.5,0.5))
 28 | contour(x, y,  matrix(log10(z), length(x)), 
 29 |         xlab = "x", ylab = "y", nlevels = 20)
 30 | ## starting value
 31 | sta <- c(0.5,-1)
 32 | points(sta[1], sta[2], cex = 2, pch = 20)
 33 | ## solutions for each of 20 steps
 34 | sol <- matrix(, ncol=2, nrow = 21)
 35 | sol[1, ] <- sta
 36 | for(i in 2:20){
 37 | 	sol[i, ] <- nlm(mountains, sta, iterlim = i)$est
 38 | }
 39 | ## optimal solution
 40 | sol[21, ] <- nlm(mountains, sta)$est
 41 | points(sol[21, 1], sol[21, 2], cex = 3, col = "red", pch = 20)
 42 | ## path visually
 43 | lines(sol, pch=3, type="o")
 44 | ## now let's start better (dashed line)
 45 | sta <- c(0,-1)
 46 | for(i in 2:20){
 47 | 	sol[i, ] <- nlm(mountains, sta, iterlim = i)$est
 48 | }
 49 | sol[1, ] <- sta
 50 | sol[21, ] <- nlm(mountains, sta)$est
 51 | points(sta[1], sta[2], cex = 2, pch = 20)
 52 | points(sol[21, 1], sol[21, 2], cex = 3, col = "red", pch = 20)
 53 | lines(sol, pch=3, type="o")
 54 | 
 55 | ## ----B05113_05_03, cache=FALSE-------------------------------------------
 56 | ## wrapper for all methods of optim
 57 | optims <- function(x, meth = "Nelder-Mead", start = c(0.5, -1)){
 58 |   sol <- matrix(, ncol = 2, nrow = 21)
 59 |   sol[1, ] <- start
 60 |   for(i in 2:20){
 61 |     sol[i, ] <- optim(start, mountains, method = meth, 
 62 |                       control = list(maxit=i))$par
 63 |   }
 64 |   sol[21,] <- optim(start, mountains)$par
 65 |   points(start[1], start[2], pch=20, cex = 2)
 66 |   points(sol[21, ], sol[21, ], pch = 20, col = "red", cex = 3) 
 67 |   lines(sol[, 1], sol[, 2], type = "o", pch = 3)
 68 | }
 69 | ## plot lines for all methods
 70 | par(mar=c(4,4,0.5,0.5))
 71 | contour(x, y,  matrix(log10(z), length(x)), xlab = "x", ylab = "y", nlevels = 20)
 72 | optims()  # Nelder-Mead 
 73 | optims("BFGS")
 74 | optims("CG")
 75 | optims("L-BFGS-B")
 76 | optims("SANN")
 77 | optims("Brent")
 78 | optims(start = c(1.5,0.5))
 79 | 
 80 | ## ----B05113_05_04, cache=TRUE--------------------------------------------
 81 | ## define grid
 82 | n <- 1500
 83 | set.seed(1234567)
 84 | x1 <- runif(n, min = -2, max = 5)
 85 | y1 <- runif(n, min = -2, max = 5) 
 86 | z1 <- matrix(, ncol = n, nrow = n)
 87 | ## evaluate on each grid point
 88 | for(i in 1:n){
 89 |   for(j in 1:n){
 90 | 	  z1[i,j] <- mountains(c(x1[i], y1[j]))
 91 |   }
 92 | }
 93 | ## determine optima
 94 | w <- which(z1 == min(z1), arr.ind=TRUE)
 95 | ## plot results
 96 | par(mar=c(4,4,0.5,0.5))
 97 | contour(x, y,  matrix(log10(z), length(x)), xlab = "x", ylab = "y", nlevels = 20)
 98 | points(x1[w[1]], y1[w[2]], pch = 20, col = "red", cex = 3)
 99 | points(x1, y1, pch=3)
100 | 
101 | ## ----B05113_05_05, cache=TRUE--------------------------------------------
102 | library("RCEIM")
103 | set.seed(123)
104 | sol <- best <- list() 
105 | ## save solution for each step
106 | for(i in 2:20){
107 |   a <- ceimOpt(mountains, nParam = 2, maxIter = i)
108 |   sol[[i]] <- a$EliteMembers 
109 |   best[[i]] <- a$BestMember
110 | }
111 | ## plot the results for each step
112 | par(mar=c(4,4,0.5,0.5))
113 | contour(x, y,  matrix(log10(z), length(x)), xlab = "x", ylab = "y", nlevels = 20)
114 | greys <- grey(rev(2:20 / 20 - 0.099))
115 | for(i in 2:20){
116 |   points(sol[[i]][,1], sol[[i]][,2], col = greys[i])
117 |   points(best[[i]][1], best[[i]][2], col = "red", pch = 3)  
118 | }
119 | points(best[[i]][1], best[[i]][2], col = "red", pch = 20, cex = 3)
120 | 
121 | ## ----randomwalk----------------------------------------------------------
122 | ## Simple random walk Metropolis Hastings:
123 | rmh <- function(n = 20, start = c(0,-0.5), stepmult = 10){
124 |   x <- matrix(, ncol = 2, nrow = n)
125 |   x[1, ] <- start 
126 |   sol <- mountains(start)
127 |   for(i in 2:n){
128 |     x[i, ] <- x[i-1, ] + rmvnorm(1, mean = c(0, 0), 
129 |                             sigma = stepmult * diag(2) / n)
130 |     solnew <- mountains(x[i, ])
131 |     # accept only a better solution:
132 |     if(solnew > sol) x[i, ] <- x[i-1, ]
133 |     if(solnew < sol) sol <- solnew
134 |   }
135 |   return(x)
136 | }
137 | 
138 | ## ----walk----------------------------------------------------------------
139 | library("mvtnorm")
140 | set.seed(12345)
141 | n <- 200
142 | x1 <- rmh(n, start = c(1.5,0))
143 | x2 <- rmh(n, start = c(1.5,0))
144 | 
145 | ## ----B05113_05_06--------------------------------------------------------
146 | par(mar=c(4,4,0.5,0.5))
147 | contour(x, y,  matrix(log10(z), length(x)), xlab = "x", ylab = "y", nlevels = 20)
148 | points(x1[1, 1], x1[1, 2], pch = 4, cex = 3)
149 | points(x2[n, 1], x2[n, 2], pch = 20, col = "red", cex = 3)
150 | points(x1[n, 1], x1[n, 2], pch = 20, col = "red", cex = 3)
151 | lines(x1[, 1], x1[, 2], type = "o", pch = 3)
152 | lines(x2[, 1], x2[, 2], type = "o", col = "blue", lty = 2)
153 | 
154 | ## ----B05113_05_07code----------------------------------------------------
155 | stoGrad <- function(start = c(0, -0.5), j = 1500, p = 0.1){
156 |   theta <- matrix(start, ncol=2)
157 |   diff <- iter <- 1
158 |   alpha <- sapply(1:100, function(x) 1 / (j+1) )
159 |   beta  <- sapply(1:100, function(x) 1 / (j+1)^(p) )
160 |   
161 |   while( diff > 10^-5 & !is.nan(diff) & !is.na(diff) ){
162 |     zeta <- rnorm(2)
163 |     zeta <- zeta / sqrt(t(zeta) %*% zeta)
164 |     grad <- alpha[iter] * zeta * (mountains(theta[iter, ] + beta[iter] * zeta) - 
165 |              mountains(theta[iter, ] - beta[iter] * zeta)) / beta[iter]
166 |     theta <- rbind(theta, theta[iter, ] - grad)
167 |     diff <- sqrt(t(grad) %*% grad )
168 |     iter <- iter + 1
169 |   } 
170 |   list(theta = theta[1:(iter-1), ], diff = diff, iter = iter-1)
171 | }
172 | 
173 | ## ----B05113_05_07--------------------------------------------------------
174 | set.seed(123)
175 | s1 <- stoGrad()
176 | par(mar=c(4,4,0.5,0.5))
177 | contour(x, y,  matrix(log10(z), length(x)), xlab = "x", ylab = "y", nlevels = 20)
178 | plotLine <- function(x, ...){
179 |   lines(x$theta[,1], x$theta[,2], type = "o", ...)
180 |   points(x$theta[x$iter, 1], x$theta[x$iter, 1], pch = 20, col = "red", cex = 3)
181 | }
182 | plotLine(s1, pch = 3)
183 | points(0, -0.5, pch = 20, cex = 1.5)  
184 | plotLine(stoGrad(), col = "blue", pch = 4)
185 | plotLine(stoGrad(start = c(1.5, 0)), pch = 3, lty = 2)
186 | plotLine(stoGrad(start = c(1.5, 0)), col = "blue", pch = 4, lty = 2)
187 | points(1.5, 0, pch = 20, cex = 1.5)  
188 | 
189 | ## ----B05113_05_08--------------------------------------------------------
190 | set.seed(123)
191 | s1 <- stoGrad(p = 2.5)
192 | par(mar=c(4,4,0.5,0.5))
193 | contour(x, y,  matrix(log10(z), length(x)), xlab = "x", ylab = "y", nlevels = 20)
194 | plotLine <- function(x, ...){
195 |   lines(x$theta[,1], x$theta[,2], type = "o", ...)
196 |   points(x$theta[x$iter, 1], x$theta[x$iter, 1], pch = 20, col = "red", cex = 3)
197 | }
198 | plotLine(s1, pch = 3)
199 | points(0, -0.5, pch = 20, cex = 1.5)  
200 | plotLine(stoGrad(p = 2.5), col = "blue", pch = 4)
201 | plotLine(stoGrad(start = c(1.5, 0), j=1500, p=2.5), pch = 3, lty = 2)
202 | plotLine(stoGrad(start = c(1.5, 0), j=1500, p=2.5), col = "blue", pch = 4, lty = 2)
203 | points(1.5, 0, pch = 20, cex = 1.5)  
204 | 
205 | ## ----B05113_05_09, warning=FALSE, message=FALSE--------------------------
206 | library("nloptr")
207 | set.seed(123)
208 | ## mountains function with modified function parameters
209 | mountains1 <- 
210 | function(x) ((1 - x[1])^2 + 100 * (x[2] - x[1]*x[1])^2 +
211 |    0.3*(0.2 - 2*x[2])^2 + 100 * (x[1] - x[2]*x[2])^2 - 
212 |    0.5*(x[1]^2 +5*x[2]^2))
213 | x0 <- c(0.5, -1)
214 | lb <- c(-3, -3)
215 | ub <- c(3, 3)
216 | sol <- matrix(, ncol=2,nrow=21)
217 | ## solution on each step
218 | for(i in 1:20){
219 |   sol[i, ] <- isres(x0 = x0, fn = mountains1, lower = lb, upper = ub, maxeval = i)$par
220 | }
221 | par(mar=c(4,4,0.5,0.5))
222 | contour(x, y,  matrix(log10(z), length(x)), xlab = "x", ylab = "y", nlevels = 20)
223 | ## start
224 | points(sol[1, 1], sol[1, 2], pch = 20, cex = 2)
225 | ## optima found
226 | sol[21,] <- isres(x0 = x0, fn = mountains1, lower = lb, upper = ub)$par
227 | points(sol[21, 1], sol[21, 2], pch = 20, col = "red", cex = 3)
228 | ## way to optima
229 | lines(sol[,1], sol[,2], type = "o", pch = 3)
230 | 
231 | ## ------------------------------------------------------------------------
232 | sessionInfo()
233 | 
234 | 


--------------------------------------------------------------------------------
/Chapter 9/chapter9.R:
--------------------------------------------------------------------------------
  1 | ## ---- echo=FALSE---------------------------------------------------------
  2 | knitr::opts_chunk$set(cache=TRUE)
  3 | 
  4 | ## ------------------------------------------------------------------------
  5 | y <- matrix(c(11, 22, 16, 24, 17, NA), nrow=2)
  6 | y
  7 | 
  8 | ## ------------------------------------------------------------------------
  9 | m <- mean(y, na.rm = TRUE)
 10 | m
 11 | y[2,3] <- m
 12 | y
 13 | 
 14 | ## ------------------------------------------------------------------------
 15 | ## stop criterion
 16 | eps <- 0.001
 17 | ## intitialisations
 18 | yalt <- y
 19 | n <- 0
 20 | converged <- FALSE
 21 | ## iteration
 22 | while(!converged){
 23 | 	n <- n + 1
 24 | 	yalt <- y
 25 | 	m1 <- mean(y)
 26 | 	## E-step (estimate parameters)
 27 | 	a <- rowMeans(y) - m1
 28 | 	b1 <- colMeans(y) - m1	
 29 | 	## M-step (update y23)
 30 | 	y[2, 3] <- m1 + a[2] + b1[3]		
 31 | 	## stop criterion
 32 | 	converged <- (abs(y[2, 3] - yalt[2, 3]) < eps)		
 33 | }
 34 | list(yImp = y, iterations = n)
 35 | 
 36 | ## ------------------------------------------------------------------------
 37 | distMan <- function(x, centers){
 38 |   if(class(x) == "data.frame") x <- as.matrix(x)
 39 | 	d <- matrix(0, nrow=nrow(x), ncol=nrow(centers))
 40 | 	## dist center to observations for each cluster
 41 | 	for(k in 1:nrow(centers)){
 42 | 		d[,k] <- abs( colSums((t(x) - centers[k,])) )
 43 | 	}
 44 | 	return(d)
 45 | }
 46 | 
 47 | ## ------------------------------------------------------------------------
 48 | means <- function(x, cluster){
 49 | 	cen <- matrix(NA, nrow=max(cluster), ncol <- ncol(x))
 50 | 	## cluster means for each cluster
 51 | 	for(n in 1:max(cluster)){
 52 | 		cen[n,] <- apply(x[cluster==n,], 2, median)
 53 | 	}
 54 | 	return(cen)
 55 | }
 56 | 
 57 | ## ------------------------------------------------------------------------
 58 | my_kmeans <- function(x, k, clmeans = means, distance = distMan, iter = 99999, seed = NULL){
 59 |   if(!is.null(seed)) set.seed(seed)
 60 | 	cent <- newcent <- x[sample(1:nrow(x), size=k), ]
 61 | 	oldclust <- 0
 62 | 	j <- 0
 63 | 	for(i in 1:iter){ # better: while()
 64 | 	  j <- j + 1
 65 | 	  cent <- newcent
 66 | 	  ## M-step
 67 | 		dist <- distance(x, cent)
 68 | 		clust <- max.col(-dist)
 69 | 		## E-step
 70 | 		newcent <- clmeans(x, clust)
 71 | 		if(all(clust == oldclust)) break()
 72 | 		oldclust <- clust
 73 | 	}                  
 74 | 	res <- list(centers = cent, 
 75 | 	            cluster = factor(clust), 
 76 | 	            iterations = j)
 77 | 	return(res)
 78 | }
 79 | 
 80 | ## ----B05113_09_01--------------------------------------------------------
 81 | data(Nclus, package = "flexclust")
 82 | x <- data.frame(Nclus)
 83 | library("ggplot2")
 84 | qplot(X1, X2, data=data.frame(Nclus)) + 
 85 |   xlab("") + ylab("") + theme_bw()
 86 | 
 87 | ## ---- warning=FALSE,message=FALSE----------------------------------------
 88 | set.seed(123456)
 89 | cl1 <- kmeans(x, centers = 4, iter.max = 1, algorithm = "MacQueen")
 90 | set.seed(123456)
 91 | cl2 <- kmeans(x, centers = 4, iter.max = 2, algorithm = "MacQueen")
 92 | set.seed(123456)
 93 | cl3 <- kmeans(x, centers = 4, iter.max = 3, algorithm = "MacQueen")
 94 | set.seed(123456)
 95 | cl4 <- kmeans(x, centers = 4, algorithm = "MacQueen")
 96 | 
 97 | ## ------------------------------------------------------------------------
 98 | cl1$centers
 99 | 
100 | ## ---- echo=FALSE---------------------------------------------------------
101 | initstep <- function(x = Nclus, cluster = cl1){
102 |    plot(x, main = "E-step (1)", cex.main=1.8)
103 |    points(cluster$center[, 1], cluster$center[, 2], cex = 3, pch = 18) 
104 | }
105 | estep <- function(x = Nclus, cluster = cl2, text = "E-step (1)"){
106 |    plot(x, col = cluster$cluster, pch = as.numeric(cluster$cluster), main=text,
107 |        cex.main=1.8)
108 |    points(cluster$center[, 1], cluster$center[, 2], cex = 3, pch = 18, col = 1:4)
109 | }
110 | mstep <- function(x = Nclus, cluster = cl2, cluster2 = cl2, text="M-step (1)"){
111 |   plot(x, col = cluster2$cluster, pch = as.numeric(cluster2$cluster), main=text,
112 |        cex.main=1.8)
113 |   points(cluster2$center[, 1], cluster2$center[, 2], cex = 3, pch = 18, col = 1:4)
114 |     segments(x0 = Nclus[cluster$cluster == 1, 1], x1 = cluster$center[1,1],
115 |            y0 = Nclus[cluster$cluster == 1, 2], y1 = cluster$center[1,2], col = "grey")
116 |   segments(x0 = Nclus[cluster$cluster == 2, 1], x1 = cluster$center[2,1],
117 |            y0 = Nclus[cluster$cluster == 2, 2], y1 = cluster$center[2,2], col = "grey")
118 |   segments(x0 = Nclus[cluster$cluster == 3, 1], x1 = cluster$center[3,1],
119 |            y0 = Nclus[cluster$cluster == 3, 2], y1 = cluster$center[3,2], col = "grey")
120 |   segments(x0 = Nclus[cluster$cluster == 4, 1], x1 = cluster$center[4,1],
121 |            y0 = Nclus[cluster$cluster == 4, 2], y1 = cluster$center[4,2], col = "grey")
122 |   points(x=Nclus[,1], y=Nclus[,2], col = cluster2$cluster, pch = as.numeric(cluster2$cluster))
123 |   points(cluster$center[, 1], cluster$center[, 2], cex = 3, pch = 18, col = 1:4)
124 | }
125 | 
126 | ## ----B05113_09_02, fig.width=9, fig.height=9, echo=FALSE-----------------
127 | par(mfrow = c(3,2), mar = c(1,1,3,0.1), axis=FALSE, xaxt="n", yaxt="n")
128 | initstep()
129 | mstep(cluster = cl1, cluster2 = cl1)
130 | estep(cluster = cl2, text="E-step (2)")
131 | mstep(cluster = cl2, cluster2=cl2, text="M-step (2)")
132 | estep(cluster = cl3, text="E-step (converged)")
133 | mstep(cluster = cl3, cluster2=cl3, text="E-step (converged)")
134 | 
135 | ## ---- message=FALSE, warning=FALSE---------------------------------------
136 | library("MASS")
137 | library("robustbase")
138 | library("VIM")
139 | data("sleep")
140 | str(sleep)
141 | 
142 | ## ------------------------------------------------------------------------
143 | apply(sleep, 2, function(x) any(is.na(x)))
144 | 
145 | ## ---- message=FALSE, warning=FALSE---------------------------------------
146 | ## index of missing values
147 | ind <- data.frame(is.na(sleep))
148 | ## initialization
149 | sleep <- kNN(sleep)  
150 | ## overwrite missing initialization with bad choice
151 | sleep$Sleep[ind$Sleep] <- 2240  # bad initialization
152 | ## initialized missing values in variable sleep
153 | sleep$Sleep[ind$Sleep]
154 | 
155 | ## ---- message=FALSE, warning=FALSE---------------------------------------
156 | ## E-step (1)
157 | lm1 <- lm(Sleep ~ log(BodyWgt) + log(BrainWgt) + NonD + Danger, data = sleep)
158 | ## M-step (1)
159 | sleep$Sleep[ind$Sleep] <- predict(lm1)[ind$Sleep]
160 | ## print of updated missing values
161 | sleep$Sleep[ind$Sleep]
162 | 
163 | ## ---- message=FALSE, warning=FALSE---------------------------------------
164 | ## E-step (2)
165 | lm1 <- lm(Sleep ~ log(BodyWgt) + log(BrainWgt) + NonD + Danger, data = sleep)
166 | ## M-step (2)
167 | sleep$Sleep[ind$Sleep] <- predict(lm1)[ind$Sleep]
168 | ## print of updated missing values
169 | sleep$Sleep[ind$Sleep]
170 | 
171 | ## ------------------------------------------------------------------------
172 | EMregSleep <- function(method = lm, eps = 0.001, init = "bad"){
173 |   ## index of missing values
174 |   ind <- is.na(sleep)
175 |   colnames(ind) <- colnames(sleep)
176 |   indsleep <- ind[, "Sleep"]
177 |   ## initialization
178 |   if(init == "bad"){
179 |     sleep <- kNN(sleep, imp_var = FALSE)  
180 |     sleep$Sleep[indsleep] <- 2240  # bad initialization
181 |   }
182 |   if(init == "worst"){
183 |     sleep[ind] <- 2240  # worst initialization    
184 |   }
185 |   iteration <- 0
186 |   criteria <- 99999
187 |   while(criteria > eps){
188 |     iteration <- iteration + 1
189 |     prev_sol <- sleep$Sleep[indsleep]
190 |     ## E-step
191 |     lm1 <- method(Sleep ~ log(BodyWgt) + log(BrainWgt) + NonD + Danger, 
192 |                   data = sleep)
193 |     ## M-step
194 |     sleep$Sleep[indsleep] <- predict(lm1)[indsleep]
195 |     criteria <- sqrt(sum((prev_sol - sleep$Sleep[indsleep])^2))
196 |   }
197 |   res <- list("imputed" = sleep, 
198 |               "iteration" = iteration, 
199 |               lastmodel = lm1)
200 |   return(res)
201 | }
202 | 
203 | ## ------------------------------------------------------------------------
204 | data("sleep")
205 | sleepImp <- EMregSleep()
206 | missVals <- sleepImp$imputed$Sleep[ind$Sleep]
207 | missVals
208 | sleepImp$iteration
209 | 
210 | ## ------------------------------------------------------------------------
211 | missVals + sample(residuals(sleepImp$lastmodel), length(missVals))
212 | 
213 | ## ---- message=FALSE, warning=FALSE---------------------------------------
214 | data("sleep")
215 | ## OLS regression
216 | lm_ols <- EMregSleep(method = lm, init = "worst")
217 | ## M-estimation
218 | lm_rlm <- EMregSleep(method = rlm, init = "worst", eps= 0.01)
219 | lm_ols$imputed$Sleep[ind[, "Sleep"]]
220 | lm_rlm$imputed$Sleep[ind[, "Sleep"]]
221 | 
222 | ## ------------------------------------------------------------------------
223 | data("sleep")
224 | sleepImp <- irmi(sleep)
225 | sleepImp[ind[, "Sleep"], "Sleep"]
226 | 
227 | ## ------------------------------------------------------------------------
228 | library("mice")
229 | data("sleep")
230 | em_mice <- mice(sleep, m = 1)
231 | em_mice$imp$Sleep
232 | ## now with bad intitialisation in predictors
233 | sleep[is.na(sleep)] <- 2240
234 | sleep$Sleep[ind[, "Sleep"]] <- NA
235 | em_mice <- mice(sleep, m = 1)
236 | em_mice$imp$Sleep
237 | 
238 | ## ------------------------------------------------------------------------
239 | args(irmi)
240 | 
241 | ## ------------------------------------------------------------------------
242 | sessionInfo()
243 | 
244 | 


--------------------------------------------------------------------------------
/Chapter 6/chapter6.R:
--------------------------------------------------------------------------------
  1 | ## ---- message=FALSE, warning=FALSE, tidy=FALSE, eval=TRUE----------------
  2 | library("RCurl")
  3 | URL <- "https://www.national-lottery.co.uk/results/euromillions/draw-history/csv"
  4 | lotto <- read.csv(textConnection(getURL(URL)))
  5 | 
  6 | ## ------------------------------------------------------------------------
  7 | str(lotto)
  8 | 
  9 | ## ----B05113_06_01, fig.align='center'------------------------------------
 10 | numbers <- unlist(c(lotto[,2:5]))
 11 | library("ggplot2")
 12 | qplot(factor(numbers), xlab = "winning numbers")  + 
 13 |   theme_bw() + 
 14 |   theme(axis.text.x=element_text(angle=90)) + 
 15 |   scale_y_continuous(breaks=0:10)
 16 | 
 17 | ## ------------------------------------------------------------------------
 18 | simulateDice <- function(n=200){
 19 |   A <- c(2, 4, 6)
 20 |   B <- 1:4
 21 |   C <- 3:6
 22 |   AB <- intersect(A, B)
 23 |   AC <- intersect(A, C)
 24 |   BC <- intersect(B, C)
 25 |   ABC <- intersect(AB, AC)
 26 |   df <- data.frame(d=rep(0,n), A=rep(0,n), B=rep(0,n), C=rep(0,n),
 27 |                    AB=rep(0,n), AC=rep(0,n), BC=rep(0,n), ABC=rep(0,n))
 28 |   s <- sample(1:6, n, replace=TRUE)
 29 |   df[s %in% ABC, "ABC"] <- 1 
 30 |   df[s %in% BC & !(df$ABC == 1), "BC"] <- 1
 31 |   df[s %in% AC & !(apply(df[, c("BC","ABC")], 1, function(x) any(x == 1))), "AC"] <- 1
 32 |   df[s %in% AB & !(apply(df[, c("AC","BC","ABC")], 1, function(x) any(x == 1))), "AB"] <- 1
 33 |   df[s %in% C & !(apply(df[, c("AB","AC","BC","ABC")], 1, function(x) any(x == 1))), "C"] <- 1
 34 |   df[s %in% B & !(apply(df[, c("C","AB","AC","BC","ABC")], 1, function(x) any(x == 1))), "B"] <- 1
 35 |   df[s %in% A & !(apply(df[, c("B","C","AB","AC","BC","ABC")], 1, function(x) any(x == 1))), "A"] <- 1
 36 |   df[, "d"] <- s
 37 |   return(df)
 38 | }
 39 | set.seed(123)
 40 | simulateDice(10)
 41 | 
 42 | ## ------------------------------------------------------------------------
 43 | set.seed(123)
 44 | s100 <- simulateDice(100)
 45 | ## count the outcomes regarding the different events:
 46 | colSums(s100[, 2:ncol(s100)])
 47 | 
 48 | ## ------------------------------------------------------------------------
 49 | set.seed(123)
 50 | s1000 <- simulateDice(1000)
 51 | colSums(s1000[, 2:ncol(s1000)])
 52 | 
 53 | ## ------------------------------------------------------------------------
 54 | s1000 <- simulateDice(1000)
 55 | colSums(s1000[, 2:ncol(s1000)])
 56 | 
 57 | ## ----B05113_06_03, echo=FALSE, fig.align='center'------------------------
 58 | a <- c(0.65, 0.62, 0.41, 0.43, 0.61, 0.49, 0.44, 0.58, 0.47, 0.53, 0.52, 0.48, 0.51)
 59 | plot(1:length(a), a, ylim = c(0,1), xlab="n", yaxt="n", ylab="")
 60 | #axis(2, at=0.5, labels = "a", las=1)
 61 | abline(h=0.5 + 0.03)
 62 | abline(h=0.5 - 0.03)
 63 | axis(2, at=0.5+0.03, labels = expression(a + epsilon), las=1)
 64 | axis(2, at=0.5-0.03, labels = expression(a - epsilon), las=1)
 65 | polygon(x=c(0,14,14,0,0), c(0.5-0.03,0.5-0.03,0.5+0.03,0.5+0.03,0.5-0.03), col="lightgrey")
 66 | points(1:length(a), a)
 67 | d <- dnorm(seq(-5,5,length.out = 100)) *10 + 6
 68 | df <- data.frame(d=d, s=seq(-0.2+0.5, 0.2+0.5, length.out = length(d)))
 69 | lines(df$d, df$s)
 70 | abline(v=6, col="grey")
 71 | d <- dnorm(seq(-10,10,length.out = 100)) *3.5 + 2
 72 | df <- data.frame(d=d, s=seq(-1.4+0.5, 1.4+0.5, length.out = length(d)))
 73 | lines(df$d, df$s)
 74 | abline(v=2, col="grey")
 75 | 
 76 | ## ------------------------------------------------------------------------
 77 | dbinom(x = 0, size = 1, prob = 0.5)
 78 | 
 79 | ## ------------------------------------------------------------------------
 80 | sample(c("head", "number"), size = 1)
 81 | # alternativ:
 82 | rbinom(n = 1, size = 1, prob = 0.5)
 83 | 
 84 | ## ------------------------------------------------------------------------
 85 | simCoin <- function(n, p = 0.5, repl = 1){
 86 |   stopifnot(n > 0 | !is.vector(n) | p < 0 | p > 0 | !is.vector(repl))
 87 |   ## function for one simulation
 88 |   r <- function(){
 89 |     res <- rbinom(n, 1, p)
 90 |     tosses <- 1:n
 91 |     pA <- cumsum(res) / 1:n
 92 |     abserror <- abs(pA - p) 
 93 |     return(data.frame(res = res, tosses = tosses, pA = pA, abserror = abserror))
 94 |   }
 95 |   ## simulation
 96 |   df <- r()
 97 |   if(repl > 1){
 98 |     for(i in 2:repl){
 99 |       df <- rbind(df, r())
100 |     }
101 |   }
102 |   ## return
103 |   df$repl <- rep(1:repl, each = n)
104 |   ll <- list(res = df$res, tosses = df$tosses, pA = df$pA, 
105 |              absfehler = df$abserror, repl = as.factor(df$repl))
106 |   class(ll) <- "Coin"
107 |   return(ll)
108 | }
109 | ## print 
110 | print.Coin <- function(x, ..., s = NULL){
111 |   if(!is.null(s)){
112 |   cat("After", s, "random draws: the estimated P(A) =", x$pA[s], "\nand the absolute error", x$absfehler[s], "\n")
113 |   } else {
114 |     m <- max(x$tosses)
115 |   cat("After", m, "random draws: the estimated P(A) =", x$pA[m], "\nand the absolute error", x$absfehler[m], "\n")   
116 |   }  
117 | }
118 | 
119 | ## ------------------------------------------------------------------------
120 | ## for reproducibility
121 | set.seed(1234) 
122 | # 10 throws
123 | simCoin(10)
124 | 
125 | ## ---- cache=TRUE---------------------------------------------------------
126 | set.seed(1234) 
127 | sim <- simCoin(5000)
128 | print(sim, s=100)
129 | print(sim, s=1000)
130 | print(sim, s=5000)
131 | 
132 | ## ----lehr1---------------------------------------------------------------
133 | plot.Coin <- function(x, y, ...){
134 |   df <- data.frame(res = x$res, tosses = x$tosses, pA = x$pA, repl=x$repl)
135 |   if(length(unique(df$repl)) == 1){
136 |      ggplot(df, aes(x=tosses, y=pA)) + 
137 |       geom_line() + geom_abline(intercept = 0.5) + ylim(c(0,1)) +  
138 |       theme(legend.position="none")
139 |   } else if(length(unique(df$repl)) > 10){
140 |     gg <- ggplot(df, aes(x=tosses, y=pA, group=repl)) + 
141 |       geom_line() + geom_abline(intercept = 0.5) + ylim(c(0,1)) 
142 |     ## add median line and confidence interval
143 |     dfwide <- reshape2::dcast(df, tosses ~ repl, value.var="pA")
144 |     dfwide <- dfwide[, 2:ncol(dfwide)]
145 |     med <- apply(dfwide, 1, median)
146 |     q025 <- apply(dfwide, 1, quantile, 0.025)
147 |     q975 <- apply(dfwide, 1, quantile, 0.975)
148 |     stat <- data.frame(med=med, q025=q025, q975=q975, 
149 |                        n=1:max(x$tosses),
150 |                        repl=max(as.numeric(df$repl)))
151 |     gg + 
152 |       geom_line(data=stat, aes(x = n, y = med), colour = "red", size=1) + 
153 |       geom_line(data=stat, aes(x = n, y = q025), colour = "orange", size=0.7) +
154 |       geom_line(data=stat, aes(x = n, y = q975), colour = "orange", size=0.7) +
155 |       theme(legend.position="none")
156 |   } else {
157 |    ggplot(df, aes(x=tosses, y=pA, colour = repl)) + 
158 |       geom_line() + geom_abline(intercept = 0.5) + ylim(c(0,1))    
159 |   }
160 | }
161 | 
162 | ## ----B05113_06_04--------------------------------------------------------
163 | plot(sim)
164 | 
165 | ## ----B05113_06_05, cache=TRUE--------------------------------------------
166 | set.seed(1234) 
167 | sim <- simCoin(n = 5000, repl = 10)
168 | plot(sim)
169 | 
170 | ## ----B05113_06_06, cache=TRUE--------------------------------------------
171 | sim <- simCoin(n = 5000, repl = 1000)
172 | plot(sim)
173 | 
174 | ## ----B05113_06_07, fig.height=7------------------------------------------
175 | plotbinomcoin <- function(n){
176 |   plot(0:n/n, dbinom(0:n, n, 0.5), type = "h", 
177 |        xlab = paste("relative frequencies (n =", n,")"), 
178 |        ylab = "p")
179 | }
180 | par(mar = c(4,4,0.5,0.5), mfrow = c(4,2))
181 | plotbinomcoin(10)
182 | plotbinomcoin(20)
183 | plotbinomcoin(40)
184 | plotbinomcoin(80)
185 | plotbinomcoin(160)
186 | plotbinomcoin(320)
187 | plotbinomcoin(5000)
188 | plotbinomcoin(10000)
189 | 
190 | ## ------------------------------------------------------------------------
191 | dbinom(0, 1, 0.5)
192 | 
193 | ## ------------------------------------------------------------------------
194 | set.seed(10230)
195 | s <- sample(c(FALSE,TRUE), 5000, replace = TRUE)
196 | s[1:10]
197 | ev <- function(x){
198 |   res <- c(length(x), sum(x), sum(x) / length(x), abs(sum(x) / length(x) - 0.5))
199 |   names(res) <- c("n", "h(A)", "p(A)", "|p(A) - 0.5|")
200 |   return(res)
201 | }
202 | # summary of first 10 outcomes
203 | ev(s[1:10])
204 | 
205 | ## ------------------------------------------------------------------------
206 | # summary of first 100 outcomes
207 | ev(s[1:100])
208 | # summary of first 1000 outcomes
209 | ev(s[1:1000])
210 | # summary of first 5000 outcomes
211 | ev(s)
212 | 
213 | ## ----B05113_06_01x-------------------------------------------------------
214 | p <- numeric(length(s))
215 | for(i in 1:length(s)){
216 |   p[i] <- sum(s[1:i]) / i
217 | }
218 | plot(p, type = "l", xlab = "number of coin flips", ylab = "p(A)")
219 | abline(h = 0.5)
220 | 
221 | ## ----B05113_06_02x-------------------------------------------------------
222 | plotbinomcoin <- function(n){
223 |   plot(0:n/n, dbinom(0:n, n, 0.5), type = "h", xlab = "relative frequency", ylab = "p")
224 | }
225 | par(mar = c(4,4,0.5,0.5), mfrow = c(2,2))
226 | plotbinomcoin(20)
227 | plotbinomcoin(100)
228 | plotbinomcoin(1000)
229 | plotbinomcoin(5000)
230 | 
231 | ## ------------------------------------------------------------------------
232 | newlines <- function(n = 5000){
233 |   s <- sample(c(FALSE,TRUE), n, replace = TRUE)
234 |   for(i in 1:length(s)){ 
235 |     p[i] <- mean(s[1:i])
236 |   }
237 |   p
238 | }
239 | 
240 | ## ----B05113_06_03x, results='hide'---------------------------------------
241 | plot(p, type = "l", xlab = "number of coin flips", ylim = c(0,1), ylab = "p(A)")
242 | abline(h = 0.5)
243 | replicate(10, lines(newlines()))
244 | 
245 | ## ----B05113_06_04x, cache=TRUE-------------------------------------------
246 | plot(p, type = "l", xlab = "number of coin flips", ylim = c(0,1), ylab = "p(A)")
247 | abline(h = 0.5)
248 | ## calculate 1000 sequences
249 | n1 <- replicate(1000, newlines())
250 | med <- apply(n1, 1, median)
251 | q1 <- apply(n1, 1, quantile, 0.025)
252 | q2 <- apply(n1, 1, quantile, 0.975)
253 | apply(n1, 2, lines)
254 | lines(med, col = "red")
255 | lines(q1, col = "green", cex = 2)
256 | lines(q2, col = "green", cex = 2)
257 | 
258 | ## ---- echo=FALSE, eval=FALSE---------------------------------------------
259 | ## plot(function(x) dnorm(x), -4, 4,
260 | ##      main = "normal")
261 | ## curve(dnorm(x), add = TRUE, col = "red", lwd = 2)
262 | 
263 | ## ---- warning=FALSE, message=FALSE---------------------------------------
264 | cltSim <- function (n = 1, reps = 10000, nclass = 16, pop = TRUE, estimator = mean) {
265 |     old.par <- par(oma = c(0, 0, 1.5, 0), mfrow = c(2, 2), mar = c(4,4,2,0.5))
266 |     on.exit(par(old.par))
267 |     ## normal:
268 |     norm.mat <- matrix(rnorm(n * reps), ncol = n)
269 |     norm.mean <- apply(norm.mat, 1, estimator)
270 |     x <- seq(min(norm.mean), max(norm.mean), length = 50)
271 |     normmax <- max(dnorm(x, mean(norm.mean), sd(norm.mean)))
272 |     tmp.hist <- hist(norm.mean, plot = FALSE, prob = TRUE, nclass = nclass)
273 |     normmax <- max(tmp.hist$density, normmax) * 1.05
274 |     hist(norm.mean, main = "normal", xlab = "x", col = "skyblue", 
275 |         prob = TRUE, ylim = c(0, normmax), nclass = nclass)
276 |     lines(x, dnorm(x, mean(norm.mean), sd(norm.mean)))
277 |     ## exponential:
278 |     exp.mat <- matrix(rexp(n * reps, 1/3), ncol = n)
279 |     exp.mean <- apply(exp.mat, 1, estimator)
280 |     x <- seq(min(exp.mean), max(exp.mean), length = 50)
281 |     expmax <- max(dnorm(x, mean(exp.mean), sd(exp.mean)))
282 |     tmp.hist <- hist(exp.mean, plot = FALSE, prob = TRUE, nclass = nclass)
283 |     expmax <- max(tmp.hist$density, expmax) * 1.05
284 |     hist(exp.mean, main = "exponential", xlab = "x", col = "skyblue", 
285 |         prob = TRUE, ylim = c(0, expmax), nclass = nclass)
286 |     if(pop) lines(x, dexp(x, 1/3)) else lines(x, dnorm(x, mean(exp.mean), sd(exp.mean)))
287 |     ## uniform:
288 |     unif.mat <- matrix(runif(n * reps), ncol = n)
289 |     unif.mean <- apply(unif.mat, 1, estimator)
290 |     x <- seq(min(unif.mean), max(unif.mean), length = 50)
291 |     unimax <- max(dnorm(x, mean(unif.mean), sd(unif.mean)))
292 |     tmp.hist <- hist(unif.mean, plot = FALSE, prob = TRUE, nclass = nclass)
293 |     unimax <- max(tmp.hist$density, unimax) * 1.05
294 |     hist(unif.mean, main = "uniform", xlab = "x", col = "skyblue", 
295 |         prob = TRUE, ylim = c(0, unimax), nclass = nclass)
296 |     if(pop) lines(x, dunif(x)) else lines(x, dnorm(x, mean(unif.mean), sd(unif.mean)))
297 |     ## Beta:
298 |     beta.mat <- matrix(rbeta(n * reps, 0.35, 0.25), ncol = n)
299 |     beta.mean <- apply(beta.mat, 1, estimator)
300 |     x <- seq(min(beta.mean), max(beta.mean), length = 50)
301 |     betamax <- max(dnorm(x, mean(beta.mean), sd(beta.mean)))
302 |     tmp.hist <- hist(beta.mean, plot = FALSE, prob = TRUE, nclass = nclass)
303 |     betamax <- max(tmp.hist$density, betamax)
304 |     hist(beta.mean, main = "Beta", xlab = "x", col = "skyblue", 
305 |         prob = TRUE, ylim = c(0, betamax), nclass = nclass)
306 |     if(pop){
307 |       lines(x, dbeta(x, 0.35, 0.25))
308 |       mtext(paste("Populations"), outer = TRUE, cex = 1.2)
309 |     } else {
310 |       lines(x, dnorm(x, mean(beta.mean), sd(beta.mean)))
311 |       mtext(paste("sample size =", n), outer = TRUE, cex = 1.2)
312 |     }
313 | }
314 | 
315 | ## ----B05113_06_08, warning=FALSE, message=FALSE--------------------------
316 | cltSim()
317 | 
318 | ## ----B05113_06_09, warning=FALSE, message=FALSE--------------------------
319 | cltSim(2, pop = FALSE)
320 | 
321 | ## ----B05113_06_10, warning=FALSE, message=FALSE--------------------------
322 | cltSim(10, pop = FALSE)
323 | 
324 | ## ----B05113_06_11, warning=FALSE, message=FALSE--------------------------
325 | cltSim(n = 100, pop = FALSE, estimator = median)
326 | 
327 | ## ------------------------------------------------------------------------
328 | library("car")
329 | data("Prestige")
330 | m <- mean(Prestige$income)
331 | m
332 | p <- dim(Prestige)[1]
333 | se <- sd(Prestige$income) / sqrt(p)
334 | tval <- qt(0.975, df = p - 1)
335 | cat(paste("KI: [", round(m - tval * se, 2), ",", round(m + tval * se, 2), "]"))
336 | 
337 | ## ---- B05113_06_12-------------------------------------------------------
338 | set.seed(11112)
339 | alpha <- 0.05
340 | normval <- qnorm(1 - alpha/2)
341 | numsamp <- 50; numsim <- 10 
342 | normmat <- matrix(0, nrow = numsim, ncol = 2)
343 | y <- 1:numsim; ymat <- rbind(y, y)
344 | for (i in 1:numsim) {
345 |   samp <- rexp(numsamp)    # generate random exponentials
346 |   sampmean <- mean(samp)
347 |   sampse <- sqrt(var(samp) / numsamp)
348 |   normmat[i, ] <- c(sampmean - normval * sampse, sampmean + normval * sampse)
349 | }
350 | matplot(t(normmat), ymat , pch = " ", yaxt = "n", ylab = "", xlab="confidence intervals") # empty plot
351 | matlines(t(normmat), ymat, lty = rep(1, numsim), col = 1)
352 | abline(v = 1)
353 | 
354 | ## ------------------------------------------------------------------------
355 | sessionInfo()
356 | 
357 | 


--------------------------------------------------------------------------------
/Chapter 10/chapter10.R:
--------------------------------------------------------------------------------
  1 | ## ------------------------------------------------------------------------
  2 | library("mvtnorm")
  3 | synth <- rmvnorm(100, mean = rep(0,5), sigma = diag(5))
  4 | ## first three observations
  5 | head(synth, 3)
  6 | 
  7 | ## ------------------------------------------------------------------------
  8 | data(Prestige, package = "car")
  9 | ## first three observations of Prestige
 10 | head(Prestige, 3)
 11 | ## subset of variables
 12 | real <- Prestige[, 1:4]
 13 | 
 14 | ## ------------------------------------------------------------------------
 15 | ## set seed for reproducibility
 16 | set.seed(12)
 17 | ## simulate from multivariate normal
 18 | synth2 <- data.frame(rmvnorm(100, mean = colMeans(real), sigma = cov(real)))
 19 | colnames(synth2) <- colnames(real)
 20 | ## first three observations
 21 | head(synth2, 3)
 22 | 
 23 | ## ------------------------------------------------------------------------
 24 | summary(real$women)
 25 | summary(synth2$women)
 26 | 
 27 | ## ----B05113_10_01--------------------------------------------------------
 28 | par(mar = c(4,4,0.2,0.2))
 29 | plot(prestige ~ income, data = real)
 30 | points(prestige ~ income, data = synth2, col = "red", pch = 20)
 31 | legend("bottomright", legend = c("original/real", "synthetic"), col = 1:2, pch = c(1,20))
 32 | 
 33 | ## ----B05113_10_02--------------------------------------------------------
 34 | library("robustbase")
 35 | cv <- covMcd(real)
 36 | synth3 <- rmvnorm(100, mean = cv$center, sigma = cv$cov)
 37 | par(mfrow = c(1,2), mar = c(4,4,0.2,0.2))
 38 | plot(prestige ~ income, data = real)
 39 | points(prestige ~ income, data = synth3, col = "red", pch = 20)
 40 | ## add outliers
 41 | rmd <- mahalanobis(real, center = cv$center, cov = cv$cov)
 42 | ## outliers defined by large Mahalanobis distances
 43 | out <- rmd > qchisq(0.975, ncol(real) - 1)
 44 | cv_good <- covMcd(real[!out, ])
 45 | ## simulate good points
 46 | synth3_good <- rmvnorm(100, mean = cv_good$center, sigma = cv_good$cov)
 47 | cv_out <- covMcd(real[out, ])
 48 | ## simulate outliers
 49 | synth3_out <- rmvnorm(100, mean = cv_out$center, sigma = cv_out$cov)
 50 | ## Figure 10.2.
 51 | plot(prestige ~ income, data = real)
 52 | points(prestige ~ income, data = synth3_good, col = "red", pch = 20)
 53 | points(prestige ~ income, data = synth3_out, col = "red", pch = 20)
 54 | 
 55 | ## ---- warning=FALSE,message=FALSE----------------------------------------
 56 | synth4 <- real
 57 | lm1 <- lm(education ~ ., data = real)
 58 | synth4$education <- predict(lm1, synth4[, 2:ncol(synth4)]) + sample(residuals(lm1))
 59 | 
 60 | ## ---- warning=FALSE,message=FALSE----------------------------------------
 61 | p <- ncol(real)
 62 | for(i in 1:ncol(real)){
 63 |   df <- real[, i]
 64 |   df <- cbind(df, real[,-i])
 65 |   colnames(df)[1] <- "response"
 66 |   lm1 <- lm(response ~ ., data = df)
 67 |   synth4[, i] <- predict(lm1, synth4[, -i]) + sample(residuals(lm1))
 68 | }
 69 | 
 70 | ## ------------------------------------------------------------------------
 71 | simLatent <- function(n = 200, p = 50, k = 3){
 72 |   T <- matrix(rnorm(n * k, 0, 1), ncol = k)
 73 |   B <- matrix(runif(p * k, -1, 1), ncol = k)
 74 |   X <- T %*% t(B)
 75 |   E <-  matrix(rnorm(n * p, 0, 0.1), ncol = p)
 76 |   XE <- X + E
 77 |   return(XE)  
 78 | }
 79 | 
 80 | ## ------------------------------------------------------------------------
 81 | x <- simLatent(n = 50, p = 1000, k = 6)
 82 | dim(x)
 83 | 
 84 | ## ---- warning=FALSE,message=FALSE----------------------------------------
 85 | library("simPop")
 86 | data("eusilcS")
 87 | dim(eusilcS)
 88 | 
 89 | ## ------------------------------------------------------------------------
 90 | length(unique(eusilcS$db030))
 91 | 
 92 | ## ------------------------------------------------------------------------
 93 | inp <- specifyInput(eusilcS, hhid = "db030", hhsize = "hsize", 
 94 |                      strata = "db040", weight = "rb050")
 95 | 
 96 | ## ------------------------------------------------------------------------
 97 | print(inp)
 98 | 
 99 | ## ---- warning=FALSE,message=FALSE----------------------------------------
100 | synthP <- simStructure(data = inp, 
101 |                         method = "direct", 
102 |                         basicHHvars = c("age", "rb090", "db040"))
103 | 
104 | ## ------------------------------------------------------------------------
105 | synthP <- simCategorical(synthP, 
106 |                           additional = c("pl030", "pb220a"), 
107 |                           method = "multinom")
108 | 
109 | ## ---- cache=TRUE---------------------------------------------------------
110 | synthP <- simContinuous(synthP, additional = "netIncome",
111 |                          upper = 200000, equidist = FALSE,
112 |                          imputeMissings = FALSE)
113 | 
114 | ## ------------------------------------------------------------------------
115 | synthP
116 | 
117 | ## ---- results='hide', message=FALSE,warning=FALSE------------------------
118 | library("mice")
119 | library("VIM")
120 | x <- orig <- simLatent(n = 50, p = 10, k = 6)
121 | ## evaluation criteria
122 | eval <- function(real, imputed, nas){
123 |   sqrt(sum((real - imputed)^2)) / nas
124 | }
125 | set.seed(123)
126 | R <- 100
127 | e1 <- e2 <- e3 <- e4 <- numeric(R)
128 | for(i in 1:R){
129 |   x <- orig
130 |   x[sample(1:nrow(x), 10), 1] <- NA    
131 |   e1[i] <- eval(orig, e1071::impute(x), 10)
132 |   e2[i] <-   eval(orig, kNN(data.frame(x), imp_var = FALSE), 10)
133 |   e3[i] <-   eval(orig, irmi(x), 10)
134 |   e4[i] <-   eval(orig, complete(mice(x, m = 1, printFlag = FALSE)), 10)
135 | }
136 | df <- data.frame("error" = c(e1,e2,e3,e4), method = rep(c("mean", "kNN", "irmi", "mice"), each = R))
137 | 
138 | ## ----B05113_10_03--------------------------------------------------------
139 | library("ggplot2")
140 | ggplot(df, aes(x = method, y=error)) + geom_boxplot() + theme(text = element_text(size = 20)) + theme_bw()
141 | 
142 | ## ------------------------------------------------------------------------
143 | library("robustbase")
144 | set.seed(123)
145 | x <- rexp(n = 50, rate = 1)
146 | mean(x)
147 | huberM(x)$mu
148 | 
149 | ## ------------------------------------------------------------------------
150 | m <- mean(replicate(10000, mean(rexp(n = 50, rate = 1))))
151 | m
152 | m - 1
153 | 
154 | ## ------------------------------------------------------------------------
155 | mh <- mean(replicate(10000, huberM(rexp(n =50, rate = 1))$mu))
156 | mh
157 | mh - 1
158 | 
159 | ## ------------------------------------------------------------------------
160 | set.seed(123)
161 | alpha <- 0.05
162 | ci <- function(x, z = qnorm(1 - alpha / 2)){
163 |   s <- rexp(n = 50, rate = 1)
164 |   m <- mean(s)
165 |   se <- sd(s) / sqrt(50)
166 |   ci_est <- c(m - z * se, m + z *se)
167 |   ci_est
168 | }
169 | ci()
170 | 
171 | ## ---- cache=TRUE---------------------------------------------------------
172 | set.seed(123)
173 | ciR_n <- replicate(100000, ci())
174 | isCovered <- function(x){
175 |   apply(x, 2, function(x){
176 |   if(x[1] > 1 & x[2] > 1) return(FALSE)
177 |   if(x[1] < 1 & x[2] < 1) return(FALSE)
178 |   return(TRUE)})
179 | }
180 | cn <- isCovered(ciR_n)
181 | sum(cn) / length(cn)
182 | 
183 | ## ---- cache=TRUE---------------------------------------------------------
184 | ciR_t <- replicate(100000, ci(z = qt(1 - alpha / 2, 49)))
185 | ct <- isCovered(ciR_t)
186 | sum(ct) / length(ct)
187 | 
188 | ## ---- cache = TRUE-------------------------------------------------------
189 | ci_boot <- function(x, R = 1000){
190 |   s <- rexp(n = 50, rate = 1)
191 |   ci_est <- quantile(replicate(R, 
192 |             mean(sample(s, replace = TRUE))), 
193 |             c(0.025, 0.975))
194 |   return(ci_est)
195 | }
196 | ciR_boot <- replicate(1000, ci_boot())
197 | cb <- isCovered(ciR_boot)
198 | sum(cb) / length(cb)
199 | 
200 | ## ----B05113_10_04, message=FALSE,warning=FALSE---------------------------
201 | df <- data.frame(t(ciR_n))
202 | df <- data.frame(rbind(t(ciR_n), t(ciR_t), t(ciR_boot)))
203 | df$method <- rep(c("normal", "t", "boot"), times = c(100000,100000,1000))
204 | colnames(df) <- c("lower", "upper", "method")
205 | library("reshape2")
206 | df <- melt(df)
207 | library("ggplot2")
208 | ggplot(df, aes(x = value, colour = method)) + geom_density() + facet_wrap(~ variable) + theme(text = element_text(size=16))
209 | 
210 | ## ---- cache = TRUE, message=FALSE,warning=FALSE--------------------------
211 | simMean <- function(simFun = function(x) rnorm(100)){
212 |   ## 1000 samples
213 |   set.seed(123)
214 |   R <- 1000
215 |   m <- list()
216 |   ## 1000 data sets
217 |   for(i in 1:R){
218 |     m[[i]] <- simFun()
219 |   }
220 |   ## estimation
221 |   df <- data.frame("thetahat" = c(sapply(m, mean), sapply(m, mean, trim = 0.1), sapply(m, median), sapply(m, function(x) huberM(x)$mu)), 
222 |                    "method" = rep(c("mean","trim","median","huber"), each = R)) 
223 |   ## summary
224 |   vm <- var(df[df$method == "mean", 1])
225 |   df %>% 
226 |     group_by(method) %>% 
227 |     summarize("bias" = mean(thetahat) - 0, 
228 |               "variance" = var(thetahat),
229 |               "mse" = variance + bias^2,
230 |               "re" = vm / var(thetahat))
231 | }
232 | 
233 | ## ------------------------------------------------------------------------
234 | library("robustbase"); library("dplyr")
235 | simMean()
236 | 
237 | ## ---- cache = TRUE, warning=FALSE, message=FALSE-------------------------
238 | set.seed(123)
239 | simMean(simFun = function(){c(rnorm(95), rnorm(5,15))})
240 | 
241 | ## ---- message=FALSE, warning=FALSE---------------------------------------
242 | library("simFrame"); library("robCompositions"); library("mvtnorm"); library("mice")
243 | set.seed(123)
244 | 
245 | ## ------------------------------------------------------------------------
246 | ## data generation 
247 | crnorm <- function(n, mean, sigma) data.frame(isomLRinv(rmvnorm(n, mean, sigma)))
248 | sigma <- matrix(c(1, -0.5, 1.4, -0.5, 1, -0.6, 1.4, -0.6, 2), 3, 3)
249 | ## data control class
250 | dc <- DataControl(size = 150, distribution = crnorm,
251 |       dots = list(mean = c(0, 2, 3), sigma = sigma))
252 | 
253 | ## ------------------------------------------------------------------------
254 | nc <- NAControl(NArate = c(0.05, 0.1))
255 | 
256 | ## ------------------------------------------------------------------------
257 | sim <- function(x, orig) {
258 |      i <- apply(x, 1, function(x) any(is.na(x)))
259 |      ni <- length(which(i))
260 |      x <- x[, -ncol(x)]
261 |      xMean <- e1071::impute(x)
262 |      xMice <- mice(x, printFlag = FALSE, diagnostics = FALSE, m = 1)
263 |      xMice <- complete(xMice)
264 |      xKNNa <- impKNNa(x)$xImp
265 |      xLS <- impCoda(x, method = "lm")$xImp
266 |      xLTSrob <- impCoda(x, method = "ltsReg")$xImp
267 |      c(xMean = aDist(xMean, orig)/ni,
268 |        xMice = aDist(xMice, orig)/ni,
269 |        knn = aDist(xKNNa, orig)/ni, 
270 |        LS = aDist(xLS, orig)/ni,
271 |        LTSrob = aDist(xLTSrob, orig)/ni)
272 | }
273 | 
274 | ## ---- cache = TRUE, message=FALSE, warning=FALSE-------------------------
275 | results <- runSimulation(dc, 
276 |                          nrep = 25, 
277 |                          NAControl = nc, 
278 |                          fun = sim)
279 | 
280 | ## ------------------------------------------------------------------------
281 | aggregate(results)
282 | 
283 | ## ----B05113_10_05--------------------------------------------------------
284 | simBwplot(results)
285 | 
286 | ## ---- cache=TRUE, message=FALSE, warning=FALSE---------------------------
287 | dcarc <- ContControl(target = c("X1"), 
288 |           epsilon = c(0.01,0.03,0.05,0.1), 
289 |           dots = list(mean = 150, sd = 1), type = "DCAR")
290 | results <- runSimulation(dc, 
291 |                           nrep = 3, 
292 |                           NAControl = nc, 
293 |                           contControl = dcarc,
294 |                           fun = sim)
295 | aggregate(results)
296 | 
297 | ## ----B05113_10_06, message=FALSE, warning=FALSE, fig.width=7, fig.height=7----
298 | simBwplot(results)
299 | 
300 | ## ------------------------------------------------------------------------
301 | sim2 <- function(x, orig) {
302 |     rdcm <- function(x, y){
303 |     	ocov <- cov(isomLR(x))
304 |     	rcov <- cov(isomLR(y))
305 |     	return(frobenius.norm(ocov-rcov)/frobenius.norm(ocov))
306 |     }
307 |      i <- apply(x, 1, function(x) any(is.na(x)))
308 |      ni <- length(which(i))
309 |      x <- x[, -ncol(x)]
310 |      xMean <- e1071::impute(x)
311 |      xMice <- mice(x, printFlag = FALSE, diagnostics = FALSE, m = 1)
312 |      xMice <- complete(xMice)
313 |      xKNNa <- impKNNa(x)$xImp
314 |      xLS <- impCoda(x, method = "lm")$xImp
315 |      xLTSrob <- impCoda(x, method = "ltsReg")$xImp
316 |      c(xMean = rdcm(xMean, orig),
317 |        xMice = rdcm(xMice, orig),
318 |        knn = rdcm(xKNNa, orig), 
319 |        LS = rdcm(xLS, orig),
320 |        LTSrob = rdcm(xLTSrob, orig))
321 | }
322 | 
323 | ## ---- cache=TRUE, message=FALSE, warning=FALSE---------------------------
324 | library("matrixcalc")
325 | results <- runSimulation(dc, 
326 |                           nrep = 3, 
327 |                           NAControl = nc, 
328 |                           contControl = dcarc,
329 |                           fun = sim2)
330 | aggregate(results)
331 | 
332 | ## ------------------------------------------------------------------------
333 | data("eusilcP")
334 | 
335 | ## ------------------------------------------------------------------------
336 | colnames(eusilcP)
337 | 
338 | ## ------------------------------------------------------------------------
339 | sim <- function(x, k) {
340 |   require("laeken")
341 |   x <- x[!is.na(x$eqIncome), ]
342 |   ## classical Gini
343 |   g <- gini(x$eqIncome, x$.weight)$value
344 |   ## Hill estimator
345 |   eqIncHill <- fitPareto(x$eqIncome, k = k, method = "thetaHill",
346 |     groups = x$hid)
347 |   gHill <- gini(eqIncHill, x$.weight)$value
348 |   ## partial density component estimator
349 |   eqIncPDC <- fitPareto(x$eqIncome, k = k, method = "thetaPDC",
350 |     groups = x$hid)
351 |   gPDC <- gini(eqIncPDC, x$.weight)$value
352 |   ## results as a vector
353 |   c(standard = g, Hill = gHill, PDC = gPDC) 
354 | }
355 | 
356 | ## ------------------------------------------------------------------------
357 | sc <- SampleControl(grouping = "hid", size = 1500, k = 100)
358 | 
359 | ## ---- cache = TRUE-------------------------------------------------------
360 | library("laeken") # for function gini
361 | set.seed(123)
362 | ## run the simulation
363 | results <- runSimulation(eusilcP, sc, fun = sim, k = 175)
364 | 
365 | ## ------------------------------------------------------------------------
366 | head(results)
367 | 
368 | ## ------------------------------------------------------------------------
369 | aggregate(results)
370 | 
371 | ## ----B05113_10_08--------------------------------------------------------
372 | tv <- laeken::gini(eusilcP$eqIncome)$value
373 | plot(results, true = tv)
374 | 
375 | ## ---- message=FALSE,warning=FALSE----------------------------------------
376 | set.seed(12345)
377 | sc <- SampleControl(design = "region", grouping = "hid",
378 |   size = c(75, 250, 250, 125, 200, 225, 125, 150, 100),
379 |   k = 100)
380 | ## run new simulation
381 | results <- runSimulation(eusilcP, sc, fun = sim, k = 175)
382 | 
383 | ## ------------------------------------------------------------------------
384 | head(results)
385 | 
386 | ## ------------------------------------------------------------------------
387 | aggregate(results)
388 | 
389 | ## ----B05113_10_07--------------------------------------------------------
390 | tv <- gini(eusilcP$eqIncome)$value
391 | plot(results, true = tv)
392 | 
393 | ## ------------------------------------------------------------------------
394 | set.seed(12345)
395 | ## define contamination
396 | cc <- DCARContControl(target = "eqIncome", epsilon = 0.005,
397 |   grouping = "hid", dots = list(mean = 5e+05, sd = 10000))
398 | ## run new simulation
399 | results <- runSimulation(eusilcP, sc, contControl = cc, fun = sim, k = 175)
400 | 
401 | ## ------------------------------------------------------------------------
402 | head(results)
403 | 
404 | ## ------------------------------------------------------------------------
405 | aggregate(results)
406 | 
407 | ## ----B05113_10_09--------------------------------------------------------
408 | tv <- gini(eusilcP$eqIncome)$value
409 | plot(results, true = tv)
410 | 
411 | ## ------------------------------------------------------------------------
412 | library("simFrame")
413 | library("laeken")
414 | data("eusilcP")
415 | 
416 | ## ------------------------------------------------------------------------
417 | set.seed(12345)
418 | sc <- SampleControl(design = "region", grouping = "hid",
419 |   size = c(75, 250, 250, 125, 200, 225, 125, 150, 100), k = 100)
420 | cc <- DCARContControl(target = "eqIncome", epsilon = 0.005,
421 |   grouping = "hid", dots = list(mean = 5e+05, sd = 10000))
422 | results <- runSimulation(eusilcP, sc, contControl = cc,
423 |   design = "gender", fun = sim, k = 125)
424 | 
425 | ## ----B05113_10_10--------------------------------------------------------
426 | tv <- simSapply(eusilcP, "gender", function(x) gini(x$eqIncome)$value)
427 | plot(results, true = tv)
428 | 
429 | ## ------------------------------------------------------------------------
430 | set.seed(12345)
431 | sc <- SampleControl(design = "region", grouping = "hid",
432 |   size = c(75, 250, 250, 125, 200, 225, 125, 150, 100), k = 100)
433 | cc <- DCARContControl(target = "eqIncome", epsilon = c(0, 0.0025, 0.005, 0.0075, 0.01), dots = list(mean = 5e+05, sd = 10000))
434 | results <- runSimulation(eusilcP, sc, contControl = cc,
435 |   design = "gender", fun = sim, k = 125)
436 | 
437 | ## ------------------------------------------------------------------------
438 | head(results)
439 | 
440 | ## ------------------------------------------------------------------------
441 | aggregate(results)
442 | 
443 | ## ----B05113_10_11--------------------------------------------------------
444 | tv <- simSapply(eusilcP, "gender", function(x) gini(x$eqIncome)$value)
445 | plot(results, true = tv)
446 | 
447 | ## ------------------------------------------------------------------------
448 | set.seed(12345)
449 | sc <- SampleControl(design = "region", grouping = "hid",
450 |   size = c(75, 250, 250, 125, 200, 225, 125, 150, 100), k = 50)
451 | cc <- DCARContControl(target = "eqIncome", epsilon = c(0, 0.005, 0.01), dots = list(mean = 5e+05, sd = 10000))
452 | nc <- NAControl(target = "eqIncome", NArate = c(0, 0.05))
453 | results <- runSimulation(eusilcP, sc, contControl = cc,
454 |   NAControl = nc, design = "gender", fun = sim, k = 125)
455 | 
456 | ## ------------------------------------------------------------------------
457 | aggregate(results)
458 | 
459 | ## ----B05113_10_12--------------------------------------------------------
460 | tv <- simSapply(eusilcP, "gender", function(x) gini(x$eqIncome)$value)
461 | plot(results, true = tv)
462 | 
463 | ## ------------------------------------------------------------------------
464 | sessionInfo()
465 | 
466 | 


--------------------------------------------------------------------------------
/Chapter 7/chapter7.R:
--------------------------------------------------------------------------------
  1 | ## ---- message=FALSE,warning=FALSE----------------------------------------
  2 | dat <- matrix(c(104,11037,189,11034),2,2, byrow=TRUE)
  3 | dat
  4 | library("vcd")
  5 | ## confidence intervals
  6 | confint(oddsratio(dat, log=FALSE))
  7 | 
  8 | ## ---- cache=TRUE---------------------------------------------------------
  9 | ## original surveyed data
 10 | s1 <- rep(c(TRUE, FALSE), times = c(104, 11037))  
 11 | s2 <- rep(c(TRUE, FALSE), times = c(189, 11034))
 12 | ## function for drawing a bootstrap sample 
 13 | ## and estimating the boostrap replicate
 14 | boot_aspirin <- function(s1, s2){
 15 |   ## odds ratio
 16 |   sum(sample(s1, replace = TRUE)) / sum(sample(s2, replace = TRUE))
 17 | }  
 18 | ## 10000 draws and replicates
 19 | boot_repl <- replicate(10000, boot_aspirin(s1, s2))  
 20 | ## confidence interval
 21 | quantile(boot_repl, c(0.025, 0.975))
 22 | 
 23 | ## ------------------------------------------------------------------------
 24 | x <- c(5, 7, 8, 2, 15, 12, 3)
 25 | 
 26 | ## ------------------------------------------------------------------------
 27 | ## for reproducibility we use a seed
 28 | set.seed (123) 
 29 | ## bootstrap sample (with replacement)
 30 | s1 <- sample(x, replace = TRUE)
 31 | s1
 32 | 
 33 | ## ------------------------------------------------------------------------
 34 | s2 <- sample(x, replace = TRUE)
 35 | s2
 36 | 
 37 | ## ------------------------------------------------------------------------
 38 | mean(x)
 39 | 
 40 | ## ------------------------------------------------------------------------
 41 | mean(s1)
 42 | mean(s2)
 43 | 
 44 | ## ------------------------------------------------------------------------
 45 | library("car") 
 46 | data("Prestige") 
 47 | mean(Prestige$income)
 48 | 
 49 | ## ------------------------------------------------------------------------
 50 | set.seed(123) 
 51 | mean(sample(Prestige$income, replace = TRUE))
 52 | 
 53 | ## ------------------------------------------------------------------------
 54 | mean(sample(Prestige$income, replace = TRUE))
 55 | 
 56 | ## ------------------------------------------------------------------------
 57 | library("robustbase")
 58 | ## data
 59 | df <- Prestige[, c("income", "prestige")]
 60 | ## robust MCD-based covariance
 61 | covMcd(df, cor=TRUE)$cor
 62 | 
 63 | ## ------------------------------------------------------------------------
 64 | set.seed(1234) ## for reproducibility (seed)
 65 | ## standard error with bootstrap
 66 | sd(replicate(200, 
 67 |         covMcd(df[sample(rownames(df),  replace=TRUE), ], 
 68 |                cor=TRUE)$cor[1,2]))
 69 | 
 70 | ## ---- message=FALSE,warning=FALSE----------------------------------------
 71 | library("boot")
 72 | ## function for bootstrapping in boot
 73 | cr <- function(d, w) covMcd(d[w, ], cor=TRUE)$cor[1,2]
 74 | ## application of the boot function
 75 | boot(data=df, statistic=cr, R=200)
 76 | 
 77 | ## ------------------------------------------------------------------------
 78 | ## MASS needed for drawing random numbers from multivariate normal
 79 | library("MASS")
 80 | ## parameters from empirical data (income and prestige)
 81 | m1 <- colMeans(df)
 82 | m2 <- cov(df)
 83 | ## number of observations
 84 | n <- dim(df)[1]
 85 | ## parametric bootstrap 
 86 | parboot <- replicate(200,  covMcd(mvrnorm(n, mu=m1, Sigma=m2), cor=TRUE)$cor[1,2])
 87 | ## standard error
 88 | sd(parboot)
 89 | 
 90 | ## ------------------------------------------------------------------------
 91 | ## parametric bootstrap
 92 | system.time(sd(replicate(5000,  
 93 |         cor(mvrnorm(n, mu=m1, Sigma=m2))[1,2])))
 94 | ## non-parametric bootstrap
 95 | system.time(sd(replicate(5000, 
 96 |         cor(df[sample(rownames(df),  replace=TRUE), ])[1,2])))
 97 | 
 98 | ## ------------------------------------------------------------------------
 99 | ## parametric bootstrap
100 | range(replicate(20, sd(replicate(50,  
101 |                           cor(mvrnorm(n, mu=m1, Sigma=m2))[1,2]))))
102 | ## non-parametric bootstrap
103 | range(replicate(20, sd(replicate(50, 
104 |         cor(df[sample(rownames(df),  replace=TRUE), ])[1,2]))))
105 | 
106 | ## ----B05113_07_03--------------------------------------------------------
107 | ## parametric bootstrap
108 | pboot <-replicate(1000,  
109 | 	cor(mvrnorm(n, mu=m1, Sigma=m2))[1,2])
110 | ## non-parametric bootstrap
111 | npboot <- replicate(1000, 
112 | 	cor(df[sample(rownames(df), 
113 | 	replace=TRUE), ])[1,2])
114 | mi <- min(pboot, npboot)
115 | ma <- max(pboot, npboot)
116 | ## Now plot Figure 7.3.
117 | par(mfrow=c(1,2), pty="s")
118 | hist(npboot, 
119 | 	main="non-parametric", 
120 | 	xlab="1000 bootstrap replicates", 
121 | 	xlim=c(mi,ma), breaks = 25)
122 | hist(pboot, 
123 | 	main="parametric", 
124 | 	xlab="1000 bootstap replicates",
125 | 	xlim=c(mi,ma), breaks = 25)
126 | 
127 | ## ------------------------------------------------------------------------
128 | x <- Prestige[, "income"]
129 | v <- function(x) sd(x) / mean(x)
130 | v(x)
131 | 
132 | ## ------------------------------------------------------------------------
133 | vboot <- replicate(1000, v(sample(x, replace = TRUE)))
134 | 
135 | ## ------------------------------------------------------------------------
136 | vbias <- mean(vboot) - v(x)
137 | vbias
138 | 
139 | ## ------------------------------------------------------------------------
140 | v(x) - vbias
141 | 
142 | ## ------------------------------------------------------------------------
143 | cat("CI(e): [", v(x) - vbias - qt(0.975, length(x)-1) * sd(vboot), ", ", v(x) - vbias + qt(0.975, length(x)-1) * sd(vboot), " ]\n") 
144 | 
145 | ## ------------------------------------------------------------------------
146 | cat("CI(p): [", quantile(vboot, 0.025), ", ", quantile(vboot, 0.975), " ]\n")
147 | 
148 | ## ------------------------------------------------------------------------
149 | cat("CI(h): [", 2*v(x) - quantile(vboot, 0.975), ", ", 2*v(x) - quantile(vboot, 0.025), " ]\n")
150 | 
151 | ## ------------------------------------------------------------------------
152 | ## some crazy data (10 outliers)
153 | x <- c(rnorm(100), rnorm(10,10))
154 | ## non-parametric bootstrap replicates
155 | mb <- replicate(10000, mean(sample(x, replace=TRUE)))
156 | ## percentile method
157 | cat("\nCI(perc): [", quantile(mb, 0.025), ", ", quantile(mb, 0.975), " ]\n") 
158 | ## BCa method
159 | library("bootstrap")
160 | b <- bcanon(x, 10000, mean, alpha=c(0.025,0.975))
161 | cat("\nCI(BCa): [", b$confpoints[1,2], ", ", b$confpoints[2,2], " ]\n") 
162 | 
163 | ## ----B05113_07_04, echo=FALSE--------------------------------------------
164 | n <- length(x)
165 | ## histogram of data
166 | hist(x, main="Daten")
167 | ## draw mean
168 | segments(x0=mean(x), x1=mean(x), y0=-1, y1=10, col="red", lwd=3)
169 | ## draw confidence intervals
170 | arrows(x0=quantile(mb, 0.025), x1=quantile(mb, 0.975), y0=5, y1=5, code=3, length=0.1)
171 | arrows(x0=b$confpoints[1,2], x1=b$confpoints[2,2], y0=4, y1=4, code=3, length=0.1, col="blue")
172 | arrows(x0=mean(x)-qt(0.975, n-1)*sd(x)/sqrt(n), x1=mean(x)+qt(0.975, n-1)*sd(x)/sqrt(n), y0=3, y1=3, code=3, length=0.1, col="brown")
173 | h1 <- 2*mean(x) - quantile(mb, 0.975)
174 | h2 <- 2*mean(x) - quantile(mb, 0.025)
175 | arrows(x0=h1, x1=h2, y0=2, y1=2, code=3, length=0.1, col="orange")
176 | bias <- mean(mb) - mean(x)
177 | h1 <- mean(x) - bias - qt(0.975, n-1) * sd(mb)
178 | h2 <- mean(x) - bias + qt(0.975, n-1) * sd(mb)
179 | arrows(x0=h1, x1=h2, y0=1, y1=1, code=3, length=0.1, col="grey")
180 | legend("topright", legend=c("percentile", "BCa", "classical", "Hall", "bias corrected","arithm. mean"), 
181 | 		lwd=c(1,1,1,1,1,3), col=c("black","blue","brown","orange","grey","red"))
182 | 
183 | ## ----B05113_07_05, echo = FALSE------------------------------------------
184 | par(mar = c(1,3,0.5,0.1))
185 | hist(x, main="", xlim=c(0.1,1.9), ylim=c(0,10))
186 | segments(x0=mean(x), x1=mean(x), y0=-1, y1=10, col="red", lwd=3)
187 | arrows(x0=quantile(mb, 0.025), x1=quantile(mb, 0.975), y0=5, y1=5, code=3, length=0.1)
188 | arrows(x0=b$confpoints[1,2], x1=b$confpoints[2,2], y0=4, y1=4, code=3, length=0.1, col="blue")
189 | arrows(x0=mean(x)-qt(0.975, n-1)*sd(x)/sqrt(n), x1=mean(x)+qt(0.975, n-1)*sd(x)/sqrt(n), y0=3, y1=3, code=3, length=0.1, col="brown")
190 | h1 <- 2*mean(x) - quantile(mb, 0.975)
191 | h2 <- 2*mean(x) - quantile(mb, 0.025)
192 | arrows(x0=h1, x1=h2, y0=2, y1=2, code=3, length=0.1, col="orange")
193 | bias <- mean(mb) - mean(x)
194 | h1 <- mean(x) - bias - qt(0.975, n-1) * sd(mb)
195 | h2 <- mean(x) - bias + qt(0.975, n-1) * sd(mb)
196 | arrows(x0=h1, x1=h2, y0=1, y1=1, code=3, length=0.1, col="grey")
197 | legend("topright", legend=c("percentile", "BCa", "classical", "Hall", "bias corrected","arithm. mean"), lwd=c(1,1,1,1,1,3), col=c("black","blue","brown","orange","grey","red"))
198 | 
199 | ## ------------------------------------------------------------------------
200 | ## toy data
201 | x <- c(1,2,2,2,2,2,7,8,9,10)
202 | ## remember, this is the variation coefficient
203 | v <- function(x) sd(x)/mean(x)
204 | ## initialisation
205 | n <- length(x)
206 | vjack <- rep(0, n-1)
207 | vpseudo <- rep(0, n)
208 | ## leave-one-out jackknife
209 | for(i in 1:n){
210 |   vjack[i] <- v(x[-i])
211 | }
212 | ## jackknife pseudo values
213 | pseudo <- n * v(x) - (n-1)*vjack
214 | ## confidence interval with pseudo values
215 | cat("\nKI(pseudo): [", mean(pseudo) - qt(0.975, n-1) * sd(pseudo)/n, ", ", mean(pseudo) + qt(0.975, n-1) * sd(pseudo)/n, " ]\n")
216 | ## confidence interval with classical jackknife
217 | se2 <- sqrt(((n-1)/n) * sum((vjack - mean(vjack))^2))
218 | jbias <- (n-1) * (mean(vjack) - v(x))
219 | cat("\nKI(jse): [", v(x) - jbias - qt(0.975, n-1) * se2 , ", ", v(x) - jbias + qt(0.975, n-1) * se2, " ]\n") 
220 | 
221 | ## ------------------------------------------------------------------------
222 | quantile(replicate(10000, v(sample(x, replace = TRUE))), c(0.025, 0.975))
223 | 
224 | ## ------------------------------------------------------------------------
225 | ## sample estimate
226 | median(x)
227 | ## non-parametric bootstrap
228 | qu <- quantile(replicate(10000, 
229 |           median(sample(x, replace = TRUE))), 
230 |         c(0.025, 0.975))
231 | cat("\nCI(boot): [", qu[1], ", ", qu[2], " ]\n")
232 | ## jackknife, initialisation
233 | n <- length(x)
234 | jack <- rep(0, n-1)
235 | pseudo <- rep(0, n)
236 | for(i in 1:n){
237 |   jack[i] <- median(x[-i])
238 | }
239 | ## jackknife pseudo values approach
240 | pseudo <- n * median(x) - (n-1)*jack
241 | cat("\nCI(pseudo): [", mean(pseudo) - qt(0.975, n-1) * sd(pseudo)/n, ", ", mean(pseudo) + qt(0.975, n-1) * sd(pseudo)/n, " ]\n") 
242 | ## classical jackknife
243 | se2 <- sqrt(((n-1)/n) * sum((jack - mean(jack))^2))
244 | jbias <- (n-1) * (mean(jack) - median(x))
245 | cat("\nCI(jse): [", median(x) - jbias - qt(0.975, n-1) * se2 , ", ", median(x) - jbias - qt(0.975, n-1) * se2, " ]\n") 
246 | 
247 | ## ------------------------------------------------------------------------
248 | ## all combinations
249 | co <- combn(10, 2) 
250 | ## first 6 out of 45
251 | co[, 1:6]
252 | ## delete-2 jackknife replicates
253 | jack_d <- apply(co, 2, function(i) median(x[-i])) 
254 | ## standard error
255 | n <- length(x)
256 | r <- 2 / n
257 | ## n over 2
258 | nd <- choose(n, 2)
259 | ## inflation factor
260 | fac <- r / nd
261 | m <- mean(jack_d)
262 | ## standard error
263 | se_d <- sqrt(fac * sum((jack_d - m)^2))
264 | ## confidence interval:
265 | cat("\nKI(jse): [", median(x)  - qt(0.975, n-1) * se_d , ", ", median(x)  + qt(0.975, n-1) * se_d, " ]\n") 
266 | 
267 | ## ------------------------------------------------------------------------
268 | choose(45, 10)
269 | 
270 | ## ------------------------------------------------------------------------
271 | data(Prestige, package = "car")
272 | x <- Prestige$income
273 | v <- function(x, indices){ 
274 |   x <- x[indices]
275 |   est <- sd(x)/mean(x)
276 |   return(est)
277 | }
278 | 
279 | ## ----B05113_07_06--------------------------------------------------------
280 | library("boot")
281 | bx <- boot(x, v, 2000)
282 | ## Figure 7.6.
283 | jack.after.boot(bx)
284 | 
285 | ## ----B05113_07_07--------------------------------------------------------
286 | set.seed(12345)
287 | ## generate some data
288 | x1 <- runif(100, 0, pi)
289 | s <- data.frame(x1 = x1, x2 = rnorm(100, 0, 0.1) + sin(x1))
290 | ## plot data points for Figure 7.6
291 | plot(s, xlab = "x", ylab = "y")
292 | ## simple model
293 | reg1 <- lm(s[, 2] ~ s[, 1], data = s)
294 | abline(reg1, lwd = 2)
295 | ## sinus model
296 | reg2 <- lm(s[, 2] ~ sin(s[, 1]), data = s)
297 | f <- function(x, coef) coef[1] + coef[2] * sin(x)
298 | ss <- seq (-0.02, 3.2, 0.01)
299 | lines(ss, f (ss, coef(reg2)), lty = 2, col = "blue", lwd = 2)
300 | ## locally reweighted regression
301 | reg3 <- lowess(x = s[, 1], y = s[, 2], f = 0.1)
302 | lines (reg3, col = "red", lty = 3, lwd = 2)
303 | ## legend for Figure 7.6
304 | legend("bottom", col = c("black", "blue", "red"), 
305 |        lty = c(1, 2, 3), lwd = c(2, 2, 2),
306 |        legend = c(expression(y = beta[0] + beta[1]*x), 
307 |                   expression(y = beta[0] + sin(x)),
308 |                   "loess, 0.1"))
309 | 
310 | ## ------------------------------------------------------------------------
311 | str(s)
312 | 
313 | ## ------------------------------------------------------------------------
314 | ## index of training data
315 | training_ind <- sample(1:nrow(s), 70) 
316 | ## index of test data
317 | test_ind <- which(!(1:100 %in% training_ind))
318 | 
319 | ## ------------------------------------------------------------------------
320 | lm1 <- lm(s[training_ind, 2] ~ s[training_ind, 1], data = s)
321 | 
322 | ## ------------------------------------------------------------------------
323 | ## expected values
324 | f <- function(x) reg1$coef[1] + reg1$coef[2] * x
325 | ## prediction error, squared sum of expected and observed test data
326 | error <- sum((f(s[test_ind, 1]) - s[test_ind, 2])^2)
327 | error
328 | 
329 | ## ----B05113_07_08, echo=FALSE, fig.width=7, fig.height=7, fig=TRUE-------
330 | par(mfrow = c(2,2), pty = "s", mar = c(2,2,1.5,0.5))
331 | ## training and test data
332 | training <- as.numeric(sample(rownames(s), 70))
333 | plot(s, main = "OLS regression", pch = 20, xlab = "", ylab = "")
334 | points(s[training, ], pch = 20)
335 | points(s[-training, ], col = "red", pch = 15)
336 | legend("bottom", pch = c(20,15), col = c("black","red"), 
337 |        legend = c("training", "test"))
338 | ## training data plus regression line
339 | plot(s[training, ], main = "OLS regression", pch = 20, xlab = "", ylab = "")
340 | reg1 <- lm(s[training, 2] ~ s[training, 1], data = s)
341 | abline(reg1)
342 | legend("bottom", pch = 20, col = "black", legend = c ("training"))
343 | ## evaluation on test data
344 | plot(s, main = "OLS regression", pch = 20, xlab = "", ylab = "")
345 | points(s[-training, ], col = "red", pch = 15)
346 | f <- function(x) reg1$coef[1] + reg1$coef[2] * x
347 | segments(x0 = s[-training, 1], x1 = s[-training, 1], y0 = s[-training, 2], y1 = f(s[-training, 1]), col = "red")
348 | abline(reg1)
349 | legend("bottom", pch = c(20,15), col = c("black", "red"), legend = c("training", "test"))
350 | error1 <- sum((f(s[-training, 1]) - s[-training, 2])^2)
351 | # error1
352 | text(x = 1.4, y = 0.16, expression(sum(e[i]^2, i = test) == 4.609), col = "red", cex=1.4)
353 | ## sinus model, evaluation
354 | plot(s, main = "OLS regression, sinus", pch = 20, xlab = "", ylab = "")
355 | points(s[-training, ], col = "red", pch = 15)
356 | reg2 <- lm(s[training, 2] ~ sin(s[training, 1]), data = s)
357 | f <- function(x) reg2$coef[1] + reg2$coef[2] * sin(x)
358 | f2 <- function(x, coef) coef[1] + coef[2] * sin(x)
359 | segments(x0 = s[-training, 1], x1 = s[-training, 1], 
360 |          y0 = s[-training, 2], y1 = f(s[-training, 1]), col = "blue" )
361 | #abline (reg2, col = "black")
362 | ss <- seq(-0.02,3.2,0.01)
363 | lines(ss, f2 (ss, coef (reg2)), lty = 2, col = "blue")
364 | legend("bottom", pch = c(20,15), col = c("black", "red"), legend = c("training", "test"))
365 | error1 <- sum((f(s[-training, 1]) - s[-training, 2])^2)
366 | # error1
367 | text(x = 1.4, y = 0.16, expression(sum(e[i]^2, test) == 0.467), col = "blue", cex=1.4)
368 | 
369 | ## ------------------------------------------------------------------------
370 | f <- function (x) reg2$coef[1] + reg2$coef[2] * sin(x)
371 | error1 <- numeric(1000)
372 | n <- nrow(s)
373 | for (i in 1:1000){
374 |   training_ind <- sample(1:n, 70)
375 |   reg2 <- lm(s[training, 2] ~ sin(s[training, 1]), data = s)
376 |   error1[i] <- sum((f(s[-training_ind, 1]) - s[-training_ind, 2])^2)
377 | }
378 | summary (error1)
379 | 
380 | ## ----B05113_07_09, echo = FALSE, fig.width=7, fig.height=7, fig=TRUE-----
381 | par(mfrow = c(2,2), pty = "s", mar = c(2,2,1.5,0.5))
382 | for (i in 1:4){
383 | plot(s, main = paste ( "OLS, Model 2, IndexTestObs .:", i), pch = 20, 
384 | xlab = "", ylab = "")
385 | training = (1:nrow(s))[-i]
386 | # print("omitted th observation:", paste(i, sep = ""))
387 | # print(training)
388 | points(s[-training, ], col = "red", pch = 15, cex = 1.4)
389 | reg2 <- lm(s[training, 2] ~ sin(s[training, 1]), data = s)
390 | f <- function (x) reg2$coef[1] + reg2$coef [2] * sin(x)
391 | f2 <- function (x, coef) coef[1] + coef[2] * sin(x)
392 | segments(x0 = s[-training, 1], x1 = s[-training, 1], y0 = s[-training, 2], y1 = f(s[-training, 1]), col = "blue" )
393 | #abline (reg2, col = "black")
394 | ss <- seq(-0.02,3.2,0.01)
395 | lines (ss, f2 (ss, coef (reg2)), lty = 2, col = "blue")
396 | legend("bottom", pch = c(20,15), col = c("black", "red"), legend = c("training", "test"))
397 | error1 <- sum ((f(s[-training, 1]) - s[-training, 2])^2)
398 | # print(paste("errors regarding the.", i, "- th observation", sep = ""))
399 | # print (error1)
400 | # cat("\n")
401 | text(x = 1.4, y = 0.16, 
402 |      paste("e", " = ", round(error1,5), sep=""), 
403 |      col="blue", cex=1.4)
404 | }
405 | 
406 | ## ------------------------------------------------------------------------
407 | n <- nrow(s)
408 | error1 <- numeric(n)
409 | for(i in 1:n){
410 |   reg2 <- lm(x2 ~ x1, data = s[-i, ])
411 |   error1[i] <- sum((f(s[i, 1]) - s[i, 2])^2)
412 | }
413 | mean(error1)
414 | 
415 | ## ---- warning=FALSE, message=FALSE---------------------------------------
416 | library("cvTools")
417 | fit <- lm(x2 ~ x1, data = s)
418 | # perform cross-validation
419 | cvFit(fit, data = s, y = s$x2, cost = mspe, 
420 |     K = 5, R = 10, seed = 1234)
421 | 
422 | ## ------------------------------------------------------------------------
423 | library("robustbase")
424 | # set up folds for cross-validation
425 | folds <- cvFolds(nrow(coleman), K = 5, R = 10)
426 | ## compare LS, MM and LTS regression
427 | ## perform cross-validation for an LS regression model
428 | fitLm <- lm(prestige ~ ., data = Prestige)
429 | cvFitLm <- cvLm(fitLm, cost = mspe, 
430 |     folds = folds)
431 | fitLm2 <- lm(prestige ~ income:type + education + women, data = Prestige)
432 | cvFitLm2 <- cvLm(fitLm, cost = mspe, 
433 |     folds = folds)
434 | ## perform cross-validation for an MM regression model
435 | fitLmrob <- lmrob(prestige ~ ., data = Prestige)
436 | cvFitLmrob <- cvLmrob(fitLmrob, cost = mspe, 
437 |     folds = folds)
438 | fitLmrob2 <- lmrob(prestige ~ income:type + education + women, data = Prestige)
439 | cvFitLmrob2 <- cvLmrob(fitLmrob, cost = mspe, 
440 |     folds = folds)
441 | ## compare cross-validation results
442 | cvSelect(LS = cvFitLm, LS2 = cvFitLm2, 
443 |          MM = cvFitLmrob, MM2 = cvFitLmrob2)
444 | 
445 | ## ------------------------------------------------------------------------
446 | sessionInfo()
447 | 
448 | 


--------------------------------------------------------------------------------
/Chapter 4/chapter4.R:
--------------------------------------------------------------------------------
  1 | ## ---- echo=FALSE---------------------------------------------------------
  2 | knitr::opts_chunk$set(cache=TRUE)
  3 | 
  4 | ## ---- eval=FALSE, echo=FALSE, message=FALSE,warning=FALSE----------------
  5 | ## library("random")
  6 | ## x <- randomNumbers(n=10000, col=2, min=0,
  7 | ##           max=1e+06, check=TRUE) / 1000000
  8 | ## save(x, file="~/workspace/simulation-book/book/chapter4/r12.RData")
  9 | 
 10 | ## ---- eval=FALSE, echo=TRUE, message=FALSE,warning=FALSE-----------------
 11 | ## library("random")
 12 | ## x <- randomNumbers(n=10000, col=2, min=0,
 13 | ##           max=1e+06, check=TRUE) / 1000000
 14 | 
 15 | ## ---- echo=FALSE, message=FALSE,warning=FALSE----------------------------
 16 | load("~/workspace/simulation-book/book/chapter4/r12.RData")
 17 | r1 <- as.numeric(x)
 18 | require(ggplot2)
 19 | theme_set(theme_bw())
 20 | 
 21 | ## ----B05113_04_01, fig.width=4, fig.height=4, eval=TRUE------------------
 22 | n <- length(x)
 23 | df <- data.frame(x1 = x[1:(n-1)], x2 = x[2:n])
 24 | ggplot(df, aes(x = x1, y = x2)) + geom_point(size = 0.1) +
 25 | xlab("random numbers from random.org") + ylab("lag 1")
 26 | 
 27 | ## ------------------------------------------------------------------------
 28 | seed <- 123
 29 | randu <- function(n) {
 30 |   for (i in 1:n) {
 31 |     seed <<- (65539 * seed) %% (2^31)
 32 | 	   result[i] <- seed / 2^31
 33 |   }
 34 |   return(result)
 35 | }
 36 | plot3S <- function(func, n=10000) {
 37 |   x <- func(n)
 38 |   require("rgl")
 39 |   x1 <- x[1:(n-2)]
 40 |   x2 <- x[2:(n-1)]
 41 |   x3 <- x[3:n]
 42 |   plot3d(x1, x2, x3, size=3)
 43 |   play3d(spin3d())
 44 | }
 45 | 
 46 | ## ---- eval=FALSE---------------------------------------------------------
 47 | ## plot3S(randu)
 48 | ## ## to compare it with R's standard generator
 49 | ## plot3S(runif) ## (Mersenne-Twister)
 50 | 
 51 | ## ------------------------------------------------------------------------
 52 | RNGkind()
 53 | 
 54 | ## ----B05113_04_04, fig.height=3, fig.width=6-----------------------------
 55 | ok <- RNGkind()
 56 | op <- par(mfrow=c(1,2), mar=c(3,4,2,2))
 57 | set.seed(111)
 58 | hist(rnorm(1000), main="Mersenne Twister, Inversion", freq=FALSE, xlim=c(-4,4), ylim=c(0,0.43), cex.main=0.7)
 59 | curve(dnorm(x), col = 2, lty = 1, lwd = 2, add = TRUE)
 60 | RNGkind("Super", "Box-Muller")
 61 | RNGkind()
 62 | hist(rnorm(1000), main="Super-Duper, Box-Muller", freq=FALSE, xlim=c(-4,4), ylim=c(0,0.43), cex.main=0.7) 
 63 | curve(dnorm(x), col = 2, lty = 1, lwd = 2, add = TRUE)
 64 | 
 65 | ## ------------------------------------------------------------------------
 66 | u <- runif(100, 0, 1)
 67 | ifelse(u <= 0.2, 0, 1)
 68 | 
 69 | ## ----B05113_04_06, echo=FALSE--------------------------------------------
 70 | plot(x=c(0,1), y=c(0,1.3), type="n", yaxt="n", xlab="U(0,1)", ylab="B(pi)")
 71 | segments(y0=-0.5, x0=0, y1=0.5, x1=0, lty=1, col="black", lwd=2)
 72 | segments(y0=0.5, y1=0.5, x0=0, x1=0.2, col="black", lwd=1, lty=2)
 73 | segments(y0=0.5, y1=1, x0=0.2, x1=0.2, lwd=2)
 74 | segments(y0=1, y1=1, x0=0.2, x1=1, col="black", lwd=1, lty=2)
 75 | segments(y0=1, y1=1.5, x0=1, x1=1, lwd=2)
 76 | points(0, 0.5, cex=2)
 77 | points(0.2, 0.5, cex=2, pch=20)
 78 | points(0.2,1, cex=2)
 79 | points(1,1, cex=2, pch=20)
 80 | arrows(x0=0.55376, x1=0.55376, y0=-0.15, y1=1, col="blue", lty=2)
 81 | arrows(x0=0.55376,x1=-0.04, y0=1, y1=1, col="blue", lty=2)
 82 | axis(2, at=c(1,0.5), labels=c(1,0), cex.axis=1.5, las=1)
 83 | text(x=0.53, y=0.3, labels="random number 0.554 \n drawn from U(0,1)", col="blue", srt=0)
 84 | 
 85 | ## ----B05113_04_07, echo=FALSE--------------------------------------------
 86 | plot(x=c(0,1), y=c(0,3), type="n", xlab="U(0,1)", 
 87 | 		ylab="exponential distributed")
 88 | s <- seq(0,1,length.out=1000)
 89 | lines(x=s, y=qexp(s), lwd=2)
 90 | arrows(x0=0.55376, x1=0.55376, y1=qexp(0.55376), y0=-015, col="blue", lty=2)
 91 | arrows(x0=0.55376, x1=-0.04, y0=qexp(0.55476), y1=qexp(0.55476), col="blue", lty=2)
 92 | 
 93 | ## ----B05113_04_08--------------------------------------------------------
 94 | library("MASS")
 95 | invExp <- function(n, lambda = 1) {
 96 |   u <- runif(n)
 97 |   x <- -(1/lambda) * log(u)
 98 |   return(x)
 99 | }
100 | lambda <- c(0.5, 1, 2, 5)
101 | par(mar = c(3,3,3,0), mfrow = c(2, 2))
102 | for (l in lambda) {
103 |   sample <- invExp(10000, l)
104 |   truehist(sample, nbins = 20, col = "limegreen", 
105 | 		   main = paste("lambda =", l), xlab = "")
106 |   curve(dexp(x, l), from = 1e-10, add = TRUE)
107 | }
108 | 
109 | ## ---- echo=FALSE---------------------------------------------------------
110 | set.seed(1234)
111 | 
112 | ## ------------------------------------------------------------------------
113 | sample(1:3, 10, replace = TRUE, prob = c(3/7,1/7,3/7))
114 | 
115 | ## ----B05113_04_09, echo=FALSE--------------------------------------------
116 | par(mfrow=c(1,2))
117 | tab <- c(3/7, 1/7, 3/7)
118 | #par(xaxt="n", yaxt="n")
119 | barplot(tab, xaxt="n", yaxt="n", ylab="probabilties", xlab="categories")
120 | axis(1, at=c(0.65, 1.85, 3.13), labels=c("x1", "x2", "x3"))
121 | axis(2, at=c(0,1/7,3/7), labels=c(0, "1/7","3/7"))
122 | tab <- c(7/21, 7/21, 7/21)
123 | #par(xaxt="n", yaxt="n")
124 | barplot(tab, xaxt="n", yaxt="n", ylab="probabilties", xlab="categories", ylim=c(0, 0.42857))
125 | #axis(1, at=c(0.65, 1.85, 3.13), labels=c("x1", "x2", "x3"))
126 | axis(2, at=c(0,3/21,5/21,7/21), labels=c(0, "3/21","5/21","7/21"))
127 | segments(x0=1.3897, x1=2.3943, y0=5/21, y1=5/21)
128 | segments(x0=2.5879, x1=3.5888, y0=3/21, y1=3/21)
129 | text(x=c(0.63704, 1.85844, 1.84147, 3.04591, 3.04591), 
130 |      y=c(0.16615, 0.28473, 0.13672, 0.064876, 0.237125),      
131 |      c("x1","x1","x3","x2","x3"))
132 | 
133 | ## ---- echo=FALSE, message=FALSE, warning=FALSE---------------------------
134 | require("knitr")
135 | tab <- data.frame("i"=1:3, "xi"=c("x1","x3","x2"), 
136 |        "pi"=c("3/7","3/7","1/7"), 
137 |        "ai"=c("x1","x1","x3"),
138 |        "ri"=c("1","5/7","3/7"))
139 | kable(tab)
140 | 
141 | ## ---- echo=FALSE---------------------------------------------------------
142 | library(codatables)
143 | data("precipitation")
144 | a <- addmargins(precipitation)
145 | kable(a)
146 | 
147 | ## ------------------------------------------------------------------------
148 | x <- data.frame("spring" = c(275,56,52,65,37,23),
149 |                             "summer" = c(302,20,29,17,15,5),
150 |                             "autumn" = c(375,43,53,52,54,50),
151 |                             "winter" = c(198,37,44,69,58,42))
152 | 
153 | ## ---- eval=TRUE, warning=FALSE, message=FALSE----------------------------
154 | xx <- expand.grid(rownames(x), colnames(x)) # all combinations
155 | x1 <- xx[,1]
156 | x2 <- xx[,2]
157 | y <- as.vector(t(prop.table(x))) # cell probabilites
158 | form <- y ~ x1:x2  # modell
159 | mod <- glm(form, family="poisson") # estimation
160 | pred <- (predict(mod))  # prediction
161 | pred <- exp(pred)/(1+exp(pred))  # transf. with logistic function
162 | 
163 | ## ------------------------------------------------------------------------
164 | round(matrix(pred, ncol=4, byrow=TRUE) * sum(x)) # table
165 | 
166 | ## ------------------------------------------------------------------------
167 | x <- seq(-5, 5, length = 200)
168 | dc <- dcauchy(x, location = 0, scale = 1)
169 | dn <- dnorm(x, mean = 0, sd = 1)
170 | 
171 | ## ----B05113_04_10--------------------------------------------------------
172 | par(mfrow=c(1,2), mar = c(4,4,0.1,1))
173 | plot(x, dn, type="l", ylab = "f(x) and h(x)    . In the R code: dn and dc")
174 | lines(x, dc, col="blue", lty=2)
175 | legend("topright", col=c("black", "blue"), lty=c(1,2), legend = c("normal, f(x)", "Cauchy, h(x)"), cex=1)
176 | plot(x, dn/dc, type="l", ylab = "f(x) / h(x)   . In the R code: (dn / dc)")
177 | 
178 | ## ------------------------------------------------------------------------
179 | foo <- function(x) dnorm(x)/dcauchy(x)
180 | opt <- optimize(foo, c(0, 4), maximum=TRUE)
181 | a <- opt$objective
182 | a
183 | ah <- a * dc
184 | 
185 | ## ----B05113_04_11--------------------------------------------------------
186 | plot(x, dn, type="l", ylim=c(0,0.5), lwd=2, ylab = "densities")
187 | lines(x, dc, col="blue", lty=2)
188 | lines(x, ah, col="blue", lty=2, lwd=2)
189 | legend("topright", col=c("black", "blue", "blue"), lty=c(1,2,2), lwd=c(1,1,2), legend = c("normal, f(x)", "Cauchy, h(x)", "g(x) = a * Cauchy"), cex=1)
190 | 
191 | ## ----B05113_04_12--------------------------------------------------------
192 | plot(x, dn, type="l", ylim=c(0,0.5), lwd=2)
193 | polygon(x, dn, col="gray")
194 | polygon(c(x, rev(x)), c(dn, rev(ah)), col="blue")
195 | 
196 | ## ------------------------------------------------------------------------
197 | alpha <- function(x){
198 | 	dnorm(x)/(1.520347 * dcauchy(x))
199 | }
200 | 
201 | rejectionNorm <- function(n) {
202 | 	x <- rcauchy(10000,0,1)
203 | 	u <- runif(10000)
204 | 	return(na.omit(ifelse(u <= alpha(x), x, NA)))
205 | }
206 | 
207 | ## ----B05113_04_13--------------------------------------------------------
208 | set.seed(123)
209 | x <- rejectionNorm(10000)
210 | hist(x, prob=TRUE)
211 | curve(dnorm(x), lty = 1, lwd = 2, add = TRUE)
212 | 
213 | ## ----B05113_04_14--------------------------------------------------------
214 | curve(dbeta(x, shape1 = 2, shape2 = 2), from = 0, to = 1,
215 |   xlab = "", ylab = "", main = "")
216 | ## a * h(x):
217 | abline(h = 1.5, lty = 2)
218 | 
219 | ## ------------------------------------------------------------------------
220 | rsBeta <- function(n) {
221 |   z <- runif(n)
222 |   u <- runif(n)
223 |   ind <- (u <= 4 * z * (1 - z))
224 |   return(z[ind])
225 | }
226 | set.seed(123)
227 | sample1 <- rsBeta(10000)
228 | acceptS <- length(sample1) / 10000
229 | acceptS
230 | 
231 | ## ---- echo=TRUE, tidy=FALSE----------------------------------------------
232 | library(MASS)
233 | plot1 <- function(s, shape1=2, shape2=2){
234 |   truehist(s, h = 0.1, xlim = c(0, 1), #ylim = c(0,2), 
235 |   			col="white", main = "", xlab = "")
236 |   curve(dbeta(x, shape1 = shape1, shape2 = shape2),
237 |         from = 0, to = 1, add = TRUE)
238 |   d <- density(s, from = 0, to = 1, adjust = 2, 
239 |                kernel = "gaussian")
240 |   lines(d$x, d$y, lty = 2)
241 |   legend("topright", 
242 |          legend = c("true density",	"density of simulated values"), 
243 |          col = c(1, 1), lty = c(1, 2), cex = 0.6)	
244 | }
245 | 
246 | ## ----B05113_04_15--------------------------------------------------------
247 | plot1(sample1) # produces a histgram and curve, shown below:
248 | 
249 | ## ---- eval=FALSE, fig.keep='none'----------------------------------------
250 | ## rsBeta2 <- function(n, shape1=2.5, shape2=6.5){
251 | ## 	a <- optimize(f=function(x){dbeta(x,shape1,shape2)},
252 | ## 			interval=c(0,1), maximum=TRUE)$objective
253 | ## 	z <- runif(n)
254 | ## 	u <- runif(n, max=a)
255 | ## 	ind <- (u <= dbeta(z,shape1,shape2))
256 | ## 	return(z[ind])
257 | ## }
258 | ## sample2 <- rsBeta2(10000)
259 | 
260 | ## ------------------------------------------------------------------------
261 | # percentage of rejection
262 | (1- (pcauchy(5) - pcauchy(4))) * 100
263 | v <- rcauchy(1000)
264 | v <- v[v >= 4 & v <= 5]
265 | v
266 | v[1:10]
267 | 
268 | ## ------------------------------------------------------------------------
269 | Fa <- pcauchy(4)
270 | Fb <- pcauchy(5)
271 | u <- runif(10, min = 0, max = Fb - Fa)
272 | qcauchy(Fa + u)
273 | 
274 | ## ------------------------------------------------------------------------
275 | ## Simple random walk Markov chain:
276 | n <- 10; set.seed(123)
277 | x <- numeric(n)
278 | for(i in 2:n){
279 |   x[i] <- x[i-1] + rnorm(1)
280 | }
281 | x
282 | 
283 | ## ------------------------------------------------------------------------
284 | set.seed(123)
285 | x <- numeric(n)
286 | for(i in 2:n){
287 |   x[i] <- rnorm(1, mean = x[i-1])
288 | }
289 | x
290 | 
291 | ## ------------------------------------------------------------------------
292 | f <- function(x, sigma){
293 |   if(any(x < 0)) return(0)
294 |   stopifnot(sigma > 0)
295 |   return((x / sigma^2) * exp(-x^2 / (2*sigma^2)))
296 | }
297 | 
298 | ## ------------------------------------------------------------------------
299 | i <- 2
300 | xt <- x[i-1] <- rchisq(1, 1)
301 | y <- rchisq(1, df=xt)
302 | 
303 | ## ------------------------------------------------------------------------
304 | rrai <- function(n = 10000, burnin = 1000, thin = 10, sigma = 4, verbose = TRUE){
305 |   ## raileigh density
306 |   f <- function(x, sigma){
307 |     if(any(x < 0)) return(0)
308 |     stopifnot(sigma > 0)
309 |     return((x / sigma^2) * exp(-x^2 / (2*sigma^2)))
310 |   }
311 |   x <- numeric(n)
312 |   x[1] <- rchisq(1, df=1)
313 |   k <- 0; u <- runif(n)
314 |   for(i in 2:n){
315 |     xt <- x[i-1]
316 |     y <- rchisq(1, df=xt)
317 |     num <- f(y, sigma) * dchisq(xt, df=y)
318 |     den <- f(xt, sigma) * dchisq(y, df=xt)
319 |     if(u[i] <= num/den){
320 |       x[i] <- y
321 |     } else {
322 |       x[i] <- xt
323 |       k <- k + 1 # y is rejected
324 |     }
325 |   }
326 |   if(verbose) cat("acceptance rate:", (k-burnin)/n/thin, "\n")
327 |   ## burn-in:
328 |   if(burnin > 1) x <- x[(burnin+1):length(x)]
329 |   ## thining:
330 |   return(x[seq(1, length(x), thin)])
331 | }
332 | 
333 | r <- rrai(n = 10000, thin = 1, burnin = 1)
334 | r <- rrai(n = 10000, thin = 10, burnin = 1000)
335 | length(r)
336 | 
337 | ## ----B05113_04_16--------------------------------------------------------
338 | qplot(1:length(r), r, geom="line", xlab="", ylab="random numbers from Rayleigh(4)")
339 | 
340 | ## ----B05113_04_17--------------------------------------------------------
341 | a <- ppoints(length(r))
342 | sigma <- 4
343 | QR <- sigma * sqrt (-2 * log (1-a)) # quantiles of Rayleigh
344 | Q <- quantile(r, a)
345 | qqplot(QR, Q, main = "", xlab = "Rayleigh quantile", ylab = "sample quantile")
346 | 
347 | ## ----B05113_04_18, echo=FALSE--------------------------------------------
348 | par(mfrow=c(1,2))
349 | SEQ <- seq(-4,4,0.05)
350 | par(mar=c(2.5,3.8,3,1.1))
351 | plot(SEQ, dnorm(SEQ), xaxt="n", type="l", lwd=2)
352 | axis(1,at=c(-2,-1), label=c(expression(x[t]), expression(y[t])))
353 | segments(x0=-2,x1=-2,y1=dnorm(-2),y0=-1)
354 | segments(x0=-2,x1=-5,y1=dnorm(-2),y0=dnorm(-2))
355 | segments(x0=-1,x1=-1,y1=dnorm(-1),y0=-1)
356 | segments(x0=-1,x1=-5,y1=dnorm(-1),y0=dnorm(-1))
357 | axis(2,at=c(dnorm(-2),dnorm(-1)), 
358 | 		label=c(expression(f(x[t])), expression(f(y[t]))), las=1)
359 | abline(h=0, lty=2)
360 | SEQ <- seq(-4,4,0.05)
361 | par(mar=c(2.5,3,3,0.1))
362 | plot(SEQ, dnorm(SEQ), xaxt="n", type="l", lwd=2)
363 | axis(1,at=c(-1.85,-1.5), label=c(expression(y[t]), expression(x[t])))
364 | segments(x0=-1.85,x1=-1.85,y1=dnorm(-1.85),y0=-1)
365 | segments(x0=-1.85,x1=-5,y1=dnorm(-1.85),y0=dnorm(-1.85))
366 | segments(x0=-1.5,x1=-1.5,y1=dnorm(-1.5),y0=-1)
367 | segments(x0=-1.5,x1=-5,y1=dnorm(-1.5),y0=dnorm(-1.5))
368 | axis(2,at=c(dnorm(-1.85),dnorm(-1.5)), 
369 | 		label=c(expression(f(y[t])), expression(f(x[t]))), las=1)
370 | abline(h=0, lty=2)
371 | 
372 | ## ------------------------------------------------------------------------
373 | mh <- function(n=10000, burnin=1000, thin=10, cand=runif, 
374 | 		target=dbeta, shape1=2, shape2=2){
375 | 	if(burnin >= n){ 
376 | 		stop("burnin is larger than the number of simulations")
377 | 	}
378 | 	x <- rep(cand(1), n) # initialization
379 | 	for(i in 2:n){
380 | 		y <- cand(1)
381 | 		rho <- target(y,shape1,shape2)/
382 | 				      target(x[i-1], shape1, shape2)
383 | 		x[i] <- x[i-1] + (y - x[i-1]) * (runif(1) < rho)
384 | 	}
385 |   # burn-in
386 | 	x <- x[(burnin+1):n]
387 |   return(x[seq(1, length(x), thin)])
388 | }
389 | 
390 | ## ----B05113_04_19--------------------------------------------------------
391 | par(mfrow=c(1,2))
392 | plot(density(mh()), main = "", xlab = "")
393 | plot(density(mh(shape1=1)), main = "", xlab = "")
394 | 
395 | ## ------------------------------------------------------------------------
396 | rgamma <- mh(cand = rnorm, target = dgamma)
397 | 
398 | ## ----B05113_04_20--------------------------------------------------------
399 | gibbs_bivariate <- function(n = 1000, rho = 0.9, start = 0, burnin = 100, thin = 1){
400 |   x <- y <- numeric(n)
401 | 	s <- 1 - rho^2
402 | 	x[1] <- start # to show effect of burnin
403 | 	for(t in 1:(n-1)){
404 | 	   y[t+1] <- rnorm(1, rho*x[t], s)
405 | 	   x[t+1] <- rnorm(1, rho*y[t+1], s)
406 | 	}
407 | 	s <- seq(burnin+1, n, thin)
408 | 	return(cbind(x[s], y[s]))
409 | }	
410 | par(mfrow=c(1,3))
411 | set.seed(123)
412 | ## bad start:
413 | b0 <- gibbs_bivariate(n=200, start = 30, burnin=0)
414 | ## plot the results
415 | plot(b0, type="o", xlab="x", ylab="y", main="burnin 0",
416 | 		cex.main=1.3, cex.lab=1.3)
417 | set.seed(123)
418 | plot(b0[20:200,], type="o", xlab="x", ylab="y", main="burnin 20",
419 | 		cex.main=1.3, cex.lab=1.3, col=grey(20:200/200))
420 | set.seed(123)
421 | plot(b0[20:200,], pch=20, xlab="x", ylab="y", main="burnin 20",
422 | 		cex.main=1.3, cex.lab=1.3)
423 | 
424 | ## ------------------------------------------------------------------------
425 | lreg <- function(y, x, time, alpha = 0, beta = -2, tau = 1, burnin = 0, thin = 1){
426 | 	n <- length(y)
427 | 	## alpha, beta, tau defining varibles
428 | 	res <- matrix(, ncol=3, nrow=time)
429 | 	for(i in 1:time){
430 | 		alpha <- rnorm(1, mean(y)  -beta * mean(x), 1 / (n  *tau))
431 | 		m <- (sum(x * y) - alpha * n * mean(x)) / sum(x**2)
432 | 		s <- 1 / (tau * sum(x**2))
433 | 		beta <- rnorm(1, m, s)
434 | 		w <- y - alpha - beta * x
435 | 		tau <- rgamma(1, ((n / 2) + 1), (sum(w**2) / 2))
436 | 		res[i,] <- c(alpha, beta, tau)
437 | 	}
438 | 	s <- seq(1, length((burnin + 1):nrow(res)), thin)
439 | 	res <- res[((burnin+1):nrow(res))[s], ]
440 | 	res <- data.frame(res)
441 | 	colnames(res) <- c("alpha", "beta", "tau")
442 | 	return(res)
443 | }	
444 | 
445 | ## ----B05113_04_21--------------------------------------------------------
446 | data(Cars93, package = "MASS")
447 | set.seed(123)
448 | time <- 100
449 | res <- lreg(Cars93$Price, Cars93$Horsepower, time = time)
450 | par(mar = c(4,4,0.1,0.1))
451 | plot(Cars93$Horsepower, Cars93$Price, pch=20, xlab = "Horsepower", ylab = "Price", type = "n")
452 | range <- 1 - sqrt(1:time/time)
453 | range <- range + 0.1
454 | #range <- range/sum(2*range) 
455 | for(i in 1:time){
456 | 	abline(a = res[i, 1], b = res[i, 2], col=gray(range[i]))#sqrt(1-i/size)))
457 | }
458 | abline(a = res[i, 1], b = res[i, 2], col="red", lty=2,lwd=3)#sqrt(1-i/size)))
459 | points(Cars93$Horsepower, Cars93$Price, pch=20)
460 | 
461 | ## ----B05113_04_22, message=FALSE, warning=FALSE, fig.width=7, fig.height=4----
462 | set.seed(123)
463 | g <- lreg(Cars93$Price, Cars93$Horsepower, time = 500)
464 | g1 <- cbind(g, "index" = 1:nrow(g))
465 | g1 <- reshape2::melt(g1, id=c("index"))
466 | ggplot(g1, aes(x = index, y = value)) + geom_line() + facet_wrap(~variable, scales = "free_y") 
467 | 
468 | ## ----B05113_04_23, message=FALSE, warning=FALSE--------------------------
469 | plot(acf(g), mar=c(2.9,3.2,2,0), cex.lab = 1.4, cex.main = 1.3)
470 | 
471 | ## ---- fig.height=4, fig.width=7, cache=FALSE-----------------------------
472 | library("coda")
473 | time <- 2000; M <- 5
474 | set.seed(12345)
475 | df <- lreg(Cars93$Price, Cars93$Horsepower, time = time)
476 | for(i in 2:M){
477 |   df <- rbind(df, lreg(Cars93$Price, Cars93$Horsepower, time = time))
478 | }
479 | df$M <- factor(rep(1:M, each = time))
480 | df$index <- rep(1:time, M)
481 | df <- reshape2::melt(df, id = c("M", "index"))
482 | 
483 | ## ----B05113_04_24, fig.height=4, fig.width=7, cache=FALSE----------------
484 | ggplot(df, aes(x = index, y = value, group = M, colour=M)) + geom_line(alpha = 0.5) + facet_wrap(~variable, scales = "free_y")
485 | 
486 | ## ----gelmandiag, eval = FALSE--------------------------------------------
487 | ## ## Brooke-Gelman
488 | ## gl <- list()
489 | ## M <- 15
490 | ## set.seed(12345)
491 | ## for(i in 1:M){
492 | ##   gl[[i]] <- lreg(Cars93$Price, Cars93$Horsepower, time = time)
493 | ## }
494 | ## gl <- lapply(gl, function(x) mcmc(as.matrix(x)))
495 | ## ## look also at summary(g) (not shown here)
496 | ## gelman.diag(gl, autoburnin = FALSE)
497 | ## 
498 | ## ## Potential scale reduction factors:
499 | ## ##
500 | ## ##      Point est. Upper C.I.
501 | ## ## alpha       1.07       1.07
502 | ## ## beta        1.07       1.07
503 | ## ## tau         1.00       1.00
504 | ## ##
505 | ## ## Multivariate psrf
506 | 
507 | ## ----B05113_04_25, fig.height=4, fig.width=6, eval=FALSE-----------------
508 | ## gelman.plot(gl, autoburnin = FALSE)
509 | 
510 | ## ---- eval=FALSE---------------------------------------------------------
511 | ## burnin <- 1000
512 | ## time <- burnin + time * 20
513 | ## g <- lreg(Cars93$Price, Cars93$Horsepower, time = time, burnin = burnin, thin = 20)
514 | 
515 | ## ------------------------------------------------------------------------
516 | circle <- function(x, r=0.05){
517 |     repeat{
518 |         x1 <- runif(1,-1,1)
519 |         x2 <- runif(1,-1,1)
520 |         if( sqrt(x1^2 + x2^2) <= (1 - r) ) break
521 |     }
522 |     inCircle <- ((x[,1] - x1)^2 + (x[,2] - x2)^2) <= r^2
523 |     return(inCircle)
524 | }
525 | 
526 | ## ------------------------------------------------------------------------
527 | set.seed(123)
528 | ## take possible radii
529 | x <- matrix(runif(10000, -1, 1), ncol=2)
530 | ## radii to the square
531 | r2 <- rowSums(x^2)
532 | ## r2 smaller than 1 are kept
533 | x1 <- x[r2 <= 1, ]
534 | 
535 | ## ----B05113_04_26--------------------------------------------------------
536 | par(mar = c(2,2,0.1,0.1))
537 | plot(data.frame(x1), pch=20)
538 | for(k in 1:8) points(data.frame(x1[circle(x1, 0.2),]), col=k, pch=20)
539 | 
540 | ## ------------------------------------------------------------------------
541 | set.seed(123)
542 | z <- replicate(2000, sum(circle(x1)))
543 | 
544 | ## ------------------------------------------------------------------------
545 | TAB <- table(z)
546 | TAB
547 | 
548 | ## ------------------------------------------------------------------------
549 | laeken::weightedMedian(as.numeric(names(TAB)), as.numeric(TAB))
550 | 
551 | ## ----B05113_04_27--------------------------------------------------------
552 | lambda <- nrow(x1) * 0.05^2
553 | PROB <- dpois(as.numeric(names(TAB)), lambda)
554 | b <- barplot(TAB / length(z))
555 | points(b, PROB, col="red", pch=16) 
556 | 
557 | ## ------------------------------------------------------------------------
558 | ## the five classes:
559 | QP <- qpois(seq(0,1,by=1/6), lambda)
560 | QP
561 | ## frequency counts in those classes
562 | TAB1 <- table(cut(z, QP, include.lowest=TRUE))
563 | TAB1
564 | 
565 | ## ------------------------------------------------------------------------
566 | ppois(QP, lambda)
567 | ## 0 should be in the left class:
568 | QP1 <- QP
569 | QP1[1] <- -1
570 | ## probablities for each class:
571 | PROB1 <- diff(ppois(QP1, lambda))
572 | PROB1
573 | ## goodness-of-fit test:
574 | chisq.test(TAB1, p=PROB1)
575 | 
576 | ## ------------------------------------------------------------------------
577 | sessionInfo()
578 | 
579 | 


--------------------------------------------------------------------------------
/Chapter 8/chapter8.R:
--------------------------------------------------------------------------------
  1 | ## ------------------------------------------------------------------------
  2 | library("robustbase")
  3 | data("hbk")
  4 | ## structure of the data
  5 | str(hbk)
  6 | 
  7 | ## ------------------------------------------------------------------------
  8 | lm_ols <- lm(Y ~ ., data = hbk)
  9 | ## print summary
 10 | summary(lm_ols)
 11 | 
 12 | ## ----B05113_08_01--------------------------------------------------------
 13 | plot(lm_ols, which = 3)
 14 | 
 15 | ## ------------------------------------------------------------------------
 16 | lm_rob <- lmrob(Y ~ ., data = hbk)
 17 | ## print summary
 18 | summary(lm_rob)
 19 | 
 20 | ## ----B05113_08_02--------------------------------------------------------
 21 | plot(lm_rob, which = 5)
 22 | 
 23 | ## ----B05113_08_03--------------------------------------------------------
 24 | pairs(hbk, pch = ".")
 25 | 
 26 | ## ---- warning=FALSE, message=FALSE---------------------------------------
 27 | data(Prestige, package = "car")
 28 | rob <- ltsReg(prestige ~ log(income) + women + type, data = Prestige)
 29 | summary(rob)
 30 | 
 31 | ## ------------------------------------------------------------------------
 32 | boot.lts <- function(x, indices){
 33 |   x <- x [indices,]
 34 |   model <- ltsReg(prestige ~ log(income) +     
 35 |                  women + type, data = x)
 36 |   coefficients(model)
 37 | }
 38 | 
 39 | ## ---- cache=TRUE, message=FALSE,warning=FALSE----------------------------
 40 | library("boot")
 41 | set.seed(123)
 42 | rob_boot <- boot(Prestige, boot.lts, 1000)
 43 | ## estimated standard errors
 44 | rob_boot
 45 | 
 46 | ## ---- echo=FALSE, message=FALSE, warning=FALSE---------------------------
 47 | library(boot)
 48 | library(MASS)
 49 | library(e1071)
 50 | 
 51 | ## ----B05113_08_04--------------------------------------------------------
 52 | hist(rob_boot$t[,2], 50, xlab = "bootstrap repl., log(income)", main = "")
 53 | 
 54 | ## ----B05113_08_05--------------------------------------------------------
 55 | hist(rob_boot$t[,3], 50, xlab = "bootstrap repl., women", main = "")
 56 | 
 57 | ## ------------------------------------------------------------------------
 58 | boot.ci(rob_boot, index = 2, 
 59 |         type = c("norm", "perc", "bca"))
 60 | 
 61 | ## ----B05113_08_06, fig.width=6, fig.height=7-----------------------------
 62 | par(mfrow = c(2,1), mar = c(4,4,2,0.5))
 63 | jack.after.boot(rob_boot, index = 2, main = 'log (income) coefficient')
 64 | jack.after.boot(rob_boot, index = 3, main = 'woman coefficient')
 65 | 
 66 | ## ------------------------------------------------------------------------
 67 | set.seed(12)
 68 | df <- data.frame(x = 1:7, y = 1:7 + rnorm(7))
 69 | 
 70 | ## ----B05113_08_07, fig.width=6, fig.height=7-----------------------------
 71 | par(mfrow = c(2,1), mar = c(4,4,1,0.3))
 72 | ## fit to original data
 73 | lm_orig <- lm(y ~ x, data = df)
 74 | ## plot original data
 75 | plot(y ~ x, data = df)
 76 | ## add regression line from original data
 77 | abline(lm_orig)
 78 | ## show the connection lines 
 79 | ## between original and fitted y
 80 | segments(x0 = df$x, x1=df$x,  
 81 |          y0=df$y, y1=lm_orig$fit)
 82 | ## fitted y
 83 | points(df$x, lm_orig$fit, pch=20, col="red")
 84 | legend("topleft", legend = c("y", expression(hat(y))),
 85 |        pch = c(1,20), col = c(1,2))
 86 | ## second plot ---------------------
 87 | ## plot of fitted values
 88 | plot(lm_orig$fit ~ df$x, col="red", pch = 20, 
 89 |      ylab="y", xlab = "x")
 90 | ## bootstrap sample by adding sampled residuals
 91 | y1 <- lm_orig$fit + sample(lm_orig$res, replace = TRUE)
 92 | ## new bootstrap sample
 93 | points(df$x, y1, col="blue", pch = 3)
 94 | ## connection lines new bootrap sample to 
 95 | ## fitted values from original data
 96 | segments(x0 = df$x, x1 = df$x,
 97 |          y0 = lm_orig$fit, y1 = y1, col ="blue")
 98 | ## regression line from original data
 99 | abline(lm_orig)
100 | ## regression line from bootstrap sample
101 | abline(lm(y1 ~ df$x), col = "blue", lty = 2)
102 | legend("topleft", legend = c("original", "bootstrap repl. 1"), lty = c(1,2), col = c(1,4))
103 | 
104 | ## ---- cache=TRUE, message=FALSE, warning=FALSE---------------------------
105 | Prestige <- na.omit(Prestige)
106 | ## fit model on original data
107 | rob2 <- ltsReg(prestige ~ log(income) + women + type,
108 |                data = Prestige)
109 | 
110 | ## ------------------------------------------------------------------------
111 | residuals <- residuals(rob2)
112 | fit <- fitted(rob2)
113 | ## fix X, model matrix
114 | X <- model.matrix(rob2, Prestige)[, -1]
115 | 
116 | ## ------------------------------------------------------------------------
117 | ltsBoot2 <- function(x, indices){
118 |   y <- fit + residuals[indices]
119 |   model <- ltsReg(y ~ X)
120 |   coefficients(model)
121 | }
122 | 
123 | ## ---- message=FALSE, warning=FALSE---------------------------------------
124 | rob2_boot <- boot(Prestige, ltsBoot2, R = 2000)
125 | ## show results
126 | rob2_boot
127 | 
128 | ## ----B05113_08_08--------------------------------------------------------
129 | par(mfrow = c(1,2), mar = c(4,4,1,0.2))
130 | hist(rob2_boot$t[,2], 50, xlab = "bootstrap repl., log(income)", main = "")
131 | hist(rob2_boot$t[,3], 50, xlab = "bootstrap repl., women", main = "")
132 | 
133 | ## ----B05113_08_09, fig.width=6, fig.height=7-----------------------------
134 | par(mfrow = c(2,1), mar = c(4,4,2,0.5))
135 | jack.after.boot(rob2_boot, index = 2, main = 'log (income) coefficient')
136 | jack.after.boot(rob2_boot, index = 3, main = 'woman coefficient')
137 | 
138 | ## ---- echo=FALSE---------------------------------------------------------
139 | gendat <- function(n, d)
140 | {
141 |  set.seed(1083)
142 |  s <- matrix(rnorm(d^2,0,0.2), ncol=d, nrow=d)
143 |  diag(s) <- 1
144 |  xorig <- mvrnorm(n ,mu = rep(0, d),Sigma = s)
145 |  xorig <- rbind(xorig, mvrnorm(round(n*0.05) ,mu = c(10, rep(0, d-1)), Sigma = s))
146 |  ## fix me: generell auch fuer mehrere Dimensionen
147 |  x <- xorig
148 |  
149 |  for (i in 1:d)
150 |  {
151 |        miss <- sample(1:dim(x)[1], sample(n/5), replace = TRUE)
152 |        x[miss,i]<-NA
153 |  }
154 |  w <- 0
155 |  w <- which(rowSums(is.na(x)) == d)
156 |  x <- x[-w,]
157 | 
158 | }
159 | x <- gendat(1000,3)
160 | 
161 | ## ---- message=FALSE, warning=FALSE---------------------------------------
162 | library("VIM")
163 | data("sleep")
164 | 
165 | ## ----B05113_08_10--------------------------------------------------------
166 | aggr(sleep, numbers = TRUE, prop = FALSE, cex.axis = 0.75)
167 | 
168 | ## ----B05113_08_11, message=FALSE, warning=FALSE--------------------------
169 | par(mar = c(6,4,0.2,0.2))
170 | matrixplot(sleep, sortby = "BrainWgt", interactive = FALSE)
171 | 
172 | ## ----B05113_08_12--------------------------------------------------------
173 | set.seed(123)
174 | sleep_boot1 <- sleep[sample(1:nrow(sleep), replace = TRUE), ]
175 | aggr(sleep_boot1, numbers = TRUE, prop = FALSE,
176 |      cex.axis = 0.75)
177 | 
178 | ## ----B05113_08_13, message=FALSE, warning=FALSE--------------------------
179 | par(mar = c(6,4,0.2,0.2))
180 | matrixplot(sleep_boot1, sortby = "BrainWgt")
181 | 
182 | ## ------------------------------------------------------------------------
183 | bootmiss <- function(x, R = 1000, 
184 |                      imputation = "median", 
185 |                      theta = "mean"){
186 |   ## initializations
187 |   d <- dim(x)[2]
188 |   n <- dim(x)[1]
189 |   thetaM <- matrix(NA, ncol=2, nrow=d)
190 |   xs <- theta_hat1 <- matrix(0, nrow=n, ncol=d)
191 |   med <- matrix(NA, ncol=d, nrow=R)
192 |   for(i in 1:R){
193 |     ## bootstrap sample
194 |     s1 <- x[sample(1:n, replace=TRUE), ]
195 |     ## imputation
196 |     if(method %in% c("median", "mean")){
197 |       simp <- impute(s1, what=method)
198 |     }
199 |     if(method == "kNN"){
200 |       simp <- knn(x)$data
201 |     }
202 |     ## estimation
203 |     med[i, ] <- apply(simp, 2, theta)
204 |   }
205 |   ## bootstrap mean (for each column)
206 |   thetaHat <- apply(med, 2, mean)
207 |   ## confidence intervals
208 |   for(i in 1:d){
209 |    thetaM[i,1] <- quantile(med[,i], 0.025)
210 |    thetaM[i,2] <- quantile(med[,i], 0.975)
211 |   }
212 |   ## results
213 |   res <- list(thetaHat = thetaHat,
214 |              ci = thetaM)
215 |   return(res)
216 | }
217 | 
218 | ## ---- cache=TRUE, message=FALSE, warning=FALSE, eval=FALSE---------------
219 | ## ci <- quantile(replicate(10000, mean(sleep[sample(1:n, replace = TRUE), "sleep"])), c(0.025, 0.975))
220 | ## ci
221 | 
222 | ## ---- eval=FALSE---------------------------------------------------------
223 | ## for(i in 1:ncol(x)){
224 | ##   indmiss <- sample(1:nrow(x), round(sample(nrow(x)/10)), replace = TRUE)
225 | ##   x[indmiss, i] <- NA
226 | ## }
227 | 
228 | ## ---- cache=TRUE, message=FALSE, warning=FALSE---------------------------
229 | n <- nrow(sleep)
230 | imputed <- kNN(sleep, imp_var = FALSE)
231 | ci_single <- quantile(replicate(10000, mean(imputed[sample(1:n, replace = TRUE), "Sleep"])), c(0.025, 0.975))
232 | ci_single
233 | 
234 | ## ---- cache=TRUE, message=FALSE, warning=FALSE, results='hide'-----------
235 | ci_boot <- quantile(replicate(10000, mean(kNN(sleep[sample(1:n, replace = TRUE), ], imp_var = FALSE)$Sleep)), c(0.025, 0.975))
236 | 
237 | ## ------------------------------------------------------------------------
238 | ci_boot
239 | 
240 | ## ------------------------------------------------------------------------
241 | set.seed(123)
242 | tseries <- rnorm(50)   
243 | ## introduce auto-correlation
244 | tseries[-1] <- tseries[-1] + tseries[-50]    
245 | 
246 | ## ----B05113_08_014-------------------------------------------------------
247 | plot(ts(tseries), ylab = "values")
248 | 
249 | ## ------------------------------------------------------------------------
250 | mbb <- function(x, R=1000, blocksize=6){
251 |   ## initialization
252 |   nc <- length(x)
253 |   lagCorMBB <- ct <- numeric(R)  
254 |   seriesBl <- numeric(nc) 
255 |   ## function for moving blocks bootstrap
256 |   corT <- function(x=tseries, N=nc, bl=blocksize){ 
257 |     ## for N/bl blocks
258 |   	for(i in 1:ceiling(N/bl)) {    
259 |   	  ## endpoint of block
260 |   		endpoint <- sample(bl:N, size=1) 
261 |   		## put blocks together, bootstrap sample
262 |   		seriesBl[(i-1)*bl+1:bl] <- x[endpoint-(bl:1)+1] 
263 |   	}
264 |   	seriesBl <- seriesBl[1:N]
265 |   	## autocorrelation
266 |   	a <- cor(seriesBl[-1],seriesBl[-N])  
267 |   	return(a)
268 |   }
269 |   ct <- replicate(R, corT(x))
270 |   return(ct)
271 | }
272 | 
273 | ## ------------------------------------------------------------------------
274 | mb <- mbb(x=tseries, R=10000)
275 | ## first 10 boostrap replicates
276 | mb[1:10]
277 | ## auto-correlation cofficient from mean of bootstrap replicates
278 | mean(mb)
279 | 
280 | ## ---- fig.keep='none'----------------------------------------------------
281 | acf(tseries)$acf[2]
282 | 
283 | ## ------------------------------------------------------------------------
284 | qu_mbb <- quantile(mb, c(0.025,0.975))
285 | cat("CI(mbb) : [", round(qu_mbb[1], 2), ",", round(qu_mbb[2], 2), "]")
286 | 
287 | ## ---- fig.keep='none', cache=TRUE, warning=FALSE, message=FALSE----------
288 | library("forecast") 
289 | ac <- taperedacf(tseries)
290 | cat("CI(classical) : [", round(ac$lower[1], 2), ",", round(ac$upper[1], 2), "]")
291 | 
292 | ## ---- tidy=FALSE---------------------------------------------------------
293 | x <- data.frame("location" = rep("Asten", 8), 
294 |       "income" = c(2000,2500,2000,2500,1000,1500,2000,2500), 
295 |       "weight" = c(1000,500,1000,500,1500,1000,1000,2000))
296 | x
297 | 
298 | ## ------------------------------------------------------------------------
299 | sum(x$weight)
300 | 
301 | ## ------------------------------------------------------------------------
302 | sum(x$income * x$weight) 
303 | 
304 | ## ------------------------------------------------------------------------
305 | set.seed(123)
306 | y <- x[sample(1:8, replace = TRUE), ] # Bootstrap Sample
307 | y
308 | 
309 | ## ------------------------------------------------------------------------
310 | # non-calibrated estimation
311 | sum(y$income * y$weight) 
312 | 
313 | ## ------------------------------------------------------------------------
314 | sum(y$weight)
315 | 
316 | ## ------------------------------------------------------------------------
317 | constant <- sum(x$weight) / sum(y$weight) 
318 | ## calibrated estimation
319 | sum(y$x * y$w * constant)
320 | 
321 | ## ------------------------------------------------------------------------
322 | library("laeken")
323 | data("eusilc")
324 | ## point estimate of poverty rate
325 | a <- arpr("eqIncome", weights = "rb050", data = eusilc)
326 | ## bootstrap with calibration
327 | ## define auxiliary 0-1 variables for regions
328 | aux <- sapply(levels(eusilc$db040), 
329 |     function(l, x) as.numeric(x == l), 
330 |     x = eusilc$db040)
331 | ## retrieve population totals from underlying sample
332 | totals <- sapply(levels(eusilc$db040), 
333 |     function(l, x, w) sum(w[x == l]), 
334 |     x = eusilc$db040, w = eusilc$rb050)
335 | # bootstrap variance
336 | variance("eqIncome", weights = "rb050", design = "db040", 
337 |     data = eusilc, indicator = a, X = aux, totals = totals, 
338 |     seed = 123)	
339 | 
340 | ## ------------------------------------------------------------------------
341 | temp <- read.table("http://venus.unive.it/romanaz/statistics/data/bodytemp.txt", header = TRUE)
342 | temp$gen <- factor(temp$gen, labels = c("male", "female"))
343 | str(temp)
344 | 
345 | ## ------------------------------------------------------------------------
346 | temp$celsius <- (temp$tf - 32) * 5 / 9
347 | 
348 | ## ----B05113_08_015-------------------------------------------------------
349 | library("ggplot2")
350 | ggplot(temp, aes(x = celsius, colour = gen, linetype = gen)) + geom_density(size = 1.2) + theme(text = element_text(size=16)) + theme_bw()
351 | 
352 | ## ------------------------------------------------------------------------
353 | temperature <- temp$celsius
354 | 
355 | ## ------------------------------------------------------------------------
356 | n <- length(temperature)
357 | temperature <- sort(temperature)
358 | y <- (0:(n-1)) / n
359 | 
360 | ## ----B05113_08_016-------------------------------------------------------
361 | plot(temperature, y, pch=20, cex = 0.3)
362 | lines(temperature, y, type="S")
363 | 
364 | ## ----B05113_08_017-------------------------------------------------------
365 | plot(temperature, y, type="S")
366 | m <- mean(temperature)
367 | s <- sd(temperature)
368 | yn <- pnorm(temperature, mean = m, sd = s)
369 | lines(temperature, yn, col=2)
370 | 
371 | ## ------------------------------------------------------------------------
372 | z <- round(sort(rnorm(n, mean = m, sd = s)), 1)
373 | 
374 | ## ----B05113_08_018-------------------------------------------------------
375 | set.seed(123)
376 | plot(temperature, y, type="S")
377 | for(k in 1:100){
378 |     z <- rnorm(n, mean=m, sd=s)
379 |     lines(sort(z), y, type="S", col="green")
380 | }
381 | 
382 | ## ------------------------------------------------------------------------
383 | Z <- NULL
384 | for(k in 1:1000){
385 |     z = rnorm(n, mean = m, sd = s)
386 |     Z = cbind(Z, sort(z))
387 | }
388 | dim(Z)
389 | 
390 | ## ------------------------------------------------------------------------
391 | ## mean of original temperature data
392 | m
393 | ## simulated mean
394 | (mean(Z[65, ]) + mean(Z[66, ])) / 2
395 | ## simulated median
396 | (median(Z[65, ]) + median(Z[66, ])) / 2
397 | 
398 | ## ------------------------------------------------------------------------
399 | plot(temperature, y, type="S")
400 | middle <- apply(Z, 1, median)
401 | lines(middle, y, col = "blue", lwd = 2, type = "S")
402 | ## lower and upper bounds
403 | lower <- apply(Z, 1, quantile, prob = 0.025)
404 | upper <- apply(Z, 1, quantile, prob = 0.975)
405 | lines(lower, y, col = 2, lwd = 2, type = "S")
406 | lines(upper, y, col = 2, lwd = 2, type = "S")
407 | 
408 | ## ----B05113_08_020-------------------------------------------------------
409 | par(mfrow = c(2,1), mar = rep(1,4))
410 | plot(temperature, y, type="S")
411 | lines(temperature, yn, col=2)
412 | lines(lower, y, col=2, lwd=2, type="S")
413 | lines(upper, y, col=2, lwd=2, type="S")
414 | plot(temperature, y - yn, type="h")
415 | abline(h = 0)
416 | ## maximum deviation
417 | D <- max(abs(y - yn))
418 | w <- which.max(abs(y - yn))
419 | points(temperature[w], y[w] - yn[w], col=2, pch=16, cex=3)
420 | 
421 | ## ------------------------------------------------------------------------
422 | ## theoretical distribution
423 | Z1 <- pnorm(Z, mean = m, sd = s)
424 | ## y will be recycled column-wise,
425 | ## extract the maximum for each column
426 | D1 <- apply(abs(y - Z1), 2, max)
427 | 
428 | ## ------------------------------------------------------------------------
429 | summary(D1)
430 | D
431 | 
432 | ## ------------------------------------------------------------------------
433 | mean(D1>D)
434 | 
435 | ## ---- message=FALSE, warning=FALSE---------------------------------------
436 | ks.test(temperature, "pnorm", mean = m, sd = s)
437 | 
438 | ## ------------------------------------------------------------------------
439 | data(Duncan, package = "car")
440 | x <- subset(Duncan, type %in% c("bc", "wc"), select = c("income", "type"))
441 | x$type <- factor(x$type)
442 | ## first four observations on income and type
443 | head(x, 4)
444 | 
445 | ## ------------------------------------------------------------------------
446 | t.test(income ~ type, data=x)
447 | 
448 | ## ------------------------------------------------------------------------
449 | ## first 6 observations with permuted grouping structure
450 | head(cbind(x, "p1" = sample(x$type), 
451 |               "p2" = sample(x$type), 
452 |               "p3" = sample(x$type)))
453 | 
454 | ## ------------------------------------------------------------------------
455 | ## define test statistics (workhorse)
456 | teststat <- function(vals, group, lev){
457 |   g <- sample(group)
458 |   abs(mean(vals[g == lev[1]]) - mean(vals[g == lev[2]]))
459 | }
460 | ## permutation test
461 | permtest <- function(x, g, R = 1000, conf.level = 0.95){
462 |     ## levels of the group vector
463 |     lg <- levels(g)
464 |     ## test statistics for original groups
465 |     mdiff <- abs(mean(x[g==lg[1]]) - mean(x[g==lg[2]]))
466 |     ## test statistics for permuted group data
467 |     z <- replicate(R, teststat(x, g, lg))
468 |     ## make nice print output
469 |     DATA <- paste(deparse(substitute(x)),
470 |                   "by",
471 |                   deparse(substitute(g)))
472 |     alpha <- 1 - conf.level
473 |     conf.int <- quantile(z, prob = c(alpha/2, (1 - alpha)/2))
474 |     attr(conf.int, "conf.level") <- conf.level
475 |     res <- list(statistic=c(mdiff = mdiff),
476 |                    p.value = mean(abs(z) > abs(mdiff)),
477 |                    parameter = c(nrep = R),
478 |                    conf.int = conf.int,
479 |                    data.name = DATA,
480 |                    method = "Permutation test for difference in means")
481 |     class(res) <- "htest"
482 |     res
483 | }
484 | 
485 | ## ------------------------------------------------------------------------
486 | permtest(x$income, x$type, R = 10000)
487 | 
488 | ## ------------------------------------------------------------------------
489 | data(Duncan, package = "car")
490 | pairwise.t.test(Duncan$income, Duncan$type)
491 | 
492 | ## ---- message=FALSE, warning=FALSE---------------------------------------
493 | mean(Duncan$income)
494 | library("dplyr")
495 | Duncan %>% group_by(type) %>% summarize(mean = mean(income))
496 | 
497 | ## ------------------------------------------------------------------------
498 | tstat <- function(x, mu=0){
499 |     (mean(x)-mu) / (sd(x) / sqrt(length(x)))
500 | }
501 | stats <- tapply(Duncan$income, Duncan$type, tstat, mu=mean(Duncan$income))
502 | stat <- max(abs(stats))
503 | stat
504 | 
505 | ## ------------------------------------------------------------------------
506 | maxt.test <- function(x, g, R = 10000, conf.level = 0.05){
507 |     m <- mean(x)
508 |     stat <- tapply(x, g, tstat, mu = m)
509 |     stat <- max(abs(stat))
510 |     gsize = table(g)
511 |     z <- NULL
512 |     for(k in 1:length(gsize)){
513 |         ## from a t-distribution:
514 |         z <- cbind(z, rt(n=n, df=gsize[k]-1))
515 |     }
516 |     ## z now is a list with length(gsize) elements
517 |     ## we need the maximum absolute value for each element
518 |     z <- abs(z)
519 |     z <- z[cbind(1:n,max.col(z))]
520 |     ## make nice print output
521 |     DATA <- paste(deparse(substitute(x)),
522 |                   "by",
523 |                   deparse(substitute(g)))
524 |     alpha <- 1 - conf.level
525 |     conf.int <- quantile(z, prob = c(alpha/2, (1 - alpha)/2))
526 |     attr(conf.int, "conf.level") <- conf.level
527 |     res <- list(statistic=c(stat = stat),
528 |                    p.value =  mean(z > stat),
529 |                    parameter = c(nrep = R),
530 |                    conf.int = conf.int,
531 |                    data.name = DATA,
532 |                    method = "Maximum t-test")
533 |     class(res) <- "htest"
534 |     res
535 | }
536 | 
537 | ## ------------------------------------------------------------------------
538 | maxt.test(Duncan$income, Duncan$type)
539 | 
540 | ## ------------------------------------------------------------------------
541 | maxp.test <- function(x, g, R = 10000, conf.level = 0.05){
542 |     m <- mean(x)
543 |     stat <- tapply(x, g, tstat, mu=m)
544 |     stat <- max(abs(stat))
545 |     z <- numeric(n)
546 |     for(k in 1:n){
547 |         g1 <- sample(g)
548 |         z[k] <- max(abs(tapply(x, g1, tstat, mu = m)))
549 |     }
550 |     
551 |     retval <- list(tstat=stat, pval=mean(z>stat),
552 |                    name="Permutation maximum t-test")
553 |     class(retval) <- "ttest"
554 |     retval
555 |     ## make nice print output
556 |     DATA <- paste(deparse(substitute(x)),
557 |                   "by",
558 |                   deparse(substitute(g)))
559 |     alpha <- 1 - conf.level
560 |     conf.int <- quantile(z, prob = c(alpha/2, (1 - alpha)/2))
561 |     attr(conf.int, "conf.level") <- conf.level
562 |     res <- list(statistic=c(stat = stat),
563 |                    p.value =  mean(z > stat),
564 |                    parameter = c(nrep = R),
565 |                    conf.int = conf.int,
566 |                    data.name = DATA,
567 |                    method = "Permutation maximum test")
568 |     class(res) <- "htest"
569 |     res
570 | }
571 | 
572 | ## ------------------------------------------------------------------------
573 | maxp.test(Duncan$income, Duncan$type)
574 | 
575 | ## ------------------------------------------------------------------------
576 | boottest <- function(x, g, n=10000){
577 |     lg <- levels(g)
578 |     n1 <- length(x[g == lg[1]])
579 |     N <- length(x)
580 |     mdiff <- abs(mean(x[g == lg[1]]) - mean(x[g == lg[2]]))
581 |     z <- double(n)
582 |     for(k in 1:n){
583 |         x1 <- sample(x, replace=TRUE)
584 |         z[k] <- abs(mean(x1[1:n1]) - mean(x1[(n1+1):N]))
585 |     }
586 |     mean( z > mdiff )
587 | }
588 | 
589 | ## ------------------------------------------------------------------------
590 | Duncan$type <- factor(Duncan$type)
591 | boottest(Duncan$income, Duncan$type)
592 | 
593 | ## ------------------------------------------------------------------------
594 | mvad.test <- function(x, R=1000){ 
595 | 	n <- nrow(x)
596 | 	## test statistics
597 |   stat <- function(x, N = n){
598 |     cmean <- colMeans(x)
599 | 	  cvar  <- var(x)
600 |     u <- mahalanobis(x, center = cmean, cov = cvar)
601 |     z <- pchisq(u, ncol(x))
602 |     p <- sort(z)
603 |     h <- (2 * seq(1:N) - 1) * (log(p) + log(1 - rev(p)))
604 |     A <- -N - mean(h)
605 | 		return(A)
606 |   }
607 |   ## value of test statistics for original sample  
608 |   A <- stat(x)
609 | 	cmean <- colMeans(x)
610 |   cvar <- var(x)
611 |   p <- numeric(R)
612 |   ## values of test statistics for draws of mvn
613 |   p <- replicate(R, stat(mvrnorm(n, cmean, cvar)))
614 |   pvalue <- mean(p > A)
615 | 	RVAL <- list(statistic = c(A = A),
616 | 	             method = "A-D radius test",                                  
617 | 			         p.value = pvalue)                                      
618 | 	class(RVAL) <- "htest" 
619 | 	RVAL
620 | }
621 | 
622 | ## ---- eval=FALSE---------------------------------------------------------
623 | ## library("MASS")
624 | ## set.seed(123)
625 | ## r <- replicate(1000, mvad.test(mvrnorm(100, mu=rep(0,3),
626 | ##              Sigma=diag(3)))$p.value)
627 | ## size <- mean(r < 0.05)
628 | 
629 | ## ---- echo=FALSE---------------------------------------------------------
630 | size = 0.05
631 | 
632 | ## ------------------------------------------------------------------------
633 | size
634 | 
635 | ## ---- cache = TRUE-------------------------------------------------------
636 | library("mvtnorm")
637 | library("ICS")
638 | ## Monte Carlo AD test 100 times replicted
639 | r <- replicate(100, mvad.test(rmvt(30, diag(3), df = 5), R=100)$p.value) 
640 | mean(r  < 0.05) 
641 | ## Skewness test 1000 times replicted
642 | r2 <- replicate(1000, mvnorm.skew.test(rmvt(30, diag(3), df = 5))$p.value)
643 | mean(r2  < 0.05) 
644 | 
645 | ## ------------------------------------------------------------------------
646 | sessionInfo()
647 | 
648 | 


--------------------------------------------------------------------------------
/Chapter 2/chapter2.R:
--------------------------------------------------------------------------------
  1 | ## ---- echo=FALSE, message=FALSE, warning=FALSE---------------------------
  2 | library(ggplot2)
  3 | 
  4 | ## ------------------------------------------------------------------------
  5 | 5 + 2 * log(3 * 3)
  6 | 
  7 | ## ------------------------------------------------------------------------
  8 | mean(rnorm(10))
  9 | 
 10 | ## ---- eval=FALSE---------------------------------------------------------
 11 | ## res1 <- name_of_function(v1) # an input argument
 12 | ## res2 <- name_of_function(v1, v2) # two input arguments
 13 | ## res3 <- name_of_function(v1, v2, v3) # three input arguments
 14 | 
 15 | ## ------------------------------------------------------------------------
 16 | x <- rnorm(5)
 17 | x
 18 | 
 19 | ## ------------------------------------------------------------------------
 20 | options(digits = 4)
 21 | x
 22 | 
 23 | ## ---- eval=FALSE---------------------------------------------------------
 24 | ## install.packages("dplyr")
 25 | 
 26 | ## ---- eval=FALSE---------------------------------------------------------
 27 | ## library("dplyr")
 28 | 
 29 | ## ---- eval=FALSE---------------------------------------------------------
 30 | ## install.packages("devtools")
 31 | ## library("devtools")
 32 | ## install_github("hadley/dplyr")
 33 | 
 34 | ## ---- eval=FALSE---------------------------------------------------------
 35 | ## help.start()
 36 | 
 37 | ## ---- eval=FALSE---------------------------------------------------------
 38 | ## help(package="dplyr")
 39 | 
 40 | ## ---- eval=FALSE---------------------------------------------------------
 41 | ## ?group_by
 42 | 
 43 | ## ---- eval=FALSE---------------------------------------------------------
 44 | ## data(Cars93, package = "MASS")
 45 | 
 46 | ## ---- eval=FALSE---------------------------------------------------------
 47 | ## help.search("histogram")
 48 | 
 49 | ## ---- eval=FALSE, echo=TRUE----------------------------------------------
 50 | ## apropos("hist")
 51 | 
 52 | ## ---- eval=FALSE---------------------------------------------------------
 53 | ## RSiteSearch("group by factor")
 54 | 
 55 | ## ------------------------------------------------------------------------
 56 | ls()
 57 | 
 58 | ## ------------------------------------------------------------------------
 59 | getwd()
 60 | 
 61 | ## ------------------------------------------------------------------------
 62 | v.num <- c(1,3,5.9,7)
 63 | v.num
 64 | is.numeric (v.num)
 65 | 
 66 | ## ------------------------------------------------------------------------
 67 | v.num > 3
 68 | 
 69 | ## ---- warning=FALSE, message=FALSE---------------------------------------
 70 | v1 <- c(1,2,3)
 71 | v2 <- c(4,5)
 72 | v1 + v2 
 73 | 
 74 | ## ------------------------------------------------------------------------
 75 | v2 <- c (100, TRUE, "A", FALSE)
 76 | v2
 77 | is.numeric (v2)
 78 | 
 79 | ## ---- eval=TRUE----------------------------------------------------------
 80 | data(Cars93, package = "MASS")
 81 | # extract a subset of variable Horsepower from Cars93
 82 | hp <- Cars93[1:10, "Horsepower"] 
 83 | hp
 84 | # positive indexing:
 85 | hp[c(1,6)]
 86 | # negative indexing:
 87 | hp[-c(2:5,7:10)]
 88 | # logical indexing:
 89 | hp < 150
 90 | # a logical expression can be written directly in []
 91 | hp[hp < 150]
 92 | 
 93 | ## ------------------------------------------------------------------------
 94 | class(Cars93)
 95 | class(Cars93$Cylinders)
 96 | levels(Cars93$Cylinders)
 97 | summary(Cars93$Cylinders)
 98 | 
 99 | ## ------------------------------------------------------------------------
100 | model <- lm(Price ~ Cylinders + Type + EngineSize + Origin, data = Cars93)
101 | ## result is a list
102 | class(model)
103 | ## access elements from the named list with the dollar sign
104 | model$coefficients
105 | 
106 | ## ---- tidy=FALSE---------------------------------------------------------
107 | ## extract cars with small number of cylinders and small power
108 | w <- Cars93$Cylinders %in% c("3", "4")  & Cars93$Horsepower < 80
109 | str(Cars93[w, ])
110 | 
111 | ## ------------------------------------------------------------------------
112 | library("vcd")
113 | data(PreSex)
114 | PreSex
115 | 
116 | ## ------------------------------------------------------------------------
117 | PreSex[, , 1, 2]
118 | 
119 | ## ------------------------------------------------------------------------
120 | PreSex[, , "Yes", "Men"]
121 | 
122 | ## ------------------------------------------------------------------------
123 | sum(is.na(Cars93))
124 | 
125 | ## ----B05113_02_01, message = FALSE, warning=FALSE------------------------
126 | require("VIM")
127 | # colnames(Cars93) <- substr(colnames(Cars93), 1, 8)
128 | par(mar=c(7,3,0.1,0.1))
129 | matrixplot(Cars93, sortby = "Weight", cex.axis=0.9)
130 | 
131 | ## ---- eval=FALSE, message=FALSE, warning=FALSE---------------------------
132 | ## m <- robCompositions::missPatterns(Cars93)
133 | 
134 | ## ------------------------------------------------------------------------
135 | ## how often summary is overloaded with methods 
136 | ## on summary for certain classes
137 | length(methods(summary))
138 | class(Cars93$Cylinders)
139 | summary(Cars93$Cylinders)
140 | ## just to see the difference, convert to class character:
141 | summary(as.character(Cars93$Cylinders))
142 | 
143 | ## ------------------------------------------------------------------------
144 | ## function to be applied afterwards
145 | func <- function(x){
146 |   return(sum(is.na(x)))
147 | }
148 | ## apply func on all columns (second dimension (2))
149 | ## and store it to na
150 | na <- apply(X = Cars93, MARGIN = 2, FUN = func)
151 | ## show those with missings
152 | na[ na > 0 ]
153 | 
154 | ## ------------------------------------------------------------------------
155 | p <- ncol(Cars93)
156 | na_for <- numeric(p)
157 | for(i in 1:p){
158 |   na_for[i] <- func(Cars93[, i])
159 | }
160 | 
161 | identical(as.numeric(na), na_for)
162 | 
163 | ## ------------------------------------------------------------------------
164 | m <- robCompositions::missPatterns(Cars93)
165 | class(m)
166 | 
167 | ## ------------------------------------------------------------------------
168 | lapply(m, length)
169 | 
170 | ## ---- tidy=TRUE----------------------------------------------------------
171 | s <- sapply(m, length)
172 | 
173 | ## ------------------------------------------------------------------------
174 | s
175 | class(s)
176 | 
177 | ## ------------------------------------------------------------------------
178 | args(aggregate)
179 | 
180 | ## ------------------------------------------------------------------------
181 | methods(aggregate)
182 | 
183 | args(aggregate.data.frame)
184 | 
185 | ## ------------------------------------------------------------------------
186 | aggregate(Cars93[, c("Horsepower", "Weight")], by = list(Cars93$Cylinders), median)
187 | 
188 | ## ---- eval = TRUE, echo = TRUE, message=FALSE, warning=FALSE-------------
189 | library("dplyr") 
190 | 
191 | ## ---- eval = TRUE, echo = TRUE-------------------------------------------
192 | class (Cars93)
193 | 
194 | ## ---- eval = TRUE, echo = TRUE-------------------------------------------
195 | Cars93 <- tbl_df(Cars93)
196 | class(Cars93) 
197 | 
198 | ## ---- eval = FALSE, echo = TRUE, results='hide'--------------------------
199 | ## print(Cars93) # output suppressed
200 | 
201 | ## ---- eval = TRUE, echo = TRUE, results='hide'---------------------------
202 | slice(Cars93, 1) # first line, output suppressed
203 | 
204 | ## ---- eval = TRUE, echo = TRUE, tidy=FALSE-------------------------------
205 | slice (Cars93, c(1,4,10,15, n())) 
206 | 
207 | ## ---- eval = TRUE, echo = TRUE, tidy=FALSE-------------------------------
208 | filter(Cars93, Manufacturer == "Audi" & Min.Price > 25)
209 | 
210 | ## ---- eval = TRUE, echo = TRUE, results='hide'---------------------------
211 | Cars93 <- arrange (Cars93, Price)
212 | Cars93 ## output suppressed
213 | 
214 | ## ---- eval = TRUE, echo = TRUE, tidy=FALSE-------------------------------
215 | head(arrange(Cars93, desc (MPG.city), Max.Price), 7)
216 | 
217 | ## ---- eval = TRUE, echo = TRUE-------------------------------------------
218 | head (select (Cars93, Manufacturer, Price), 3)
219 | 
220 | ## ---- eval = TRUE, echo = TRUE-------------------------------------------
221 | head(select(Cars93, Manufacturer:Price), 3)
222 | 
223 | ## ---- eval = TRUE, echo = TRUE, results='hide'---------------------------
224 | select(Cars93, -Min.Price, -Max.Price) # output suppressed
225 | 
226 | ## ---- eval = TRUE, echo = TRUE-------------------------------------------
227 | head(select(Cars93, starts_with ("Man")), 3)
228 | 
229 | ## ---- eval = TRUE, echo = TRUE-------------------------------------------
230 | head(select(Cars93, contains ("Price")), 3)
231 | 
232 | ## ---- eval = TRUE, echo = TRUE-------------------------------------------
233 | head (select (Cars93, myPrize = Price, Min.Price))
234 | 
235 | ## ---- eval = TRUE, echo = TRUE-------------------------------------------
236 | Cars93_1 <- select(Cars93, Manufacturer, EngineSize)
237 | dim (Cars93_1)
238 | Cars93_1 <- distinct(Cars93_1)
239 | dim (Cars93_1)
240 | 
241 | ## ---- eval = TRUE, echo = TRUE-------------------------------------------
242 | dim(Cars93)
243 | dim( distinct (Cars93, Manufacturer) )
244 | # based on two variables, second is rounded:
245 | dim(distinct(Cars93, Manufacturer, EngineSize))
246 | # based on two variables, second is rounded:
247 | dim(distinct(Cars93, Manufacturer, rr=round(EngineSize)))
248 | 
249 | ## ---- eval = TRUE, echo = TRUE-------------------------------------------
250 | m <- mutate(Cars93, is_ford = Manufacturer == "Ford")
251 | m[1:3, c(1,28)]
252 | 
253 | ## ---- eval = TRUE, echo = TRUE, results='hide'---------------------------
254 | transmute(Cars93, is_ford = Manufacturer == "Ford", Manufacturer)
255 | 
256 | ## ---- eval = TRUE, echo = TRUE-------------------------------------------
257 | head (transmute(Cars93, Manufacturer, is_ford = Manufacturer == "Ford", num_ford = ifelse (is_ford, -1, 1)), 3)
258 | 
259 | ## ---- eval = TRUE, echo = TRUE-------------------------------------------
260 | by_type <- group_by (Cars93, Type)
261 | summarize (by_type,
262 |  count = n(),
263 |  min_es = min(EngineSize),
264 |  max_es = max(EngineSize)
265 | )
266 | 
267 | ## ---- results='hide'-----------------------------------------------------
268 | Cars93 %>% 
269 |   group_by(Type) %>% 
270 |   summarize(count = n(), min_es = min(EngineSize), max_es = max(EngineSize) )
271 | ## output suppressed since equally to previous output
272 | 
273 | ## ---- eval = TRUE, echo = TRUE, tidy=FALSE-------------------------------
274 | by_type <- group_by(Cars93, Type)
275 | slice(by_type, 1: 2)
276 | 
277 | ## ---- echo = TRUE, results='hide'----------------------------------------
278 | ## output suppressed since the same as above
279 | Cars93 %>% group_by(Type) %>% slice(1:2)
280 | 
281 | ## ---- eval = TRUE, echo = TRUE-------------------------------------------
282 | Cars93 %>% 
283 |   mutate(ES2 = EngineSize^2) %>% 
284 |   group_by(Type) %>%
285 |   summarize(min.ES2 = min(ES2)) %>% 
286 |   arrange(desc(min.ES2))
287 | 
288 | ## ---- eval = TRUE, echo = TRUE, tidy=FALSE-------------------------------
289 | Cars93 %>% 
290 |   group_by(Type) %>% 
291 |   arrange(Type) %>% 
292 |   select(Manufacturer:Price) %>% 
293 |   mutate(cmean = cummean(Price), csum = cumsum(Price))
294 | 
295 | ## ----eval=TRUE, echo=TRUE, results='hide'--------------------------------
296 | require(data.table)
297 | Cars93 <- data.table(Cars93)
298 | Cars93 ## print output suppressed
299 | 
300 | ## ----eval=TRUE, echo=TRUE------------------------------------------------
301 | tables()
302 | 
303 | ## ----eval=TRUE, echo=TRUE------------------------------------------------
304 | Cars93$tmp1 <- Cars93[, j = Manufacturer == "Ford"] 
305 | 
306 | ## ----eval=TRUE, echo=TRUE------------------------------------------------
307 | Cars93[, tmp2 := rnorm(nrow(Cars93))] 
308 | 
309 | ## ----eval=TRUE, echo=TRUE------------------------------------------------
310 | Cars93[, tmp1:=NULL]
311 | Cars93$tmp2 <- NULL
312 | 
313 | ## ----eval=TRUE, echo=TRUE, results='hide'--------------------------------
314 | Cars93[i = 2] # second row, all columns 
315 | Cars93[i = c(1,5)] # first and fifth row, all columns
316 | Cars93[i = -c(1:5)] # exclude the first five rows
317 | 
318 | ## ----eval=TRUE, echo=TRUE, tidy=FALSE------------------------------------
319 | Cars93[j = 3] # this does not work since 3 evaluates to 3
320 | Cars93[j = "Price"] # extract "Price" does not work since "Price" evaluates to "Price"
321 | Cars93[j = Price] # this works, since variable Price exists in the scope of Cars93 
322 | Cars93[i=1:3, j = "Price", with = FALSE] # also works
323 | 
324 | ## ----eval=TRUE, echo=TRUE------------------------------------------------
325 | Cars93[1:3, .(Price, Horsepower, Diff.Price = Max.Price - Min.Price, Mean.Price = mean(Price))]  
326 | 
327 | ## ----eval=TRUE, echo=TRUE------------------------------------------------
328 | setkey(Cars93, Type) # equally: setkeyv(dt, "x")
329 | 
330 | ## ----eval=TRUE, echo=TRUE------------------------------------------------
331 | key(Cars93)
332 | 
333 | ## ----eval=TRUE, echo=TRUE, results='hide'--------------------------------
334 | setkey(Cars93, Type) 
335 | Cars93["Van"] # all rows with Type == "Van" (output suppressed)
336 | 
337 | ## ----eval=TRUE, echo=TRUE------------------------------------------------
338 | setkey(Cars93, Type, DriveTrain, Origin)
339 | Cars93[.("Van", "4WD", "non-USA")]
340 | 
341 | ## ----eval=TRUE, echo=TRUE, cache=TRUE------------------------------------
342 | require(microbenchmark)
343 | N <- 1000000
344 | dat<- data.table(
345 |   x=sample(LETTERS[1:20], N, replace=TRUE),
346 |   y=sample(letters[1:5], N, replace=TRUE))
347 | head(dat, 3)
348 | 
349 | setkey(dat, x,y)
350 | 
351 | microbenchmark(
352 |   data.table = dat[list(c("B", "D"), c("b", "d"))],
353 |   dplyr = dat %>% slice(x %in% c("B", "D") & y %in% c("b", "d")),
354 |   baseR = dat[x %in% c("B", "D") & y %in% c("b", "d")]
355 | )
356 | 
357 | ## ----eval=TRUE, echo=TRUE, cache=TRUE------------------------------------
358 | Cars93[, .(mean = mean(Price), IQR = IQR(Price), median = median(Price)), by = Type]
359 | 
360 | ## ------------------------------------------------------------------------
361 | data(Cars93, package = "MASS")
362 | set.seed(123)
363 | system.time(lm(Price ~ Horsepower + Weight + Type + Origin, data=Cars93))
364 | 
365 | ## ------------------------------------------------------------------------
366 | library("robustbase")
367 | system.time(lmrob(Price ~ Horsepower + Weight + Type + Origin, data=Cars93))
368 | 
369 | ## ---- tidy=FALSE---------------------------------------------------------
370 | ptm <- proc.time()
371 | robustbase::lmrob(Price ~ Horsepower + Weight + Type + Origin, data=Cars93)
372 | proc.time() - ptm	
373 | 
374 | ## ---- cache=TRUE---------------------------------------------------------
375 | require("robustbase")
376 | s1 <- system.time(replicate(100, lm(Price ~ Horsepower + Weight + Type + Origin, data=Cars93)))[3]
377 | s2 <- system.time(replicate(100, lmrob(Price ~ Horsepower + Weight + Type + Origin, data=Cars93)))[3]
378 | (s2 - s1)/s1
379 | 
380 | ## ------------------------------------------------------------------------
381 | Rprof("Prestige.lm.out")
382 | invisible(replicate(100,
383 | 				lm(Price ~ Horsepower + Weight + Type + Origin, data=Cars93)))
384 | Rprof(NULL)
385 | summaryRprof("Prestige.lm.out")$by.self
386 | 
387 | ## ---- results='hide', warning=FALSE, message=FALSE-----------------------
388 | require(profr)
389 | parse_rprof("Prestige.lm.out") 
390 | 
391 | ## ---- warning=FALSE, message=FALSE---------------------------------------
392 | library(microbenchmark); library(plyr); library(dplyr); library(data.table); library(Hmisc)
393 | 
394 | ## ---- tidy = FALSE-------------------------------------------------------
395 | data(Cars93, package = "MASS")
396 | Cars93 %>% group_by(Type, Origin) %>% summarise(mean = mean(Horsepower))
397 | 
398 | ## ------------------------------------------------------------------------
399 | meanFor <- function(x){
400 |   sum <- 0
401 |   for(i in 1:length(x)) sum <- sum + x[i]
402 |   sum / length(x)
403 | }
404 | 
405 | ## groupwise statistics
406 | myfun1 <- function(x, gr1, gr2, num){
407 |   x[,gr1] <- as.factor(x[,gr1])
408 |   x[,gr2] <- as.factor(x[,gr2])
409 |   l1 <- length(levels(x[,gr1]))
410 |   l2 <- length(levels(x[,gr1]))
411 |   gr <-  numeric(l1*l2)
412 |   c1 <- c2 <- character(l1*l2)
413 |   ii <- jj <- 0
414 |   for(i in levels(x[,gr1])){
415 |     for(j in levels(x[,gr2])){
416 |       ii <- ii + 1
417 |       c1[ii] <- i
418 |       c2[ii] <- j
419 |       vec <- x[x[,gr2] == j & x[,gr1] == i, num]
420 |       if(length(vec) > 0) gr[ii] <- meanFor(vec)
421 |     }
422 |   }
423 |   
424 |   df <- data.frame(cbind(c1, c2))
425 |   df <- cbind(df, gr)
426 |   colnames(df) <- c(gr1,gr2,paste("mean(", num, ")"))
427 |   df
428 | }
429 | 
430 | ## groupwise using mean()
431 | ## attention mean.default is faster
432 | myfun2 <- function(x, gr1, gr2, num){
433 |   x[,gr1] <- as.factor(x[,gr1])
434 |   x[,gr2] <- as.factor(x[,gr2])
435 |   l1 <- length(levels(x[,gr1]))
436 |   l2 <- length(levels(x[,gr1]))
437 |   gr <-  numeric(l1*l2)
438 |   c1 <- c2 <- character(l1*l2)
439 |   ii <- jj <- 0
440 |   for(i in levels(x[,gr1])){
441 |     for(j in levels(x[,gr2])){
442 |       ii <- ii + 1
443 |       c1[ii] <- i
444 |       c2[ii] <- j
445 |       gr[ii] <- mean(x[x[,gr2] == j & x[,gr1] == i, num])
446 |     }
447 |   }
448 |   
449 |   df <- data.frame(cbind(c1, c2))
450 |   df <- cbind(df, gr)
451 |   colnames(df) <- c(gr1,gr2,paste("mean(", num, ")"))
452 |   df
453 | }
454 | 
455 | ## ------------------------------------------------------------------------
456 | Cars93dt <- data.table(Cars93)
457 | 
458 | ## ---- tidy=FALSE---------------------------------------------------------
459 | op <- microbenchmark(
460 |   ## pure for loops
461 |   MYFUN1 = myfun1(x=Cars93, gr1="Type", gr2="Origin", 
462 |                   num="Horsepower"),
463 |   ## pure for loops but using mean
464 |   MYFUN2 = myfun2(x=Cars93, gr1="Type", gr2="Origin", 
465 |                   num="Horsepower"),
466 |   ## plyr
467 |   PLYR = ddply(Cars93, .(Type, Origin), summarise, 
468 |              output = mean(Horsepower)),
469 |   ## base R's aggregate and by
470 |   AGGR = aggregate(Horsepower ~ Type + Origin, Cars93, mean),
471 |   BY = by(Cars93$Horsepower, 
472 |           list(Cars93$Type,Cars93$Origin), mean),
473 |   ## Hmisc's summarize
474 |   SUMMARIZE = summarize(Cars93$Horsepower, 
475 |                   llist(Cars93$Type,Cars93$Origin), mean),
476 |   ## base R's tapply
477 |   TAPPLY = tapply(Cars93$Horsepower, 
478 |               interaction(Cars93$Type, Cars93$Origin), mean),
479 |   ## dplyr
480 |   DPLYR = summarise(group_by(Cars93, Type, Origin), 
481 |                     mean(Horsepower)),
482 |   ## data.table
483 |   DATATABLE = Cars93dt[, aggGroup1.2 := mean(Horsepower), 
484 |                        by = list(Type, Origin)], 
485 |   times=1000L)
486 | 
487 | ## ----B05113_02_02--------------------------------------------------------
488 | m <- reshape2::melt(op, id="expr")
489 | ggplot(m, aes(x=expr, y=value)) + 
490 |   geom_boxplot() + 
491 |   coord_trans(y = "log10") + 
492 |   xlab(NULL) + ylab("computation time") +  
493 |   theme(axis.text.x = element_text(angle=45))
494 | 
495 | ## ---- cache = TRUE, warning=FALSE, message=FALSE-------------------------
496 | library(laeken); data(eusilc)
497 | eusilc <- do.call(rbind, 
498 |     list(eusilc,eusilc,eusilc,eusilc,eusilc,eusilc,eusilc))
499 | eusilc <- do.call(rbind,
500 |     list(eusilc,eusilc,eusilc,eusilc,eusilc,eusilc,eusilc))
501 | dim(eusilc)
502 | eusilcdt <- data.table(eusilc)
503 | setkeyv(eusilcdt, c('hsize','db040'))
504 | 
505 | op <- microbenchmark(
506 |   MYFUN1 = myfun1(x=eusilc, gr1="hsize", gr2="db040", 
507 |                   num="eqIncome"),
508 |   MYFUN2 = myfun2(x=eusilc, gr1="hsize", gr2="db040", 
509 |                   num="eqIncome"),
510 |   PLYR = ddply(eusilc, .(hsize, db040), summarise, 
511 |                output = mean(eqIncome)),
512 |   AGGR = aggregate(eqIncome ~ hsize + db040, eusilc, mean),
513 |   BY = by(eusilc$eqIncome, list(eusilc$hsize,eusilc$db040), mean),
514 |   SUMMARIZE = summarize(eusilc$eqIncome, 
515 |                         llist(eusilc$hsize,eusilc$db040), mean),
516 |   TAPPLY = tapply(eusilc$eqIncome, 
517 |               interaction(eusilc$hsize, eusilc$db040), mean),
518 |   DPLYR = summarise(group_by(eusilc, hsize, db040), 
519 |                     mean(eqIncome)),
520 |   DATATABLE = eusilcdt[, mean(eqIncome), by = .(hsize, db040)],
521 |   times=10)
522 | 
523 | ## ----B05113_02_03--------------------------------------------------------
524 | m <- reshape2::melt(op, id="expr")
525 | ggplot(m, aes(x=expr, y=value)) + 
526 |   geom_boxplot() + 
527 |   coord_trans(y = "log10") + 
528 |   xlab(NULL) + ylab("computation time") +  
529 |   theme(axis.text.x = element_text(angle=45, vjust=1))
530 | 
531 | ## ------------------------------------------------------------------------
532 | Rprof("aggr.out")
533 | a <- aggregate(eqIncome ~ hsize + db040, eusilc, mean)
534 | Rprof(NULL)
535 | summaryRprof("aggr.out")$by.self
536 | 
537 | ## ---- cache = TRUE, message=FALSE, warning=FALSE-------------------------
538 | R <- 10000
539 | library(robustbase)
540 | covMcd(Cars93[, c("Price", "Horsepower")], cor = TRUE)$cor[1,2]
541 | 
542 | ## confidence interval:
543 | n <- nrow(Cars93)
544 | f <- function(R, ...){
545 |   replicate(R, covMcd(Cars93[sample(1:n, replace=TRUE), 
546 |       c("Price", "Horsepower")], cor = TRUE)$cor[1,2])
547 | }
548 | system.time(ci <- f(R))
549 | quantile(ci, c(0.025, 0.975))
550 | 
551 | ## ------------------------------------------------------------------------
552 | library("snow")
553 | cl <- makeCluster(3, type="SOCK")
554 | 
555 | ## ---- results='hide'-----------------------------------------------------
556 | clusterEvalQ(cl, library("robustbase"))
557 | clusterEvalQ(cl, data(Cars93, package = "MASS"))
558 | clusterExport(cl, "f")
559 | clusterExport(cl, "n")
560 | 
561 | ## ------------------------------------------------------------------------
562 | clusterSetupRNG(cl, seed=123)
563 | 
564 | ## ------------------------------------------------------------------------
565 | system.time(ci_boot <-
566 | 				clusterCall(cl, f, R = round(R / 3)))
567 | quantile(unlist(ci_boot), c(0.025, 0.975))
568 | 
569 | ## ------------------------------------------------------------------------
570 | stopCluster(cl)
571 | 
572 | ## ------------------------------------------------------------------------
573 | wmeanR <- function(x, w) {
574 |   total <- 0
575 |   total_w <- 0
576 |   for (i in seq_along(x)) {
577 |     total <- total + x[i] * w[i]
578 |     total_w <- total_w + w[i]
579 |   }
580 |   total / total_w
581 | }
582 | 
583 | ## ------------------------------------------------------------------------
584 | library("Rcpp")
585 | ## from 
586 | ## http://blog.revolutionanalytics.com/2013/07/deepen-your-r-experience-with-rcpp.html 
587 | cppFunction('
588 |   double wmean(NumericVector x, NumericVector w) {
589 |   int n = x.size();
590 |   double total = 0, total_w = 0;
591 |   for(int i = 0; i < n; ++i) {
592 |     total += x[i] * w[i];
593 |     total_w += w[i];
594 |   }
595 |   return total / total_w;
596 |   }
597 | ')
598 | 
599 | ## ---- cache = TRUE-------------------------------------------------------
600 | x <- rnorm(100000000)
601 | w <- rnorm(100000000)
602 | library("laeken")
603 | op <- microbenchmark(
604 |   naiveR = wmeanR(x, w),
605 |   weighted.mean = weighted.mean(x, w),
606 |   weighedMean = weightedMean(x, w),
607 |   Rcpp.wmean = wmean(x, w), 
608 |   times = 1
609 | )
610 | op
611 | 
612 | ## ----B05113_02_04--------------------------------------------------------
613 | m <- reshape2::melt(op, id="expr")
614 | ggplot(m, aes(x=expr, y=value)) + 
615 |   geom_boxplot() + 
616 |   coord_trans(y = "log10") + 
617 |   xlab(NULL) + ylab("computation time") + 
618 |   theme(axis.text.x = element_text(angle=45, vjust=1))
619 | 
620 | ## ---- eval = FALSE-------------------------------------------------------
621 | ## pdf(file = "myplot.pdf")
622 | ## plot(Horsepower ~ Weight, data = Cars93)
623 | ## dev.off ()
624 | 
625 | ## ----addfig, fig.align='center'------------------------------------------
626 | x <- 1:20 / 2 # x ... 0.5, 1.0, 1.5, ..., 10.0
627 | y <- sin(x) 
628 | plot(x, y, pch = 16, cex = 10 * abs(y), col = grey(x / 14))
629 | 
630 | ## ----B05113_02_05, fig.align='center'------------------------------------
631 | plot(x, y, pch = 16, cex = 10 * abs(y), col = grey(x / 14)) 
632 | text(x, y, 1:20, col="yellow")
633 | curve(sin, -2 * pi, 4 * pi, add = TRUE, col = "red") 
634 | abline(h = 0, lty = 2, col = "grey")
635 | 
636 | ## ----B05113_02_06, fig.align='center'------------------------------------
637 | par(mfrow = c(2, 2), mar = c(3,3,0.1,0.1))  
638 | mpg <- mtcars$mpg
639 | cyl <- factor(mtcars$cyl)
640 | df <- data.frame(x1=cyl, x2=mpg)
641 | tmpg <- ts(mpg)
642 | plot(mpg); plot(cyl); plot(df); plot(tmpg)
643 | 
644 | ## ---- echo = TRUE, eval = TRUE, size='tiny'------------------------------
645 | tail(methods(plot)) ## last 6
646 | ## number of methods for plot
647 | length(methods(plot)) 
648 | 
649 | ## ---- eval = FALSE, fig.align='center'-----------------------------------
650 | ## plot(x=mtcars$mpg, y=mtcars$hp)
651 | ## plot(mtcars$mpg, mtcars$hp)
652 | ## plot(hp ~ mpg, data=mtcars)
653 | 
654 | ## ------------------------------------------------------------------------
655 | ## min und max in both axis
656 | xmin <- min(mtcars$mpg); xmax <- max(mtcars$mpg)
657 | ymin <- min(mtcars$hp); ymax <- max(mtcars$hp)
658 | 
659 | ## calculate histograms
660 | xhist <- hist(mtcars$mpg, breaks=15, plot=FALSE)
661 | yhist <- hist(mtcars$hp, breaks=15, plot=FALSE)
662 |  
663 | ## maximum count
664 | top <- max(c(xhist$counts, yhist$counts))
665 | xrange <-  c(xmin,xmax) 
666 | yrange <- c(ymin, ymax)
667 | 
668 | ## ----B05113_02_07, fig.align="center"------------------------------------
669 | m <- matrix(c(2, 0, 1, 3), 2, 2, byrow = TRUE)
670 | ## define plot order and size
671 | layout(m, c(3,1), c(1, 3), TRUE)
672 | ## first plot
673 | par(mar=c(3,0,1,1))
674 | plot(mtcars[,c("mpg","hp")], xlim=xrange, ylim=yrange, xlab="", ylab="") 
675 | ## second plot -- barchart of margin
676 | par(mar=c(0,0,1,1))
677 | barplot(xhist$counts, axes=FALSE, ylim=c(0, top), space=0)
678 | ## third plot -- barchart of other margin
679 | par(mar=c(3,0,1,1)) 
680 | barplot(yhist$counts, axes=FALSE, xlim=c(0, top), space=0, horiz=TRUE)
681 | 
682 | ## ----B05113_02_08, fig.align="center", fig.height=4.5, fig.width=6-------
683 | library("ggplot2")
684 | ggplot(Cars93, aes(x = Horsepower, y = MPG.city)) + geom_point(aes(colour = Cylinders))
685 | 
686 | ## ----B05113_02_09, fig.align="center", fig.height=4.5, fig.width=6-------
687 | g1 <- ggplot(Cars93, aes(x=Horsepower, y=MPG.city))
688 | g2 <- g1 + geom_point(aes(color=Weight)) 
689 | g2 + geom_smooth()  
690 | 
691 | ## ----B05113_02_10, fig.align="center", fig.height=4.5, fig.width=6-------
692 | g1 <- g1 + geom_text(aes(label=substr(Manufacturer,1,3)), 
693 |                      size=3.5)  
694 | g1 + geom_smooth()
695 | 
696 | ## ----image B05113_02_11--------------------------------------------------
697 | gg <- ggplot(Cars93, aes(x=Horsepower, y=MPG.city))
698 | gg <- gg + geom_point(aes(shape = Origin, colour = Price)) 
699 | gg <- gg + facet_wrap(~ Cylinders)  + theme_bw()
700 | gg
701 | 
702 | ## ------------------------------------------------------------------------
703 | sessionInfo()
704 | 
705 | 


--------------------------------------------------------------------------------