├── Image
└── Chrysanthemum.jpg
├── LICENSE
├── README.md
├── Chapter 11
└── chapter11.R
├── Chapter 3
└── chapter3.R
├── Chapter 5
└── chapter5.R
├── Chapter 9
└── chapter9.R
├── Chapter 6
└── chapter6.R
├── Chapter 10
└── chapter10.R
├── Chapter 7
└── chapter7.R
├── Chapter 4
└── chapter4.R
├── Chapter 8
└── chapter8.R
└── Chapter 2
└── chapter2.R
/Image/Chrysanthemum.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Simulation-for-Data-Science-with-R/HEAD/Image/Chrysanthemum.jpg
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2016 Packt Publishing
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | ## $5 Tech Unlocked 2021!
5 | [Buy and download this Book for only $5 on PacktPub.com](https://www.packtpub.com/product/simulation-for-data-science-with-r/9781785881169)
6 | -----
7 | *If you have read this book, please leave a review on [Amazon.com](https://www.amazon.com/gp/product/1785881167). Potential readers can then use your unbiased opinion to help them make purchase decisions. Thank you. The $5 campaign runs from __December 15th 2020__ to __January 13th 2021.__*
8 |
9 | # Simulation for Data Science with R
10 |
11 | ## Code Testing Specifications
12 |
13 | **Chapter 2: Required packages**
14 | * boot_1.3-17
15 | * colorspace_1.2-6
16 | * data.table_1.9.6
17 | * dplyr_0.4.3
18 | * Formula_1.2-1
19 | * ggplot2_2.1.0
20 | * Hmisc_3.17-1
21 | * laeken_0.4.6
22 | * lattice_0.20-33
23 | * MASS_7.3-43
24 | * microbenchmark_1.4-2.1
25 | * plyr_1.8.3
26 | * profr_0.3.1
27 | * Rcpp_0.12.4
28 | * robustbase_0.92-5
29 | * snow_0.4-1
30 | * survival_2.38-3
31 | * vcd_1.4-1
32 | * VIM_4.4.1
33 |
34 | **Chapter 3: Required packages**
35 | * knitr_1.12.3
36 | * Matrix_1.2-6
37 |
38 | **Chapter 4: Required packages**
39 | * coda_0.18-1
40 | * ggplot2_2.1.0
41 | * knitr_1.12.3
42 | * MASS_7.3-43
43 |
44 | **Chapter 5: Required packages**
45 | * mvtnorm_1.0-5
46 | * nloptr_1.0.4
47 | * RCEIM_0.2
48 |
49 | **Chapter 6: Required packages**
50 | * car_2.1-2
51 |
52 |
53 | **Chapter 7: Required packages**
54 | * boot_1.3-17
55 | * bootstrap_2015.2
56 | * cvTools_0.3.2
57 | * lattice_0.20-33
58 | * MASS_7.3-43
59 | * robustbase_0.92-5 car_2.1-2
60 | * vcd_1.4-1
61 |
62 |
63 | **Chapter 8: Required packages**
64 | * boot_1.3-17
65 | * colorspace_1.2-6
66 | * data.table_1.9.6
67 | * dplyr_0.4.3
68 | * e1071_1.6-7
69 | * ggplot2_2.1.0
70 | * laeken_0.4.6
71 | * MASS_7.3-43
72 | * robustbase_0.92-5
73 | * VIM_4.4.1
74 |
75 |
76 | **Chapter 9: Required packages**
77 | * colorspace_1.2-6
78 | * data.table_1.9.6
79 | * ggplot2_2.1.0
80 | * MASS_7.3-43
81 | * mice_2.25
82 | * Rcpp_0.12.4
83 | * robustbase_0.92-5
84 | * VIM_4.4.1
85 |
86 |
87 | **Chapter 10: Required packages**
88 | * boot_1.3-17
89 | * colorspace_1.2-6
90 | * data.table_1.9.6
91 | * dplyr_0.4.3
92 | * ggplot2_2.1.0
93 | * laeken_0.4.6
94 | * lattice_0.20-33
95 | * MASS_7.3-43
96 | * matrixcalc_1.0-3
97 | * mice_2.25
98 | * mvtnorm_1.0-5
99 | * pls_2.5-0
100 | * Rcpp_0.12.4
101 | * reshape2_1.4.1
102 | * robCompositions_2.0.1
103 | * robustbase_0.92-5
104 | * simFrame_0.5.3
105 | * simPop_0.2.4
106 | * vcd_1.4-1
107 | * VIM_4.4.1
108 |
109 |
110 | **Chapter 11: Required packages**
111 | * lattice_0.20-33
112 | * Rcpp_0.12.4
113 | * simFrame_0.5.3
114 |
115 |
116 |
117 | ##Related R Books:
118 | * [R for Data Science](https://www.packtpub.com/big-data-and-business-intelligence/r-data-science?utm_source=github&utm_medium=repository&utm_campaign=9781784390860)
119 | * [Learning R for Geospatial Analysis](https://www.packtpub.com/big-data-and-business-intelligence/learning-r-geospatial-analysis?utm_source=github&utm_medium=repository&utm_campaign=9781783984367)
120 | * [R Graphs Cookbook Second Edition](https://www.packtpub.com/big-data-and-business-intelligence/r-graph-cookbook-%E2%80%93-second-edition?utm_source=github&utm_medium=repository&utm_campaign=9781783988785)
121 | ### Download a free PDF
122 |
123 | If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost.
Simply click on the link to claim your free PDF.
124 |
https://packt.link/free-ebook/9781785881169
--------------------------------------------------------------------------------
/Chapter 11/chapter11.R:
--------------------------------------------------------------------------------
1 | ## ------------------------------------------------------------------------
2 | fert2Rate <- function(age, time, lastbirth){
3 | a <- ifelse(time <= 2020, 32, 33)
4 | b <- ifelse(time <= 2020, 6.0, 5.7)
5 | frate <- (b / a) * (a / age) ^ (3 / 2) * exp(-b ^ 2 * (a / age + age / a - 2))
6 | frate[age <= 15 | age >= 45 | lastbirth < 0.75] <- 0
7 | return(frate)
8 | }
9 |
10 | ## ------------------------------------------------------------------------
11 | fert2Rate(30, 2030, lastbirth = 3)
12 |
13 | ## ------------------------------------------------------------------------
14 | fert2Rate(40, 2030, lastbirth = 3)
15 |
16 | ## ------------------------------------------------------------------------
17 | mortRate <- function(age, time){
18 | a <- 0.0003
19 | b <- ifelse(time <= 2020, 0.1, 0.097)
20 | mrate <- a * exp(b * age)
21 | return(mrate)
22 | }
23 |
24 | ## ------------------------------------------------------------------------
25 | mortRate(40, 2056)
26 |
27 | ## ---- message=FALSE, warning=FALSE---------------------------------------
28 | library("simFrame")
29 | data(eusilcP, package = "simFrame")
30 | pop <- eusilcP[, c("age", "gender", "hsize", "hid")]
31 |
32 | ## ------------------------------------------------------------------------
33 | pop$nchildWomen <- ifelse(pop$gender == "female" & as.integer(pop$hsize) > 2 & pop$age > 17, as.integer(pop$hsize) - 2, 0)
34 | pop$partnered <- factor(ifelse(as.integer(pop$hsize) >= 2 & pop$age > 17, "P", "A"))
35 |
36 | ## ------------------------------------------------------------------------
37 | head(pop)
38 |
39 | ## ------------------------------------------------------------------------
40 | stateSpace <- expand.grid(sex = levels(pop$gender),
41 | partnered = levels(pop$partnered))
42 |
43 | ## ------------------------------------------------------------------------
44 | stateSpace
45 |
46 | ## ------------------------------------------------------------------------
47 | trMatrix_f <- cbind(c("female/A->female/P", "female/P->female/A"),
48 | c("rates1", "rates2"))
49 | trMatrix_m <- cbind(c("male/A-male/P", "male/P-male/A"),
50 | c("rates3", "rates4"))
51 | allTransitions <- rbind(trMatrix_f, trMatrix_m)
52 | absTransitions <- rbind(c("female/dead", "mortRate"),
53 | c("male/dead", "mortRate"))
54 |
55 | ## ---- message=FALSE, warning=FALSE---------------------------------------
56 | library("MicSim")
57 | transitionMatrix <- buildTransitionMatrix(allTransitions = allTransitions, absTransitions = absTransitions, stateSpace = stateSpace)
58 |
59 | ## ------------------------------------------------------------------------
60 | transitionMatrix[1,3] <- "rates3"
61 | transitionMatrix[3,1] <- "rates4"
62 | transitionMatrix
63 |
64 | ## ------------------------------------------------------------------------
65 | maxAge <- 100
66 |
67 | ## ------------------------------------------------------------------------
68 | love <- function(t, x, parms){
69 | with(as.list(c(parms, x)), {
70 | dPrince_Harry <- a * Chelsy_Davy
71 | dChelsy_Davy <- -b * Prince_Harry
72 | res <- c(dPrince_Harry, dChelsy_Davy)
73 | list(res)
74 | })
75 | }
76 |
77 | ## ------------------------------------------------------------------------
78 | parms <- c(a = 1, b = 2)
79 | times <- seq(0, 30, length = 31)
80 | ## Start values for steady state
81 | y <- xstart <- c(Prince_Harry = 1, Chelsy_Davy = -1)
82 |
83 | ## ---- message=FALSE, warning=FALSE---------------------------------------
84 | library("deSolve")
85 | out <- ode(xstart, times, love, parms = parms)
86 |
87 | ## ----B05113_11_01--------------------------------------------------------
88 | matplot(out)
89 |
90 | ## ----B05113_11_02--------------------------------------------------------
91 | y <- xstart <- c(Prince_Harry = 0.2, Chelsy_Davy = 1)
92 | parms <- c(a = 0.3, b = 0.7)
93 | out <- ode(xstart, times, love, parms = parms)
94 | matplot(out)
95 |
96 | ## ------------------------------------------------------------------------
97 | lv_mod <- function (time, state, parms) {
98 | with(as.list(c(state, parms)), {
99 | dx <- k1 * x - k2 * x * y
100 | dy <- -k3 * y + k4 * x * y
101 | return(list(c(dx, dy)))
102 | })
103 | }
104 |
105 | ## ------------------------------------------------------------------------
106 | parms <- c(k1 = 1, k2 = 1.5, k3 = .2, k4 = .6)
107 | state <- c(x = 10, y = 10)
108 | time <- seq(0, 200, by = 1)
109 |
110 | ## ------------------------------------------------------------------------
111 | res <- ode(func = lv_mod, y = state, parms = parms, times = time)
112 | res <- as.data.frame(res)
113 |
114 | ## ----B05113_11_03--------------------------------------------------------
115 | par(mar = c(4,4,0.5,0.1))
116 | matplot(res[,-1], type = "l", xlab = "time", ylab = "population")
117 | legend("topright", c("Toads", "Snakes"), lty = c(1,2), col = c(1,2), box.lwd = 0)
118 |
119 | ## ------------------------------------------------------------------------
120 | sessionInfo()
121 |
122 |
--------------------------------------------------------------------------------
/Chapter 3/chapter3.R:
--------------------------------------------------------------------------------
1 | ## ------------------------------------------------------------------------
2 | trunc (2.3 * 100)
3 |
4 | ## ------------------------------------------------------------------------
5 | round(1.49999999999999)
6 | round(1.4999999999999999)
7 |
8 | ## ----match---------------------------------------------------------------
9 | a <- seq(0.6, 1, by = 0.01)
10 | match(0.88, a)
11 | match(0.89, a)
12 | match(0.94, a)
13 |
14 | ## ----excel---------------------------------------------------------------
15 | excel_round <- function (x, digits) round (x * (1 + 1e-15), digits)
16 | round(0.5, digits = 0)
17 | excel_round(0.5, digits = 0)
18 |
19 | ## ----machine_eps---------------------------------------------------------
20 | .Machine$double.eps
21 | .Machine$double.digits
22 | .Machine$double.xmax
23 |
24 | ## ----B05113_03_01, echo=FALSE, fig.align='center'------------------------
25 | a <- c(0.65, 0.62, 0.41, 0.43, 0.61, 0.49, 0.44, 0.58, 0.47, 0.53, 0.52, 0.48, 0.51)
26 | par(mar=c(4,4,0.2,0.2))
27 | plot(1:length(a), a, ylim = c(0,1), xlab="n", yaxt="n", ylab="")
28 | axis(2, at=0.5, labels = "a", las=1)
29 | abline(h=0.5 + 0.03)
30 | abline(h=0.5 - 0.03)
31 | axis(2, at=0.5+0.03, labels = expression(a + epsilon), las=1)
32 | axis(2, at=0.5-0.03, labels = expression(a - epsilon), las=1)
33 | polygon(x=c(0,14,14,0,0), c(0.5-0.03,0.5-0.03,0.5+0.03,0.5+0.03,0.5-0.03), col="lightgrey")
34 | points(1:length(a), a)
35 |
36 | ## ----convergence1, cache=TRUE--------------------------------------------
37 | masch <- function(maxit=10000){
38 | summe <- 0
39 | summeNeu <- n <- 1
40 | ptm <- proc.time()
41 | while(summe != summeNeu & n < maxit){
42 | summe <- summeNeu
43 | summeNeu <- summe + 1/n
44 | n <- n + 1
45 | }
46 | time <- proc.time() - ptm
47 | list(summe=summeNeu, time=time[3])
48 | }
49 | masch(10000)$summe
50 | masch(1000000)$summe
51 | masch(10000000)$summe
52 |
53 | ## ---- echo=FALSE---------------------------------------------------------
54 | library(knitr)
55 |
56 | ## ----result, cache=TRUE--------------------------------------------------
57 | SEQ <- c(10,1000, seq(100000,10000000,length=10))
58 | df <- cbind(SEQ, t(as.data.frame(sapply(SEQ, masch))))
59 | kable(df)
60 |
61 | ## ----convergence2, cache=TRUE--------------------------------------------
62 | x <- 1
63 | oldsum = 0
64 | newsum = n = term = 1
65 | while(oldsum != newsum){
66 | oldsum = newsum
67 | term = 1/factorial(n)
68 | n = n + 1
69 | newsum = oldsum + term
70 | print(paste("n = ", n, ". Diff = ", term, ". Sum = ", newsum, sep=""))
71 | }
72 |
73 | ## ----sdev----------------------------------------------------------------
74 | ## first formula for the standard deviation:
75 | s1 <- function(x){
76 | s <- sum((x - mean(x))^2)
77 | return(sqrt(1/(length(x)-1) * s))
78 | }
79 | ## second formula for the standard deviation:
80 | s2 <- function(x){
81 | # s <- 1/(length(x)-1) * sum(x^2) - mean(x)^2
82 | s <- sum(x^2) - 1/length(x) * sum(x)^2
83 | return(sqrt(1/(length(x)-1) * s))
84 | }
85 | ## wrapper function:
86 | st <- function(x, type) {
87 | switch(type,
88 | precise = s1(x),
89 | oldexcel = s2(x)
90 | )
91 | }
92 | ## generating 1000 random numbers from standard normal distribution:
93 | x <- rnorm(1000)
94 | ## show more digits:
95 | options(digits=16)
96 | ## results:
97 | st(x, "precise")
98 | st(x, "oldexcel")
99 |
100 | ## ----sdev_results, warning=FALSE, message=FALSE, tidy=FALSE--------------
101 | stall <- function(x){
102 | c(precise=st(x, "precise"), excel=st(x, "oldexcel"))
103 | }
104 | ## generate numbers (zeros and ones)
105 | x <- rep(0:1,100)
106 | stall(x)
107 | X <- matrix(nrow=length(x), ncol=10)
108 | X[,1] <- 1
109 | for(n in 2:ncol(X)) X[, n] <- x + 10^(2 * (n - 1))
110 | colnames(X) <- 2 * (1:ncol(X) - 1)
111 | dim(X)
112 | ## first four observations:
113 | head(X,4)
114 | options(digits=5)
115 | apply(X, 2, stall)
116 |
117 | ## ----convergence3--------------------------------------------------------
118 | konv <- function(q = 2){
119 | s <- 0
120 | snew <- term <- i <- 1
121 | while(s != snew){
122 | s <- snew
123 | snew <- s + q^(-i)
124 | i <- i + 1
125 | }
126 | list(iteration = i, total = snew, lastterm = 2^(-i))
127 | }
128 | konv()
129 |
130 | ## ------------------------------------------------------------------------
131 | konv(q = 0.5)
132 |
133 | ## ----expsum--------------------------------------------------------------
134 | expsum <- function(x)
135 | { oldsum <- 0
136 | newsum <- n <- term <- 1
137 | while( oldsum != newsum )
138 | { oldsum <- newsum
139 | term <- term * x/n
140 | n <- n + 1
141 | newsum <- oldsum + term
142 | }
143 | list(iteration = n, summe = newsum)
144 | }
145 |
146 | ## ----error, cache=TRUE---------------------------------------------------
147 | x <- 1:1000
148 | absError <- sapply (x, function (x) abs (exp (x) - expsum (x) $ sum))
149 | relError <- absError / exp (x)
150 |
151 | ## ----error2--------------------------------------------------------------
152 | roundingError <- sapply(x, function(x) 2^(-53)*exp(x))
153 |
154 | ## ----B05113_03_02--------------------------------------------------------
155 | plot(x[1: 600], roundingError[1:600], log = "xy", xlab = "log (x)",
156 | ylab = "log (rounding errors)", type = "l")
157 |
158 | ## ----B05113_03_03--------------------------------------------------------
159 | x <- seq(0, 20, by=0.1)
160 | iter <- sapply(x, function(x) expsum(x)$iteration)
161 | plot(x, iter, xlab="x", ylab="No. of iterationen until convergence", type="l")
162 |
163 | ## ----condition-----------------------------------------------------------
164 | library("Matrix")
165 | ## reciprocal approximate condition number
166 | rcond(Hilbert(9)) ## worse
167 | ## reciprocal condition number
168 | x1 <- cbind(1, 1:10)
169 | head(x1, 3)
170 | rcond(x1) ## much better
171 |
172 | ## ------------------------------------------------------------------------
173 | sessionInfo()
174 |
175 |
--------------------------------------------------------------------------------
/Chapter 5/chapter5.R:
--------------------------------------------------------------------------------
1 | ## ----rosenbroeck---------------------------------------------------------
2 | mountains <- function(v) {
3 | (1 - v[1])^2 + 100 * (v[2] - v[1]*v[1])^2 +
4 | 0.3*(0.2 - 2*v[2])^2 + 100 * (v[1] - v[2]*v[2])^2 -
5 | 0.5*(v[1]^2 +5*v[2]^2)
6 | }
7 |
8 | ## ---- eval=FALSE---------------------------------------------------------
9 | ## library("animation")
10 | ## grad.desc()
11 |
12 | ## ---- eval=FALSE---------------------------------------------------------
13 | ## ani.options(nmax = 70)
14 | ## par(mar = c(4, 4, 2, 0.1))
15 | ## f2 = function(x, y) sin(1/2 * x^2 - 1/4 * y^2 + 3) * cos(2 * x + 1 -
16 | ## exp(y))
17 | ## grad.desc(f2, c(-2, -2, 2, 2), c(-1, 0.5), gamma = 0.3, tol = 1e-04)
18 |
19 | ## ----B05113_05_02--------------------------------------------------------
20 | n <- 300
21 | ## to define a grid
22 | x <- seq(-1, 2, length.out = n)
23 | y <- seq(-1, 2, length.out = n)
24 | ## evaluate on each grid point
25 | z <- mountains(expand.grid(x, y))
26 | ## contour plot
27 | par(mar = c(4,4,0.5,0.5))
28 | contour(x, y, matrix(log10(z), length(x)),
29 | xlab = "x", ylab = "y", nlevels = 20)
30 | ## starting value
31 | sta <- c(0.5,-1)
32 | points(sta[1], sta[2], cex = 2, pch = 20)
33 | ## solutions for each of 20 steps
34 | sol <- matrix(, ncol=2, nrow = 21)
35 | sol[1, ] <- sta
36 | for(i in 2:20){
37 | sol[i, ] <- nlm(mountains, sta, iterlim = i)$est
38 | }
39 | ## optimal solution
40 | sol[21, ] <- nlm(mountains, sta)$est
41 | points(sol[21, 1], sol[21, 2], cex = 3, col = "red", pch = 20)
42 | ## path visually
43 | lines(sol, pch=3, type="o")
44 | ## now let's start better (dashed line)
45 | sta <- c(0,-1)
46 | for(i in 2:20){
47 | sol[i, ] <- nlm(mountains, sta, iterlim = i)$est
48 | }
49 | sol[1, ] <- sta
50 | sol[21, ] <- nlm(mountains, sta)$est
51 | points(sta[1], sta[2], cex = 2, pch = 20)
52 | points(sol[21, 1], sol[21, 2], cex = 3, col = "red", pch = 20)
53 | lines(sol, pch=3, type="o")
54 |
55 | ## ----B05113_05_03, cache=FALSE-------------------------------------------
56 | ## wrapper for all methods of optim
57 | optims <- function(x, meth = "Nelder-Mead", start = c(0.5, -1)){
58 | sol <- matrix(, ncol = 2, nrow = 21)
59 | sol[1, ] <- start
60 | for(i in 2:20){
61 | sol[i, ] <- optim(start, mountains, method = meth,
62 | control = list(maxit=i))$par
63 | }
64 | sol[21,] <- optim(start, mountains)$par
65 | points(start[1], start[2], pch=20, cex = 2)
66 | points(sol[21, ], sol[21, ], pch = 20, col = "red", cex = 3)
67 | lines(sol[, 1], sol[, 2], type = "o", pch = 3)
68 | }
69 | ## plot lines for all methods
70 | par(mar=c(4,4,0.5,0.5))
71 | contour(x, y, matrix(log10(z), length(x)), xlab = "x", ylab = "y", nlevels = 20)
72 | optims() # Nelder-Mead
73 | optims("BFGS")
74 | optims("CG")
75 | optims("L-BFGS-B")
76 | optims("SANN")
77 | optims("Brent")
78 | optims(start = c(1.5,0.5))
79 |
80 | ## ----B05113_05_04, cache=TRUE--------------------------------------------
81 | ## define grid
82 | n <- 1500
83 | set.seed(1234567)
84 | x1 <- runif(n, min = -2, max = 5)
85 | y1 <- runif(n, min = -2, max = 5)
86 | z1 <- matrix(, ncol = n, nrow = n)
87 | ## evaluate on each grid point
88 | for(i in 1:n){
89 | for(j in 1:n){
90 | z1[i,j] <- mountains(c(x1[i], y1[j]))
91 | }
92 | }
93 | ## determine optima
94 | w <- which(z1 == min(z1), arr.ind=TRUE)
95 | ## plot results
96 | par(mar=c(4,4,0.5,0.5))
97 | contour(x, y, matrix(log10(z), length(x)), xlab = "x", ylab = "y", nlevels = 20)
98 | points(x1[w[1]], y1[w[2]], pch = 20, col = "red", cex = 3)
99 | points(x1, y1, pch=3)
100 |
101 | ## ----B05113_05_05, cache=TRUE--------------------------------------------
102 | library("RCEIM")
103 | set.seed(123)
104 | sol <- best <- list()
105 | ## save solution for each step
106 | for(i in 2:20){
107 | a <- ceimOpt(mountains, nParam = 2, maxIter = i)
108 | sol[[i]] <- a$EliteMembers
109 | best[[i]] <- a$BestMember
110 | }
111 | ## plot the results for each step
112 | par(mar=c(4,4,0.5,0.5))
113 | contour(x, y, matrix(log10(z), length(x)), xlab = "x", ylab = "y", nlevels = 20)
114 | greys <- grey(rev(2:20 / 20 - 0.099))
115 | for(i in 2:20){
116 | points(sol[[i]][,1], sol[[i]][,2], col = greys[i])
117 | points(best[[i]][1], best[[i]][2], col = "red", pch = 3)
118 | }
119 | points(best[[i]][1], best[[i]][2], col = "red", pch = 20, cex = 3)
120 |
121 | ## ----randomwalk----------------------------------------------------------
122 | ## Simple random walk Metropolis Hastings:
123 | rmh <- function(n = 20, start = c(0,-0.5), stepmult = 10){
124 | x <- matrix(, ncol = 2, nrow = n)
125 | x[1, ] <- start
126 | sol <- mountains(start)
127 | for(i in 2:n){
128 | x[i, ] <- x[i-1, ] + rmvnorm(1, mean = c(0, 0),
129 | sigma = stepmult * diag(2) / n)
130 | solnew <- mountains(x[i, ])
131 | # accept only a better solution:
132 | if(solnew > sol) x[i, ] <- x[i-1, ]
133 | if(solnew < sol) sol <- solnew
134 | }
135 | return(x)
136 | }
137 |
138 | ## ----walk----------------------------------------------------------------
139 | library("mvtnorm")
140 | set.seed(12345)
141 | n <- 200
142 | x1 <- rmh(n, start = c(1.5,0))
143 | x2 <- rmh(n, start = c(1.5,0))
144 |
145 | ## ----B05113_05_06--------------------------------------------------------
146 | par(mar=c(4,4,0.5,0.5))
147 | contour(x, y, matrix(log10(z), length(x)), xlab = "x", ylab = "y", nlevels = 20)
148 | points(x1[1, 1], x1[1, 2], pch = 4, cex = 3)
149 | points(x2[n, 1], x2[n, 2], pch = 20, col = "red", cex = 3)
150 | points(x1[n, 1], x1[n, 2], pch = 20, col = "red", cex = 3)
151 | lines(x1[, 1], x1[, 2], type = "o", pch = 3)
152 | lines(x2[, 1], x2[, 2], type = "o", col = "blue", lty = 2)
153 |
154 | ## ----B05113_05_07code----------------------------------------------------
155 | stoGrad <- function(start = c(0, -0.5), j = 1500, p = 0.1){
156 | theta <- matrix(start, ncol=2)
157 | diff <- iter <- 1
158 | alpha <- sapply(1:100, function(x) 1 / (j+1) )
159 | beta <- sapply(1:100, function(x) 1 / (j+1)^(p) )
160 |
161 | while( diff > 10^-5 & !is.nan(diff) & !is.na(diff) ){
162 | zeta <- rnorm(2)
163 | zeta <- zeta / sqrt(t(zeta) %*% zeta)
164 | grad <- alpha[iter] * zeta * (mountains(theta[iter, ] + beta[iter] * zeta) -
165 | mountains(theta[iter, ] - beta[iter] * zeta)) / beta[iter]
166 | theta <- rbind(theta, theta[iter, ] - grad)
167 | diff <- sqrt(t(grad) %*% grad )
168 | iter <- iter + 1
169 | }
170 | list(theta = theta[1:(iter-1), ], diff = diff, iter = iter-1)
171 | }
172 |
173 | ## ----B05113_05_07--------------------------------------------------------
174 | set.seed(123)
175 | s1 <- stoGrad()
176 | par(mar=c(4,4,0.5,0.5))
177 | contour(x, y, matrix(log10(z), length(x)), xlab = "x", ylab = "y", nlevels = 20)
178 | plotLine <- function(x, ...){
179 | lines(x$theta[,1], x$theta[,2], type = "o", ...)
180 | points(x$theta[x$iter, 1], x$theta[x$iter, 1], pch = 20, col = "red", cex = 3)
181 | }
182 | plotLine(s1, pch = 3)
183 | points(0, -0.5, pch = 20, cex = 1.5)
184 | plotLine(stoGrad(), col = "blue", pch = 4)
185 | plotLine(stoGrad(start = c(1.5, 0)), pch = 3, lty = 2)
186 | plotLine(stoGrad(start = c(1.5, 0)), col = "blue", pch = 4, lty = 2)
187 | points(1.5, 0, pch = 20, cex = 1.5)
188 |
189 | ## ----B05113_05_08--------------------------------------------------------
190 | set.seed(123)
191 | s1 <- stoGrad(p = 2.5)
192 | par(mar=c(4,4,0.5,0.5))
193 | contour(x, y, matrix(log10(z), length(x)), xlab = "x", ylab = "y", nlevels = 20)
194 | plotLine <- function(x, ...){
195 | lines(x$theta[,1], x$theta[,2], type = "o", ...)
196 | points(x$theta[x$iter, 1], x$theta[x$iter, 1], pch = 20, col = "red", cex = 3)
197 | }
198 | plotLine(s1, pch = 3)
199 | points(0, -0.5, pch = 20, cex = 1.5)
200 | plotLine(stoGrad(p = 2.5), col = "blue", pch = 4)
201 | plotLine(stoGrad(start = c(1.5, 0), j=1500, p=2.5), pch = 3, lty = 2)
202 | plotLine(stoGrad(start = c(1.5, 0), j=1500, p=2.5), col = "blue", pch = 4, lty = 2)
203 | points(1.5, 0, pch = 20, cex = 1.5)
204 |
205 | ## ----B05113_05_09, warning=FALSE, message=FALSE--------------------------
206 | library("nloptr")
207 | set.seed(123)
208 | ## mountains function with modified function parameters
209 | mountains1 <-
210 | function(x) ((1 - x[1])^2 + 100 * (x[2] - x[1]*x[1])^2 +
211 | 0.3*(0.2 - 2*x[2])^2 + 100 * (x[1] - x[2]*x[2])^2 -
212 | 0.5*(x[1]^2 +5*x[2]^2))
213 | x0 <- c(0.5, -1)
214 | lb <- c(-3, -3)
215 | ub <- c(3, 3)
216 | sol <- matrix(, ncol=2,nrow=21)
217 | ## solution on each step
218 | for(i in 1:20){
219 | sol[i, ] <- isres(x0 = x0, fn = mountains1, lower = lb, upper = ub, maxeval = i)$par
220 | }
221 | par(mar=c(4,4,0.5,0.5))
222 | contour(x, y, matrix(log10(z), length(x)), xlab = "x", ylab = "y", nlevels = 20)
223 | ## start
224 | points(sol[1, 1], sol[1, 2], pch = 20, cex = 2)
225 | ## optima found
226 | sol[21,] <- isres(x0 = x0, fn = mountains1, lower = lb, upper = ub)$par
227 | points(sol[21, 1], sol[21, 2], pch = 20, col = "red", cex = 3)
228 | ## way to optima
229 | lines(sol[,1], sol[,2], type = "o", pch = 3)
230 |
231 | ## ------------------------------------------------------------------------
232 | sessionInfo()
233 |
234 |
--------------------------------------------------------------------------------
/Chapter 9/chapter9.R:
--------------------------------------------------------------------------------
1 | ## ---- echo=FALSE---------------------------------------------------------
2 | knitr::opts_chunk$set(cache=TRUE)
3 |
4 | ## ------------------------------------------------------------------------
5 | y <- matrix(c(11, 22, 16, 24, 17, NA), nrow=2)
6 | y
7 |
8 | ## ------------------------------------------------------------------------
9 | m <- mean(y, na.rm = TRUE)
10 | m
11 | y[2,3] <- m
12 | y
13 |
14 | ## ------------------------------------------------------------------------
15 | ## stop criterion
16 | eps <- 0.001
17 | ## intitialisations
18 | yalt <- y
19 | n <- 0
20 | converged <- FALSE
21 | ## iteration
22 | while(!converged){
23 | n <- n + 1
24 | yalt <- y
25 | m1 <- mean(y)
26 | ## E-step (estimate parameters)
27 | a <- rowMeans(y) - m1
28 | b1 <- colMeans(y) - m1
29 | ## M-step (update y23)
30 | y[2, 3] <- m1 + a[2] + b1[3]
31 | ## stop criterion
32 | converged <- (abs(y[2, 3] - yalt[2, 3]) < eps)
33 | }
34 | list(yImp = y, iterations = n)
35 |
36 | ## ------------------------------------------------------------------------
37 | distMan <- function(x, centers){
38 | if(class(x) == "data.frame") x <- as.matrix(x)
39 | d <- matrix(0, nrow=nrow(x), ncol=nrow(centers))
40 | ## dist center to observations for each cluster
41 | for(k in 1:nrow(centers)){
42 | d[,k] <- abs( colSums((t(x) - centers[k,])) )
43 | }
44 | return(d)
45 | }
46 |
47 | ## ------------------------------------------------------------------------
48 | means <- function(x, cluster){
49 | cen <- matrix(NA, nrow=max(cluster), ncol <- ncol(x))
50 | ## cluster means for each cluster
51 | for(n in 1:max(cluster)){
52 | cen[n,] <- apply(x[cluster==n,], 2, median)
53 | }
54 | return(cen)
55 | }
56 |
57 | ## ------------------------------------------------------------------------
58 | my_kmeans <- function(x, k, clmeans = means, distance = distMan, iter = 99999, seed = NULL){
59 | if(!is.null(seed)) set.seed(seed)
60 | cent <- newcent <- x[sample(1:nrow(x), size=k), ]
61 | oldclust <- 0
62 | j <- 0
63 | for(i in 1:iter){ # better: while()
64 | j <- j + 1
65 | cent <- newcent
66 | ## M-step
67 | dist <- distance(x, cent)
68 | clust <- max.col(-dist)
69 | ## E-step
70 | newcent <- clmeans(x, clust)
71 | if(all(clust == oldclust)) break()
72 | oldclust <- clust
73 | }
74 | res <- list(centers = cent,
75 | cluster = factor(clust),
76 | iterations = j)
77 | return(res)
78 | }
79 |
80 | ## ----B05113_09_01--------------------------------------------------------
81 | data(Nclus, package = "flexclust")
82 | x <- data.frame(Nclus)
83 | library("ggplot2")
84 | qplot(X1, X2, data=data.frame(Nclus)) +
85 | xlab("") + ylab("") + theme_bw()
86 |
87 | ## ---- warning=FALSE,message=FALSE----------------------------------------
88 | set.seed(123456)
89 | cl1 <- kmeans(x, centers = 4, iter.max = 1, algorithm = "MacQueen")
90 | set.seed(123456)
91 | cl2 <- kmeans(x, centers = 4, iter.max = 2, algorithm = "MacQueen")
92 | set.seed(123456)
93 | cl3 <- kmeans(x, centers = 4, iter.max = 3, algorithm = "MacQueen")
94 | set.seed(123456)
95 | cl4 <- kmeans(x, centers = 4, algorithm = "MacQueen")
96 |
97 | ## ------------------------------------------------------------------------
98 | cl1$centers
99 |
100 | ## ---- echo=FALSE---------------------------------------------------------
101 | initstep <- function(x = Nclus, cluster = cl1){
102 | plot(x, main = "E-step (1)", cex.main=1.8)
103 | points(cluster$center[, 1], cluster$center[, 2], cex = 3, pch = 18)
104 | }
105 | estep <- function(x = Nclus, cluster = cl2, text = "E-step (1)"){
106 | plot(x, col = cluster$cluster, pch = as.numeric(cluster$cluster), main=text,
107 | cex.main=1.8)
108 | points(cluster$center[, 1], cluster$center[, 2], cex = 3, pch = 18, col = 1:4)
109 | }
110 | mstep <- function(x = Nclus, cluster = cl2, cluster2 = cl2, text="M-step (1)"){
111 | plot(x, col = cluster2$cluster, pch = as.numeric(cluster2$cluster), main=text,
112 | cex.main=1.8)
113 | points(cluster2$center[, 1], cluster2$center[, 2], cex = 3, pch = 18, col = 1:4)
114 | segments(x0 = Nclus[cluster$cluster == 1, 1], x1 = cluster$center[1,1],
115 | y0 = Nclus[cluster$cluster == 1, 2], y1 = cluster$center[1,2], col = "grey")
116 | segments(x0 = Nclus[cluster$cluster == 2, 1], x1 = cluster$center[2,1],
117 | y0 = Nclus[cluster$cluster == 2, 2], y1 = cluster$center[2,2], col = "grey")
118 | segments(x0 = Nclus[cluster$cluster == 3, 1], x1 = cluster$center[3,1],
119 | y0 = Nclus[cluster$cluster == 3, 2], y1 = cluster$center[3,2], col = "grey")
120 | segments(x0 = Nclus[cluster$cluster == 4, 1], x1 = cluster$center[4,1],
121 | y0 = Nclus[cluster$cluster == 4, 2], y1 = cluster$center[4,2], col = "grey")
122 | points(x=Nclus[,1], y=Nclus[,2], col = cluster2$cluster, pch = as.numeric(cluster2$cluster))
123 | points(cluster$center[, 1], cluster$center[, 2], cex = 3, pch = 18, col = 1:4)
124 | }
125 |
126 | ## ----B05113_09_02, fig.width=9, fig.height=9, echo=FALSE-----------------
127 | par(mfrow = c(3,2), mar = c(1,1,3,0.1), axis=FALSE, xaxt="n", yaxt="n")
128 | initstep()
129 | mstep(cluster = cl1, cluster2 = cl1)
130 | estep(cluster = cl2, text="E-step (2)")
131 | mstep(cluster = cl2, cluster2=cl2, text="M-step (2)")
132 | estep(cluster = cl3, text="E-step (converged)")
133 | mstep(cluster = cl3, cluster2=cl3, text="E-step (converged)")
134 |
135 | ## ---- message=FALSE, warning=FALSE---------------------------------------
136 | library("MASS")
137 | library("robustbase")
138 | library("VIM")
139 | data("sleep")
140 | str(sleep)
141 |
142 | ## ------------------------------------------------------------------------
143 | apply(sleep, 2, function(x) any(is.na(x)))
144 |
145 | ## ---- message=FALSE, warning=FALSE---------------------------------------
146 | ## index of missing values
147 | ind <- data.frame(is.na(sleep))
148 | ## initialization
149 | sleep <- kNN(sleep)
150 | ## overwrite missing initialization with bad choice
151 | sleep$Sleep[ind$Sleep] <- 2240 # bad initialization
152 | ## initialized missing values in variable sleep
153 | sleep$Sleep[ind$Sleep]
154 |
155 | ## ---- message=FALSE, warning=FALSE---------------------------------------
156 | ## E-step (1)
157 | lm1 <- lm(Sleep ~ log(BodyWgt) + log(BrainWgt) + NonD + Danger, data = sleep)
158 | ## M-step (1)
159 | sleep$Sleep[ind$Sleep] <- predict(lm1)[ind$Sleep]
160 | ## print of updated missing values
161 | sleep$Sleep[ind$Sleep]
162 |
163 | ## ---- message=FALSE, warning=FALSE---------------------------------------
164 | ## E-step (2)
165 | lm1 <- lm(Sleep ~ log(BodyWgt) + log(BrainWgt) + NonD + Danger, data = sleep)
166 | ## M-step (2)
167 | sleep$Sleep[ind$Sleep] <- predict(lm1)[ind$Sleep]
168 | ## print of updated missing values
169 | sleep$Sleep[ind$Sleep]
170 |
171 | ## ------------------------------------------------------------------------
172 | EMregSleep <- function(method = lm, eps = 0.001, init = "bad"){
173 | ## index of missing values
174 | ind <- is.na(sleep)
175 | colnames(ind) <- colnames(sleep)
176 | indsleep <- ind[, "Sleep"]
177 | ## initialization
178 | if(init == "bad"){
179 | sleep <- kNN(sleep, imp_var = FALSE)
180 | sleep$Sleep[indsleep] <- 2240 # bad initialization
181 | }
182 | if(init == "worst"){
183 | sleep[ind] <- 2240 # worst initialization
184 | }
185 | iteration <- 0
186 | criteria <- 99999
187 | while(criteria > eps){
188 | iteration <- iteration + 1
189 | prev_sol <- sleep$Sleep[indsleep]
190 | ## E-step
191 | lm1 <- method(Sleep ~ log(BodyWgt) + log(BrainWgt) + NonD + Danger,
192 | data = sleep)
193 | ## M-step
194 | sleep$Sleep[indsleep] <- predict(lm1)[indsleep]
195 | criteria <- sqrt(sum((prev_sol - sleep$Sleep[indsleep])^2))
196 | }
197 | res <- list("imputed" = sleep,
198 | "iteration" = iteration,
199 | lastmodel = lm1)
200 | return(res)
201 | }
202 |
203 | ## ------------------------------------------------------------------------
204 | data("sleep")
205 | sleepImp <- EMregSleep()
206 | missVals <- sleepImp$imputed$Sleep[ind$Sleep]
207 | missVals
208 | sleepImp$iteration
209 |
210 | ## ------------------------------------------------------------------------
211 | missVals + sample(residuals(sleepImp$lastmodel), length(missVals))
212 |
213 | ## ---- message=FALSE, warning=FALSE---------------------------------------
214 | data("sleep")
215 | ## OLS regression
216 | lm_ols <- EMregSleep(method = lm, init = "worst")
217 | ## M-estimation
218 | lm_rlm <- EMregSleep(method = rlm, init = "worst", eps= 0.01)
219 | lm_ols$imputed$Sleep[ind[, "Sleep"]]
220 | lm_rlm$imputed$Sleep[ind[, "Sleep"]]
221 |
222 | ## ------------------------------------------------------------------------
223 | data("sleep")
224 | sleepImp <- irmi(sleep)
225 | sleepImp[ind[, "Sleep"], "Sleep"]
226 |
227 | ## ------------------------------------------------------------------------
228 | library("mice")
229 | data("sleep")
230 | em_mice <- mice(sleep, m = 1)
231 | em_mice$imp$Sleep
232 | ## now with bad intitialisation in predictors
233 | sleep[is.na(sleep)] <- 2240
234 | sleep$Sleep[ind[, "Sleep"]] <- NA
235 | em_mice <- mice(sleep, m = 1)
236 | em_mice$imp$Sleep
237 |
238 | ## ------------------------------------------------------------------------
239 | args(irmi)
240 |
241 | ## ------------------------------------------------------------------------
242 | sessionInfo()
243 |
244 |
--------------------------------------------------------------------------------
/Chapter 6/chapter6.R:
--------------------------------------------------------------------------------
1 | ## ---- message=FALSE, warning=FALSE, tidy=FALSE, eval=TRUE----------------
2 | library("RCurl")
3 | URL <- "https://www.national-lottery.co.uk/results/euromillions/draw-history/csv"
4 | lotto <- read.csv(textConnection(getURL(URL)))
5 |
6 | ## ------------------------------------------------------------------------
7 | str(lotto)
8 |
9 | ## ----B05113_06_01, fig.align='center'------------------------------------
10 | numbers <- unlist(c(lotto[,2:5]))
11 | library("ggplot2")
12 | qplot(factor(numbers), xlab = "winning numbers") +
13 | theme_bw() +
14 | theme(axis.text.x=element_text(angle=90)) +
15 | scale_y_continuous(breaks=0:10)
16 |
17 | ## ------------------------------------------------------------------------
18 | simulateDice <- function(n=200){
19 | A <- c(2, 4, 6)
20 | B <- 1:4
21 | C <- 3:6
22 | AB <- intersect(A, B)
23 | AC <- intersect(A, C)
24 | BC <- intersect(B, C)
25 | ABC <- intersect(AB, AC)
26 | df <- data.frame(d=rep(0,n), A=rep(0,n), B=rep(0,n), C=rep(0,n),
27 | AB=rep(0,n), AC=rep(0,n), BC=rep(0,n), ABC=rep(0,n))
28 | s <- sample(1:6, n, replace=TRUE)
29 | df[s %in% ABC, "ABC"] <- 1
30 | df[s %in% BC & !(df$ABC == 1), "BC"] <- 1
31 | df[s %in% AC & !(apply(df[, c("BC","ABC")], 1, function(x) any(x == 1))), "AC"] <- 1
32 | df[s %in% AB & !(apply(df[, c("AC","BC","ABC")], 1, function(x) any(x == 1))), "AB"] <- 1
33 | df[s %in% C & !(apply(df[, c("AB","AC","BC","ABC")], 1, function(x) any(x == 1))), "C"] <- 1
34 | df[s %in% B & !(apply(df[, c("C","AB","AC","BC","ABC")], 1, function(x) any(x == 1))), "B"] <- 1
35 | df[s %in% A & !(apply(df[, c("B","C","AB","AC","BC","ABC")], 1, function(x) any(x == 1))), "A"] <- 1
36 | df[, "d"] <- s
37 | return(df)
38 | }
39 | set.seed(123)
40 | simulateDice(10)
41 |
42 | ## ------------------------------------------------------------------------
43 | set.seed(123)
44 | s100 <- simulateDice(100)
45 | ## count the outcomes regarding the different events:
46 | colSums(s100[, 2:ncol(s100)])
47 |
48 | ## ------------------------------------------------------------------------
49 | set.seed(123)
50 | s1000 <- simulateDice(1000)
51 | colSums(s1000[, 2:ncol(s1000)])
52 |
53 | ## ------------------------------------------------------------------------
54 | s1000 <- simulateDice(1000)
55 | colSums(s1000[, 2:ncol(s1000)])
56 |
57 | ## ----B05113_06_03, echo=FALSE, fig.align='center'------------------------
58 | a <- c(0.65, 0.62, 0.41, 0.43, 0.61, 0.49, 0.44, 0.58, 0.47, 0.53, 0.52, 0.48, 0.51)
59 | plot(1:length(a), a, ylim = c(0,1), xlab="n", yaxt="n", ylab="")
60 | #axis(2, at=0.5, labels = "a", las=1)
61 | abline(h=0.5 + 0.03)
62 | abline(h=0.5 - 0.03)
63 | axis(2, at=0.5+0.03, labels = expression(a + epsilon), las=1)
64 | axis(2, at=0.5-0.03, labels = expression(a - epsilon), las=1)
65 | polygon(x=c(0,14,14,0,0), c(0.5-0.03,0.5-0.03,0.5+0.03,0.5+0.03,0.5-0.03), col="lightgrey")
66 | points(1:length(a), a)
67 | d <- dnorm(seq(-5,5,length.out = 100)) *10 + 6
68 | df <- data.frame(d=d, s=seq(-0.2+0.5, 0.2+0.5, length.out = length(d)))
69 | lines(df$d, df$s)
70 | abline(v=6, col="grey")
71 | d <- dnorm(seq(-10,10,length.out = 100)) *3.5 + 2
72 | df <- data.frame(d=d, s=seq(-1.4+0.5, 1.4+0.5, length.out = length(d)))
73 | lines(df$d, df$s)
74 | abline(v=2, col="grey")
75 |
76 | ## ------------------------------------------------------------------------
77 | dbinom(x = 0, size = 1, prob = 0.5)
78 |
79 | ## ------------------------------------------------------------------------
80 | sample(c("head", "number"), size = 1)
81 | # alternativ:
82 | rbinom(n = 1, size = 1, prob = 0.5)
83 |
84 | ## ------------------------------------------------------------------------
85 | simCoin <- function(n, p = 0.5, repl = 1){
86 | stopifnot(n > 0 | !is.vector(n) | p < 0 | p > 0 | !is.vector(repl))
87 | ## function for one simulation
88 | r <- function(){
89 | res <- rbinom(n, 1, p)
90 | tosses <- 1:n
91 | pA <- cumsum(res) / 1:n
92 | abserror <- abs(pA - p)
93 | return(data.frame(res = res, tosses = tosses, pA = pA, abserror = abserror))
94 | }
95 | ## simulation
96 | df <- r()
97 | if(repl > 1){
98 | for(i in 2:repl){
99 | df <- rbind(df, r())
100 | }
101 | }
102 | ## return
103 | df$repl <- rep(1:repl, each = n)
104 | ll <- list(res = df$res, tosses = df$tosses, pA = df$pA,
105 | absfehler = df$abserror, repl = as.factor(df$repl))
106 | class(ll) <- "Coin"
107 | return(ll)
108 | }
109 | ## print
110 | print.Coin <- function(x, ..., s = NULL){
111 | if(!is.null(s)){
112 | cat("After", s, "random draws: the estimated P(A) =", x$pA[s], "\nand the absolute error", x$absfehler[s], "\n")
113 | } else {
114 | m <- max(x$tosses)
115 | cat("After", m, "random draws: the estimated P(A) =", x$pA[m], "\nand the absolute error", x$absfehler[m], "\n")
116 | }
117 | }
118 |
119 | ## ------------------------------------------------------------------------
120 | ## for reproducibility
121 | set.seed(1234)
122 | # 10 throws
123 | simCoin(10)
124 |
125 | ## ---- cache=TRUE---------------------------------------------------------
126 | set.seed(1234)
127 | sim <- simCoin(5000)
128 | print(sim, s=100)
129 | print(sim, s=1000)
130 | print(sim, s=5000)
131 |
132 | ## ----lehr1---------------------------------------------------------------
133 | plot.Coin <- function(x, y, ...){
134 | df <- data.frame(res = x$res, tosses = x$tosses, pA = x$pA, repl=x$repl)
135 | if(length(unique(df$repl)) == 1){
136 | ggplot(df, aes(x=tosses, y=pA)) +
137 | geom_line() + geom_abline(intercept = 0.5) + ylim(c(0,1)) +
138 | theme(legend.position="none")
139 | } else if(length(unique(df$repl)) > 10){
140 | gg <- ggplot(df, aes(x=tosses, y=pA, group=repl)) +
141 | geom_line() + geom_abline(intercept = 0.5) + ylim(c(0,1))
142 | ## add median line and confidence interval
143 | dfwide <- reshape2::dcast(df, tosses ~ repl, value.var="pA")
144 | dfwide <- dfwide[, 2:ncol(dfwide)]
145 | med <- apply(dfwide, 1, median)
146 | q025 <- apply(dfwide, 1, quantile, 0.025)
147 | q975 <- apply(dfwide, 1, quantile, 0.975)
148 | stat <- data.frame(med=med, q025=q025, q975=q975,
149 | n=1:max(x$tosses),
150 | repl=max(as.numeric(df$repl)))
151 | gg +
152 | geom_line(data=stat, aes(x = n, y = med), colour = "red", size=1) +
153 | geom_line(data=stat, aes(x = n, y = q025), colour = "orange", size=0.7) +
154 | geom_line(data=stat, aes(x = n, y = q975), colour = "orange", size=0.7) +
155 | theme(legend.position="none")
156 | } else {
157 | ggplot(df, aes(x=tosses, y=pA, colour = repl)) +
158 | geom_line() + geom_abline(intercept = 0.5) + ylim(c(0,1))
159 | }
160 | }
161 |
162 | ## ----B05113_06_04--------------------------------------------------------
163 | plot(sim)
164 |
165 | ## ----B05113_06_05, cache=TRUE--------------------------------------------
166 | set.seed(1234)
167 | sim <- simCoin(n = 5000, repl = 10)
168 | plot(sim)
169 |
170 | ## ----B05113_06_06, cache=TRUE--------------------------------------------
171 | sim <- simCoin(n = 5000, repl = 1000)
172 | plot(sim)
173 |
174 | ## ----B05113_06_07, fig.height=7------------------------------------------
175 | plotbinomcoin <- function(n){
176 | plot(0:n/n, dbinom(0:n, n, 0.5), type = "h",
177 | xlab = paste("relative frequencies (n =", n,")"),
178 | ylab = "p")
179 | }
180 | par(mar = c(4,4,0.5,0.5), mfrow = c(4,2))
181 | plotbinomcoin(10)
182 | plotbinomcoin(20)
183 | plotbinomcoin(40)
184 | plotbinomcoin(80)
185 | plotbinomcoin(160)
186 | plotbinomcoin(320)
187 | plotbinomcoin(5000)
188 | plotbinomcoin(10000)
189 |
190 | ## ------------------------------------------------------------------------
191 | dbinom(0, 1, 0.5)
192 |
193 | ## ------------------------------------------------------------------------
194 | set.seed(10230)
195 | s <- sample(c(FALSE,TRUE), 5000, replace = TRUE)
196 | s[1:10]
197 | ev <- function(x){
198 | res <- c(length(x), sum(x), sum(x) / length(x), abs(sum(x) / length(x) - 0.5))
199 | names(res) <- c("n", "h(A)", "p(A)", "|p(A) - 0.5|")
200 | return(res)
201 | }
202 | # summary of first 10 outcomes
203 | ev(s[1:10])
204 |
205 | ## ------------------------------------------------------------------------
206 | # summary of first 100 outcomes
207 | ev(s[1:100])
208 | # summary of first 1000 outcomes
209 | ev(s[1:1000])
210 | # summary of first 5000 outcomes
211 | ev(s)
212 |
213 | ## ----B05113_06_01x-------------------------------------------------------
214 | p <- numeric(length(s))
215 | for(i in 1:length(s)){
216 | p[i] <- sum(s[1:i]) / i
217 | }
218 | plot(p, type = "l", xlab = "number of coin flips", ylab = "p(A)")
219 | abline(h = 0.5)
220 |
221 | ## ----B05113_06_02x-------------------------------------------------------
222 | plotbinomcoin <- function(n){
223 | plot(0:n/n, dbinom(0:n, n, 0.5), type = "h", xlab = "relative frequency", ylab = "p")
224 | }
225 | par(mar = c(4,4,0.5,0.5), mfrow = c(2,2))
226 | plotbinomcoin(20)
227 | plotbinomcoin(100)
228 | plotbinomcoin(1000)
229 | plotbinomcoin(5000)
230 |
231 | ## ------------------------------------------------------------------------
232 | newlines <- function(n = 5000){
233 | s <- sample(c(FALSE,TRUE), n, replace = TRUE)
234 | for(i in 1:length(s)){
235 | p[i] <- mean(s[1:i])
236 | }
237 | p
238 | }
239 |
240 | ## ----B05113_06_03x, results='hide'---------------------------------------
241 | plot(p, type = "l", xlab = "number of coin flips", ylim = c(0,1), ylab = "p(A)")
242 | abline(h = 0.5)
243 | replicate(10, lines(newlines()))
244 |
245 | ## ----B05113_06_04x, cache=TRUE-------------------------------------------
246 | plot(p, type = "l", xlab = "number of coin flips", ylim = c(0,1), ylab = "p(A)")
247 | abline(h = 0.5)
248 | ## calculate 1000 sequences
249 | n1 <- replicate(1000, newlines())
250 | med <- apply(n1, 1, median)
251 | q1 <- apply(n1, 1, quantile, 0.025)
252 | q2 <- apply(n1, 1, quantile, 0.975)
253 | apply(n1, 2, lines)
254 | lines(med, col = "red")
255 | lines(q1, col = "green", cex = 2)
256 | lines(q2, col = "green", cex = 2)
257 |
258 | ## ---- echo=FALSE, eval=FALSE---------------------------------------------
259 | ## plot(function(x) dnorm(x), -4, 4,
260 | ## main = "normal")
261 | ## curve(dnorm(x), add = TRUE, col = "red", lwd = 2)
262 |
263 | ## ---- warning=FALSE, message=FALSE---------------------------------------
264 | cltSim <- function (n = 1, reps = 10000, nclass = 16, pop = TRUE, estimator = mean) {
265 | old.par <- par(oma = c(0, 0, 1.5, 0), mfrow = c(2, 2), mar = c(4,4,2,0.5))
266 | on.exit(par(old.par))
267 | ## normal:
268 | norm.mat <- matrix(rnorm(n * reps), ncol = n)
269 | norm.mean <- apply(norm.mat, 1, estimator)
270 | x <- seq(min(norm.mean), max(norm.mean), length = 50)
271 | normmax <- max(dnorm(x, mean(norm.mean), sd(norm.mean)))
272 | tmp.hist <- hist(norm.mean, plot = FALSE, prob = TRUE, nclass = nclass)
273 | normmax <- max(tmp.hist$density, normmax) * 1.05
274 | hist(norm.mean, main = "normal", xlab = "x", col = "skyblue",
275 | prob = TRUE, ylim = c(0, normmax), nclass = nclass)
276 | lines(x, dnorm(x, mean(norm.mean), sd(norm.mean)))
277 | ## exponential:
278 | exp.mat <- matrix(rexp(n * reps, 1/3), ncol = n)
279 | exp.mean <- apply(exp.mat, 1, estimator)
280 | x <- seq(min(exp.mean), max(exp.mean), length = 50)
281 | expmax <- max(dnorm(x, mean(exp.mean), sd(exp.mean)))
282 | tmp.hist <- hist(exp.mean, plot = FALSE, prob = TRUE, nclass = nclass)
283 | expmax <- max(tmp.hist$density, expmax) * 1.05
284 | hist(exp.mean, main = "exponential", xlab = "x", col = "skyblue",
285 | prob = TRUE, ylim = c(0, expmax), nclass = nclass)
286 | if(pop) lines(x, dexp(x, 1/3)) else lines(x, dnorm(x, mean(exp.mean), sd(exp.mean)))
287 | ## uniform:
288 | unif.mat <- matrix(runif(n * reps), ncol = n)
289 | unif.mean <- apply(unif.mat, 1, estimator)
290 | x <- seq(min(unif.mean), max(unif.mean), length = 50)
291 | unimax <- max(dnorm(x, mean(unif.mean), sd(unif.mean)))
292 | tmp.hist <- hist(unif.mean, plot = FALSE, prob = TRUE, nclass = nclass)
293 | unimax <- max(tmp.hist$density, unimax) * 1.05
294 | hist(unif.mean, main = "uniform", xlab = "x", col = "skyblue",
295 | prob = TRUE, ylim = c(0, unimax), nclass = nclass)
296 | if(pop) lines(x, dunif(x)) else lines(x, dnorm(x, mean(unif.mean), sd(unif.mean)))
297 | ## Beta:
298 | beta.mat <- matrix(rbeta(n * reps, 0.35, 0.25), ncol = n)
299 | beta.mean <- apply(beta.mat, 1, estimator)
300 | x <- seq(min(beta.mean), max(beta.mean), length = 50)
301 | betamax <- max(dnorm(x, mean(beta.mean), sd(beta.mean)))
302 | tmp.hist <- hist(beta.mean, plot = FALSE, prob = TRUE, nclass = nclass)
303 | betamax <- max(tmp.hist$density, betamax)
304 | hist(beta.mean, main = "Beta", xlab = "x", col = "skyblue",
305 | prob = TRUE, ylim = c(0, betamax), nclass = nclass)
306 | if(pop){
307 | lines(x, dbeta(x, 0.35, 0.25))
308 | mtext(paste("Populations"), outer = TRUE, cex = 1.2)
309 | } else {
310 | lines(x, dnorm(x, mean(beta.mean), sd(beta.mean)))
311 | mtext(paste("sample size =", n), outer = TRUE, cex = 1.2)
312 | }
313 | }
314 |
315 | ## ----B05113_06_08, warning=FALSE, message=FALSE--------------------------
316 | cltSim()
317 |
318 | ## ----B05113_06_09, warning=FALSE, message=FALSE--------------------------
319 | cltSim(2, pop = FALSE)
320 |
321 | ## ----B05113_06_10, warning=FALSE, message=FALSE--------------------------
322 | cltSim(10, pop = FALSE)
323 |
324 | ## ----B05113_06_11, warning=FALSE, message=FALSE--------------------------
325 | cltSim(n = 100, pop = FALSE, estimator = median)
326 |
327 | ## ------------------------------------------------------------------------
328 | library("car")
329 | data("Prestige")
330 | m <- mean(Prestige$income)
331 | m
332 | p <- dim(Prestige)[1]
333 | se <- sd(Prestige$income) / sqrt(p)
334 | tval <- qt(0.975, df = p - 1)
335 | cat(paste("KI: [", round(m - tval * se, 2), ",", round(m + tval * se, 2), "]"))
336 |
337 | ## ---- B05113_06_12-------------------------------------------------------
338 | set.seed(11112)
339 | alpha <- 0.05
340 | normval <- qnorm(1 - alpha/2)
341 | numsamp <- 50; numsim <- 10
342 | normmat <- matrix(0, nrow = numsim, ncol = 2)
343 | y <- 1:numsim; ymat <- rbind(y, y)
344 | for (i in 1:numsim) {
345 | samp <- rexp(numsamp) # generate random exponentials
346 | sampmean <- mean(samp)
347 | sampse <- sqrt(var(samp) / numsamp)
348 | normmat[i, ] <- c(sampmean - normval * sampse, sampmean + normval * sampse)
349 | }
350 | matplot(t(normmat), ymat , pch = " ", yaxt = "n", ylab = "", xlab="confidence intervals") # empty plot
351 | matlines(t(normmat), ymat, lty = rep(1, numsim), col = 1)
352 | abline(v = 1)
353 |
354 | ## ------------------------------------------------------------------------
355 | sessionInfo()
356 |
357 |
--------------------------------------------------------------------------------
/Chapter 10/chapter10.R:
--------------------------------------------------------------------------------
1 | ## ------------------------------------------------------------------------
2 | library("mvtnorm")
3 | synth <- rmvnorm(100, mean = rep(0,5), sigma = diag(5))
4 | ## first three observations
5 | head(synth, 3)
6 |
7 | ## ------------------------------------------------------------------------
8 | data(Prestige, package = "car")
9 | ## first three observations of Prestige
10 | head(Prestige, 3)
11 | ## subset of variables
12 | real <- Prestige[, 1:4]
13 |
14 | ## ------------------------------------------------------------------------
15 | ## set seed for reproducibility
16 | set.seed(12)
17 | ## simulate from multivariate normal
18 | synth2 <- data.frame(rmvnorm(100, mean = colMeans(real), sigma = cov(real)))
19 | colnames(synth2) <- colnames(real)
20 | ## first three observations
21 | head(synth2, 3)
22 |
23 | ## ------------------------------------------------------------------------
24 | summary(real$women)
25 | summary(synth2$women)
26 |
27 | ## ----B05113_10_01--------------------------------------------------------
28 | par(mar = c(4,4,0.2,0.2))
29 | plot(prestige ~ income, data = real)
30 | points(prestige ~ income, data = synth2, col = "red", pch = 20)
31 | legend("bottomright", legend = c("original/real", "synthetic"), col = 1:2, pch = c(1,20))
32 |
33 | ## ----B05113_10_02--------------------------------------------------------
34 | library("robustbase")
35 | cv <- covMcd(real)
36 | synth3 <- rmvnorm(100, mean = cv$center, sigma = cv$cov)
37 | par(mfrow = c(1,2), mar = c(4,4,0.2,0.2))
38 | plot(prestige ~ income, data = real)
39 | points(prestige ~ income, data = synth3, col = "red", pch = 20)
40 | ## add outliers
41 | rmd <- mahalanobis(real, center = cv$center, cov = cv$cov)
42 | ## outliers defined by large Mahalanobis distances
43 | out <- rmd > qchisq(0.975, ncol(real) - 1)
44 | cv_good <- covMcd(real[!out, ])
45 | ## simulate good points
46 | synth3_good <- rmvnorm(100, mean = cv_good$center, sigma = cv_good$cov)
47 | cv_out <- covMcd(real[out, ])
48 | ## simulate outliers
49 | synth3_out <- rmvnorm(100, mean = cv_out$center, sigma = cv_out$cov)
50 | ## Figure 10.2.
51 | plot(prestige ~ income, data = real)
52 | points(prestige ~ income, data = synth3_good, col = "red", pch = 20)
53 | points(prestige ~ income, data = synth3_out, col = "red", pch = 20)
54 |
55 | ## ---- warning=FALSE,message=FALSE----------------------------------------
56 | synth4 <- real
57 | lm1 <- lm(education ~ ., data = real)
58 | synth4$education <- predict(lm1, synth4[, 2:ncol(synth4)]) + sample(residuals(lm1))
59 |
60 | ## ---- warning=FALSE,message=FALSE----------------------------------------
61 | p <- ncol(real)
62 | for(i in 1:ncol(real)){
63 | df <- real[, i]
64 | df <- cbind(df, real[,-i])
65 | colnames(df)[1] <- "response"
66 | lm1 <- lm(response ~ ., data = df)
67 | synth4[, i] <- predict(lm1, synth4[, -i]) + sample(residuals(lm1))
68 | }
69 |
70 | ## ------------------------------------------------------------------------
71 | simLatent <- function(n = 200, p = 50, k = 3){
72 | T <- matrix(rnorm(n * k, 0, 1), ncol = k)
73 | B <- matrix(runif(p * k, -1, 1), ncol = k)
74 | X <- T %*% t(B)
75 | E <- matrix(rnorm(n * p, 0, 0.1), ncol = p)
76 | XE <- X + E
77 | return(XE)
78 | }
79 |
80 | ## ------------------------------------------------------------------------
81 | x <- simLatent(n = 50, p = 1000, k = 6)
82 | dim(x)
83 |
84 | ## ---- warning=FALSE,message=FALSE----------------------------------------
85 | library("simPop")
86 | data("eusilcS")
87 | dim(eusilcS)
88 |
89 | ## ------------------------------------------------------------------------
90 | length(unique(eusilcS$db030))
91 |
92 | ## ------------------------------------------------------------------------
93 | inp <- specifyInput(eusilcS, hhid = "db030", hhsize = "hsize",
94 | strata = "db040", weight = "rb050")
95 |
96 | ## ------------------------------------------------------------------------
97 | print(inp)
98 |
99 | ## ---- warning=FALSE,message=FALSE----------------------------------------
100 | synthP <- simStructure(data = inp,
101 | method = "direct",
102 | basicHHvars = c("age", "rb090", "db040"))
103 |
104 | ## ------------------------------------------------------------------------
105 | synthP <- simCategorical(synthP,
106 | additional = c("pl030", "pb220a"),
107 | method = "multinom")
108 |
109 | ## ---- cache=TRUE---------------------------------------------------------
110 | synthP <- simContinuous(synthP, additional = "netIncome",
111 | upper = 200000, equidist = FALSE,
112 | imputeMissings = FALSE)
113 |
114 | ## ------------------------------------------------------------------------
115 | synthP
116 |
117 | ## ---- results='hide', message=FALSE,warning=FALSE------------------------
118 | library("mice")
119 | library("VIM")
120 | x <- orig <- simLatent(n = 50, p = 10, k = 6)
121 | ## evaluation criteria
122 | eval <- function(real, imputed, nas){
123 | sqrt(sum((real - imputed)^2)) / nas
124 | }
125 | set.seed(123)
126 | R <- 100
127 | e1 <- e2 <- e3 <- e4 <- numeric(R)
128 | for(i in 1:R){
129 | x <- orig
130 | x[sample(1:nrow(x), 10), 1] <- NA
131 | e1[i] <- eval(orig, e1071::impute(x), 10)
132 | e2[i] <- eval(orig, kNN(data.frame(x), imp_var = FALSE), 10)
133 | e3[i] <- eval(orig, irmi(x), 10)
134 | e4[i] <- eval(orig, complete(mice(x, m = 1, printFlag = FALSE)), 10)
135 | }
136 | df <- data.frame("error" = c(e1,e2,e3,e4), method = rep(c("mean", "kNN", "irmi", "mice"), each = R))
137 |
138 | ## ----B05113_10_03--------------------------------------------------------
139 | library("ggplot2")
140 | ggplot(df, aes(x = method, y=error)) + geom_boxplot() + theme(text = element_text(size = 20)) + theme_bw()
141 |
142 | ## ------------------------------------------------------------------------
143 | library("robustbase")
144 | set.seed(123)
145 | x <- rexp(n = 50, rate = 1)
146 | mean(x)
147 | huberM(x)$mu
148 |
149 | ## ------------------------------------------------------------------------
150 | m <- mean(replicate(10000, mean(rexp(n = 50, rate = 1))))
151 | m
152 | m - 1
153 |
154 | ## ------------------------------------------------------------------------
155 | mh <- mean(replicate(10000, huberM(rexp(n =50, rate = 1))$mu))
156 | mh
157 | mh - 1
158 |
159 | ## ------------------------------------------------------------------------
160 | set.seed(123)
161 | alpha <- 0.05
162 | ci <- function(x, z = qnorm(1 - alpha / 2)){
163 | s <- rexp(n = 50, rate = 1)
164 | m <- mean(s)
165 | se <- sd(s) / sqrt(50)
166 | ci_est <- c(m - z * se, m + z *se)
167 | ci_est
168 | }
169 | ci()
170 |
171 | ## ---- cache=TRUE---------------------------------------------------------
172 | set.seed(123)
173 | ciR_n <- replicate(100000, ci())
174 | isCovered <- function(x){
175 | apply(x, 2, function(x){
176 | if(x[1] > 1 & x[2] > 1) return(FALSE)
177 | if(x[1] < 1 & x[2] < 1) return(FALSE)
178 | return(TRUE)})
179 | }
180 | cn <- isCovered(ciR_n)
181 | sum(cn) / length(cn)
182 |
183 | ## ---- cache=TRUE---------------------------------------------------------
184 | ciR_t <- replicate(100000, ci(z = qt(1 - alpha / 2, 49)))
185 | ct <- isCovered(ciR_t)
186 | sum(ct) / length(ct)
187 |
188 | ## ---- cache = TRUE-------------------------------------------------------
189 | ci_boot <- function(x, R = 1000){
190 | s <- rexp(n = 50, rate = 1)
191 | ci_est <- quantile(replicate(R,
192 | mean(sample(s, replace = TRUE))),
193 | c(0.025, 0.975))
194 | return(ci_est)
195 | }
196 | ciR_boot <- replicate(1000, ci_boot())
197 | cb <- isCovered(ciR_boot)
198 | sum(cb) / length(cb)
199 |
200 | ## ----B05113_10_04, message=FALSE,warning=FALSE---------------------------
201 | df <- data.frame(t(ciR_n))
202 | df <- data.frame(rbind(t(ciR_n), t(ciR_t), t(ciR_boot)))
203 | df$method <- rep(c("normal", "t", "boot"), times = c(100000,100000,1000))
204 | colnames(df) <- c("lower", "upper", "method")
205 | library("reshape2")
206 | df <- melt(df)
207 | library("ggplot2")
208 | ggplot(df, aes(x = value, colour = method)) + geom_density() + facet_wrap(~ variable) + theme(text = element_text(size=16))
209 |
210 | ## ---- cache = TRUE, message=FALSE,warning=FALSE--------------------------
211 | simMean <- function(simFun = function(x) rnorm(100)){
212 | ## 1000 samples
213 | set.seed(123)
214 | R <- 1000
215 | m <- list()
216 | ## 1000 data sets
217 | for(i in 1:R){
218 | m[[i]] <- simFun()
219 | }
220 | ## estimation
221 | df <- data.frame("thetahat" = c(sapply(m, mean), sapply(m, mean, trim = 0.1), sapply(m, median), sapply(m, function(x) huberM(x)$mu)),
222 | "method" = rep(c("mean","trim","median","huber"), each = R))
223 | ## summary
224 | vm <- var(df[df$method == "mean", 1])
225 | df %>%
226 | group_by(method) %>%
227 | summarize("bias" = mean(thetahat) - 0,
228 | "variance" = var(thetahat),
229 | "mse" = variance + bias^2,
230 | "re" = vm / var(thetahat))
231 | }
232 |
233 | ## ------------------------------------------------------------------------
234 | library("robustbase"); library("dplyr")
235 | simMean()
236 |
237 | ## ---- cache = TRUE, warning=FALSE, message=FALSE-------------------------
238 | set.seed(123)
239 | simMean(simFun = function(){c(rnorm(95), rnorm(5,15))})
240 |
241 | ## ---- message=FALSE, warning=FALSE---------------------------------------
242 | library("simFrame"); library("robCompositions"); library("mvtnorm"); library("mice")
243 | set.seed(123)
244 |
245 | ## ------------------------------------------------------------------------
246 | ## data generation
247 | crnorm <- function(n, mean, sigma) data.frame(isomLRinv(rmvnorm(n, mean, sigma)))
248 | sigma <- matrix(c(1, -0.5, 1.4, -0.5, 1, -0.6, 1.4, -0.6, 2), 3, 3)
249 | ## data control class
250 | dc <- DataControl(size = 150, distribution = crnorm,
251 | dots = list(mean = c(0, 2, 3), sigma = sigma))
252 |
253 | ## ------------------------------------------------------------------------
254 | nc <- NAControl(NArate = c(0.05, 0.1))
255 |
256 | ## ------------------------------------------------------------------------
257 | sim <- function(x, orig) {
258 | i <- apply(x, 1, function(x) any(is.na(x)))
259 | ni <- length(which(i))
260 | x <- x[, -ncol(x)]
261 | xMean <- e1071::impute(x)
262 | xMice <- mice(x, printFlag = FALSE, diagnostics = FALSE, m = 1)
263 | xMice <- complete(xMice)
264 | xKNNa <- impKNNa(x)$xImp
265 | xLS <- impCoda(x, method = "lm")$xImp
266 | xLTSrob <- impCoda(x, method = "ltsReg")$xImp
267 | c(xMean = aDist(xMean, orig)/ni,
268 | xMice = aDist(xMice, orig)/ni,
269 | knn = aDist(xKNNa, orig)/ni,
270 | LS = aDist(xLS, orig)/ni,
271 | LTSrob = aDist(xLTSrob, orig)/ni)
272 | }
273 |
274 | ## ---- cache = TRUE, message=FALSE, warning=FALSE-------------------------
275 | results <- runSimulation(dc,
276 | nrep = 25,
277 | NAControl = nc,
278 | fun = sim)
279 |
280 | ## ------------------------------------------------------------------------
281 | aggregate(results)
282 |
283 | ## ----B05113_10_05--------------------------------------------------------
284 | simBwplot(results)
285 |
286 | ## ---- cache=TRUE, message=FALSE, warning=FALSE---------------------------
287 | dcarc <- ContControl(target = c("X1"),
288 | epsilon = c(0.01,0.03,0.05,0.1),
289 | dots = list(mean = 150, sd = 1), type = "DCAR")
290 | results <- runSimulation(dc,
291 | nrep = 3,
292 | NAControl = nc,
293 | contControl = dcarc,
294 | fun = sim)
295 | aggregate(results)
296 |
297 | ## ----B05113_10_06, message=FALSE, warning=FALSE, fig.width=7, fig.height=7----
298 | simBwplot(results)
299 |
300 | ## ------------------------------------------------------------------------
301 | sim2 <- function(x, orig) {
302 | rdcm <- function(x, y){
303 | ocov <- cov(isomLR(x))
304 | rcov <- cov(isomLR(y))
305 | return(frobenius.norm(ocov-rcov)/frobenius.norm(ocov))
306 | }
307 | i <- apply(x, 1, function(x) any(is.na(x)))
308 | ni <- length(which(i))
309 | x <- x[, -ncol(x)]
310 | xMean <- e1071::impute(x)
311 | xMice <- mice(x, printFlag = FALSE, diagnostics = FALSE, m = 1)
312 | xMice <- complete(xMice)
313 | xKNNa <- impKNNa(x)$xImp
314 | xLS <- impCoda(x, method = "lm")$xImp
315 | xLTSrob <- impCoda(x, method = "ltsReg")$xImp
316 | c(xMean = rdcm(xMean, orig),
317 | xMice = rdcm(xMice, orig),
318 | knn = rdcm(xKNNa, orig),
319 | LS = rdcm(xLS, orig),
320 | LTSrob = rdcm(xLTSrob, orig))
321 | }
322 |
323 | ## ---- cache=TRUE, message=FALSE, warning=FALSE---------------------------
324 | library("matrixcalc")
325 | results <- runSimulation(dc,
326 | nrep = 3,
327 | NAControl = nc,
328 | contControl = dcarc,
329 | fun = sim2)
330 | aggregate(results)
331 |
332 | ## ------------------------------------------------------------------------
333 | data("eusilcP")
334 |
335 | ## ------------------------------------------------------------------------
336 | colnames(eusilcP)
337 |
338 | ## ------------------------------------------------------------------------
339 | sim <- function(x, k) {
340 | require("laeken")
341 | x <- x[!is.na(x$eqIncome), ]
342 | ## classical Gini
343 | g <- gini(x$eqIncome, x$.weight)$value
344 | ## Hill estimator
345 | eqIncHill <- fitPareto(x$eqIncome, k = k, method = "thetaHill",
346 | groups = x$hid)
347 | gHill <- gini(eqIncHill, x$.weight)$value
348 | ## partial density component estimator
349 | eqIncPDC <- fitPareto(x$eqIncome, k = k, method = "thetaPDC",
350 | groups = x$hid)
351 | gPDC <- gini(eqIncPDC, x$.weight)$value
352 | ## results as a vector
353 | c(standard = g, Hill = gHill, PDC = gPDC)
354 | }
355 |
356 | ## ------------------------------------------------------------------------
357 | sc <- SampleControl(grouping = "hid", size = 1500, k = 100)
358 |
359 | ## ---- cache = TRUE-------------------------------------------------------
360 | library("laeken") # for function gini
361 | set.seed(123)
362 | ## run the simulation
363 | results <- runSimulation(eusilcP, sc, fun = sim, k = 175)
364 |
365 | ## ------------------------------------------------------------------------
366 | head(results)
367 |
368 | ## ------------------------------------------------------------------------
369 | aggregate(results)
370 |
371 | ## ----B05113_10_08--------------------------------------------------------
372 | tv <- laeken::gini(eusilcP$eqIncome)$value
373 | plot(results, true = tv)
374 |
375 | ## ---- message=FALSE,warning=FALSE----------------------------------------
376 | set.seed(12345)
377 | sc <- SampleControl(design = "region", grouping = "hid",
378 | size = c(75, 250, 250, 125, 200, 225, 125, 150, 100),
379 | k = 100)
380 | ## run new simulation
381 | results <- runSimulation(eusilcP, sc, fun = sim, k = 175)
382 |
383 | ## ------------------------------------------------------------------------
384 | head(results)
385 |
386 | ## ------------------------------------------------------------------------
387 | aggregate(results)
388 |
389 | ## ----B05113_10_07--------------------------------------------------------
390 | tv <- gini(eusilcP$eqIncome)$value
391 | plot(results, true = tv)
392 |
393 | ## ------------------------------------------------------------------------
394 | set.seed(12345)
395 | ## define contamination
396 | cc <- DCARContControl(target = "eqIncome", epsilon = 0.005,
397 | grouping = "hid", dots = list(mean = 5e+05, sd = 10000))
398 | ## run new simulation
399 | results <- runSimulation(eusilcP, sc, contControl = cc, fun = sim, k = 175)
400 |
401 | ## ------------------------------------------------------------------------
402 | head(results)
403 |
404 | ## ------------------------------------------------------------------------
405 | aggregate(results)
406 |
407 | ## ----B05113_10_09--------------------------------------------------------
408 | tv <- gini(eusilcP$eqIncome)$value
409 | plot(results, true = tv)
410 |
411 | ## ------------------------------------------------------------------------
412 | library("simFrame")
413 | library("laeken")
414 | data("eusilcP")
415 |
416 | ## ------------------------------------------------------------------------
417 | set.seed(12345)
418 | sc <- SampleControl(design = "region", grouping = "hid",
419 | size = c(75, 250, 250, 125, 200, 225, 125, 150, 100), k = 100)
420 | cc <- DCARContControl(target = "eqIncome", epsilon = 0.005,
421 | grouping = "hid", dots = list(mean = 5e+05, sd = 10000))
422 | results <- runSimulation(eusilcP, sc, contControl = cc,
423 | design = "gender", fun = sim, k = 125)
424 |
425 | ## ----B05113_10_10--------------------------------------------------------
426 | tv <- simSapply(eusilcP, "gender", function(x) gini(x$eqIncome)$value)
427 | plot(results, true = tv)
428 |
429 | ## ------------------------------------------------------------------------
430 | set.seed(12345)
431 | sc <- SampleControl(design = "region", grouping = "hid",
432 | size = c(75, 250, 250, 125, 200, 225, 125, 150, 100), k = 100)
433 | cc <- DCARContControl(target = "eqIncome", epsilon = c(0, 0.0025, 0.005, 0.0075, 0.01), dots = list(mean = 5e+05, sd = 10000))
434 | results <- runSimulation(eusilcP, sc, contControl = cc,
435 | design = "gender", fun = sim, k = 125)
436 |
437 | ## ------------------------------------------------------------------------
438 | head(results)
439 |
440 | ## ------------------------------------------------------------------------
441 | aggregate(results)
442 |
443 | ## ----B05113_10_11--------------------------------------------------------
444 | tv <- simSapply(eusilcP, "gender", function(x) gini(x$eqIncome)$value)
445 | plot(results, true = tv)
446 |
447 | ## ------------------------------------------------------------------------
448 | set.seed(12345)
449 | sc <- SampleControl(design = "region", grouping = "hid",
450 | size = c(75, 250, 250, 125, 200, 225, 125, 150, 100), k = 50)
451 | cc <- DCARContControl(target = "eqIncome", epsilon = c(0, 0.005, 0.01), dots = list(mean = 5e+05, sd = 10000))
452 | nc <- NAControl(target = "eqIncome", NArate = c(0, 0.05))
453 | results <- runSimulation(eusilcP, sc, contControl = cc,
454 | NAControl = nc, design = "gender", fun = sim, k = 125)
455 |
456 | ## ------------------------------------------------------------------------
457 | aggregate(results)
458 |
459 | ## ----B05113_10_12--------------------------------------------------------
460 | tv <- simSapply(eusilcP, "gender", function(x) gini(x$eqIncome)$value)
461 | plot(results, true = tv)
462 |
463 | ## ------------------------------------------------------------------------
464 | sessionInfo()
465 |
466 |
--------------------------------------------------------------------------------
/Chapter 7/chapter7.R:
--------------------------------------------------------------------------------
1 | ## ---- message=FALSE,warning=FALSE----------------------------------------
2 | dat <- matrix(c(104,11037,189,11034),2,2, byrow=TRUE)
3 | dat
4 | library("vcd")
5 | ## confidence intervals
6 | confint(oddsratio(dat, log=FALSE))
7 |
8 | ## ---- cache=TRUE---------------------------------------------------------
9 | ## original surveyed data
10 | s1 <- rep(c(TRUE, FALSE), times = c(104, 11037))
11 | s2 <- rep(c(TRUE, FALSE), times = c(189, 11034))
12 | ## function for drawing a bootstrap sample
13 | ## and estimating the boostrap replicate
14 | boot_aspirin <- function(s1, s2){
15 | ## odds ratio
16 | sum(sample(s1, replace = TRUE)) / sum(sample(s2, replace = TRUE))
17 | }
18 | ## 10000 draws and replicates
19 | boot_repl <- replicate(10000, boot_aspirin(s1, s2))
20 | ## confidence interval
21 | quantile(boot_repl, c(0.025, 0.975))
22 |
23 | ## ------------------------------------------------------------------------
24 | x <- c(5, 7, 8, 2, 15, 12, 3)
25 |
26 | ## ------------------------------------------------------------------------
27 | ## for reproducibility we use a seed
28 | set.seed (123)
29 | ## bootstrap sample (with replacement)
30 | s1 <- sample(x, replace = TRUE)
31 | s1
32 |
33 | ## ------------------------------------------------------------------------
34 | s2 <- sample(x, replace = TRUE)
35 | s2
36 |
37 | ## ------------------------------------------------------------------------
38 | mean(x)
39 |
40 | ## ------------------------------------------------------------------------
41 | mean(s1)
42 | mean(s2)
43 |
44 | ## ------------------------------------------------------------------------
45 | library("car")
46 | data("Prestige")
47 | mean(Prestige$income)
48 |
49 | ## ------------------------------------------------------------------------
50 | set.seed(123)
51 | mean(sample(Prestige$income, replace = TRUE))
52 |
53 | ## ------------------------------------------------------------------------
54 | mean(sample(Prestige$income, replace = TRUE))
55 |
56 | ## ------------------------------------------------------------------------
57 | library("robustbase")
58 | ## data
59 | df <- Prestige[, c("income", "prestige")]
60 | ## robust MCD-based covariance
61 | covMcd(df, cor=TRUE)$cor
62 |
63 | ## ------------------------------------------------------------------------
64 | set.seed(1234) ## for reproducibility (seed)
65 | ## standard error with bootstrap
66 | sd(replicate(200,
67 | covMcd(df[sample(rownames(df), replace=TRUE), ],
68 | cor=TRUE)$cor[1,2]))
69 |
70 | ## ---- message=FALSE,warning=FALSE----------------------------------------
71 | library("boot")
72 | ## function for bootstrapping in boot
73 | cr <- function(d, w) covMcd(d[w, ], cor=TRUE)$cor[1,2]
74 | ## application of the boot function
75 | boot(data=df, statistic=cr, R=200)
76 |
77 | ## ------------------------------------------------------------------------
78 | ## MASS needed for drawing random numbers from multivariate normal
79 | library("MASS")
80 | ## parameters from empirical data (income and prestige)
81 | m1 <- colMeans(df)
82 | m2 <- cov(df)
83 | ## number of observations
84 | n <- dim(df)[1]
85 | ## parametric bootstrap
86 | parboot <- replicate(200, covMcd(mvrnorm(n, mu=m1, Sigma=m2), cor=TRUE)$cor[1,2])
87 | ## standard error
88 | sd(parboot)
89 |
90 | ## ------------------------------------------------------------------------
91 | ## parametric bootstrap
92 | system.time(sd(replicate(5000,
93 | cor(mvrnorm(n, mu=m1, Sigma=m2))[1,2])))
94 | ## non-parametric bootstrap
95 | system.time(sd(replicate(5000,
96 | cor(df[sample(rownames(df), replace=TRUE), ])[1,2])))
97 |
98 | ## ------------------------------------------------------------------------
99 | ## parametric bootstrap
100 | range(replicate(20, sd(replicate(50,
101 | cor(mvrnorm(n, mu=m1, Sigma=m2))[1,2]))))
102 | ## non-parametric bootstrap
103 | range(replicate(20, sd(replicate(50,
104 | cor(df[sample(rownames(df), replace=TRUE), ])[1,2]))))
105 |
106 | ## ----B05113_07_03--------------------------------------------------------
107 | ## parametric bootstrap
108 | pboot <-replicate(1000,
109 | cor(mvrnorm(n, mu=m1, Sigma=m2))[1,2])
110 | ## non-parametric bootstrap
111 | npboot <- replicate(1000,
112 | cor(df[sample(rownames(df),
113 | replace=TRUE), ])[1,2])
114 | mi <- min(pboot, npboot)
115 | ma <- max(pboot, npboot)
116 | ## Now plot Figure 7.3.
117 | par(mfrow=c(1,2), pty="s")
118 | hist(npboot,
119 | main="non-parametric",
120 | xlab="1000 bootstrap replicates",
121 | xlim=c(mi,ma), breaks = 25)
122 | hist(pboot,
123 | main="parametric",
124 | xlab="1000 bootstap replicates",
125 | xlim=c(mi,ma), breaks = 25)
126 |
127 | ## ------------------------------------------------------------------------
128 | x <- Prestige[, "income"]
129 | v <- function(x) sd(x) / mean(x)
130 | v(x)
131 |
132 | ## ------------------------------------------------------------------------
133 | vboot <- replicate(1000, v(sample(x, replace = TRUE)))
134 |
135 | ## ------------------------------------------------------------------------
136 | vbias <- mean(vboot) - v(x)
137 | vbias
138 |
139 | ## ------------------------------------------------------------------------
140 | v(x) - vbias
141 |
142 | ## ------------------------------------------------------------------------
143 | cat("CI(e): [", v(x) - vbias - qt(0.975, length(x)-1) * sd(vboot), ", ", v(x) - vbias + qt(0.975, length(x)-1) * sd(vboot), " ]\n")
144 |
145 | ## ------------------------------------------------------------------------
146 | cat("CI(p): [", quantile(vboot, 0.025), ", ", quantile(vboot, 0.975), " ]\n")
147 |
148 | ## ------------------------------------------------------------------------
149 | cat("CI(h): [", 2*v(x) - quantile(vboot, 0.975), ", ", 2*v(x) - quantile(vboot, 0.025), " ]\n")
150 |
151 | ## ------------------------------------------------------------------------
152 | ## some crazy data (10 outliers)
153 | x <- c(rnorm(100), rnorm(10,10))
154 | ## non-parametric bootstrap replicates
155 | mb <- replicate(10000, mean(sample(x, replace=TRUE)))
156 | ## percentile method
157 | cat("\nCI(perc): [", quantile(mb, 0.025), ", ", quantile(mb, 0.975), " ]\n")
158 | ## BCa method
159 | library("bootstrap")
160 | b <- bcanon(x, 10000, mean, alpha=c(0.025,0.975))
161 | cat("\nCI(BCa): [", b$confpoints[1,2], ", ", b$confpoints[2,2], " ]\n")
162 |
163 | ## ----B05113_07_04, echo=FALSE--------------------------------------------
164 | n <- length(x)
165 | ## histogram of data
166 | hist(x, main="Daten")
167 | ## draw mean
168 | segments(x0=mean(x), x1=mean(x), y0=-1, y1=10, col="red", lwd=3)
169 | ## draw confidence intervals
170 | arrows(x0=quantile(mb, 0.025), x1=quantile(mb, 0.975), y0=5, y1=5, code=3, length=0.1)
171 | arrows(x0=b$confpoints[1,2], x1=b$confpoints[2,2], y0=4, y1=4, code=3, length=0.1, col="blue")
172 | arrows(x0=mean(x)-qt(0.975, n-1)*sd(x)/sqrt(n), x1=mean(x)+qt(0.975, n-1)*sd(x)/sqrt(n), y0=3, y1=3, code=3, length=0.1, col="brown")
173 | h1 <- 2*mean(x) - quantile(mb, 0.975)
174 | h2 <- 2*mean(x) - quantile(mb, 0.025)
175 | arrows(x0=h1, x1=h2, y0=2, y1=2, code=3, length=0.1, col="orange")
176 | bias <- mean(mb) - mean(x)
177 | h1 <- mean(x) - bias - qt(0.975, n-1) * sd(mb)
178 | h2 <- mean(x) - bias + qt(0.975, n-1) * sd(mb)
179 | arrows(x0=h1, x1=h2, y0=1, y1=1, code=3, length=0.1, col="grey")
180 | legend("topright", legend=c("percentile", "BCa", "classical", "Hall", "bias corrected","arithm. mean"),
181 | lwd=c(1,1,1,1,1,3), col=c("black","blue","brown","orange","grey","red"))
182 |
183 | ## ----B05113_07_05, echo = FALSE------------------------------------------
184 | par(mar = c(1,3,0.5,0.1))
185 | hist(x, main="", xlim=c(0.1,1.9), ylim=c(0,10))
186 | segments(x0=mean(x), x1=mean(x), y0=-1, y1=10, col="red", lwd=3)
187 | arrows(x0=quantile(mb, 0.025), x1=quantile(mb, 0.975), y0=5, y1=5, code=3, length=0.1)
188 | arrows(x0=b$confpoints[1,2], x1=b$confpoints[2,2], y0=4, y1=4, code=3, length=0.1, col="blue")
189 | arrows(x0=mean(x)-qt(0.975, n-1)*sd(x)/sqrt(n), x1=mean(x)+qt(0.975, n-1)*sd(x)/sqrt(n), y0=3, y1=3, code=3, length=0.1, col="brown")
190 | h1 <- 2*mean(x) - quantile(mb, 0.975)
191 | h2 <- 2*mean(x) - quantile(mb, 0.025)
192 | arrows(x0=h1, x1=h2, y0=2, y1=2, code=3, length=0.1, col="orange")
193 | bias <- mean(mb) - mean(x)
194 | h1 <- mean(x) - bias - qt(0.975, n-1) * sd(mb)
195 | h2 <- mean(x) - bias + qt(0.975, n-1) * sd(mb)
196 | arrows(x0=h1, x1=h2, y0=1, y1=1, code=3, length=0.1, col="grey")
197 | legend("topright", legend=c("percentile", "BCa", "classical", "Hall", "bias corrected","arithm. mean"), lwd=c(1,1,1,1,1,3), col=c("black","blue","brown","orange","grey","red"))
198 |
199 | ## ------------------------------------------------------------------------
200 | ## toy data
201 | x <- c(1,2,2,2,2,2,7,8,9,10)
202 | ## remember, this is the variation coefficient
203 | v <- function(x) sd(x)/mean(x)
204 | ## initialisation
205 | n <- length(x)
206 | vjack <- rep(0, n-1)
207 | vpseudo <- rep(0, n)
208 | ## leave-one-out jackknife
209 | for(i in 1:n){
210 | vjack[i] <- v(x[-i])
211 | }
212 | ## jackknife pseudo values
213 | pseudo <- n * v(x) - (n-1)*vjack
214 | ## confidence interval with pseudo values
215 | cat("\nKI(pseudo): [", mean(pseudo) - qt(0.975, n-1) * sd(pseudo)/n, ", ", mean(pseudo) + qt(0.975, n-1) * sd(pseudo)/n, " ]\n")
216 | ## confidence interval with classical jackknife
217 | se2 <- sqrt(((n-1)/n) * sum((vjack - mean(vjack))^2))
218 | jbias <- (n-1) * (mean(vjack) - v(x))
219 | cat("\nKI(jse): [", v(x) - jbias - qt(0.975, n-1) * se2 , ", ", v(x) - jbias + qt(0.975, n-1) * se2, " ]\n")
220 |
221 | ## ------------------------------------------------------------------------
222 | quantile(replicate(10000, v(sample(x, replace = TRUE))), c(0.025, 0.975))
223 |
224 | ## ------------------------------------------------------------------------
225 | ## sample estimate
226 | median(x)
227 | ## non-parametric bootstrap
228 | qu <- quantile(replicate(10000,
229 | median(sample(x, replace = TRUE))),
230 | c(0.025, 0.975))
231 | cat("\nCI(boot): [", qu[1], ", ", qu[2], " ]\n")
232 | ## jackknife, initialisation
233 | n <- length(x)
234 | jack <- rep(0, n-1)
235 | pseudo <- rep(0, n)
236 | for(i in 1:n){
237 | jack[i] <- median(x[-i])
238 | }
239 | ## jackknife pseudo values approach
240 | pseudo <- n * median(x) - (n-1)*jack
241 | cat("\nCI(pseudo): [", mean(pseudo) - qt(0.975, n-1) * sd(pseudo)/n, ", ", mean(pseudo) + qt(0.975, n-1) * sd(pseudo)/n, " ]\n")
242 | ## classical jackknife
243 | se2 <- sqrt(((n-1)/n) * sum((jack - mean(jack))^2))
244 | jbias <- (n-1) * (mean(jack) - median(x))
245 | cat("\nCI(jse): [", median(x) - jbias - qt(0.975, n-1) * se2 , ", ", median(x) - jbias - qt(0.975, n-1) * se2, " ]\n")
246 |
247 | ## ------------------------------------------------------------------------
248 | ## all combinations
249 | co <- combn(10, 2)
250 | ## first 6 out of 45
251 | co[, 1:6]
252 | ## delete-2 jackknife replicates
253 | jack_d <- apply(co, 2, function(i) median(x[-i]))
254 | ## standard error
255 | n <- length(x)
256 | r <- 2 / n
257 | ## n over 2
258 | nd <- choose(n, 2)
259 | ## inflation factor
260 | fac <- r / nd
261 | m <- mean(jack_d)
262 | ## standard error
263 | se_d <- sqrt(fac * sum((jack_d - m)^2))
264 | ## confidence interval:
265 | cat("\nKI(jse): [", median(x) - qt(0.975, n-1) * se_d , ", ", median(x) + qt(0.975, n-1) * se_d, " ]\n")
266 |
267 | ## ------------------------------------------------------------------------
268 | choose(45, 10)
269 |
270 | ## ------------------------------------------------------------------------
271 | data(Prestige, package = "car")
272 | x <- Prestige$income
273 | v <- function(x, indices){
274 | x <- x[indices]
275 | est <- sd(x)/mean(x)
276 | return(est)
277 | }
278 |
279 | ## ----B05113_07_06--------------------------------------------------------
280 | library("boot")
281 | bx <- boot(x, v, 2000)
282 | ## Figure 7.6.
283 | jack.after.boot(bx)
284 |
285 | ## ----B05113_07_07--------------------------------------------------------
286 | set.seed(12345)
287 | ## generate some data
288 | x1 <- runif(100, 0, pi)
289 | s <- data.frame(x1 = x1, x2 = rnorm(100, 0, 0.1) + sin(x1))
290 | ## plot data points for Figure 7.6
291 | plot(s, xlab = "x", ylab = "y")
292 | ## simple model
293 | reg1 <- lm(s[, 2] ~ s[, 1], data = s)
294 | abline(reg1, lwd = 2)
295 | ## sinus model
296 | reg2 <- lm(s[, 2] ~ sin(s[, 1]), data = s)
297 | f <- function(x, coef) coef[1] + coef[2] * sin(x)
298 | ss <- seq (-0.02, 3.2, 0.01)
299 | lines(ss, f (ss, coef(reg2)), lty = 2, col = "blue", lwd = 2)
300 | ## locally reweighted regression
301 | reg3 <- lowess(x = s[, 1], y = s[, 2], f = 0.1)
302 | lines (reg3, col = "red", lty = 3, lwd = 2)
303 | ## legend for Figure 7.6
304 | legend("bottom", col = c("black", "blue", "red"),
305 | lty = c(1, 2, 3), lwd = c(2, 2, 2),
306 | legend = c(expression(y = beta[0] + beta[1]*x),
307 | expression(y = beta[0] + sin(x)),
308 | "loess, 0.1"))
309 |
310 | ## ------------------------------------------------------------------------
311 | str(s)
312 |
313 | ## ------------------------------------------------------------------------
314 | ## index of training data
315 | training_ind <- sample(1:nrow(s), 70)
316 | ## index of test data
317 | test_ind <- which(!(1:100 %in% training_ind))
318 |
319 | ## ------------------------------------------------------------------------
320 | lm1 <- lm(s[training_ind, 2] ~ s[training_ind, 1], data = s)
321 |
322 | ## ------------------------------------------------------------------------
323 | ## expected values
324 | f <- function(x) reg1$coef[1] + reg1$coef[2] * x
325 | ## prediction error, squared sum of expected and observed test data
326 | error <- sum((f(s[test_ind, 1]) - s[test_ind, 2])^2)
327 | error
328 |
329 | ## ----B05113_07_08, echo=FALSE, fig.width=7, fig.height=7, fig=TRUE-------
330 | par(mfrow = c(2,2), pty = "s", mar = c(2,2,1.5,0.5))
331 | ## training and test data
332 | training <- as.numeric(sample(rownames(s), 70))
333 | plot(s, main = "OLS regression", pch = 20, xlab = "", ylab = "")
334 | points(s[training, ], pch = 20)
335 | points(s[-training, ], col = "red", pch = 15)
336 | legend("bottom", pch = c(20,15), col = c("black","red"),
337 | legend = c("training", "test"))
338 | ## training data plus regression line
339 | plot(s[training, ], main = "OLS regression", pch = 20, xlab = "", ylab = "")
340 | reg1 <- lm(s[training, 2] ~ s[training, 1], data = s)
341 | abline(reg1)
342 | legend("bottom", pch = 20, col = "black", legend = c ("training"))
343 | ## evaluation on test data
344 | plot(s, main = "OLS regression", pch = 20, xlab = "", ylab = "")
345 | points(s[-training, ], col = "red", pch = 15)
346 | f <- function(x) reg1$coef[1] + reg1$coef[2] * x
347 | segments(x0 = s[-training, 1], x1 = s[-training, 1], y0 = s[-training, 2], y1 = f(s[-training, 1]), col = "red")
348 | abline(reg1)
349 | legend("bottom", pch = c(20,15), col = c("black", "red"), legend = c("training", "test"))
350 | error1 <- sum((f(s[-training, 1]) - s[-training, 2])^2)
351 | # error1
352 | text(x = 1.4, y = 0.16, expression(sum(e[i]^2, i = test) == 4.609), col = "red", cex=1.4)
353 | ## sinus model, evaluation
354 | plot(s, main = "OLS regression, sinus", pch = 20, xlab = "", ylab = "")
355 | points(s[-training, ], col = "red", pch = 15)
356 | reg2 <- lm(s[training, 2] ~ sin(s[training, 1]), data = s)
357 | f <- function(x) reg2$coef[1] + reg2$coef[2] * sin(x)
358 | f2 <- function(x, coef) coef[1] + coef[2] * sin(x)
359 | segments(x0 = s[-training, 1], x1 = s[-training, 1],
360 | y0 = s[-training, 2], y1 = f(s[-training, 1]), col = "blue" )
361 | #abline (reg2, col = "black")
362 | ss <- seq(-0.02,3.2,0.01)
363 | lines(ss, f2 (ss, coef (reg2)), lty = 2, col = "blue")
364 | legend("bottom", pch = c(20,15), col = c("black", "red"), legend = c("training", "test"))
365 | error1 <- sum((f(s[-training, 1]) - s[-training, 2])^2)
366 | # error1
367 | text(x = 1.4, y = 0.16, expression(sum(e[i]^2, test) == 0.467), col = "blue", cex=1.4)
368 |
369 | ## ------------------------------------------------------------------------
370 | f <- function (x) reg2$coef[1] + reg2$coef[2] * sin(x)
371 | error1 <- numeric(1000)
372 | n <- nrow(s)
373 | for (i in 1:1000){
374 | training_ind <- sample(1:n, 70)
375 | reg2 <- lm(s[training, 2] ~ sin(s[training, 1]), data = s)
376 | error1[i] <- sum((f(s[-training_ind, 1]) - s[-training_ind, 2])^2)
377 | }
378 | summary (error1)
379 |
380 | ## ----B05113_07_09, echo = FALSE, fig.width=7, fig.height=7, fig=TRUE-----
381 | par(mfrow = c(2,2), pty = "s", mar = c(2,2,1.5,0.5))
382 | for (i in 1:4){
383 | plot(s, main = paste ( "OLS, Model 2, IndexTestObs .:", i), pch = 20,
384 | xlab = "", ylab = "")
385 | training = (1:nrow(s))[-i]
386 | # print("omitted th observation:", paste(i, sep = ""))
387 | # print(training)
388 | points(s[-training, ], col = "red", pch = 15, cex = 1.4)
389 | reg2 <- lm(s[training, 2] ~ sin(s[training, 1]), data = s)
390 | f <- function (x) reg2$coef[1] + reg2$coef [2] * sin(x)
391 | f2 <- function (x, coef) coef[1] + coef[2] * sin(x)
392 | segments(x0 = s[-training, 1], x1 = s[-training, 1], y0 = s[-training, 2], y1 = f(s[-training, 1]), col = "blue" )
393 | #abline (reg2, col = "black")
394 | ss <- seq(-0.02,3.2,0.01)
395 | lines (ss, f2 (ss, coef (reg2)), lty = 2, col = "blue")
396 | legend("bottom", pch = c(20,15), col = c("black", "red"), legend = c("training", "test"))
397 | error1 <- sum ((f(s[-training, 1]) - s[-training, 2])^2)
398 | # print(paste("errors regarding the.", i, "- th observation", sep = ""))
399 | # print (error1)
400 | # cat("\n")
401 | text(x = 1.4, y = 0.16,
402 | paste("e", " = ", round(error1,5), sep=""),
403 | col="blue", cex=1.4)
404 | }
405 |
406 | ## ------------------------------------------------------------------------
407 | n <- nrow(s)
408 | error1 <- numeric(n)
409 | for(i in 1:n){
410 | reg2 <- lm(x2 ~ x1, data = s[-i, ])
411 | error1[i] <- sum((f(s[i, 1]) - s[i, 2])^2)
412 | }
413 | mean(error1)
414 |
415 | ## ---- warning=FALSE, message=FALSE---------------------------------------
416 | library("cvTools")
417 | fit <- lm(x2 ~ x1, data = s)
418 | # perform cross-validation
419 | cvFit(fit, data = s, y = s$x2, cost = mspe,
420 | K = 5, R = 10, seed = 1234)
421 |
422 | ## ------------------------------------------------------------------------
423 | library("robustbase")
424 | # set up folds for cross-validation
425 | folds <- cvFolds(nrow(coleman), K = 5, R = 10)
426 | ## compare LS, MM and LTS regression
427 | ## perform cross-validation for an LS regression model
428 | fitLm <- lm(prestige ~ ., data = Prestige)
429 | cvFitLm <- cvLm(fitLm, cost = mspe,
430 | folds = folds)
431 | fitLm2 <- lm(prestige ~ income:type + education + women, data = Prestige)
432 | cvFitLm2 <- cvLm(fitLm, cost = mspe,
433 | folds = folds)
434 | ## perform cross-validation for an MM regression model
435 | fitLmrob <- lmrob(prestige ~ ., data = Prestige)
436 | cvFitLmrob <- cvLmrob(fitLmrob, cost = mspe,
437 | folds = folds)
438 | fitLmrob2 <- lmrob(prestige ~ income:type + education + women, data = Prestige)
439 | cvFitLmrob2 <- cvLmrob(fitLmrob, cost = mspe,
440 | folds = folds)
441 | ## compare cross-validation results
442 | cvSelect(LS = cvFitLm, LS2 = cvFitLm2,
443 | MM = cvFitLmrob, MM2 = cvFitLmrob2)
444 |
445 | ## ------------------------------------------------------------------------
446 | sessionInfo()
447 |
448 |
--------------------------------------------------------------------------------
/Chapter 4/chapter4.R:
--------------------------------------------------------------------------------
1 | ## ---- echo=FALSE---------------------------------------------------------
2 | knitr::opts_chunk$set(cache=TRUE)
3 |
4 | ## ---- eval=FALSE, echo=FALSE, message=FALSE,warning=FALSE----------------
5 | ## library("random")
6 | ## x <- randomNumbers(n=10000, col=2, min=0,
7 | ## max=1e+06, check=TRUE) / 1000000
8 | ## save(x, file="~/workspace/simulation-book/book/chapter4/r12.RData")
9 |
10 | ## ---- eval=FALSE, echo=TRUE, message=FALSE,warning=FALSE-----------------
11 | ## library("random")
12 | ## x <- randomNumbers(n=10000, col=2, min=0,
13 | ## max=1e+06, check=TRUE) / 1000000
14 |
15 | ## ---- echo=FALSE, message=FALSE,warning=FALSE----------------------------
16 | load("~/workspace/simulation-book/book/chapter4/r12.RData")
17 | r1 <- as.numeric(x)
18 | require(ggplot2)
19 | theme_set(theme_bw())
20 |
21 | ## ----B05113_04_01, fig.width=4, fig.height=4, eval=TRUE------------------
22 | n <- length(x)
23 | df <- data.frame(x1 = x[1:(n-1)], x2 = x[2:n])
24 | ggplot(df, aes(x = x1, y = x2)) + geom_point(size = 0.1) +
25 | xlab("random numbers from random.org") + ylab("lag 1")
26 |
27 | ## ------------------------------------------------------------------------
28 | seed <- 123
29 | randu <- function(n) {
30 | for (i in 1:n) {
31 | seed <<- (65539 * seed) %% (2^31)
32 | result[i] <- seed / 2^31
33 | }
34 | return(result)
35 | }
36 | plot3S <- function(func, n=10000) {
37 | x <- func(n)
38 | require("rgl")
39 | x1 <- x[1:(n-2)]
40 | x2 <- x[2:(n-1)]
41 | x3 <- x[3:n]
42 | plot3d(x1, x2, x3, size=3)
43 | play3d(spin3d())
44 | }
45 |
46 | ## ---- eval=FALSE---------------------------------------------------------
47 | ## plot3S(randu)
48 | ## ## to compare it with R's standard generator
49 | ## plot3S(runif) ## (Mersenne-Twister)
50 |
51 | ## ------------------------------------------------------------------------
52 | RNGkind()
53 |
54 | ## ----B05113_04_04, fig.height=3, fig.width=6-----------------------------
55 | ok <- RNGkind()
56 | op <- par(mfrow=c(1,2), mar=c(3,4,2,2))
57 | set.seed(111)
58 | hist(rnorm(1000), main="Mersenne Twister, Inversion", freq=FALSE, xlim=c(-4,4), ylim=c(0,0.43), cex.main=0.7)
59 | curve(dnorm(x), col = 2, lty = 1, lwd = 2, add = TRUE)
60 | RNGkind("Super", "Box-Muller")
61 | RNGkind()
62 | hist(rnorm(1000), main="Super-Duper, Box-Muller", freq=FALSE, xlim=c(-4,4), ylim=c(0,0.43), cex.main=0.7)
63 | curve(dnorm(x), col = 2, lty = 1, lwd = 2, add = TRUE)
64 |
65 | ## ------------------------------------------------------------------------
66 | u <- runif(100, 0, 1)
67 | ifelse(u <= 0.2, 0, 1)
68 |
69 | ## ----B05113_04_06, echo=FALSE--------------------------------------------
70 | plot(x=c(0,1), y=c(0,1.3), type="n", yaxt="n", xlab="U(0,1)", ylab="B(pi)")
71 | segments(y0=-0.5, x0=0, y1=0.5, x1=0, lty=1, col="black", lwd=2)
72 | segments(y0=0.5, y1=0.5, x0=0, x1=0.2, col="black", lwd=1, lty=2)
73 | segments(y0=0.5, y1=1, x0=0.2, x1=0.2, lwd=2)
74 | segments(y0=1, y1=1, x0=0.2, x1=1, col="black", lwd=1, lty=2)
75 | segments(y0=1, y1=1.5, x0=1, x1=1, lwd=2)
76 | points(0, 0.5, cex=2)
77 | points(0.2, 0.5, cex=2, pch=20)
78 | points(0.2,1, cex=2)
79 | points(1,1, cex=2, pch=20)
80 | arrows(x0=0.55376, x1=0.55376, y0=-0.15, y1=1, col="blue", lty=2)
81 | arrows(x0=0.55376,x1=-0.04, y0=1, y1=1, col="blue", lty=2)
82 | axis(2, at=c(1,0.5), labels=c(1,0), cex.axis=1.5, las=1)
83 | text(x=0.53, y=0.3, labels="random number 0.554 \n drawn from U(0,1)", col="blue", srt=0)
84 |
85 | ## ----B05113_04_07, echo=FALSE--------------------------------------------
86 | plot(x=c(0,1), y=c(0,3), type="n", xlab="U(0,1)",
87 | ylab="exponential distributed")
88 | s <- seq(0,1,length.out=1000)
89 | lines(x=s, y=qexp(s), lwd=2)
90 | arrows(x0=0.55376, x1=0.55376, y1=qexp(0.55376), y0=-015, col="blue", lty=2)
91 | arrows(x0=0.55376, x1=-0.04, y0=qexp(0.55476), y1=qexp(0.55476), col="blue", lty=2)
92 |
93 | ## ----B05113_04_08--------------------------------------------------------
94 | library("MASS")
95 | invExp <- function(n, lambda = 1) {
96 | u <- runif(n)
97 | x <- -(1/lambda) * log(u)
98 | return(x)
99 | }
100 | lambda <- c(0.5, 1, 2, 5)
101 | par(mar = c(3,3,3,0), mfrow = c(2, 2))
102 | for (l in lambda) {
103 | sample <- invExp(10000, l)
104 | truehist(sample, nbins = 20, col = "limegreen",
105 | main = paste("lambda =", l), xlab = "")
106 | curve(dexp(x, l), from = 1e-10, add = TRUE)
107 | }
108 |
109 | ## ---- echo=FALSE---------------------------------------------------------
110 | set.seed(1234)
111 |
112 | ## ------------------------------------------------------------------------
113 | sample(1:3, 10, replace = TRUE, prob = c(3/7,1/7,3/7))
114 |
115 | ## ----B05113_04_09, echo=FALSE--------------------------------------------
116 | par(mfrow=c(1,2))
117 | tab <- c(3/7, 1/7, 3/7)
118 | #par(xaxt="n", yaxt="n")
119 | barplot(tab, xaxt="n", yaxt="n", ylab="probabilties", xlab="categories")
120 | axis(1, at=c(0.65, 1.85, 3.13), labels=c("x1", "x2", "x3"))
121 | axis(2, at=c(0,1/7,3/7), labels=c(0, "1/7","3/7"))
122 | tab <- c(7/21, 7/21, 7/21)
123 | #par(xaxt="n", yaxt="n")
124 | barplot(tab, xaxt="n", yaxt="n", ylab="probabilties", xlab="categories", ylim=c(0, 0.42857))
125 | #axis(1, at=c(0.65, 1.85, 3.13), labels=c("x1", "x2", "x3"))
126 | axis(2, at=c(0,3/21,5/21,7/21), labels=c(0, "3/21","5/21","7/21"))
127 | segments(x0=1.3897, x1=2.3943, y0=5/21, y1=5/21)
128 | segments(x0=2.5879, x1=3.5888, y0=3/21, y1=3/21)
129 | text(x=c(0.63704, 1.85844, 1.84147, 3.04591, 3.04591),
130 | y=c(0.16615, 0.28473, 0.13672, 0.064876, 0.237125),
131 | c("x1","x1","x3","x2","x3"))
132 |
133 | ## ---- echo=FALSE, message=FALSE, warning=FALSE---------------------------
134 | require("knitr")
135 | tab <- data.frame("i"=1:3, "xi"=c("x1","x3","x2"),
136 | "pi"=c("3/7","3/7","1/7"),
137 | "ai"=c("x1","x1","x3"),
138 | "ri"=c("1","5/7","3/7"))
139 | kable(tab)
140 |
141 | ## ---- echo=FALSE---------------------------------------------------------
142 | library(codatables)
143 | data("precipitation")
144 | a <- addmargins(precipitation)
145 | kable(a)
146 |
147 | ## ------------------------------------------------------------------------
148 | x <- data.frame("spring" = c(275,56,52,65,37,23),
149 | "summer" = c(302,20,29,17,15,5),
150 | "autumn" = c(375,43,53,52,54,50),
151 | "winter" = c(198,37,44,69,58,42))
152 |
153 | ## ---- eval=TRUE, warning=FALSE, message=FALSE----------------------------
154 | xx <- expand.grid(rownames(x), colnames(x)) # all combinations
155 | x1 <- xx[,1]
156 | x2 <- xx[,2]
157 | y <- as.vector(t(prop.table(x))) # cell probabilites
158 | form <- y ~ x1:x2 # modell
159 | mod <- glm(form, family="poisson") # estimation
160 | pred <- (predict(mod)) # prediction
161 | pred <- exp(pred)/(1+exp(pred)) # transf. with logistic function
162 |
163 | ## ------------------------------------------------------------------------
164 | round(matrix(pred, ncol=4, byrow=TRUE) * sum(x)) # table
165 |
166 | ## ------------------------------------------------------------------------
167 | x <- seq(-5, 5, length = 200)
168 | dc <- dcauchy(x, location = 0, scale = 1)
169 | dn <- dnorm(x, mean = 0, sd = 1)
170 |
171 | ## ----B05113_04_10--------------------------------------------------------
172 | par(mfrow=c(1,2), mar = c(4,4,0.1,1))
173 | plot(x, dn, type="l", ylab = "f(x) and h(x) . In the R code: dn and dc")
174 | lines(x, dc, col="blue", lty=2)
175 | legend("topright", col=c("black", "blue"), lty=c(1,2), legend = c("normal, f(x)", "Cauchy, h(x)"), cex=1)
176 | plot(x, dn/dc, type="l", ylab = "f(x) / h(x) . In the R code: (dn / dc)")
177 |
178 | ## ------------------------------------------------------------------------
179 | foo <- function(x) dnorm(x)/dcauchy(x)
180 | opt <- optimize(foo, c(0, 4), maximum=TRUE)
181 | a <- opt$objective
182 | a
183 | ah <- a * dc
184 |
185 | ## ----B05113_04_11--------------------------------------------------------
186 | plot(x, dn, type="l", ylim=c(0,0.5), lwd=2, ylab = "densities")
187 | lines(x, dc, col="blue", lty=2)
188 | lines(x, ah, col="blue", lty=2, lwd=2)
189 | legend("topright", col=c("black", "blue", "blue"), lty=c(1,2,2), lwd=c(1,1,2), legend = c("normal, f(x)", "Cauchy, h(x)", "g(x) = a * Cauchy"), cex=1)
190 |
191 | ## ----B05113_04_12--------------------------------------------------------
192 | plot(x, dn, type="l", ylim=c(0,0.5), lwd=2)
193 | polygon(x, dn, col="gray")
194 | polygon(c(x, rev(x)), c(dn, rev(ah)), col="blue")
195 |
196 | ## ------------------------------------------------------------------------
197 | alpha <- function(x){
198 | dnorm(x)/(1.520347 * dcauchy(x))
199 | }
200 |
201 | rejectionNorm <- function(n) {
202 | x <- rcauchy(10000,0,1)
203 | u <- runif(10000)
204 | return(na.omit(ifelse(u <= alpha(x), x, NA)))
205 | }
206 |
207 | ## ----B05113_04_13--------------------------------------------------------
208 | set.seed(123)
209 | x <- rejectionNorm(10000)
210 | hist(x, prob=TRUE)
211 | curve(dnorm(x), lty = 1, lwd = 2, add = TRUE)
212 |
213 | ## ----B05113_04_14--------------------------------------------------------
214 | curve(dbeta(x, shape1 = 2, shape2 = 2), from = 0, to = 1,
215 | xlab = "", ylab = "", main = "")
216 | ## a * h(x):
217 | abline(h = 1.5, lty = 2)
218 |
219 | ## ------------------------------------------------------------------------
220 | rsBeta <- function(n) {
221 | z <- runif(n)
222 | u <- runif(n)
223 | ind <- (u <= 4 * z * (1 - z))
224 | return(z[ind])
225 | }
226 | set.seed(123)
227 | sample1 <- rsBeta(10000)
228 | acceptS <- length(sample1) / 10000
229 | acceptS
230 |
231 | ## ---- echo=TRUE, tidy=FALSE----------------------------------------------
232 | library(MASS)
233 | plot1 <- function(s, shape1=2, shape2=2){
234 | truehist(s, h = 0.1, xlim = c(0, 1), #ylim = c(0,2),
235 | col="white", main = "", xlab = "")
236 | curve(dbeta(x, shape1 = shape1, shape2 = shape2),
237 | from = 0, to = 1, add = TRUE)
238 | d <- density(s, from = 0, to = 1, adjust = 2,
239 | kernel = "gaussian")
240 | lines(d$x, d$y, lty = 2)
241 | legend("topright",
242 | legend = c("true density", "density of simulated values"),
243 | col = c(1, 1), lty = c(1, 2), cex = 0.6)
244 | }
245 |
246 | ## ----B05113_04_15--------------------------------------------------------
247 | plot1(sample1) # produces a histgram and curve, shown below:
248 |
249 | ## ---- eval=FALSE, fig.keep='none'----------------------------------------
250 | ## rsBeta2 <- function(n, shape1=2.5, shape2=6.5){
251 | ## a <- optimize(f=function(x){dbeta(x,shape1,shape2)},
252 | ## interval=c(0,1), maximum=TRUE)$objective
253 | ## z <- runif(n)
254 | ## u <- runif(n, max=a)
255 | ## ind <- (u <= dbeta(z,shape1,shape2))
256 | ## return(z[ind])
257 | ## }
258 | ## sample2 <- rsBeta2(10000)
259 |
260 | ## ------------------------------------------------------------------------
261 | # percentage of rejection
262 | (1- (pcauchy(5) - pcauchy(4))) * 100
263 | v <- rcauchy(1000)
264 | v <- v[v >= 4 & v <= 5]
265 | v
266 | v[1:10]
267 |
268 | ## ------------------------------------------------------------------------
269 | Fa <- pcauchy(4)
270 | Fb <- pcauchy(5)
271 | u <- runif(10, min = 0, max = Fb - Fa)
272 | qcauchy(Fa + u)
273 |
274 | ## ------------------------------------------------------------------------
275 | ## Simple random walk Markov chain:
276 | n <- 10; set.seed(123)
277 | x <- numeric(n)
278 | for(i in 2:n){
279 | x[i] <- x[i-1] + rnorm(1)
280 | }
281 | x
282 |
283 | ## ------------------------------------------------------------------------
284 | set.seed(123)
285 | x <- numeric(n)
286 | for(i in 2:n){
287 | x[i] <- rnorm(1, mean = x[i-1])
288 | }
289 | x
290 |
291 | ## ------------------------------------------------------------------------
292 | f <- function(x, sigma){
293 | if(any(x < 0)) return(0)
294 | stopifnot(sigma > 0)
295 | return((x / sigma^2) * exp(-x^2 / (2*sigma^2)))
296 | }
297 |
298 | ## ------------------------------------------------------------------------
299 | i <- 2
300 | xt <- x[i-1] <- rchisq(1, 1)
301 | y <- rchisq(1, df=xt)
302 |
303 | ## ------------------------------------------------------------------------
304 | rrai <- function(n = 10000, burnin = 1000, thin = 10, sigma = 4, verbose = TRUE){
305 | ## raileigh density
306 | f <- function(x, sigma){
307 | if(any(x < 0)) return(0)
308 | stopifnot(sigma > 0)
309 | return((x / sigma^2) * exp(-x^2 / (2*sigma^2)))
310 | }
311 | x <- numeric(n)
312 | x[1] <- rchisq(1, df=1)
313 | k <- 0; u <- runif(n)
314 | for(i in 2:n){
315 | xt <- x[i-1]
316 | y <- rchisq(1, df=xt)
317 | num <- f(y, sigma) * dchisq(xt, df=y)
318 | den <- f(xt, sigma) * dchisq(y, df=xt)
319 | if(u[i] <= num/den){
320 | x[i] <- y
321 | } else {
322 | x[i] <- xt
323 | k <- k + 1 # y is rejected
324 | }
325 | }
326 | if(verbose) cat("acceptance rate:", (k-burnin)/n/thin, "\n")
327 | ## burn-in:
328 | if(burnin > 1) x <- x[(burnin+1):length(x)]
329 | ## thining:
330 | return(x[seq(1, length(x), thin)])
331 | }
332 |
333 | r <- rrai(n = 10000, thin = 1, burnin = 1)
334 | r <- rrai(n = 10000, thin = 10, burnin = 1000)
335 | length(r)
336 |
337 | ## ----B05113_04_16--------------------------------------------------------
338 | qplot(1:length(r), r, geom="line", xlab="", ylab="random numbers from Rayleigh(4)")
339 |
340 | ## ----B05113_04_17--------------------------------------------------------
341 | a <- ppoints(length(r))
342 | sigma <- 4
343 | QR <- sigma * sqrt (-2 * log (1-a)) # quantiles of Rayleigh
344 | Q <- quantile(r, a)
345 | qqplot(QR, Q, main = "", xlab = "Rayleigh quantile", ylab = "sample quantile")
346 |
347 | ## ----B05113_04_18, echo=FALSE--------------------------------------------
348 | par(mfrow=c(1,2))
349 | SEQ <- seq(-4,4,0.05)
350 | par(mar=c(2.5,3.8,3,1.1))
351 | plot(SEQ, dnorm(SEQ), xaxt="n", type="l", lwd=2)
352 | axis(1,at=c(-2,-1), label=c(expression(x[t]), expression(y[t])))
353 | segments(x0=-2,x1=-2,y1=dnorm(-2),y0=-1)
354 | segments(x0=-2,x1=-5,y1=dnorm(-2),y0=dnorm(-2))
355 | segments(x0=-1,x1=-1,y1=dnorm(-1),y0=-1)
356 | segments(x0=-1,x1=-5,y1=dnorm(-1),y0=dnorm(-1))
357 | axis(2,at=c(dnorm(-2),dnorm(-1)),
358 | label=c(expression(f(x[t])), expression(f(y[t]))), las=1)
359 | abline(h=0, lty=2)
360 | SEQ <- seq(-4,4,0.05)
361 | par(mar=c(2.5,3,3,0.1))
362 | plot(SEQ, dnorm(SEQ), xaxt="n", type="l", lwd=2)
363 | axis(1,at=c(-1.85,-1.5), label=c(expression(y[t]), expression(x[t])))
364 | segments(x0=-1.85,x1=-1.85,y1=dnorm(-1.85),y0=-1)
365 | segments(x0=-1.85,x1=-5,y1=dnorm(-1.85),y0=dnorm(-1.85))
366 | segments(x0=-1.5,x1=-1.5,y1=dnorm(-1.5),y0=-1)
367 | segments(x0=-1.5,x1=-5,y1=dnorm(-1.5),y0=dnorm(-1.5))
368 | axis(2,at=c(dnorm(-1.85),dnorm(-1.5)),
369 | label=c(expression(f(y[t])), expression(f(x[t]))), las=1)
370 | abline(h=0, lty=2)
371 |
372 | ## ------------------------------------------------------------------------
373 | mh <- function(n=10000, burnin=1000, thin=10, cand=runif,
374 | target=dbeta, shape1=2, shape2=2){
375 | if(burnin >= n){
376 | stop("burnin is larger than the number of simulations")
377 | }
378 | x <- rep(cand(1), n) # initialization
379 | for(i in 2:n){
380 | y <- cand(1)
381 | rho <- target(y,shape1,shape2)/
382 | target(x[i-1], shape1, shape2)
383 | x[i] <- x[i-1] + (y - x[i-1]) * (runif(1) < rho)
384 | }
385 | # burn-in
386 | x <- x[(burnin+1):n]
387 | return(x[seq(1, length(x), thin)])
388 | }
389 |
390 | ## ----B05113_04_19--------------------------------------------------------
391 | par(mfrow=c(1,2))
392 | plot(density(mh()), main = "", xlab = "")
393 | plot(density(mh(shape1=1)), main = "", xlab = "")
394 |
395 | ## ------------------------------------------------------------------------
396 | rgamma <- mh(cand = rnorm, target = dgamma)
397 |
398 | ## ----B05113_04_20--------------------------------------------------------
399 | gibbs_bivariate <- function(n = 1000, rho = 0.9, start = 0, burnin = 100, thin = 1){
400 | x <- y <- numeric(n)
401 | s <- 1 - rho^2
402 | x[1] <- start # to show effect of burnin
403 | for(t in 1:(n-1)){
404 | y[t+1] <- rnorm(1, rho*x[t], s)
405 | x[t+1] <- rnorm(1, rho*y[t+1], s)
406 | }
407 | s <- seq(burnin+1, n, thin)
408 | return(cbind(x[s], y[s]))
409 | }
410 | par(mfrow=c(1,3))
411 | set.seed(123)
412 | ## bad start:
413 | b0 <- gibbs_bivariate(n=200, start = 30, burnin=0)
414 | ## plot the results
415 | plot(b0, type="o", xlab="x", ylab="y", main="burnin 0",
416 | cex.main=1.3, cex.lab=1.3)
417 | set.seed(123)
418 | plot(b0[20:200,], type="o", xlab="x", ylab="y", main="burnin 20",
419 | cex.main=1.3, cex.lab=1.3, col=grey(20:200/200))
420 | set.seed(123)
421 | plot(b0[20:200,], pch=20, xlab="x", ylab="y", main="burnin 20",
422 | cex.main=1.3, cex.lab=1.3)
423 |
424 | ## ------------------------------------------------------------------------
425 | lreg <- function(y, x, time, alpha = 0, beta = -2, tau = 1, burnin = 0, thin = 1){
426 | n <- length(y)
427 | ## alpha, beta, tau defining varibles
428 | res <- matrix(, ncol=3, nrow=time)
429 | for(i in 1:time){
430 | alpha <- rnorm(1, mean(y) -beta * mean(x), 1 / (n *tau))
431 | m <- (sum(x * y) - alpha * n * mean(x)) / sum(x**2)
432 | s <- 1 / (tau * sum(x**2))
433 | beta <- rnorm(1, m, s)
434 | w <- y - alpha - beta * x
435 | tau <- rgamma(1, ((n / 2) + 1), (sum(w**2) / 2))
436 | res[i,] <- c(alpha, beta, tau)
437 | }
438 | s <- seq(1, length((burnin + 1):nrow(res)), thin)
439 | res <- res[((burnin+1):nrow(res))[s], ]
440 | res <- data.frame(res)
441 | colnames(res) <- c("alpha", "beta", "tau")
442 | return(res)
443 | }
444 |
445 | ## ----B05113_04_21--------------------------------------------------------
446 | data(Cars93, package = "MASS")
447 | set.seed(123)
448 | time <- 100
449 | res <- lreg(Cars93$Price, Cars93$Horsepower, time = time)
450 | par(mar = c(4,4,0.1,0.1))
451 | plot(Cars93$Horsepower, Cars93$Price, pch=20, xlab = "Horsepower", ylab = "Price", type = "n")
452 | range <- 1 - sqrt(1:time/time)
453 | range <- range + 0.1
454 | #range <- range/sum(2*range)
455 | for(i in 1:time){
456 | abline(a = res[i, 1], b = res[i, 2], col=gray(range[i]))#sqrt(1-i/size)))
457 | }
458 | abline(a = res[i, 1], b = res[i, 2], col="red", lty=2,lwd=3)#sqrt(1-i/size)))
459 | points(Cars93$Horsepower, Cars93$Price, pch=20)
460 |
461 | ## ----B05113_04_22, message=FALSE, warning=FALSE, fig.width=7, fig.height=4----
462 | set.seed(123)
463 | g <- lreg(Cars93$Price, Cars93$Horsepower, time = 500)
464 | g1 <- cbind(g, "index" = 1:nrow(g))
465 | g1 <- reshape2::melt(g1, id=c("index"))
466 | ggplot(g1, aes(x = index, y = value)) + geom_line() + facet_wrap(~variable, scales = "free_y")
467 |
468 | ## ----B05113_04_23, message=FALSE, warning=FALSE--------------------------
469 | plot(acf(g), mar=c(2.9,3.2,2,0), cex.lab = 1.4, cex.main = 1.3)
470 |
471 | ## ---- fig.height=4, fig.width=7, cache=FALSE-----------------------------
472 | library("coda")
473 | time <- 2000; M <- 5
474 | set.seed(12345)
475 | df <- lreg(Cars93$Price, Cars93$Horsepower, time = time)
476 | for(i in 2:M){
477 | df <- rbind(df, lreg(Cars93$Price, Cars93$Horsepower, time = time))
478 | }
479 | df$M <- factor(rep(1:M, each = time))
480 | df$index <- rep(1:time, M)
481 | df <- reshape2::melt(df, id = c("M", "index"))
482 |
483 | ## ----B05113_04_24, fig.height=4, fig.width=7, cache=FALSE----------------
484 | ggplot(df, aes(x = index, y = value, group = M, colour=M)) + geom_line(alpha = 0.5) + facet_wrap(~variable, scales = "free_y")
485 |
486 | ## ----gelmandiag, eval = FALSE--------------------------------------------
487 | ## ## Brooke-Gelman
488 | ## gl <- list()
489 | ## M <- 15
490 | ## set.seed(12345)
491 | ## for(i in 1:M){
492 | ## gl[[i]] <- lreg(Cars93$Price, Cars93$Horsepower, time = time)
493 | ## }
494 | ## gl <- lapply(gl, function(x) mcmc(as.matrix(x)))
495 | ## ## look also at summary(g) (not shown here)
496 | ## gelman.diag(gl, autoburnin = FALSE)
497 | ##
498 | ## ## Potential scale reduction factors:
499 | ## ##
500 | ## ## Point est. Upper C.I.
501 | ## ## alpha 1.07 1.07
502 | ## ## beta 1.07 1.07
503 | ## ## tau 1.00 1.00
504 | ## ##
505 | ## ## Multivariate psrf
506 |
507 | ## ----B05113_04_25, fig.height=4, fig.width=6, eval=FALSE-----------------
508 | ## gelman.plot(gl, autoburnin = FALSE)
509 |
510 | ## ---- eval=FALSE---------------------------------------------------------
511 | ## burnin <- 1000
512 | ## time <- burnin + time * 20
513 | ## g <- lreg(Cars93$Price, Cars93$Horsepower, time = time, burnin = burnin, thin = 20)
514 |
515 | ## ------------------------------------------------------------------------
516 | circle <- function(x, r=0.05){
517 | repeat{
518 | x1 <- runif(1,-1,1)
519 | x2 <- runif(1,-1,1)
520 | if( sqrt(x1^2 + x2^2) <= (1 - r) ) break
521 | }
522 | inCircle <- ((x[,1] - x1)^2 + (x[,2] - x2)^2) <= r^2
523 | return(inCircle)
524 | }
525 |
526 | ## ------------------------------------------------------------------------
527 | set.seed(123)
528 | ## take possible radii
529 | x <- matrix(runif(10000, -1, 1), ncol=2)
530 | ## radii to the square
531 | r2 <- rowSums(x^2)
532 | ## r2 smaller than 1 are kept
533 | x1 <- x[r2 <= 1, ]
534 |
535 | ## ----B05113_04_26--------------------------------------------------------
536 | par(mar = c(2,2,0.1,0.1))
537 | plot(data.frame(x1), pch=20)
538 | for(k in 1:8) points(data.frame(x1[circle(x1, 0.2),]), col=k, pch=20)
539 |
540 | ## ------------------------------------------------------------------------
541 | set.seed(123)
542 | z <- replicate(2000, sum(circle(x1)))
543 |
544 | ## ------------------------------------------------------------------------
545 | TAB <- table(z)
546 | TAB
547 |
548 | ## ------------------------------------------------------------------------
549 | laeken::weightedMedian(as.numeric(names(TAB)), as.numeric(TAB))
550 |
551 | ## ----B05113_04_27--------------------------------------------------------
552 | lambda <- nrow(x1) * 0.05^2
553 | PROB <- dpois(as.numeric(names(TAB)), lambda)
554 | b <- barplot(TAB / length(z))
555 | points(b, PROB, col="red", pch=16)
556 |
557 | ## ------------------------------------------------------------------------
558 | ## the five classes:
559 | QP <- qpois(seq(0,1,by=1/6), lambda)
560 | QP
561 | ## frequency counts in those classes
562 | TAB1 <- table(cut(z, QP, include.lowest=TRUE))
563 | TAB1
564 |
565 | ## ------------------------------------------------------------------------
566 | ppois(QP, lambda)
567 | ## 0 should be in the left class:
568 | QP1 <- QP
569 | QP1[1] <- -1
570 | ## probablities for each class:
571 | PROB1 <- diff(ppois(QP1, lambda))
572 | PROB1
573 | ## goodness-of-fit test:
574 | chisq.test(TAB1, p=PROB1)
575 |
576 | ## ------------------------------------------------------------------------
577 | sessionInfo()
578 |
579 |
--------------------------------------------------------------------------------
/Chapter 8/chapter8.R:
--------------------------------------------------------------------------------
1 | ## ------------------------------------------------------------------------
2 | library("robustbase")
3 | data("hbk")
4 | ## structure of the data
5 | str(hbk)
6 |
7 | ## ------------------------------------------------------------------------
8 | lm_ols <- lm(Y ~ ., data = hbk)
9 | ## print summary
10 | summary(lm_ols)
11 |
12 | ## ----B05113_08_01--------------------------------------------------------
13 | plot(lm_ols, which = 3)
14 |
15 | ## ------------------------------------------------------------------------
16 | lm_rob <- lmrob(Y ~ ., data = hbk)
17 | ## print summary
18 | summary(lm_rob)
19 |
20 | ## ----B05113_08_02--------------------------------------------------------
21 | plot(lm_rob, which = 5)
22 |
23 | ## ----B05113_08_03--------------------------------------------------------
24 | pairs(hbk, pch = ".")
25 |
26 | ## ---- warning=FALSE, message=FALSE---------------------------------------
27 | data(Prestige, package = "car")
28 | rob <- ltsReg(prestige ~ log(income) + women + type, data = Prestige)
29 | summary(rob)
30 |
31 | ## ------------------------------------------------------------------------
32 | boot.lts <- function(x, indices){
33 | x <- x [indices,]
34 | model <- ltsReg(prestige ~ log(income) +
35 | women + type, data = x)
36 | coefficients(model)
37 | }
38 |
39 | ## ---- cache=TRUE, message=FALSE,warning=FALSE----------------------------
40 | library("boot")
41 | set.seed(123)
42 | rob_boot <- boot(Prestige, boot.lts, 1000)
43 | ## estimated standard errors
44 | rob_boot
45 |
46 | ## ---- echo=FALSE, message=FALSE, warning=FALSE---------------------------
47 | library(boot)
48 | library(MASS)
49 | library(e1071)
50 |
51 | ## ----B05113_08_04--------------------------------------------------------
52 | hist(rob_boot$t[,2], 50, xlab = "bootstrap repl., log(income)", main = "")
53 |
54 | ## ----B05113_08_05--------------------------------------------------------
55 | hist(rob_boot$t[,3], 50, xlab = "bootstrap repl., women", main = "")
56 |
57 | ## ------------------------------------------------------------------------
58 | boot.ci(rob_boot, index = 2,
59 | type = c("norm", "perc", "bca"))
60 |
61 | ## ----B05113_08_06, fig.width=6, fig.height=7-----------------------------
62 | par(mfrow = c(2,1), mar = c(4,4,2,0.5))
63 | jack.after.boot(rob_boot, index = 2, main = 'log (income) coefficient')
64 | jack.after.boot(rob_boot, index = 3, main = 'woman coefficient')
65 |
66 | ## ------------------------------------------------------------------------
67 | set.seed(12)
68 | df <- data.frame(x = 1:7, y = 1:7 + rnorm(7))
69 |
70 | ## ----B05113_08_07, fig.width=6, fig.height=7-----------------------------
71 | par(mfrow = c(2,1), mar = c(4,4,1,0.3))
72 | ## fit to original data
73 | lm_orig <- lm(y ~ x, data = df)
74 | ## plot original data
75 | plot(y ~ x, data = df)
76 | ## add regression line from original data
77 | abline(lm_orig)
78 | ## show the connection lines
79 | ## between original and fitted y
80 | segments(x0 = df$x, x1=df$x,
81 | y0=df$y, y1=lm_orig$fit)
82 | ## fitted y
83 | points(df$x, lm_orig$fit, pch=20, col="red")
84 | legend("topleft", legend = c("y", expression(hat(y))),
85 | pch = c(1,20), col = c(1,2))
86 | ## second plot ---------------------
87 | ## plot of fitted values
88 | plot(lm_orig$fit ~ df$x, col="red", pch = 20,
89 | ylab="y", xlab = "x")
90 | ## bootstrap sample by adding sampled residuals
91 | y1 <- lm_orig$fit + sample(lm_orig$res, replace = TRUE)
92 | ## new bootstrap sample
93 | points(df$x, y1, col="blue", pch = 3)
94 | ## connection lines new bootrap sample to
95 | ## fitted values from original data
96 | segments(x0 = df$x, x1 = df$x,
97 | y0 = lm_orig$fit, y1 = y1, col ="blue")
98 | ## regression line from original data
99 | abline(lm_orig)
100 | ## regression line from bootstrap sample
101 | abline(lm(y1 ~ df$x), col = "blue", lty = 2)
102 | legend("topleft", legend = c("original", "bootstrap repl. 1"), lty = c(1,2), col = c(1,4))
103 |
104 | ## ---- cache=TRUE, message=FALSE, warning=FALSE---------------------------
105 | Prestige <- na.omit(Prestige)
106 | ## fit model on original data
107 | rob2 <- ltsReg(prestige ~ log(income) + women + type,
108 | data = Prestige)
109 |
110 | ## ------------------------------------------------------------------------
111 | residuals <- residuals(rob2)
112 | fit <- fitted(rob2)
113 | ## fix X, model matrix
114 | X <- model.matrix(rob2, Prestige)[, -1]
115 |
116 | ## ------------------------------------------------------------------------
117 | ltsBoot2 <- function(x, indices){
118 | y <- fit + residuals[indices]
119 | model <- ltsReg(y ~ X)
120 | coefficients(model)
121 | }
122 |
123 | ## ---- message=FALSE, warning=FALSE---------------------------------------
124 | rob2_boot <- boot(Prestige, ltsBoot2, R = 2000)
125 | ## show results
126 | rob2_boot
127 |
128 | ## ----B05113_08_08--------------------------------------------------------
129 | par(mfrow = c(1,2), mar = c(4,4,1,0.2))
130 | hist(rob2_boot$t[,2], 50, xlab = "bootstrap repl., log(income)", main = "")
131 | hist(rob2_boot$t[,3], 50, xlab = "bootstrap repl., women", main = "")
132 |
133 | ## ----B05113_08_09, fig.width=6, fig.height=7-----------------------------
134 | par(mfrow = c(2,1), mar = c(4,4,2,0.5))
135 | jack.after.boot(rob2_boot, index = 2, main = 'log (income) coefficient')
136 | jack.after.boot(rob2_boot, index = 3, main = 'woman coefficient')
137 |
138 | ## ---- echo=FALSE---------------------------------------------------------
139 | gendat <- function(n, d)
140 | {
141 | set.seed(1083)
142 | s <- matrix(rnorm(d^2,0,0.2), ncol=d, nrow=d)
143 | diag(s) <- 1
144 | xorig <- mvrnorm(n ,mu = rep(0, d),Sigma = s)
145 | xorig <- rbind(xorig, mvrnorm(round(n*0.05) ,mu = c(10, rep(0, d-1)), Sigma = s))
146 | ## fix me: generell auch fuer mehrere Dimensionen
147 | x <- xorig
148 |
149 | for (i in 1:d)
150 | {
151 | miss <- sample(1:dim(x)[1], sample(n/5), replace = TRUE)
152 | x[miss,i]<-NA
153 | }
154 | w <- 0
155 | w <- which(rowSums(is.na(x)) == d)
156 | x <- x[-w,]
157 |
158 | }
159 | x <- gendat(1000,3)
160 |
161 | ## ---- message=FALSE, warning=FALSE---------------------------------------
162 | library("VIM")
163 | data("sleep")
164 |
165 | ## ----B05113_08_10--------------------------------------------------------
166 | aggr(sleep, numbers = TRUE, prop = FALSE, cex.axis = 0.75)
167 |
168 | ## ----B05113_08_11, message=FALSE, warning=FALSE--------------------------
169 | par(mar = c(6,4,0.2,0.2))
170 | matrixplot(sleep, sortby = "BrainWgt", interactive = FALSE)
171 |
172 | ## ----B05113_08_12--------------------------------------------------------
173 | set.seed(123)
174 | sleep_boot1 <- sleep[sample(1:nrow(sleep), replace = TRUE), ]
175 | aggr(sleep_boot1, numbers = TRUE, prop = FALSE,
176 | cex.axis = 0.75)
177 |
178 | ## ----B05113_08_13, message=FALSE, warning=FALSE--------------------------
179 | par(mar = c(6,4,0.2,0.2))
180 | matrixplot(sleep_boot1, sortby = "BrainWgt")
181 |
182 | ## ------------------------------------------------------------------------
183 | bootmiss <- function(x, R = 1000,
184 | imputation = "median",
185 | theta = "mean"){
186 | ## initializations
187 | d <- dim(x)[2]
188 | n <- dim(x)[1]
189 | thetaM <- matrix(NA, ncol=2, nrow=d)
190 | xs <- theta_hat1 <- matrix(0, nrow=n, ncol=d)
191 | med <- matrix(NA, ncol=d, nrow=R)
192 | for(i in 1:R){
193 | ## bootstrap sample
194 | s1 <- x[sample(1:n, replace=TRUE), ]
195 | ## imputation
196 | if(method %in% c("median", "mean")){
197 | simp <- impute(s1, what=method)
198 | }
199 | if(method == "kNN"){
200 | simp <- knn(x)$data
201 | }
202 | ## estimation
203 | med[i, ] <- apply(simp, 2, theta)
204 | }
205 | ## bootstrap mean (for each column)
206 | thetaHat <- apply(med, 2, mean)
207 | ## confidence intervals
208 | for(i in 1:d){
209 | thetaM[i,1] <- quantile(med[,i], 0.025)
210 | thetaM[i,2] <- quantile(med[,i], 0.975)
211 | }
212 | ## results
213 | res <- list(thetaHat = thetaHat,
214 | ci = thetaM)
215 | return(res)
216 | }
217 |
218 | ## ---- cache=TRUE, message=FALSE, warning=FALSE, eval=FALSE---------------
219 | ## ci <- quantile(replicate(10000, mean(sleep[sample(1:n, replace = TRUE), "sleep"])), c(0.025, 0.975))
220 | ## ci
221 |
222 | ## ---- eval=FALSE---------------------------------------------------------
223 | ## for(i in 1:ncol(x)){
224 | ## indmiss <- sample(1:nrow(x), round(sample(nrow(x)/10)), replace = TRUE)
225 | ## x[indmiss, i] <- NA
226 | ## }
227 |
228 | ## ---- cache=TRUE, message=FALSE, warning=FALSE---------------------------
229 | n <- nrow(sleep)
230 | imputed <- kNN(sleep, imp_var = FALSE)
231 | ci_single <- quantile(replicate(10000, mean(imputed[sample(1:n, replace = TRUE), "Sleep"])), c(0.025, 0.975))
232 | ci_single
233 |
234 | ## ---- cache=TRUE, message=FALSE, warning=FALSE, results='hide'-----------
235 | ci_boot <- quantile(replicate(10000, mean(kNN(sleep[sample(1:n, replace = TRUE), ], imp_var = FALSE)$Sleep)), c(0.025, 0.975))
236 |
237 | ## ------------------------------------------------------------------------
238 | ci_boot
239 |
240 | ## ------------------------------------------------------------------------
241 | set.seed(123)
242 | tseries <- rnorm(50)
243 | ## introduce auto-correlation
244 | tseries[-1] <- tseries[-1] + tseries[-50]
245 |
246 | ## ----B05113_08_014-------------------------------------------------------
247 | plot(ts(tseries), ylab = "values")
248 |
249 | ## ------------------------------------------------------------------------
250 | mbb <- function(x, R=1000, blocksize=6){
251 | ## initialization
252 | nc <- length(x)
253 | lagCorMBB <- ct <- numeric(R)
254 | seriesBl <- numeric(nc)
255 | ## function for moving blocks bootstrap
256 | corT <- function(x=tseries, N=nc, bl=blocksize){
257 | ## for N/bl blocks
258 | for(i in 1:ceiling(N/bl)) {
259 | ## endpoint of block
260 | endpoint <- sample(bl:N, size=1)
261 | ## put blocks together, bootstrap sample
262 | seriesBl[(i-1)*bl+1:bl] <- x[endpoint-(bl:1)+1]
263 | }
264 | seriesBl <- seriesBl[1:N]
265 | ## autocorrelation
266 | a <- cor(seriesBl[-1],seriesBl[-N])
267 | return(a)
268 | }
269 | ct <- replicate(R, corT(x))
270 | return(ct)
271 | }
272 |
273 | ## ------------------------------------------------------------------------
274 | mb <- mbb(x=tseries, R=10000)
275 | ## first 10 boostrap replicates
276 | mb[1:10]
277 | ## auto-correlation cofficient from mean of bootstrap replicates
278 | mean(mb)
279 |
280 | ## ---- fig.keep='none'----------------------------------------------------
281 | acf(tseries)$acf[2]
282 |
283 | ## ------------------------------------------------------------------------
284 | qu_mbb <- quantile(mb, c(0.025,0.975))
285 | cat("CI(mbb) : [", round(qu_mbb[1], 2), ",", round(qu_mbb[2], 2), "]")
286 |
287 | ## ---- fig.keep='none', cache=TRUE, warning=FALSE, message=FALSE----------
288 | library("forecast")
289 | ac <- taperedacf(tseries)
290 | cat("CI(classical) : [", round(ac$lower[1], 2), ",", round(ac$upper[1], 2), "]")
291 |
292 | ## ---- tidy=FALSE---------------------------------------------------------
293 | x <- data.frame("location" = rep("Asten", 8),
294 | "income" = c(2000,2500,2000,2500,1000,1500,2000,2500),
295 | "weight" = c(1000,500,1000,500,1500,1000,1000,2000))
296 | x
297 |
298 | ## ------------------------------------------------------------------------
299 | sum(x$weight)
300 |
301 | ## ------------------------------------------------------------------------
302 | sum(x$income * x$weight)
303 |
304 | ## ------------------------------------------------------------------------
305 | set.seed(123)
306 | y <- x[sample(1:8, replace = TRUE), ] # Bootstrap Sample
307 | y
308 |
309 | ## ------------------------------------------------------------------------
310 | # non-calibrated estimation
311 | sum(y$income * y$weight)
312 |
313 | ## ------------------------------------------------------------------------
314 | sum(y$weight)
315 |
316 | ## ------------------------------------------------------------------------
317 | constant <- sum(x$weight) / sum(y$weight)
318 | ## calibrated estimation
319 | sum(y$x * y$w * constant)
320 |
321 | ## ------------------------------------------------------------------------
322 | library("laeken")
323 | data("eusilc")
324 | ## point estimate of poverty rate
325 | a <- arpr("eqIncome", weights = "rb050", data = eusilc)
326 | ## bootstrap with calibration
327 | ## define auxiliary 0-1 variables for regions
328 | aux <- sapply(levels(eusilc$db040),
329 | function(l, x) as.numeric(x == l),
330 | x = eusilc$db040)
331 | ## retrieve population totals from underlying sample
332 | totals <- sapply(levels(eusilc$db040),
333 | function(l, x, w) sum(w[x == l]),
334 | x = eusilc$db040, w = eusilc$rb050)
335 | # bootstrap variance
336 | variance("eqIncome", weights = "rb050", design = "db040",
337 | data = eusilc, indicator = a, X = aux, totals = totals,
338 | seed = 123)
339 |
340 | ## ------------------------------------------------------------------------
341 | temp <- read.table("http://venus.unive.it/romanaz/statistics/data/bodytemp.txt", header = TRUE)
342 | temp$gen <- factor(temp$gen, labels = c("male", "female"))
343 | str(temp)
344 |
345 | ## ------------------------------------------------------------------------
346 | temp$celsius <- (temp$tf - 32) * 5 / 9
347 |
348 | ## ----B05113_08_015-------------------------------------------------------
349 | library("ggplot2")
350 | ggplot(temp, aes(x = celsius, colour = gen, linetype = gen)) + geom_density(size = 1.2) + theme(text = element_text(size=16)) + theme_bw()
351 |
352 | ## ------------------------------------------------------------------------
353 | temperature <- temp$celsius
354 |
355 | ## ------------------------------------------------------------------------
356 | n <- length(temperature)
357 | temperature <- sort(temperature)
358 | y <- (0:(n-1)) / n
359 |
360 | ## ----B05113_08_016-------------------------------------------------------
361 | plot(temperature, y, pch=20, cex = 0.3)
362 | lines(temperature, y, type="S")
363 |
364 | ## ----B05113_08_017-------------------------------------------------------
365 | plot(temperature, y, type="S")
366 | m <- mean(temperature)
367 | s <- sd(temperature)
368 | yn <- pnorm(temperature, mean = m, sd = s)
369 | lines(temperature, yn, col=2)
370 |
371 | ## ------------------------------------------------------------------------
372 | z <- round(sort(rnorm(n, mean = m, sd = s)), 1)
373 |
374 | ## ----B05113_08_018-------------------------------------------------------
375 | set.seed(123)
376 | plot(temperature, y, type="S")
377 | for(k in 1:100){
378 | z <- rnorm(n, mean=m, sd=s)
379 | lines(sort(z), y, type="S", col="green")
380 | }
381 |
382 | ## ------------------------------------------------------------------------
383 | Z <- NULL
384 | for(k in 1:1000){
385 | z = rnorm(n, mean = m, sd = s)
386 | Z = cbind(Z, sort(z))
387 | }
388 | dim(Z)
389 |
390 | ## ------------------------------------------------------------------------
391 | ## mean of original temperature data
392 | m
393 | ## simulated mean
394 | (mean(Z[65, ]) + mean(Z[66, ])) / 2
395 | ## simulated median
396 | (median(Z[65, ]) + median(Z[66, ])) / 2
397 |
398 | ## ------------------------------------------------------------------------
399 | plot(temperature, y, type="S")
400 | middle <- apply(Z, 1, median)
401 | lines(middle, y, col = "blue", lwd = 2, type = "S")
402 | ## lower and upper bounds
403 | lower <- apply(Z, 1, quantile, prob = 0.025)
404 | upper <- apply(Z, 1, quantile, prob = 0.975)
405 | lines(lower, y, col = 2, lwd = 2, type = "S")
406 | lines(upper, y, col = 2, lwd = 2, type = "S")
407 |
408 | ## ----B05113_08_020-------------------------------------------------------
409 | par(mfrow = c(2,1), mar = rep(1,4))
410 | plot(temperature, y, type="S")
411 | lines(temperature, yn, col=2)
412 | lines(lower, y, col=2, lwd=2, type="S")
413 | lines(upper, y, col=2, lwd=2, type="S")
414 | plot(temperature, y - yn, type="h")
415 | abline(h = 0)
416 | ## maximum deviation
417 | D <- max(abs(y - yn))
418 | w <- which.max(abs(y - yn))
419 | points(temperature[w], y[w] - yn[w], col=2, pch=16, cex=3)
420 |
421 | ## ------------------------------------------------------------------------
422 | ## theoretical distribution
423 | Z1 <- pnorm(Z, mean = m, sd = s)
424 | ## y will be recycled column-wise,
425 | ## extract the maximum for each column
426 | D1 <- apply(abs(y - Z1), 2, max)
427 |
428 | ## ------------------------------------------------------------------------
429 | summary(D1)
430 | D
431 |
432 | ## ------------------------------------------------------------------------
433 | mean(D1>D)
434 |
435 | ## ---- message=FALSE, warning=FALSE---------------------------------------
436 | ks.test(temperature, "pnorm", mean = m, sd = s)
437 |
438 | ## ------------------------------------------------------------------------
439 | data(Duncan, package = "car")
440 | x <- subset(Duncan, type %in% c("bc", "wc"), select = c("income", "type"))
441 | x$type <- factor(x$type)
442 | ## first four observations on income and type
443 | head(x, 4)
444 |
445 | ## ------------------------------------------------------------------------
446 | t.test(income ~ type, data=x)
447 |
448 | ## ------------------------------------------------------------------------
449 | ## first 6 observations with permuted grouping structure
450 | head(cbind(x, "p1" = sample(x$type),
451 | "p2" = sample(x$type),
452 | "p3" = sample(x$type)))
453 |
454 | ## ------------------------------------------------------------------------
455 | ## define test statistics (workhorse)
456 | teststat <- function(vals, group, lev){
457 | g <- sample(group)
458 | abs(mean(vals[g == lev[1]]) - mean(vals[g == lev[2]]))
459 | }
460 | ## permutation test
461 | permtest <- function(x, g, R = 1000, conf.level = 0.95){
462 | ## levels of the group vector
463 | lg <- levels(g)
464 | ## test statistics for original groups
465 | mdiff <- abs(mean(x[g==lg[1]]) - mean(x[g==lg[2]]))
466 | ## test statistics for permuted group data
467 | z <- replicate(R, teststat(x, g, lg))
468 | ## make nice print output
469 | DATA <- paste(deparse(substitute(x)),
470 | "by",
471 | deparse(substitute(g)))
472 | alpha <- 1 - conf.level
473 | conf.int <- quantile(z, prob = c(alpha/2, (1 - alpha)/2))
474 | attr(conf.int, "conf.level") <- conf.level
475 | res <- list(statistic=c(mdiff = mdiff),
476 | p.value = mean(abs(z) > abs(mdiff)),
477 | parameter = c(nrep = R),
478 | conf.int = conf.int,
479 | data.name = DATA,
480 | method = "Permutation test for difference in means")
481 | class(res) <- "htest"
482 | res
483 | }
484 |
485 | ## ------------------------------------------------------------------------
486 | permtest(x$income, x$type, R = 10000)
487 |
488 | ## ------------------------------------------------------------------------
489 | data(Duncan, package = "car")
490 | pairwise.t.test(Duncan$income, Duncan$type)
491 |
492 | ## ---- message=FALSE, warning=FALSE---------------------------------------
493 | mean(Duncan$income)
494 | library("dplyr")
495 | Duncan %>% group_by(type) %>% summarize(mean = mean(income))
496 |
497 | ## ------------------------------------------------------------------------
498 | tstat <- function(x, mu=0){
499 | (mean(x)-mu) / (sd(x) / sqrt(length(x)))
500 | }
501 | stats <- tapply(Duncan$income, Duncan$type, tstat, mu=mean(Duncan$income))
502 | stat <- max(abs(stats))
503 | stat
504 |
505 | ## ------------------------------------------------------------------------
506 | maxt.test <- function(x, g, R = 10000, conf.level = 0.05){
507 | m <- mean(x)
508 | stat <- tapply(x, g, tstat, mu = m)
509 | stat <- max(abs(stat))
510 | gsize = table(g)
511 | z <- NULL
512 | for(k in 1:length(gsize)){
513 | ## from a t-distribution:
514 | z <- cbind(z, rt(n=n, df=gsize[k]-1))
515 | }
516 | ## z now is a list with length(gsize) elements
517 | ## we need the maximum absolute value for each element
518 | z <- abs(z)
519 | z <- z[cbind(1:n,max.col(z))]
520 | ## make nice print output
521 | DATA <- paste(deparse(substitute(x)),
522 | "by",
523 | deparse(substitute(g)))
524 | alpha <- 1 - conf.level
525 | conf.int <- quantile(z, prob = c(alpha/2, (1 - alpha)/2))
526 | attr(conf.int, "conf.level") <- conf.level
527 | res <- list(statistic=c(stat = stat),
528 | p.value = mean(z > stat),
529 | parameter = c(nrep = R),
530 | conf.int = conf.int,
531 | data.name = DATA,
532 | method = "Maximum t-test")
533 | class(res) <- "htest"
534 | res
535 | }
536 |
537 | ## ------------------------------------------------------------------------
538 | maxt.test(Duncan$income, Duncan$type)
539 |
540 | ## ------------------------------------------------------------------------
541 | maxp.test <- function(x, g, R = 10000, conf.level = 0.05){
542 | m <- mean(x)
543 | stat <- tapply(x, g, tstat, mu=m)
544 | stat <- max(abs(stat))
545 | z <- numeric(n)
546 | for(k in 1:n){
547 | g1 <- sample(g)
548 | z[k] <- max(abs(tapply(x, g1, tstat, mu = m)))
549 | }
550 |
551 | retval <- list(tstat=stat, pval=mean(z>stat),
552 | name="Permutation maximum t-test")
553 | class(retval) <- "ttest"
554 | retval
555 | ## make nice print output
556 | DATA <- paste(deparse(substitute(x)),
557 | "by",
558 | deparse(substitute(g)))
559 | alpha <- 1 - conf.level
560 | conf.int <- quantile(z, prob = c(alpha/2, (1 - alpha)/2))
561 | attr(conf.int, "conf.level") <- conf.level
562 | res <- list(statistic=c(stat = stat),
563 | p.value = mean(z > stat),
564 | parameter = c(nrep = R),
565 | conf.int = conf.int,
566 | data.name = DATA,
567 | method = "Permutation maximum test")
568 | class(res) <- "htest"
569 | res
570 | }
571 |
572 | ## ------------------------------------------------------------------------
573 | maxp.test(Duncan$income, Duncan$type)
574 |
575 | ## ------------------------------------------------------------------------
576 | boottest <- function(x, g, n=10000){
577 | lg <- levels(g)
578 | n1 <- length(x[g == lg[1]])
579 | N <- length(x)
580 | mdiff <- abs(mean(x[g == lg[1]]) - mean(x[g == lg[2]]))
581 | z <- double(n)
582 | for(k in 1:n){
583 | x1 <- sample(x, replace=TRUE)
584 | z[k] <- abs(mean(x1[1:n1]) - mean(x1[(n1+1):N]))
585 | }
586 | mean( z > mdiff )
587 | }
588 |
589 | ## ------------------------------------------------------------------------
590 | Duncan$type <- factor(Duncan$type)
591 | boottest(Duncan$income, Duncan$type)
592 |
593 | ## ------------------------------------------------------------------------
594 | mvad.test <- function(x, R=1000){
595 | n <- nrow(x)
596 | ## test statistics
597 | stat <- function(x, N = n){
598 | cmean <- colMeans(x)
599 | cvar <- var(x)
600 | u <- mahalanobis(x, center = cmean, cov = cvar)
601 | z <- pchisq(u, ncol(x))
602 | p <- sort(z)
603 | h <- (2 * seq(1:N) - 1) * (log(p) + log(1 - rev(p)))
604 | A <- -N - mean(h)
605 | return(A)
606 | }
607 | ## value of test statistics for original sample
608 | A <- stat(x)
609 | cmean <- colMeans(x)
610 | cvar <- var(x)
611 | p <- numeric(R)
612 | ## values of test statistics for draws of mvn
613 | p <- replicate(R, stat(mvrnorm(n, cmean, cvar)))
614 | pvalue <- mean(p > A)
615 | RVAL <- list(statistic = c(A = A),
616 | method = "A-D radius test",
617 | p.value = pvalue)
618 | class(RVAL) <- "htest"
619 | RVAL
620 | }
621 |
622 | ## ---- eval=FALSE---------------------------------------------------------
623 | ## library("MASS")
624 | ## set.seed(123)
625 | ## r <- replicate(1000, mvad.test(mvrnorm(100, mu=rep(0,3),
626 | ## Sigma=diag(3)))$p.value)
627 | ## size <- mean(r < 0.05)
628 |
629 | ## ---- echo=FALSE---------------------------------------------------------
630 | size = 0.05
631 |
632 | ## ------------------------------------------------------------------------
633 | size
634 |
635 | ## ---- cache = TRUE-------------------------------------------------------
636 | library("mvtnorm")
637 | library("ICS")
638 | ## Monte Carlo AD test 100 times replicted
639 | r <- replicate(100, mvad.test(rmvt(30, diag(3), df = 5), R=100)$p.value)
640 | mean(r < 0.05)
641 | ## Skewness test 1000 times replicted
642 | r2 <- replicate(1000, mvnorm.skew.test(rmvt(30, diag(3), df = 5))$p.value)
643 | mean(r2 < 0.05)
644 |
645 | ## ------------------------------------------------------------------------
646 | sessionInfo()
647 |
648 |
--------------------------------------------------------------------------------
/Chapter 2/chapter2.R:
--------------------------------------------------------------------------------
1 | ## ---- echo=FALSE, message=FALSE, warning=FALSE---------------------------
2 | library(ggplot2)
3 |
4 | ## ------------------------------------------------------------------------
5 | 5 + 2 * log(3 * 3)
6 |
7 | ## ------------------------------------------------------------------------
8 | mean(rnorm(10))
9 |
10 | ## ---- eval=FALSE---------------------------------------------------------
11 | ## res1 <- name_of_function(v1) # an input argument
12 | ## res2 <- name_of_function(v1, v2) # two input arguments
13 | ## res3 <- name_of_function(v1, v2, v3) # three input arguments
14 |
15 | ## ------------------------------------------------------------------------
16 | x <- rnorm(5)
17 | x
18 |
19 | ## ------------------------------------------------------------------------
20 | options(digits = 4)
21 | x
22 |
23 | ## ---- eval=FALSE---------------------------------------------------------
24 | ## install.packages("dplyr")
25 |
26 | ## ---- eval=FALSE---------------------------------------------------------
27 | ## library("dplyr")
28 |
29 | ## ---- eval=FALSE---------------------------------------------------------
30 | ## install.packages("devtools")
31 | ## library("devtools")
32 | ## install_github("hadley/dplyr")
33 |
34 | ## ---- eval=FALSE---------------------------------------------------------
35 | ## help.start()
36 |
37 | ## ---- eval=FALSE---------------------------------------------------------
38 | ## help(package="dplyr")
39 |
40 | ## ---- eval=FALSE---------------------------------------------------------
41 | ## ?group_by
42 |
43 | ## ---- eval=FALSE---------------------------------------------------------
44 | ## data(Cars93, package = "MASS")
45 |
46 | ## ---- eval=FALSE---------------------------------------------------------
47 | ## help.search("histogram")
48 |
49 | ## ---- eval=FALSE, echo=TRUE----------------------------------------------
50 | ## apropos("hist")
51 |
52 | ## ---- eval=FALSE---------------------------------------------------------
53 | ## RSiteSearch("group by factor")
54 |
55 | ## ------------------------------------------------------------------------
56 | ls()
57 |
58 | ## ------------------------------------------------------------------------
59 | getwd()
60 |
61 | ## ------------------------------------------------------------------------
62 | v.num <- c(1,3,5.9,7)
63 | v.num
64 | is.numeric (v.num)
65 |
66 | ## ------------------------------------------------------------------------
67 | v.num > 3
68 |
69 | ## ---- warning=FALSE, message=FALSE---------------------------------------
70 | v1 <- c(1,2,3)
71 | v2 <- c(4,5)
72 | v1 + v2
73 |
74 | ## ------------------------------------------------------------------------
75 | v2 <- c (100, TRUE, "A", FALSE)
76 | v2
77 | is.numeric (v2)
78 |
79 | ## ---- eval=TRUE----------------------------------------------------------
80 | data(Cars93, package = "MASS")
81 | # extract a subset of variable Horsepower from Cars93
82 | hp <- Cars93[1:10, "Horsepower"]
83 | hp
84 | # positive indexing:
85 | hp[c(1,6)]
86 | # negative indexing:
87 | hp[-c(2:5,7:10)]
88 | # logical indexing:
89 | hp < 150
90 | # a logical expression can be written directly in []
91 | hp[hp < 150]
92 |
93 | ## ------------------------------------------------------------------------
94 | class(Cars93)
95 | class(Cars93$Cylinders)
96 | levels(Cars93$Cylinders)
97 | summary(Cars93$Cylinders)
98 |
99 | ## ------------------------------------------------------------------------
100 | model <- lm(Price ~ Cylinders + Type + EngineSize + Origin, data = Cars93)
101 | ## result is a list
102 | class(model)
103 | ## access elements from the named list with the dollar sign
104 | model$coefficients
105 |
106 | ## ---- tidy=FALSE---------------------------------------------------------
107 | ## extract cars with small number of cylinders and small power
108 | w <- Cars93$Cylinders %in% c("3", "4") & Cars93$Horsepower < 80
109 | str(Cars93[w, ])
110 |
111 | ## ------------------------------------------------------------------------
112 | library("vcd")
113 | data(PreSex)
114 | PreSex
115 |
116 | ## ------------------------------------------------------------------------
117 | PreSex[, , 1, 2]
118 |
119 | ## ------------------------------------------------------------------------
120 | PreSex[, , "Yes", "Men"]
121 |
122 | ## ------------------------------------------------------------------------
123 | sum(is.na(Cars93))
124 |
125 | ## ----B05113_02_01, message = FALSE, warning=FALSE------------------------
126 | require("VIM")
127 | # colnames(Cars93) <- substr(colnames(Cars93), 1, 8)
128 | par(mar=c(7,3,0.1,0.1))
129 | matrixplot(Cars93, sortby = "Weight", cex.axis=0.9)
130 |
131 | ## ---- eval=FALSE, message=FALSE, warning=FALSE---------------------------
132 | ## m <- robCompositions::missPatterns(Cars93)
133 |
134 | ## ------------------------------------------------------------------------
135 | ## how often summary is overloaded with methods
136 | ## on summary for certain classes
137 | length(methods(summary))
138 | class(Cars93$Cylinders)
139 | summary(Cars93$Cylinders)
140 | ## just to see the difference, convert to class character:
141 | summary(as.character(Cars93$Cylinders))
142 |
143 | ## ------------------------------------------------------------------------
144 | ## function to be applied afterwards
145 | func <- function(x){
146 | return(sum(is.na(x)))
147 | }
148 | ## apply func on all columns (second dimension (2))
149 | ## and store it to na
150 | na <- apply(X = Cars93, MARGIN = 2, FUN = func)
151 | ## show those with missings
152 | na[ na > 0 ]
153 |
154 | ## ------------------------------------------------------------------------
155 | p <- ncol(Cars93)
156 | na_for <- numeric(p)
157 | for(i in 1:p){
158 | na_for[i] <- func(Cars93[, i])
159 | }
160 |
161 | identical(as.numeric(na), na_for)
162 |
163 | ## ------------------------------------------------------------------------
164 | m <- robCompositions::missPatterns(Cars93)
165 | class(m)
166 |
167 | ## ------------------------------------------------------------------------
168 | lapply(m, length)
169 |
170 | ## ---- tidy=TRUE----------------------------------------------------------
171 | s <- sapply(m, length)
172 |
173 | ## ------------------------------------------------------------------------
174 | s
175 | class(s)
176 |
177 | ## ------------------------------------------------------------------------
178 | args(aggregate)
179 |
180 | ## ------------------------------------------------------------------------
181 | methods(aggregate)
182 |
183 | args(aggregate.data.frame)
184 |
185 | ## ------------------------------------------------------------------------
186 | aggregate(Cars93[, c("Horsepower", "Weight")], by = list(Cars93$Cylinders), median)
187 |
188 | ## ---- eval = TRUE, echo = TRUE, message=FALSE, warning=FALSE-------------
189 | library("dplyr")
190 |
191 | ## ---- eval = TRUE, echo = TRUE-------------------------------------------
192 | class (Cars93)
193 |
194 | ## ---- eval = TRUE, echo = TRUE-------------------------------------------
195 | Cars93 <- tbl_df(Cars93)
196 | class(Cars93)
197 |
198 | ## ---- eval = FALSE, echo = TRUE, results='hide'--------------------------
199 | ## print(Cars93) # output suppressed
200 |
201 | ## ---- eval = TRUE, echo = TRUE, results='hide'---------------------------
202 | slice(Cars93, 1) # first line, output suppressed
203 |
204 | ## ---- eval = TRUE, echo = TRUE, tidy=FALSE-------------------------------
205 | slice (Cars93, c(1,4,10,15, n()))
206 |
207 | ## ---- eval = TRUE, echo = TRUE, tidy=FALSE-------------------------------
208 | filter(Cars93, Manufacturer == "Audi" & Min.Price > 25)
209 |
210 | ## ---- eval = TRUE, echo = TRUE, results='hide'---------------------------
211 | Cars93 <- arrange (Cars93, Price)
212 | Cars93 ## output suppressed
213 |
214 | ## ---- eval = TRUE, echo = TRUE, tidy=FALSE-------------------------------
215 | head(arrange(Cars93, desc (MPG.city), Max.Price), 7)
216 |
217 | ## ---- eval = TRUE, echo = TRUE-------------------------------------------
218 | head (select (Cars93, Manufacturer, Price), 3)
219 |
220 | ## ---- eval = TRUE, echo = TRUE-------------------------------------------
221 | head(select(Cars93, Manufacturer:Price), 3)
222 |
223 | ## ---- eval = TRUE, echo = TRUE, results='hide'---------------------------
224 | select(Cars93, -Min.Price, -Max.Price) # output suppressed
225 |
226 | ## ---- eval = TRUE, echo = TRUE-------------------------------------------
227 | head(select(Cars93, starts_with ("Man")), 3)
228 |
229 | ## ---- eval = TRUE, echo = TRUE-------------------------------------------
230 | head(select(Cars93, contains ("Price")), 3)
231 |
232 | ## ---- eval = TRUE, echo = TRUE-------------------------------------------
233 | head (select (Cars93, myPrize = Price, Min.Price))
234 |
235 | ## ---- eval = TRUE, echo = TRUE-------------------------------------------
236 | Cars93_1 <- select(Cars93, Manufacturer, EngineSize)
237 | dim (Cars93_1)
238 | Cars93_1 <- distinct(Cars93_1)
239 | dim (Cars93_1)
240 |
241 | ## ---- eval = TRUE, echo = TRUE-------------------------------------------
242 | dim(Cars93)
243 | dim( distinct (Cars93, Manufacturer) )
244 | # based on two variables, second is rounded:
245 | dim(distinct(Cars93, Manufacturer, EngineSize))
246 | # based on two variables, second is rounded:
247 | dim(distinct(Cars93, Manufacturer, rr=round(EngineSize)))
248 |
249 | ## ---- eval = TRUE, echo = TRUE-------------------------------------------
250 | m <- mutate(Cars93, is_ford = Manufacturer == "Ford")
251 | m[1:3, c(1,28)]
252 |
253 | ## ---- eval = TRUE, echo = TRUE, results='hide'---------------------------
254 | transmute(Cars93, is_ford = Manufacturer == "Ford", Manufacturer)
255 |
256 | ## ---- eval = TRUE, echo = TRUE-------------------------------------------
257 | head (transmute(Cars93, Manufacturer, is_ford = Manufacturer == "Ford", num_ford = ifelse (is_ford, -1, 1)), 3)
258 |
259 | ## ---- eval = TRUE, echo = TRUE-------------------------------------------
260 | by_type <- group_by (Cars93, Type)
261 | summarize (by_type,
262 | count = n(),
263 | min_es = min(EngineSize),
264 | max_es = max(EngineSize)
265 | )
266 |
267 | ## ---- results='hide'-----------------------------------------------------
268 | Cars93 %>%
269 | group_by(Type) %>%
270 | summarize(count = n(), min_es = min(EngineSize), max_es = max(EngineSize) )
271 | ## output suppressed since equally to previous output
272 |
273 | ## ---- eval = TRUE, echo = TRUE, tidy=FALSE-------------------------------
274 | by_type <- group_by(Cars93, Type)
275 | slice(by_type, 1: 2)
276 |
277 | ## ---- echo = TRUE, results='hide'----------------------------------------
278 | ## output suppressed since the same as above
279 | Cars93 %>% group_by(Type) %>% slice(1:2)
280 |
281 | ## ---- eval = TRUE, echo = TRUE-------------------------------------------
282 | Cars93 %>%
283 | mutate(ES2 = EngineSize^2) %>%
284 | group_by(Type) %>%
285 | summarize(min.ES2 = min(ES2)) %>%
286 | arrange(desc(min.ES2))
287 |
288 | ## ---- eval = TRUE, echo = TRUE, tidy=FALSE-------------------------------
289 | Cars93 %>%
290 | group_by(Type) %>%
291 | arrange(Type) %>%
292 | select(Manufacturer:Price) %>%
293 | mutate(cmean = cummean(Price), csum = cumsum(Price))
294 |
295 | ## ----eval=TRUE, echo=TRUE, results='hide'--------------------------------
296 | require(data.table)
297 | Cars93 <- data.table(Cars93)
298 | Cars93 ## print output suppressed
299 |
300 | ## ----eval=TRUE, echo=TRUE------------------------------------------------
301 | tables()
302 |
303 | ## ----eval=TRUE, echo=TRUE------------------------------------------------
304 | Cars93$tmp1 <- Cars93[, j = Manufacturer == "Ford"]
305 |
306 | ## ----eval=TRUE, echo=TRUE------------------------------------------------
307 | Cars93[, tmp2 := rnorm(nrow(Cars93))]
308 |
309 | ## ----eval=TRUE, echo=TRUE------------------------------------------------
310 | Cars93[, tmp1:=NULL]
311 | Cars93$tmp2 <- NULL
312 |
313 | ## ----eval=TRUE, echo=TRUE, results='hide'--------------------------------
314 | Cars93[i = 2] # second row, all columns
315 | Cars93[i = c(1,5)] # first and fifth row, all columns
316 | Cars93[i = -c(1:5)] # exclude the first five rows
317 |
318 | ## ----eval=TRUE, echo=TRUE, tidy=FALSE------------------------------------
319 | Cars93[j = 3] # this does not work since 3 evaluates to 3
320 | Cars93[j = "Price"] # extract "Price" does not work since "Price" evaluates to "Price"
321 | Cars93[j = Price] # this works, since variable Price exists in the scope of Cars93
322 | Cars93[i=1:3, j = "Price", with = FALSE] # also works
323 |
324 | ## ----eval=TRUE, echo=TRUE------------------------------------------------
325 | Cars93[1:3, .(Price, Horsepower, Diff.Price = Max.Price - Min.Price, Mean.Price = mean(Price))]
326 |
327 | ## ----eval=TRUE, echo=TRUE------------------------------------------------
328 | setkey(Cars93, Type) # equally: setkeyv(dt, "x")
329 |
330 | ## ----eval=TRUE, echo=TRUE------------------------------------------------
331 | key(Cars93)
332 |
333 | ## ----eval=TRUE, echo=TRUE, results='hide'--------------------------------
334 | setkey(Cars93, Type)
335 | Cars93["Van"] # all rows with Type == "Van" (output suppressed)
336 |
337 | ## ----eval=TRUE, echo=TRUE------------------------------------------------
338 | setkey(Cars93, Type, DriveTrain, Origin)
339 | Cars93[.("Van", "4WD", "non-USA")]
340 |
341 | ## ----eval=TRUE, echo=TRUE, cache=TRUE------------------------------------
342 | require(microbenchmark)
343 | N <- 1000000
344 | dat<- data.table(
345 | x=sample(LETTERS[1:20], N, replace=TRUE),
346 | y=sample(letters[1:5], N, replace=TRUE))
347 | head(dat, 3)
348 |
349 | setkey(dat, x,y)
350 |
351 | microbenchmark(
352 | data.table = dat[list(c("B", "D"), c("b", "d"))],
353 | dplyr = dat %>% slice(x %in% c("B", "D") & y %in% c("b", "d")),
354 | baseR = dat[x %in% c("B", "D") & y %in% c("b", "d")]
355 | )
356 |
357 | ## ----eval=TRUE, echo=TRUE, cache=TRUE------------------------------------
358 | Cars93[, .(mean = mean(Price), IQR = IQR(Price), median = median(Price)), by = Type]
359 |
360 | ## ------------------------------------------------------------------------
361 | data(Cars93, package = "MASS")
362 | set.seed(123)
363 | system.time(lm(Price ~ Horsepower + Weight + Type + Origin, data=Cars93))
364 |
365 | ## ------------------------------------------------------------------------
366 | library("robustbase")
367 | system.time(lmrob(Price ~ Horsepower + Weight + Type + Origin, data=Cars93))
368 |
369 | ## ---- tidy=FALSE---------------------------------------------------------
370 | ptm <- proc.time()
371 | robustbase::lmrob(Price ~ Horsepower + Weight + Type + Origin, data=Cars93)
372 | proc.time() - ptm
373 |
374 | ## ---- cache=TRUE---------------------------------------------------------
375 | require("robustbase")
376 | s1 <- system.time(replicate(100, lm(Price ~ Horsepower + Weight + Type + Origin, data=Cars93)))[3]
377 | s2 <- system.time(replicate(100, lmrob(Price ~ Horsepower + Weight + Type + Origin, data=Cars93)))[3]
378 | (s2 - s1)/s1
379 |
380 | ## ------------------------------------------------------------------------
381 | Rprof("Prestige.lm.out")
382 | invisible(replicate(100,
383 | lm(Price ~ Horsepower + Weight + Type + Origin, data=Cars93)))
384 | Rprof(NULL)
385 | summaryRprof("Prestige.lm.out")$by.self
386 |
387 | ## ---- results='hide', warning=FALSE, message=FALSE-----------------------
388 | require(profr)
389 | parse_rprof("Prestige.lm.out")
390 |
391 | ## ---- warning=FALSE, message=FALSE---------------------------------------
392 | library(microbenchmark); library(plyr); library(dplyr); library(data.table); library(Hmisc)
393 |
394 | ## ---- tidy = FALSE-------------------------------------------------------
395 | data(Cars93, package = "MASS")
396 | Cars93 %>% group_by(Type, Origin) %>% summarise(mean = mean(Horsepower))
397 |
398 | ## ------------------------------------------------------------------------
399 | meanFor <- function(x){
400 | sum <- 0
401 | for(i in 1:length(x)) sum <- sum + x[i]
402 | sum / length(x)
403 | }
404 |
405 | ## groupwise statistics
406 | myfun1 <- function(x, gr1, gr2, num){
407 | x[,gr1] <- as.factor(x[,gr1])
408 | x[,gr2] <- as.factor(x[,gr2])
409 | l1 <- length(levels(x[,gr1]))
410 | l2 <- length(levels(x[,gr1]))
411 | gr <- numeric(l1*l2)
412 | c1 <- c2 <- character(l1*l2)
413 | ii <- jj <- 0
414 | for(i in levels(x[,gr1])){
415 | for(j in levels(x[,gr2])){
416 | ii <- ii + 1
417 | c1[ii] <- i
418 | c2[ii] <- j
419 | vec <- x[x[,gr2] == j & x[,gr1] == i, num]
420 | if(length(vec) > 0) gr[ii] <- meanFor(vec)
421 | }
422 | }
423 |
424 | df <- data.frame(cbind(c1, c2))
425 | df <- cbind(df, gr)
426 | colnames(df) <- c(gr1,gr2,paste("mean(", num, ")"))
427 | df
428 | }
429 |
430 | ## groupwise using mean()
431 | ## attention mean.default is faster
432 | myfun2 <- function(x, gr1, gr2, num){
433 | x[,gr1] <- as.factor(x[,gr1])
434 | x[,gr2] <- as.factor(x[,gr2])
435 | l1 <- length(levels(x[,gr1]))
436 | l2 <- length(levels(x[,gr1]))
437 | gr <- numeric(l1*l2)
438 | c1 <- c2 <- character(l1*l2)
439 | ii <- jj <- 0
440 | for(i in levels(x[,gr1])){
441 | for(j in levels(x[,gr2])){
442 | ii <- ii + 1
443 | c1[ii] <- i
444 | c2[ii] <- j
445 | gr[ii] <- mean(x[x[,gr2] == j & x[,gr1] == i, num])
446 | }
447 | }
448 |
449 | df <- data.frame(cbind(c1, c2))
450 | df <- cbind(df, gr)
451 | colnames(df) <- c(gr1,gr2,paste("mean(", num, ")"))
452 | df
453 | }
454 |
455 | ## ------------------------------------------------------------------------
456 | Cars93dt <- data.table(Cars93)
457 |
458 | ## ---- tidy=FALSE---------------------------------------------------------
459 | op <- microbenchmark(
460 | ## pure for loops
461 | MYFUN1 = myfun1(x=Cars93, gr1="Type", gr2="Origin",
462 | num="Horsepower"),
463 | ## pure for loops but using mean
464 | MYFUN2 = myfun2(x=Cars93, gr1="Type", gr2="Origin",
465 | num="Horsepower"),
466 | ## plyr
467 | PLYR = ddply(Cars93, .(Type, Origin), summarise,
468 | output = mean(Horsepower)),
469 | ## base R's aggregate and by
470 | AGGR = aggregate(Horsepower ~ Type + Origin, Cars93, mean),
471 | BY = by(Cars93$Horsepower,
472 | list(Cars93$Type,Cars93$Origin), mean),
473 | ## Hmisc's summarize
474 | SUMMARIZE = summarize(Cars93$Horsepower,
475 | llist(Cars93$Type,Cars93$Origin), mean),
476 | ## base R's tapply
477 | TAPPLY = tapply(Cars93$Horsepower,
478 | interaction(Cars93$Type, Cars93$Origin), mean),
479 | ## dplyr
480 | DPLYR = summarise(group_by(Cars93, Type, Origin),
481 | mean(Horsepower)),
482 | ## data.table
483 | DATATABLE = Cars93dt[, aggGroup1.2 := mean(Horsepower),
484 | by = list(Type, Origin)],
485 | times=1000L)
486 |
487 | ## ----B05113_02_02--------------------------------------------------------
488 | m <- reshape2::melt(op, id="expr")
489 | ggplot(m, aes(x=expr, y=value)) +
490 | geom_boxplot() +
491 | coord_trans(y = "log10") +
492 | xlab(NULL) + ylab("computation time") +
493 | theme(axis.text.x = element_text(angle=45))
494 |
495 | ## ---- cache = TRUE, warning=FALSE, message=FALSE-------------------------
496 | library(laeken); data(eusilc)
497 | eusilc <- do.call(rbind,
498 | list(eusilc,eusilc,eusilc,eusilc,eusilc,eusilc,eusilc))
499 | eusilc <- do.call(rbind,
500 | list(eusilc,eusilc,eusilc,eusilc,eusilc,eusilc,eusilc))
501 | dim(eusilc)
502 | eusilcdt <- data.table(eusilc)
503 | setkeyv(eusilcdt, c('hsize','db040'))
504 |
505 | op <- microbenchmark(
506 | MYFUN1 = myfun1(x=eusilc, gr1="hsize", gr2="db040",
507 | num="eqIncome"),
508 | MYFUN2 = myfun2(x=eusilc, gr1="hsize", gr2="db040",
509 | num="eqIncome"),
510 | PLYR = ddply(eusilc, .(hsize, db040), summarise,
511 | output = mean(eqIncome)),
512 | AGGR = aggregate(eqIncome ~ hsize + db040, eusilc, mean),
513 | BY = by(eusilc$eqIncome, list(eusilc$hsize,eusilc$db040), mean),
514 | SUMMARIZE = summarize(eusilc$eqIncome,
515 | llist(eusilc$hsize,eusilc$db040), mean),
516 | TAPPLY = tapply(eusilc$eqIncome,
517 | interaction(eusilc$hsize, eusilc$db040), mean),
518 | DPLYR = summarise(group_by(eusilc, hsize, db040),
519 | mean(eqIncome)),
520 | DATATABLE = eusilcdt[, mean(eqIncome), by = .(hsize, db040)],
521 | times=10)
522 |
523 | ## ----B05113_02_03--------------------------------------------------------
524 | m <- reshape2::melt(op, id="expr")
525 | ggplot(m, aes(x=expr, y=value)) +
526 | geom_boxplot() +
527 | coord_trans(y = "log10") +
528 | xlab(NULL) + ylab("computation time") +
529 | theme(axis.text.x = element_text(angle=45, vjust=1))
530 |
531 | ## ------------------------------------------------------------------------
532 | Rprof("aggr.out")
533 | a <- aggregate(eqIncome ~ hsize + db040, eusilc, mean)
534 | Rprof(NULL)
535 | summaryRprof("aggr.out")$by.self
536 |
537 | ## ---- cache = TRUE, message=FALSE, warning=FALSE-------------------------
538 | R <- 10000
539 | library(robustbase)
540 | covMcd(Cars93[, c("Price", "Horsepower")], cor = TRUE)$cor[1,2]
541 |
542 | ## confidence interval:
543 | n <- nrow(Cars93)
544 | f <- function(R, ...){
545 | replicate(R, covMcd(Cars93[sample(1:n, replace=TRUE),
546 | c("Price", "Horsepower")], cor = TRUE)$cor[1,2])
547 | }
548 | system.time(ci <- f(R))
549 | quantile(ci, c(0.025, 0.975))
550 |
551 | ## ------------------------------------------------------------------------
552 | library("snow")
553 | cl <- makeCluster(3, type="SOCK")
554 |
555 | ## ---- results='hide'-----------------------------------------------------
556 | clusterEvalQ(cl, library("robustbase"))
557 | clusterEvalQ(cl, data(Cars93, package = "MASS"))
558 | clusterExport(cl, "f")
559 | clusterExport(cl, "n")
560 |
561 | ## ------------------------------------------------------------------------
562 | clusterSetupRNG(cl, seed=123)
563 |
564 | ## ------------------------------------------------------------------------
565 | system.time(ci_boot <-
566 | clusterCall(cl, f, R = round(R / 3)))
567 | quantile(unlist(ci_boot), c(0.025, 0.975))
568 |
569 | ## ------------------------------------------------------------------------
570 | stopCluster(cl)
571 |
572 | ## ------------------------------------------------------------------------
573 | wmeanR <- function(x, w) {
574 | total <- 0
575 | total_w <- 0
576 | for (i in seq_along(x)) {
577 | total <- total + x[i] * w[i]
578 | total_w <- total_w + w[i]
579 | }
580 | total / total_w
581 | }
582 |
583 | ## ------------------------------------------------------------------------
584 | library("Rcpp")
585 | ## from
586 | ## http://blog.revolutionanalytics.com/2013/07/deepen-your-r-experience-with-rcpp.html
587 | cppFunction('
588 | double wmean(NumericVector x, NumericVector w) {
589 | int n = x.size();
590 | double total = 0, total_w = 0;
591 | for(int i = 0; i < n; ++i) {
592 | total += x[i] * w[i];
593 | total_w += w[i];
594 | }
595 | return total / total_w;
596 | }
597 | ')
598 |
599 | ## ---- cache = TRUE-------------------------------------------------------
600 | x <- rnorm(100000000)
601 | w <- rnorm(100000000)
602 | library("laeken")
603 | op <- microbenchmark(
604 | naiveR = wmeanR(x, w),
605 | weighted.mean = weighted.mean(x, w),
606 | weighedMean = weightedMean(x, w),
607 | Rcpp.wmean = wmean(x, w),
608 | times = 1
609 | )
610 | op
611 |
612 | ## ----B05113_02_04--------------------------------------------------------
613 | m <- reshape2::melt(op, id="expr")
614 | ggplot(m, aes(x=expr, y=value)) +
615 | geom_boxplot() +
616 | coord_trans(y = "log10") +
617 | xlab(NULL) + ylab("computation time") +
618 | theme(axis.text.x = element_text(angle=45, vjust=1))
619 |
620 | ## ---- eval = FALSE-------------------------------------------------------
621 | ## pdf(file = "myplot.pdf")
622 | ## plot(Horsepower ~ Weight, data = Cars93)
623 | ## dev.off ()
624 |
625 | ## ----addfig, fig.align='center'------------------------------------------
626 | x <- 1:20 / 2 # x ... 0.5, 1.0, 1.5, ..., 10.0
627 | y <- sin(x)
628 | plot(x, y, pch = 16, cex = 10 * abs(y), col = grey(x / 14))
629 |
630 | ## ----B05113_02_05, fig.align='center'------------------------------------
631 | plot(x, y, pch = 16, cex = 10 * abs(y), col = grey(x / 14))
632 | text(x, y, 1:20, col="yellow")
633 | curve(sin, -2 * pi, 4 * pi, add = TRUE, col = "red")
634 | abline(h = 0, lty = 2, col = "grey")
635 |
636 | ## ----B05113_02_06, fig.align='center'------------------------------------
637 | par(mfrow = c(2, 2), mar = c(3,3,0.1,0.1))
638 | mpg <- mtcars$mpg
639 | cyl <- factor(mtcars$cyl)
640 | df <- data.frame(x1=cyl, x2=mpg)
641 | tmpg <- ts(mpg)
642 | plot(mpg); plot(cyl); plot(df); plot(tmpg)
643 |
644 | ## ---- echo = TRUE, eval = TRUE, size='tiny'------------------------------
645 | tail(methods(plot)) ## last 6
646 | ## number of methods for plot
647 | length(methods(plot))
648 |
649 | ## ---- eval = FALSE, fig.align='center'-----------------------------------
650 | ## plot(x=mtcars$mpg, y=mtcars$hp)
651 | ## plot(mtcars$mpg, mtcars$hp)
652 | ## plot(hp ~ mpg, data=mtcars)
653 |
654 | ## ------------------------------------------------------------------------
655 | ## min und max in both axis
656 | xmin <- min(mtcars$mpg); xmax <- max(mtcars$mpg)
657 | ymin <- min(mtcars$hp); ymax <- max(mtcars$hp)
658 |
659 | ## calculate histograms
660 | xhist <- hist(mtcars$mpg, breaks=15, plot=FALSE)
661 | yhist <- hist(mtcars$hp, breaks=15, plot=FALSE)
662 |
663 | ## maximum count
664 | top <- max(c(xhist$counts, yhist$counts))
665 | xrange <- c(xmin,xmax)
666 | yrange <- c(ymin, ymax)
667 |
668 | ## ----B05113_02_07, fig.align="center"------------------------------------
669 | m <- matrix(c(2, 0, 1, 3), 2, 2, byrow = TRUE)
670 | ## define plot order and size
671 | layout(m, c(3,1), c(1, 3), TRUE)
672 | ## first plot
673 | par(mar=c(3,0,1,1))
674 | plot(mtcars[,c("mpg","hp")], xlim=xrange, ylim=yrange, xlab="", ylab="")
675 | ## second plot -- barchart of margin
676 | par(mar=c(0,0,1,1))
677 | barplot(xhist$counts, axes=FALSE, ylim=c(0, top), space=0)
678 | ## third plot -- barchart of other margin
679 | par(mar=c(3,0,1,1))
680 | barplot(yhist$counts, axes=FALSE, xlim=c(0, top), space=0, horiz=TRUE)
681 |
682 | ## ----B05113_02_08, fig.align="center", fig.height=4.5, fig.width=6-------
683 | library("ggplot2")
684 | ggplot(Cars93, aes(x = Horsepower, y = MPG.city)) + geom_point(aes(colour = Cylinders))
685 |
686 | ## ----B05113_02_09, fig.align="center", fig.height=4.5, fig.width=6-------
687 | g1 <- ggplot(Cars93, aes(x=Horsepower, y=MPG.city))
688 | g2 <- g1 + geom_point(aes(color=Weight))
689 | g2 + geom_smooth()
690 |
691 | ## ----B05113_02_10, fig.align="center", fig.height=4.5, fig.width=6-------
692 | g1 <- g1 + geom_text(aes(label=substr(Manufacturer,1,3)),
693 | size=3.5)
694 | g1 + geom_smooth()
695 |
696 | ## ----image B05113_02_11--------------------------------------------------
697 | gg <- ggplot(Cars93, aes(x=Horsepower, y=MPG.city))
698 | gg <- gg + geom_point(aes(shape = Origin, colour = Price))
699 | gg <- gg + facet_wrap(~ Cylinders) + theme_bw()
700 | gg
701 |
702 | ## ------------------------------------------------------------------------
703 | sessionInfo()
704 |
705 |
--------------------------------------------------------------------------------