├── README.md
├── SCR2e-errata.html
├── SCR2e-errata.pdf
├── examples-R
│   ├── SCR2e-Intro.R
│   ├── SCR2e-Programming.R
│   ├── SCR2e-ResampAppl.R
│   ├── SCR2e-chBoot.R
│   ├── SCR2e-chDensity.R
│   ├── SCR2e-chMCMC.R
│   ├── SCR2e-chMCint.R
│   ├── SCR2e-chMonteCarlo.R
│   ├── SCR2e-chNumerical.R
│   ├── SCR2e-chOptim.R
│   ├── SCR2e-chPerm.R
│   ├── SCR2e-chRandomProcess.R
│   ├── SCR2e-chRgen.R
│   ├── SCR2e-chVis.R
│   ├── SCR2e-examples-functions.R
│   └── readme.md
├── examples-Rmd
│   ├── FOREARM.DAT
│   ├── Lahman.Rmd
│   ├── SCR2e-Intro.Rmd
│   ├── SCR2e-Programming.Rmd
│   ├── SCR2e-ResampAppl.Rmd
│   ├── SCR2e-chBoot.Rmd
│   ├── SCR2e-chDensity.Rmd
│   ├── SCR2e-chMCMC.Rmd
│   ├── SCR2e-chMCint.Rmd
│   ├── SCR2e-chMonteCarlo.Rmd
│   ├── SCR2e-chNumerical.Rmd
│   ├── SCR2e-chOptim.Rmd
│   ├── SCR2e-chPerm.Rmd
│   ├── SCR2e-chRandomProcess.Rmd
│   ├── SCR2e-chRgen.Rmd
│   ├── SCR2e-chVis.Rmd
│   ├── SCR2e-examples-functions.R
│   └── readme.md
└── examples-cpp
    ├── printme.cpp
    └── readme.md
/README.md:
--------------------------------------------------------------------------------
1 | # SCR2e
2 | Resources and files for the book "Statistical Computing with R, 2nd edition"
3 | by Maria L. Rizzo, Chapman & Hall/CRC (2019).
4 |
5 |
6 | URL: https://www.crcpress.com/Statistical-Computing-with-R-Second-Edition/Rizzo/p/book/9781466553323
7 |
8 | Bibtex entry:
9 |
10 | @book{SCR2,
11 | title = "Statistical Computing with {R}",
12 | edition = "second",
13 | author = "Maria L. Rizzo",
14 | year = "2019",
15 | publisher = "Chapman & Hall/CRC",
16 | series = "The {R} Series",
17 | isbn = "9781466553323",
18 | }
19 |
--------------------------------------------------------------------------------
/SCR2e-errata.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mariarizzo/SCR2e/5c72bc153c6b931f88f5703e3c088bd0b263eeca/SCR2e-errata.pdf
--------------------------------------------------------------------------------
/examples-R/SCR2e-Intro.R:
--------------------------------------------------------------------------------
1 | #######################################################
2 | ### Statistical Computing with R 2e ###
3 | ### Maria L. Rizzo ###
4 | ### Chapman & Hall/CRC The R Series ###
5 | ### ISBN 9781466553323 - CAT# K15269 ###
6 | ### January 2019 ###
7 | ### ###
8 | ### R code for Chapter 1 ###
9 | ### Introduction ###
10 | #######################################################
11 |
12 |
13 |
14 | ### Example 1.1
15 |
16 | sumdice <- function(n) {
17 | k <- sample(1:6, size=n, replace=TRUE)
18 | return(sum(k))
19 | }
20 |
21 | sumdice(2)
22 |
23 | #to store the result rather than print it
24 | a <- sumdice(100)
25 |
26 | #we expect the mean for 100 dice to be close to 3.5
27 | a / 100
28 |
29 | sumdice <- function(n)
30 | sum(sample(1:6, size=n, replace=TRUE))
31 |
32 | sumdice <- function(n, sides = 6) {
33 | if (sides < 1) return (0)
34 | k <- sample(1:sides, size=n, replace=TRUE)
35 | return(sum(k))
36 | }
37 |
38 | sumdice(5) #default 6 sides
39 | sumdice(n=5, sides=4) #4 sides
40 |
41 | ### Example 1.2 (iris data)
42 |
43 | names(iris)
44 | table(iris$Species)
45 | w <- iris[[2]] #Sepal.Width
46 | mean(w)
47 |
48 | attach(iris)
49 | summary(Petal.Length[51:100]) #versicolor petal length
50 |
51 | with(iris, summary(Petal.Length[51:100]))
52 |
53 | out <- with(iris, summary(Petal.Length[51:100]))
54 |
55 | by(iris[,1:4], Species, colMeans)
56 | detach(iris)
57 |
58 | ### Example 1.3 (Arrays)
59 |
60 | x <- 1:24 # vector
61 | dim(x) <- length(x) # 1 dimensional array
62 | matrix(1:24, nrow=4, ncol=6) # 4 by 6 matrix
63 | x <- array(1:24, c(3, 4, 2)) # 3 by 4 by 2 array
64 |
65 | ### Example 1.4 (Matrices)
66 |
67 | A <- matrix(0, nrow=2, ncol=2)
68 | A <- matrix(c(0, 0, 0, 0), nrow=2, ncol=2)
69 | A <- matrix(0, 2, 2)
70 |
71 | A <- matrix(1:8, nrow=2, ncol=4)
72 |
73 | ### Example 1.5 (Iris data cont.)
74 |
75 | x <- as.matrix(iris[,1:4]) #all rows of columns 1 to 4
76 |
77 | mean(x[,2]) #mean of sepal width, all species
78 | mean(x[51:100,3]) #mean of petal length, versicolor
79 |
80 | y <- array(x, dim=c(50, 3, 4))
81 | mean(y[,,2]) #mean of sepal width, all species
82 | mean(y[,2,3]) #mean of petal length, versicolor
83 |
84 | y <- array(c(x[1:50,], x[51:100,], x[101:150,]),
85 | dim=c(50, 4, 3))
86 | mean(y[,2,]) #mean of sepal width, all species
87 | mean(y[,3,2]) #mean of petal length, versicolor
88 |
89 | ### Example 1.6 (Run length encoding)
90 |
91 |
92 | n <- 1000
93 | x <- rbinom(n, size = 1, prob = .5)
94 | table(x)
95 | head(x, 30)
96 |
97 | r <- rle(x)
98 | str(r)
99 |
100 | head(r$lengths)
101 | head(r[[1]])
102 |
103 | max(r$lengths)
104 | log2(length(x))
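    | # editor's note (addition, not in the book's code): for n fair
    | # Bernoulli trials the longest run of a single value is typically
    | # close to log2(n), so max(r$lengths) should be near
    | # log2(1000), about 10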
105 |
106 | ### Example 1.7 (Named list)
107 |
108 | w <- wilcox.test(rnorm(10), rnorm(10, 2))
109 | w #print the summary
110 |
111 | w$statistic #stored in object w
112 | w$p.value
113 | unlist(w)
114 | unclass(w)
115 |
116 | ### Example 1.8 (A list of names)
117 |
118 | a <- matrix(runif(8), 4, 2) #a 4x2 matrix
119 | dimnames(a) <- list(NULL, c("x", "y"))
120 |
121 | # if we want row names
122 | dimnames(a) <- list(letters[1:4], c("x", "y"))
123 | a
124 |
125 | # another way to assign row names
126 | row.names(a) <- list("NE", "NW", "SW", "SE")
127 | a
128 |
129 | ### Example 1.9 (Parallel boxplots)
130 |
131 | boxplot(iris$Sepal.Length ~ iris$Species)
132 |
133 | boxplot(iris$Sepal.Length ~ iris$Species,
134 | ylab = "Sepal Length", boxwex = .4)
135 | ### Example 1.10 (Plotting characters and colors)
136 |
137 | plot(0:25, rep(1, 26), pch = 0:25)
138 | text(0:25, 0.9, 0:25)
139 |
140 |
141 | ### Example 1.11 (Barplot for run lengths)
142 |
143 | barplot(table(r$lengths)) #R graphics version
144 |
145 | ## ggplot version
146 | library(ggplot2)
147 | df <- data.frame(lengths = factor(r$lengths))
148 | ggplot(df, aes(lengths)) + geom_bar()
149 |
150 | ### Example 1.12 (Scatterplots)
151 |
152 | ggplot(data = iris, aes(x = Sepal.Length, y = Sepal.Width)) +
153 | geom_point()
154 |
155 | ggplot(iris, aes(Sepal.Length, Sepal.Width,
156 | color = Species, shape = Species)) + geom_point(size = 2)
157 |
158 | ### Example 1.13 (ggplot: parallel boxplots and violin plots)
159 |
160 | ggplot(iris, aes(Species, Sepal.Length)) + geom_boxplot()
161 | ggplot(iris, aes(Species, Sepal.Length)) + geom_violin()
162 |
163 | ggplot(iris, aes(Species, Sepal.Length)) +
164 | geom_boxplot() + coord_flip()
165 | ggplot(iris, aes(Species, Sepal.Length)) +
166 | geom_violin() + coord_flip()
167 |
168 | ### Example 1.14 (MPG by engine displacement)
169 |
170 | ggplot(mpg, aes(displ, hwy)) +
171 | geom_point() +
172 | facet_wrap(~ class)
173 |
174 | ### Example 1.15 (Import data from a local text file)
175 |
176 | # copy "FOREARM.DAT" into working directory
177 | forearm <- scan(file = "FOREARM.DAT") #a vector
178 |
179 | # use pathname if file is not in working directory
180 | # forearm <- scan(file = "./DATASETS/FOREARM.DAT") #a vector
181 |
182 | head(forearm)
183 |
184 | ### Example 1.16 (Importing data from a web page)
185 |
186 | fileloc <- "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
187 |
188 | df <- read.table(file = fileloc, na.strings = "?", as.is = TRUE)
189 | str(df)
190 | names(df) <- c("mpg", "cyl", "displ", "hp", "wt", "accel",
191 | "year", "origin", "name")
192 | summary(df)
193 |
194 | ### Example 1.17 (Importing/exporting .csv files)
195 |
196 | #create a data frame
197 | dates <- c("3/27/1995", "4/3/1995",
198 | "4/10/1995", "4/18/1995")
199 | prices <- c(11.1, 7.9, 1.9, 7.3)
200 | d <- data.frame(dates=dates, prices=prices)
201 |
202 | #create the .csv file
203 | filename <- "temp.csv"
204 | write.table(d, file = filename, sep = ",",
205 | row.names = FALSE)
206 |
207 | #read the .csv file
208 | read.table(file = filename, sep = ",", header = TRUE)
209 | read.csv(file = filename) #same thing
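    | # optional cleanup (addition, not in the book's code):
    | # file.remove(filename)   #delete the temporary .csv file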
210 |
211 |
--------------------------------------------------------------------------------
/examples-R/SCR2e-Programming.R:
--------------------------------------------------------------------------------
1 | #######################################################
2 | ### Statistical Computing with R 2e ###
3 | ### Maria L. Rizzo ###
4 | ### Chapman & Hall/CRC The R Series ###
5 | ### ISBN 9781466553323 - CAT# K15269 ###
6 | ### January 2019 ###
7 | ### ###
8 | ### R code for Chapter 15 ###
9 | ### Programming Topics ###
10 | #######################################################
11 |
12 | # packages to install:
13 | # energy, ggplot2, profvis, pryr, microbenchmark, mvtnorm, rbenchmark
14 | # for the last section: dplyr, Lahman, Rcpp
15 |
16 | ### Example 15.1 (Benchmarking methods to generate a sequence)
17 |
18 | s1 <- 1:10
19 | s2 <- seq(1, 10, 1)
20 | s3 <- seq.int(1, 10, 1)
21 | df <- data.frame(s1=s1, s2=s2, s3=s3)
22 | str(df)
23 |
24 | library(microbenchmark)
25 | library(ggplot2)
26 |
27 | n <- 1000
28 | mb <- microbenchmark(
29 | seq(1, n, 1),
30 | seq.int(1, n, 1),
31 | 1:n
32 | )
33 |
34 | print(mb)
35 | autoplot(mb) # display a violin plot
36 |
37 | ### Example 15.2 (Benchmarking methods to initialize a vector)
38 |
39 | n <- 100
40 | mb2 <- microbenchmark(
41 | numeric = numeric(n) + 1,
42 | rep = rep(1, n),
43 | seq = seq(from=1, to=1, length=n),
44 | ones = matrix(1, nrow=n, ncol=1),
45 | as.ones = as.matrix(rep(1, n))
46 | )
47 |
48 | print(mb2)
49 |
50 | ### Example 15.3 (Timings of two multivariate normal generators)
51 |
52 | library(rbenchmark)
53 | library(MASS)
54 | library(mvtnorm)
55 | n <- 100 #sample size
56 | d <- 30 #dimension
57 | N <- 2000 #iterations
58 | mu <- numeric(d)
59 |
60 | benchmark(
61 | columns = c("test", "replications", "elapsed", "relative"),
62 | replications = 2000,
63 | cov = {S <- cov(matrix(rnorm(n*d), n, d))},
64 | mvrnorm = mvrnorm(n, mu, S),
65 | rmvnorm = rmvnorm(n, mu, S)
66 | )
67 |
68 | ### Example 15.4 (Profiling with Rprof)
69 |
70 | x <- rnorm(1000)
71 | y <- rnorm(1000)
72 |
73 | Rprof("pr.out", line.profiling = TRUE)
74 | energy::dcor(x, y)
75 | Rprof(NULL)
76 | summaryRprof("pr.out")
77 |
78 | ### Example 15.5 (profvis interactive visualization)
79 |
80 | library(profvis)
81 | profvis(energy::dcor(x, y))
82 |
83 | ### Example 15.6 (Object size)
84 |
85 | x <- matrix(rnorm(5000), 1000, 5) #1000 obs in R^5
86 | DF <- as.data.frame(x)
87 | object.size(x)
88 | object.size(DF)
89 | pryr::object_size(x)
90 | pryr::object_size(DF)
91 | pryr::compare_size(x)
92 |
93 | listTwo <- list(x, x)
94 | pryr::compare_size(listTwo)
95 |
96 | ### Example 15.7 (Comparing objects and attributes)
97 |
98 | str(x)
99 | str(DF)
100 | all.equal(x, DF)
101 | names(attributes(x))
102 | names(attributes(DF))
103 | all.equal(x, DF, check.attributes = FALSE)
104 |
105 | ### Example 15.8 (Comparing objects for equality)
106 |
107 | try(ifelse(all.equal(x, DF), "T", "F")) # error
108 | ifelse(isTRUE(all.equal(x, DF)), "T", "F") # correct
109 |
110 | x <- 1 - 10e-4
111 | y <- x + 2
112 | x == (y - 2) # equal mathematically but
113 | isTRUE(all.equal(x, y - 2)) #gives expected result
114 |
115 | ## does not necessarily evaluate to TRUE or FALSE
116 | try(ifelse(all.equal(x, y), "T", "F"))
117 | ## returns TRUE or FALSE
118 | ifelse(isTRUE(all.equal(x, y)), "T", "F")
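    | # editor's note (addition, not in the book's code): when objects
    | # differ, all.equal() returns a character description such as
    | # "Mean relative difference: ..." rather than FALSE, which is why
    | # it must be wrapped in isTRUE() before use in ifelse()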
119 |
120 | ### Example 15.9 (Display R function code)
121 |
122 | nclass.scott
123 |
124 | ### Example 15.10 (RSiteSearch)
125 |
126 | if (interactive())
127 | RSiteSearch("ggcorr")
128 |
129 | ### Example 15.11 (UseMethod)
130 |
131 | body(density)
132 | args(density.default)
133 | body(density.default)
134 |
135 | ### Example 15.12 (Show methods)
136 |
137 | methods(t.test)
138 | getAnywhere(t.test.formula)
139 | body(stats:::t.test.formula)
140 |
141 | ### Example 15.13 (Object not found or not an exported object)
142 |
143 | try(perc.ci)
144 | try(boot::perc.ci)
145 |
146 | getAnywhere(perc.ci)
147 | args(boot:::perc.ci)
148 | body(boot:::perc.ci)
149 | boot:::perc.ci
150 | getFromNamespace("perc.ci", "boot")
151 |
152 | ### Example 15.14 (getS3method)
153 |
154 | library(microbenchmark)
155 | library(ggplot2)
156 | getAnywhere(autoplot)
157 | getS3method("autoplot", class = "microbenchmark")
158 | getAnywhere(autoplot.microbenchmark)
159 |
160 | ### Example 15.15 (.Primitive or .Internal)
161 |
162 | if (interactive())
163 | pryr::show_c_source(.Primitive(cumsum(x)))
164 |
165 | ### Example 15.16 (.Call, .External, .C or .Fortran)
166 |
167 | #dist is implemented by a .Call to C_Cdist
168 |
169 | body(dist)
170 |
171 | ### Example 15.17 (A first Rcpp experiment)
172 |
173 | # In RStudio use the menu
174 | # File > New File > C++ file to display this code
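    | # A minimal sketch (assumption, not the book's code): the RStudio
    | # C++ template defines a small exported function; an equivalent
    | # inline version using Rcpp::cppFunction would be
    | # library(Rcpp)
    | # cppFunction('NumericVector timesTwo(NumericVector x) {
    | #   return x * 2;
    | # }')
    | # timesTwo(1:5)   #should return 2 4 6 8 10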
175 |
176 | ### Example 15.18 (cppFunction)
177 |
178 | library(Rcpp)
179 |
180 | set.seed(1)
181 | x <- matrix(rnorm(20), nrow = 5, ncol = 4)
182 |
183 | cppFunction('double vecnorm(NumericVector v) {
184 | // compute the Euclidean norm of vector v
185 | int d = v.size();
186 | double s = 0.0;
187 | for (int i = 0; i < d; i++)
188 | s += v(i) * v(i);
189 | return sqrt(s);
190 | }')
191 |
192 | print(vecnorm(x[, 1]))
193 | print(apply(x, MARGIN = 2, FUN = "vecnorm"))
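    | # cross-check in base R (addition, not in the book's code):
    | # sqrt(colSums(x^2)) should match the Rcpp results above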
194 |
195 |
196 | ### Example 15.19 (sourceCpp)
197 |
198 | # Create the C++ source file "printme.cpp" by editing the template
199 | # in RStudio menu: File > New File > C++
200 |
201 | library(Rcpp)
202 | sourceCpp("../examples-cpp/printme.cpp")
203 | x <- sample(1:5)
204 | print_me(x)
205 |
206 | ### Example 15.20 (Lahman baseball data)
207 |
208 | library(Lahman)
209 | str(Batting)
210 | # method 1
211 | S <- subset(Batting, Batting$yearID == 1999,
212 | select = c("playerID", "AB", "H"))
213 |
214 | # method 2 (dplyr)
215 | library(dplyr)
216 | Batting %>% filter(yearID == 1999) -> b
217 |
218 | # method 1
219 | AB <- as.vector(by(S$AB, S$playerID, FUN = sum))
220 | H <- as.vector(by(S$H, S$playerID, FUN = sum))
221 | S <- data.frame(playerID = unique(S$playerID),
222 | AB = AB, H = H, AVG = round(H / AB, 3),
223 | stringsAsFactors = FALSE)
224 | S400 <- S[S$AB >= 400, ]
225 |
226 | # method 2 (dplyr)
227 | b %>% group_by(playerID) %>%
228 | summarize(AB = sum(AB), H = sum(H)) -> S
229 | S %>% mutate(AVG = round(H / AB, 3)) -> S
230 | S %>% filter(AB >= 400) -> S400
231 |
232 | # method 1
233 | o <- order(S400$AVG, decreasing = TRUE)
234 | S400 <- S400[o, ]
235 | top <- S400[1:10, ]
236 |
237 | # method 2 (dplyr)
238 | S400 %>% arrange(desc(AVG)) -> S400
239 | slice(S400, 1:10) -> top
240 | top
241 |
242 | People %>% select(playerID, nameFirst, nameLast) -> m
243 | top %>% inner_join(m) %>%
244 | select(nameFirst, nameLast, AVG)
245 |
246 | ### Example 15.21 (Comparison with microbenchmark)
247 |
248 | # The source for this example is in R Markdown format
249 | # Open "Lahman.Rmd" in RStudio and knit to see report
250 |
--------------------------------------------------------------------------------
/examples-R/SCR2e-ResampAppl.R:
--------------------------------------------------------------------------------
1 | #######################################################
2 | ### Statistical Computing with R 2e ###
3 | ### Maria L. Rizzo ###
4 | ### Chapman & Hall/CRC The R Series ###
5 | ### ISBN 9781466553323 - CAT# K15269 ###
6 | ### January 2019 ###
7 | ### ###
8 | ### R code for Chapter 9 ###
9 | ### Resampling Applications ###
10 | #######################################################
11 |
12 |
13 | # packages to install: bootstrap, DAAG, ggplot2
14 |
15 |
16 | ### Example 9.1 (Jackknife-after-bootstrap)
17 |
18 | library(boot)
19 | library(bootstrap)
20 | set.seed(1111)
21 |
22 | theta.boot <- function(patch, i) {
23 | # function to compute the patch ratio statistic
24 | y <- patch[i, "y"]
25 | z <- patch[i, "z"]
26 | mean(y) / mean(z)
27 | }
28 |
29 | boot.out <- boot(bootstrap::patch,
30 | statistic = theta.boot, R=2000)
31 | A <- boot.array(boot.out)
32 | head(A, 3)
33 | mean(A[, 1] == 0)
34 |
35 | # jackknife-after-bootstrap to est. se(se)
36 | A <- boot.array(boot.out)
37 | theta.b <- boot.out$t
38 | n <- NROW(patch)
39 | jack.se <- numeric(n)
40 |
41 | for (i in 1:n) {
42 | #in i-th replicate omit all samples with x[i]
43 | keep <- which(A[, i] == 0)
44 | jack.se[i] <- sd(theta.b[keep])
45 | }
46 |
47 | print(boot.out) #for se_boot
48 | se.bar <- mean(jack.se)
49 | se.se <- sqrt((n-1) * mean((jack.se - se.bar)^2))
50 | print(paste("Jackknife-after-bootstrap est. se(se)=", se.se))
51 |
52 |
53 | ### Example 9.2 (Jackknife-after-bootstrap)
54 |
55 | # initialize
56 | data(patch, package = "bootstrap")
57 | y <- patch$y
58 | z <- patch$z
59 | dat <- cbind(y, z)
60 | n <- NROW(dat)
61 | B <- 2000
62 |
63 | # jackknife-after-bootstrap step 1: run the bootstrap
64 | theta_boot <- function(dat, ind) {
65 | # function to compute the statistic
66 | y <- dat[ind, 1]
67 | z <- dat[ind, 2]
68 | mean(y) / mean(z)
69 | }
70 |
71 | boot.obj <- boot(dat, statistic = theta_boot, R=2000)
72 | theta.hat <- boot.obj$t0
73 | theta.b <- boot.obj$t
74 | se.boot <- sd(theta.b)
75 |
76 | # jackknife-after-bootstrap to est. se(se)
77 | sample.freq <- boot.array(boot.obj)
78 | se.se.reps <- numeric(n)
79 | N <- 1:n
80 |
81 | for (i in N) {
82 | # jackknife-after-bootstrap
83 | # omit all bootstrap samples that contain obs i
84 | keep <- which(sample.freq[ ,i] == 0)
85 | se.se.reps[i] <- sd(theta.b[keep])
86 | }
87 |
88 | print(boot.obj)
89 | se.bar <- mean(se.se.reps)
90 | se.se <- sqrt((n-1) * mean((se.se.reps - se.bar)^2))
91 | se.se
92 |
93 | ### Example 9.3 (ironslag linear model)
94 |
95 | library(ggplot2)
96 | library(DAAG)
97 | L1 <- lm(magnetic ~ chemical, data=ironslag)
98 | cf3 <- round(L1$coeff, 3)
99 | cap <- paste("Fit: magnetic =", cf3[1], "+", cf3[2], "chemical")
100 |
101 | ggplot(data=ironslag, aes(chemical, magnetic)) +
102 | geom_point() + geom_smooth(method="lm") +
103 | ggtitle(cap)
104 |
105 | plot(L1, which=1:2, ask=FALSE) #residual plots
106 |
107 | ### Example 9.4 (mammals data)
108 |
109 | library(MASS)
110 | cor(log(mammals$body), log(mammals$brain))
111 | summary(mammals)
112 |
113 | y <- log(mammals$brain)
114 | x <- log(mammals$body)
115 | L <- lm(y ~ x)
116 | L
117 |
118 | cap <- paste("Fit: log(brain) =", round(L$coeff[1],3),
119 | "+", round(L$coeff[2],3), "log(body)")
120 | ggplot(data=mammals, aes(x, y)) +
121 | geom_point() + geom_smooth(method="lm") +
122 | labs(x = "log(body)", y = "log(brain)", title = cap)
123 |
124 | summary(L)$r.squared
125 |
126 | ### Example 9.5 (ironslag data, resampling cases)
127 |
128 | x <- ironslag$chemical
129 | y <- ironslag$magnetic
130 | m <- 2000
131 | n <- NROW(x)
132 | L1 <- lm(y ~ x) #estimate the model
133 | b0 <- L1$coeff[1]; b1 <- L1$coeff[2]
134 |
135 | ## run bootstrap of cases
136 | out <- replicate(m, expr={
137 | i <- sample(1:n, replace=TRUE, size=n)
138 | xstar <- x[i]
139 | ystar <- y[i]
140 | Lb <- lm(ystar ~ xstar)
141 | s <- summary(Lb)$sigma
142 | c(Lb$coeff[1], slope=Lb$coeff[2], s=s)
143 | })
144 |
145 | bootCases <- t(out)
146 | meanCases <- colMeans(bootCases)
147 | sdCases <- apply(bootCases, 2, "sd")
148 | meanCases
149 | sdCases
150 |
151 | biasInt <- mean(bootCases[,1] - b0) #bias for intercept
152 | biasSlope <- mean(bootCases[,2] - b1) #bias for slope
153 |
154 | rbind(estimate=c(b0, b1), bias=c(biasInt, biasSlope),
155 |       se=sdCases[1:2], cv=c(biasInt, biasSlope)/sdCases[1:2])
156 |
157 | ### Example 9.6 (Resampling cases using the boot function)
158 |
159 | # set.seed(1104)
160 | library(boot)
161 | m <- 2000
162 | stats <- function(dat, i) {
163 | x <- dat$chemical[i]
164 | y <- dat$magnetic[i]
165 | Lb <- lm(y ~ x)
166 | s <- summary(Lb)$sigma
167 | c(Lb$coeff[1], slope=Lb$coeff[2], s=s)
168 | }
169 |
170 | boot.out <- boot(ironslag, statistic=stats, R=2000)
171 | boot.out
172 | boot.out$t0
173 |
174 | sd(boot.out$t[,2])
175 | boottbl <- broom::tidy(boot.out)
176 | boottbl$std.error[2]
177 |
178 | MASS::truehist(boot.out$t[ ,2], main="", xlab="slopes")
179 | abline(v = boot.out$t0[2], lwd=2)
180 |
181 | boot.ci(boot.out, index=2, type=c("norm","perc","basic","bca"))
182 |
183 | ### Example 9.7 (Resampling errors: mammals data)
184 |
185 | x <- log(mammals$body)
186 | y <- log(mammals$brain)
187 | L <- lm(y ~ x)
188 |
189 | m.resid <- rstandard(L, sd = 1)
190 | r <- m.resid - mean(m.resid)
191 | m <- 1000; n <- NROW(x)
192 | estsErr <- replicate(m, expr={
193 | estar <- sample(r, replace=TRUE, size=n)
194 | ystar <- L$fitted.values + estar
195 | Lb <- lm(ystar ~ x)
196 | s <- summary(Lb)$sigma
197 | c(b0=Lb$coeff[1], b1=Lb$coeff[2], s=s)
198 | })
199 | ests <- t(estsErr)
200 | summary(ests)
201 |
202 | ### Example 9.8 (Resampling errors, continued)
203 | sd(ests[,2])
204 | s <- summary(L)$sigma
205 | SSx <- (n - 1) * var(x)
206 | se.beta1 <- sqrt(s^2 / SSx)
207 | se.beta1
208 | s * sqrt(1/n + mean(x)^2 / SSx)
209 | sd(ests[,1])
210 |
211 | betas <- summary(L)$coeff
212 | betas
213 | betas[, "Std. Error"]
214 |
215 | broom::tidy(summary(L))
216 | broom::tidy(summary(L))$std.error
217 |
218 | ### Example 9.9 (Model based resampling with the boot function)
219 | regstats <- function(dat, i) {
220 | #dat is a data frame (r, x, yhat)
221 | #r are the modified centered residuals, yhat are the fits
222 | ystar <- dat$yhat + dat$r[i]
223 | xstar <- dat$x
224 | Lnew <- lm(ystar ~ xstar)
225 | Lnew$coefficients
226 | }
227 |
228 | y <- log(mammals$brain)
229 | x <- log(mammals$body)
230 | L <- lm(y ~ x)
231 | r <- rstandard(L, sd=1)
232 | r <- r - mean(r)
233 | df <- data.frame(r=r, x=x, yhat=L$fitted)
234 | head(df)
235 | boot.obj <- boot(data=df, statistic=regstats, R=2000)
236 | broom::tidy(boot.obj)
237 |
238 |
239 | ### Example 9.10 (Empirical influence values for the patch ratio statistic)
240 |
241 | library(boot)
242 | library(bootstrap)
243 | theta_boot <- function(dat, ind) {
244 | # function to compute the patch ratio statistic
245 | mean(dat[ind, ]$y) / mean(dat[ind, ]$z)
246 | }
247 | boot.out <- boot(patch, theta_boot, R = 2000)
248 | infl <- empinf(boot.out, type = "jack")
249 | theta.hat <- boot.out$t0
250 | jack <- theta.hat - infl / (nrow(patch) - 1)
251 | rbind(infl, jack)
252 |
253 | ### Example 9.11 (Jackknife-after-bootstrap plot)
254 |
255 | jack.after.boot(boot.out, useJ=TRUE, stinf=FALSE)
256 |
257 | n <- NROW(patch)
258 | J <- numeric(n)
259 | b.freq <- boot.array(boot.out)
260 | theta.b <- boot.out$t
261 |
262 | for (i in 1:n) {
263 | keep <- which(b.freq[ ,i] == 0)
264 | J[i] <- mean(theta.b[keep])
265 | }
266 |
267 | # the jackknife influence values
268 | (n - 1) * (mean(J) - J)
269 |
270 | jack.after.boot(boot.out, useJ=TRUE, stinf=TRUE)
271 |
272 |
--------------------------------------------------------------------------------
/examples-R/SCR2e-chBoot.R:
--------------------------------------------------------------------------------
1 | #######################################################
2 | ### Statistical Computing with R 2e ###
3 | ### Maria L. Rizzo ###
4 | ### Chapman & Hall/CRC The R Series ###
5 | ### ISBN 9781466553323 - CAT# K15269 ###
6 | ### March 6, 2019 ###
7 | ### ###
8 | ### R code for Chapter 8 ###
9 | ### Bootstrap and Jackknife ###
10 | #######################################################
11 |
12 |
13 | # packages to install: bootstrap, DAAG
14 |
15 |
16 | ### Example 8.2 (Bootstrap estimate of standard error)
17 |
18 | library(bootstrap) #for the law data
19 | print(cor(law$LSAT, law$GPA))
20 | print(cor(law82$LSAT, law82$GPA))
21 |
22 | #set up the bootstrap
23 | B <- 200 #number of replicates
24 | n <- nrow(law) #sample size
25 | R <- numeric(B) #storage for replicates
26 |
27 | #bootstrap estimate of standard error of R
28 | for (b in 1:B) {
29 | #randomly select the indices
30 | i <- sample(1:n, size = n, replace = TRUE)
31 | LSAT <- law$LSAT[i] #i is a vector of indices
32 | GPA <- law$GPA[i]
33 | R[b] <- cor(LSAT, GPA)
34 | }
35 | #output
36 | print(se.R <- sd(R))
37 | hist(R, prob = TRUE)
38 |
39 |
40 | ### Example 8.3 (Bootstrap estimate of standard error: boot function)
41 |
42 | r <- function(x, i) {
43 | #want correlation of columns 1 and 2
44 | cor(x[i,1], x[i,2])
45 | }
46 |
47 | library(boot) #for boot function
48 | obj <- boot(data = law, statistic = r, R = 2000)
49 | obj
50 | y <- obj$t
51 | sd(y)
52 |
53 |
54 | ### Example 8.4 (Bootstrap estimate of bias)
55 |
56 | #sample estimate for n=15
57 | theta.hat <- cor(law$LSAT, law$GPA)
58 |
59 | #bootstrap estimate of bias
60 | B <- 2000 #larger for estimating bias
61 | n <- nrow(law)
62 | theta.b <- numeric(B)
63 |
64 | for (b in 1:B) {
65 | i <- sample(1:n, size = n, replace = TRUE)
66 | LSAT <- law$LSAT[i]
67 | GPA <- law$GPA[i]
68 | theta.b[b] <- cor(LSAT, GPA)
69 | }
70 | bias <- mean(theta.b - theta.hat)
71 | bias
72 |
73 |
74 | ### Example 8.5 (Bootstrap estimate of bias of a ratio estimate)
75 |
76 |
77 | data(patch, package = "bootstrap")
78 | patch
79 |
80 | n <- nrow(patch) #in bootstrap package
81 | B <- 2000
82 | theta.b <- numeric(B)
83 | theta.hat <- mean(patch$y) / mean(patch$z)
84 |
85 | #bootstrap
86 | for (b in 1:B) {
87 | i <- sample(1:n, size = n, replace = TRUE)
88 | y <- patch$y[i]
89 | z <- patch$z[i]
90 | theta.b[b] <- mean(y) / mean(z)
91 | }
92 | bias <- mean(theta.b) - theta.hat
93 | se <- sd(theta.b)
94 | print(list(est=theta.hat, bias = bias,
95 | se = se, cv = bias/se))
96 |
97 |
98 | ### Example 8.6 (Jackknife estimate of bias)
99 |
100 | data(patch, package = "bootstrap")
101 | n <- nrow(patch)
102 | y <- patch$y
103 | z <- patch$z
104 | theta.hat <- mean(y) / mean(z)
105 | print (theta.hat)
106 |
107 | #compute the jackknife replicates, leave-one-out estimates
108 | theta.jack <- numeric(n)
109 | for (i in 1:n)
110 | theta.jack[i] <- mean(y[-i]) / mean(z[-i])
111 | bias <- (n - 1) * (mean(theta.jack) - theta.hat)
112 |
113 | print(bias) #jackknife estimate of bias
114 |
115 |
116 | ### Example 8.7 (Jackknife estimate of standard error)
117 |
118 | se <- sqrt((n-1) *
119 | mean((theta.jack - mean(theta.jack))^2))
120 | print(se)
121 |
122 |
123 | ### Example 8.8 (Failure of jackknife)
124 |
125 | set.seed(123) #for the specific example given
126 | #change the seed to see other examples
127 |
128 | n <- 10
129 | x <- sample(1:100, size = n)
130 |
131 | #jackknife estimate of se
132 | M <- numeric(n)
133 | for (i in 1:n) { #leave one out
134 | y <- x[-i]
135 | M[i] <- median(y)
136 | }
137 | Mbar <- mean(M)
138 | print(sqrt((n-1)/n * sum((M - Mbar)^2)))
139 |
140 | #bootstrap estimate of se
141 | Mb <- replicate(1000, expr = {
142 | y <- sample(x, size = n, replace = TRUE)
143 | median(y) })
144 | print(sd(Mb))
145 | print(x)
146 | print(M)
147 | print(Mb)
148 |
149 |
150 | ### Example 8.9 (Bootstrap confidence intervals for patch ratio statistic)
151 |
152 | library(boot) #for boot and boot.ci
153 | data(patch, package = "bootstrap")
154 |
155 | theta.boot <- function(dat, ind) {
156 | #function to compute the statistic
157 | y <- dat[ind, 1]
158 | z <- dat[ind, 2]
159 | mean(y) / mean(z)
160 | }
161 |
162 | y <- patch$y
163 | z <- patch$z
164 | dat <- cbind(y, z)
165 | boot.obj <- boot(dat, statistic = theta.boot, R = 2000)
166 |
167 | print(boot.obj)
168 | print(boot.ci(boot.obj,
169 | type = c("basic", "norm", "perc")))
170 |
171 |
172 | #calculations for bootstrap confidence intervals
173 | alpha <- c(.025, .975)
174 |
175 | #normal
176 | print(boot.obj$t0 + qnorm(alpha) * sd(boot.obj$t))
177 |
178 | #basic
179 | print(2*boot.obj$t0 -
180 | quantile(boot.obj$t, rev(alpha), type=1))
181 |
182 | #percentile
183 | print(quantile(boot.obj$t, alpha, type=6))
184 |
185 |
186 | ### Example 8.10 (Bootstrap confidence intervals for the correlation statistic)
187 |
188 | library(boot)
189 | data(law, package = "bootstrap")
190 | boot.obj <- boot(law, R = 2000,
191 | statistic = function(x, i){cor(x[i,1], x[i,2])})
192 | print(boot.ci(boot.obj, type=c("basic","norm","perc")))
193 |
194 |
195 | ### Example 8.11 (Bootstrap t confidence interval)
196 |
197 | boot.t.ci <-
198 | function(x, B = 500, R = 100, level = .95, statistic){
199 | #compute the bootstrap t CI
200 | x <- as.matrix(x); n <- nrow(x)
201 | stat <- numeric(B); se <- numeric(B)
202 |
203 | boot.se <- function(x, R, f) {
204 | #local function to compute the bootstrap
205 | #estimate of standard error for statistic f(x)
206 | x <- as.matrix(x); m <- nrow(x)
207 | th <- replicate(R, expr = {
208 | i <- sample(1:m, size = m, replace = TRUE)
209 | f(x[i, ])
210 | })
211 | return(sd(th))
212 | }
213 |
214 | for (b in 1:B) {
215 | j <- sample(1:n, size = n, replace = TRUE)
216 | y <- x[j, ]
217 | stat[b] <- statistic(y)
218 | se[b] <- boot.se(y, R = R, f = statistic)
219 | }
220 | stat0 <- statistic(x)
221 | t.stats <- (stat - stat0) / se
222 | se0 <- sd(stat)
223 | alpha <- 1 - level
224 | Qt <- quantile(t.stats, c(alpha/2, 1-alpha/2), type = 1)
225 | names(Qt) <- rev(names(Qt))
226 |   return(rev(stat0 - Qt * se0))  #the bootstrap t CI
227 | }
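    | # editor's note (addition, not in the book's code): the nested
    | # resampling above evaluates the statistic roughly B*(R+1) times
    | # (500*101, about 50,000 with the defaults), which is why the
    | # bootstrap t interval is comparatively expensive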
228 |
229 |
230 | ### Example 8.12 (Bootstrap t confidence interval for patch ratio statistic)
231 |
232 | #boot package and patch data were loaded in Example 8.9
233 | #library(boot) #for boot and boot.ci
234 | #data(patch, package = "bootstrap")
235 |
236 | dat <- cbind(patch$y, patch$z)
237 | stat <- function(dat) {
238 | mean(dat[, 1]) / mean(dat[, 2]) }
239 | ci <- boot.t.ci(dat, statistic = stat, B=2000, R=200)
240 | print(ci)
241 |
242 |
243 | ### Example 8.13 (BCa bootstrap confidence interval)
244 |
245 | boot.BCa <-
246 | function(x, th0, th, stat, conf = .95) {
247 | # bootstrap with BCa bootstrap confidence interval
248 | # th0 is the observed statistic
249 | # th is the vector of bootstrap replicates
250 | # stat is the function to compute the statistic
251 |
252 | x <- as.matrix(x)
253 | n <- nrow(x) #observations in rows
254 | N <- 1:n
255 | alpha <- (1 + c(-conf, conf))/2
256 | zalpha <- qnorm(alpha)
257 |
258 | # the bias correction factor
259 | z0 <- qnorm(sum(th < th0) / length(th))
260 |
261 | # the acceleration factor (jackknife est.)
262 | th.jack <- numeric(n)
263 | for (i in 1:n) {
264 | J <- N[1:(n-1)]
265 | th.jack[i] <- stat(x[-i, ], J)
266 | }
267 | L <- mean(th.jack) - th.jack
268 | a <- sum(L^3)/(6 * sum(L^2)^1.5)
269 |
270 | # BCa conf. limits
271 | adj.alpha <- pnorm(z0 + (z0+zalpha)/(1-a*(z0+zalpha)))
272 | limits <- quantile(th, adj.alpha, type=6)
273 | return(list("est"=th0, "BCa"=limits))
274 | }
275 |
276 |
277 | ### Example 8.14 (BCa bootstrap confidence interval)
278 |
279 | #boot package and patch data were loaded in Example 8.9
280 | #library(boot) #for boot and boot.ci
281 | #data(patch, package = "bootstrap")
282 |
283 | n <- nrow(patch)
284 | B <- 2000
285 | y <- patch$y
286 | z <- patch$z
287 | x <- cbind(y, z)
288 | theta.b <- numeric(B)
289 | theta.hat <- mean(y) / mean(z)
290 |
291 | #bootstrap
292 | for (b in 1:B) {
293 | i <- sample(1:n, size = n, replace = TRUE)
294 | y <- patch$y[i]
295 | z <- patch$z[i]
296 | theta.b[b] <- mean(y) / mean(z)
297 | }
298 | #compute the BCa interval
299 | stat <- function(dat, index) {
300 | mean(dat[index, 1]) / mean(dat[index, 2]) }
301 |
302 | boot.BCa(x, th0 = theta.hat, th = theta.b, stat = stat)
303 |
304 |
305 |
306 | ### Example 8.15 (BCa bootstrap confidence interval using boot.ci)
307 |
308 | #using x and stat from Example 8.14
309 | boot.obj <- boot(x, statistic = stat, R=2000)
310 | boot.ci(boot.obj, type=c("perc", "bca"))
311 |
312 | ### Example 8.16 (Model selection)
313 |
314 | #to prompt for next graph, uncomment line below
315 | #par(ask = TRUE)
316 |
317 | library(DAAG); attach(ironslag)
318 | a <- seq(10, 40, .1) #sequence for plotting fits
319 |
320 | L1 <- lm(magnetic ~ chemical)
321 | plot(chemical, magnetic, main="Linear", pch=16)
322 | yhat1 <- L1$coef[1] + L1$coef[2] * a
323 | lines(a, yhat1, lwd=2)
324 |
325 | L2 <- lm(magnetic ~ chemical + I(chemical^2))
326 | plot(chemical, magnetic, main="Quadratic", pch=16)
327 | yhat2 <- L2$coef[1] + L2$coef[2] * a + L2$coef[3] * a^2
328 | lines(a, yhat2, lwd=2)
329 |
330 | L3 <- lm(log(magnetic) ~ chemical)
331 | plot(chemical, magnetic, main="Exponential", pch=16)
332 | logyhat3 <- L3$coef[1] + L3$coef[2] * a
333 | yhat3 <- exp(logyhat3)
334 | lines(a, yhat3, lwd=2)
335 |
336 | L4 <- lm(log(magnetic) ~ log(chemical))
337 | plot(log(chemical), log(magnetic), main="Log-Log", pch=16)
338 | logyhat4 <- L4$coef[1] + L4$coef[2] * log(a)
339 | lines(log(a), logyhat4, lwd=2)
340 |
341 | ### Example 8.17 (Model selection: Cross validation)
342 |
343 | # Example 8.16, cont.
344 | n <- length(magnetic) #in DAAG ironslag
345 | e1 <- e2 <- e3 <- e4 <- numeric(n)
346 |
347 | # for n-fold cross validation
348 | # fit models on leave-one-out samples
349 | for (k in 1:n) {
350 | y <- magnetic[-k]
351 | x <- chemical[-k]
352 |
353 | J1 <- lm(y ~ x)
354 | yhat1 <- J1$coef[1] + J1$coef[2] * chemical[k]
355 | e1[k] <- magnetic[k] - yhat1
356 |
357 | J2 <- lm(y ~ x + I(x^2))
358 | yhat2 <- J2$coef[1] + J2$coef[2] * chemical[k] +
359 | J2$coef[3] * chemical[k]^2
360 | e2[k] <- magnetic[k] - yhat2
361 |
362 | J3 <- lm(log(y) ~ x)
363 | logyhat3 <- J3$coef[1] + J3$coef[2] * chemical[k]
364 | yhat3 <- exp(logyhat3)
365 | e3[k] <- magnetic[k] - yhat3
366 |
367 | J4 <- lm(log(y) ~ log(x))
368 | logyhat4 <- J4$coef[1] + J4$coef[2] * log(chemical[k])
369 | yhat4 <- exp(logyhat4)
370 | e4[k] <- magnetic[k] - yhat4
371 | }
372 |
373 |
374 | c(mean(e1^2), mean(e2^2), mean(e3^2), mean(e4^2))
375 |
376 | #selected model, fitted in Example 8.16
377 | L2
378 |
379 | par(mfrow = c(2, 2)) #layout for graphs
380 | plot(L2$fit, L2$res) #residuals vs fitted values
381 | abline(0, 0) #reference line
382 | qqnorm(L2$res) #normal probability plot
383 | qqline(L2$res) #reference line
384 | par(mfrow = c(1, 1)) #restore display
385 |
386 |
--------------------------------------------------------------------------------
/examples-R/SCR2e-chMCint.R:
--------------------------------------------------------------------------------
1 | #######################################################
2 | ### Statistical Computing with R 2e ###
3 | ### Maria L. Rizzo ###
4 | ### Chapman & Hall/CRC The R Series ###
5 | ### ISBN 9781466553323 - CAT# K15269 ###
6 | ### January 2019 ###
7 | ### ###
8 | ### R code for Chapter 6 ###
9 | ### Monte Carlo Integration and Variance Reduction ###
10 | #######################################################
11 |
12 |
13 | ### Example 6.1 (Simple Monte Carlo integration)
14 |
15 | m <- 10000
16 | x <- runif(m)
17 | theta.hat <- mean(exp(-x))
18 | print(theta.hat)
19 | print(1 - exp(-1))
20 |
21 |
22 | ### Example 6.2 (Simple Monte Carlo integration, cont.)
23 |
24 | m <- 10000
25 | x <- runif(m, min=2, max=4)
26 | theta.hat <- mean(exp(-x)) * 2
27 | print(theta.hat)
28 | print(exp(-2) - exp(-4))
29 |
30 |
31 | ### Example 6.3 (Monte Carlo integration, unbounded interval)
32 |
33 | x <- seq(.1, 2.5, length = 10)
34 | m <- 10000
35 | u <- runif(m)
36 | cdf <- numeric(length(x))
37 | for (i in 1:length(x)) {
38 | g <- x[i] * exp(-(u * x[i])^2 / 2)
39 | cdf[i] <- mean(g) / sqrt(2 * pi) + 0.5
40 | }
41 |
42 | Phi <- pnorm(x)
43 | print(round(rbind(x, cdf, Phi), 3))
44 |
45 |
46 | ### Example 6.4 (Example 6.3, cont.)
47 |
48 | x <- seq(.1, 2.5, length = 10)
49 | m <- 10000
50 | z <- rnorm(m)
51 | dim(x) <- length(x)
52 | p <- apply(x, MARGIN = 1,
53 | FUN = function(x, z) {mean(z < x)}, z = z)
54 |
55 | Phi <- pnorm(x)
56 | print(round(rbind(x, p, Phi), 3))
57 |
58 |
59 | ### Example 6.5 (Error bounds for MC integration)
60 |
61 | x <- 2
62 | m <- 10000
63 | z <- rnorm(m)
64 | g <- (z < x) #the indicator function
65 | v <- mean((g - mean(g))^2) / m
66 | cdf <- mean(g)
67 | c(cdf, v)
68 | c(cdf - 1.96 * sqrt(v), cdf + 1.96 * sqrt(v))
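    | # editor's note (addition, not in the book's code): g is an
    | # indicator, so mean(g) estimates p = Phi(2) and v estimates
    | # Var(p.hat) = p(1-p)/m; the last line is the asymptotic 95%
    | # confidence interval for Phi(2)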
69 |
70 |
71 | ### Example 6.6 (Antithetic variables)
72 |
73 | MC.Phi <- function(x, R = 10000, antithetic = TRUE) {
74 | u <- runif(R/2)
75 | if (!antithetic) v <- runif(R/2) else
76 | v <- 1 - u
77 | u <- c(u, v)
78 | cdf <- numeric(length(x))
79 | for (i in 1:length(x)) {
80 | g <- x[i] * exp(-(u * x[i])^2 / 2)
81 | cdf[i] <- mean(g) / sqrt(2 * pi) + 0.5
82 | }
83 | cdf
84 | }
85 |
86 |
87 | x <- seq(.1, 2.5, length=5)
88 | Phi <- pnorm(x)
89 | set.seed(123)
90 | MC1 <- MC.Phi(x, antithetic = FALSE)
91 | set.seed(123)
92 | MC2 <- MC.Phi(x)
93 | print(round(rbind(x, MC1, MC2, Phi), 5))
94 |
95 |
96 | m <- 1000
97 | MC1 <- MC2 <- numeric(m)
98 | x <- 1.95
99 | for (i in 1:m) {
100 |   MC1[i] <- MC.Phi(x, R = 1000, antithetic = FALSE)
101 | MC2[i] <- MC.Phi(x, R = 1000)
102 | }
103 |
104 | print(sd(MC1))
105 | print(sd(MC2))
106 | print((var(MC1) - var(MC2))/var(MC1))
107 |
108 |
109 | ### Example 6.7 (Control variate)
110 |
111 | m <- 10000
112 | a <- - 12 + 6 * (exp(1) - 1)
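    | # editor's note (addition, not in the book's code): a is the
    | # optimal constant c* = -Cov(e^U, U)/Var(U) for U ~ Uniform(0,1):
    | # Cov(e^U, U) = 1 - (e-1)/2 = (3-e)/2 and Var(U) = 1/12, so
    | # c* = -6*(3 - e) = 6e - 18 = -12 + 6*(e - 1), about -1.6903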
113 | U <- runif(m)
114 | T1 <- exp(U) #simple MC
115 | T2 <- exp(U) + a * (U - 1/2) #controlled
116 |
117 | mean(T1)
118 | mean(T2)
119 | (var(T1) - var(T2)) / var(T1)
120 |
121 |
122 | ### Example 6.8 (MC integration using control variates)
123 |
124 | f <- function(u)
125 | exp(-.5)/(1+u^2)
126 |
127 | g <- function(u)
128 | exp(-u)/(1+u^2)
129 |
130 | set.seed(510) #needed later
131 | u <- runif(10000)
132 | B <- f(u)
133 | A <- g(u)
134 |
135 | cor(A, B)
136 | a <- -cov(A,B) / var(B) #est of c*
137 | a
138 |
139 | m <- 100000
140 | u <- runif(m)
141 | T1 <- g(u)
142 | T2 <- T1 + a * (f(u) - exp(-.5)*pi/4)
143 |
144 | c(mean(T1), mean(T2))
145 | c(var(T1), var(T2))
146 | (var(T1) - var(T2)) / var(T1)
147 |
148 |
149 | ### Example 6.9 (Control variate and regression)
150 |
151 |
152 | set.seed(510)
153 | mu <- exp(-.5)*pi/4
154 | u <- runif(10000)
155 | f <- exp(-.5)/(1+u^2)
156 | g <- exp(-u)/(1+u^2)
157 | L <- lm(g ~ f)
158 | L
159 | c.star <- - L$coeff[2]
160 | c.star
161 |
162 | theta.hat <- sum(L$coeff * c(1, mu)) #pred. value at mu
163 | theta.hat
164 | summary(L)$sigma^2
165 | summary(L)$r.squared
166 |
167 | ### Example 6.10 (Control variates and multiple regression)
168 |
169 | # Example 6.9 continued with a second control variate
170 | # and multiple regression to estimate vector c*
171 |
172 | u <- runif(10000)
173 | f1 <- exp(-.5) / (1+u^2)
174 | f2 <- exp(-u) / (1-exp(-1))
175 | g <- exp(-u) / (1+u^2)
176 |
177 | L2 <- lm(g ~ f1 + f2)
178 |
179 | L2$coeff
180 | c.star <- - L2$coeff[2:3]
181 | c.star
182 | mu1 <- exp(-.5)*pi/4
183 | mu2 <- 1
184 | mu <- c(mu1, mu2)
185 |
186 | # theta.hat is the predicted response at mu
187 | # alternately can use predict.lm method
188 |
189 | theta.hat <- sum(L2$coeff * c(1, mu)) #pred. value at mu
190 | theta.hat
191 |
192 | ## alternately
193 | df <- data.frame(f1=mu1, f2=mu2)
194 | theta.hat <- predict(L2, df)
195 |
196 | # MSE / n is the est. variance of the control estimator
197 | MSE <- summary(L2)$sigma^2
198 | MSE
199 | sqrt(MSE / 10000)
200 |
201 |
202 | # compare with the previous estimates using
203 | # naive MC and control variate f1(u)
204 | # var0=.060231423  var1=.003117644
205 | var0 <- 0.060231423 #naive MC
206 | var1 <- 0.003117644 #controlled estimator with f1
207 | var2 <- MSE #new estimator
208 |
209 | # percent reduction in variance
210 | # it is a weighted average of R^2 values
211 | # so easier to compute directly
212 |
213 | 100 * (var0 - var1) / var0
214 | 100 * (var1 - var2) / var1
215 | 100 * (var0 - var2) / var0
216 |
217 | ### Example 6.11 (Choice of the importance function)
218 | #code for plot is at the end of the file
219 |
220 | m <- 10000
221 | theta.hat <- se <- numeric(5)
222 | g <- function(x) {
223 | exp(-x - log(1+x^2)) * (x > 0) * (x < 1)
224 | }
225 |
226 | x <- runif(m) #using f0
227 | fg <- g(x)
228 | theta.hat[1] <- mean(fg)
229 | se[1] <- sd(fg)
230 |
231 | x <- rexp(m, 1) #using f1
232 | fg <- g(x) / exp(-x)
233 | theta.hat[2] <- mean(fg)
234 | se[2] <- sd(fg)
235 |
236 | x <- rcauchy(m) #using f2
237 | i <- c(which(x > 1), which(x < 0))
238 | x[i] <- 2 #to catch overflow errors in g(x)
239 | fg <- g(x) / dcauchy(x)
240 | theta.hat[3] <- mean(fg)
241 | se[3] <- sd(fg)
242 |
243 | u <- runif(m) #f3, inverse transform method
244 | x <- - log(1 - u * (1 - exp(-1)))
245 | fg <- g(x) / (exp(-x) / (1 - exp(-1)))
246 | theta.hat[4] <- mean(fg)
247 | se[4] <- sd(fg)
248 |
249 | u <- runif(m) #f4, inverse transform method
250 | x <- tan(pi * u / 4)
251 | fg <- g(x) / (4 / ((1 + x^2) * pi))
252 | theta.hat[5] <- mean(fg)
253 | se[5] <- sd(fg)
254 |
255 | rbind(theta.hat, se / sqrt(m))
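    | # editor's note (addition, not in the book's code): in typical
    | # runs the smallest standard error comes from f3, the importance
    | # function closest in shape to g on (0,1), while f2 (Cauchy)
    | # performs worst because it spreads mass far outside the support
    | # of g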
256 |
257 |
258 | ### Example 6.12 (Example 6.11, cont.)
259 |
260 | M <- 20 #number of replicates
261 | T2 <- numeric(4)
262 | estimates <- matrix(0, 10, 2)
263 |
264 | g <- function(x) {
265 | exp(-x - log(1+x^2)) * (x > 0) * (x < 1) }
266 |
267 | for (i in 1:10) {
268 | estimates[i, 1] <- mean(g(runif(M)))
269 | T2[1] <- mean(g(runif(M/4, 0, .25)))
270 | T2[2] <- mean(g(runif(M/4, .25, .5)))
271 | T2[3] <- mean(g(runif(M/4, .5, .75)))
272 | T2[4] <- mean(g(runif(M/4, .75, 1)))
273 | estimates[i, 2] <- mean(T2)
274 | }
275 |
276 | estimates
277 | apply(estimates, 2, mean)
278 | apply(estimates, 2, var)
279 |
280 |
281 | ### Example 6.13 (Examples 6.11-6.12, cont.)
282 |
283 | M <- 10000 #number of replicates
284 | k <- 10 #number of strata
285 | r <- M / k #replicates per stratum
286 | N <- 50 #number of times to repeat the estimation
287 | T2 <- numeric(k)
288 | estimates <- matrix(0, N, 2)
289 |
290 | g <- function(x) {
291 | exp(-x - log(1+x^2)) * (x > 0) * (x < 1)
292 | }
293 |
294 | for (i in 1:N) {
295 | estimates[i, 1] <- mean(g(runif(M)))
296 | for (j in 1:k)
297 | T2[j] <- mean(g(runif(M/k, (j-1)/k, j/k)))
298 | estimates[i, 2] <- mean(T2)
299 | }
300 |
301 | apply(estimates, 2, mean)
302 | apply(estimates, 2, var)
303 |
304 |
305 |
306 | ### Plot importance functions in Figures 6.1(a) and 6.1(b)
307 |
308 | #par(ask = TRUE) #uncomment to pause between graphs
309 |
310 | x <- seq(0, 1, .01)
311 | w <- 2
312 | f1 <- exp(-x)
313 | f2 <- (1 / pi) / (1 + x^2)
314 | f3 <- exp(-x) / (1 - exp(-1))
315 | f4 <- 4 / ((1 + x^2) * pi)
316 | g <- exp(-x) / (1 + x^2)
317 |
318 | #for color change lty to col
319 |
320 | #figure (a)
321 | plot(x, g, type = "l", main = "", ylab = "",
322 | ylim = c(0,2), lwd = w)
323 | lines(x, g/g, lty = 2, lwd = w)
324 | lines(x, f1, lty = 3, lwd = w)
325 | lines(x, f2, lty = 4, lwd = w)
326 | lines(x, f3, lty = 5, lwd = w)
327 | lines(x, f4, lty = 6, lwd = w)
328 | legend("topright", legend = c("g", 0:4),
329 | lty = 1:6, lwd = w, inset = 0.02)
330 |
331 | #figure (b)
332 | plot(x, g, type = "l", main = "", ylab = "",
333 | ylim = c(0,3.2), lwd = w, lty = 2)
334 | lines(x, g/f1, lty = 3, lwd = w)
335 | lines(x, g/f2, lty = 4, lwd = w)
336 | lines(x, g/f3, lty = 5, lwd = w)
337 | lines(x, g/f4, lty = 6, lwd = w)
338 | legend("topright", legend = c(0:4),
339 | lty = 2:6, lwd = w, inset = 0.02)
340 |
--------------------------------------------------------------------------------
/examples-R/SCR2e-chMonteCarlo.R:
--------------------------------------------------------------------------------
1 | #######################################################
2 | ### Statistical Computing with R 2e ###
3 | ### Maria L. Rizzo ###
4 | ### Chapman & Hall/CRC The R Series ###
5 | ### ISBN 9781466553323 - CAT# K15269 ###
6 | ### January 2019 ###
7 | ### ###
8 | ### R code for Chapter 7 ###
9 | ### Monte Carlo Methods in Inference ###
10 | #######################################################
11 |
12 | # packages to install: energy, ggplot2
13 |
14 | ### Example 7.1 (Basic Monte Carlo estimation)
15 |
16 | m <- 1000
17 | g <- numeric(m)
18 | for (i in 1:m) {
19 | x <- rnorm(2)
20 | g[i] <- abs(x[1] - x[2])
21 | }
22 | est <- mean(g)
23 | est
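    | # editor's note (addition, not in the book's code): X1 - X2 is
    | # N(0, 2), so the exact value is E|X1 - X2| = 2/sqrt(pi),
    | # about 1.1284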
24 |
25 |
26 | ### Example 7.2 (Estimating the MSE of a trimmed mean)
27 |
28 | n <- 20
29 | m <- 1000
30 | tmean <- numeric(m)
31 | for (i in 1:m) {
32 | x <- sort(rnorm(n))
33 | tmean[i] <- sum(x[2:(n-1)]) / (n-2)
34 | }
35 | mse <- mean(tmean^2)
36 | mse
37 | sqrt(sum((tmean - mean(tmean))^2)) / m #se
38 |
39 | n <- 20
40 | m <- 1000
41 | tmean <- numeric(m)
42 | for (i in 1:m) {
43 | x <- sort(rnorm(n))
44 | tmean[i] <- median(x)
45 | }
46 | mse <- mean(tmean^2)
47 | mse
48 | sqrt(sum((tmean - mean(tmean))^2)) / m #se
49 |
50 |
51 | ### Example 7.3 (MSE of a trimmed mean, cont.)
52 |
53 | set.seed(522)
54 | n <- 20
55 | K <- n/2 - 1
56 | m <- 1000
57 | mse <- matrix(0, n/2, 6)
58 |
59 | trimmed.mse <- function(n, m, k, p) {
60 | #MC est of mse for k-level trimmed mean of
61 | #contaminated normal pN(0,1) + (1-p)N(0,100)
62 | tmean <- numeric(m)
63 | for (i in 1:m) {
64 | sigma <- sample(c(1, 10), size = n,
65 | replace = TRUE, prob = c(p, 1-p))
66 | x <- sort(rnorm(n, 0, sigma))
67 | tmean[i] <- sum(x[(k+1):(n-k)]) / (n-2*k)
68 | }
69 | mse.est <- mean(tmean^2)
70 | se.mse <- sqrt(mean((tmean-mean(tmean))^2)) / sqrt(m)
71 | return(c(mse.est, se.mse))
72 | }
73 |
74 | for (k in 0:K) {
75 | mse[k+1, 1:2] <- trimmed.mse(n=n, m=m, k=k, p=1.0)
76 | mse[k+1, 3:4] <- trimmed.mse(n=n, m=m, k=k, p=.95)
77 | mse[k+1, 5:6] <- trimmed.mse(n=n, m=m, k=k, p=.9)
78 | }
79 |
80 | ### Example 7.4 (Confidence interval for variance)
81 |
82 | n <- 20
83 | alpha <- .05
84 | x <- rnorm(n, mean=0, sd=2)
85 | UCL <- (n-1) * var(x) / qchisq(alpha, df=n-1)
86 |
87 |
88 | ### Example 7.5 (MC estimate of confidence level)
89 |
90 | n <- 20
91 | alpha <- .05
92 | UCL <- replicate(1000, expr = {
93 | x <- rnorm(n, mean = 0, sd = 2)
94 | (n-1) * var(x) / qchisq(alpha, df = n-1)
95 | } )
96 | #count the number of intervals that contain sigma^2=4
97 | sum(UCL > 4)
98 | #or compute the mean to get the confidence level
99 | mean(UCL > 4)
100 |
101 |
102 | ### Example 7.6 (Empirical confidence level)
103 |
104 | n <- 20
105 | alpha <- .05
106 | UCL <- replicate(1000, expr = {
107 | x <- rchisq(n, df = 2)
108 | (n-1) * var(x) / qchisq(alpha, df = n-1)
109 | } )
110 | sum(UCL > 4)
111 | mean(UCL > 4)
112 |
113 |
114 | ### Example 7.7 (Empirical Type I error rate)
115 |
116 | n <- 20
117 | alpha <- .05
118 | mu0 <- 500
119 | sigma <- 100
120 |
121 | m <- 10000 #number of replicates
122 | p <- numeric(m) #storage for p-values
123 | for (j in 1:m) {
124 | x <- rnorm(n, mu0, sigma)
125 | ttest <- t.test(x, alternative = "greater", mu = mu0)
126 | p[j] <- ttest$p.value
127 | }
128 |
129 | p.hat <- mean(p < alpha)
130 | se.hat <- sqrt(p.hat * (1 - p.hat) / m)
131 | print(c(p.hat, se.hat))
132 |
133 |
134 | ### Example 7.8 (Skewness test of normality)
135 |
136 | n <- c(10, 20, 30, 50, 100, 500) #sample sizes
137 | cv <- qnorm(.975, 0, sqrt(6/n)) #crit. values for each n
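    | # editor's note (addition, not in the book's code): sqrt(6/n) is
    | # the asymptotic standard deviation of the sample skewness under
    | # normality; the finite-sample variance 6*(n-2)/((n+1)*(n+3)) is
    | # used instead in Example 7.10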
138 |
139 | sk <- function(x) {
140 | #computes the sample skewness coeff.
141 | xbar <- mean(x)
142 | m3 <- mean((x - xbar)^3)
143 | m2 <- mean((x - xbar)^2)
144 | return( m3 / m2^1.5 )
145 | }
146 |
147 | #n is a vector of sample sizes
148 | #we are doing length(n) different simulations
149 |
150 | p.reject <- numeric(length(n)) #to store sim. results
151 | m <- 10000 #num. repl. each sim.
152 |
153 | for (i in 1:length(n)) {
154 | sktests <- numeric(m) #test decisions
155 | for (j in 1:m) {
156 | x <- rnorm(n[i])
157 | #test decision is 1 (reject) or 0
158 | sktests[j] <- as.integer(abs(sk(x)) >= cv[i] )
159 | }
160 | p.reject[i] <- mean(sktests) #proportion rejected
161 | }
162 |
163 | p.reject
164 |
165 |
166 | ### Example 7.9 (Empirical power)
167 |
168 | #set.seed(521)
169 | n <- 20
170 | m <- 1000
171 | mu0 <- 500
172 | sigma <- 100
173 | mu <- c(seq(450, 650, 10)) #alternatives
174 | M <- length(mu)
175 | power <- numeric(M)
176 | for (i in 1:M) {
177 | mu1 <- mu[i]
178 | pvalues <- replicate(m, expr = {
179 | #simulate under alternative mu1
180 | x <- rnorm(n, mean = mu1, sd = sigma)
181 | ttest <- t.test(x,
182 | alternative = "greater", mu = mu0)
183 | ttest$p.value } )
184 | power[i] <- mean(pvalues <= .05)
185 | }
186 |
187 | se <- sqrt(power * (1-power) / m)
188 | df <- data.frame(mean=mu, power=power, upper=power+2*se, lower=power-2*se)
189 |
190 | library(ggplot2)
191 | ggplot(df, aes(x=mean, y=power)) +
192 | geom_line() +
193 | geom_vline(xintercept=500, lty=2) +
194 | geom_hline(yintercept=c(0,.05), lty=1:2) +
195 | geom_errorbar(aes(ymin=lower, ymax=upper), width = 0.2, lwd=1.5)
196 |
197 |
198 | ### Example 7.10 (Power of the skewness test of normality)
199 |
200 | #set.seed(111)
201 | alpha <- .1
202 | n <- 30
203 | m <- 2500
204 | epsilon <- c(seq(0, .15, .01), seq(.15, 1, .05))
205 | N <- length(epsilon)
206 | pwr <- numeric(N)
207 | #critical value for the skewness test
208 | cv <- qnorm(1-alpha/2, 0, sqrt(6*(n-2) / ((n+1)*(n+3))))
209 |
210 | for (j in 1:N) { #for each epsilon
211 | e <- epsilon[j]
212 | sktests <- numeric(m)
213 | for (i in 1:m) { #for each replicate
214 | sigma <- sample(c(1, 10), replace = TRUE,
215 | size = n, prob = c(1-e, e))
216 | x <- rnorm(n, 0, sigma)
217 | sktests[i] <- as.integer(abs(sk(x)) >= cv)
218 | }
219 | pwr[j] <- mean(sktests)
220 | }
221 |
222 | se <- sqrt(pwr * (1-pwr) / m)
223 | df <- data.frame(epsilon=epsilon, power=pwr, upper=pwr+2*se, lower=pwr-2*se)
224 |
225 | #plot power vs epsilon
226 | library(ggplot2)
227 | ggplot(df, aes(x=epsilon, y=power)) +
228 | geom_line() + labs(x=bquote(epsilon)) +
229 | geom_hline(yintercept=.1, lty=2) +
230 | geom_pointrange(aes(ymin=lower, ymax=upper))
231 |
232 |
233 | ### Example 7.11 (Power comparison of tests of normality)
234 |
235 | #only one loop, for epsilon=0.1, was shown in the text
236 | #the simulation below takes several minutes to run
237 |
238 | # initialize input and output
239 | library(energy)
240 | alpha <- .1
241 | n <- 30
242 | m <- 500 #try small m for a trial run
243 | test1 <- test2 <- test3 <- numeric(m)
244 |
245 | #critical value for the skewness test
246 | cv <- qnorm(1-alpha/2, 0, sqrt(6*(n-2) / ((n+1)*(n+3))))
247 | sim <- matrix(0, 11, 4)
248 |
249 | # estimate power
250 | for (i in 0:10) {
251 | epsilon <- i * .1
252 | for (j in 1:m) {
253 | e <- epsilon
254 | sigma <- sample(c(1, 10), replace = TRUE,
255 | size = n, prob = c(1-e, e))
256 | x <- rnorm(n, 0, sigma)
257 | test1[j] <- as.integer(abs(sk(x)) >= cv)
258 | test2[j] <- as.integer(
259 | shapiro.test(x)$p.value <= alpha)
260 | test3[j] <- as.integer(
261 | mvnorm.etest(x, R=200)$p.value <= alpha)
262 | }
263 | print(c(epsilon, mean(test1), mean(test2), mean(test3)))
264 | sim[i+1, ] <- c(epsilon, mean(test1), mean(test2), mean(test3))
265 | }
266 | detach(package:energy)
267 |
268 | # plot the empirical estimates of power
269 | plot(sim[,1], sim[,2], ylim = c(0, 1), type = "l",
270 | xlab = bquote(epsilon), ylab = "power")
271 | lines(sim[,1], sim[,3], lty = 2)
272 | lines(sim[,1], sim[,4], lty = 4)
273 | abline(h = alpha, lty = 3)
274 | legend("topright", 1, c("skewness", "S-W", "energy"),
275 | lty = c(1,2,4), inset = .02)
276 |
277 |
278 | ### Example 7.12 (Count Five test statistic)
279 |
280 | x1 <- rnorm(20, 0, sd = 1)
281 | x2 <- rnorm(20, 0, sd = 1.5)
282 | y <- c(x1, x2)
283 |
284 | group <- rep(1:2, each = length(x1))
285 | boxplot(y ~ group, boxwex = .3, xlim = c(.5, 2.5), main = "")
286 | points(group, y)
287 |
288 | # now identify the extreme points
289 | range(x1)
290 | range(x2)
291 |
292 | i <- which(x1 < min(x2))
293 | j <- which(x2 > max(x1))
294 |
295 | x1[i]
296 | x2[j]
297 |
298 | out1 <- sum(x1 > max(x2)) + sum(x1 < min(x2))
299 | out2 <- sum(x2 > max(x1)) + sum(x2 < min(x1))
300 | max(c(out1, out2))
301 |
302 |
303 | ### Example 7.13 (Count Five test statistic, cont.)
304 |
305 | maxout <- function(x, y) {
306 | X <- x - mean(x)
307 | Y <- y - mean(y)
308 | outx <- sum(X > max(Y)) + sum(X < min(Y))
309 | outy <- sum(Y > max(X)) + sum(Y < min(X))
310 | return(max(c(outx, outy)))
311 | }
312 |
313 | n1 <- n2 <- 20
314 | mu1 <- mu2 <- 0
315 | sigma1 <- sigma2 <- 1
316 | m <- 1000
317 |
318 | # generate samples under H0
319 | stat <- replicate(m, expr={
320 | x <- rnorm(n1, mu1, sigma1)
321 | y <- rnorm(n2, mu2, sigma2)
322 | maxout(x, y)
323 | })
324 | print(cumsum(table(stat)) / m)
325 | print(quantile(stat, c(.8, .9, .95)))
326 |
327 |
328 | ### Example 7.14 (Count Five test)
329 |
330 | count5test <- function(x, y) {
331 | X <- x - mean(x)
332 | Y <- y - mean(y)
333 | outx <- sum(X > max(Y)) + sum(X < min(Y))
334 | outy <- sum(Y > max(X)) + sum(Y < min(X))
335 | # return 1 (reject) or 0 (do not reject H0)
336 | return(as.integer(max(c(outx, outy)) > 5))
337 | }
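    | # editor's note (addition, not in the book's code): the cutoff 5
    | # is motivated by Example 7.13, where the simulated 0.95 quantile
    | # of the maxout statistic under H0 is about 5 for equal sample
    | # sizes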
338 |
339 | n1 <- n2 <- 20
340 | mu1 <- mu2 <- 0
341 | sigma1 <- sigma2 <- 1
342 | m <- 10000
343 | tests <- replicate(m, expr = {
344 | x <- rnorm(n1, mu1, sigma1)
345 | y <- rnorm(n2, mu2, sigma2)
346 | x <- x - mean(x) #centered by sample mean
347 | y <- y - mean(y)
348 | count5test(x, y)
349 | } )
350 |
351 | alphahat <- mean(tests)
352 | print(alphahat)
353 |
354 |
355 | ### Example 7.15 (Count Five test, cont.)
356 |
357 | n1 <- 20
358 | n2 <- 30
359 | mu1 <- mu2 <- 0
360 | sigma1 <- sigma2 <- 1
361 | m <- 10000
362 |
363 | alphahat <- mean(replicate(m, expr={
364 | x <- rnorm(n1, mu1, sigma1)
365 | y <- rnorm(n2, mu2, sigma2)
366 | x <- x - mean(x) #centered by sample mean
367 | y <- y - mean(y)
368 | count5test(x, y)
369 | }))
370 |
371 | print(alphahat)
372 |
373 |
374 | ### Example 7.16 (Count Five, cont.)
375 |
376 | # generate samples under H1 to estimate power
377 | sigma1 <- 1
378 | sigma2 <- 1.5
379 |
380 | power <- mean(replicate(m, expr={
381 | x <- rnorm(20, 0, sigma1)
382 | y <- rnorm(20, 0, sigma2)
383 | count5test(x, y)
384 | }))
385 |
386 | print(power)
387 |
--------------------------------------------------------------------------------
/examples-R/SCR2e-chNumerical.R:
--------------------------------------------------------------------------------
1 | #######################################################
2 | ### Statistical Computing with R 2e ###
3 | ### Maria L. Rizzo ###
4 | ### Chapman & Hall/CRC The R Series ###
5 | ### ISBN 9781466553323 - CAT# K15269 ###
6 | ### January 2019 ###
7 | ### ###
8 | ### R code for Chapter 13 ###
9 | ### Introduction to Numerical Methods in R ###
10 | #######################################################
11 |
12 | # packages to install: gsl, VGAM
13 |
14 | ### Example 13.1 (Identical and nearly equal)
15 |
16 | isTRUE(all.equal(.2, .3 - .1))
17 | all.equal(.2, .3) #not a logical value
18 | isTRUE(all.equal(.2, .3)) #always a logical value
19 |
20 | x <- 1:4
21 | y <- 2
22 | y == 2
23 | x == y #not necessarily a single logical value
24 | identical(x, y) #always a single logical value
25 | identical(y, 2)
26 |
27 |
28 | ### Example 13.2 (Ratio of two large numbers)
29 |
30 | n <- 400
31 | (gamma((n-1)/2) / (sqrt(pi) * gamma((n-2)/2)))
32 | exp(lgamma((n-1)/2) - lgamma((n-2)/2)) / sqrt(pi)
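    | # editor's note (addition, not in the book's code): gamma()
    | # overflows double precision for arguments above about 171, so
    | # the first ratio is NaN for n = 400; computing on the log scale
    | # with lgamma() avoids the overflow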
33 |
34 |
35 | ### Example 13.3 (Taylor expansion)
36 |
37 | system.time({
38 | for (i in 1:1000) {
39 | a <- rep(0, 24)
40 | a0 <- pi / 6
41 | a2 <- a0 * a0
42 | a[1] <- -a0^3 / 6
43 | for (i in 2:24)
44 | a[i] <- - a2 * a[i-1] / ((2*i+1)*(2*i))
45 | a0 + sum(a)}
46 | })
47 |
48 | system.time({
49 | for (i in 1:1000) {
50 | K <- 2 * (0:24) + 1
51 | i <- rep(c(1, -1), length=25)
52 | sum(i * (pi/6)^K / factorial(K))}
53 | })
54 |
55 |
56 | ### Example 13.4 (Derivative of zeta function)
57 |
58 | zeta.deriv <- function(a) {
59 | z <- a - 1
60 | # Stieltjes constants gamma_k for k=1:5
61 | g <- c(
62 | -.7281584548367672e-1,
63 | -.9690363192872318e-2,
64 | .2053834420303346e-2,
65 | .2325370065467300e-2,
66 | .7933238173010627e-3)
67 | i <- c(-1, 1, -1, 1, -1)
68 | n <- 0:4
69 | -1/z^2 + sum(i * g * z^n / factorial(n))
70 | }
71 |
72 |
73 | ### Example 13.5 (Derivative of zeta function, cont.)
74 |
75 | library(gsl) #for zeta function
76 | z <- c(1.001, 1.01, 1.5, 2, 3, 5)
77 | h <- .Machine$double.eps^0.5
78 | dz <- dq <- rep(0, length(z))
79 | for (i in 1:length(z)) {
80 | v <- z[i] + h
81 | h <- v - z[i]
82 | a0 <- z[i] - h
83 | if (a0 < 1) a0 <- (1 + z[i])/2
84 | a1 <- z[i] + h
85 | dq[i] <- (zeta(a1) - zeta(a0)) / (a1 - a0)
86 | dz[i] <- zeta.deriv(z[i])
87 | }
88 |
89 | h
90 |
91 | cbind(z, dz, dq)
92 |
93 |
94 | ### Example 13.6 (Solving f(x)=0)
95 |
96 | f <- function(y, a, n) {
97 | a^2 + y^2 + 2*a*y/(n-1) - (n-2)
98 | }
99 |
100 | a <- 0.5
101 | n <- 20
102 | b0 <- 0
103 | b1 <- 5*n
104 |
105 | #solve using bisection
106 | it <- 0
107 | eps <- .Machine$double.eps^0.25
108 | r <- seq(b0, b1, length=3)
109 | y <- c(f(r[1], a, n), f(r[2], a, n), f(r[3], a, n))
110 | if (y[1] * y[3] > 0)
111 | stop("f does not have opposite sign at endpoints")
112 |
113 | while(it < 1000 && abs(y[2]) > eps) {
114 | it <- it + 1
115 | if (y[1]*y[2] < 0) {
116 | r[3] <- r[2]
117 | y[3] <- y[2]
118 | } else {
119 | r[1] <- r[2]
120 | y[1] <- y[2]
121 | }
122 | r[2] <- (r[1] + r[3]) / 2
123 | y[2] <- f(r[2], a=a, n=n)
124 | print(c(r[1], y[1], y[3]-y[2]))
125 | }
126 |
127 |
128 | ### Example 13.7 (Solving f(x)=0 with Brent's method: uniroot)
129 |
130 | a <- 0.5
131 | n <- 20
132 | out <- uniroot(function(y) {
133 | a^2 + y^2 + 2*a*y/(n-1) - (n-2) },
134 | lower = 0, upper = n*5)
135 | unlist(out)
136 | uniroot(function(y) {a^2 + y^2 + 2*a*y/(n-1) - (n-2)},
137 | interval = c(-n*5, 0))$root
138 |
139 |
140 | ### Example 13.8 (Numerical integration with integrate)
141 |
142 | f <- function(y, N, r, rho) {
143 | (cosh(y) - rho * r)^(1 - N)
144 | }
145 | integrate(f, lower=0, upper=Inf,
146 | rel.tol=.Machine$double.eps^0.25,
147 | N=10, r=0.5, rho=0.2)
148 |
149 | ro <- seq(-.99, .99, .01)
150 | v <- rep(0, length(ro))
151 | for (i in 1:length(ro)) {
152 | v[i] <- integrate(f, lower=0, upper=Inf,
153 | rel.tol=.Machine$double.eps^0.25,
154 | N=10, r=0.5, rho=ro[i])$value
155 | }
156 | plot(ro, v, type="l", xlab=expression(rho),
157 | ylab="Integral Value (n=10, r=0.5)")
158 |
159 |
160 | ### Example 13.9 (Density of sample correlation coefficient)
161 |
162 | .dcorr <- function(r, N, rho=0) {
163 | # compute the density function of sample correlation
164 | if (abs(r) > 1 || abs(rho) > 1) return (0)
165 | if (N < 4) return (NA)
166 |
167 | if (isTRUE(all.equal(rho, 0.0))) {
168 | a <- exp(lgamma((N - 1)/2) - lgamma((N - 2)/2)) /
169 | sqrt(pi)
170 | return (a * (1 - r^2)^((N - 4)/2))
171 | }
172 |
173 | # if rho not 0, need to integrate
174 | f <- function(w, R, N, rho)
175 | (cosh(w) - rho * R)^(1 - N)
176 |
177 | #need to insert some error checking here
178 | i <- integrate(f, lower=0, upper=Inf,
179 | R=r, N=N, rho=rho)$value
180 | c1 <- (N - 2) * (1 - rho^2)^((N - 1)/2)
181 | c2 <- (1 - r^2)^((N - 4) / 2) / pi
182 | return(c1 * c2 * i)
183 | }
184 |
185 | r <- as.matrix(seq(-1, 1, .01))
186 | d1 <- apply(r, 1, .dcorr, N=10, rho=.0)
187 | d2 <- apply(r, 1, .dcorr, N=10, rho=.5)
188 | d3 <- apply(r, 1, .dcorr, N=10, rho=-.5)
189 | plot(r, d2, type="l", lty=2, lwd=2, ylab="density")
190 | lines(r, d1, lwd=2)
191 | lines(r, d3, lty=4, lwd=2)
192 | legend("top", inset=.02,
193 | c("rho = 0", "rho = 0.5", "rho = -0.5"), lty=c(1,2,4), lwd=2)
194 |
195 |
196 | ### Example 13.10 (MLE using mle)
197 |
198 | #the observed sample
199 | y <- c(0.04304550, 0.50263474)
200 |
201 | mlogL <- function(theta=1) {
202 | #minus log-likelihood of exp. density, rate theta
203 | return( - (length(y) * log(theta) - theta * sum(y)))
204 | }
205 |
206 | library(stats4)
207 | fit <- mle(mlogL)
208 | summary(fit)
209 |
210 | # Alternatively, the initial value for the optimizer could
211 | # be supplied in the call to mle; two examples are
212 |
213 | mle(mlogL, start=list(theta=1))
214 | mle(mlogL, start=list(theta=mean(y)))
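    | # sanity check: for the exponential rate the MLE has the closed form
    | # n/sum(y) = 1/mean(y), which the optimizer should reproduce
    | 1 / mean(y)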
215 |
216 | ### Application: Evaluating an Expected Value
217 |
218 | y <- VGAM::rpareto(1000, scale=1, shape=3)
219 | hist(y, prob=TRUE, breaks="scott", main="", ylim=c(0,3))
220 | curve(VGAM::dpareto(x, scale=1, shape=3), add=TRUE)
221 |
222 | a <- 1
223 | s <- 1
224 | b <- 0.5
225 |
226 | ### Example 13.11 (Numerical integration)
227 |
228 | f1 <- function(x, y, s, a, b) {
229 | # the integrand function: |y-x|^b f(x)
230 | (abs(y-x))^b * a * s^a / x^(a+1)
231 | }
232 |
233 | integrate(f1, lower=s, upper=Inf, y=2, s=1, a=1, b=b)
234 |
235 | ### Example 13.12 (Direct evaluation)
236 |
237 | g2 <- function(y, s, b) {
238 | # Compute E|y-X|^b for Pareto(I) with a=1
239 | y0 <- (y - s)/y
240 | c0 <- gsl::hyperg_2F1(b+1, 1, b+2, y0)
241 | ((y - s)^b) - s*b*(y^(b-1)) *
242 | ((y0^b)/b + c0*(y0^(b+1))/(b+1)) +
243 | s*y^(b-1) * beta(b+1,1-b)
244 | }
245 |
246 | g2(2, s, b)
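    | # the direct evaluation should agree, up to numerical error, with the
    | # integrate() result at y = 2 in Example 13.11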
247 |
248 | ### Example 13.13 (Plot of expected distance function)
249 |
250 | p <- c(ppoints(50)/100, ppoints(50))
251 | x <- VGAM::qpareto(p, scale=s, shape=1)
252 | ex <- g2(x, s=s, b=b)
253 | plot(log(x), log(ex), cex=.25, type="l")
254 | for (i in 1:length(x)) {
255 | y <- x[i]
256 | zi <- integrate(f1, lower=s, upper=Inf,
257 | subdivisions=200, rel.tol=.Machine$double.eps^.5,
258 | stop.on.error=FALSE, y=y, s=s, a=1, b=b)
259 | if (zi$message == "OK")
260 | points(log(y), log(zi$value), col=2, cex=.25)
261 |     else print(paste("y=", y, zi$message))
262 | }
263 |
264 | integrate(f1, lower=s, upper=Inf, y=s, s=s, a=1, b=b)
265 | g2(s, s, b)
266 |
--------------------------------------------------------------------------------
/examples-R/SCR2e-chOptim.R:
--------------------------------------------------------------------------------
1 | #######################################################
2 | ### Statistical Computing with R 2e ###
3 | ### Maria L. Rizzo ###
4 | ### Chapman & Hall/CRC The R Series ###
5 | ### ISBN 9781466553323 - CAT# K15269 ###
6 | ### January 2019 ###
7 | ### ###
8 | ### R code for Chapter 14 ###
9 | ### Optimization ###
10 | #######################################################
11 |
12 |
13 | ### Example 14.1 (One-dimensional optimization with optimize)
14 |
15 | f <- function(x)
16 | log(x + log(x))/log(1+x)
17 |
18 | curve(f(x), from = 2, to = 15, ylab = "f(x)")
19 |
20 | optimize(f, lower = 4, upper = 8, maximum = TRUE)
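    | # a small addition: store the result and mark the maximum on the curve
    | opt <- optimize(f, lower = 4, upper = 8, maximum = TRUE)
    | points(opt$maximum, opt$objective, pch = 16)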
21 |
22 |
23 | ### Example 14.2 (MLE: Gamma distribution)
24 |
25 | m <- 20000
26 | est <- matrix(0, m, 2)
27 | n <- 200
28 | r <- 5
29 | lambda <- 2
30 |
31 | obj <- function(lambda, xbar, logx.bar) {
32 | digamma(lambda * xbar) - logx.bar - log(lambda)
33 | }
34 |
35 | for (i in 1:m) {
36 | x <- rgamma(n, shape=r, rate=lambda)
37 | xbar <- mean(x)
38 | u <- uniroot(obj, lower = .001, upper = 10e5,
39 | xbar = xbar, logx.bar = mean(log(x)))
40 | lambda.hat <- u$root
41 | r.hat <- xbar * lambda.hat
42 | est[i, ] <- c(r.hat, lambda.hat)
43 | }
44 |
45 | ML <- colMeans(est)
46 |
47 | hist(est[, 1], breaks="scott", freq=FALSE,
48 | xlab="r", main="")
49 | points(ML[1], 0, cex=1.5, pch=20)
50 | hist(est[, 2], breaks="scott", freq=FALSE,
51 | xlab=bquote(lambda), main="")
52 | points(ML[2], 0, cex=1.5, pch=20)
53 |
54 |
55 | ### Example 14.3 (Two-dimensional optimization with optim)
56 |
57 | LL <- function(theta, sx, slogx, n) {
58 | r <- theta[1]
59 | lambda <- theta[2]
60 | loglik <- n * r * log(lambda) + (r - 1) * slogx -
61 | lambda * sx - n * log(gamma(r))
62 | - loglik
63 | }
64 |
65 | n <- 200
66 | r <- 5; lambda <- 2
67 | x <- rgamma(n, shape=r, rate=lambda)
68 |
69 | optim(c(1,1), LL, sx=sum(x), slogx=sum(log(x)), n=n)
70 |
71 | mlests <- replicate(20000, expr = {
72 | x <- rgamma(200, shape = 5, rate = 2)
73 | optim(c(1,1), LL, sx=sum(x), slogx=sum(log(x)), n=n)$par
74 | })
75 | colMeans(t(mlests))
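    | # the column means above estimate E(r.hat) and E(lambda.hat);
    | # compare with the true values r = 5 and lambda = 2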
76 |
77 |
78 | ### Example 14.4 (MLE for a quadratic form)
79 |
80 | LL <- function(lambda, y) {
81 | lambda3 <- 1 - sum(lambda)
82 | f1 <- dgamma(y, shape=1/2, rate=1/(2*lambda[1]))
83 | f2 <- dgamma(y, shape=1/2, rate=1/(2*lambda[2]))
84 | f3 <- dgamma(y, shape=1/2, rate=1/(2*lambda3))
85 | f <- f1/3 + f2/3 + f3/3 #density of mixture
86 | #returning -loglikelihood
87 | return( -sum(log(f)))
88 | }
89 |
90 | set.seed(543)
91 | m <- 2000
92 | lambda <- c(.6, .25, .15) #rate is 1/(2lambda)
93 | lam <- sample(lambda, size = 2000, replace = TRUE)
94 | y <- rgamma(m, shape = .5, rate = 1/(2*lam))
95 |
96 | opt <- optim(c(.5,.3), LL, y=y)
97 | theta <- c(opt$par, 1 - sum(opt$par))
98 |
99 | as.data.frame(unlist(opt))
100 |
101 | theta
102 |
103 |
104 | ### Example 14.5 (EM algorithm for a mixture model)
105 |
106 | set.seed(543)
107 | lambda <- c(.6, .25, .15) #rate is 1/(2lambda)
108 | lam <- sample(lambda, size = 2000, replace = TRUE)
109 | y <- rgamma(m, shape = .5, rate = 1/(2*lam))
110 |
111 | N <- 10000 #max. number of iterations
112 | L <- c(.5, .4, .1) #initial est. for lambdas
113 | tol <- .Machine$double.eps^0.5
114 | L.old <- L + 1
115 |
116 | for (j in 1:N) {
117 | f1 <- dgamma(y, shape=1/2, rate=1/(2*L[1]))
118 | f2 <- dgamma(y, shape=1/2, rate=1/(2*L[2]))
119 | f3 <- dgamma(y, shape=1/2, rate=1/(2*L[3]))
120 | py <- f1 / (f1 + f2 + f3) #posterior prob y from 1
121 | qy <- f2 / (f1 + f2 + f3) #posterior prob y from 2
122 | ry <- f3 / (f1 + f2 + f3) #posterior prob y from 3
123 |
124 | mu1 <- sum(y * py) / sum(py) #update means
125 | mu2 <- sum(y * qy) / sum(qy)
126 | mu3 <- sum(y * ry) / sum(ry)
127 | L <- c(mu1, mu2, mu3) #update lambdas
128 | L <- L / sum(L)
129 |
130 | if (sum(abs(L - L.old)/L.old) < tol) break
131 | L.old <- L
132 | }
133 |
134 | print(list(lambda = L/sum(L), iter = j, tol = tol))
135 |
136 |
137 | ### Example 14.6 (Simplex algorithm)
138 |
139 | library(boot) #for simplex function
140 | A1 <- rbind(c(-2, 1, 1), c(4, -1, 3))
141 | b1 <- c(1, 3)
142 | a <- c(2, 2, 3)
143 | simplex(a = a, A1 = A1, b1 = b1, maxi = TRUE)
144 | detach(package:boot)
145 |
146 |
147 | ### Example 14.7 (Solving the Morra game)
148 |
149 | solve.game <- function(A) {
150 | #solve the two player zero-sum game by simplex method
151 | #optimize for player 1, then player 2
152 | #maximize v subject to ...
153 | #let x strategies 1:m, and put v as extra variable
154 | #A1, the <= constraints
155 | #
156 | min.A <- min(A)
157 | A <- A - min.A #so that v >= 0
158 | max.A <- max(A)
159 | A <- A / max(A)
160 | m <- nrow(A)
161 | n <- ncol(A)
162 | it <- n^3
163 | a <- c(rep(0, m), 1) #objective function
164 | A1 <- -cbind(t(A), rep(-1, n)) #constraints <=
165 | b1 <- rep(0, n)
166 | A3 <- t(as.matrix(c(rep(1, m), 0))) #constraints sum(x)=1
167 | b3 <- 1
168 | sx <- simplex(a=a, A1=A1, b1=b1, A3=A3, b3=b3,
169 | maxi=TRUE, n.iter=it)
170 | #the 'solution' is [x1,x2,...,xm | value of game]
171 | #
172 | #minimize v subject to ...
173 | #let y strategies 1:n, with v as extra variable
174 | a <- c(rep(0, n), 1) #objective function
175 | A1 <- cbind(A, rep(-1, m)) #constraints <=
176 | b1 <- rep(0, m)
177 | A3 <- t(as.matrix(c(rep(1, n), 0))) #constraints sum(y)=1
178 | b3 <- 1
179 | sy <- simplex(a=a, A1=A1, b1=b1, A3=A3, b3=b3,
180 | maxi=FALSE, n.iter=it)
181 |
182 | soln <- list("A" = A * max.A + min.A,
183 | "x" = sx$soln[1:m],
184 | "y" = sy$soln[1:n],
185 | "v" = sx$soln[m+1] * max.A + min.A)
186 | soln
187 | }
188 |
189 |
190 | #enter the payoff matrix
191 | A <- matrix(c( 0,-2,-2,3,0,0,4,0,0,
192 | 2,0,0,0,-3,-3,4,0,0,
193 | 2,0,0,3,0,0,0,-4,-4,
194 | -3,0,-3,0,4,0,0,5,0,
195 | 0,3,0,-4,0,-4,0,5,0,
196 | 0,3,0,0,4,0,-5,0,-5,
197 | -4,-4,0,0,0,5,0,0,6,
198 | 0,0,4,-5,-5,0,0,0,6,
199 | 0,0,4,0,0,5,-6,-6,0), 9, 9)
200 |
201 | library(boot) #needed for simplex function
202 |
203 | s <- solve.game(A)
204 | round(cbind(s$x, s$y), 7)
205 |
--------------------------------------------------------------------------------
/examples-R/SCR2e-chPerm.R:
--------------------------------------------------------------------------------
1 | #######################################################
2 | ### Statistical Computing with R 2e ###
3 | ### Maria L. Rizzo ###
4 | ### Chapman & Hall/CRC The R Series ###
5 | ### ISBN 9781466553323 - CAT# K15269 ###
6 | ### January 2019 ###
7 | ### ###
8 | ### R code for Chapter 10 ###
9 | ### Permutation Tests ###
10 | #######################################################
11 |
12 | # packages to install: energy, yaImpute
13 |
14 | ### Example 10.1 (Permutation distribution of a statistic)
15 |
16 | attach(chickwts)
17 | x <- sort(weight[feed == "soybean"])
18 | y <- sort(weight[feed == "linseed"])
19 | detach(chickwts)
20 |
21 | R <- 999 #number of replicates
22 | z <- c(x, y) #pooled sample
23 | K <- 1:26
24 | reps <- numeric(R) #storage for replicates
25 | t0 <- t.test(x, y)$statistic
26 |
27 | for (i in 1:R) {
28 | #generate indices k for the first sample
29 | k <- sample(K, size = 14, replace = FALSE)
30 | x1 <- z[k]
31 | y1 <- z[-k] #complement of x1
32 | reps[i] <- t.test(x1, y1)$statistic
33 | }
34 | p <- mean(c(t0, reps) >= t0)
35 | p
36 |
37 | hist(reps, main = "", freq = FALSE, xlab = "T (p = 0.202)",
38 | breaks = "scott")
39 | points(t0, 0, cex = 1, pch = 16) #observed T
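    | # for reference, the corresponding one-sided two-sample t-test
    | # p-value should be close to the permutation estimate
    | t.test(x, y, alternative = "greater")$p.value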
40 |
41 |
42 | ### Example 10.2 (Permutation distribution of the K-S statistic)
43 |
44 | # continues Example 10.1
45 | R <- 999 #number of replicates
46 | z <- c(x, y) #pooled sample
47 | K <- 1:26
48 | D <- numeric(R) #storage for replicates
49 | options(warn = -1)
50 | D0 <- ks.test(x, y, exact = FALSE)$statistic
51 | for (i in 1:R) {
52 | #generate indices k for the first sample
53 | k <- sample(K, size = 14, replace = FALSE)
54 | x1 <- z[k]
55 | y1 <- z[-k] #complement of x1
56 | D[i] <- ks.test(x1, y1, exact = FALSE)$statistic
57 | }
58 | p <- mean(c(D0, D) >= D0)
59 | options(warn = 0)
60 | p
61 |
62 | hist(D, main = "", freq = FALSE, xlab = "D (p = 0.46)",
63 | breaks = "scott")
64 | points(D0, 0, cex = 1, pch = 16) #observed D
65 |
66 |
67 | ### Example 10.3 (Example 10.2, cont.)
68 |
69 | attach(chickwts)
70 | x <- sort(weight[feed == "sunflower"])
71 | y <- sort(weight[feed == "linseed"])
72 | detach(chickwts)
73 |
74 | summary(cbind(x, y))
75 | z <- c(x, y); K <- 1:24   #new pooled sample (12 + 12 obs.)
76 | options(warn = -1)
76 | D0 <- ks.test(x, y, exact = FALSE)$statistic
77 | for (i in 1:R) {
78 | #generate indices k for the first sample
79 |   k <- sample(K, size = 12, replace = FALSE)
80 | x1 <- z[k]
81 | y1 <- z[-k] #complement of x1
82 | D[i] <- ks.test(x1, y1, exact = FALSE)$statistic
83 | }
84 | p <- mean(c(D0, D) >= D0)
85 | options(warn = 0)
86 | p
87 |
88 |
89 | ### Example 10.4 (Finding nearest neighbors)
90 | ### using yaImpute::ann
91 |
92 | set.seed(439)
93 | library(yaImpute) #for ann function
94 |
95 | #generate a small multivariate data set
96 | x <- matrix(rnorm(12), 3, 4)
97 | y <- matrix(rnorm(12), 3, 4)
98 | z <- rbind(x, y)
99 | k <- nrow(z) #number of nearest neighbors desired
100 |
101 | ## Do an exact kd-tree search
102 | kd.exact <- ann(ref=z, target=z,
103 | tree.type="kd", k=k, verbose=FALSE)
104 | kd.exact$knnIndexDist[,1:k] #NN indices
105 | round(sqrt(kd.exact$knnIndexDist[,-(1:k)]),2) #Euclidean distances
106 |
107 | ## Do an approximate kd-tree search
108 | kd.approx <- ann(ref=z, target=z,
109 | tree.type="kd", k=k, eps=100, verbose=FALSE)
110 | kd.approx$knnIndexDist[,1:k] #NN indices
111 | detach(package:yaImpute)
112 |
113 |
114 | ### Example 10.5 (Nearest neighbor statistic)
115 | ### using yaImpute::ann
116 |
117 | library(yaImpute)
118 | attach(chickwts)
119 | x <- weight[feed == "sunflower"]
120 | y <- weight[feed == "linseed"]
121 | z <- as.matrix(c(x, y))
122 | detach(chickwts)
123 |
124 | k <- 4 #want first 3 nearest neighbors
125 | NN <- ann(ref=z, target=z, tree.type="kd", k=k, verbose=FALSE)
126 | idx <- NN$knnIndexDist[,1:k]
127 | nn.idx <- idx[,-1] #first NN is in column 2
128 |
129 | block1 <- nn.idx[1:12, ]
130 | block2 <- nn.idx[13:24, ]
131 | i1 <- sum(block1 < 12.5)
132 | i2 <- sum(block2 > 12.5)
133 |
134 | c(i1, i2)
135 |
136 | detach(package:yaImpute)
137 |
138 |
139 | ### Example 10.6 (Nearest neighbor test)
140 | ### using yaImpute::ann
141 |
142 | library(boot)
143 | #continues the previous example
144 |
145 | ## function to return the matrix of indices NN_j of nearest neighbors
146 | NN.idx <- function(x, tree.type="kd", k=NROW(x)) {
147 | x <- as.matrix(x)
148 | k <- min(c(k+1, NROW(x)))
149 | NN <- yaImpute::ann(ref=x, target=x,
150 | tree.type="kd", k=k, verbose=FALSE)
151 | idx <- NN$knnIndexDist[,1:k]
152 | nn.idx <- idx[,-1] #first NN is in column 2
153 | row.names(nn.idx) <- idx[,1]
154 | nn.idx
155 | }
156 |
157 | ## function to compute the NN statistic T(n,3)
158 | Tn3 <- function(z, ix=1:NROW(z), sizes) {
159 | z <- as.matrix(z)
160 | n1 <- sizes[1]
161 | n2 <- sizes[2]
162 | n <- n1 + n2
163 | z <- as.matrix(z[ix, ])
164 | nn.idx <- NN.idx(z, k=3)
165 | block1 <- nn.idx[1:n1, ]
166 | block2 <- nn.idx[(n1+1):n, ]
167 | i1 <- sum(block1 < n1 + .5)
168 | i2 <- sum(block2 > n1 + .5)
169 | return((i1 + i2) / (3 * n))
170 | }
171 |
172 | attach(chickwts)
173 | x <- weight[feed == "sunflower"]
174 | y <- weight[feed == "linseed"]
175 | z <- c(x, y)
176 | detach(chickwts)
177 |
178 | N <- c(NROW(x), NROW(y))
179 |
180 | boot.obj <- boot(data = z, statistic = Tn3,
181 | sim = "permutation", R = 999, sizes = N)
182 | boot.obj
183 |
184 | tb <- c(boot.obj$t, boot.obj$t0)
185 | mean(tb >= boot.obj$t0)
186 |
187 | hist(tb, freq=FALSE, main="",
188 | xlab="replicates of T(n,3) statistic")
189 | points(boot.obj$t0, 0, cex=1, pch=16)
190 |
191 |
192 |
193 |
194 | ### Example 10.7 (Two-sample energy statistic)
195 |
196 | edist.2 <- function(x, ix, sizes) {
197 | # computes the e-statistic between 2 samples
198 | # x: Euclidean distances of pooled sample
199 | # sizes: vector of sample sizes
200 | # ix: a permutation of row indices of x
201 |
202 | dst <- x
203 | n1 <- sizes[1]
204 | n2 <- sizes[2]
205 | ii <- ix[1:n1]
206 | jj <- ix[(n1+1):(n1+n2)]
207 | w <- n1 * n2 / (n1 + n2)
208 |
209 | # permutation applied to rows & cols of dist. matrix
210 | m11 <- sum(dst[ii, ii]) / (n1 * n1)
211 | m22 <- sum(dst[jj, jj]) / (n2 * n2)
212 | m12 <- sum(dst[ii, jj]) / (n1 * n2)
213 | e <- w * ((m12 + m12) - (m11 + m22))
214 | return (e)
215 | }
216 |
217 | d <- 3
218 | a <- 2 / sqrt(d)
219 | x <- matrix(rnorm(20 * d), nrow = 20, ncol = d)
220 | y <- matrix(rnorm(10 * d, a, 1), nrow = 10, ncol = d)
221 | z <- rbind(x, y)
222 | dst <- as.matrix(dist(z))
223 |
224 | edist.2(dst, 1:30, sizes = c(20, 10))
225 |
226 |
227 | ### Example 10.8 (Two-sample energy test)
228 |
229 | library(boot) #for boot function
230 | dst <- as.matrix(dist(z))
231 | N <- c(20, 10)
232 |
233 | boot.obj <- boot(data = dst, statistic = edist.2,
234 | sim = "permutation", R = 999, sizes = N)
235 | boot.obj
236 |
237 | #calculate the ASL
238 | e <- boot.obj$t0
239 | tb <- c(e, boot.obj$t)
240 | mean(tb >= e)
241 |
242 | hist(tb, main = "", breaks="scott", freq=FALSE,
243 | xlab="Replicates of e")
244 | points(e, 0, cex=1, pch=16)
245 |
246 |
247 | #energy test applied under F=G
248 | d <- 3
249 | a <- 0
250 | x <- matrix(rnorm(20 * d), nrow = 20, ncol = d)
251 | y <- matrix(rnorm(10 * d, a, 1), nrow = 10, ncol = d)
252 | z <- rbind(x, y)
253 | dst <- as.matrix(dist(z))
254 |
255 | N <- c(20, 10)
256 | dst <- as.matrix(dist(z))
257 | boot.obj <- boot(data = dst, statistic = edist.2,
258 | sim="permutation", R=999, sizes=N)
259 | boot.obj
260 |
261 | #calculate the ASL
262 | e <- boot.obj$t0
263 | E <- c(boot.obj$t, e)
264 | mean(E >= e)
265 |
266 | hist(E, main = "", breaks="scott",
267 | xlab="Replicates of e", freq=FALSE)
268 | points(e, 0, cex=1, pch=16)
269 |
270 | ### Example 10.9 (k-sample energy distances)
271 |
272 | z <- iris[ , 1:4]
273 | dst <- dist(z)
274 | energy::edist(dst, sizes = c(50, 50, 50), distance = TRUE)
275 |
276 | ### Example 10.10 (Distance Components (disco))
277 |
278 | set.seed(413)
279 | energy::disco(iris[ , 1:4], factors = iris$Species, R = 999)
280 |
281 | ### Example 10.11 (Power Comparison)
282 |
283 | # results of several simulations summarized in Table
284 |
285 | ### Example 10.12 (Distance covariance statistic)
286 |
287 | dCov <- function(x, y) {
288 | x <- as.matrix(x)
289 | y <- as.matrix(y)
290 | n <- nrow(x)
291 | m <- nrow(y)
292 | if (n != m || n < 2) stop("Sample sizes must agree")
293 | if (! (all(is.finite(c(x, y)))))
294 | stop("Data contains missing or infinite values")
295 |
296 | Akl <- function(x) {
297 | d <- as.matrix(dist(x))
298 | m <- rowMeans(d)
299 | M <- mean(d)
300 | a <- sweep(d, 1, m)
301 | b <- sweep(a, 2, m)
302 | return(b + M)
303 | }
304 | A <- Akl(x)
305 | B <- Akl(y)
306 | dCov <- sqrt(mean(A * B))
307 | dCov
308 | }
309 |
310 | z <- as.matrix(iris[1:50, 1:4])
311 | x <- z[ , 1:2]
312 | y <- z[ , 3:4]
313 | # compute the observed statistic
314 | dCov(x, y)
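    | # check (assuming the energy package is installed): the packaged
    | # statistic should match the value above
    | energy::dcov(x, y)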
315 |
316 | ### Example 10.13 (Distance correlation statistic)
317 |
318 | DCOR <- function(x, y) {
319 | x <- as.matrix(x)
320 | y <- as.matrix(y)
321 | n <- nrow(x)
322 | m <- nrow(y)
323 | if (n != m || n < 2) stop("Sample sizes must agree")
324 | if (! (all(is.finite(c(x, y)))))
325 | stop("Data contains missing or infinite values")
326 | Akl <- function(x) {
327 | d <- as.matrix(dist(x))
328 | m <- rowMeans(d)
329 | M <- mean(d)
330 | a <- sweep(d, 1, m)
331 | b <- sweep(a, 2, m)
332 | return(b + M)
333 | }
334 | A <- Akl(x)
335 | B <- Akl(y)
336 | dCov <- sqrt(mean(A * B))
337 | dVarX <- sqrt(mean(A * A))
338 | dVarY <- sqrt(mean(B * B))
339 | dCor <- sqrt(dCov / sqrt(dVarX * dVarY))
340 | list(dCov=dCov, dCor=dCor, dVarX=dVarX, dVarY=dVarY)
341 | }
342 |
343 | z <- as.matrix(iris[1:50, 1:4])
344 | x <- z[ , 1:2]
345 | y <- z[ , 3:4]
346 |
347 | DCOR(x, y)
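    | # the energy package also computes the sample distance correlation
    | # directly (assuming energy is installed):
    | energy::dcor(x, y)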
348 |
349 |
350 | ### Example 10.14 (Distance covariance test)
351 |
352 | ndCov2 <- function(z, ix, dims) {
353 | p <- dims[1]
354 | q1 <- p + 1
355 | d <- p + dims[2]
356 | x <- z[ , 1:p] #leave x as is
357 | y <- z[ix, q1:d] #permute rows of y
358 | return(nrow(z) * dCov(x, y)^2)
359 | }
360 |
361 | library(boot)
362 | z <- as.matrix(iris[1:50, 1:4])
363 | boot.obj <- boot(data = z, statistic = ndCov2, R = 999,
364 | sim = "permutation", dims = c(2, 2))
365 |
366 | tb <- c(boot.obj$t0, boot.obj$t)
367 | hist(tb, nclass="scott", xlab="", main="",
368 | freq=FALSE)
369 | points(boot.obj$t0, 0, cex=1, pch=16)
370 |
371 | mean(tb >= boot.obj$t0)
372 | boot.obj
373 |
--------------------------------------------------------------------------------
/examples-R/SCR2e-chRandomProcess.R:
--------------------------------------------------------------------------------
1 | #######################################################
2 | ### Statistical Computing with R 2e ###
3 | ### Maria L. Rizzo ###
4 | ### Chapman & Hall/CRC The R Series ###
5 | ### ISBN 9781466553323 - CAT# K15269 ###
6 | ### January 2019 ###
7 | ### ###
8 | ### R code for Chapter 4 ###
9 | ### Generating Random Processes ###
10 | #######################################################
11 |
12 |
13 | ### Example 4.1 (Poisson process)
14 |
15 | lambda <- 2
16 | t0 <- 3
17 | Tn <- rexp(100, lambda) #interarrival times
18 | Sn <- cumsum(Tn) #arrival times
19 | n <- min(which(Sn > t0)) #arrivals+1 in [0, t0]
20 |
21 |
22 | ### Example 4.2 (Poisson process, cont.)
23 |
24 | lambda <- 2
25 | t0 <- 3
26 | upper <- 100
27 | pp <- numeric(10000)
28 | for (i in 1:10000) {
29 | N <- rpois(1, lambda * upper)
30 | Un <- runif(N, 0, upper) #unordered arrival times
31 | Sn <- sort(Un) #arrival times
32 | n <- min(which(Sn > t0)) #arrivals+1 in [0, t0]
33 | pp[i] <- n - 1 #arrivals in [0, t0]
34 | }
35 |
36 | #alternately, the loop can be replaced by replicate function
37 | pp <- replicate(10000, expr = {
38 | N <- rpois(1, lambda * upper)
39 | Un <- runif(N, 0, upper) #unordered arrival times
40 | Sn <- sort(Un) #arrival times
41 | n <- min(which(Sn > t0)) #arrivals+1 in [0, t0]
42 | n - 1 }) #arrivals in [0, t0]
43 |
44 | c(mean(pp), var(pp))
45 |
46 |
47 | ### Example 4.3 (Nonhomogeneous Poisson process)
48 |
49 | lambda <- 3
50 | upper <- 100
51 | N <- rpois(1, lambda * upper)
52 | Tn <- rexp(N, lambda)
53 | Sn <- cumsum(Tn)
54 | Un <- runif(N)
55 | keep <- (Un <= cos(Sn)^2) #indicator, as logical vector
56 | Sn[keep]
57 |
58 | round(Sn[keep], 4)
59 |
60 |
61 | ### Example 4.4 (Renewal process)
62 |
63 | t0 <- 5
64 | Tn <- rgeom(100, prob = .2) #interarrival times
65 | Sn <- cumsum(Tn) #arrival times
66 | n <- min(which(Sn > t0)) #arrivals+1 in [0, t0]
67 |
68 | Nt0 <- replicate(1000, expr = {
69 | Sn <- cumsum(rgeom(100, prob = .2))
70 | min(which(Sn > t0)) - 1
71 | })
72 | table(Nt0)/1000
73 | Nt0
74 |
75 | t0 <- seq(0.1, 30, .1)
76 | mt <- numeric(length(t0))
77 |
78 | for (i in 1:length(t0)) {
79 | mt[i] <- mean(replicate(1000,
80 | {
81 | Sn <- cumsum(rgeom(100, prob = .2))
82 | min(which(Sn > t0[i])) - 1
83 | }))
84 | }
85 | plot(t0, mt, type = "l", xlab = "t", ylab = "mean")
86 | abline(0, .25)
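    | # reference line: the elementary renewal theorem gives m(t)/t -> 1/E(T1)
    | # as t -> Inf, and here E(T1) = (1-p)/p = 4, so the slope is 1/4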
87 |
88 |
89 | ### Example 4.5 (Symmetric random walk)
90 |
91 | n <- 400
92 | incr <- sample(c(-1, 1), size = n, replace = TRUE)
93 | S <- as.integer(c(0, cumsum(incr)))
94 | plot(0:n, S, type = "l", main = "", xlab = "i")
95 |
96 | ### Example 4.6 (Generator for the time until return to origin)
97 |
98 | set.seed(12345)
99 |
100 | #compute the probabilities directly
101 | n <- 1:10000
102 | p2n <- exp(lgamma(2*n-1)
103 | - log(n) - (2*n-1)*log(2) - 2*lgamma(n))
104 | #or compute using dbinom
105 | P2n <- (.5/n) * dbinom(n-1, size = 2*n-2, prob = 0.5)
106 | pP2n <- cumsum(P2n)
107 |
108 | #given n compute the time of the last return to 0 in (0,n]
109 | n <- 200
110 | sumT <- 0
111 | while (sumT <= n) {
112 | u <- runif(1)
113 | s <- sum(u > pP2n)
114 | if (s == length(pP2n))
115 | warning("T is truncated")
116 | Tj <- 2 * (1 + s)
117 | #print(c(Tj, sumT))
118 | sumT <- sumT + Tj
119 | }
120 | sumT - Tj
121 |
122 | ### Example 4.7 (Brownian motion)
123 |
124 |
125 | simBM <- function(n, T) {
126 |   times <- seq(0, T, length = n+1)
127 |   z <- rnorm(n)
128 |   w <- rep(0, n+1)          #w[k] = W(times[k]), with W(0) = 0
129 |   s <- sqrt(diff(times))    #sd of each increment
130 |   for (k in 2:(n+1)) {
131 |     w[k] <- w[k-1] + s[k-1] * z[k-1]
132 |   }
133 |   return (list(w=w, t=times))
134 | }
135 |
136 | set.seed(1)
137 | n <- 200
138 | x1 <- simBM(n, 1)
139 | x2 <- simBM(n, 1)
140 | x3 <- simBM(n, 1)
141 | r <- range(c(x1$w, x2$w, x3$w))
142 | plot(x1$w, type="l", main="", xlab="t", ylab="W", ylim=r)
143 | lines(x2$w, lty=2)
144 | lines(x3$w, lty=3)
145 |
146 | interpBM <- function(w, t0, times) {
147 | k1 <- sum(times < t0)
148 | k <- k1 + 1
149 | b <- (t0 - times[k1]) / (times[k] - times[k1])
150 | return (w[k1] + b * (w[k] - w[k1]))
151 | }
152 |
153 | plot(x1$t[1:10], x1$w[1:10], type="b", main="", xlab="t", ylab="W")
154 | tmids <- x1$t + 0.0025
155 | for (i in 1:10) {
156 | w <- interpBM(x1$w, tmids[i], x1$t)
157 | points(tmids[i], w, pch=2)
158 | }
159 |
160 | legend("topleft", c("Generated W", "Interpolated W"), pch=c(1,2), lty=1, bty="n")
161 |
--------------------------------------------------------------------------------
/examples-R/SCR2e-chRgen.R:
--------------------------------------------------------------------------------
1 | #######################################################
2 | ### Statistical Computing with R 2e ###
3 | ### Maria L. Rizzo ###
4 | ### Chapman & Hall/CRC The R Series ###
5 | ### ISBN 9781466553323 - CAT# K15269 ###
6 | ### January 2019 ###
7 | ### ###
8 | ### R code for Chapter 3 ###
9 | ### Methods for Generating Random Variables ###
10 | #######################################################
11 |
12 |
13 | # packages to install: mvtnorm
14 |
15 |
16 | ### Example 3.1 (Sampling from a finite population)
17 |
18 | #toss some coins
19 | sample(0:1, size = 10, replace = TRUE)
20 |
21 | #choose some lottery numbers
22 | sample(1:100, size = 6, replace = FALSE)
23 |
24 | #permutation of letters a-z
25 | sample(letters)
26 |
27 | #sample from a multinomial distribution
28 | x <- sample(1:3, size = 100, replace = TRUE,
29 | prob = c(.2, .3, .5))
30 | table(x)
31 |
32 | ### Example 3.2 (Inverse transform method, continuous case)
33 |
34 | n <- 1000
35 | u <- runif(n)
36 | x <- u^(1/3)
37 | hist(x, prob = TRUE, main = bquote(f(x)==3*x^2)) #density histogram of sample
38 | y <- seq(0, 1, .01)
39 | lines(y, 3*y^2) #density curve f(x)
40 |
41 |
42 | ### Example 3.4 (Two point distribution)
43 |
44 | n <- 1000
45 | p <- 0.4
46 | u <- runif(n)
47 | x <- as.integer(u > 0.6) #(u > 0.6) is a logical vector
48 |
49 | mean(x)
50 | var(x)
51 |
52 |
53 | ### Example 3.5 (Geometric distribution)
54 |
55 | n <- 1000
56 | p <- 0.25
57 | u <- runif(n)
58 | k <- ceiling(log(1-u) / log(1-p)) - 1
59 |
60 | # more efficient
61 | k <- floor(log(u) / log(1-p))
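    | # both forms invert the geometric cdf: P(X >= k) = (1-p)^k, and since
    | # U and 1-U have the same Uniform(0,1) distribution, log(u) can
    | # replace log(1-u) in the shorter second form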
62 |
63 |
64 | ### Example 3.6 (Logarithmic distribution)
65 |
66 | rlogarithmic <- function(n, theta) {
67 | #returns a random logarithmic(theta) sample size n
68 | u <- runif(n)
69 | #set the initial length of cdf vector
70 | N <- ceiling(-16 / log10(theta))
71 | k <- 1:N
72 | a <- -1/log(1-theta)
73 | fk <- exp(log(a) + k * log(theta) - log(k))
74 | Fk <- cumsum(fk)
75 | x <- integer(n)
76 | for (i in 1:n) {
77 | x[i] <- as.integer(sum(u[i] > Fk)) #F^{-1}(u)-1
78 | while (x[i] == N) {
79 | #if x==N we need to extend the cdf
80 | #very unlikely because N is large
81 | logf <- log(a) + (N+1)*log(theta) - log(N+1)
82 | fk <- c(fk, exp(logf))
83 | Fk <- c(Fk, Fk[N] + fk[N+1])
84 | N <- N + 1
85 | x[i] <- as.integer(sum(u[i] > Fk))
86 | }
87 | }
88 | x + 1
89 | }
90 |
91 | n <- 1000
92 | theta <- 0.5
93 | x <- rlogarithmic(n, theta)
94 | #compute density of logarithmic(theta) for comparison
95 | k <- sort(unique(x))
96 | p <- -1 / log(1 - theta) * theta^k / k
97 | se <- sqrt(p*(1-p)/n) #standard error
98 |
99 | round(rbind(table(x)/n, p, se),3)
100 |
101 |
102 | ### Example 3.7 (Acceptance-rejection method)
103 |
104 | n <- 1000
105 | k <- 0 #counter for accepted
106 | j <- 0 #iterations
107 | y <- numeric(n)
108 |
109 | while (k < n) {
110 | u <- runif(1)
111 | j <- j + 1
112 | x <- runif(1) #random variate from g
113 | if (x * (1-x) > u) {
114 | #we accept x
115 | k <- k + 1
116 | y[k] <- x
117 | }
118 | }
119 |
120 | j
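    | # j is random: each candidate is accepted with probability
    | # E[X(1-X)] = 1/6 for X ~ Uniform(0,1), so on average j is about 6n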
121 |
122 | #compare empirical and theoretical percentiles
123 | p <- seq(.1, .9, .1)
124 | Qhat <- quantile(y, p) #quantiles of sample
125 | Q <- qbeta(p, 2, 2) #theoretical quantiles
126 | se <- sqrt(p * (1-p) / (n * dbeta(Q, 2, 2)^2)) #see Ch. 2
127 | round(rbind(Qhat, Q, se), 3)
128 |
129 |
130 | ### Example 3.8 (Beta distribution)
131 |
132 | n <- 1000
133 | a <- 3
134 | b <- 2
135 | u <- rgamma(n, shape=a, rate=1)
136 | v <- rgamma(n, shape=b, rate=1)
137 | x <- u / (u + v)
138 |
139 | q <- qbeta(ppoints(n), a, b)
140 | qqplot(q, x, cex=0.25, xlab="Beta(3, 2)", ylab="Sample")
141 | abline(0, 1)
142 |
143 |
144 | ### Example 3.9 (Logarithmic distribution, version 2)
145 |
146 | n <- 1000
147 | theta <- 0.5
148 | u <- runif(n) #generate logarithmic sample
149 | v <- runif(n)
150 | x <- floor(1 + log(v) / log(1 - (1 - theta)^u))
151 | k <- 1:max(x) #calc. logarithmic probs.
152 | p <- -1 / log(1 - theta) * theta^k / k
153 | se <- sqrt(p*(1-p)/n)
154 | p.hat <- tabulate(x)/n
155 |
156 | print(round(rbind(p.hat, p, se), 3))
157 |
158 | # The following function is a simple replacement for
159 | # rlogarithmic in Example 3.6
160 |
161 | rlogarithmic <- function(n, theta) {
162 | stopifnot(all(theta > 0 & theta < 1))
163 | th <- rep(theta, length=n)
164 | u <- runif(n)
165 | v <- runif(n)
166 | x <- floor(1 + log(v) / log(1 - (1 - th)^u))
167 | return(x)
168 | }
169 |
170 |
171 | ### Example 3.10 (Chisquare)
172 |
173 | n <- 1000
174 | nu <- 2
175 | X <- matrix(rnorm(n*nu), n, nu)^2 #matrix of sq. normals
176 | #sum the squared normals across each row: method 1
177 | y <- rowSums(X)
178 | #method 2
179 | y <- apply(X, MARGIN=1, FUN=sum) #a vector length n
180 | mean(y)
181 | mean(y^2)
182 |
183 |
184 | ### Example 3.11 (Convolutions and mixtures)
185 |
186 | n <- 1000
187 | x1 <- rgamma(n, 2, 2)
188 | x2 <- rgamma(n, 2, 4)
189 | s <- x1 + x2 #the convolution
190 | u <- runif(n)
191 | k <- as.integer(u > 0.5) #vector of 0's and 1's
192 | x <- k * x1 + (1-k) * x2 #the mixture
193 |
194 | par(mfcol=c(1,2)) #two graphs per page
195 | hist(s, prob=TRUE, xlim=c(0,5), ylim=c(0,1))
196 | hist(x, prob=TRUE, xlim=c(0,5), ylim=c(0,1))
197 | par(mfcol=c(1,1)) #restore display
198 |
199 |
200 | ### Example 3.12 (Mixture of several gamma distributions)
201 | # density estimates are plotted
202 |
203 | n <- 5000
204 | k <- sample(1:5, size=n, replace=TRUE, prob=(1:5)/15)
205 | rate <- 1/k
206 | x <- rgamma(n, shape=3, rate=rate)
207 |
208 | #plot the density of the mixture
209 | #with the densities of the components
210 | plot(density(x), xlim=c(0,40), ylim=c(0,.3),
211 | lwd=3, xlab="x", main="")
212 | for (i in 1:5)
213 | lines(density(rgamma(n, 3, 1/i)))
214 |
215 |
216 | ### Example 3.13 (Mixture of several gamma distributions)
217 |
218 | n <- 5000
219 | p <- c(.1,.2,.2,.3,.2)
220 | lambda <- c(1,1.5,2,2.5,3)
221 | k <- sample(1:5, size=n, replace=TRUE, prob=p)
222 | rate <- lambda[k]
223 | x <- rgamma(n, shape=3, rate=rate)
224 |
225 |
226 | ### Example 3.14 (Plot density of mixture)
227 |
228 | f <- function(x, lambda, theta) {
229 | #density of the mixture at the point x
230 | sum(dgamma(x, 3, lambda) * theta)
231 | }
232 |
233 | p <- c(.1,.2,.2,.3,.2)
234 | lambda <- c(1,1.5,2,2.5,3)
235 |
236 | x <- seq(0, 8, length=200)
237 | dim(x) <- length(x) #need for apply
238 |
239 | #compute density of the mixture f(x) along x
240 | y <- apply(x, 1, f, lambda=lambda, theta=p)
241 |
242 | #plot the density of the mixture
243 | plot(x, y, type="l", ylim=c(0,.85), lwd=3, ylab="Density")
244 |
245 | for (j in 1:5) {
246 | #add the j-th gamma density to the plot
247 | y <- apply(x, 1, dgamma, shape=3, rate=lambda[j])
248 | lines(x, y)
249 | }
250 |
251 |
252 | ### Example 3.15 (Poisson-Gamma mixture)
253 |
254 | #generate a Poisson-Gamma mixture
255 | n <- 1000
256 | r <- 4
257 | beta <- 3
258 | lambda <- rgamma(n, r, beta) #lambda is random
259 |
260 | #now supply the sample of lambda's as the Poisson mean
261 | x <- rpois(n, lambda) #the mixture
262 |
263 | #compare with negative binomial
264 | mix <- tabulate(x+1) / n
265 | negbin <- round(dnbinom(0:max(x), r, beta/(1+beta)), 3)
266 | se <- sqrt(negbin * (1 - negbin) / n)
267 |
268 | round(rbind(mix, negbin, se), 3)
269 |
270 |
271 | ### Example 3.16 (Spectral decomposition method)
272 |
273 | # mean and covariance parameters
274 | mu <- c(0, 0)
275 | Sigma <- matrix(c(1, .9, .9, 1), nrow = 2, ncol = 2)
276 |
277 | rmvn.eigen <-
278 | function(n, mu, Sigma) {
279 | # generate n random vectors from MVN(mu, Sigma)
280 | # dimension is inferred from mu and Sigma
281 | d <- length(mu)
282 | ev <- eigen(Sigma, symmetric = TRUE)
283 | lambda <- ev$values
284 | V <- ev$vectors
285 | R <- V %*% diag(sqrt(lambda)) %*% t(V)
286 | Z <- matrix(rnorm(n*d), nrow = n, ncol = d)
287 | X <- Z %*% R + matrix(mu, n, d, byrow = TRUE)
288 | X
289 | }
290 |
291 | # generate the sample
292 | X <- rmvn.eigen(1000, mu, Sigma)
293 |
294 | plot(X, xlab = "x", ylab = "y", pch = 20)
295 | print(colMeans(X))
296 | print(cor(X))
297 |
298 |
299 | ### Example 3.17 (SVD method)
300 |
301 | rmvn.svd <-
302 | function(n, mu, Sigma) {
303 | # generate n random vectors from MVN(mu, Sigma)
304 | # dimension is inferred from mu and Sigma
305 | d <- length(mu)
306 | S <- svd(Sigma)
307 | R <- S$u %*% diag(sqrt(S$d)) %*% t(S$v) #sq. root Sigma
308 | Z <- matrix(rnorm(n*d), nrow=n, ncol=d)
309 | X <- Z %*% R + matrix(mu, n, d, byrow=TRUE)
310 | X
311 | }
312 |
313 |
314 | ### Example 3.18 (Choleski factorization method)
315 |
316 | rmvn.Choleski <-
317 | function(n, mu, Sigma) {
318 | # generate n random vectors from MVN(mu, Sigma)
319 | # dimension is inferred from mu and Sigma
320 | d <- length(mu)
321 | Q <- chol(Sigma) # Choleski factorization of Sigma
322 | Z <- matrix(rnorm(n*d), nrow=n, ncol=d)
323 | X <- Z %*% Q + matrix(mu, n, d, byrow=TRUE)
324 | X
325 | }
326 |
327 | #generating the samples according to the mean and covariance
328 | #structure as the four-dimensional iris virginica data
329 | y <- subset(x=iris, Species=="virginica")[, 1:4]
330 | mu <- colMeans(y)
331 | Sigma <- cov(y)
332 | mu
333 | Sigma
334 |
335 | #now generate MVN data with this mean and covariance
336 | X <- rmvn.Choleski(200, mu, Sigma)
337 | pairs(X)
338 |
339 | ### Example 3.19 (Comparing performance of MVN generators)
340 |
341 | library(MASS)
342 | library(mvtnorm)
343 | n <- 100 #sample size
344 | d <- 30 #dimension
345 | N <- 2000 #iterations
346 | mu <- numeric(d)
347 |
348 | set.seed(100)
349 | system.time(for (i in 1:N)
350 | rmvn.eigen(n, mu, cov(matrix(rnorm(n*d), n, d))))
351 | set.seed(100)
352 | system.time(for (i in 1:N)
353 | rmvn.svd(n, mu, cov(matrix(rnorm(n*d), n, d))))
354 | set.seed(100)
355 | system.time(for (i in 1:N)
356 | rmvn.Choleski(n, mu, cov(matrix(rnorm(n*d), n, d))))
357 | set.seed(100)
358 | system.time(for (i in 1:N)
359 | mvrnorm(n, mu, cov(matrix(rnorm(n*d), n, d))))
360 | set.seed(100)
361 | system.time(for (i in 1:N)
362 | rmvnorm(n, mu, cov(matrix(rnorm(n*d), n, d))))
363 | set.seed(100)
364 | system.time(for (i in 1:N)
365 | cov(matrix(rnorm(n*d), n, d)))
366 |
367 | detach(package:MASS)
368 | detach(package:mvtnorm)
369 |
370 | ### Example 3.20 (Multivariate normal mixture)
371 |
372 | library(MASS) #for mvrnorm
373 | #inefficient version loc.mix.0 with loops
374 |
375 | loc.mix.0 <- function(n, p, mu1, mu2, Sigma) {
376 | #generate sample from BVN location mixture
377 | X <- matrix(0, n, 2)
378 |
379 | for (i in 1:n) {
380 | k <- rbinom(1, size = 1, prob = p)
381 | if (k)
382 | X[i,] <- mvrnorm(1, mu = mu1, Sigma) else
383 | X[i,] <- mvrnorm(1, mu = mu2, Sigma)
384 | }
385 | return(X)
386 | }
387 |
388 | #more efficient version
389 | loc.mix <- function(n, p, mu1, mu2, Sigma) {
390 | #generate sample from BVN location mixture
391 | n1 <- rbinom(1, size = n, prob = p)
392 | n2 <- n - n1
393 | x1 <- mvrnorm(n1, mu = mu1, Sigma)
394 | x2 <- mvrnorm(n2, mu = mu2, Sigma)
395 | X <- rbind(x1, x2) #combine the samples
396 | return(X[sample(1:n), ]) #mix them
397 | }
398 |
399 | x <- loc.mix(1000, .5, rep(0, 4), 2:5, Sigma = diag(4))
400 | r <- range(x) * 1.2
401 | par(mfrow = c(2, 2))
402 | for (i in 1:4)
403 | hist(x[ , i], xlim = r, ylim = c(0, .3), freq = FALSE,
404 | main = "", breaks = seq(-5, 10, .5))
405 |
406 | detach(package:MASS)
407 | par(mfrow = c(1, 1))
408 |
409 |
410 | ### Example 3.21 (Generating variates on a sphere)
411 |
412 | runif.sphere <- function(n, d) {
413 | # return a random sample uniformly distributed
414 |   # on the unit sphere in R^d
415 | M <- matrix(rnorm(n*d), nrow = n, ncol = d)
416 | L <- apply(M, MARGIN = 1,
417 | FUN = function(x){sqrt(sum(x*x))})
418 | D <- diag(1 / L)
419 | U <- D %*% M
420 | U
421 | }
422 |
423 | #generate a sample in d=2 and plot
424 | X <- runif.sphere(200, 2)
425 | par(pty = "s")
426 | plot(X, xlab = bquote(x[1]), ylab = bquote(x[2]))
427 | par(pty = "m")
428 |
--------------------------------------------------------------------------------
/examples-R/SCR2e-chVis.R:
--------------------------------------------------------------------------------
1 | #######################################################
2 | ### Statistical Computing with R 2e ###
3 | ### Maria L. Rizzo ###
4 | ### Chapman & Hall/CRC The R Series ###
5 | ### ISBN 9781466553323 - CAT# K15269 ###
6 | ### January 2019 ###
7 | ### ###
8 | ### R code for Chapter 5 ###
9 | ### Visualization of Multivariate Data ###
10 | #######################################################
11 |
12 |
13 | # packages to install:
14 | # bootstrap, corrplot, DAAG, FactoMineR, ggplot2, hexbin
15 |
16 |
17 | ### Example 5.1 (Scatterplot matrix)
18 |
19 | data(iris)
20 | #virginica data in first 4 columns of the last 50 obs.
21 |
22 | # not shown in text
23 | pairs(iris[101:150, 1:4])
24 |
25 | panel.d <- function(x, ...) {
26 | usr <- par("usr")
27 | on.exit(par(usr=usr))
28 | par(usr = c(usr[1:2], 0, .5))
29 | lines(density(x))
30 | }
31 |
32 | # Fig. 5.1
33 | x <- scale(iris[101:150, 1:4])
34 | r <- range(x)
35 | pairs(x, diag.panel = panel.d, xlim = r, ylim = r)
36 |
37 | library(lattice)
38 | splom(iris[101:150, 1:4]) #plot 1
39 |
40 | #for all 3 at once, in color, plot 2
41 | splom(iris[,1:4], groups = iris$Species)
42 |
43 | # Fig. 5.2
44 | #for all 3 at once, black and white, plot 3
45 | splom(~iris[1:4], groups = Species, data = iris,
46 | col = 1, pch = c(1, 2, 3), cex = c(.5,.5,.5))
47 |
48 | ### Example 5.2 (Correlation plots: Decathlon data)
49 |
50 | library(FactoMineR) #decathlon data
51 | library(corrplot)
52 | data("decathlon")
53 | str(decathlon)
54 |
55 | corrMat <- cor(decathlon[, 1:10])
56 | corrplot(corrMat, type="upper", tl.col="black", tl.srt=45)
57 | corrplot(corrMat, type = "upper", method = "square",
58 | addCoef.col = "black", diag=FALSE)
59 |
60 | ### Example 5.3 (Plot bivariate normal density)
61 |
62 | #the standard BVN density
63 | f <- function(x,y) {
64 |   (1/(2*pi)) * exp(-.5 * (x^2 + y^2))
65 | }
66 |
67 | y <- x <- seq(-3, 3, length= 50)
68 | z <- outer(x, y, f) #compute density for all (x,y)
69 |
70 | persp(x, y, z) #the default plot
71 |
72 | persp(x, y, z, theta = 45, phi = 30, expand = 0.6,
73 | ltheta = 120, shade = 0.75, ticktype = "detailed",
74 | xlab = "X", ylab = "Y", zlab = "f(x, y)")
75 |
76 |
77 | ### Example 5.4 (Add elements to perspective plot)
78 |
79 | #store viewing transformation in M
80 | persp(x, y, z, theta = 45, phi = 30,
81 | expand = .4, box = FALSE) -> M
82 |
83 | #add some points along a circle
84 | a <- seq(-pi, pi, pi/16)
85 | newpts <- cbind(cos(a), sin(a)) * 2
86 | newpts <- cbind(newpts, 0, 1) #z=0, t=1
87 | N <- newpts %*% M
88 | points(N[,1]/N[,4], N[,2]/N[,4], col=2)
89 |
90 | #add lines
91 | x2 <- seq(-3, 3, .1)
92 | y2 <- -x2^2 / 3
93 | z2 <- dnorm(x2) * dnorm(y2)
94 | N <- cbind(x2, y2, z2, 1) %*% M
95 | lines(N[,1]/N[,4], N[,2]/N[,4], col=4)
96 |
97 | #add text
98 | x3 <- c(0, 3.1)
99 | y3 <- c(0, -3.1)
100 | z3 <- dnorm(x3) * dnorm(y3) * 1.1
101 | N <- cbind(x3, y3, z3, 1) %*% M
102 | text(N[1,1]/N[1,4], N[1,2]/N[1,4], "f(x,y)")
103 | text(N[2,1]/N[2,4], N[2,2]/N[2,4], bquote(y==-x^2/3))
104 |
105 |
106 | ### Example 5.5 (Surface plot using wireframe(lattice))
107 |
108 | library(lattice)
109 | x <- y <- seq(-3, 3, length= 50)
110 |
111 | xy <- expand.grid(x, y)
112 | z <- (1/(2*pi)) * exp(-.5 * (xy[,1]^2 + xy[,2]^2))
113 | wireframe(z ~ xy[,1] * xy[,2])
114 |
115 |
116 | ### Example 5.6 (3D scatterplot)
117 |
118 | library(lattice)
119 | attach(iris)
120 | #basic 3 color plot with arrows along axes
121 | print(cloud(Petal.Length ~ Sepal.Length * Sepal.Width,
122 | data=iris, groups=Species))
123 |
124 | print(cloud(Sepal.Length ~ Petal.Length * Petal.Width,
125 | data = iris, groups = Species, main = "1", pch=1:3,
126 | scales = list(draw = FALSE), zlab = "SL",
127 | screen = list(z = 30, x = -75, y = 0)),
128 | split = c(1, 1, 2, 2), more = TRUE)
129 |
130 | print(cloud(Sepal.Width ~ Petal.Length * Petal.Width,
131 | data = iris, groups = Species, main = "2", pch=1:3,
132 | scales = list(draw = FALSE), zlab = "SW",
133 | screen = list(z = 30, x = -75, y = 0)),
134 | split = c(2, 1, 2, 2), more = TRUE)
135 |
136 | print(cloud(Petal.Length ~ Sepal.Length * Sepal.Width,
137 | data = iris, groups = Species, main = "3", pch=1:3,
138 | scales = list(draw = FALSE), zlab = "PL",
139 | screen = list(z = 30, x = -55, y = 0)),
140 | split = c(1, 2, 2, 2), more = TRUE)
141 |
142 | print(cloud(Petal.Width ~ Sepal.Length * Sepal.Width,
143 | data = iris, groups = Species, main = "4", pch=1:3,
144 | scales = list(draw = FALSE), zlab = "PW",
145 | screen = list(z = 30, x = -55, y = 0)),
146 | split = c(2, 2, 2, 2))
147 | detach(iris)
148 |
149 |
150 | ### Example 5.7 (Contour plot)
151 |
152 | #contour plot with labels
153 | contour(volcano, asp = 1, labcex = 1)
154 |
155 | #another version from lattice package
156 | library(lattice)
157 | contourplot(volcano) #similar to above
158 |
159 |
160 | ### Example 5.8 (Filled contour plots)
161 |
162 | image(volcano, col = terrain.colors(100), axes = FALSE)
163 | contour(volcano, levels = seq(100,200,by = 10), add = TRUE)
164 |
165 | filled.contour(volcano, color = terrain.colors, asp = 1)
166 | levelplot(volcano, scales = list(draw = FALSE),
167 | xlab = "", ylab = "")
168 |
169 |
170 | ### Example 5.9 (2D histogram)
171 |
172 | library(hexbin)
173 | x <- matrix(rnorm(4000), 2000, 2)
174 | plot(hexbin(x[,1], x[,2]))
175 |
176 | # ggplot version
177 | library(ggplot2)
178 | x <- data.frame(x)
179 | ggplot(x, aes(x[,1], x[,2])) + geom_hex()
180 |
181 | ### Example 5.10 (Andrews curves)
182 |
183 | library(DAAG)
184 | attach(leafshape17)
185 |
186 | f <- function(a, v) {
187 | #Andrews curve f(a) for a data vector v in R^3
188 | v[1]/sqrt(2) + v[2]*sin(a) + v[3]*cos(a)
189 | }
190 |
191 |
192 | #scale data to range [-1, 1]
193 | x <- cbind(bladelen, petiole, bladewid)
194 | n <- nrow(x)
195 | mins <- apply(x, 2, min) #column minimums
196 | maxs <- apply(x, 2, max) #column maximums
197 | r <- maxs - mins #column ranges
198 | y <- sweep(x, 2, mins) #subtract column mins
199 | y <- sweep(y, 2, r, "/") #divide by range
200 | x <- 2 * y - 1 #now has range [-1, 1]
201 |
202 | #set up plot window, but plot nothing yet
203 | plot(0, 0, xlim = c(-pi, pi), ylim = c(-3,3),
204 | xlab = "t", ylab = "Andrews Curves",
205 | main = "", type = "n")
206 |
207 | #now add the Andrews curves for each observation
208 | #line type corresponds to leaf architecture
209 | #0=orthotropic, 1=plagiotropic
210 | a <- seq(-pi, pi, len=101)
211 | dim(a) <- length(a)
212 | for (i in 1:n) {
213 | g <- arch[i] + 1
214 | y <- apply(a, MARGIN = 1, FUN = f, v = x[i,])
215 | lines(a, y, lty = g)
216 | }
217 | legend(3, c("Orthotropic", "Plagiotropic"), lty = 1:2)
218 | detach(leafshape17)
219 |
220 |
221 | ### Example 5.11 (Parallel coordinates)
222 |
223 | library(MASS)
224 | library(lattice)
225 | #trellis.device(color = FALSE) #black and white display
226 | x <- crabs[seq(5, 200, 5), ] #get every fifth obs.
227 | parallelplot(~x[4:8] | sp*sex, x)
228 |
229 | #trellis.device(color = FALSE) #black and white display
230 | x <- crabs[seq(5, 200, 5), ] #get every fifth obs.
231 | a <- x$CW * x$CL #area of carapace
232 | x[4:8] <- x[4:8] / sqrt(a) #adjust for size
233 | parallelplot(~x[4:8] | sp*sex, x)
234 |
235 | ### Example 5.12 (Segment plot)
236 |
237 | #segment plot
238 | x <- MASS::crabs[seq(5, 200, 5), ] #get every fifth obs.
239 | x <- subset(x, sex == "M") #keep just the males
240 | a <- x$CW * x$CL #area of carapace
241 | x[4:8] <- x[4:8] / sqrt(a) #adjust for size
242 |
243 | #use default color palette or other colors
244 | #palette(gray(seq(.4, .95, len = 5))) #use gray scale
245 | palette(rainbow(6)) #or use color
246 | stars(x[4:8], draw.segments = TRUE,
247 | labels =levels(x$sp), nrow = 4,
248 | ylim = c(-2,10), key.loc = c(3,-1))
249 |
250 | #after viewing, restore the default colors
251 | palette("default")
252 |
253 |
254 | ### Example 5.13 (PCA for open and closed book exams)
255 |
256 | library(bootstrap)
257 | str(scor)
258 | pairs(scor)
259 | cor(scor)
260 |
261 | n <- nrow(scor)
262 | x <- scale(scor) #center and scale
263 | s <- cov(x)
264 | e <- eigen(s)
265 | lam <- e$values #vector of eigenvalues
266 | P <- e$vectors #matrix of eigenvectors
267 |
268 | plot(lam, type = "b", xlab = "eigenvalues", main = "")
269 | barplot(lam, xlab = "eigenvalues")
270 |
271 | tab <- rbind(lam / sum(lam), cumsum(lam) / sum(lam))
272 | tab
273 |
274 | z <- x %*% P
275 | dim(z)
276 | head(z)
277 |
278 | pc <- prcomp(scor, center = TRUE, scale = TRUE)
279 | summary(pc)
280 |
281 | df <- scor[1:5, ]
282 | predict(pc, newdata = df) #same as z above
283 |
284 | head(x %*% pc$rotation, 5)
285 | head(pc$rotation)
286 | head(P)
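    | # note: eigenvectors are determined only up to sign, so columns of P
    | # and pc$rotation may differ by a factor of -1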
287 |
288 |
289 | ### Example 5.14 (PC Biplot)
290 |
291 | ## plot scor data in the (PC1, PC2) coordinate system
292 | biplot(pc, pc.biplot = TRUE)
293 | round(cor(x, z), 3)
294 |
295 | ### Example 5.15 (PCA: Decathlon data)
296 |
297 | library(FactoMineR)
298 | data(decathlon)
299 | pc <- princomp(decathlon[,1:10], cor = TRUE, scores = TRUE)
300 | plot(pc) # screeplot
301 | biplot(pc)
302 | summary(pc)
303 |
304 |
--------------------------------------------------------------------------------
/examples-R/SCR2e-examples-functions.R:
--------------------------------------------------------------------------------
1 | rlogarithmic <- function(n, theta) {
2 | # generate random sample from Logarithmic(theta)
3 | stopifnot(all(theta > 0 & theta < 1))
4 | th <- rep(theta, length=n)
5 | u <- runif(n)
6 | v <- runif(n)
7 | x <- floor(1 + log(v) / log(1 - (1 - th)^u))
8 | return(x)
9 | }
10 |
11 | # generate MVN by spectral decomposition method
12 |
13 | rmvn.eigen <-
14 | function(n, mu, Sigma) {
15 | # generate n random vectors from MVN(mu, Sigma)
16 | # dimension is inferred from mu and Sigma
17 | d <- length(mu)
18 | ev <- eigen(Sigma, symmetric = TRUE)
19 | lambda <- ev$values
20 | V <- ev$vectors
21 | R <- V %*% diag(sqrt(lambda)) %*% t(V)
22 | Z <- matrix(rnorm(n*d), nrow = n, ncol = d)
23 | X <- Z %*% R + matrix(mu, n, d, byrow = TRUE)
24 | X
25 | }
26 |
27 |
28 | # generate MVN by singular value decomposition method
29 |
30 | rmvn.svd <-
31 | function(n, mu, Sigma) {
32 | # generate n random vectors from MVN(mu, Sigma)
33 | # dimension is inferred from mu and Sigma
34 | d <- length(mu)
35 | S <- svd(Sigma)
36 | R <- S$u %*% diag(sqrt(S$d)) %*% t(S$v) #sq. root Sigma
37 | Z <- matrix(rnorm(n*d), nrow=n, ncol=d)
38 | X <- Z %*% R + matrix(mu, n, d, byrow=TRUE)
39 | X
40 | }
41 |
42 |
43 | # generate MVN by Choleski factorization method
44 |
45 | rmvn.Choleski <-
46 | function(n, mu, Sigma) {
47 | # generate n random vectors from MVN(mu, Sigma)
48 | # dimension is inferred from mu and Sigma
49 | d <- length(mu)
50 | Q <- chol(Sigma) # Choleski factorization of Sigma
51 | Z <- matrix(rnorm(n*d), nrow=n, ncol=d)
52 | X <- Z %*% Q + matrix(mu, n, d, byrow=TRUE)
53 | X
54 | }
55 |
56 | # Bootstrap t confidence interval
57 |
58 | boot.t.ci <-
59 | function(x, B = 500, R = 100, level = .95, statistic){
60 | #compute the bootstrap t CI
61 | x <- as.matrix(x); n <- nrow(x)
62 | stat <- numeric(B); se <- numeric(B)
63 |
64 | boot.se <- function(x, R, f) {
65 | #local function to compute the bootstrap
66 | #estimate of standard error for statistic f(x)
67 | x <- as.matrix(x); m <- nrow(x)
68 | th <- replicate(R, expr = {
69 | i <- sample(1:m, size = m, replace = TRUE)
70 | f(x[i, ])
71 | })
72 | return(sd(th))
73 | }
74 |
75 | for (b in 1:B) {
76 | j <- sample(1:n, size = n, replace = TRUE)
77 | y <- x[j, ]
78 | stat[b] <- statistic(y)
79 | se[b] <- boot.se(y, R = R, f = statistic)
80 | }
81 | stat0 <- statistic(x)
82 | t.stats <- (stat - stat0) / se
83 | se0 <- sd(stat)
84 | alpha <- 1 - level
85 | Qt <- quantile(t.stats, c(alpha/2, 1-alpha/2), type = 1)
86 | names(Qt) <- rev(names(Qt))
87 |     rev(stat0 - Qt * se0)   #the bootstrap t interval, returned as the value
88 | }
89 |
90 |
91 | # BCa bootstrap confidence interval
92 |
93 | boot.BCa <-
94 | function(x, th0, th, stat, conf = .95) {
95 | # bootstrap with BCa bootstrap confidence interval
96 | # th0 is the observed statistic
97 | # th is the vector of bootstrap replicates
98 | # stat is the function to compute the statistic
99 |
100 | x <- as.matrix(x)
101 | n <- nrow(x) #observations in rows
102 | N <- 1:n
103 | alpha <- (1 + c(-conf, conf))/2
104 | zalpha <- qnorm(alpha)
105 |
106 | # the bias correction factor
107 | z0 <- qnorm(sum(th < th0) / length(th))
108 |
109 | # the acceleration factor (jackknife est.)
110 | th.jack <- numeric(n)
111 | for (i in 1:n) {
112 | J <- N[1:(n-1)]
113 | th.jack[i] <- stat(x[-i, ], J)
114 | }
115 | L <- mean(th.jack) - th.jack
116 | a <- sum(L^3)/(6 * sum(L^2)^1.5)
117 |
118 | # BCa conf. limits
119 | adj.alpha <- pnorm(z0 + (z0+zalpha)/(1-a*(z0+zalpha)))
120 | limits <- quantile(th, adj.alpha, type=6)
121 | return(list("est"=th0, "BCa"=limits))
122 | }
123 |
124 | # Gelman-Rubin statistic for MCMC convergence
125 |
126 | Gelman.Rubin <- function(psi) {
127 | # psi[i,j] is the statistic psi(X[i,1:j])
128 | # for chain in i-th row of X
129 | psi <- as.matrix(psi)
130 | n <- ncol(psi)
131 | k <- nrow(psi)
132 |
133 | psi.means <- rowMeans(psi) #row means
134 | B <- n * var(psi.means) #between variance est.
135 | psi.w <- apply(psi, 1, "var") #within variances
136 | W <- mean(psi.w) #within est.
137 | v.hat <- W*(n-1)/n + (B/n) #upper variance est.
138 | r.hat <- v.hat / W #G-R statistic
139 | return(r.hat)
140 | }
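    | # rule of thumb: values of r.hat near 1 (commonly below about 1.1 or
    | # 1.2) suggest the chains have approximately converged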
141 |
142 | # binning bivariate data
143 |
144 | bin2d <-
145 | function(x, breaks1 = "Sturges", breaks2 = "Sturges"){
146 | # Data matrix x is n by 2
147 | # breaks1, breaks2: any valid breaks for hist function
148 | # using same defaults as hist
149 | histg1 <- hist(x[,1], breaks = breaks1, plot = FALSE)
150 | histg2 <- hist(x[,2], breaks = breaks2, plot = FALSE)
151 | brx <- histg1$breaks
152 | bry <- histg2$breaks
153 |
154 | # bin frequencies
155 | freq <- table(cut(x[,1], brx), cut(x[,2], bry))
156 |
157 | return(list(call = match.call(), freq = freq,
158 | breaks1 = brx, breaks2 = bry,
159 | mids1 = histg1$mids, mids2 = histg2$mids))
160 | }
161 |
--------------------------------------------------------------------------------
/examples-R/readme.md:
--------------------------------------------------------------------------------
1 | Code for the examples in "Statistical Computing with R, 2nd ed.".
2 |
--------------------------------------------------------------------------------
/examples-Rmd/FOREARM.DAT:
--------------------------------------------------------------------------------
1 | 17.3 18.4 20.9 16.8 18.7 20.5 17.9 20.4 18.3 20.5
2 | 19.0 17.5 18.1 17.1 18.8 20.0 19.1 19.1 17.9 18.3
3 | 18.2 18.9 19.4 18.9 19.4 20.8 17.3 18.5 18.3 19.4
4 | 19.0 19.0 20.5 19.7 18.5 17.7 19.4 18.3 19.6 21.4
5 | 19.0 20.5 20.4 19.7 18.6 19.9 18.3 19.8 19.6 19.0
6 | 20.4 17.3 16.1 19.2 19.6 18.8 19.3 19.1 21.0 18.6
7 | 18.3 18.3 18.7 20.6 18.5 16.4 17.2 17.5 18.0 19.5
8 | 19.9 18.4 18.8 20.1 20.0 18.5 17.5 18.5 17.9 17.4
9 | 18.7 18.6 17.3 18.8 17.8 19.0 19.6 19.3 18.1 18.5
10 | 20.9 19.8 18.1 17.1 19.8 20.6 17.6 19.1 19.5 18.4
11 | 17.7 20.2 19.9 18.6 16.6 19.2 20.0 17.4 17.1 18.3
12 | 19.1 18.5 19.6 18.0 19.4 17.1 19.9 16.3 18.9 20.7
13 | 19.7 18.5 18.4 18.7 19.3 16.3 16.9 18.2 18.5 19.3
14 | 18.1 18.0 19.5 20.3 20.1 17.2 19.5 18.8 19.2 17.7
15 |
--------------------------------------------------------------------------------
/examples-Rmd/Lahman.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Benchmarks on Lahman Batting data"
3 | author: "Maria Rizzo"
4 | date: "September 1, 2018"
5 | output: html_document
6 | ---
7 |
8 | ```{r}
9 | library(Lahman)
10 | library(dplyr)
11 | library(ggplot2)
12 | library(microbenchmark)
13 | ```
14 |
15 | ## Define two functions to compare the two methods
16 |
17 | ```{r}
18 | use_dplyr <- function(yr) {
19 | Batting %>% filter(yearID == yr) -> b
20 | b %>% group_by(playerID) %>%
21 | summarize(AB = sum(AB), H = sum(H)) -> S
22 | S %>% mutate(AVG = round(H / AB, 3)) -> S
23 | S %>% filter(AB >= 400) -> S400
24 | S400 %>% arrange(desc(AVG)) -> S400
25 | slice(S400, 1:10) -> top
26 | top
27 | }
28 |
29 | no_dplyr <- function(yr) {
30 | S <- subset(Batting, Batting$yearID == yr,
31 | select = c("playerID", "AB", "H"))
32 | AB <- as.vector(by(S$AB, S$playerID, FUN = sum))
33 | H <- as.vector(by(S$H, S$playerID, FUN = sum))
34 | S <- data.frame(playerID = unique(S$playerID),
35 | AB = AB, H = H, AVG = round(H / AB, 3),
36 | stringsAsFactors = FALSE)
37 | S400 <- S[S$AB >= 400, ]
38 | o <- order(S400$AVG, decreasing = TRUE)
39 | S400 <- S400[o, ]
40 | top <- S400[1:10, ]
41 | top
42 | }
43 | ```
44 |
45 | Check that these functions obtain the same results.
46 |
47 | ```{r}
48 | all.equal(use_dplyr(1999), no_dplyr(1999))
49 | ```
50 |
51 | ## Benchmark comparison for dplyr method vs basic method
52 |
53 | ```{r}
54 | mb <- microbenchmark(
55 | dplyr = use_dplyr(1999),
56 | base = no_dplyr(1999)
57 | )
58 |
59 | mb
60 | autoplot(mb)
61 | ```
62 |
63 | Another version of the summary table:
64 |
65 | ```{r}
66 | knitr::kable(summary(mb))
67 | ```
--------------------------------------------------------------------------------
/examples-Rmd/SCR2e-Intro.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Code for SCR 2e Examples"
3 | subtitle: "M. L. Rizzo (2019). *Statistical Computing with R, 2nd ed.*, Chapman & Hall/CRC. https://github.com/mariarizzo/SCR2e"
4 | output:
5 | html_document:
6 | code_folding: show
7 | theme: lumen
8 | toc: yes
9 | toc_float: yes
10 | df_print: paged
11 | ---
12 |
13 | # Chapter 1: Introduction
14 |
15 | ## Example 1.1
16 |
17 | ```{r }
18 | sumdice <- function(n) {
19 | k <- sample(1:6, size=n, replace=TRUE)
20 | return(sum(k))
21 | }
22 |
23 | sumdice(2)
24 |
25 | #to store the result rather than print it
26 | a <- sumdice(100)
27 |
28 | #we expect the mean for 100 dice to be close to 3.5
29 | a / 100
30 |
31 | sumdice <- function(n)
32 | sum(sample(1:6, size=n, replace=TRUE))
33 |
34 | sumdice <- function(n, sides = 6) {
35 | if (sides < 1) return (0)
36 | k <- sample(1:sides, size=n, replace=TRUE)
37 | return(sum(k))
38 | }
39 |
40 | sumdice(5) #default 6 sides
41 | sumdice(n=5, sides=4) #4 sides
42 | ```
43 |
44 | ## Example 1.2 (iris data)
45 |
46 | ```{r }
47 | names(iris)
48 | table(iris$Species)
49 | w <- iris[[2]] #Sepal.Width
50 | mean(w)
51 |
52 | attach(iris)
53 | summary(Petal.Length[51:100]) #versicolor petal length
54 |
55 | with(iris, summary(Petal.Length[51:100]))
56 |
57 | out <- with(iris, summary(Petal.Length[51:100]))
58 |
59 | by(iris[,1:4], Species, colMeans)
60 | detach(iris)
61 | ```
62 |
63 | ## Example 1.3 (Arrays)
64 |
65 | ```{r }
66 | x <- 1:24 # vector
67 | dim(x) <- length(x) # 1 dimensional array
68 | matrix(1:24, nrow=4, ncol=6) # 4 by 6 matrix
69 | x <- array(1:24, c(3, 4, 2)) # 3 by 4 by 2 array
70 | ```
71 |
72 | ## Example 1.4 (Matrices)
73 |
74 | ```{r }
75 | A <- matrix(0, nrow=2, ncol=2)
76 | A <- matrix(c(0, 0, 0, 0), nrow=2, ncol=2)
77 | A <- matrix(0, 2, 2)
78 |
79 | A <- matrix(1:8, nrow=2, ncol=4)
80 | ```
81 |
82 | ## Example 1.5 (Iris data cont.)
83 |
84 | ```{r }
85 | x <- as.matrix(iris[,1:4]) #all rows of columns 1 to 4
86 |
87 | mean(x[,2]) #mean of sepal width, all species
88 | mean(x[51:100,3]) #mean of petal length, versicolor
89 |
90 | y <- array(x, dim=c(50, 3, 4))
91 | mean(y[,,2]) #mean of sepal width, all species
92 | mean(y[,2,3]) #mean of petal length, versicolor
93 |
94 | y <- array(c(x[1:50,], x[51:100,], x[101:150,]),
95 | dim=c(50, 4, 3))
96 | mean(y[,2,]) #mean of sepal width, all species
97 | mean(y[,3,2]) #mean of petal length, versicolor
98 | ```
99 |
100 | ## Example 1.6 (Run length encoding)
101 |
102 | ```{r }
103 | n <- 1000
104 | x <- rbinom(n, size = 1, prob = .5)
105 | table(x)
106 | head(x, 30)
107 |
108 | r <- rle(x)
109 | str(r)
110 |
111 | head(r$lengths)
112 | head(r[[1]])
113 |
114 | max(r$lengths)
115 | log2(length(x))
116 | ```
117 |
118 | ## Example 1.7 (Named list)
119 |
120 | ```{r }
121 | w <- wilcox.test(rnorm(10), rnorm(10, 2))
122 | w #print the summary
123 |
124 | w$statistic #stored in object w
125 | w$p.value
126 | unlist(w)
127 | unclass(w)
128 | ```
129 |
130 | ## Example 1.8 (A list of names)
131 |
132 | ```{r }
133 | a <- matrix(runif(8), 4, 2) #a 4x2 matrix
134 | dimnames(a) <- list(NULL, c("x", "y"))
135 |
136 | # if we want row names
137 | dimnames(a) <- list(letters[1:4], c("x", "y"))
138 | a
139 |
140 | # another way to assign row names
141 | row.names(a) <- list("NE", "NW", "SW", "SE")
142 | a
143 | ```
144 |
145 | ## Example 1.9 (Parallel boxplots)
146 |
147 | ```{r }
148 | boxplot(iris$Sepal.Length ~ iris$Species)
149 |
150 | boxplot(iris$Sepal.Length ~ iris$Species,
151 | ylab = "Sepal Length", boxwex = .4)
152 | ```
153 |
154 | ## Example 1.10 (Plotting characters and colors)
155 |
156 | ```{r }
157 | plot(0:25, rep(1, 26), pch = 0:25)
158 | text(0:25, 0.9, 0:25)
159 | ```
160 |
161 | ## Example 1.11 (Barplot for run lengths)
162 |
163 | ```{r }
164 | barplot(table(r$lengths)) #R graphics version
165 |
166 | ## ggplot version
167 | library(ggplot2)
168 | df <- data.frame(lengths = factor(r$lengths))
169 | ggplot(df, aes(lengths)) + geom_bar()
170 | ```
171 |
172 | ## Example 1.12 (Scatterplots)
173 |
174 | ```{r }
175 | ggplot(data = iris, aes(x = Sepal.Length, y = Sepal.Width)) +
176 | geom_point()
177 |
178 | ggplot(iris, aes(Sepal.Length, Sepal.Width,
179 | color = Species, shape = Species)) + geom_point(size = 2)
180 | ```
181 |
182 | ## Example 1.13 (ggplot: parallel boxplots and violin plots)
183 |
184 | ```{r }
185 | ggplot(iris, aes(Species, Sepal.Length)) + geom_boxplot()
186 | ggplot(iris, aes(Species, Sepal.Length)) + geom_violin()
187 |
188 | ggplot(iris, aes(Species, Sepal.Length)) +
189 | geom_boxplot() + coord_flip()
190 | ggplot(iris, aes(Species, Sepal.Length)) +
191 | geom_violin() + coord_flip()
192 | ```
193 |
194 | ## Example 1.14 (MPG by engine displacement)
195 |
196 | ```{r }
197 | ggplot(mpg, aes(displ, hwy)) +
198 | geom_point() +
199 | facet_wrap(~ class)
200 | ```
201 |
202 | ## Example 1.15 (Import data from a local text file)
203 |
204 | ```{r }
205 | # copy "FOREARM.DAT" into working directory
206 | forearm <- scan(file = "FOREARM.DAT") #a vector
207 |
208 | # use pathname if file is not in working directory
209 | # forearm <- scan(file = "./DATASETS/FOREARM.DAT") #a vector
210 |
211 | head(forearm)
212 | ```
213 |
214 | ## Example 1.16 (Importing data from a web page)
215 |
216 | ```{r }
217 | fileloc <- "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
218 |
219 | df <- read.table(file = fileloc, na.strings = "?", as.is = TRUE)
220 | str(df)
221 | names(df) <- c("mpg", "cyl", "displ", "hp", "wt", "accel",
222 | "year", "origin", "name")
223 | summary(df)
224 | ```
225 |
226 | ## Example 1.17 (Importing/exporting .csv files)
227 |
228 | ```{r }
229 | #create a data frame
230 | dates <- c("3/27/1995", "4/3/1995",
231 | "4/10/1995", "4/18/1995")
232 | prices <- c(11.1, 7.9, 1.9, 7.3)
233 | d <- data.frame(dates=dates, prices=prices)
234 |
235 | #create the .csv file
236 | filename <- "temp.csv"
237 | write.table(d, file = filename, sep = ",",
238 | row.names = FALSE)
239 |
240 | #read the .csv file
241 | read.table(file = filename, sep = ",", header = TRUE)
242 | read.csv(file = filename) #same thing
243 | ```
244 |
245 |
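246 | The example above leaves "temp.csv" in the working directory; a small cleanup step (added, not in the original) removes it:
247 | 
248 | ```{r}
249 | # delete the temporary .csv file created above
250 | unlink(filename)
251 | ```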
--------------------------------------------------------------------------------
/examples-Rmd/SCR2e-Programming.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Code for SCR 2e Examples"
3 | subtitle: "M. L. Rizzo (2019). *Statistical Computing with R, 2nd ed.*, Chapman & Hall/CRC. https://github.com/mariarizzo/SCR2e"
4 | output:
5 | html_document:
6 | code_folding: show
7 | theme: lumen
8 | toc: yes
9 | toc_float: yes
10 | ---
11 |
12 | # Chapter 15: Programming Topics
13 |
14 | Packages to install:
15 | energy, ggplot2, profvis, pryr, microbenchmark, mvtnorm, rbenchmark
16 | 
17 | For the last section: dplyr, Lahman, Rcpp
18 |
19 | ## Example 15.1 (Benchmarking methods to generate a sequence)
20 |
21 | ```{r }
22 | s1 <- 1:10
23 | s2 <- seq(1, 10, 1)
24 | s3 <- seq.int(1, 10, 1)
25 | df <- data.frame(s1=s1, s2=s2, s3=s3)
26 | str(df)
27 |
28 | library(microbenchmark)
29 | library(ggplot2)
30 |
31 | n <- 1000
32 | mb <- microbenchmark(
33 | seq(1, n, 1),
34 | seq.int(1, n, 1),
35 | 1:n
36 | )
37 |
38 | print(mb)
39 | autoplot(mb) # display a violin plot
40 | ```
41 |
42 | ## Example 15.2 (Benchmarking methods to initialize a vector)
43 |
44 | ```{r }
45 | n <- 100
46 | mb2 <- microbenchmark(
47 | numeric = numeric(n) + 1,
48 | rep = rep(1, n),
49 | seq = seq(from=1, to=1, length=n),
50 | ones = matrix(1, nrow=n, ncol=1),
51 | as.ones = as.matrix(rep(1, n))
52 | )
53 |
54 | print(mb2)
55 | ```
56 |
57 | ## Example 15.3 (Timings of two multivariate normal generators)
58 |
59 | ```{r }
60 | library(rbenchmark)
61 | library(MASS)
62 | library(mvtnorm)
63 | n <- 100 #sample size
64 | d <- 30 #dimension
65 | N <- 2000 #iterations
66 | mu <- numeric(d)
67 |
68 | benchmark(
69 | columns = c("test", "replications", "elapsed", "relative"),
70 | replications = 2000,
71 | cov = {S <- cov(matrix(rnorm(n*d), n, d))},
72 | mvrnorm = mvrnorm(n, mu, S),
73 | rmvnorm = rmvnorm(n, mu, S)
74 | )
75 | ```
76 |
77 | ## Example 15.4 (Profiling with Rprof)
78 |
79 | ```{r }
80 | x <- rnorm(1000)
81 | y <- rnorm(1000)
82 |
83 | Rprof("pr.out", line.profiling = TRUE)
84 | energy::dcor(x, y)
85 | Rprof(NULL)
86 | summaryRprof("pr.out")
87 | ```
88 |
89 | ## Example 15.5 (profvis interactive visualization)
90 |
91 | ```{r }
92 | library(profvis)
93 | profvis(energy::dcor(x, y))
94 | ```
95 |
96 | ## Example 15.6 (Object size)
97 |
98 | ```{r }
99 | x <- matrix(rnorm(5000), 1000, 5) #1000 obs in R^5
100 | DF <- as.data.frame(x)
101 | object.size(x)
102 | object.size(DF)
103 | pryr::object_size(x)
104 | pryr::object_size(DF)
105 | pryr::compare_size(x)
106 |
107 | listTwo <- list(x, x)
108 | pryr::compare_size(listTwo)
109 | ```
110 |
111 | ## Example 15.7 (Comparing objects and attributes)
112 |
113 | ```{r }
114 | str(x)
115 | str(DF)
116 | all.equal(x, DF)
117 | names(attributes(x))
118 | names(attributes(DF))
119 | all.equal(x, DF, check.attributes = FALSE)
120 | ```
121 |
122 | ## Example 15.8 (Comparing objects for equality)
123 |
124 | ```{r }
125 | try(ifelse(all.equal(x, DF), "T", "F")) # error
126 | ifelse(isTRUE(all.equal(x, DF)), "T", "F") # correct
127 |
128 | x <- 1 - 10e-4
129 | y <- x + 2
130 | x == (y - 2) # equal mathematically but
131 | isTRUE(all.equal(x, y - 2)) #gives expected result
132 |
133 | ## does not necessarily evaluate to TRUE or FALSE
134 | try(ifelse(all.equal(x, y), "T", "F"))
135 | ## returns TRUE or FALSE
136 | ifelse(isTRUE(all.equal(x, y)), "T", "F")
137 | ```
138 |
139 | ## Example 15.9 (Display R function code)
140 |
141 | ```{r }
142 | nclass.scott
143 | ```
144 |
145 | ## Example 15.10 (RSiteSearch)
146 |
147 | ```{r }
148 | if (interactive())
149 | RSiteSearch("ggcorr")
150 | ```
151 |
152 | ## Example 15.11 (UseMethod)
153 |
154 | ```{r }
155 | body(density)
156 | args(density.default)
157 | body(density.default)
158 | ```
159 |
160 | ## Example 15.12 (Show methods)
161 |
162 | ```{r }
163 | methods(t.test)
164 | getAnywhere(t.test.formula)
165 | body(stats:::t.test.formula)
166 | ```
167 |
168 | ## Example 15.13 (Object not found or not an exported object)
169 |
170 | ```{r }
171 | try(perc.ci)
172 | try(boot::perc.ci)
173 | getAnywhere(perc.ci)
174 | args(boot:::perc.ci)
175 | body(boot:::perc.ci)
176 | boot:::perc.ci
177 | getFromNamespace("perc.ci", "boot")
178 | ```
179 |
180 | ## Example 15.14 (getS3method)
181 |
182 | ```{r }
183 | library(microbenchmark)
184 | library(ggplot2)
185 | getAnywhere(autoplot)
186 | getS3method("autoplot", class = "microbenchmark")
187 | getAnywhere(autoplot.microbenchmark)
188 | ```
189 |
190 | ## Example 15.15 (.Primitive or .Internal)
191 |
192 | ```{r }
193 | if (interactive())
194 | pryr::show_c_source(.Primitive(cumsum(x)))
195 | ```
196 |
197 | ## Example 15.16 (.Call, .External, .C or .Fortran)
198 |
199 | ```{r }
200 | #### dist is implemented by a .Call to C_Cdist
201 |
202 | body(dist)
203 | ```
204 |
205 | ## Example 15.17 (A first Rcpp experiment)
206 | ### In RStudio use the menu
207 | ### File > New File > C++ file to display this code
208 | ## Example 15.18 (cppFunction)
209 |
210 | ```{r }
211 | library(Rcpp)
212 |
213 | set.seed(1)
214 | x <- matrix(rnorm(20), nrow = 5, ncol = 4)
215 |
216 | cppFunction('double vecnorm(NumericVector v) {
217 | // compute the Euclidean norm of vector v
218 | int d = v.size();
219 | double s = 0.0;
220 | for (int i = 0; i < d; i++)
221 | s += v(i) * v(i);
222 | return sqrt(s);
223 | }')
224 |
225 | print(vecnorm(x[, 1]))
226 | print(apply(x, MARGIN = 2, FUN = "vecnorm"))
227 | ```
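228 | 
229 | As a quick sanity check (added, not in the original), the Rcpp function should agree with the base R one-liner for the Euclidean norm:
230 | 
231 | ```{r}
232 | # vecnorm(v) computes the same value as sqrt(sum(v^2))
233 | all.equal(vecnorm(x[, 1]), sqrt(sum(x[, 1]^2)))
234 | ```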
228 |
229 | ## Example 15.19 (sourceCpp)
230 | ### Create the C++ source file "printme.cpp" by editing the template
231 | ### in RStudio menu: File > New File > C++
232 |
233 | ```{r }
234 | library(Rcpp)
235 | sourceCpp("../examples-cpp/printme.cpp")
236 | x <- sample(1:5)
237 | print_me(x)
238 | ```
239 |
240 | ## Example 15.20 (Lahman baseball data)
241 |
242 | ```{r }
243 | library(Lahman)
244 | str(Batting)
245 | # method 1
246 | S <- subset(Batting, Batting$yearID == 1999,
247 | select = c("playerID", "AB", "H"))
248 |
249 | # method 2 (dplyr)
250 | library(dplyr)
251 | Batting %>% filter(yearID == 1999) -> b
252 |
253 | # method 1
254 | AB <- as.vector(by(S$AB, S$playerID, FUN = sum))
255 | H <- as.vector(by(S$H, S$playerID, FUN = sum))
256 | S <- data.frame(playerID = unique(S$playerID),
257 | AB = AB, H = H, AVG = round(H / AB, 3),
258 | stringsAsFactors = FALSE)
259 | S400 <- S[S$AB >= 400, ]
260 |
261 | # method 2 (dplyr)
262 | b %>% group_by(playerID) %>%
263 | summarize(AB = sum(AB), H = sum(H)) -> S
264 | S %>% mutate(AVG = round(H / AB, 3)) -> S
265 | S %>% filter(AB >= 400) -> S400
266 |
267 | # method 1
268 | o <- order(S400$AVG, decreasing = TRUE)
269 | S400 <- S400[o, ]
270 | top <- S400[1:10, ]
271 |
272 | # method 2 (dplyr)
273 | S400 %>% arrange(desc(AVG)) -> S400
274 | slice(S400, 1:10) -> top
275 | top
276 |
277 | People %>% select(playerID, nameFirst, nameLast) -> m
278 | top %>% inner_join(m) %>%
279 | select(nameFirst, nameLast, AVG)
280 | ```
281 |
282 | ## Example 15.21 (Comparison with microbenchmark)
283 | ### The source for this example is in R Markdown format
284 | ### Open "Lahman.Rmd" in RStudio and knit to see report
285 |
--------------------------------------------------------------------------------
/examples-Rmd/SCR2e-ResampAppl.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Code for SCR 2e Examples"
3 | subtitle: "M. L. Rizzo (2019). *Statistical Computing with R, 2nd ed.*, Chapman & Hall/CRC. https://github.com/mariarizzo/SCR2e"
4 | output:
5 | html_document:
6 | code_folding: show
7 | theme: lumen
8 | toc: yes
9 | toc_float: yes
10 | df_print: paged
11 | ---
12 |
13 | # Chapter 9: Resampling Applications
14 |
15 | ## Example 9.1 (Jackknife-after-bootstrap)
16 |
17 | ```{r }
18 | library(boot)
19 | library(bootstrap)
20 | set.seed(1111)
21 |
22 | theta.boot <- function(patch, i) {
23 | # function to compute the patch ratio statistic
24 | y <- patch[i, "y"]
25 | z <- patch[i, "z"]
26 | mean(y) / mean(z)
27 | }
28 |
29 | boot.out <- boot(bootstrap::patch,
30 | statistic = theta.boot, R=2000)
31 | A <- boot.array(boot.out)
32 | head(A, 3)
33 | mean(A[, 1] == 0)
34 | ```
35 |
36 | Jackknife-after-bootstrap to estimate se(se):
37 |
38 | ```{r }
39 | A <- boot.array(boot.out)
40 | theta.b <- boot.out$t
41 | n <- NROW(patch)
42 | jack.se <- numeric(n)
43 |
44 | for (i in 1:n) {
45 | #in i-th replicate omit all samples with x[i]
46 | keep <- which(A[, i] == 0)
47 | jack.se[i] <- sd(theta.b[keep])
48 | }
49 |
50 | print(boot.out) #for se_boot
51 | se.bar <- mean(jack.se)
52 | se.se <- sqrt((n-1) * mean((jack.se - se.bar)^2))
53 | print(paste("Jackknife-after-bootstrap est. se(se)=", se.se))
54 | ```
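55 | 
56 | In symbols (added for reference), the computation above is the usual jackknife formula applied to the bootstrap standard errors:
57 | 
58 | $$\widehat{se}(\widehat{se}_B) = \sqrt{\frac{n-1}{n} \sum_{i=1}^{n} \left( \widehat{se}_{(i)} - \overline{\widehat{se}} \right)^2},$$
59 | 
60 | where $\widehat{se}_{(i)}$ is the standard deviation of the bootstrap replicates that omit observation $i$; note that `(n-1) * mean(x)` equals $\frac{n-1}{n} \sum x$.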
55 |
56 | ## Example 9.2 (Jackknife-after-bootstrap)
57 | Initialize:
58 |
59 | ```{r }
60 | data(patch, package = "bootstrap")
61 | y <- patch$y
62 | z <- patch$z
63 | dat <- cbind(y, z)
64 | n <- NROW(dat)
65 | B <- 2000
66 | ```
67 |
68 | Jackknife-after-bootstrap step 1: run the bootstrap.
69 |
70 | ```{r }
71 | theta_boot <- function(dat, ind) {
72 | # function to compute the statistic
73 | y <- dat[ind, 1]
74 | z <- dat[ind, 2]
75 | mean(y) / mean(z)
76 | }
77 |
78 | boot.obj <- boot(dat, statistic = theta_boot, R=2000)
79 | theta.hat <- boot.obj$t0
80 | theta.b <- boot.obj$t
81 | se.boot <- sd(theta.b)
82 | ```
83 |
84 | Jackknife-after-bootstrap step 2: estimate se(se).
85 |
86 | ```{r }
87 | sample.freq <- boot.array(boot.obj)
88 | se.se.reps <- numeric(n)
89 | N <- 1:n
90 |
91 | for (i in N) {
92 | # jackknife-after-bootstrap
93 | # omit all bootstrap samples that contain obs i
94 | keep <- which(sample.freq[ ,i] == 0)
95 | se.se.reps[i] <- sd(theta.b[keep])
96 | }
97 |
98 | print(boot.obj)
99 | se.bar <- mean(se.se.reps)
100 | se.se <- sqrt((n-1) * mean((se.se.reps - se.bar)^2))
101 | se.se
102 | ```
103 |
104 | ## Example 9.3 (ironslag linear model)
105 |
106 | ```{r }
107 | library(ggplot2)
108 | library(DAAG)
109 | L1 <- lm(magnetic ~ chemical, data=ironslag)
110 | cf3 <- round(L1$coeff, 3)
111 | cap <- paste("Fit: magnetic =", cf3[1], "+", cf3[2], "chemical")
112 |
113 | ggplot(data=ironslag, aes(chemical, magnetic)) +
114 | geom_point() + geom_smooth(method="lm") +
115 | ggtitle(cap)
116 |
117 | plot(L1, which=1:2, ask=FALSE) #residual plots
118 | ```
119 |
120 | ## Example 9.4 (mammals data)
121 |
122 | ```{r, mammals1}
123 | library(MASS)
124 | cor(log(mammals$body), log(mammals$brain))
125 | summary(mammals)
126 |
127 | y <- log(mammals$brain)
128 | x <- log(mammals$body)
129 | L <- lm(y ~ x)
130 | L
131 |
132 | cap <- paste("Fit: log(brain) =", round(L$coeff[1],3),
133 | "+", round(L$coeff[2],3), "log(body)")
134 | ggplot(data=mammals, aes(x, y)) +
135 | geom_point() + geom_smooth(method="lm") +
136 | labs(x = "log(body)", y = "log(brain)", title = cap)
137 |
138 | summary(L)$r.squared
139 | ```
140 |
141 | ## Example 9.5 (ironslag data, resampling cases)
142 |
143 | ```{r }
144 | x <- ironslag$chemical
145 | y <- ironslag$magnetic
146 | m <- 2000
147 | n <- NROW(x)
148 | L1 <- lm(y ~ x) #estimate the model
149 | b0 <- L1$coeff[1]; b1 <- L1$coeff[2]
150 | ```
151 |
152 | #### run bootstrap of cases
153 |
154 | ```{r }
155 | out <- replicate(m, expr={
156 | i <- sample(1:n, replace=TRUE, size=n)
157 | xstar <- x[i]
158 | ystar <- y[i]
159 | Lb <- lm(ystar ~ xstar)
160 | s <- summary(Lb)$sigma
161 | c(Lb$coeff[1], slope=Lb$coeff[2], s=s)
162 | })
163 |
164 | bootCases <- t(out)
165 | meanCases <- colMeans(bootCases)
166 | sdCases <- apply(bootCases, 2, "sd")
167 | meanCases
168 | sdCases
169 |
170 | biasInt <- mean(bootCases[,1] - b0) #bias for intercept
171 | biasSlope <- mean(bootCases[,2] - b1) #bias for slope
172 |
173 | rbind(estimate=c(b0, b1), bias=c(biasInt, biasSlope),
174 | se=sdCases[1:2], cv=c(biasInt, biasSlope)/sdCases[1:2])
175 | ```
176 |
177 | ## Example 9.6 (Resampling cases using the boot function)
178 |
179 |
180 | ```{r }
181 | set.seed(1104)
182 | library(boot)
183 | m <- 2000
184 | stats <- function(dat, i) {
185 | x <- dat$chemical[i]
186 | y <- dat$magnetic[i]
187 | Lb <- lm(y ~ x)
188 | s <- summary(Lb)$sigma
189 | c(Lb$coeff[1], slope=Lb$coeff[2], s=s)
190 | }
191 |
192 | boot.out <- boot(ironslag, statistic=stats, R=2000)
193 | boot.out
194 | boot.out$t0
195 |
196 | sd(boot.out$t[,2])
197 | boottbl <- broom::tidy(boot.out)
198 | boottbl$std.error[2]
199 |
200 | MASS::truehist(boot.out$t[ ,2], main="", xlab="slopes")
201 | abline(v = boot.out$t0[2], lwd=2)
202 |
203 | boot.ci(boot.out, index=2, type=c("norm","perc","basic","bca"))
204 | ```
205 |
206 | ## Example 9.7 (Resampling errors: mammals data)
207 |
208 | Resampling errors:
209 |
210 | ```{r}
211 | x <- log(mammals$body)   #as in Example 9.4: log(brain) ~ log(body)
212 | y <- log(mammals$brain)
213 | L <- lm(y ~ x)
214 | ```
215 |
216 | ```{r}
217 | m.resid <- rstandard(L, sd = 1)
218 | r <- m.resid - mean(m.resid)
219 | m <- 1000; n <- NROW(x)
220 | estsErr <- replicate(m, expr={
221 | estar <- sample(r, replace=TRUE, size=n)
222 | ystar <- L$fitted.values + estar
223 | Lb <- lm(ystar ~ x)
224 | s <- summary(Lb)$sigma
225 | c(b0=Lb$coeff[1], b1=Lb$coeff[2], s=s)
226 | })
227 | ests <- t(estsErr)
228 | summary(ests)
229 | ```
230 |
231 | ## Example 9.8 (Resampling errors, continued)
232 |
233 | ```{r }
234 | sd(ests[,2])
235 | s <- summary(L)$sigma
236 | SSx <- (n - 1) * var(x)
237 | se.beta1 <- sqrt(s^2 / SSx)
238 | se.beta1
239 | s * sqrt(1/n + mean(x)^2 / SSx)
240 | sd(ests[,1])
241 |
242 | betas <- summary(L)$coeff
243 | betas
244 | betas[, "Std. Error"]
245 |
246 | broom::tidy(summary(L))
247 | broom::tidy(summary(L))$std.error
248 | ```
249 |
250 | ## Example 9.9 (Model based resampling with the boot function)
251 |
252 | ```{r }
253 | regstats <- function(dat, i) {
254 | #dat is a data frame (r, x, yhat)
255 | #r are the modified centered residuals, yhat are the fits
256 | ystar <- dat$yhat + dat$r[i]
257 | xstar <- dat$x
258 | Lnew <- lm(ystar ~ xstar)
259 | Lnew$coefficients
260 | }
261 |
262 | y <- log(mammals$brain)
263 | x <- log(mammals$body)
264 | L <- lm(y ~ x)
265 | r <- rstandard(L, sd=1)
266 | r <- r - mean(r)
267 | df <- data.frame(r=r, x=x, yhat=L$fitted)
268 | head(df)
269 | boot.obj <- boot(data=df, statistic=regstats, R=2000)
270 | broom::tidy(boot.obj)
271 | ```
272 |
273 | ## Example 9.10 (Empirical influence values for the patch ratio statistic)
274 |
275 | ```{r }
276 | library(boot)
277 | library(bootstrap)
278 | theta_boot <- function(dat, ind) {
279 | # function to compute the patch ratio statistic
280 | mean(dat[ind, ]$y) / mean(dat[ind, ]$z)
281 | }
282 | boot.out <- boot(patch, theta_boot, R = 2000)
283 | infl <- empinf(boot.out, type = "jack")
284 | theta.hat <- boot.out$t0
285 | jack <- theta.hat - infl / (nrow(patch) - 1)
286 | rbind(infl, jack)
287 | ```
288 |
289 | ## Example 9.11 (Jackknife-after-bootstrap plot)
290 |
291 | ```{r }
292 | jack.after.boot(boot.out, useJ=TRUE, stinf=FALSE)
293 |
294 | n <- NROW(patch)
295 | J <- numeric(n)
296 | b.freq <- boot.array(boot.out)
297 | theta.b <- boot.out$t
298 |
299 | for (i in 1:n) {
300 | keep <- which(b.freq[ ,i] == 0)
301 | J[i] <- mean(theta.b[keep])
302 | }
303 | ```
304 |
305 | The jackknife influence values:
306 |
307 | ```{r }
308 | (n - 1) * (mean(J) - J)
309 |
310 | jack.after.boot(boot.out, useJ=TRUE, stinf=TRUE)
311 | ```
312 |
313 |
--------------------------------------------------------------------------------
/examples-Rmd/SCR2e-chBoot.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Code for SCR 2e Examples"
3 | subtitle: "M. L. Rizzo (2019). *Statistical Computing with R, 2nd ed.*, Chapman & Hall/CRC. https://github.com/mariarizzo/SCR2e"
4 | output:
5 | html_document:
6 | code_folding: show
7 | theme: lumen
8 | toc: yes
9 | toc_float: yes
10 | df_print: paged
11 | ---
12 |
13 | # Chapter 8: Bootstrap and Jackknife
14 |
15 | ## Example 8.2 (Bootstrap estimate of standard error)
16 |
17 | ```{r }
18 | library(bootstrap) #for the law data
19 | print(cor(law$LSAT, law$GPA))
20 | print(cor(law82$LSAT, law82$GPA))
21 |
22 | #set up the bootstrap
23 | B <- 200 #number of replicates
24 | n <- nrow(law) #sample size
25 | R <- numeric(B) #storage for replicates
26 |
27 | #bootstrap estimate of standard error of R
28 | for (b in 1:B) {
29 | #randomly select the indices
30 | i <- sample(1:n, size = n, replace = TRUE)
31 | LSAT <- law$LSAT[i] #i is a vector of indices
32 | GPA <- law$GPA[i]
33 | R[b] <- cor(LSAT, GPA)
34 | }
35 | #output
36 | print(se.R <- sd(R))
37 | hist(R, prob = TRUE)
38 | ```
39 |
40 | ## Example 8.3 (Bootstrap estimate of standard error: boot function)
41 |
42 | ```{r }
43 | r <- function(x, i) {
44 | #want correlation of columns 1 and 2
45 | cor(x[i,1], x[i,2])
46 | }
47 |
48 | library(boot) #for boot function
49 | obj <- boot(data = law, statistic = r, R = 2000)
50 | obj
51 | y <- obj$t
52 | sd(y)
53 | ```
54 |
55 | ## Example 8.4 (Bootstrap estimate of bias)
56 |
57 | ```{r }
58 | #sample estimate for n=15
59 | theta.hat <- cor(law$LSAT, law$GPA)
60 |
61 | #bootstrap estimate of bias
62 | B <- 2000 #larger for estimating bias
63 | n <- nrow(law)
64 | theta.b <- numeric(B)
65 |
66 | for (b in 1:B) {
67 | i <- sample(1:n, size = n, replace = TRUE)
68 | LSAT <- law$LSAT[i]
69 | GPA <- law$GPA[i]
70 | theta.b[b] <- cor(LSAT, GPA)
71 | }
72 | bias <- mean(theta.b - theta.hat)
73 | bias
74 | ```
75 |
76 | ## Example 8.5 (Bootstrap estimate of bias of a ratio estimate)
77 |
78 | ```{r }
79 | data(patch, package = "bootstrap")
80 | patch
81 |
82 | n <- nrow(patch) #in bootstrap package
83 | B <- 2000
84 | theta.b <- numeric(B)
85 | theta.hat <- mean(patch$y) / mean(patch$z)
86 |
87 | #bootstrap
88 | for (b in 1:B) {
89 | i <- sample(1:n, size = n, replace = TRUE)
90 | y <- patch$y[i]
91 | z <- patch$z[i]
92 | theta.b[b] <- mean(y) / mean(z)
93 | }
94 | bias <- mean(theta.b) - theta.hat
95 | se <- sd(theta.b)
96 | print(list(est=theta.hat, bias = bias,
97 | se = se, cv = bias/se))
98 | ```
99 |
100 | ## Example 8.6 (Jackknife estimate of bias)
101 |
102 | ```{r }
103 | data(patch, package = "bootstrap")
104 | n <- nrow(patch)
105 | y <- patch$y
106 | z <- patch$z
107 | theta.hat <- mean(y) / mean(z)
108 | print (theta.hat)
109 |
110 | #compute the jackknife replicates, leave-one-out estimates
111 | theta.jack <- numeric(n)
112 | for (i in 1:n)
113 | theta.jack[i] <- mean(y[-i]) / mean(z[-i])
114 | bias <- (n - 1) * (mean(theta.jack) - theta.hat)
115 |
116 | print(bias) #jackknife estimate of bias
117 | ```
118 |
119 | ## Example 8.7 (Jackknife estimate of standard error)
120 |
121 | ```{r }
122 | se <- sqrt((n-1) *
123 | mean((theta.jack - mean(theta.jack))^2))
124 | print(se)
125 | ```
126 |
127 | ## Example 8.8 (Failure of jackknife)
128 |
129 | ```{r }
130 | set.seed(123) #for the specific example given
131 | #change the seed to see other examples
132 |
133 | n <- 10
134 | x <- sample(1:100, size = n)
135 |
136 | #jackknife estimate of se
137 | M <- numeric(n)
138 | for (i in 1:n) { #leave one out
139 | y <- x[-i]
140 | M[i] <- median(y)
141 | }
142 | Mbar <- mean(M)
143 | print(sqrt((n-1)/n * sum((M - Mbar)^2)))
144 |
145 | #bootstrap estimate of se
146 | Mb <- replicate(1000, expr = {
147 | y <- sample(x, size = n, replace = TRUE)
148 | median(y) })
149 | print(sd(Mb))
150 | print(x)
151 | print(M)
152 | print(Mb)
153 | ```
154 |
155 | ## Example 8.9 (Bootstrap confidence intervals for patch ratio statistic)
156 |
157 | ```{r }
158 | library(boot) #for boot and boot.ci
159 | data(patch, package = "bootstrap")
160 |
161 | theta.boot <- function(dat, ind) {
162 | #function to compute the statistic
163 | y <- dat[ind, 1]
164 | z <- dat[ind, 2]
165 | mean(y) / mean(z)
166 | }
167 |
168 | y <- patch$y
169 | z <- patch$z
170 | dat <- cbind(y, z)
171 | boot.obj <- boot(dat, statistic = theta.boot, R = 2000)
172 |
173 | print(boot.obj)
174 | print(boot.ci(boot.obj,
175 | type = c("basic", "norm", "perc")))
176 |
177 |
178 | #calculations for bootstrap confidence intervals
179 | alpha <- c(.025, .975)
180 |
181 | #normal
182 | print(boot.obj$t0 + qnorm(alpha) * sd(boot.obj$t))
183 |
184 | #basic
185 | print(2*boot.obj$t0 -
186 | quantile(boot.obj$t, rev(alpha), type=1))
187 |
188 | #percentile
189 | print(quantile(boot.obj$t, alpha, type=6))
190 | ```
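191 | 
192 | For reference (added), writing $\hat\theta^*_{\alpha}$ for the bootstrap quantiles and $z_{\alpha} = \Phi^{-1}(\alpha)$, the three intervals computed by hand above are
193 | 
194 | $$\text{normal: } \hat\theta \pm z_{0.975}\, \widehat{se}_B, \qquad \text{basic: } \left(2\hat\theta - \hat\theta^*_{0.975},\ 2\hat\theta - \hat\theta^*_{0.025}\right), \qquad \text{percentile: } \left(\hat\theta^*_{0.025},\ \hat\theta^*_{0.975}\right).$$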
191 |
192 | ## Example 8.10 (Bootstrap confidence intervals for the correlation statistic)
193 |
194 | ```{r }
195 | library(boot)
196 | data(law, package = "bootstrap")
197 | boot.obj <- boot(law, R = 2000,
198 | statistic = function(x, i){cor(x[i,1], x[i,2])})
199 | print(boot.ci(boot.obj, type=c("basic","norm","perc")))
200 | ```
201 |
202 | ## Example 8.11 (Bootstrap t confidence interval)
203 |
204 | ```{r }
205 | boot.t.ci <-
206 | function(x, B = 500, R = 100, level = .95, statistic){
207 | #compute the bootstrap t CI
208 | x <- as.matrix(x); n <- nrow(x)
209 | stat <- numeric(B); se <- numeric(B)
210 |
211 | boot.se <- function(x, R, f) {
212 | #local function to compute the bootstrap
213 | #estimate of standard error for statistic f(x)
214 | x <- as.matrix(x); m <- nrow(x)
215 | th <- replicate(R, expr = {
216 | i <- sample(1:m, size = m, replace = TRUE)
217 | f(x[i, ])
218 | })
219 | return(sd(th))
220 | }
221 |
222 | for (b in 1:B) {
223 | j <- sample(1:n, size = n, replace = TRUE)
224 | y <- x[j, ]
225 | stat[b] <- statistic(y)
226 | se[b] <- boot.se(y, R = R, f = statistic)
227 | }
228 | stat0 <- statistic(x)
229 | t.stats <- (stat - stat0) / se
230 | se0 <- sd(stat)
231 | alpha <- 1 - level
232 | Qt <- quantile(t.stats, c(alpha/2, 1-alpha/2), type = 1)
233 | names(Qt) <- rev(names(Qt))
234 | CI <- rev(stat0 - Qt * se0)  #the CI is the (invisible) return value
235 | }
236 | ```
237 |
238 | ## Example 8.12 (Bootstrap t confidence interval for patch ratio statistic)
239 |
240 | ```{r }
241 | #boot package and patch data were loaded in Example 8.9
242 | #library(boot) #for boot and boot.ci
243 | #data(patch, package = "bootstrap")
244 |
245 | dat <- cbind(patch$y, patch$z)
246 | stat <- function(dat) {
247 | mean(dat[, 1]) / mean(dat[, 2]) }
248 | ci <- boot.t.ci(dat, statistic = stat, B=2000, R=200)
249 | print(ci)
250 | ```
251 |
252 | ## Example 8.13 (BCa bootstrap confidence interval)
253 |
254 | ```{r }
255 | boot.BCa <-
256 | function(x, th0, th, stat, conf = .95) {
257 | # bootstrap with BCa bootstrap confidence interval
258 | # th0 is the observed statistic
259 | # th is the vector of bootstrap replicates
260 | # stat is the function to compute the statistic
261 |
262 | x <- as.matrix(x)
263 | n <- nrow(x) #observations in rows
264 | N <- 1:n
265 | alpha <- (1 + c(-conf, conf))/2
266 | zalpha <- qnorm(alpha)
267 |
268 | # the bias correction factor
269 | z0 <- qnorm(sum(th < th0) / length(th))
270 |
271 | # the acceleration factor (jackknife est.)
272 | th.jack <- numeric(n)
273 | for (i in 1:n) {
274 | J <- N[1:(n-1)]
275 | th.jack[i] <- stat(x[-i, ], J)
276 | }
277 | L <- mean(th.jack) - th.jack
278 | a <- sum(L^3)/(6 * sum(L^2)^1.5)
279 |
280 | # BCa conf. limits
281 | adj.alpha <- pnorm(z0 + (z0+zalpha)/(1-a*(z0+zalpha)))
282 | limits <- quantile(th, adj.alpha, type=6)
283 | return(list("est"=th0, "BCa"=limits))
284 | }
285 | ```
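286 | 
287 | The adjusted quantile levels used above follow the standard BCa formulas (added for reference):
288 | 
289 | $$z_0 = \Phi^{-1}\!\left(\frac{\#\{\hat\theta^*_b < \hat\theta\}}{B}\right), \qquad a = \frac{\sum_{i=1}^n L_i^3}{6 \left(\sum_{i=1}^n L_i^2\right)^{3/2}}, \qquad \alpha_j^{adj} = \Phi\!\left(z_0 + \frac{z_0 + z_{\alpha_j}}{1 - a (z_0 + z_{\alpha_j})}\right),$$
290 | 
291 | where $L_i = \overline{\hat\theta}_{(\cdot)} - \hat\theta_{(i)}$ are the jackknife influence values computed in the loop.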
286 |
287 | ## Example 8.14 (BCa bootstrap confidence interval)
288 |
289 | ```{r }
290 | #boot package and patch data were loaded in Example 8.9
291 | #library(boot) #for boot and boot.ci
292 | #data(patch, package = "bootstrap")
293 |
294 | n <- nrow(patch)
295 | B <- 2000
296 | y <- patch$y
297 | z <- patch$z
298 | x <- cbind(y, z)
299 | theta.b <- numeric(B)
300 | theta.hat <- mean(y) / mean(z)
301 |
302 | #bootstrap
303 | for (b in 1:B) {
304 | i <- sample(1:n, size = n, replace = TRUE)
305 | y <- patch$y[i]
306 | z <- patch$z[i]
307 | theta.b[b] <- mean(y) / mean(z)
308 | }
309 | #compute the BCa interval
310 | stat <- function(dat, index) {
311 | mean(dat[index, 1]) / mean(dat[index, 2]) }
312 |
313 | boot.BCa(x, th0 = theta.hat, th = theta.b, stat = stat)
314 | ```
315 |
316 | ## Example 8.15 (BCa bootstrap confidence interval using boot.ci)
317 |
318 | ```{r }
319 | #using x from Example 8.14
320 | boot.obj <- boot(x, statistic = stat, R=2000)
321 | boot.ci(boot.obj, type=c("perc", "bca"))
322 | ```
323 |
324 | ## Example 8.16 (Model selection)
325 |
326 | ```{r }
327 | #to prompt for next graph, uncomment line below
328 | #par(ask = TRUE)
329 |
330 | library(DAAG); attach(ironslag)
331 | a <- seq(10, 40, .1) #sequence for plotting fits
332 |
333 | L1 <- lm(magnetic ~ chemical)
334 | plot(chemical, magnetic, main="Linear", pch=16)
335 | yhat1 <- L1$coef[1] + L1$coef[2] * a
336 | lines(a, yhat1, lwd=2)
337 |
338 | L2 <- lm(magnetic ~ chemical + I(chemical^2))
339 | plot(chemical, magnetic, main="Quadratic", pch=16)
340 | yhat2 <- L2$coef[1] + L2$coef[2] * a + L2$coef[3] * a^2
341 | lines(a, yhat2, lwd=2)
342 |
343 | L3 <- lm(log(magnetic) ~ chemical)
344 | plot(chemical, magnetic, main="Exponential", pch=16)
345 | logyhat3 <- L3$coef[1] + L3$coef[2] * a
346 | yhat3 <- exp(logyhat3)
347 | lines(a, yhat3, lwd=2)
348 |
349 | L4 <- lm(log(magnetic) ~ log(chemical))
350 | plot(log(chemical), log(magnetic), main="Log-Log", pch=16)
351 | logyhat4 <- L4$coef[1] + L4$coef[2] * log(a)
352 | lines(log(a), logyhat4, lwd=2)
353 | ```
354 |
355 | ## Example 8.17 (Model selection: Cross validation)
356 |
357 | ```{r }
358 | # Example 8.16, cont.
359 | n <- length(magnetic) #in DAAG ironslag
360 | e1 <- e2 <- e3 <- e4 <- numeric(n)
361 |
362 | # for n-fold cross validation
363 | # fit models on leave-one-out samples
364 | for (k in 1:n) {
365 | y <- magnetic[-k]
366 | x <- chemical[-k]
367 |
368 | J1 <- lm(y ~ x)
369 | yhat1 <- J1$coef[1] + J1$coef[2] * chemical[k]
370 | e1[k] <- magnetic[k] - yhat1
371 |
372 | J2 <- lm(y ~ x + I(x^2))
373 | yhat2 <- J2$coef[1] + J2$coef[2] * chemical[k] +
374 | J2$coef[3] * chemical[k]^2
375 | e2[k] <- magnetic[k] - yhat2
376 |
377 | J3 <- lm(log(y) ~ x)
378 | logyhat3 <- J3$coef[1] + J3$coef[2] * chemical[k]
379 | yhat3 <- exp(logyhat3)
380 | e3[k] <- magnetic[k] - yhat3
381 |
382 | J4 <- lm(log(y) ~ log(x))
383 | logyhat4 <- J4$coef[1] + J4$coef[2] * log(chemical[k])
384 | yhat4 <- exp(logyhat4)
385 | e4[k] <- magnetic[k] - yhat4
386 | }
387 |
388 |
389 | c(mean(e1^2), mean(e2^2), mean(e3^2), mean(e4^2))
390 |
391 | #selected model, fitted in Example 8.16
392 | L2
393 |
394 | par(mfrow = c(2, 2)) #layout for graphs
395 | plot(L2$fit, L2$res) #residuals vs fitted values
396 | abline(0, 0) #reference line
397 | qqnorm(L2$res) #normal probability plot
398 | qqline(L2$res) #reference line
399 | par(mfrow = c(1, 1)) #restore display
400 | ```
401 |
402 |
--------------------------------------------------------------------------------
/examples-Rmd/SCR2e-chMCint.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Code for SCR 2e Examples"
3 | subtitle: "M. L. Rizzo (2019). *Statistical Computing with R, 2nd ed.*, Chapman & Hall/CRC. https://github.com/mariarizzo/SCR2e"
4 | output:
5 | html_document:
6 | code_folding: show
7 | theme: lumen
8 | toc: yes
9 | toc_float: yes
10 | df_print: paged
11 | ---
12 |
13 | # Chapter 6: Monte Carlo Integration and Variance Reduction
14 |
15 | ## Example 6.1 (Simple Monte Carlo integration)
16 |
17 | ```{r }
18 | m <- 10000
19 | x <- runif(m)
20 | theta.hat <- mean(exp(-x))
21 | print(theta.hat)
22 | print(1 - exp(-1))
23 | ```
24 |
25 | ## Example 6.2 (Simple Monte Carlo integration, cont.)
26 |
27 | ```{r }
28 | m <- 10000
29 | x <- runif(m, min=2, max=4)
30 | theta.hat <- mean(exp(-x)) * 2
31 | print(theta.hat)
32 | print(exp(-2) - exp(-4))
33 | ```
34 |
35 | ## Example 6.3 (Monte Carlo integration, unbounded interval)
36 |
37 | ```{r }
38 | x <- seq(.1, 2.5, length = 10)
39 | m <- 10000
40 | u <- runif(m)
41 | cdf <- numeric(length(x))
42 | for (i in 1:length(x)) {
43 | g <- x[i] * exp(-(u * x[i])^2 / 2)
44 | cdf[i] <- mean(g) / sqrt(2 * pi) + 0.5
45 | }
46 |
47 | Phi <- pnorm(x)
48 | print(round(rbind(x, cdf, Phi), 3))
49 | ```
50 |
51 | ## Example 6.4 (Example 6.3, cont.)
52 |
53 | ```{r }
54 | x <- seq(.1, 2.5, length = 10)
55 | m <- 10000
56 | z <- rnorm(m)
57 | dim(x) <- length(x)
58 | p <- apply(x, MARGIN = 1,
59 | FUN = function(x, z) {mean(z < x)}, z = z)
60 |
61 | Phi <- pnorm(x)
62 | print(round(rbind(x, p, Phi), 3))
63 | ```
64 |
65 | ## Example 6.5 (Error bounds for MC integration)
66 |
67 | ```{r }
68 | x <- 2
69 | m <- 10000
70 | z <- rnorm(m)
71 | g <- (z < x) #the indicator function
72 | v <- mean((g - mean(g))^2) / m
73 | cdf <- mean(g)
74 | c(cdf, v)
75 | c(cdf - 1.96 * sqrt(v), cdf + 1.96 * sqrt(v))
76 | ```
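77 | 
78 | In symbols (added for reference), the code estimates $Var(\hat\theta)$ by $\hat\sigma^2 / m$ with $\hat\sigma^2 = \frac{1}{m} \sum_{i=1}^m (g(x_i) - \bar g)^2$, so the approximate 95% confidence interval is
79 | 
80 | $$\hat\theta \pm 1.96\, \frac{\hat\sigma}{\sqrt{m}}.$$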
77 |
78 | ## Example 6.6 (Antithetic variables)
79 |
80 | ```{r }
81 | MC.Phi <- function(x, R = 10000, antithetic = TRUE) {
82 | u <- runif(R/2)
83 | if (!antithetic) v <- runif(R/2) else
84 | v <- 1 - u
85 | u <- c(u, v)
86 | cdf <- numeric(length(x))
87 | for (i in 1:length(x)) {
88 | g <- x[i] * exp(-(u * x[i])^2 / 2)
89 | cdf[i] <- mean(g) / sqrt(2 * pi) + 0.5
90 | }
91 | cdf
92 | }
93 |
94 |
95 | x <- seq(.1, 2.5, length=5)
96 | Phi <- pnorm(x)
97 | set.seed(123)
98 | MC1 <- MC.Phi(x, anti = FALSE)
99 | set.seed(123)
100 | MC2 <- MC.Phi(x)
101 | print(round(rbind(x, MC1, MC2, Phi), 5))
102 |
103 |
104 | m <- 1000
105 | MC1 <- MC2 <- numeric(m)
106 | x <- 1.95
107 | for (i in 1:m) {
108 | MC1[i] <- MC.Phi(x, R = 1000, anti = FALSE)
109 | MC2[i] <- MC.Phi(x, R = 1000)
110 | }
111 |
112 | print(sd(MC1))
113 | print(sd(MC2))
114 | print((var(MC1) - var(MC2))/var(MC1))
115 | ```
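116 | 
117 | The variance reduction printed above reflects the identity (added for reference)
118 | 
119 | $$Var\!\left(\frac{T_1 + T_2}{2}\right) = \frac{Var(T_1)}{2} (1 + \rho),$$
120 | 
121 | where $\rho = Cor(T_1, T_2)$ and $Var(T_1) = Var(T_2)$; for antithetic pairs $\rho < 0$, so the variance falls by more than the factor of 2 gained from averaging independent replicates.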
116 |
117 | ## Example 6.7 (Control variate)
118 |
119 | ```{r }
120 | m <- 10000
121 | a <- - 12 + 6 * (exp(1) - 1)
122 | U <- runif(m)
123 | T1 <- exp(U) #simple MC
124 | T2 <- exp(U) + a * (U - 1/2) #controlled
125 |
126 | mean(T1)
127 | mean(T2)
128 | (var(T1) - var(T2)) / var(T1)
129 | ```
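130 | 
131 | The constant `a` above is the optimal coefficient $c^* = -Cov(g(U), U) / Var(U)$ (added derivation): with $g(U) = e^U$ and $U \sim \text{Uniform}(0,1)$,
132 | 
133 | $$Cov(e^U, U) = E[U e^U] - E[U]\, E[e^U] = 1 - \frac{e - 1}{2}, \qquad Var(U) = \frac{1}{12},$$
134 | 
135 | so $c^* = -12 \left(1 - \frac{e-1}{2}\right) = 6(e - 1) - 12$, matching the value assigned to `a`.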
130 |
131 | ## Example 6.8 (MC integration using control variates)
132 |
133 | ```{r }
134 | f <- function(u)
135 | exp(-.5)/(1+u^2)
136 |
137 | g <- function(u)
138 | exp(-u)/(1+u^2)
139 |
140 | set.seed(510) #needed later
141 | u <- runif(10000)
142 | B <- f(u)
143 | A <- g(u)
144 |
145 | cor(A, B)
146 | a <- -cov(A,B) / var(B) #est of c*
147 | a
148 |
149 | m <- 100000
150 | u <- runif(m)
151 | T1 <- g(u)
152 | T2 <- T1 + a * (f(u) - exp(-.5)*pi/4)
153 |
154 | c(mean(T1), mean(T2))
155 | c(var(T1), var(T2))
156 | (var(T1) - var(T2)) / var(T1)
157 | ```
158 |
159 | ## Example 6.9 (Control variate and regression)
160 |
161 | ```{r }
162 | set.seed(510)
163 | mu <- exp(-.5)*pi/4
164 | u <- runif(10000)
165 | f <- exp(-.5)/(1+u^2)
166 | g <- exp(-u)/(1+u^2)
167 | L <- lm(g ~ f)
168 | L
169 | c.star <- - L$coeff[2]
170 | c.star
171 |
172 | theta.hat <- sum(L$coeff * c(1, mu)) #pred. value at mu
173 | theta.hat
174 | summary(L)$sigma^2
175 | summary(L)$r.squared
176 | ```
177 |
178 | ## Example 6.10 (Control variates and multiple regression)
179 |
180 | ```{r }
181 | # Example 6.9 continued with a second control variate
182 | # and multiple regression to estimate vector c*
183 |
184 | u <- runif(10000)
185 | f1 <- exp(-.5) / (1+u^2)
186 | f2 <- exp(-u) / (1-exp(-1))
187 | g <- exp(-u) / (1+u^2)
188 |
189 | L2 <- lm(g ~ f1 + f2)
190 |
191 | L2$coeff
192 | c.star <- - L2$coeff[2:3]
193 | c.star
194 | mu1 <- exp(-.5)*pi/4
195 | mu2 <- 1
196 | mu <- c(mu1, mu2)
197 |
198 | # theta.hat is the predicted response at mu
199 | # alternatively, use the predict.lm method
200 |
201 | theta.hat <- sum(L2$coeff * c(1, mu)) #pred. value at mu
202 | theta.hat
203 |
204 | ## alternatively
205 | df <- data.frame(f1=mu1, f2=mu2)
206 | theta.hat <-predict(L2, df)
207 |
208 | # MSE / n is the est. variance of the control estimator
209 | MSE <- summary(L2)$sigma^2
210 | MSE
211 | sqrt(MSE / 10000)
212 |
213 |
214 | # compare with the previous estimates using
215 | # naive MC and control variate f1(u)
216 | # var0=.060231423 var1=.003117644
217 | var0 <- 0.060231423 #naive MC
218 | var1 <- 0.003117644 #controlled estimator with f1
219 | var2 <- MSE #new estimator
220 |
221 | # percent reduction in variance
222 | # it is a weighted average of R^2 values
223 | # so easier to compute directly
224 |
225 | 100 * (var0 - var1) / var0
226 | 100 * (var1 - var2) / var1
227 | 100 * (var0 - var2) / var0
228 | ```
229 |
230 | ## Example 6.11 (Choice of the importance function)
231 |
232 | ```{r }
233 | #code for plot is at the end of the file
234 |
235 | m <- 10000
236 | theta.hat <- se <- numeric(5)
237 | g <- function(x) {
238 | exp(-x - log(1+x^2)) * (x > 0) * (x < 1)
239 | }
240 |
241 | x <- runif(m) #using f0
242 | fg <- g(x)
243 | theta.hat[1] <- mean(fg)
244 | se[1] <- sd(fg)
245 |
246 | x <- rexp(m, 1) #using f1
247 | fg <- g(x) / exp(-x)
248 | theta.hat[2] <- mean(fg)
249 | se[2] <- sd(fg)
250 |
251 | x <- rcauchy(m) #using f2
252 | i <- c(which(x > 1), which(x < 0))
253 | x[i] <- 2 #to catch overflow errors in g(x)
254 | fg <- g(x) / dcauchy(x)
255 | theta.hat[3] <- mean(fg)
256 | se[3] <- sd(fg)
257 |
258 | u <- runif(m) #f3, inverse transform method
259 | x <- - log(1 - u * (1 - exp(-1)))
260 | fg <- g(x) / (exp(-x) / (1 - exp(-1)))
261 | theta.hat[4] <- mean(fg)
262 | se[4] <- sd(fg)
263 |
264 | u <- runif(m) #f4, inverse transform method
265 | x <- tan(pi * u / 4)
266 | fg <- g(x) / (4 / ((1 + x^2) * pi))
267 | theta.hat[5] <- mean(fg)
268 | se[5] <- sd(fg)
269 |
270 | rbind(theta.hat, se / sqrt(m))
271 | ```
272 |
273 | ## Example 6.12 (Example 6.11, cont.)
274 |
275 | ```{r }
276 | M <- 20 #number of replicates
277 | T2 <- numeric(4)
278 | estimates <- matrix(0, 10, 2)
279 |
280 | g <- function(x) {
281 | exp(-x - log(1+x^2)) * (x > 0) * (x < 1) }
282 |
283 | for (i in 1:10) {
284 | estimates[i, 1] <- mean(g(runif(M)))
285 | T2[1] <- mean(g(runif(M/4, 0, .25)))
286 | T2[2] <- mean(g(runif(M/4, .25, .5)))
287 | T2[3] <- mean(g(runif(M/4, .5, .75)))
288 | T2[4] <- mean(g(runif(M/4, .75, 1)))
289 | estimates[i, 2] <- mean(T2)
290 | }
291 |
292 | estimates
293 | apply(estimates, 2, mean)
294 | apply(estimates, 2, var)
295 | ```
296 |
297 | ## Example 6.13 (Examples 6.11-6.12, cont.)
298 |
299 | ```{r }
300 | M <- 10000 #number of replicates
301 | k <- 10 #number of strata
302 | r <- M / k #replicates per stratum
303 | N <- 50 #number of times to repeat the estimation
304 | T2 <- numeric(k)
305 | estimates <- matrix(0, N, 2)
306 |
307 | g <- function(x) {
308 | exp(-x - log(1+x^2)) * (x > 0) * (x < 1)
309 | }
310 |
311 | for (i in 1:N) {
312 | estimates[i, 1] <- mean(g(runif(M)))
313 | for (j in 1:k)
314 | T2[j] <- mean(g(runif(M/k, (j-1)/k, j/k)))
315 | estimates[i, 2] <- mean(T2)
316 | }
317 |
318 | apply(estimates, 2, mean)
319 | apply(estimates, 2, var)
320 | ```
321 |
322 | ## Plot importance functions in Figures 6.1(a) and 6.1(b)
323 |
324 | ```{r }
325 | #par(ask = TRUE) #uncomment to pause between graphs
326 |
327 | x <- seq(0, 1, .01)
328 | w <- 2
329 | f1 <- exp(-x)
330 | f2 <- (1 / pi) / (1 + x^2)
331 | f3 <- exp(-x) / (1 - exp(-1))
332 | f4 <- 4 / ((1 + x^2) * pi)
333 | g <- exp(-x) / (1 + x^2)
334 |
335 | #for color change lty to col
336 |
337 | #figure (a)
338 | plot(x, g, type = "l", main = "", ylab = "",
339 | ylim = c(0,2), lwd = w)
340 | lines(x, g/g, lty = 2, lwd = w)
341 | lines(x, f1, lty = 3, lwd = w)
342 | lines(x, f2, lty = 4, lwd = w)
343 | lines(x, f3, lty = 5, lwd = w)
344 | lines(x, f4, lty = 6, lwd = w)
345 | legend("topright", legend = c("g", 0:4),
346 | lty = 1:6, lwd = w, inset = 0.02)
347 |
348 | #figure (b)
349 | plot(x, g, type = "l", main = "", ylab = "",
350 | ylim = c(0,3.2), lwd = w, lty = 2)
351 | lines(x, g/f1, lty = 3, lwd = w)
352 | lines(x, g/f2, lty = 4, lwd = w)
353 | lines(x, g/f3, lty = 5, lwd = w)
354 | lines(x, g/f4, lty = 6, lwd = w)
355 | legend("topright", legend = c(0:4),
356 | lty = 2:6, lwd = w, inset = 0.02)
357 | ```
358 |
359 |
--------------------------------------------------------------------------------
/examples-Rmd/SCR2e-chMonteCarlo.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Code for SCR 2e Examples"
3 | subtitle: "M. L. Rizzo (2019). *Statistical Computing with R, 2nd ed.*, Chapman & Hall/CRC. https://github.com/mariarizzo/SCR2e"
4 | output:
5 | html_document:
6 | code_folding: show
7 | theme: lumen
8 | toc: yes
9 | toc_float: yes
10 | df_print: paged
11 | ---
12 |
13 | # Chapter 7: Monte Carlo Methods in Inference
14 |
15 | Note: Example 7.11 may take a few minutes.
16 |
17 |
18 | ## Example 7.1 (Basic Monte Carlo estimation)
19 |
20 | ```{r 7.1}
21 | m <- 1000
22 | g <- numeric(m)
23 | for (i in 1:m) {
24 | x <- rnorm(2)
25 | g[i] <- abs(x[1] - x[2])
26 | }
27 | est <- mean(g)
28 | est
29 | ```
30 |
31 | ## Example 7.2 (Estimating the MSE of a trimmed mean)
32 |
33 | ```{r 7.2}
34 | n <- 20
35 | m <- 1000
36 | tmean <- numeric(m)
37 | for (i in 1:m) {
38 | x <- sort(rnorm(n))
39 | tmean[i] <- sum(x[2:(n-1)]) / (n-2)
40 | }
41 | mse <- mean(tmean^2)
42 | mse
43 | sqrt(sum((tmean - mean(tmean))^2)) / m #se
44 |
45 | n <- 20
46 | m <- 1000
47 | tmean <- numeric(m)
48 | for (i in 1:m) {
49 | x <- sort(rnorm(n))
50 | tmean[i] <- median(x)
51 | }
52 | mse <- mean(tmean^2)
53 | mse
54 | sqrt(sum((tmean - mean(tmean))^2)) / m #se
55 | ```
56 |
57 | ## Example 7.3 (MSE of a trimmed mean, cont.)
58 |
59 | ```{r 7.3}
60 | set.seed(522)
61 | n <- 20
62 | K <- n/2 - 1
63 | m <- 1000
64 | mse <- matrix(0, n/2, 6)
65 |
66 | trimmed.mse <- function(n, m, k, p) {
67 | #MC est of mse for k-level trimmed mean of
68 | #contaminated normal pN(0,1) + (1-p)N(0,100)
69 | tmean <- numeric(m)
70 | for (i in 1:m) {
71 | sigma <- sample(c(1, 10), size = n,
72 | replace = TRUE, prob = c(p, 1-p))
73 | x <- sort(rnorm(n, 0, sigma))
74 | tmean[i] <- sum(x[(k+1):(n-k)]) / (n-2*k)
75 | }
76 | mse.est <- mean(tmean^2)
77 | se.mse <- sqrt(mean((tmean-mean(tmean))^2)) / sqrt(m)
78 | return(c(mse.est, se.mse))
79 | }
80 |
81 | for (k in 0:K) {
82 | mse[k+1, 1:2] <- trimmed.mse(n=n, m=m, k=k, p=1.0)
83 | mse[k+1, 3:4] <- trimmed.mse(n=n, m=m, k=k, p=.95)
84 | mse[k+1, 5:6] <- trimmed.mse(n=n, m=m, k=k, p=.9)
85 | }
86 | ```
87 |
88 | ## Example 7.4 (Confidence interval for variance)
89 |
90 | ```{r 7.4}
91 | n <- 20
92 | alpha <- .05
93 | x <- rnorm(n, mean=0, sd=2)
94 | UCL <- (n-1) * var(x) / qchisq(alpha, df=n-1)
95 | ```
96 |
97 | ## Example 7.5 (MC estimate of confidence level)
98 |
99 | ```{r 7.5}
100 | n <- 20
101 | alpha <- .05
102 | UCL <- replicate(1000, expr = {
103 | x <- rnorm(n, mean = 0, sd = 2)
104 | (n-1) * var(x) / qchisq(alpha, df = n-1)
105 | } )
106 | #count the number of intervals that contain sigma^2=4
107 | sum(UCL > 4)
108 | #or compute the mean to get the confidence level
109 | mean(UCL > 4)
110 | ```
111 |
112 | ## Example 7.6 (Empirical confidence level)
113 |
114 | ```{r 7.6}
115 | n <- 20
116 | alpha <- .05
117 | UCL <- replicate(1000, expr = {
118 | x <- rchisq(n, df = 2)
119 | (n-1) * var(x) / qchisq(alpha, df = n-1)
120 | } )
121 | sum(UCL > 4)
122 | mean(UCL > 4)
123 | ```
124 |
125 | ## Example 7.7 (Empirical Type I error rate)
126 |
127 | ```{r 7.7}
128 | n <- 20
129 | alpha <- .05
130 | mu0 <- 500
131 | sigma <- 100
132 |
133 | m <- 10000 #number of replicates
134 | p <- numeric(m) #storage for p-values
135 | for (j in 1:m) {
136 | x <- rnorm(n, mu0, sigma)
137 | ttest <- t.test(x, alternative = "greater", mu = mu0)
138 | p[j] <- ttest$p.value
139 | }
140 |
141 | p.hat <- mean(p < alpha)
142 | se.hat <- sqrt(p.hat * (1 - p.hat) / m)
143 | print(c(p.hat, se.hat))
144 | ```
145 |
146 | ## Example 7.8 (Skewness test of normality)
147 |
148 | ```{r 7.8}
149 | n <- c(10, 20, 30, 50, 100, 500) #sample sizes
150 | cv <- qnorm(.975, 0, sqrt(6/n)) #crit. values for each n
151 |
152 | sk <- function(x) {
153 | #computes the sample skewness coeff.
154 | xbar <- mean(x)
155 | m3 <- mean((x - xbar)^3)
156 | m2 <- mean((x - xbar)^2)
157 | return( m3 / m2^1.5 )
158 | }
159 |
160 | #n is a vector of sample sizes
161 | #we are doing length(n) different simulations
162 |
163 | p.reject <- numeric(length(n)) #to store sim. results
164 | m <- 10000 #num. repl. each sim.
165 |
166 | for (i in 1:length(n)) {
167 | sktests <- numeric(m) #test decisions
168 | for (j in 1:m) {
169 | x <- rnorm(n[i])
170 | #test decision is 1 (reject) or 0
171 | sktests[j] <- as.integer(abs(sk(x)) >= cv[i] )
172 | }
173 | p.reject[i] <- mean(sktests) #proportion rejected
174 | }
175 |
176 | p.reject
177 | ```
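178 | 
179 | The critical values above use the asymptotic result (added note) that under normality the sample skewness statistic is approximately $N(0,\, 6/n)$; Example 7.10 below replaces the variance $6/n$ with the exact finite-sample value $6(n-2) / ((n+1)(n+3))$.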
178 |
179 | ## Example 7.9 (Empirical power)
180 |
181 | ```{r 7.9}
182 | #set.seed(521)
183 | n <- 20
184 | m <- 1000
185 | mu0 <- 500
186 | sigma <- 100
187 | mu <- c(seq(450, 650, 10)) #alternatives
188 | M <- length(mu)
189 | power <- numeric(M)
190 | for (i in 1:M) {
191 | mu1 <- mu[i]
192 | pvalues <- replicate(m, expr = {
193 | #simulate under alternative mu1
194 | x <- rnorm(n, mean = mu1, sd = sigma)
195 | ttest <- t.test(x,
196 | alternative = "greater", mu = mu0)
197 | ttest$p.value } )
198 | power[i] <- mean(pvalues <= .05)
199 | }
200 |
201 | se <- sqrt(power * (1-power) / m)
202 | df <- data.frame(mean=mu, power=power, upper=power+2*se, lower=power-2*se)
203 |
204 | library(ggplot2)
205 | ggplot(df, aes(x=mean, y=power)) +
206 | geom_line() +
207 | geom_vline(xintercept=500, lty=2) +
208 | geom_hline(yintercept=c(0,.05), lty=1:2) +
209 | geom_errorbar(aes(ymin=lower, ymax=upper), width = 0.2, lwd=1.5)
210 | ```
211 |
212 | ## Example 7.10 (Power of the skewness test of normality)
213 |
214 | ```{r 7.10}
215 | #set.seed(111)
216 | alpha <- .1
217 | n <- 30
218 | m <- 2500
219 | epsilon <- c(seq(0, .15, .01), seq(.15, 1, .05))
220 | N <- length(epsilon)
221 | pwr <- numeric(N)
222 | #critical value for the skewness test
223 | cv <- qnorm(1-alpha/2, 0, sqrt(6*(n-2) / ((n+1)*(n+3))))
224 |
225 | for (j in 1:N) { #for each epsilon
226 | e <- epsilon[j]
227 | sktests <- numeric(m)
228 | for (i in 1:m) { #for each replicate
229 | sigma <- sample(c(1, 10), replace = TRUE,
230 | size = n, prob = c(1-e, e))
231 | x <- rnorm(n, 0, sigma)
232 | sktests[i] <- as.integer(abs(sk(x)) >= cv)
233 | }
234 | pwr[j] <- mean(sktests)
235 | }
236 |
237 | se <- sqrt(pwr * (1-pwr) / m)
238 | df <- data.frame(epsilon=epsilon, power=pwr, upper=pwr+2*se, lower=pwr-2*se)
239 |
240 | #plot power vs epsilon
241 | library(ggplot2)
242 | ggplot(df, aes(x=epsilon, y=power)) +
243 | geom_line() + labs(x=bquote(epsilon)) +
244 | geom_hline(yintercept=.1, lty=2) +
245 | geom_pointrange(aes(ymin=lower, ymax=upper))
246 | ```
247 |
248 | ## Example 7.11 (Power comparison of tests of normality)
249 |
250 | ```{r 7.11}
251 | #only one loop, for epsilon=0.1, was shown in the text
252 | #the simulation below takes several minutes to run
253 |
254 | # initialize input and output
255 | library(energy)
256 | alpha <- .1
257 | n <- 30
258 | m <- 500 #try small m for a trial run
259 | test1 <- test2 <- test3 <- numeric(m)
260 |
261 | #critical value for the skewness test
262 | cv <- qnorm(1-alpha/2, 0, sqrt(6*(n-2) / ((n+1)*(n+3))))
263 | sim <- matrix(0, 11, 4)
264 |
265 | # estimate power
266 | for (i in 0:10) {
267 | epsilon <- i * .1
268 | for (j in 1:m) {
269 | e <- epsilon
270 | sigma <- sample(c(1, 10), replace = TRUE,
271 | size = n, prob = c(1-e, e))
272 | x <- rnorm(n, 0, sigma)
273 | test1[j] <- as.integer(abs(sk(x)) >= cv)
274 | test2[j] <- as.integer(
275 | shapiro.test(x)$p.value <= alpha)
276 | test3[j] <- as.integer(
277 | mvnorm.etest(x, R=200)$p.value <= alpha)
278 | }
279 | print(c(epsilon, mean(test1), mean(test2), mean(test3)))
280 | sim[i+1, ] <- c(epsilon, mean(test1), mean(test2), mean(test3))
281 | }
282 | detach(package:energy)
283 |
284 | # plot the empirical estimates of power
285 | plot(sim[,1], sim[,2], ylim = c(0, 1), type = "l",
286 | xlab = bquote(epsilon), ylab = "power")
287 | lines(sim[,1], sim[,3], lty = 2)
288 | lines(sim[,1], sim[,4], lty = 4)
289 | abline(h = alpha, lty = 3)
290 | legend("topright", 1, c("skewness", "S-W", "energy"),
291 | lty = c(1,2,4), inset = .02)
292 | ```
293 |
294 | ## Example 7.12 (Count Five test statistic)
295 |
296 | ```{r 7.12}
297 | x1 <- rnorm(20, 0, sd = 1)
298 | x2 <- rnorm(20, 0, sd = 1.5)
299 | y <- c(x1, x2)
300 |
301 | group <- rep(1:2, each = length(x1))
302 | boxplot(y ~ group, boxwex = .3, xlim = c(.5, 2.5), main = "")
303 | points(group, y)
304 |
305 | # now identify the extreme points
306 | range(x1)
307 | range(x2)
308 |
309 | i <- which(x1 < min(x2))
310 | j <- which(x2 > max(x1))
311 |
312 | x1[i]
313 | x2[j]
314 |
315 | out1 <- sum(x1 > max(x2)) + sum(x1 < min(x2))
316 | out2 <- sum(x2 > max(x1)) + sum(x2 < min(x1))
317 | max(c(out1, out2))
318 | ```
319 |
320 | ## Example 7.13 (Count Five test statistic, cont.)
321 |
322 | ```{r 7.13}
323 | maxout <- function(x, y) {
324 | X <- x - mean(x)
325 | Y <- y - mean(y)
326 | outx <- sum(X > max(Y)) + sum(X < min(Y))
327 | outy <- sum(Y > max(X)) + sum(Y < min(X))
328 | return(max(c(outx, outy)))
329 | }
330 |
331 | n1 <- n2 <- 20
332 | mu1 <- mu2 <- 0
333 | sigma1 <- sigma2 <- 1
334 | m <- 1000
335 |
336 | # generate samples under H0
337 | stat <- replicate(m, expr={
338 | x <- rnorm(n1, mu1, sigma1)
339 | y <- rnorm(n2, mu2, sigma2)
340 | maxout(x, y)
341 | })
342 | print(cumsum(table(stat)) / m)
343 | print(quantile(stat, c(.8, .9, .95)))
344 | ```
345 |
346 | ## Example 7.14 (Count Five test)
347 |
348 | ```{r 7.14}
349 | count5test <- function(x, y) {
350 | X <- x - mean(x)
351 | Y <- y - mean(y)
352 | outx <- sum(X > max(Y)) + sum(X < min(Y))
353 | outy <- sum(Y > max(X)) + sum(Y < min(X))
354 | # return 1 (reject) or 0 (do not reject H0)
355 | return(as.integer(max(c(outx, outy)) > 5))
356 | }
357 |
358 | n1 <- n2 <- 20
359 | mu1 <- mu2 <- 0
360 | sigma1 <- sigma2 <- 1
361 | m <- 10000
362 | tests <- replicate(m, expr = {
363 | x <- rnorm(n1, mu1, sigma1)
364 | y <- rnorm(n2, mu2, sigma2)
365 | x <- x - mean(x) #centered by sample mean
366 | y <- y - mean(y)
367 | count5test(x, y)
368 | } )
369 |
370 | alphahat <- mean(tests)
371 | print(alphahat)
372 | ```
373 |
374 | ## Example 7.15 (Count Five test, cont.)
375 |
376 | ```{r 7.15}
377 | n1 <- 20
378 | n2 <- 30
379 | mu1 <- mu2 <- 0
380 | sigma1 <- sigma2 <- 1
381 | m <- 10000
382 |
383 | alphahat <- mean(replicate(m, expr={
384 | x <- rnorm(n1, mu1, sigma1)
385 | y <- rnorm(n2, mu2, sigma2)
386 | x <- x - mean(x) #centered by sample mean
387 | y <- y - mean(y)
388 | count5test(x, y)
389 | }))
390 |
391 | print(alphahat)
392 | ```
393 |
394 | ## Example 7.16 (Count Five, cont.)
395 |
396 | ```{r 7.16}
397 | # generate samples under H1 to estimate power
398 | sigma1 <- 1
399 | sigma2 <- 1.5
400 |
401 | power <- mean(replicate(m, expr={
402 | x <- rnorm(20, 0, sigma1)
403 | y <- rnorm(20, 0, sigma2)
404 | count5test(x, y)
405 | }))
406 |
407 | print(power)
408 | ```
409 |
410 |
--------------------------------------------------------------------------------
/examples-Rmd/SCR2e-chNumerical.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Code for SCR 2e Examples"
3 | subtitle: "M. L. Rizzo (2019). *Statistical Computing with R, 2nd ed.*, Chapman & Hall/CRC. https://github.com/mariarizzo/SCR2e"
4 | output:
5 | html_document:
6 | code_folding: show
7 | theme: lumen
8 | toc: yes
9 | toc_float: yes
10 | df_print: paged
11 | ---
12 |
13 | # Chapter 13: Introduction to Numerical Methods in R
14 |
15 | ## Example 13.1 (Identical and nearly equal)
16 |
17 | ```{r }
18 | isTRUE(all.equal(.2, .3 - .1))
19 | all.equal(.2, .3) #not a logical value
20 | isTRUE(all.equal(.2, .3)) #always a logical value
21 |
22 | x <- 1:4
23 | y <- 2
24 | y == 2
25 | x == y #not necessarily a single logical value
26 | identical(x, y) #always a single logical value
27 | identical(y, 2)
28 | ```
29 |
30 | ## Example 13.2 (Ratio of two large numbers)
31 |
32 | ```{r }
33 | n <- 400
34 | (gamma((n-1)/2) / (sqrt(pi) * gamma((n-2)/2)))
35 | exp(lgamma((n-1)/2) - lgamma((n-2)/2)) / sqrt(pi)
36 | ```
37 |
38 | ## Example 13.3 (Taylor expansion)
39 |
40 | ```{r }
41 | system.time({
42 | for (i in 1:1000) {
43 | a <- rep(0, 24)
44 | a0 <- pi / 6
45 | a2 <- a0 * a0
46 | a[1] <- -a0^3 / 6
47 | for (i in 2:24)
48 | a[i] <- - a2 * a[i-1] / ((2*i+1)*(2*i))
49 | a0 + sum(a)}
50 | })
51 |
52 | system.time({
53 | for (i in 1:1000) {
54 | K <- 2 * (0:24) + 1
55 | i <- rep(c(1, -1), length=25)
56 | sum(i * (pi/6)^K / factorial(K))}
57 | })
58 | ```
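59 | 
60 | Both versions above evaluate the same Maclaurin series (added note),
61 | 
62 | $$\sin x = \sum_{k=0}^{\infty} \frac{(-1)^k x^{2k+1}}{(2k+1)!},$$
63 | 
64 | at $x = \pi/6$, so each sum should return $\sin(\pi/6) = 0.5$; the second version builds all the terms at once instead of accumulating them by recursion.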
59 |
60 | ## Example 13.4 (Derivative of zeta function)
61 |
62 | ```{r }
63 | zeta.deriv <- function(a) {
64 | z <- a - 1
65 | # Stieltjes constants gamma_k for k=1:5
66 | g <- c(
67 | -.7281584548367672e-1,
68 | -.9690363192872318e-2,
69 | .2053834420303346e-2,
70 | .2325370065467300e-2,
71 | .7933238173010627e-3)
72 | i <- c(-1, 1, -1, 1, -1)
73 | n <- 0:4
74 | -1/z^2 + sum(i * g * z^n / factorial(n))
75 | }
76 | ```
77 |
78 | ## Example 13.5 (Derivative of zeta function, cont.)
79 |
80 | ```{r }
81 | library(gsl) #for zeta function
82 | z <- c(1.001, 1.01, 1.5, 2, 3, 5)
83 | h <- .Machine$double.eps^0.5
84 | dz <- dq <- rep(0, length(z))
85 | for (i in 1:length(z)) {
86 | v <- z[i] + h
87 | h <- v - z[i]
88 | a0 <- z[i] - h
89 | if (a0 < 1) a0 <- (1 + z[i])/2
90 | a1 <- z[i] + h
91 | dq[i] <- (zeta(a1) - zeta(a0)) / (a1 - a0)
92 | dz[i] <- zeta.deriv(z[i])
93 | }
94 |
95 | h
96 |
97 | cbind(z, dz, dq)
98 | ```
99 |
100 | ## Example 13.6 (Solving f(x)=0)
101 |
102 | ```{r }
103 | f <- function(y, a, n) {
104 | a^2 + y^2 + 2*a*y/(n-1) - (n-2)
105 | }
106 |
107 | a <- 0.5
108 | n <- 20
109 | b0 <- 0
110 | b1 <- 5*n
111 |
112 | #solve using bisection
113 | it <- 0
114 | eps <- .Machine$double.eps^0.25
115 | r <- seq(b0, b1, length=3)
116 | y <- c(f(r[1], a, n), f(r[2], a, n), f(r[3], a, n))
117 | if (y[1] * y[3] > 0)
118 | stop("f does not have opposite sign at endpoints")
119 |
120 | while(it < 1000 && abs(y[2]) > eps) {
121 | it <- it + 1
122 | if (y[1]*y[2] < 0) {
123 | r[3] <- r[2]
124 | y[3] <- y[2]
125 | } else {
126 | r[1] <- r[2]
127 | y[1] <- y[2]
128 | }
129 | r[2] <- (r[1] + r[3]) / 2
130 | y[2] <- f(r[2], a=a, n=n)
131 | print(c(r[1], y[1], y[3]-y[2]))
132 | }
133 | ```
134 |
135 | ## Example 13.7 (Solving f(x)=0 with Brent's method: uniroot)
136 |
137 | ```{r }
138 | a <- 0.5
139 | n <- 20
140 | out <- uniroot(function(y) {
141 | a^2 + y^2 + 2*a*y/(n-1) - (n-2) },
142 | lower = 0, upper = n*5)
143 | unlist(out)
144 | uniroot(function(y) {a^2 + y^2 + 2*a*y/(n-1) - (n-2)},
145 | interval = c(-n*5, 0))$root
146 | ```
147 |
148 | ## Example 13.8 (Numerical integration with integrate)
149 |
150 | ```{r }
151 | f <- function(y, N, r, rho) {
152 | (cosh(y) - rho * r)^(1 - N)
153 | }
154 | integrate(f, lower=0, upper=Inf,
155 | rel.tol=.Machine$double.eps^0.25,
156 | N=10, r=0.5, rho=0.2)
157 |
158 | ro <- seq(-.99, .99, .01)
159 | v <- rep(0, length(ro))
160 | for (i in 1:length(ro)) {
161 | v[i] <- integrate(f, lower=0, upper=Inf,
162 | rel.tol=.Machine$double.eps^0.25,
163 | N=10, r=0.5, rho=ro[i])$value
164 | }
165 | plot(ro, v, type="l", xlab=expression(rho),
166 | ylab="Integral Value (n=10, r=0.5)")
167 | ```
168 |
169 | ## Example 13.9 (Density of sample correlation coefficient)
170 |
171 | ```{r }
172 | .dcorr <- function(r, N, rho=0) {
173 | # compute the density function of sample correlation
174 | if (abs(r) > 1 || abs(rho) > 1) return (0)
175 | if (N < 4) return (NA)
176 |
177 | if (isTRUE(all.equal(rho, 0.0))) {
178 | a <- exp(lgamma((N - 1)/2) - lgamma((N - 2)/2)) /
179 | sqrt(pi)
180 | return (a * (1 - r^2)^((N - 4)/2))
181 | }
182 |
183 | # if rho not 0, need to integrate
184 | f <- function(w, R, N, rho)
185 | (cosh(w) - rho * R)^(1 - N)
186 |
187 | #need to insert some error checking here
188 | i <- integrate(f, lower=0, upper=Inf,
189 | R=r, N=N, rho=rho)$value
190 | c1 <- (N - 2) * (1 - rho^2)^((N - 1)/2)
191 | c2 <- (1 - r^2)^((N - 4) / 2) / pi
192 | return(c1 * c2 * i)
193 | }
194 |
195 | r <- as.matrix(seq(-1, 1, .01))
196 | d1 <- apply(r, 1, .dcorr, N=10, rho=.0)
197 | d2 <- apply(r, 1, .dcorr, N=10, rho=.5)
198 | d3 <- apply(r, 1, .dcorr, N=10, rho=-.5)
199 | plot(r, d2, type="l", lty=2, lwd=2, ylab="density")
200 | lines(r, d1, lwd=2)
201 | lines(r, d3, lty=4, lwd=2)
202 | legend("top", inset=.02,
203 | c("rho = 0", "rho = 0.5", "rho = -0.5"), lty=c(1,2,4), lwd=2)
204 | ```
205 |
206 | ## Example 13.10 (MLE using mle)
207 |
208 | ```{r }
209 | #the observed sample
210 | y <- c(0.04304550, 0.50263474)
211 |
212 | mlogL <- function(theta=1) {
213 | #minus log-likelihood of exp. density, rate theta
214 | return( - (length(y) * log(theta) - theta * sum(y)))
215 | }
216 |
217 | library(stats4)
218 | fit <- mle(mlogL)
219 | summary(fit)
220 |
221 | # Alternatively, the initial value for the optimizer could
222 | # be supplied in the call to mle; two examples are
223 |
224 | mle(mlogL, start=list(theta=1))
225 | mle(mlogL, start=list(theta=mean(y)))
226 | ```
227 |
228 | ## Application: Evaluating an Expected Value
229 |
230 | ```{r }
231 | y <- VGAM::rpareto(1000, scale=1, shape=3)
232 | hist(y, prob=TRUE, breaks="scott", main="", ylim=c(0,3))
233 | curve(VGAM::dpareto(x, scale=1, shape=3), add=TRUE)
234 |
235 | a <- 1
236 | s <- 1
237 | b <- 0.5
238 | ```
239 |
240 | ## Example 13.11 (Numerical integration)
241 |
242 | ```{r }
243 | f1 <- function(x, y, s, a, b) {
244 | # the integrand function: |y-x|^b f(x)
245 | (abs(y-x))^b * a * s^a / x^(a+1)
246 | }
247 |
248 | integrate(f1, lower=s, upper=Inf, y=2, s=1, a=1, b=b)
249 | ```
250 |
251 | ## Example 13.12 (Direct evaluation)
252 |
253 | ```{r }
254 | g2 <- function(y, s, b) {
255 | # Compute E|y-X|^b for Pareto(I) with a=1
256 | y0 <- (y - s)/y
257 | c0 <- gsl::hyperg_2F1(b+1, 1, b+2, y0)
258 | ((y - s)^b) - s*b*(y^(b-1)) *
259 | ((y0^b)/b + c0*(y0^(b+1))/(b+1)) +
260 | s*y^(b-1) * beta(b+1,1-b)
261 | }
262 |
263 | g2(2, s, b)
264 | ```
265 |
266 | ## Example 13.13 (Plot of expected distance function)
267 |
268 | ```{r }
269 | p <- c(ppoints(50)/100, ppoints(50))
270 | x <- VGAM::qpareto(p, scale=s, shape=1)
271 | ex <- g2(x, s=s, b=b)
272 | plot(log(x), log(ex), cex=.25, type="l")
273 | for (i in 1:length(x)) {
274 | y <- x[i]
275 | zi <- integrate(f1, lower=s, upper=Inf,
276 | subdivisions=200, rel.tol=.Machine$double.eps^.5,
277 | stop.on.error=FALSE, y=y, s=s, a=1, b=b)
278 | if (zi$message == "OK")
279 | points(log(y), log(zi$value), col=2, cex=.25)
280 |     else print(paste("y=", y, zi$message))
281 | }
282 |
283 | integrate(f1, lower=s, upper=Inf, y=s, s=s, a=1, b=b)
284 | g2(s, s, b)
285 | ```
286 |
287 |
--------------------------------------------------------------------------------
/examples-Rmd/SCR2e-chOptim.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Code for SCR 2e Examples"
3 | subtitle: "M. L. Rizzo (2019). *Statistical Computing with R, 2nd ed.*, Chapman & Hall/CRC. https://github.com/mariarizzo/SCR2e"
4 | output:
5 | html_document:
6 | code_folding: show
7 | theme: lumen
8 | toc: yes
9 | toc_float: yes
10 | df_print: paged
11 | ---
12 |
13 | # Chapter 14: Optimization
14 |
15 | ## Example 14.1 (One-dimensional optimization with optimize)
16 |
17 | ```{r }
18 | f <- function(x)
19 | log(x + log(x))/log(1+x)
20 |
21 | curve(f(x), from = 2, to = 15, ylab = "f(x)")
22 |
23 | optimize(f, lower = 4, upper = 8, maximum = TRUE)
24 | ```
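
A crude grid search (added here as a sanity check) should land near the maximizer reported by optimize:

```{r }
xs <- seq(4, 8, .001)
xs[which.max(f(xs))]
```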
25 |
26 | ## Example 14.2 (MLE: Gamma distribution)
27 |
28 | ```{r }
29 | m <- 20000
30 | est <- matrix(0, m, 2)
31 | n <- 200
32 | r <- 5
33 | lambda <- 2
34 |
35 | obj <- function(lambda, xbar, logx.bar) {
36 | digamma(lambda * xbar) - logx.bar - log(lambda)
37 | }
38 |
39 | for (i in 1:m) {
40 | x <- rgamma(n, shape=r, rate=lambda)
41 | xbar <- mean(x)
42 | u <- uniroot(obj, lower = .001, upper = 10e5,
43 | xbar = xbar, logx.bar = mean(log(x)))
44 | lambda.hat <- u$root
45 | r.hat <- xbar * lambda.hat
46 | est[i, ] <- c(r.hat, lambda.hat)
47 | }
48 |
49 | ML <- colMeans(est)
50 |
51 | hist(est[, 1], breaks="scott", freq=FALSE,
52 | xlab="r", main="")
53 | points(ML[1], 0, cex=1.5, pch=20)
54 | hist(est[, 2], breaks="scott", freq=FALSE,
55 | xlab=bquote(lambda), main="")
56 | points(ML[2], 0, cex=1.5, pch=20)
57 | ```
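
The replicated estimates also provide empirical standard errors for the two estimators (an addition, not shown in the text):

```{r }
apply(est, 2, sd)
```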
58 |
59 | ## Example 14.3 (Two-dimensional optimization with optim)
60 |
61 | ```{r }
62 | LL <- function(theta, sx, slogx, n) {
63 | r <- theta[1]
64 | lambda <- theta[2]
65 | loglik <- n * r * log(lambda) + (r - 1) * slogx -
66 | lambda * sx - n * log(gamma(r))
67 |   return(-loglik)
68 | }
69 |
70 | n <- 200
71 | r <- 5; lambda <- 2
72 | x <- rgamma(n, shape=r, rate=lambda)
73 |
74 | optim(c(1,1), LL, sx=sum(x), slogx=sum(log(x)), n=n)
75 |
76 | mlests <- replicate(20000, expr = {
77 | x <- rgamma(200, shape = 5, rate = 2)
78 | optim(c(1,1), LL, sx=sum(x), slogx=sum(log(x)), n=n)$par
79 | })
80 | colMeans(t(mlests))
81 | ```
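
A variant of LL (a sketch, not the book's code) that uses lgamma in place of log(gamma(r)); this avoids overflow of gamma(r) for large r and gives the same fit here:

```{r }
LL2 <- function(theta, sx, slogx, n) {
  r <- theta[1]
  lambda <- theta[2]
  loglik <- n * r * log(lambda) + (r - 1) * slogx -
    lambda * sx - n * lgamma(r)
  return(-loglik)
}
optim(c(1, 1), LL2, sx = sum(x), slogx = sum(log(x)), n = n)$par
```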
82 |
83 | ## Example 14.4 (MLE for a quadratic form)
84 |
85 | ```{r }
86 | LL <- function(lambda, y) {
87 | lambda3 <- 1 - sum(lambda)
88 | f1 <- dgamma(y, shape=1/2, rate=1/(2*lambda[1]))
89 | f2 <- dgamma(y, shape=1/2, rate=1/(2*lambda[2]))
90 | f3 <- dgamma(y, shape=1/2, rate=1/(2*lambda3))
91 | f <- f1/3 + f2/3 + f3/3 #density of mixture
92 | #returning -loglikelihood
93 | return( -sum(log(f)))
94 | }
95 |
96 | set.seed(543)
97 | m <- 2000
98 | lambda <- c(.6, .25, .15) #rate is 1/(2lambda)
99 | lam <- sample(lambda, size = 2000, replace = TRUE)
100 | y <- rgamma(m, shape = .5, rate = 1/(2*lam))
101 |
102 | opt <- optim(c(.5,.3), LL, y=y)
103 | theta <- c(opt$par, 1 - sum(opt$par))
104 |
105 | as.data.frame(unlist(opt))
106 |
107 | theta
108 | ```
109 |
110 | ## Example 14.5 (EM algorithm for a mixture model)
111 |
112 | ```{r }
113 | set.seed(543)
114 | lambda <- c(.6, .25, .15) #rate is 1/(2lambda)
115 | lam <- sample(lambda, size = 2000, replace = TRUE)
116 | y <- rgamma(m, shape = .5, rate = 1/(2*lam))
117 |
118 | N <- 10000 #max. number of iterations
119 | L <- c(.5, .4, .1) #initial est. for lambdas
120 | tol <- .Machine$double.eps^0.5
121 | L.old <- L + 1
122 |
123 | for (j in 1:N) {
124 | f1 <- dgamma(y, shape=1/2, rate=1/(2*L[1]))
125 | f2 <- dgamma(y, shape=1/2, rate=1/(2*L[2]))
126 | f3 <- dgamma(y, shape=1/2, rate=1/(2*L[3]))
127 | py <- f1 / (f1 + f2 + f3) #posterior prob y from 1
128 | qy <- f2 / (f1 + f2 + f3) #posterior prob y from 2
129 | ry <- f3 / (f1 + f2 + f3) #posterior prob y from 3
130 |
131 | mu1 <- sum(y * py) / sum(py) #update means
132 | mu2 <- sum(y * qy) / sum(qy)
133 | mu3 <- sum(y * ry) / sum(ry)
134 | L <- c(mu1, mu2, mu3) #update lambdas
135 | L <- L / sum(L)
136 |
137 | if (sum(abs(L - L.old)/L.old) < tol) break
138 | L.old <- L
139 | }
140 |
141 | print(list(lambda = L/sum(L), iter = j, tol = tol))
142 | ```
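
Because the same seed and data-generating settings are used as in Example 14.4, the EM estimate can be set beside the direct optim solution theta from that example (an added comparison):

```{r }
rbind(EM = L, optim = theta)
```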
143 |
144 | ## Example 14.6 (Simplex algorithm)
145 |
146 | ```{r }
147 | library(boot) #for simplex function
148 | A1 <- rbind(c(-2, 1, 1), c(4, -1, 3))
149 | b1 <- c(1, 3)
150 | a <- c(2, 2, 3)
151 | simplex(a = a, A1 = A1, b1 = b1, maxi = TRUE)
152 | detach(package:boot)
153 | ```
154 |
155 | ## Example 14.7 (Solving the Morra game)
156 |
157 | ```{r }
158 | solve.game <- function(A) {
159 | #solve the two player zero-sum game by simplex method
160 | #optimize for player 1, then player 2
161 | #maximize v subject to ...
162 | #let x strategies 1:m, and put v as extra variable
163 | #A1, the <= constraints
164 | #
165 | min.A <- min(A)
166 | A <- A - min.A #so that v >= 0
167 | max.A <- max(A)
168 | A <- A / max(A)
169 | m <- nrow(A)
170 | n <- ncol(A)
171 | it <- n^3
172 | a <- c(rep(0, m), 1) #objective function
173 | A1 <- -cbind(t(A), rep(-1, n)) #constraints <=
174 | b1 <- rep(0, n)
175 | A3 <- t(as.matrix(c(rep(1, m), 0))) #constraints sum(x)=1
176 | b3 <- 1
177 | sx <- simplex(a=a, A1=A1, b1=b1, A3=A3, b3=b3,
178 | maxi=TRUE, n.iter=it)
179 | #the 'solution' is [x1,x2,...,xm | value of game]
180 | #
181 | #minimize v subject to ...
182 | #let y strategies 1:n, with v as extra variable
183 | a <- c(rep(0, n), 1) #objective function
184 | A1 <- cbind(A, rep(-1, m)) #constraints <=
185 | b1 <- rep(0, m)
186 | A3 <- t(as.matrix(c(rep(1, n), 0))) #constraints sum(y)=1
187 | b3 <- 1
188 | sy <- simplex(a=a, A1=A1, b1=b1, A3=A3, b3=b3,
189 | maxi=FALSE, n.iter=it)
190 |
191 | soln <- list("A" = A * max.A + min.A,
192 | "x" = sx$soln[1:m],
193 | "y" = sy$soln[1:n],
194 | "v" = sx$soln[m+1] * max.A + min.A)
195 | soln
196 | }
197 |
198 |
199 | #enter the payoff matrix
200 | A <- matrix(c( 0,-2,-2,3,0,0,4,0,0,
201 | 2,0,0,0,-3,-3,4,0,0,
202 | 2,0,0,3,0,0,0,-4,-4,
203 | -3,0,-3,0,4,0,0,5,0,
204 | 0,3,0,-4,0,-4,0,5,0,
205 | 0,3,0,0,4,0,-5,0,-5,
206 | -4,-4,0,0,0,5,0,0,6,
207 | 0,0,4,-5,-5,0,0,0,6,
208 | 0,0,4,0,0,5,-6,-6,0), 9, 9)
209 |
210 | library(boot) #needed for simplex function
211 |
212 | s <- solve.game(A)
213 | round(cbind(s$x, s$y), 7)
214 | ```
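
Two added checks: each mixed strategy should sum to 1, and the value of this symmetric zero-sum game should be numerically 0.

```{r }
round(c(sum.x = sum(s$x), sum.y = sum(s$y), value = s$v), 7)
```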
215 |
216 |
--------------------------------------------------------------------------------
/examples-Rmd/SCR2e-chPerm.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Code for SCR 2e Examples"
3 | subtitle: "M. L. Rizzo (2019). *Statistical Computing with R, 2nd ed.*, Chapman & Hall/CRC. https://github.com/mariarizzo/SCR2e"
4 | output:
5 | html_document:
6 | code_folding: show
7 | theme: lumen
8 | toc: yes
9 | toc_float: yes
10 | df_print: paged
11 | ---
12 |
13 | # Chapter 10: Permutation Tests
14 |
15 | ## Example 10.1 (Permutation distribution of a statistic)
16 |
17 | ```{r }
18 | attach(chickwts)
19 | x <- sort(weight[feed == "soybean"])
20 | y <- sort(weight[feed == "linseed"])
21 | detach(chickwts)
22 |
23 | R <- 999 #number of replicates
24 | z <- c(x, y) #pooled sample
25 | K <- 1:26
26 | reps <- numeric(R) #storage for replicates
27 | t0 <- t.test(x, y)$statistic
28 |
29 | for (i in 1:R) {
30 | #generate indices k for the first sample
31 | k <- sample(K, size = 14, replace = FALSE)
32 | x1 <- z[k]
33 | y1 <- z[-k] #complement of x1
34 | reps[i] <- t.test(x1, y1)$statistic
35 | }
36 | p <- mean(c(t0, reps) >= t0)
37 | p
38 |
39 | hist(reps, main = "", freq = FALSE, xlab = "T (p = 0.202)",
40 | breaks = "scott")
41 | points(t0, 0, cex = 1, pch = 16) #observed T
42 | ```
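
Sampling R = 999 permutations is used because full enumeration is impractical here: the number of ways to choose the indices of the first sample is nearly 10 million (an added note).

```{r }
choose(26, 14)
```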
43 |
44 | ## Example 10.2 (Permutation distribution of the K-S statistic)
45 |
46 | ```{r }
47 | # continues Example 10.1
48 | R <- 999 #number of replicates
49 | z <- c(x, y) #pooled sample
50 | K <- 1:26
51 | D <- numeric(R) #storage for replicates
52 | options(warn = -1)
53 | D0 <- ks.test(x, y, exact = FALSE)$statistic
54 | for (i in 1:R) {
55 | #generate indices k for the first sample
56 | k <- sample(K, size = 14, replace = FALSE)
57 | x1 <- z[k]
58 | y1 <- z[-k] #complement of x1
59 | D[i] <- ks.test(x1, y1, exact = FALSE)$statistic
60 | }
61 | p <- mean(c(D0, D) >= D0)
62 | options(warn = 0)
63 | p
64 |
65 | hist(D, main = "", freq = FALSE, xlab = "D (p = 0.46)",
66 | breaks = "scott")
67 | points(D0, 0, cex = 1, pch = 16) #observed D
68 | ```
69 |
70 | ## Example 10.3 (Example 10.2, cont.)
71 |
72 | ```{r }
73 | attach(chickwts)
74 | x <- sort(weight[feed == "sunflower"])
75 | y <- sort(weight[feed == "linseed"])
76 | detach(chickwts)
77 | z <- c(x, y)  #re-pool the new samples: sunflower (12) and linseed (12)
78 | summary(cbind(x, y))
79 | options(warn = -1)
80 | D0 <- ks.test(x, y, exact = FALSE)$statistic
81 | for (i in 1:R) {
82 | #generate indices k for the first sample
83 |   k <- sample(1:24, size = 12, replace = FALSE)
84 | x1 <- z[k]
85 | y1 <- z[-k] #complement of x1
86 | D[i] <- ks.test(x1, y1, exact = FALSE)$statistic
87 | }
88 | p <- mean(c(D0, D) >= D0)
89 | options(warn = 0)
90 | p
91 | ```
92 |
93 | ## Example 10.4 (Finding nearest neighbors)
94 | ## using yaImpute::ann
95 |
96 | ```{r }
97 | set.seed(439)
98 | library(yaImpute) #for ann function
99 |
100 | #generate a small multivariate data set
101 | x <- matrix(rnorm(12), 3, 4)
102 | y <- matrix(rnorm(12), 3, 4)
103 | z <- rbind(x, y)
104 | k <- nrow(z) #number of nearest neighbors desired
105 |
106 | ## Do an exact kd-tree search
107 | kd.exact <- ann(ref=z, target=z,
108 | tree.type="kd", k=k, verbose=FALSE)
109 | kd.exact$knnIndexDist[,1:k] #NN indices
110 | round(sqrt(kd.exact$knnIndexDist[,-(1:k)]),2) #Euclidean distances
111 |
112 | ## Do an approximate kd-tree search
113 | kd.approx <- ann(ref=z, target=z,
114 | tree.type="kd", k=k, eps=100, verbose=FALSE)
115 | kd.approx$knnIndexDist[,1:k] #NN indices
116 | detach(package:yaImpute)
117 | ```
118 | 
119 | ## Example 10.5 (Nearest neighbor statistic)
120 | ## using yaImpute::ann
121 | ```{r }
122 | library(yaImpute)
123 | attach(chickwts)
124 | x <- weight[feed == "sunflower"]
125 | y <- weight[feed == "linseed"]
126 | z <- as.matrix(c(x, y))
127 | detach(chickwts)
128 |
129 | k <- 4 #want first 3 nearest neighbors
130 | NN <- ann(ref=z, target=z, tree.type="kd", k=k, verbose=FALSE)
131 | idx <- NN$knnIndexDist[,1:k]
132 | nn.idx <- idx[,-1] #first NN is in column 2
133 |
134 | block1 <- nn.idx[1:12, ]
135 | block2 <- nn.idx[13:24, ]
136 | i1 <- sum(block1 < 12.5)
137 | i2 <- sum(block2 > 12.5)
138 |
139 | c(i1, i2)
140 |
141 | detach(package:yaImpute)
142 | ```
143 |
144 | ## Example 10.6 (Nearest neighbor test)
145 | ## using yaImpute::ann
146 |
147 | ```{r }
148 | library(boot)
149 | #continues the previous example
150 |
151 | ## function to return the matrix of indices NN_j of nearest neighbors
152 | NN.idx <- function(x, tree.type="kd", k=NROW(x)) {
153 | x <- as.matrix(x)
154 | k <- min(c(k+1, NROW(x)))
155 | NN <- yaImpute::ann(ref=x, target=x,
156 | tree.type="kd", k=k, verbose=FALSE)
157 | idx <- NN$knnIndexDist[,1:k]
158 | nn.idx <- idx[,-1] #first NN is in column 2
159 | row.names(nn.idx) <- idx[,1]
160 | nn.idx
161 | }
162 |
163 | ## function to compute the NN statistic T(n,3)
164 | Tn3 <- function(z, ix=1:NROW(z), sizes) {
165 | z <- as.matrix(z)
166 | n1 <- sizes[1]
167 | n2 <- sizes[2]
168 | n <- n1 + n2
169 | z <- as.matrix(z[ix, ])
170 | nn.idx <- NN.idx(z, k=3)
171 | block1 <- nn.idx[1:n1, ]
172 | block2 <- nn.idx[(n1+1):n, ]
173 | i1 <- sum(block1 < n1 + .5)
174 | i2 <- sum(block2 > n1 + .5)
175 | return((i1 + i2) / (3 * n))
176 | }
177 |
178 | attach(chickwts)
179 | x <- weight[feed == "sunflower"]
180 | y <- weight[feed == "linseed"]
181 | z <- c(x, y)
182 | detach(chickwts)
183 |
184 | N <- c(NROW(x), NROW(y))
185 |
186 | boot.obj <- boot(data = z, statistic = Tn3,
187 | sim = "permutation", R = 999, sizes = N)
188 | boot.obj
189 |
190 | tb <- c(boot.obj$t, boot.obj$t0)
191 | mean(tb >= boot.obj$t0)
192 |
193 | hist(tb, freq=FALSE, main="",
194 | xlab="replicates of T(n,3) statistic")
195 | points(boot.obj$t0, 0, cex=1, pch=16)
196 | ```
197 |
198 | ## Example 10.7 (Two-sample energy statistic)
199 |
200 | ```{r }
201 | edist.2 <- function(x, ix, sizes) {
202 | # computes the e-statistic between 2 samples
203 | # x: Euclidean distances of pooled sample
204 | # sizes: vector of sample sizes
205 | # ix: a permutation of row indices of x
206 |
207 | dst <- x
208 | n1 <- sizes[1]
209 | n2 <- sizes[2]
210 | ii <- ix[1:n1]
211 | jj <- ix[(n1+1):(n1+n2)]
212 | w <- n1 * n2 / (n1 + n2)
213 |
214 | # permutation applied to rows & cols of dist. matrix
215 | m11 <- sum(dst[ii, ii]) / (n1 * n1)
216 | m22 <- sum(dst[jj, jj]) / (n2 * n2)
217 | m12 <- sum(dst[ii, jj]) / (n1 * n2)
218 | e <- w * ((m12 + m12) - (m11 + m22))
219 | return (e)
220 | }
221 |
222 | d <- 3
223 | a <- 2 / sqrt(d)
224 | x <- matrix(rnorm(20 * d), nrow = 20, ncol = d)
225 | y <- matrix(rnorm(10 * d, a, 1), nrow = 10, ncol = d)
226 | z <- rbind(x, y)
227 | dst <- as.matrix(dist(z))
228 |
229 | edist.2(dst, 1:30, sizes = c(20, 10))
230 | ```
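
If the energy package is installed, the value can be cross-checked against energy::edist, which computes the same two-sample e-distance (an added check):

```{r }
energy::edist(dst, sizes = c(20, 10), distance = TRUE)
```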
231 |
232 | ## Example 10.8 (Two-sample energy test)
233 |
234 | ```{r }
235 | library(boot) #for boot function
236 | dst <- as.matrix(dist(z))
237 | N <- c(20, 10)
238 |
239 | boot.obj <- boot(data = dst, statistic = edist.2,
240 | sim = "permutation", R = 999, sizes = N)
241 | boot.obj
242 |
243 | #calculate the ASL
244 | e <- boot.obj$t0
245 | tb <- c(e, boot.obj$t)
246 | mean(tb >= e)
247 |
248 | hist(tb, main = "", breaks="scott", freq=FALSE,
249 | xlab="Replicates of e")
250 | points(e, 0, cex=1, pch=16)
251 |
252 |
253 | #energy test applied under F=G
254 | d <- 3
255 | a <- 0
256 | x <- matrix(rnorm(20 * d), nrow = 20, ncol = d)
257 | y <- matrix(rnorm(10 * d, a, 1), nrow = 10, ncol = d)
258 | z <- rbind(x, y)
259 | dst <- as.matrix(dist(z))
260 |
261 | N <- c(20, 10)
262 | dst <- as.matrix(dist(z))
263 | boot.obj <- boot(data = dst, statistic = edist.2,
264 | sim="permutation", R=999, sizes=N)
265 | boot.obj
266 |
267 | #calculate the ASL
268 | e <- boot.obj$t0
269 | E <- c(boot.obj$t, e)
270 | mean(E >= e)
271 |
272 | hist(E, main = "", breaks="scott",
273 | xlab="Replicates of e", freq=FALSE)
274 | points(e, 0, cex=1, pch=16)
275 | ```
276 |
277 | ## Example 10.9 (k-sample energy distances)
278 |
279 | ```{r }
280 | z <- iris[ , 1:4]
281 | dst <- dist(z)
282 | energy::edist(dst, sizes = c(50, 50, 50), distance = TRUE)
283 | ```
284 |
285 | ## Example 10.10 (Distance Components (disco))
286 |
287 | ```{r }
288 | set.seed(413)
289 | energy::disco(iris[ , 1:4], factors = iris$Species, R = 999)
290 | ```
291 |
292 | ## Example 10.11 (Power Comparison)
293 |
294 | ```{r }
295 | # results of several simulations summarized in Table
296 | ```
297 |
298 | ## Example 10.12 (Distance covariance statistic)
299 |
300 | ```{r }
301 | dCov <- function(x, y) {
302 | x <- as.matrix(x)
303 | y <- as.matrix(y)
304 | n <- nrow(x)
305 | m <- nrow(y)
306 | if (n != m || n < 2) stop("Sample sizes must agree")
307 | if (! (all(is.finite(c(x, y)))))
308 | stop("Data contains missing or infinite values")
309 |
310 | Akl <- function(x) {
311 | d <- as.matrix(dist(x))
312 | m <- rowMeans(d)
313 | M <- mean(d)
314 | a <- sweep(d, 1, m)
315 | b <- sweep(a, 2, m)
316 | return(b + M)
317 | }
318 | A <- Akl(x)
319 | B <- Akl(y)
320 | dCov <- sqrt(mean(A * B))
321 | dCov
322 | }
323 |
324 | z <- as.matrix(iris[1:50, 1:4])
325 | x <- z[ , 1:2]
326 | y <- z[ , 3:4]
327 | # compute the observed statistic
328 | dCov(x, y)
329 | ```
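
The energy package implements the same statistic, so energy::dcov should reproduce the value above (an added check):

```{r }
energy::dcov(x, y)
```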
330 |
331 | ## Example 10.13 (Distance correlation statistic)
332 |
333 | ```{r }
334 | DCOR <- function(x, y) {
335 | x <- as.matrix(x)
336 | y <- as.matrix(y)
337 | n <- nrow(x)
338 | m <- nrow(y)
339 | if (n != m || n < 2) stop("Sample sizes must agree")
340 | if (! (all(is.finite(c(x, y)))))
341 | stop("Data contains missing or infinite values")
342 | Akl <- function(x) {
343 | d <- as.matrix(dist(x))
344 | m <- rowMeans(d)
345 | M <- mean(d)
346 | a <- sweep(d, 1, m)
347 | b <- sweep(a, 2, m)
348 | return(b + M)
349 | }
350 | A <- Akl(x)
351 | B <- Akl(y)
352 | dCov <- sqrt(mean(A * B))
353 | dVarX <- sqrt(mean(A * A))
354 | dVarY <- sqrt(mean(B * B))
355 | dCor <- sqrt(dCov / sqrt(dVarX * dVarY))
356 | list(dCov=dCov, dCor=dCor, dVarX=dVarX, dVarY=dVarY)
357 | }
358 |
359 | z <- as.matrix(iris[1:50, 1:4])
360 | x <- z[ , 1:2]
361 | y <- z[ , 3:4]
362 |
363 | DCOR(x, y)
364 | ```
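
As with dCov, the energy package's DCOR function should reproduce these values (an added check):

```{r }
energy::DCOR(x, y)
```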
365 |
366 | ## Example 10.14 (Distance covariance test)
367 |
368 | ```{r }
369 | ndCov2 <- function(z, ix, dims) {
370 | p <- dims[1]
371 | q1 <- p + 1
372 | d <- p + dims[2]
373 | x <- z[ , 1:p] #leave x as is
374 | y <- z[ix, q1:d] #permute rows of y
375 | return(nrow(z) * dCov(x, y)^2)
376 | }
377 |
378 | library(boot)
379 | z <- as.matrix(iris[1:50, 1:4])
380 | boot.obj <- boot(data = z, statistic = ndCov2, R = 999,
381 | sim = "permutation", dims = c(2, 2))
382 |
383 | tb <- c(boot.obj$t0, boot.obj$t)
384 | hist(tb, nclass="scott", xlab="", main="",
385 | freq=FALSE)
386 | points(boot.obj$t0, 0, cex=1, pch=16)
387 |
388 | mean(tb >= boot.obj$t0)
389 | boot.obj
390 | ```
391 |
392 |
--------------------------------------------------------------------------------
/examples-Rmd/SCR2e-chRandomProcess.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Code for SCR 2e Examples"
3 | subtitle: "M. L. Rizzo (2019). *Statistical Computing with R, 2nd ed.*, Chapman & Hall/CRC. https://github.com/mariarizzo/SCR2e"
4 | output:
5 | html_document:
6 | code_folding: show
7 | theme: lumen
8 | toc: yes
9 | toc_float: yes
10 | df_print: paged
11 | ---
12 |
13 |
14 | # Chapter 4: Generating Random Processes
15 |
16 | ## Example 4.1 (Poisson process)
17 |
18 | ```{r }
19 | lambda <- 2
20 | t0 <- 3
21 | Tn <- rexp(100, lambda) #interarrival times
22 | Sn <- cumsum(Tn) #arrival times
23 | n <- min(which(Sn > t0)) #arrivals+1 in [0, t0]
24 | ```
25 |
26 | ## Example 4.2 (Poisson process, cont.)
27 |
28 | ```{r }
29 | lambda <- 2
30 | t0 <- 3
31 | upper <- 100
32 | pp <- numeric(10000)
33 | for (i in 1:10000) {
34 | N <- rpois(1, lambda * upper)
35 | Un <- runif(N, 0, upper) #unordered arrival times
36 | Sn <- sort(Un) #arrival times
37 | n <- min(which(Sn > t0)) #arrivals+1 in [0, t0]
38 | pp[i] <- n - 1 #arrivals in [0, t0]
39 | }
40 |
41 | #alternatively, the loop can be replaced by the replicate function
42 | pp <- replicate(10000, expr = {
43 | N <- rpois(1, lambda * upper)
44 | Un <- runif(N, 0, upper) #unordered arrival times
45 | Sn <- sort(Un) #arrival times
46 | n <- min(which(Sn > t0)) #arrivals+1 in [0, t0]
47 | n - 1 }) #arrivals in [0, t0]
48 |
49 | c(mean(pp), var(pp))
50 | ```
51 |
52 | ## Example 4.3 (Nonhomogeneous Poisson process)
53 |
54 | ```{r }
55 | lambda <- 3
56 | upper <- 100
57 | N <- rpois(1, lambda * upper)
58 | Tn <- rexp(N, lambda)
59 | Sn <- cumsum(Tn)
60 | Un <- runif(N)
61 | keep <- (Un <= cos(Sn)^2) #indicator, as logical vector
62 | Sn[keep]
63 |
64 | round(Sn[keep], 4)
65 | ```
66 |
67 | ## Example 4.4 (Renewal process)
68 |
69 | ```{r }
70 | t0 <- 5
71 | Tn <- rgeom(100, prob = .2) #interarrival times
72 | Sn <- cumsum(Tn) #arrival times
73 | n <- min(which(Sn > t0)) #arrivals+1 in [0, t0]
74 |
75 | Nt0 <- replicate(1000, expr = {
76 | Sn <- cumsum(rgeom(100, prob = .2))
77 | min(which(Sn > t0)) - 1
78 | })
79 | table(Nt0)/1000
80 | Nt0
81 |
82 | t0 <- seq(0.1, 30, .1)
83 | mt <- numeric(length(t0))
84 |
85 | for (i in 1:length(t0)) {
86 | mt[i] <- mean(replicate(1000,
87 | {
88 | Sn <- cumsum(rgeom(100, prob = .2))
89 | min(which(Sn > t0[i])) - 1
90 | }))
91 | }
92 | plot(t0, mt, type = "l", xlab = "t", ylab = "mean")
93 | abline(0, .25)
94 | ```
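
The reference line abline(0, .25) has slope 1/E[T]: by the elementary renewal theorem m(t)/t converges to 1/E[T], and for these geometric interarrival times E[T] = (1-p)/p = 4 (an added note).

```{r }
p <- .2
(1 - p) / p   #mean interarrival time E[T]; its reciprocal 1/4 is the slope
```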
95 |
96 | ## Example 4.5 (Symmetric random walk)
97 |
98 | ```{r }
99 | n <- 400
100 | incr <- sample(c(-1, 1), size = n, replace = TRUE)
101 | S <- as.integer(c(0, cumsum(incr)))
102 | plot(0:n, S, type = "l", main = "", xlab = "i")
103 | ```
104 |
105 | ## Example 4.6 (Generator for the time until return to origin)
106 |
107 | ```{r }
108 | set.seed(12345)
109 |
110 | #compute the probabilities directly
111 | n <- 1:10000
112 | p2n <- exp(lgamma(2*n-1)
113 | - log(n) - (2*n-1)*log(2) - 2*lgamma(n))
114 | #or compute using dbinom
115 | P2n <- (.5/n) * dbinom(n-1, size = 2*n-2, prob = 0.5)
116 | pP2n <- cumsum(P2n)
117 |
118 | #given n compute the time of the last return to 0 in (0,n]
119 | n <- 200
120 | sumT <- 0
121 | while (sumT <= n) {
122 | u <- runif(1)
123 | s <- sum(u > pP2n)
124 | if (s == length(pP2n))
125 | warning("T is truncated")
126 | Tj <- 2 * (1 + s)
127 | #print(c(Tj, sumT))
128 | sumT <- sumT + Tj
129 | }
130 | sumT - Tj
131 | ```
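
The two computations of the return-time probabilities should agree to machine precision (an added check):

```{r }
max(abs(p2n - P2n))
```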
132 |
133 | ## Example 4.7 (Brownian motion)
134 |
135 | ```{r }
136 | simBM <- function(n, T) {
137 | times <- seq(0, T, length = n+1)
138 | z <- rnorm(n)
139 | w <- rep(0, n)
140 | s <- sqrt(diff(times))
141 | for (k in 2:n) {
142 | w[k] <- w[k-1] + s[k] * z[k]
143 | }
144 | return (list(w=w, t=times))
145 | }
146 |
147 | set.seed(1)
148 | n <- 200
149 | x1 <- simBM(n, 1)
150 | x2 <- simBM(n, 1)
151 | x3 <- simBM(n, 1)
152 | r <- range(c(x1$w, x2$w, x3$w))
153 | plot(x1$w, type="l", main="", xlab="t", ylab="W", ylim=r)
154 | lines(x2$w, lty=2)
155 | lines(x3$w, lty=3)
156 |
157 | interpBM <- function(w, t0, times) {
158 | k1 <- sum(times < t0)
159 | k <- k1 + 1
160 | b <- (t0 - times[k1]) / (times[k] - times[k1])
161 | return (w[k1] + b * (w[k] - w[k1]))
162 | }
163 |
164 | plot(x1$t[1:10], x1$w[1:10], type="b", main="", xlab="t", ylab="W")
165 | tmids <- x1$t + 0.0025
166 | for (i in 1:10) {
167 | w <- interpBM(x1$w, tmids[i], x1$t)
168 | points(tmids[i], w, pch=2)
169 | }
170 |
171 | legend("topleft", c("Generated W", "Interpolated W"), pch=c(1,2), lty=1, bty="n")
172 | ```
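
A quick distributional check (added): W(t) has variance t, so the sample variance of replicated path endpoints should be roughly 1.

```{r }
ws <- replicate(500, simBM(200, 1)$w[200])
var(ws)   #roughly t = 1, up to discretization
```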
173 |
174 |
--------------------------------------------------------------------------------
/examples-Rmd/SCR2e-chVis.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Code for SCR 2e Examples"
3 | subtitle: "M. L. Rizzo (2019). *Statistical Computing with R, 2nd ed.*, Chapman & Hall/CRC. https://github.com/mariarizzo/SCR2e"
4 | output:
5 | html_document:
6 | code_folding: show
7 | theme: lumen
8 | toc: yes
9 | toc_float: yes
10 | df_print: paged
11 | ---
12 |
13 | # Chapter 5: Visualization of Multivariate Data
14 |
15 | ## Example 5.1 (Scatterplot matrix)
16 |
17 | ```{r }
18 | data(iris)
19 | #virginica data in first 4 columns of the last 50 obs.
20 |
21 | # not shown in text
22 | pairs(iris[101:150, 1:4])
23 |
24 | panel.d <- function(x, ...) {
25 | usr <- par("usr")
26 | on.exit(par(usr=usr))
27 | par(usr = c(usr[1:2], 0, .5))
28 | lines(density(x))
29 | }
30 |
31 | # Fig. 5.1
32 | x <- scale(iris[101:150, 1:4])
33 | r <- range(x)
34 | pairs(x, diag.panel = panel.d, xlim = r, ylim = r)
35 |
36 | library(lattice)
37 | splom(iris[101:150, 1:4]) #plot 1
38 |
39 | #for all 3 at once, in color, plot 2
40 | splom(iris[,1:4], groups = iris$Species)
41 |
42 | # Fig. 5.2
43 | #for all 3 at once, black and white, plot 3
44 | splom(~iris[1:4], groups = Species, data = iris,
45 | col = 1, pch = c(1, 2, 3), cex = c(.5,.5,.5))
46 | ```
47 |
48 | ## Example 5.2 (Correlation plots: Decathlon data)
49 |
50 | ```{r }
51 | library(FactoMineR) #decathlon data
52 | library(corrplot)
53 | data("decathlon")
54 | str(decathlon)
55 |
56 | corrMat <- cor(decathlon[, 1:10])
57 | corrplot(corrMat, type="upper", tl.col="black", tl.srt=45)
58 | corrplot(corrMat, type = "upper", method = "square",
59 | addCoef.col = "black", diag=FALSE)
60 | ```
61 |
62 | ## Example 5.3 (Plot bivariate normal density)
63 |
64 | ```{r }
65 | #the standard BVN density
66 | f <- function(x,y) {
67 | z <- (1/(2*pi)) * exp(-.5 * (x^2 + y^2))
68 | }
69 |
70 | y <- x <- seq(-3, 3, length= 50)
71 | z <- outer(x, y, f) #compute density for all (x,y)
72 |
73 | persp(x, y, z) #the default plot
74 |
75 | persp(x, y, z, theta = 45, phi = 30, expand = 0.6,
76 | ltheta = 120, shade = 0.75, ticktype = "detailed",
77 | xlab = "X", ylab = "Y", zlab = "f(x, y)")
78 | ```
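
A rough Riemann-sum check (added) that the surface is on the right scale: summing z times the grid-cell area should be close to 1.

```{r }
sum(z) * diff(x)[1] * diff(y)[1]
```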
79 |
80 | ## Example 5.4 (Add elements to perspective plot)
81 |
82 | ```{r }
83 | #store viewing transformation in M
84 | persp(x, y, z, theta = 45, phi = 30,
85 | expand = .4, box = FALSE) -> M
86 |
87 | #add some points along a circle
88 | a <- seq(-pi, pi, pi/16)
89 | newpts <- cbind(cos(a), sin(a)) * 2
90 | newpts <- cbind(newpts, 0, 1) #z=0, t=1
91 | N <- newpts %*% M
92 | points(N[,1]/N[,4], N[,2]/N[,4], col=2)
93 |
94 | #add lines
95 | x2 <- seq(-3, 3, .1)
96 | y2 <- -x2^2 / 3
97 | z2 <- dnorm(x2) * dnorm(y2)
98 | N <- cbind(x2, y2, z2, 1) %*% M
99 | lines(N[,1]/N[,4], N[,2]/N[,4], col=4)
100 |
101 | #add text
102 | x3 <- c(0, 3.1)
103 | y3 <- c(0, -3.1)
104 | z3 <- dnorm(x3) * dnorm(y3) * 1.1
105 | N <- cbind(x3, y3, z3, 1) %*% M
106 | text(N[1,1]/N[1,4], N[1,2]/N[1,4], "f(x,y)")
107 | text(N[2,1]/N[2,4], N[2,2]/N[2,4], bquote(y==-x^2/3))
108 | ```
109 |
110 | ## Example 5.5 (Surface plot using wireframe(lattice))
111 |
112 | ```{r }
113 | library(lattice)
114 | x <- y <- seq(-3, 3, length= 50)
115 |
116 | xy <- expand.grid(x, y)
117 | z <- (1/(2*pi)) * exp(-.5 * (xy[,1]^2 + xy[,2]^2))
118 | wireframe(z ~ xy[,1] * xy[,2])
119 | ```
120 |
121 | ## Example 5.6 (3D scatterplot)
122 |
123 | ```{r }
124 | library(lattice)
125 | attach(iris)
126 | #basic 3 color plot with arrows along axes
127 | print(cloud(Petal.Length ~ Sepal.Length * Sepal.Width,
128 | data=iris, groups=Species))
129 |
130 | print(cloud(Sepal.Length ~ Petal.Length * Petal.Width,
131 | data = iris, groups = Species, main = "1", pch=1:3,
132 | scales = list(draw = FALSE), zlab = "SL",
133 | screen = list(z = 30, x = -75, y = 0)),
134 | split = c(1, 1, 2, 2), more = TRUE)
135 |
136 | print(cloud(Sepal.Width ~ Petal.Length * Petal.Width,
137 | data = iris, groups = Species, main = "2", pch=1:3,
138 | scales = list(draw = FALSE), zlab = "SW",
139 | screen = list(z = 30, x = -75, y = 0)),
140 | split = c(2, 1, 2, 2), more = TRUE)
141 |
142 | print(cloud(Petal.Length ~ Sepal.Length * Sepal.Width,
143 | data = iris, groups = Species, main = "3", pch=1:3,
144 | scales = list(draw = FALSE), zlab = "PL",
145 | screen = list(z = 30, x = -55, y = 0)),
146 | split = c(1, 2, 2, 2), more = TRUE)
147 |
148 | print(cloud(Petal.Width ~ Sepal.Length * Sepal.Width,
149 | data = iris, groups = Species, main = "4", pch=1:3,
150 | scales = list(draw = FALSE), zlab = "PW",
151 | screen = list(z = 30, x = -55, y = 0)),
152 | split = c(2, 2, 2, 2))
153 | detach(iris)
154 | ```
155 |
156 | ## Example 5.7 (Contour plot)
157 |
158 | ```{r }
159 | #contour plot with labels
160 | contour(volcano, asp = 1, labcex = 1)
161 |
162 | #another version from lattice package
163 | library(lattice)
164 | contourplot(volcano) #similar to above
165 | ```
166 |
167 | ## Example 5.8 (Filled contour plots)
168 |
169 | ```{r }
170 | image(volcano, col = terrain.colors(100), axes = FALSE)
171 | contour(volcano, levels = seq(100,200,by = 10), add = TRUE)
172 |
173 | filled.contour(volcano, color = terrain.colors, asp = 1)
174 | levelplot(volcano, scales = list(draw = FALSE),
175 | xlab = "", ylab = "")
176 | ```
177 |
178 | ## Example 5.9 (2D histogram)
179 |
180 | ```{r }
181 | library(hexbin)
182 | x <- matrix(rnorm(4000), 2000, 2)
183 | plot(hexbin(x[,1], x[,2]))
184 |
185 | # ggplot version
186 | library(ggplot2)
187 | x <- data.frame(x)
188 | ggplot(x, aes(x[,1], x[,2])) + geom_hex()
189 | ```
190 |
191 | ## Example 5.10 (Andrews curves)
192 |
193 | ```{r }
194 | library(DAAG)
195 | attach(leafshape17)
196 |
197 | f <- function(a, v) {
198 | #Andrews curve f(a) for a data vector v in R^3
199 | v[1]/sqrt(2) + v[2]*sin(a) + v[3]*cos(a)
200 | }
201 |
202 |
203 | #scale data to range [-1, 1]
204 | x <- cbind(bladelen, petiole, bladewid)
205 | n <- nrow(x)
206 | mins <- apply(x, 2, min) #column minimums
207 | maxs <- apply(x, 2, max) #column maximums
208 | r <- maxs - mins #column ranges
209 | y <- sweep(x, 2, mins) #subtract column mins
210 | y <- sweep(y, 2, r, "/") #divide by range
211 | x <- 2 * y - 1 #now has range [-1, 1]
212 |
213 | #set up plot window, but plot nothing yet
214 | plot(0, 0, xlim = c(-pi, pi), ylim = c(-3,3),
215 | xlab = "t", ylab = "Andrews Curves",
216 | main = "", type = "n")
217 |
218 | #now add the Andrews curves for each observation
219 | #line type corresponds to leaf architecture
220 | #0=orthotropic, 1=plagiotropic
221 | a <- seq(-pi, pi, len=101)
222 | dim(a) <- length(a)
223 | for (i in 1:n) {
224 | g <- arch[i] + 1
225 | y <- apply(a, MARGIN = 1, FUN = f, v = x[i,])
226 | lines(a, y, lty = g)
227 | }
228 | legend(3, c("Orthotropic", "Plagiotropic"), lty = 1:2)
229 | detach(leafshape17)
230 | ```
231 |
232 | ## Example 5.11 (Parallel coordinates)
233 |
234 | ```{r }
235 | library(MASS)
236 | library(lattice)
237 | #trellis.device(color = FALSE) #black and white display
238 | x <- crabs[seq(5, 200, 5), ] #get every fifth obs.
239 | parallelplot(~x[4:8] | sp*sex, x)
240 |
241 | #trellis.device(color = FALSE) #black and white display
242 | x <- crabs[seq(5, 200, 5), ] #get every fifth obs.
243 | a <- x$CW * x$CL #area of carapace
244 | x[4:8] <- x[4:8] / sqrt(a) #adjust for size
245 | parallelplot(~x[4:8] | sp*sex, x)
246 | ```
247 |
248 | ## Example 5.12 (Segment plot)
249 |
250 | ```{r }
251 | #segment plot
252 | x <- MASS::crabs[seq(5, 200, 5), ] #get every fifth obs.
253 | x <- subset(x, sex == "M") #keep just the males
254 | a <- x$CW * x$CL #area of carapace
255 | x[4:8] <- x[4:8] / sqrt(a) #adjust for size
256 |
257 | #use default color palette or other colors
258 | #palette(gray(seq(.4, .95, len = 5))) #use gray scale
259 | palette(rainbow(6)) #or use color
260 | stars(x[4:8], draw.segments = TRUE,
261 | labels =levels(x$sp), nrow = 4,
262 | ylim = c(-2,10), key.loc = c(3,-1))
263 |
264 | #after viewing, restore the default colors
265 | palette("default")
266 | ```
267 |
268 | ## Example 5.13 (PCA for open and closed book exams)
269 |
270 | ```{r }
271 | library(bootstrap)
272 | str(scor)
273 | pairs(scor)
274 | cor(scor)
275 |
276 | n <- nrow(scor)
277 | x <- scale(scor) #center and scale
278 | s <- cov(x)
279 | e <- eigen(s)
280 | lam <- e$values #vector of eigenvalues
281 | P <- e$vectors #matrix of eigenvectors
282 |
283 | plot(lam, type = "b", xlab = "eigenvalues", main = "")
284 | barplot(lam, xlab = "eigenvalues")
285 |
286 | tab <- rbind(lam / sum(lam), cumsum(lam) / sum(lam))
287 | tab
288 |
289 | z <- x %*% P
290 | dim(z)
291 | head(z)
292 |
293 | pc <- prcomp(scor, center = TRUE, scale = TRUE)
294 | summary(pc)
295 |
296 | df <- scor[1:5, ]
297 | predict(pc, newdata = df) #same as z above
298 |
299 | head(x %*% pc$rotation, 5)
300 | head(pc$rotation)
301 | head(P)
302 | ```
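
The hand-rolled eigendecomposition and prcomp should agree: pc$sdev equals sqrt(lam) up to rounding (an added check).

```{r }
rbind(sqrt(lam), pc$sdev)
```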
303 |
304 | ## Example 5.14 (PC Biplot)
305 |
306 | ```{r }
307 | ## plot scor data in the (PC1, PC2) coordinate system
308 | biplot(pc, pc.biplot = TRUE)
309 | round(cor(x, z), 3)
310 | ```
311 |
312 | ## Example 5.15 (PCA: Decathlon data)
313 |
314 | ```{r }
315 | library(FactoMineR)
316 | data(decathlon)
317 | pc <- princomp(decathlon[,1:10], cor = TRUE, scores = TRUE)
318 | plot(pc) # screeplot
319 | biplot(pc)
320 | summary(pc)
321 | ```
322 |
323 |
--------------------------------------------------------------------------------
/examples-Rmd/SCR2e-examples-functions.R:
--------------------------------------------------------------------------------
1 | rlogarithmic <- function(n, theta) {
2 | # generate random sample from Logarithmic(theta)
3 | stopifnot(all(theta > 0 & theta < 1))
4 | th <- rep(theta, length=n)
5 | u <- runif(n)
6 | v <- runif(n)
7 | x <- floor(1 + log(v) / log(1 - (1 - th)^u))
8 | return(x)
9 | }
10 |
11 | # generate MVN by spectral decomposition method
12 |
13 | rmvn.eigen <-
14 | function(n, mu, Sigma) {
15 | # generate n random vectors from MVN(mu, Sigma)
16 | # dimension is inferred from mu and Sigma
17 | d <- length(mu)
18 | ev <- eigen(Sigma, symmetric = TRUE)
19 | lambda <- ev$values
20 | V <- ev$vectors
21 | R <- V %*% diag(sqrt(lambda)) %*% t(V)
22 | Z <- matrix(rnorm(n*d), nrow = n, ncol = d)
23 | X <- Z %*% R + matrix(mu, n, d, byrow = TRUE)
24 | X
25 | }
26 |
27 |
28 | # generate MVN by singular value decomposition method
29 |
30 | rmvn.svd <-
31 | function(n, mu, Sigma) {
32 | # generate n random vectors from MVN(mu, Sigma)
33 | # dimension is inferred from mu and Sigma
34 | d <- length(mu)
35 | S <- svd(Sigma)
36 | R <- S$u %*% diag(sqrt(S$d)) %*% t(S$v) #sq. root Sigma
37 | Z <- matrix(rnorm(n*d), nrow=n, ncol=d)
38 | X <- Z %*% R + matrix(mu, n, d, byrow=TRUE)
39 | X
40 | }
41 |
42 |
43 | # generate MVN by Choleski factorization method
44 |
45 | rmvn.Choleski <-
46 | function(n, mu, Sigma) {
47 | # generate n random vectors from MVN(mu, Sigma)
48 | # dimension is inferred from mu and Sigma
49 | d <- length(mu)
50 | Q <- chol(Sigma) # Choleski factorization of Sigma
51 | Z <- matrix(rnorm(n*d), nrow=n, ncol=d)
52 | X <- Z %*% Q + matrix(mu, n, d, byrow=TRUE)
53 | X
54 | }
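
# usage sketch (an addition, not from the text): the sample mean and
# covariance of the generated data should be close to mu and Sigma
#   X <- rmvn.Choleski(1000, mu = c(0, 1),
#                      Sigma = matrix(c(1, .5, .5, 1), 2, 2))
#   colMeans(X)
#   cov(X)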
55 |
56 | # Bootstrap t confidence interval
57 |
58 | boot.t.ci <-
59 | function(x, B = 500, R = 100, level = .95, statistic){
60 | #compute the bootstrap t CI
61 | x <- as.matrix(x); n <- nrow(x)
62 | stat <- numeric(B); se <- numeric(B)
63 |
64 | boot.se <- function(x, R, f) {
65 | #local function to compute the bootstrap
66 | #estimate of standard error for statistic f(x)
67 | x <- as.matrix(x); m <- nrow(x)
68 | th <- replicate(R, expr = {
69 | i <- sample(1:m, size = m, replace = TRUE)
70 | f(x[i, ])
71 | })
72 | return(sd(th))
73 | }
74 |
75 | for (b in 1:B) {
76 | j <- sample(1:n, size = n, replace = TRUE)
77 | y <- x[j, ]
78 | stat[b] <- statistic(y)
79 | se[b] <- boot.se(y, R = R, f = statistic)
80 | }
81 | stat0 <- statistic(x)
82 | t.stats <- (stat - stat0) / se
83 | se0 <- sd(stat)
84 | alpha <- 1 - level
85 | Qt <- quantile(t.stats, c(alpha/2, 1-alpha/2), type = 1)
86 | names(Qt) <- rev(names(Qt))
87 |     return(rev(stat0 - Qt * se0))  #bootstrap t CI, lower limit first
88 | }
89 |
90 |
91 | # BCa bootstrap confidence interval
92 |
93 | boot.BCa <-
94 | function(x, th0, th, stat, conf = .95) {
95 | # bootstrap with BCa bootstrap confidence interval
96 | # th0 is the observed statistic
97 | # th is the vector of bootstrap replicates
98 | # stat is the function to compute the statistic
99 |
100 | x <- as.matrix(x)
101 | n <- nrow(x) #observations in rows
102 | N <- 1:n
103 | alpha <- (1 + c(-conf, conf))/2
104 | zalpha <- qnorm(alpha)
105 |
106 | # the bias correction factor
107 | z0 <- qnorm(sum(th < th0) / length(th))
108 |
109 | # the acceleration factor (jackknife est.)
110 | th.jack <- numeric(n)
111 | for (i in 1:n) {
112 | J <- N[1:(n-1)]
113 | th.jack[i] <- stat(x[-i, ], J)
114 | }
115 | L <- mean(th.jack) - th.jack
116 | a <- sum(L^3)/(6 * sum(L^2)^1.5)
117 |
118 | # BCa conf. limits
119 | adj.alpha <- pnorm(z0 + (z0+zalpha)/(1-a*(z0+zalpha)))
120 | limits <- quantile(th, adj.alpha, type=6)
121 | return(list("est"=th0, "BCa"=limits))
122 | }
123 |
124 | # Gelman-Rubin statistic for MCMC convergence
125 |
126 | Gelman.Rubin <- function(psi) {
127 | # psi[i,j] is the statistic psi(X[i,1:j])
128 | # for chain in i-th row of X
129 | psi <- as.matrix(psi)
130 | n <- ncol(psi)
131 | k <- nrow(psi)
132 |
133 | psi.means <- rowMeans(psi) #row means
134 | B <- n * var(psi.means) #between variance est.
135 | psi.w <- apply(psi, 1, "var") #within variances
136 | W <- mean(psi.w) #within est.
137 | v.hat <- W*(n-1)/n + (B/n) #upper variance est.
138 | r.hat <- v.hat / W #G-R statistic
139 | return(r.hat)
140 | }
141 |
142 | # binning bivariate data
143 |
144 | bin2d <-
145 | function(x, breaks1 = "Sturges", breaks2 = "Sturges"){
146 | # Data matrix x is n by 2
147 | # breaks1, breaks2: any valid breaks for hist function
148 | # using same defaults as hist
149 | histg1 <- hist(x[,1], breaks = breaks1, plot = FALSE)
150 | histg2 <- hist(x[,2], breaks = breaks2, plot = FALSE)
151 | brx <- histg1$breaks
152 | bry <- histg2$breaks
153 |
154 | # bin frequencies
155 | freq <- table(cut(x[,1], brx), cut(x[,2], bry))
156 |
157 | return(list(call = match.call(), freq = freq,
158 | breaks1 = brx, breaks2 = bry,
159 | mids1 = histg1$mids, mids2 = histg2$mids))
160 | }
161 |
--------------------------------------------------------------------------------
/examples-Rmd/readme.md:
--------------------------------------------------------------------------------
1 | Code for the textbook examples in "Statistical Computing with R, 2nd ed." in R Markdown format.
2 |
--------------------------------------------------------------------------------
/examples-cpp/printme.cpp:
--------------------------------------------------------------------------------
1 | #include <Rcpp.h>
2 | using namespace Rcpp;
3 |
4 | // [[Rcpp::export]]
5 | void print_me(IntegerVector x) {
6 | // several ways to print in Rcpp
7 | Rprintf("my vector is (%d, %d, %d)\n",
8 | x[0], x[1], x[2]);
9 | Rcpp::Rcout << "my vector is " << x << std::endl;
10 | Rf_PrintValue(x);
11 | }
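
// usage from R (a sketch, assuming the Rcpp package is installed):
//   Rcpp::sourceCpp("printme.cpp")
//   print_me(c(1L, 2L, 3L))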
12 |
--------------------------------------------------------------------------------
/examples-cpp/readme.md:
--------------------------------------------------------------------------------
1 | C++ example code
2 |
--------------------------------------------------------------------------------